Browse Source

本地部署

master
guanjz 1 month ago
commit
86b867f541
  1. 8
      .idea/.gitignore
  2. 13
      .idea/compiler.xml
  3. 20
      .idea/jarRepositories.xml
  4. 14
      .idea/misc.xml
  5. 124
      .idea/uiDesigner.xml
  6. 23
      .project
  7. BIN
      NsantegouvListRe.jar
  8. 8
      bin/.idea/.gitignore
  9. 13
      bin/.idea/compiler.xml
  10. 20
      bin/.idea/jarRepositories.xml
  11. 14
      bin/.idea/misc.xml
  12. 23
      bin/.project
  13. 167
      bin/hs_err_pid15760.log
  14. 6
      bin/keywords.txt
  15. 138
      bin/pom.xml
  16. 281
      bin/processed_urls.txt
  17. 1
      bin/proxy.txt
  18. BIN
      bin/src/main/java/com/example/Inka.class
  19. BIN
      bin/src/main/java/com/example/NSFAwardCrawler.class
  20. BIN
      bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class
  21. BIN
      bin/src/main/java/com/example/ProxyIPChecker.class
  22. BIN
      bin/src/main/java/com/example/StringFieldExtractor.class
  23. BIN
      bin/src/main/java/com/example/getInKa.class
  24. BIN
      bin/src/main/java/com/example/jsonGetOk.class
  25. BIN
      bin/src/main/java/com/example/ook.class
  26. BIN
      bin/src/main/java/com/example/oook.class
  27. BIN
      bin/src/main/java/com/example/projTopic.class
  28. BIN
      bin/src/main/java/com/example/saveInES.class
  29. BIN
      bin/src/main/java/com/example/test.class
  30. BIN
      bin/src/main/java/com/example/test2.class
  31. BIN
      bin/src/main/java/com/example/testContent.class
  32. BIN
      bin/src/main/java/com/example/umlistTest.class
  33. 4
      bin/target/classes/META-INF/MANIFEST.MF
  34. BIN
      bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar
  35. BIN
      bin/target/es-crawler-1.0-SNAPSHOT.jar
  36. 5
      bin/target/maven-archiver/pom.properties
  37. 1
      bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
  38. 1
      bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
  39. 0
      bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst
  40. 0
      bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst
  41. 167
      hs_err_pid15760.log
  42. 1045
      keywords.txt
  43. BIN
      original_captcha.png
  44. 150
      pom.xml
  45. BIN
      preprocessed_captcha.png
  46. 281
      processed_urls.txt
  47. 1
      proxy.txt
  48. 119
      src/main/java/com/example/AusContent.java
  49. 200
      src/main/java/com/example/AusList.java
  50. 173
      src/main/java/com/example/CaptchaOCR.java
  51. 81
      src/main/java/com/example/CsAirScraper.java
  52. 404
      src/main/java/com/example/CtriScraper.java
  53. 121
      src/main/java/com/example/CtriScraperContent.java
  54. 113
      src/main/java/com/example/Inka.java
  55. 111
      src/main/java/com/example/NSFAwardCrawler.java
  56. 130
      src/main/java/com/example/PatentscopeSeleniumCrawler.java
  57. 25
      src/main/java/com/example/ProxyIPChecker.java
  58. 496
      src/main/java/com/example/ScraperWithCaptcha.java
  59. 74
      src/main/java/com/example/StringFieldExtractor.java
  60. 60
      src/main/java/com/example/WipoPatentsSelenium.java
  61. 594
      src/main/java/com/example/cliniTopic.java
  62. 438
      src/main/java/com/example/drks.java
  63. 165
      src/main/java/com/example/getInKa.java
  64. 47
      src/main/java/com/example/jsonGetOk.java
  65. 256
      src/main/java/com/example/ook.java
  66. 524
      src/main/java/com/example/oook.java
  67. 403
      src/main/java/com/example/projTopic.java
  68. 122
      src/main/java/com/example/saveInES.java
  69. 101
      src/main/java/com/example/test.java
  70. 103
      src/main/java/com/example/testContent.java
  71. 340
      src/main/java/com/example/testList.java
  72. 22
      src/main/java/com/example/umlistTest.java
  73. 12
      src/main/resources/logback.xml
  74. BIN
      target/classes/com/example/AusContent.class
  75. BIN
      target/classes/com/example/AusList.class
  76. BIN
      target/classes/com/example/CaptchaOCR.class
  77. BIN
      target/classes/com/example/CsAirScraper.class
  78. BIN
      target/classes/com/example/CtriScraper.class
  79. BIN
      target/classes/com/example/CtriScraperContent.class
  80. BIN
      target/classes/com/example/Inka.class
  81. BIN
      target/classes/com/example/NSFAwardCrawler.class
  82. BIN
      target/classes/com/example/PatentscopeSeleniumCrawler.class
  83. BIN
      target/classes/com/example/ProxyIPChecker.class
  84. BIN
      target/classes/com/example/ScraperWithCaptcha$1.class
  85. BIN
      target/classes/com/example/ScraperWithCaptcha$PageInfo.class
  86. BIN
      target/classes/com/example/ScraperWithCaptcha.class
  87. BIN
      target/classes/com/example/StringFieldExtractor.class
  88. BIN
      target/classes/com/example/WipoPatentsSelenium.class
  89. BIN
      target/classes/com/example/cliniTopic.class
  90. BIN
      target/classes/com/example/drks.class
  91. BIN
      target/classes/com/example/getInKa.class
  92. BIN
      target/classes/com/example/jsonGetOk.class
  93. BIN
      target/classes/com/example/ook.class
  94. BIN
      target/classes/com/example/oook.class
  95. BIN
      target/classes/com/example/projTopic.class
  96. BIN
      target/classes/com/example/saveInES.class
  97. BIN
      target/classes/com/example/test.class
  98. BIN
      target/classes/com/example/testContent.class
  99. BIN
      target/classes/com/example/testList.class
  100. BIN
      target/classes/com/example/umlistTest.class

8
.idea/.gitignore

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

13
.idea/compiler.xml

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="DaKaES" />
</profile>
</annotationProcessing>
</component>
</project>

20
.idea/jarRepositories.xml

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="https://repo.maven.apache.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>

14
.idea/misc.xml

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="corretto-1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

124
.idea/uiDesigner.xml

@ -0,0 +1,124 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Palette2">
<group name="Swing">
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
</item>
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
</item>
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.png" removable="false" auto-create-binding="false" can-attach-label="true">
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
</item>
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
<initial-values>
<property name="text" value="Button" />
</initial-values>
</item>
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="RadioButton" />
</initial-values>
</item>
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="CheckBox" />
</initial-values>
</item>
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
<initial-values>
<property name="text" value="Label" />
</initial-values>
</item>
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
</item>
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
</item>
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
<preferred-size width="-1" height="20" />
</default-constraints>
</item>
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
</item>
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
</item>
</group>
</component>
</project>

23
.project

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>DaKaES</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>

BIN
NsantegouvListRe.jar

8
bin/.idea/.gitignore

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

13
bin/.idea/compiler.xml

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="DaKaES" />
</profile>
</annotationProcessing>
</component>
</project>

20
bin/.idea/jarRepositories.xml

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="https://repo.maven.apache.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>

14
bin/.idea/misc.xml

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="corretto-1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

23
bin/.project

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>DaKaES</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>

167
bin/hs_err_pid15760.log

@ -0,0 +1,167 @@
#
# There is insufficient memory for the Java Runtime Environment to continue.
# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap
# Possible reasons:
# The system is out of physical RAM or swap space
# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap
# Possible solutions:
# Reduce memory load on the system
# Increase physical memory or swap space
# Check if swap backing store is full
# Decrease Java heap size (-Xmx/-Xms)
# Decrease number of Java threads
# Decrease Java thread stack sizes (-Xss)
# Set larger code cache with -XX:ReservedCodeCacheSize=
# JVM is running with Zero Based Compressed Oops mode in which the Java heap is
# placed in the first 32GB address space. The Java Heap base address is the
# maximum limit for the native heap growth. Please use -XX:HeapBaseMinAddress
# to set the Java Heap base and to place the Java Heap above 32GB virtual address.
# This output file may be truncated or incomplete.
#
# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334
#
# JRE version: (8.0_422-b05) (build )
# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops)
# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows
#
--------------- T H R E A D ---------------
Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
Stack: [0x00000082a1500000,0x00000082a1600000]
[error occurred during error reporting (printing stack bounds), id 0xc0000005]
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
--------------- P R O C E S S ---------------
Java Threads: ( => current thread )
Other Threads:
=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
VM state:not at safepoint (normal execution)
VM Mutex/Monitor currently owned by a thread: None
heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3
Narrow klass base: 0x0000000000000000, Narrow klass shift: 3
Compressed class space size: 1073741824 Address: 0x00000007c0000000
Heap:
PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000)
eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000)
from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000)
to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000)
ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000)
object space 172032K, 0% used [0x00000006c4000000,0x00000006c4000000,0x00000006ce800000)
Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K
class space used 76K, capacity 384K, committed 384K, reserved 1048576K
Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000
Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0
Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000)
End Bits: [0x00000271cd7a0000, 0x00000271d16a0000)
Polling page: 0x00000271b7eb0000
CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb
bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000]
total_blobs=57 nmethods=0 adapters=38
compilation: enabled
Compilation events (0 events):
No events
GC Heap History (0 events):
No events
Deoptimization events (0 events):
No events
Classes redefined (0 events):
No events
Internal exceptions (0 events):
No events
Events (10 events):
Event: 0.012 loading class java/lang/Short
Event: 0.013 loading class java/lang/Short done
Event: 0.013 loading class java/lang/Integer
Event: 0.013 loading class java/lang/Integer done
Event: 0.013 loading class java/lang/Long
Event: 0.013 loading class java/lang/Long done
Event: 0.013 loading class java/lang/NullPointerException
Event: 0.013 loading class java/lang/NullPointerException done
Event: 0.013 loading class java/lang/ArithmeticException
Event: 0.013 loading class java/lang/ArithmeticException done
Dynamic libraries:
0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe
0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll
0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL
0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll
0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll
0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll
0x00007ffa1cdf0000 - 0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll
0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll
0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll
0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll
0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll
0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll
0x00007ffa1ac80000 - 0x00007ffa1ad9a000 C:\Windows\System32\gdi32full.dll
0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll
0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll
0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll
0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL
0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll
0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll
0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll
0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL
0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll
0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll
0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll
0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll
0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll
0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll
0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll
0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll
0x00007ffa0af80000 - 0x00007ffa0af89000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll
0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll
0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll
VM Arguments:
jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8
java_command: com.example.saveInES
java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.jar;C:\Users\18264\.m2\repository\c
Launcher Type: SUN_STANDARD
Environment Variables:
JAVA_HOME=E:\java
PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node
USERNAME=18264
OS=Windows_NT
PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
--------------- S Y S T E M ---------------
OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438)
CPU:total 16 (initial active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx
Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free)
vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017)
time: Tue Mar 4 14:31:48 2025
timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
elapsed time: 0.022707 seconds (0d 0h 0m 0s)

6
bin/keywords.txt

@ -0,0 +1,6 @@
Montpellier Institute of Virology, France
Ontario Public Health Laboratory, Canada
University of Texas Biosafety Laboratory, USA
Korea National Institute of Infectious Diseases (KCDC)
Israel Institute of Life Sciences
Biosafety Laboratory, University of Basel, Switzerland

138
bin/pom.xml

@ -0,0 +1,138 @@
<!--
  Maven build for the es-crawler module.

  Declares the crawler's third-party dependencies, compiles at the Java 8
  language level, and (during the "package" phase) also produces an
  executable "jar-with-dependencies" archive whose manifest points at the
  configured main class.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.example</groupId>
  <artifactId>es-crawler</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <!-- Single source of truth for the Java language level. -->
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <!-- Pin the encoding so the build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
  </properties>

  <dependencies>
    <!-- Elasticsearch High Level REST Client -->
    <dependency>
      <groupId>org.elasticsearch.client</groupId>
      <artifactId>elasticsearch-rest-high-level-client</artifactId>
      <version>7.17.0</version>
    </dependency>
    <!-- Elasticsearch Java API client -->
    <!-- NOTE(review): 7.17.15 here vs 7.17.0 for the high-level client above; confirm the version mix is intentional. -->
    <dependency>
      <groupId>co.elastic.clients</groupId>
      <artifactId>elasticsearch-java</artifactId>
      <version>7.17.15</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.15.0</version>
    </dependency>

    <!-- Jsoup HTML parser -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.17.2</version>
    </dependency>

    <!-- OkHttp -->
    <dependency>
      <groupId>com.squareup.okhttp3</groupId>
      <artifactId>okhttp</artifactId>
      <version>4.9.3</version>
    </dependency>

    <!-- Logging -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.36</version>
    </dependency>
    <dependency>
      <groupId>ch.qos.logback</groupId>
      <artifactId>logback-classic</artifactId>
      <version>1.2.11</version>
    </dependency>

    <!-- Kafka client -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>3.9.0</version>
    </dependency>

    <!-- Selenium Java -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>4.10.0</version>
    </dependency>

    <!-- WebDriver Manager -->
    <dependency>
      <groupId>io.github.bonigarcia</groupId>
      <artifactId>webdrivermanager</artifactId>
      <version>5.6.2</version>
    </dependency>

    <!-- JSON libraries -->
    <dependency>
      <groupId>org.json</groupId>
      <artifactId>json</artifactId>
      <version>20230227</version>
    </dependency>
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.10.1</version>
    </dependency>

    <!-- HtmlUnit -->
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.61.0</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Compiler plugin: keeps the Java 8 configuration, taken from the properties above. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.1</version>
        <configuration>
          <source>${maven.compiler.source}</source>
          <target>${maven.compiler.target}</target>
          <encoding>${project.build.sourceEncoding}</encoding>
        </configuration>
      </plugin>

      <!-- Assembly plugin: packages an executable JAR that bundles all dependencies. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>3.3.0</version>
        <configuration>
          <archive>
            <manifest>
              <!-- Replace with the fully qualified name of your main class. -->
              <mainClass>com.example.projTopic</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <!-- Bind the "single" goal to "package" so the fat JAR is built with the regular one. -->
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>

281
bin/processed_urls.txt

@ -0,0 +1,281 @@
https://www.zyctd.com/zixun/201/1055143.html
https://www.zyctd.com/zixun/201/861786.html
https://www.zyctd.com/zixun/201/1053482.html
https://www.zyctd.com/zixun/201/269419.html
https://www.zyctd.com/zixun/201/1053149.html
https://www.zyctd.com/zixun/201/1023926.html
https://www.zyctd.com/zixun/201/435325.html
https://www.zyctd.com/zixun/201/1050302.html
https://www.zyctd.com/zixun/201/880441.html
https://www.zyctd.com/zixun/201/1019635.html
https://www.zyctd.com/zixun/201/970572.html
https://www.zyctd.com/zixun/201/912277.html
https://www.zyctd.com/zixun/201/372444.html
https://www.zyctd.com/zixun/201/1073629.html
https://www.zyctd.com/zixun/201/1069386.html
https://www.zyctd.com/zixun/201/730410.html
https://www.zyctd.com/zixun/201/953220.html
https://www.zyctd.com/zixun/201/1074339.html
https://www.zyctd.com/zixun/201/1072317.html
https://www.zyctd.com/zixun/201/294794.html
https://www.zyctd.com/zixun/201/267592.html
https://www.zyctd.com/zixun/201/979665.html
https://www.zyctd.com/zixun/201/869885.html
https://www.zyctd.com/zixun/201/1054064.html
https://www.zyctd.com/zixun/201/1049331.html
https://www.zyctd.com/zixun/201/442647.html
https://www.zyctd.com/zixun/201/285992.html
https://www.zyctd.com/zixun/201/1037972.html
https://www.zyctd.com/zixun/201/799801.html
https://www.zyctd.com/zixun/201/916078.html
https://www.zyctd.com/zixun/201/456647.html
https://www.zyctd.com/zixun/201/812121.html
https://www.zyctd.com/zixun/201/1042740.html
https://www.zyctd.com/zixun/201/1042708.html
https://www.zyctd.com/zixun/201/840450.html
https://www.zyctd.com/zixun/201/320749.html
https://www.zyctd.com/zixun/201/496106.html
https://www.zyctd.com/zixun/201/850201.html
https://www.zyctd.com/zixun/201/277145.html
https://www.zyctd.com/zixun/201/299091.html
https://www.zyctd.com/zixun/201/266080.html
https://www.zyctd.com/zixun/201/1051925.html
https://www.zyctd.com/zixun/201/898081.html
https://www.zyctd.com/zixun/201/873280.html
https://www.zyctd.com/zixun/201/703880.html
https://www.zyctd.com/zixun/201/873126.html
https://www.zyctd.com/zixun/201/887931.html
https://www.zyctd.com/zixun/201/432742.html
https://www.zyctd.com/zixun/201/1040431.html
https://www.zyctd.com/zixun/201/1040223.html
https://www.zyctd.com/zixun/201/858118.html
https://www.zyctd.com/zixun/201/971286.html
https://www.zyctd.com/zixun/201/458488.html
https://www.zyctd.com/zixun/201/1079381.html
https://www.zyctd.com/zixun/201/263578.html
https://www.zyctd.com/zixun/201/553513.html
https://www.zyctd.com/zixun/201/286229.html
https://www.zyctd.com/zixun/201/285365.html
https://www.zyctd.com/zixun/201/352921.html
https://www.zyctd.com/zixun/201/503267.html
https://www.zyctd.com/zixun/201/391337.html
https://www.zyctd.com/zixun/201/813052.html
https://www.zyctd.com/zixun/201/1053556.html
https://www.zyctd.com/zixun/201/1041197.html
https://www.zyctd.com/zixun/201/287420.html
https://www.zyctd.com/zixun/201/291563.html
https://www.zyctd.com/zixun/201/948250.html
https://www.zyctd.com/zixun/201/289034.html
https://www.zyctd.com/zixun/201/795965.html
https://www.zyctd.com/zixun/201/292962.html
https://www.zyctd.com/zixun/201/975850.html
https://www.zyctd.com/zixun/201/275335.html
https://www.zyctd.com/zixun/201/1031992.html
https://www.zyctd.com/zixun/201/1033886.html
https://www.zyctd.com/zixun/201/999510.html
https://www.zyctd.com/zixun/201/270144.html
https://www.zyctd.com/zixun/201/1055519.html
https://www.zyctd.com/zixun/201/272205.html
https://www.zyctd.com/zixun/201/526059.html
https://www.zyctd.com/zixun/201/456640.html
https://www.zyctd.com/zixun/201/267952.html
https://www.zyctd.com/zixun/201/803469.html
https://www.zyctd.com/zixun/201/270763.html
https://www.zyctd.com/zixun/201/1072987.html
https://www.zyctd.com/zixun/201/265176.html
https://www.zyctd.com/zixun/201/1022141.html
https://www.zyctd.com/zixun/201/290173.html
https://www.zyctd.com/zixun/201/269175.html
https://www.zyctd.com/zixun/201/744991.html
https://www.zyctd.com/zixun/201/1019131.html
https://www.zyctd.com/zixun/201/717054.html
https://www.zyctd.com/zixun/201/517358.html
https://www.zyctd.com/zixun/201/1058505.html
https://www.zyctd.com/zixun/201/905515.html
https://www.zyctd.com/zixun/201/287395.html
https://www.zyctd.com/zixun/201/934873.html
https://www.zyctd.com/zixun/201/1051317.html
https://www.zyctd.com/zixun/201/926018.html
https://www.zyctd.com/zixun/201/334511.html
https://www.zyctd.com/zixun/201/845896.html
https://www.zyctd.com/zixun/201/587785.html
https://www.zyctd.com/zixun/201/288376.html
https://www.zyctd.com/zixun/201/851405.html
https://www.zyctd.com/zixun/201/941404.html
https://www.zyctd.com/zixun/201/881855.html
https://www.zyctd.com/zixun/201/602632.html
https://www.zyctd.com/zixun/201/293601.html
https://www.zyctd.com/zixun/201/541809.html
https://www.zyctd.com/zixun/201/335120.html
https://www.zyctd.com/zixun/201/1031137.html
https://www.zyctd.com/zixun/201/960101.html
https://www.zyctd.com/zixun/201/1077142.html
https://www.zyctd.com/zixun/201/1063222.html
https://www.zyctd.com/zixun/201/681466.html
https://www.zyctd.com/zixun/201/1031130.html
https://www.zyctd.com/zixun/201/1073734.html
https://www.zyctd.com/zixun/201/1062186.html
https://www.zyctd.com/zixun/201/1046628.html
https://www.zyctd.com/zixun/201/358892.html
https://www.zyctd.com/zixun/201/285361.html
https://www.zyctd.com/zixun/201/1059889.html
https://www.zyctd.com/zixun/201/297824.html
https://www.zyctd.com/zixun/201/844307.html
https://www.zyctd.com/zixun/201/900524.html
https://www.zyctd.com/zixun/201/1057636.html
https://www.zyctd.com/zixun/201/1010080.html
https://www.zyctd.com/zixun/201/409152.html
https://www.zyctd.com/zixun/201/402782.html
https://www.zyctd.com/zixun/201/770296.html
https://www.zyctd.com/zixun/201/1040602.html
https://www.zyctd.com/zixun/201/606503.html
https://www.zyctd.com/zixun/201/784471.html
https://www.zyctd.com/zixun/201/466097.html
https://www.zyctd.com/zixun/201/1071160.html
https://www.zyctd.com/zixun/201/623226.html
https://www.zyctd.com/zixun/201/948264.html
https://www.zyctd.com/zixun/201/293462.html
https://www.zyctd.com/zixun/201/829348.html
https://www.zyctd.com/zixun/201/332369.html
https://www.zyctd.com/zixun/201/907461.html
https://www.zyctd.com/zixun/201/756555.html
https://www.zyctd.com/zixun/201/717915.html
https://www.zyctd.com/zixun/201/262203.html
https://www.zyctd.com/zixun/201/1055787.html
https://www.zyctd.com/zixun/201/432336.html
https://www.zyctd.com/zixun/201/907489.html
https://www.zyctd.com/zixun/201/1014686.html
https://www.zyctd.com/zixun/201/1053320.html
https://www.zyctd.com/zixun/201/480020.html
https://www.zyctd.com/zixun/201/287423.html
https://www.zyctd.com/zixun/201/385289.html
https://www.zyctd.com/zixun/201/1030421.html
https://www.zyctd.com/zixun/201/527648.html
https://www.zyctd.com/zixun/201/972959.html
https://www.zyctd.com/zixun/201/408767.html
https://www.zyctd.com/zixun/201/724887.html
https://www.zyctd.com/zixun/201/291480.html
https://www.zyctd.com/zixun/201/472544.html
https://www.zyctd.com/zixun/201/724873.html
https://www.zyctd.com/zixun/201/281751.html
https://www.zyctd.com/zixun/201/1049693.html
https://www.zyctd.com/zixun/201/869619.html
https://www.zyctd.com/zixun/201/355497.html
https://www.zyctd.com/zixun/201/341623.html
https://www.zyctd.com/zixun/201/450753.html
https://www.zyctd.com/zixun/201/1065837.html
https://www.zyctd.com/zixun/201/1031331.html
https://www.zyctd.com/zixun/201/669727.html
https://www.zyctd.com/zixun/201/1034010.html
https://www.zyctd.com/zixun/201/1054058.html
https://www.zyctd.com/zixun/201/954613.html
https://www.zyctd.com/zixun/201/715584.html
https://www.zyctd.com/zixun/201/1051110.html
https://www.zyctd.com/zixun/201/269963.html
https://www.zyctd.com/zixun/201/1048128.html
https://www.zyctd.com/zixun/201/793207.html
https://www.zyctd.com/zixun/201/284310.html
https://www.zyctd.com/zixun/201/282639.html
https://www.zyctd.com/zixun/201/1068138.html
https://www.zyctd.com/zixun/201/340678.html
https://www.zyctd.com/zixun/201/294371.html
https://www.zyctd.com/zixun/201/324277.html
https://www.zyctd.com/zixun/201/1048931.html
https://www.zyctd.com/zixun/201/851398.html
https://www.zyctd.com/zixun/201/263527.html
https://www.zyctd.com/zixun/201/919480.html
https://www.zyctd.com/zixun/201/685442.html
https://www.zyctd.com/zixun/201/428325.html
https://www.zyctd.com/zixun/201/1032698.html
https://www.zyctd.com/zixun/201/1003367.html
https://www.zyctd.com/zixun/201/852315.html
https://www.zyctd.com/zixun/201/283156.html
https://www.zyctd.com/zixun/201/262484.html
https://www.zyctd.com/zixun/201/1065225.html
https://www.zyctd.com/zixun/201/763331.html
https://www.zyctd.com/zixun/201/1066158.html
https://www.zyctd.com/zixun/201/1047744.html
https://www.zyctd.com/zixun/201/842795.html
https://www.zyctd.com/zixun/201/975374.html
https://www.zyctd.com/zixun/201/1055865.html
https://www.zyctd.com/zixun/201/1017367.html
https://www.zyctd.com/zixun/201/1057711.html
https://www.zyctd.com/zixun/201/1074295.html
https://www.zyctd.com/zixun/201/283647.html
https://www.zyctd.com/zixun/201/286896.html
https://www.zyctd.com/zixun/201/1043393.html
https://www.zyctd.com/zixun/201/305888.html
https://www.zyctd.com/zixun/201/487258.html
https://www.zyctd.com/zixun/201/1045652.html
https://www.zyctd.com/zixun/201/1064905.html
https://www.zyctd.com/zixun/201/515636.html
https://www.zyctd.com/zixun/201/1038609.html
https://www.zyctd.com/zixun/201/438083.html
https://www.zyctd.com/zixun/201/297327.html
https://www.zyctd.com/zixun/201/773537.html
https://www.zyctd.com/zixun/201/1043589.html
https://www.zyctd.com/zixun/201/815712.html
https://www.zyctd.com/zixun/201/698595.html
https://www.zyctd.com/zixun/201/269800.html
https://www.zyctd.com/zixun/201/1030332.html
https://www.zyctd.com/zixun/201/422676.html
https://www.zyctd.com/zixun/201/290130.html
https://www.zyctd.com/zixun/201/270359.html
https://www.zyctd.com/zixun/201/995604.html
https://www.zyctd.com/zixun/201/1074993.html
https://www.zyctd.com/zixun/201/1054825.html
https://www.zyctd.com/zixun/201/918577.html
https://www.zyctd.com/zixun/201/686527.html
https://www.zyctd.com/zixun/201/297509.html
https://www.zyctd.com/zixun/201/622708.html
https://www.zyctd.com/zixun/201/469870.html
https://www.zyctd.com/zixun/201/844328.html
https://www.zyctd.com/zixun/201/394508.html
https://www.zyctd.com/zixun/201/271744.html
https://www.zyctd.com/zixun/201/1054940.html
https://www.zyctd.com/zixun/201/732818.html
https://www.zyctd.com/zixun/201/1049547.html
https://www.zyctd.com/zixun/201/1059684.html
https://www.zyctd.com/zixun/201/1055301.html
https://www.zyctd.com/zixun/201/962068.html
https://www.zyctd.com/zixun/201/451355.html
https://www.zyctd.com/zixun/201/1056174.html
https://www.zyctd.com/zixun/201/930540.html
https://www.zyctd.com/zixun/201/871656.html
https://www.zyctd.com/zixun/201/363246.html
https://www.zyctd.com/zixun/201/845672.html
https://www.zyctd.com/zixun/201/452965.html
https://www.zyctd.com/zixun/201/1065920.html
https://www.zyctd.com/zixun/201/1058808.html
https://www.zyctd.com/zixun/201/986868.html
https://www.zyctd.com/zixun/201/489785.html
https://www.zyctd.com/zixun/201/307946.html
https://www.zyctd.com/zixun/201/833359.html
https://www.zyctd.com/zixun/201/806969.html
https://www.zyctd.com/zixun/201/1050812.html
https://www.zyctd.com/zixun/201/1033696.html
https://www.zyctd.com/zixun/201/501167.html
https://www.zyctd.com/zixun/201/1078919.html
https://www.zyctd.com/zixun/201/1036495.html
https://www.zyctd.com/zixun/201/1008736.html
https://www.zyctd.com/zixun/201/1054264.html
https://www.zyctd.com/zixun/201/493152.html
https://www.zyctd.com/zixun/201/685456.html
https://www.zyctd.com/zixun/201/995597.html
https://www.zyctd.com/zixun/201/905501.html
https://www.zyctd.com/zixun/201/347573.html
https://www.zyctd.com/zixun/201/1045494.html
https://www.zyctd.com/zixun/201/549775.html
https://www.zyctd.com/zixun/201/1037336.html
https://www.zyctd.com/zixun/201/1034972.html
https://www.zyctd.com/zixun/201/653046.html
https://www.zyctd.com/zixun/201/316612.html
https://www.zyctd.com/zixun/201/447064.html
https://www.zyctd.com/zixun/201/307603.html
https://www.zyctd.com/zixun/201/263437.html
https://www.zyctd.com/zixun/201/894490.html
https://www.zyctd.com/zixun/201/368629.html
https://www.zyctd.com/zixun/201/273285.html
https://www.zyctd.com/zixun/201/1059618.html
https://www.zyctd.com/zixun/201/459237.html

1
bin/proxy.txt

@ -0,0 +1 @@
127.0.0.1:7897

BIN
bin/src/main/java/com/example/Inka.class

BIN
bin/src/main/java/com/example/NSFAwardCrawler.class

BIN
bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class

BIN
bin/src/main/java/com/example/ProxyIPChecker.class

BIN
bin/src/main/java/com/example/StringFieldExtractor.class

BIN
bin/src/main/java/com/example/getInKa.class

BIN
bin/src/main/java/com/example/jsonGetOk.class

BIN
bin/src/main/java/com/example/ook.class

BIN
bin/src/main/java/com/example/oook.class

BIN
bin/src/main/java/com/example/projTopic.class

BIN
bin/src/main/java/com/example/saveInES.class

BIN
bin/src/main/java/com/example/test.class

BIN
bin/src/main/java/com/example/test2.class

BIN
bin/src/main/java/com/example/testContent.class

BIN
bin/src/main/java/com/example/umlistTest.class

4
bin/target/classes/META-INF/MANIFEST.MF

@ -0,0 +1,4 @@
Manifest-Version: 1.0
Build-Jdk-Spec: 22
Created-By: Maven Integration for Eclipse

BIN
bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar

BIN
bin/target/es-crawler-1.0-SNAPSHOT.jar

5
bin/target/maven-archiver/pom.properties

@ -0,0 +1,5 @@
#Generated by Maven
#Fri Apr 18 18:29:46 CST 2025
version=1.0-SNAPSHOT
groupId=com.example
artifactId=es-crawler

1
bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst

@ -0,0 +1 @@
com\example\projTopic.class

1
bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

@ -0,0 +1 @@
F:\workTest\DaKaES\src\main\java\com\example\projTopic.java

0
bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst

0
bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst

167
hs_err_pid15760.log

@ -0,0 +1,167 @@
#
# There is insufficient memory for the Java Runtime Environment to continue.
# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap
# Possible reasons:
# The system is out of physical RAM or swap space
# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap
# Possible solutions:
# Reduce memory load on the system
# Increase physical memory or swap space
# Check if swap backing store is full
# Decrease Java heap size (-Xmx/-Xms)
# Decrease number of Java threads
# Decrease Java thread stack sizes (-Xss)
# Set larger code cache with -XX:ReservedCodeCacheSize=
# JVM is running with Zero Based Compressed Oops mode in which the Java heap is
# placed in the first 32GB address space. The Java Heap base address is the
# maximum limit for the native heap growth. Please use -XX:HeapBaseMinAddress
# to set the Java Heap base and to place the Java Heap above 32GB virtual address.
# This output file may be truncated or incomplete.
#
# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334
#
# JRE version: (8.0_422-b05) (build )
# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops)
# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows
#
--------------- T H R E A D ---------------
Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
Stack: [0x00000082a1500000,0x00000082a1600000]
[error occurred during error reporting (printing stack bounds), id 0xc0000005]
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
--------------- P R O C E S S ---------------
Java Threads: ( => current thread )
Other Threads:
=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
VM state:not at safepoint (normal execution)
VM Mutex/Monitor currently owned by a thread: None
heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3
Narrow klass base: 0x0000000000000000, Narrow klass shift: 3
Compressed class space size: 1073741824 Address: 0x00000007c0000000
Heap:
PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000)
eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000)
from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000)
to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000)
ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000)
object space 172032K, 0% used [0x00000006c4000000,0x00000006c4000000,0x00000006ce800000)
Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K
class space used 76K, capacity 384K, committed 384K, reserved 1048576K
Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000
Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0
Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000)
End Bits: [0x00000271cd7a0000, 0x00000271d16a0000)
Polling page: 0x00000271b7eb0000
CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb
bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000]
total_blobs=57 nmethods=0 adapters=38
compilation: enabled
Compilation events (0 events):
No events
GC Heap History (0 events):
No events
Deoptimization events (0 events):
No events
Classes redefined (0 events):
No events
Internal exceptions (0 events):
No events
Events (10 events):
Event: 0.012 loading class java/lang/Short
Event: 0.013 loading class java/lang/Short done
Event: 0.013 loading class java/lang/Integer
Event: 0.013 loading class java/lang/Integer done
Event: 0.013 loading class java/lang/Long
Event: 0.013 loading class java/lang/Long done
Event: 0.013 loading class java/lang/NullPointerException
Event: 0.013 loading class java/lang/NullPointerException done
Event: 0.013 loading class java/lang/ArithmeticException
Event: 0.013 loading class java/lang/ArithmeticException done
Dynamic libraries:
0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe
0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll
0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL
0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll
0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll
0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll
0x00007ffa1cdf0000 - 0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll
0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll
0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll
0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll
0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll
0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll
0x00007ffa1ac80000 - 0x00007ffa1ad9a000 C:\Windows\System32\gdi32full.dll
0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll
0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll
0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll
0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL
0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll
0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll
0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll
0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL
0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll
0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll
0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll
0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll
0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll
0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll
0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll
0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll
0x00007ffa0af80000 - 0x00007ffa0af89000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll
0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll
0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll
VM Arguments:
jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8
java_command: com.example.saveInES
java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.jar;C:\Users\18264\.m2\repository\c
Launcher Type: SUN_STANDARD
Environment Variables:
JAVA_HOME=E:\java
PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node
USERNAME=18264
OS=Windows_NT
PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
--------------- S Y S T E M ---------------
OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438)
CPU:total 16 (initial active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx
Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free)
vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017)
time: Tue Mar 4 14:31:48 2025
timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
elapsed time: 0.022707 seconds (0d 0h 0m 0s)

1045
keywords.txt
File diff suppressed because it is too large
View File

BIN
original_captcha.png

After

Width: 80  |  Height: 30  |  Size: 5.7 KiB

150
pom.xml

@ -0,0 +1,150 @@
<!--
  Maven build for the es-crawler project.

  Declares the crawler's third-party dependencies, compiles at the Java 8
  language level, and (during the "package" phase) also produces an
  executable "jar-with-dependencies" archive whose manifest points at the
  configured main class.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.example</groupId>
  <artifactId>es-crawler</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <!-- Single source of truth for the Java language level. -->
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <!-- Pin the encoding so the build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
  </properties>

  <dependencies>
    <!-- Elasticsearch High Level REST Client -->
    <dependency>
      <groupId>org.elasticsearch.client</groupId>
      <artifactId>elasticsearch-rest-high-level-client</artifactId>
      <version>7.17.0</version>
    </dependency>
    <!-- Elasticsearch Java API client -->
    <!-- NOTE(review): 7.17.15 here vs 7.17.0 for the high-level client above; confirm the version mix is intentional. -->
    <dependency>
      <groupId>co.elastic.clients</groupId>
      <artifactId>elasticsearch-java</artifactId>
      <version>7.17.15</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.15.0</version>
    </dependency>

    <!-- Jsoup HTML parser -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.17.2</version>
    </dependency>

    <!-- OkHttp -->
    <dependency>
      <groupId>com.squareup.okhttp3</groupId>
      <artifactId>okhttp</artifactId>
      <version>4.9.3</version>
    </dependency>

    <!-- Logging -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.36</version>
    </dependency>
    <dependency>
      <groupId>ch.qos.logback</groupId>
      <artifactId>logback-classic</artifactId>
      <version>1.2.11</version>
    </dependency>

    <!-- Kafka client -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>3.9.0</version>
    </dependency>

    <!-- Selenium Java -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>4.10.0</version>
    </dependency>

    <!-- WebDriver Manager -->
    <dependency>
      <groupId>io.github.bonigarcia</groupId>
      <artifactId>webdrivermanager</artifactId>
      <version>5.6.2</version>
    </dependency>

    <!-- JSON libraries -->
    <dependency>
      <groupId>org.json</groupId>
      <artifactId>json</artifactId>
      <version>20230227</version>
    </dependency>
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.10.1</version>
    </dependency>

    <!-- HtmlUnit -->
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.61.0</version>
    </dependency>

    <!-- Tess4J -->
    <dependency>
      <groupId>net.sourceforge.tess4j</groupId>
      <artifactId>tess4j</artifactId>
      <version>4.5.4</version>
    </dependency>

    <!-- Apache HttpClient 5 -->
    <dependency>
      <groupId>org.apache.httpcomponents.client5</groupId>
      <artifactId>httpclient5</artifactId>
      <version>5.3.1</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Compiler plugin: keeps the Java 8 configuration, taken from the properties above. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.8.1</version>
        <configuration>
          <source>${maven.compiler.source}</source>
          <target>${maven.compiler.target}</target>
          <encoding>${project.build.sourceEncoding}</encoding>
        </configuration>
      </plugin>

      <!-- Assembly plugin: packages an executable JAR that bundles all dependencies. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>3.3.0</version>
        <configuration>
          <archive>
            <manifest>
              <!-- Replace with the fully qualified name of your main class. -->
              <mainClass>com.example.CtriScraper</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <!-- Bind the "single" goal to "package" so the fat JAR is built with the regular one. -->
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>

BIN
preprocessed_captcha.png

After

Width: 80  |  Height: 30  |  Size: 398 B

281
processed_urls.txt

@ -0,0 +1,281 @@
https://www.zyctd.com/zixun/201/1055143.html
https://www.zyctd.com/zixun/201/861786.html
https://www.zyctd.com/zixun/201/1053482.html
https://www.zyctd.com/zixun/201/269419.html
https://www.zyctd.com/zixun/201/1053149.html
https://www.zyctd.com/zixun/201/1023926.html
https://www.zyctd.com/zixun/201/435325.html
https://www.zyctd.com/zixun/201/1050302.html
https://www.zyctd.com/zixun/201/880441.html
https://www.zyctd.com/zixun/201/1019635.html
https://www.zyctd.com/zixun/201/970572.html
https://www.zyctd.com/zixun/201/912277.html
https://www.zyctd.com/zixun/201/372444.html
https://www.zyctd.com/zixun/201/1073629.html
https://www.zyctd.com/zixun/201/1069386.html
https://www.zyctd.com/zixun/201/730410.html
https://www.zyctd.com/zixun/201/953220.html
https://www.zyctd.com/zixun/201/1074339.html
https://www.zyctd.com/zixun/201/1072317.html
https://www.zyctd.com/zixun/201/294794.html
https://www.zyctd.com/zixun/201/267592.html
https://www.zyctd.com/zixun/201/979665.html
https://www.zyctd.com/zixun/201/869885.html
https://www.zyctd.com/zixun/201/1054064.html
https://www.zyctd.com/zixun/201/1049331.html
https://www.zyctd.com/zixun/201/442647.html
https://www.zyctd.com/zixun/201/285992.html
https://www.zyctd.com/zixun/201/1037972.html
https://www.zyctd.com/zixun/201/799801.html
https://www.zyctd.com/zixun/201/916078.html
https://www.zyctd.com/zixun/201/456647.html
https://www.zyctd.com/zixun/201/812121.html
https://www.zyctd.com/zixun/201/1042740.html
https://www.zyctd.com/zixun/201/1042708.html
https://www.zyctd.com/zixun/201/840450.html
https://www.zyctd.com/zixun/201/320749.html
https://www.zyctd.com/zixun/201/496106.html
https://www.zyctd.com/zixun/201/850201.html
https://www.zyctd.com/zixun/201/277145.html
https://www.zyctd.com/zixun/201/299091.html
https://www.zyctd.com/zixun/201/266080.html
https://www.zyctd.com/zixun/201/1051925.html
https://www.zyctd.com/zixun/201/898081.html
https://www.zyctd.com/zixun/201/873280.html
https://www.zyctd.com/zixun/201/703880.html
https://www.zyctd.com/zixun/201/873126.html
https://www.zyctd.com/zixun/201/887931.html
https://www.zyctd.com/zixun/201/432742.html
https://www.zyctd.com/zixun/201/1040431.html
https://www.zyctd.com/zixun/201/1040223.html
https://www.zyctd.com/zixun/201/858118.html
https://www.zyctd.com/zixun/201/971286.html
https://www.zyctd.com/zixun/201/458488.html
https://www.zyctd.com/zixun/201/1079381.html
https://www.zyctd.com/zixun/201/263578.html
https://www.zyctd.com/zixun/201/553513.html
https://www.zyctd.com/zixun/201/286229.html
https://www.zyctd.com/zixun/201/285365.html
https://www.zyctd.com/zixun/201/352921.html
https://www.zyctd.com/zixun/201/503267.html
https://www.zyctd.com/zixun/201/391337.html
https://www.zyctd.com/zixun/201/813052.html
https://www.zyctd.com/zixun/201/1053556.html
https://www.zyctd.com/zixun/201/1041197.html
https://www.zyctd.com/zixun/201/287420.html
https://www.zyctd.com/zixun/201/291563.html
https://www.zyctd.com/zixun/201/948250.html
https://www.zyctd.com/zixun/201/289034.html
https://www.zyctd.com/zixun/201/795965.html
https://www.zyctd.com/zixun/201/292962.html
https://www.zyctd.com/zixun/201/975850.html
https://www.zyctd.com/zixun/201/275335.html
https://www.zyctd.com/zixun/201/1031992.html
https://www.zyctd.com/zixun/201/1033886.html
https://www.zyctd.com/zixun/201/999510.html
https://www.zyctd.com/zixun/201/270144.html
https://www.zyctd.com/zixun/201/1055519.html
https://www.zyctd.com/zixun/201/272205.html
https://www.zyctd.com/zixun/201/526059.html
https://www.zyctd.com/zixun/201/456640.html
https://www.zyctd.com/zixun/201/267952.html
https://www.zyctd.com/zixun/201/803469.html
https://www.zyctd.com/zixun/201/270763.html
https://www.zyctd.com/zixun/201/1072987.html
https://www.zyctd.com/zixun/201/265176.html
https://www.zyctd.com/zixun/201/1022141.html
https://www.zyctd.com/zixun/201/290173.html
https://www.zyctd.com/zixun/201/269175.html
https://www.zyctd.com/zixun/201/744991.html
https://www.zyctd.com/zixun/201/1019131.html
https://www.zyctd.com/zixun/201/717054.html
https://www.zyctd.com/zixun/201/517358.html
https://www.zyctd.com/zixun/201/1058505.html
https://www.zyctd.com/zixun/201/905515.html
https://www.zyctd.com/zixun/201/287395.html
https://www.zyctd.com/zixun/201/934873.html
https://www.zyctd.com/zixun/201/1051317.html
https://www.zyctd.com/zixun/201/926018.html
https://www.zyctd.com/zixun/201/334511.html
https://www.zyctd.com/zixun/201/845896.html
https://www.zyctd.com/zixun/201/587785.html
https://www.zyctd.com/zixun/201/288376.html
https://www.zyctd.com/zixun/201/851405.html
https://www.zyctd.com/zixun/201/941404.html
https://www.zyctd.com/zixun/201/881855.html
https://www.zyctd.com/zixun/201/602632.html
https://www.zyctd.com/zixun/201/293601.html
https://www.zyctd.com/zixun/201/541809.html
https://www.zyctd.com/zixun/201/335120.html
https://www.zyctd.com/zixun/201/1031137.html
https://www.zyctd.com/zixun/201/960101.html
https://www.zyctd.com/zixun/201/1077142.html
https://www.zyctd.com/zixun/201/1063222.html
https://www.zyctd.com/zixun/201/681466.html
https://www.zyctd.com/zixun/201/1031130.html
https://www.zyctd.com/zixun/201/1073734.html
https://www.zyctd.com/zixun/201/1062186.html
https://www.zyctd.com/zixun/201/1046628.html
https://www.zyctd.com/zixun/201/358892.html
https://www.zyctd.com/zixun/201/285361.html
https://www.zyctd.com/zixun/201/1059889.html
https://www.zyctd.com/zixun/201/297824.html
https://www.zyctd.com/zixun/201/844307.html
https://www.zyctd.com/zixun/201/900524.html
https://www.zyctd.com/zixun/201/1057636.html
https://www.zyctd.com/zixun/201/1010080.html
https://www.zyctd.com/zixun/201/409152.html
https://www.zyctd.com/zixun/201/402782.html
https://www.zyctd.com/zixun/201/770296.html
https://www.zyctd.com/zixun/201/1040602.html
https://www.zyctd.com/zixun/201/606503.html
https://www.zyctd.com/zixun/201/784471.html
https://www.zyctd.com/zixun/201/466097.html
https://www.zyctd.com/zixun/201/1071160.html
https://www.zyctd.com/zixun/201/623226.html
https://www.zyctd.com/zixun/201/948264.html
https://www.zyctd.com/zixun/201/293462.html
https://www.zyctd.com/zixun/201/829348.html
https://www.zyctd.com/zixun/201/332369.html
https://www.zyctd.com/zixun/201/907461.html
https://www.zyctd.com/zixun/201/756555.html
https://www.zyctd.com/zixun/201/717915.html
https://www.zyctd.com/zixun/201/262203.html
https://www.zyctd.com/zixun/201/1055787.html
https://www.zyctd.com/zixun/201/432336.html
https://www.zyctd.com/zixun/201/907489.html
https://www.zyctd.com/zixun/201/1014686.html
https://www.zyctd.com/zixun/201/1053320.html
https://www.zyctd.com/zixun/201/480020.html
https://www.zyctd.com/zixun/201/287423.html
https://www.zyctd.com/zixun/201/385289.html
https://www.zyctd.com/zixun/201/1030421.html
https://www.zyctd.com/zixun/201/527648.html
https://www.zyctd.com/zixun/201/972959.html
https://www.zyctd.com/zixun/201/408767.html
https://www.zyctd.com/zixun/201/724887.html
https://www.zyctd.com/zixun/201/291480.html
https://www.zyctd.com/zixun/201/472544.html
https://www.zyctd.com/zixun/201/724873.html
https://www.zyctd.com/zixun/201/281751.html
https://www.zyctd.com/zixun/201/1049693.html
https://www.zyctd.com/zixun/201/869619.html
https://www.zyctd.com/zixun/201/355497.html
https://www.zyctd.com/zixun/201/341623.html
https://www.zyctd.com/zixun/201/450753.html
https://www.zyctd.com/zixun/201/1065837.html
https://www.zyctd.com/zixun/201/1031331.html
https://www.zyctd.com/zixun/201/669727.html
https://www.zyctd.com/zixun/201/1034010.html
https://www.zyctd.com/zixun/201/1054058.html
https://www.zyctd.com/zixun/201/954613.html
https://www.zyctd.com/zixun/201/715584.html
https://www.zyctd.com/zixun/201/1051110.html
https://www.zyctd.com/zixun/201/269963.html
https://www.zyctd.com/zixun/201/1048128.html
https://www.zyctd.com/zixun/201/793207.html
https://www.zyctd.com/zixun/201/284310.html
https://www.zyctd.com/zixun/201/282639.html
https://www.zyctd.com/zixun/201/1068138.html
https://www.zyctd.com/zixun/201/340678.html
https://www.zyctd.com/zixun/201/294371.html
https://www.zyctd.com/zixun/201/324277.html
https://www.zyctd.com/zixun/201/1048931.html
https://www.zyctd.com/zixun/201/851398.html
https://www.zyctd.com/zixun/201/263527.html
https://www.zyctd.com/zixun/201/919480.html
https://www.zyctd.com/zixun/201/685442.html
https://www.zyctd.com/zixun/201/428325.html
https://www.zyctd.com/zixun/201/1032698.html
https://www.zyctd.com/zixun/201/1003367.html
https://www.zyctd.com/zixun/201/852315.html
https://www.zyctd.com/zixun/201/283156.html
https://www.zyctd.com/zixun/201/262484.html
https://www.zyctd.com/zixun/201/1065225.html
https://www.zyctd.com/zixun/201/763331.html
https://www.zyctd.com/zixun/201/1066158.html
https://www.zyctd.com/zixun/201/1047744.html
https://www.zyctd.com/zixun/201/842795.html
https://www.zyctd.com/zixun/201/975374.html
https://www.zyctd.com/zixun/201/1055865.html
https://www.zyctd.com/zixun/201/1017367.html
https://www.zyctd.com/zixun/201/1057711.html
https://www.zyctd.com/zixun/201/1074295.html
https://www.zyctd.com/zixun/201/283647.html
https://www.zyctd.com/zixun/201/286896.html
https://www.zyctd.com/zixun/201/1043393.html
https://www.zyctd.com/zixun/201/305888.html
https://www.zyctd.com/zixun/201/487258.html
https://www.zyctd.com/zixun/201/1045652.html
https://www.zyctd.com/zixun/201/1064905.html
https://www.zyctd.com/zixun/201/515636.html
https://www.zyctd.com/zixun/201/1038609.html
https://www.zyctd.com/zixun/201/438083.html
https://www.zyctd.com/zixun/201/297327.html
https://www.zyctd.com/zixun/201/773537.html
https://www.zyctd.com/zixun/201/1043589.html
https://www.zyctd.com/zixun/201/815712.html
https://www.zyctd.com/zixun/201/698595.html
https://www.zyctd.com/zixun/201/269800.html
https://www.zyctd.com/zixun/201/1030332.html
https://www.zyctd.com/zixun/201/422676.html
https://www.zyctd.com/zixun/201/290130.html
https://www.zyctd.com/zixun/201/270359.html
https://www.zyctd.com/zixun/201/995604.html
https://www.zyctd.com/zixun/201/1074993.html
https://www.zyctd.com/zixun/201/1054825.html
https://www.zyctd.com/zixun/201/918577.html
https://www.zyctd.com/zixun/201/686527.html
https://www.zyctd.com/zixun/201/297509.html
https://www.zyctd.com/zixun/201/622708.html
https://www.zyctd.com/zixun/201/469870.html
https://www.zyctd.com/zixun/201/844328.html
https://www.zyctd.com/zixun/201/394508.html
https://www.zyctd.com/zixun/201/271744.html
https://www.zyctd.com/zixun/201/1054940.html
https://www.zyctd.com/zixun/201/732818.html
https://www.zyctd.com/zixun/201/1049547.html
https://www.zyctd.com/zixun/201/1059684.html
https://www.zyctd.com/zixun/201/1055301.html
https://www.zyctd.com/zixun/201/962068.html
https://www.zyctd.com/zixun/201/451355.html
https://www.zyctd.com/zixun/201/1056174.html
https://www.zyctd.com/zixun/201/930540.html
https://www.zyctd.com/zixun/201/871656.html
https://www.zyctd.com/zixun/201/363246.html
https://www.zyctd.com/zixun/201/845672.html
https://www.zyctd.com/zixun/201/452965.html
https://www.zyctd.com/zixun/201/1065920.html
https://www.zyctd.com/zixun/201/1058808.html
https://www.zyctd.com/zixun/201/986868.html
https://www.zyctd.com/zixun/201/489785.html
https://www.zyctd.com/zixun/201/307946.html
https://www.zyctd.com/zixun/201/833359.html
https://www.zyctd.com/zixun/201/806969.html
https://www.zyctd.com/zixun/201/1050812.html
https://www.zyctd.com/zixun/201/1033696.html
https://www.zyctd.com/zixun/201/501167.html
https://www.zyctd.com/zixun/201/1078919.html
https://www.zyctd.com/zixun/201/1036495.html
https://www.zyctd.com/zixun/201/1008736.html
https://www.zyctd.com/zixun/201/1054264.html
https://www.zyctd.com/zixun/201/493152.html
https://www.zyctd.com/zixun/201/685456.html
https://www.zyctd.com/zixun/201/995597.html
https://www.zyctd.com/zixun/201/905501.html
https://www.zyctd.com/zixun/201/347573.html
https://www.zyctd.com/zixun/201/1045494.html
https://www.zyctd.com/zixun/201/549775.html
https://www.zyctd.com/zixun/201/1037336.html
https://www.zyctd.com/zixun/201/1034972.html
https://www.zyctd.com/zixun/201/653046.html
https://www.zyctd.com/zixun/201/316612.html
https://www.zyctd.com/zixun/201/447064.html
https://www.zyctd.com/zixun/201/307603.html
https://www.zyctd.com/zixun/201/263437.html
https://www.zyctd.com/zixun/201/894490.html
https://www.zyctd.com/zixun/201/368629.html
https://www.zyctd.com/zixun/201/273285.html
https://www.zyctd.com/zixun/201/1059618.html
https://www.zyctd.com/zixun/201/459237.html

1
proxy.txt

@ -0,0 +1 @@
127.0.0.1:7897

119
src/main/java/com/example/AusContent.java

@ -0,0 +1,119 @@
package com.example;
import okhttp3.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
/**
 * Stand-alone scraper for a single ANZCTR trial-review page.
 *
 * <p>{@link #main} downloads the page, reads each field through its element id or CSS
 * selector, and prints the collected values as a map on standard output.
 */
public class AusContent {
    /** Trial review page downloaded by {@link #main}. */
    private static final String TRIAL_URL =
            "https://www.anzctr.org.au/Trial/Registration/TrialReview.aspx?id=389345&isReview=true";

    /** Value returned by {@link #convertDate} when the input cannot be parsed. */
    private static final String INVALID_DATE = "Invalid date format";

    /**
     * Downloads {@link #TRIAL_URL}, extracts the trial fields and prints the result map.
     *
     * @param args unused
     * @throws IOException if the request fails, the server answers with a non-2xx status,
     *                     or the response has no body
     */
    public static void main(String[] args) throws IOException {
        OkHttpClient client = new OkHttpClient().newBuilder()
                .build();
        Request request = new Request.Builder()
                .url(TRIAL_URL)
                .get()
                .build();

        String html;
        // The response holds a pooled connection; it must be closed once the body is read.
        try (Response response = client.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                throw new IOException("Unexpected HTTP status " + response.code() + " for " + TRIAL_URL);
            }
            ResponseBody responseBody = response.body();
            if (responseBody == null) {
                throw new IOException("Empty response body for " + TRIAL_URL);
            }
            html = responseBody.string();
        }

        Document parse = Jsoup.parse(html);
        String title = text(parse, "#ctl00_body_CXSTUDYTITLE");
        String registNum = text(parse, "#ctl00_body_CXACTRNUMBER");
        String registTime = convertDate(text(parse, "#ctl00_body_CXAPPROVALDATE"));
        String sponsor = text(parse, "#ctl00_body_repeater_TXFUNDINGSOURCE_ctl00_CXTYPE");
        String studyType = text(parse, "#ctl00_body_CXSTUDYTYPE");
        String phase = text(parse, "#ctl00_body_CXPHASE");
        String disease = text(parse, "#ctl00_body_repeater_TXHEALTHCONDITION_ctl00_CXHEALTHCONDITION");
        String purpose = text(parse, "#ctl00_body_CXPURPOSE");
        String allocation = text(parse, "#ctl00_body_CXALLOCATION");
        String concealment = text(parse, "#ctl00_body_CXCONCEALMENT");
        String sequence = text(parse, "#ctl00_body_CXSEQUENCE");
        String masking = text(parse, "#ctl00_body_CXMASKING");
        String maskedParties = text(parse, "#ctl00_body_maskingdiv > div > div.review-element-content");
        String assignment = text(parse, "#ctl00_body_CXASSIGNMENT");
        String endpoint = text(parse, "#ctl00_body_CXENDPOINT");
        String statisticalMethods = text(parse, "#ctl00_body_CXSTATISTICALMETHODS");
        String otherDesignFeatures = text(parse,
                "#ctl00_body_interventional_div > div:nth-child(8) > div > div.review-element-content");
        String inclusionCriteria = text(parse, "#ctl00_body_CXINCLUSIVECRITERIA");
        String exclusionCriteria = text(parse, "#ctl00_body_CXEXCLUSIVECRITERIA");
        String currentStatus = text(parse, "#ctl00_body_CXRECRUITMENTSTATUS");
        String enrollment = text(parse, "#ctl00_body_CXSAMPLESIZE");
        String country = text(parse, "#ctl00_body_repeater_TXCOUNTRYOUTSIDEAUSTRALIA_ctl01_CXCOUNTRY");
        String intervention = text(parse,
                "#ctl00_body_trialDiv > div:nth-child(30) > div > div.review-element-content");

        // Study-design sub-document, keyed by the labels shown on the review page.
        Map<String, Object> studyDesign = new HashMap<>();
        studyDesign.put("Purpose of the study", purpose);
        studyDesign.put("Allocation to intervention", allocation);
        studyDesign.put("Procedure for enrolling a subject and allocating the treatment (allocation concealment procedures)", concealment);
        studyDesign.put("Methods used to generate the sequence in which subjects will be randomised (sequence generation)", sequence);
        studyDesign.put("Masking / blinding", masking);
        studyDesign.put("Who is / are masked / blinded?", maskedParties);
        studyDesign.put("Intervention assignment", assignment);
        studyDesign.put("Other design features", otherDesignFeatures);
        studyDesign.put("Phase", phase);
        studyDesign.put("Type of endpoint/s", endpoint);
        studyDesign.put("Statistical methods / analysis", statisticalMethods);

        // Fields with no source on this page are emitted as empty strings.
        Map<String, Object> resultData = new HashMap<>();
        resultData.put("title", title);
        resultData.put("registNum", registNum);
        resultData.put("registTime", registTime);
        resultData.put("registStatus", "");
        resultData.put("registTitle", "");
        resultData.put("fullTitle", "");
        resultData.put("sponsor", sponsor);
        resultData.put("sponsorPart", "");
        resultData.put("studyType", studyType);
        resultData.put("phase", phase);
        resultData.put("disease", disease);
        resultData.put("studyDesign", studyDesign);
        // The study objective is read from the same element as the study purpose.
        resultData.put("studyObjective", purpose);
        resultData.put("studyStartDate", "");
        resultData.put("inclusionCriteria", inclusionCriteria);
        resultData.put("exclusionCriteria", exclusionCriteria);
        resultData.put("currentStatus", currentStatus);
        resultData.put("enrollment", enrollment);
        resultData.put("country", country);
        resultData.put("tagTime", "");
        resultData.put("intervention", intervention);
        resultData.put("primaryOutcome", "");
        resultData.put("crawlTime", getCurrentTime());
        // resultData.put("crawlUrl",url);
        resultData.put("postTime", registTime);
        resultData.put("content", "content");
        resultData.put("forwardcontent", "forwardcontent");
        System.out.println(resultData);
    }

    /** Returns the combined text of all elements matching {@code cssQuery} ("" when none match). */
    private static String text(Document doc, String cssQuery) {
        return doc.select(cssQuery).text();
    }

    /**
     * Converts a {@code d/MM/yyyy} date into {@code yyyy-MM-dd HH:mm:ss} (time part is midnight).
     *
     * @param inputDate date text such as {@code 7/03/2024}; surrounding whitespace is ignored
     * @return the reformatted date, or the literal {@code "Invalid date format"} when the input
     *         is {@code null}, unparsable, or not a real calendar date
     */
    public static String convertDate(String inputDate) {
        if (inputDate == null) {
            return INVALID_DATE;
        }
        try {
            SimpleDateFormat inputFormat = new SimpleDateFormat("d/MM/yyyy");
            // Strict parsing: reject impossible dates (e.g. 32/01/2024) instead of rolling them over.
            inputFormat.setLenient(false);
            Date date = inputFormat.parse(inputDate.trim());
            SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            return outputFormat.format(date);
        } catch (ParseException e) {
            return INVALID_DATE;
        }
    }

    /**
     * Returns the current local date-time formatted as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @return the formatted timestamp
     */
    public static String getCurrentTime() {
        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        LocalDateTime now = LocalDateTime.now();
        return now.format(formatter);
    }
}

200
src/main/java/com/example/AusList.java

@ -0,0 +1,200 @@
package com.example;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Stand-alone crawler for one page of the ANZCTR trial search.
 *
 * <p>The search page is an ASP.NET WebForms page: {@link #main} first issues a GET to obtain
 * the session cookies and the hidden state fields, then replays them in an AJAX-style POST
 * that asks for the requested result page, and finally prints the trial links it finds.
 */
public class AusList {
    /** Search page; used for the initial GET, the POST, and as Referer. */
    private static final String SEARCH_URL = "https://www.anzctr.org.au/TrialSearch.aspx";
    /** Prefix prepended to the hrefs found in the result list. */
    private static final String SITE_ROOT = "https://www.anzctr.org.au";
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";
    /** Connect and read timeout for both requests, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;
    /** Matches the {@code page} query parameter wherever it appears in the query string. */
    private static final Pattern PAGE_PARAM = Pattern.compile("[?&]page=(\\d+)");

    /**
     * Fetches one result page of the trial search and prints every trial link on it.
     *
     * @param args unused
     * @throws Exception if a request fails or the hidden WebForms fields cannot be found
     */
    public static void main(String[] args) throws Exception {
        String targetUrl = "https://www.anzctr.org.au/TrialSearch.aspx?page=20";
        int page = parsePageNumber(targetUrl);
        System.out.println("Page Number: " + page);

        // Insertion-ordered so the Cookie header is built deterministically.
        Set<String> cookieSet = new LinkedHashSet<>();
        String sessionId;
        String searchPageHtml;

        // Step 1: GET the search page to obtain cookies and the hidden state fields.
        HttpURLConnection initialConn = (HttpURLConnection) new URL(SEARCH_URL).openConnection();
        try {
            initialConn.setRequestMethod("GET");
            initialConn.setRequestProperty("User-Agent", USER_AGENT);
            initialConn.setRequestProperty("Accept",
                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
            initialConn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7");
            initialConn.setRequestProperty("Cache-Control", "no-cache");
            initialConn.setRequestProperty("Pragma", "no-cache");
            initialConn.setRequestProperty("Upgrade-Insecure-Requests", "1");
            initialConn.setRequestProperty("Sec-Fetch-Dest", "document");
            initialConn.setRequestProperty("Sec-Fetch-Mode", "navigate");
            initialConn.setRequestProperty("Sec-Fetch-Site", "same-origin");
            initialConn.setRequestProperty("Sec-Fetch-User", "?1");
            initialConn.setRequestProperty("Sec-CH-UA",
                    "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"");
            initialConn.setRequestProperty("Sec-CH-UA-Mobile", "?0");
            initialConn.setRequestProperty("Sec-CH-UA-Platform", "\"Windows\"");
            initialConn.setInstanceFollowRedirects(false);
            initialConn.setConnectTimeout(TIMEOUT_MS);
            initialConn.setReadTimeout(TIMEOUT_MS);

            sessionId = updateCookies(initialConn, cookieSet);
            searchPageHtml = readBody(initialConn);
        } finally {
            initialConn.disconnect();
        }

        Map<String, String> viewStateData = extractViewStateData(searchPageHtml);
        String viewState = viewStateData.get("__VIEWSTATE");
        String viewStateGen = viewStateData.get("__VIEWSTATEGENERATOR");
        String eventValidation = viewStateData.get("__EVENTVALIDATION");
        if (viewState == null || viewStateGen == null || eventValidation == null) {
            // Without these fields the POST cannot be built; fail loudly instead of posting an empty body.
            throw new IllegalStateException(
                    "Hidden WebForms fields missing from " + SEARCH_URL + " (HTTP " + initialConn.getResponseCode() + ")");
        }
        String payload = buildPostData(viewState, eventValidation, viewStateGen, page, sessionId);

        // Step 2: replay the state in an AJAX partial-postback asking for the requested page.
        HttpURLConnection conn = (HttpURLConnection) new URL(SEARCH_URL).openConnection();
        try {
            conn.setRequestMethod("POST");
            conn.setDoOutput(true);
            conn.setInstanceFollowRedirects(false);
            conn.setConnectTimeout(TIMEOUT_MS);
            conn.setReadTimeout(TIMEOUT_MS);
            conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setRequestProperty("Accept", "*/*");
            conn.setRequestProperty("X-Requested-With", "XMLHttpRequest");
            conn.setRequestProperty("X-MicrosoftAjax", "Delta=true");
            conn.setRequestProperty("Referer", SEARCH_URL);
            conn.setRequestProperty("Origin", SITE_ROOT);
            if (!cookieSet.isEmpty()) {
                // Send back the cookies from step 1 so the server can match the posted state to its session.
                conn.setRequestProperty("Cookie", String.join("; ", cookieSet));
            }

            try (OutputStream os = conn.getOutputStream()) {
                os.write(payload.getBytes(StandardCharsets.UTF_8));
            }

            Document parse = Jsoup.parse(readBody(conn));
            Elements elements = parse.select(".results-header-tools a");
            for (Element element : elements) {
                System.out.println(SITE_ROOT + element.attr("href"));
            }
        } finally {
            conn.disconnect();
        }
    }

    /** Returns the {@code page} query parameter of {@code url}, or 1 when it is absent. */
    private static int parsePageNumber(String url) {
        Matcher matcher = PAGE_PARAM.matcher(url);
        return matcher.find() ? Integer.parseInt(matcher.group(1)) : 1;
    }

    /** Reads the whole response body as UTF-8 text, joining lines without separators. */
    private static String readBody(HttpURLConnection conn) throws java.io.IOException {
        StringBuilder body = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                body.append(line);
            }
        }
        return body.toString();
    }

    /**
     * Adds the {@code name=value} part of every {@code Set-Cookie} response header to
     * {@code cookieSet}.
     *
     * @param conn      connection whose response headers are inspected
     * @param cookieSet receives one {@code name=value} entry per cookie
     * @return the last {@code ASP.NET_SessionId=...} or {@code csfcfc=...} pair seen,
     *         or {@code null} if the response set neither
     */
    private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) {
        String sessionId = null;
        for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
            // The status line is stored under a null key; header names are case-insensitive.
            if (header.getKey() == null || !"Set-Cookie".equalsIgnoreCase(header.getKey())) {
                continue;
            }
            for (String cookie : header.getValue()) {
                // Limit 2 guarantees at least one element even for a degenerate ";" header.
                String cookieValue = cookie.split(";", 2)[0].trim();
                if (cookieValue.isEmpty()) {
                    continue;
                }
                cookieSet.add(cookieValue);
                if (cookieValue.startsWith("ASP.NET_SessionId=") || cookieValue.startsWith("csfcfc=")) {
                    sessionId = cookieValue;
                }
            }
        }
        return sessionId;
    }

    /**
     * Extracts the hidden WebForms state fields from a page.
     *
     * @param html page markup
     * @return map holding whichever of {@code __VIEWSTATE}, {@code __VIEWSTATEGENERATOR} and
     *         {@code __EVENTVALIDATION} were found; a message is written to stderr for each
     *         missing one
     */
    private static Map<String, String> extractViewStateData(String html) {
        Map<String, String> stateMap = new HashMap<>();
        extractHiddenField(html, "__VIEWSTATE", stateMap);
        extractHiddenField(html, "__VIEWSTATEGENERATOR", stateMap);
        extractHiddenField(html, "__EVENTVALIDATION", stateMap);
        for (String required : new String[] {"__VIEWSTATE", "__EVENTVALIDATION", "__VIEWSTATEGENERATOR"}) {
            if (!stateMap.containsKey(required)) {
                System.err.println("Failed to extract " + required + " from HTML");
            }
        }
        return stateMap;
    }

    /**
     * Stores the non-empty {@code value} attribute of the first {@code <input>} named
     * {@code fieldName} into {@code map}; does nothing when no such input is found.
     * Only inputs whose {@code name} attribute precedes {@code value} are matched.
     */
    private static void extractHiddenField(String html, String fieldName, Map<String, String> map) {
        // Quote the name so it is always matched literally, whatever characters it contains.
        String regex = "(?i)<input[^>]*name=[\"']" + Pattern.quote(fieldName) + "[\"'][^>]*value=[\"']([^\"']+)[\"']";
        Matcher matcher = Pattern.compile(regex).matcher(html);
        if (matcher.find()) {
            map.put(fieldName, matcher.group(1));
        }
    }

    /**
     * Builds the form-urlencoded body of the partial-postback request, with the fields in the
     * same order as the captured browser request.
     *
     * @param viewState       value of {@code __VIEWSTATE}; must not be null
     * @param eventValidation value of {@code __EVENTVALIDATION}; must not be null
     * @param viewStateGen    value of {@code __VIEWSTATEGENERATOR}; must not be null
     * @param page            result page to request
     * @param sessionId       currently unused; kept so existing callers keep compiling
     * @return the encoded request body
     * @throws NullPointerException if any of the three state values is null
     */
    private static String buildPostData(String viewState, String eventValidation, String viewStateGen, int page, String sessionId) {
        Objects.requireNonNull(viewState, "viewState");
        Objects.requireNonNull(eventValidation, "eventValidation");
        Objects.requireNonNull(viewStateGen, "viewStateGen");

        String eventArgument = "conditionCode=&dateOfRegistrationFrom=&interventionDescription=&interventionCodeOperator=OR&primarySponsorType=&gender=&distance=&postcode=&pageSize=20&ageGroup=&recruitmentCountryOperator=OR&recruitmentRegion=&ethicsReview=&countryOfRecruitment=&registry=&searchTxt=&studyType=&allocationToIntervention=&dateOfRegistrationTo=&recruitmentStatus=&interventionCode=&healthCondition=&healthyVolunteers=&page=" + page + "&conditionCategory=&fundingSource=&trialStartDateTo=&trialStartDateFrom=&phase=";

        StringBuilder payload = new StringBuilder();
        appendField(payload, "ctl00$body$tsmAJAXScriptManager", "ctl00$body$tsmAJAXScriptManager|ctl00$body$tsmAJAXScriptManager");
        appendField(payload, "ctl00_body_tsmAJAXScriptManager_HiddenField", "");
        appendField(payload, "__EVENTTARGET", "ctl00$body$tsmAJAXScriptManager");
        appendField(payload, "__EVENTARGUMENT", eventArgument);
        appendField(payload, "__LASTFOCUS", "");
        appendField(payload, "__VIEWSTATE", viewState);
        appendField(payload, "__VIEWSTATEGENERATOR", viewStateGen);
        appendField(payload, "__SCROLLPOSITIONX", "0");
        appendField(payload, "__SCROLLPOSITIONY", "0");
        appendField(payload, "__EVENTVALIDATION", eventValidation);
        appendField(payload, "__ASYNCPOST", "true");
        return payload.toString();
    }

    /** Appends {@code name=value} (both URL-encoded as UTF-8), preceded by {@code &} when needed. */
    private static void appendField(StringBuilder payload, String name, String value) {
        if (payload.length() > 0) {
            payload.append('&');
        }
        try {
            payload.append(URLEncoder.encode(name, StandardCharsets.UTF_8.name()))
                    .append('=')
                    .append(URLEncoder.encode(value, StandardCharsets.UTF_8.name()));
        } catch (java.io.UnsupportedEncodingException e) {
            // UTF-8 support is mandatory on every JVM, so this cannot happen in practice.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }
}

173
src/main/java/com/example/CaptchaOCR.java

@ -0,0 +1,173 @@
package com.example;
import java.awt.image.BufferedImage;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import javax.imageio.ImageIO;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
// ... 其他必要的导入 ...
/**
 * Downloads a CAPTCHA image, binarises it and runs Tesseract (via Tess4J) over it.
 */
public class CaptchaOCR {
    /**
     * Directory that contains the Tesseract {@code tessdata} files.
     *
     * <p>Defaults to the original hard-coded Windows install location; override it with the
     * {@code tessdata.path} system property (e.g. {@code -Dtessdata.path=/usr/share/tessdata}).
     * Setting the property to an empty string lets Tess4J locate tessdata on its own.
     */
    private static final String TESSDATA_PATH =
            System.getProperty("tessdata.path", "F:\\tool\\Tesseract-OCR\\tessdata");

    /** Connect and read timeout for the image download, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;

    /**
     * Downloads a CAPTCHA image over HTTP(S).
     *
     * @param imageUrl absolute URL of the image
     * @return the decoded image
     * @throws IOException if the URL is not HTTP(S), the server does not answer 200, or the
     *                     payload cannot be decoded as an image
     */
    public static BufferedImage downloadImage(String imageUrl) throws IOException {
        java.net.URLConnection rawConn = new URL(imageUrl).openConnection();
        if (!(rawConn instanceof HttpURLConnection)) {
            throw new IOException("Not an HTTP(S) URL: " + imageUrl);
        }
        HttpURLConnection conn = (HttpURLConnection) rawConn;
        try {
            conn.setRequestMethod("GET");
            // Without timeouts a stalled server would block the crawler indefinitely.
            conn.setConnectTimeout(TIMEOUT_MS);
            conn.setReadTimeout(TIMEOUT_MS);
            // Browser-like User-Agent.
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");

            int responseCode = conn.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                throw new IOException("Failed to download image. HTTP error code: " + responseCode);
            }
            try (InputStream is = conn.getInputStream()) {
                // Buffer the whole payload first, then decode it from memory.
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                byte[] buffer = new byte[4096];
                int bytesRead;
                while ((bytesRead = is.read(buffer)) != -1) {
                    baos.write(buffer, 0, bytesRead);
                }
                BufferedImage image = ImageIO.read(new ByteArrayInputStream(baos.toByteArray()));
                if (image == null) {
                    throw new IOException("Failed to read image from stream. Check image format.");
                }
                return image;
            }
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Basic preprocessing: converts the image to grayscale, then to black/white using a fixed
     * threshold of 128. As a debugging aid the result is also written to
     * {@code preprocessed_captcha.png} in the working directory; a failure to write that file
     * is reported on stderr but does not affect the returned image.
     *
     * <p>TODO: tune for the actual CAPTCHA style (noise/line removal, character
     * segmentation, deskewing, brightness/contrast adjustment).
     *
     * @param originalImage image to process; must not be null
     * @return a new {@code TYPE_BYTE_BINARY} image of the same size
     */
    public static BufferedImage preprocessImage(BufferedImage originalImage) {
        java.util.Objects.requireNonNull(originalImage, "originalImage");
        int width = originalImage.getWidth();
        int height = originalImage.getHeight();

        BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY);
        java.awt.Graphics2D graphics = grayImage.createGraphics();
        try {
            graphics.drawImage(originalImage, 0, 0, null);
        } finally {
            // Graphics contexts hold native resources and must be released explicitly.
            graphics.dispose();
        }

        BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY);
        java.awt.image.Raster grayRaster = grayImage.getRaster();
        java.awt.image.WritableRaster binaryRaster = binaryImage.getRaster();
        // Binarisation threshold (0-255); may need tuning per CAPTCHA style.
        int threshold = 128;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int gray = grayRaster.getSample(x, y, 0);
                // 0 = black, 1 = white in a TYPE_BYTE_BINARY raster.
                binaryRaster.setSample(x, y, 0, gray < threshold ? 0 : 1);
            }
        }

        try {
            File outputfile = new File("preprocessed_captcha.png");
            ImageIO.write(binaryImage, "png", outputfile);
            System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath());
        } catch (IOException e) {
            System.err.println("Could not save preprocessed image: " + e.getMessage());
        }
        return binaryImage;
    }

    /**
     * Runs Tesseract OCR (language {@code eng}) on the image and strips every character that
     * is not an ASCII letter or digit from the result.
     *
     * @param image image to recognise, ideally already preprocessed
     * @return the cleaned text (possibly empty), or {@code null} if {@code image} is null or
     *         OCR fails
     */
    public static String recognizeCaptcha(BufferedImage image) {
        if (image == null) {
            System.err.println("Error during OCR: image is null");
            return null;
        }
        Tesseract tesseract = new Tesseract();
        if (TESSDATA_PATH != null && !TESSDATA_PATH.isEmpty()) {
            tesseract.setDatapath(TESSDATA_PATH);
        } else {
            System.out.println("TESSDATA_PATH not set. Tess4J will try to find tessdata automatically.");
        }
        tesseract.setLanguage("eng");
        // For digit-only CAPTCHAs a whitelist can improve accuracy:
        // tesseract.setTessVariable("tessedit_char_whitelist", "0123456789");
        try {
            String result = tesseract.doOCR(image);
            // Adjust this cleanup rule to the character set of the CAPTCHA.
            result = result.trim().replaceAll("[^0-9a-zA-Z]", "");
            System.out.println("OCR Result: " + result);
            return result;
        } catch (TesseractException e) {
            System.err.println("Error during OCR: " + e.getMessage());
            return null;
        }
    }

    /**
     * Demonstrates the download, preprocess and recognise pipeline.
     *
     * @param args optional; {@code args[0]} is the CAPTCHA image URL. Without it the
     *             placeholder URL is used, which fails with a download error.
     */
    public static void main(String[] args) {
        String captchaImageUrl = args.length > 0 ? args[0] : "YOUR_CAPTCHA_IMAGE_URL";
        try {
            BufferedImage originalCaptchaImage = downloadImage(captchaImageUrl);
            System.out.println("Image downloaded.");

            BufferedImage preprocessedImage = preprocessImage(originalCaptchaImage);
            System.out.println("Image preprocessed.");

            String captchaCode = recognizeCaptcha(preprocessedImage);
            if (captchaCode != null && !captchaCode.isEmpty()) {
                System.out.println("Recognized CAPTCHA: " + captchaCode);
                // Next step for a real crawler: put captchaCode into the form's POST data, e.g.
                // postData += "&captchaInputFieldName=" + URLEncoder.encode(captchaCode, StandardCharsets.UTF_8.name());
            } else {
                System.out.println("Failed to recognize CAPTCHA.");
                // A real crawler would retry with a fresh CAPTCHA or log the failure here.
            }
        } catch (IOException e) {
            System.err.println("Error downloading or processing image: " + e.getMessage());
        }
    }
}

81
src/main/java/com/example/CsAirScraper.java

@ -0,0 +1,81 @@
package com.example;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.core5.http.io.entity.StringEntity;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Demo scraper for the China Southern (csair.com) flight query endpoint.
 *
 * <p>A Selenium-driven Chrome session loads the query page so that the site's JavaScript
 * can set its cookies; those cookies are then replayed on a plain HttpClient POST that
 * carries a hard-coded JSON flight query.
 */
public class CsAirScraper {
    /**
     * Harvests cookies with Selenium, then posts the flight query and prints the response.
     *
     * @param args unused
     * @throws Exception on browser, network or parsing failures
     */
    public static void main(String[] args) throws Exception {
        // 1. Start Selenium and open the query page.
        WebDriverManager.chromedriver().setup();
        WebDriver driver = new ChromeDriver();
        String cookieHeader;
        try {
            driver.get("https://b2c.csair.com/portal/main/flight/direct/query");
            // Give the page's JavaScript a few seconds to set its cookies; tune as needed.
            Thread.sleep(5000);
            // 2. Collect every browser cookie into a single Cookie header value.
            Set<Cookie> seleniumCookies = driver.manage().getCookies();
            cookieHeader = seleniumCookies.stream()
                    .map(c -> c.getName() + "=" + c.getValue())
                    .collect(Collectors.joining("; "));
            System.out.println("获取到 Cookie: " + cookieHeader);
        } finally {
            // Always shut the browser down, even when navigation or the wait fails,
            // so no Chrome/chromedriver process is left behind.
            driver.quit();
        }
        // 3. Replay the cookies on a plain HTTP client.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpPost post = new HttpPost("https://b2c.csair.com/portal/main/flight/direct/query");
            post.setHeader("Content-Type", "application/json");
            post.setHeader("Cookie", cookieHeader);
            post.setHeader("User-Agent", "Mozilla/5.0");
            // JSON request body.
            String json = "{"
                    + "\"action\": \"0\","
                    + "\"adultNum\": \"1\","
                    + "\"airLine\": 1,"
                    + "\"arrCity\": \"PKX\","
                    + "\"businessType\": \"COMMON\","
                    + "\"cabinOrder\": \"0\","
                    + "\"cache\": 0,"
                    + "\"childNum\": \"0\","
                    + "\"depCity\": \"CAN\","
                    + "\"flightDate\": \"20250514\","
                    + "\"flyType\": 0,"
                    + "\"infantNum\": \"0\","
                    + "\"international\": \"0\","
                    + "\"isMember\": \"\","
                    + "\"isMultipass\": 1,"
                    + "\"language\": \"zh\","
                    + "\"preUrl\": \"\","
                    + "\"segType\": \"1\","
                    + "\"tariffRules\": []"
                    + "}";
            post.setEntity(new StringEntity(json));
            // 4. Send the request.
            try (CloseableHttpResponse response = httpClient.execute(post)) {
                int code = response.getCode();
                // Fall back to UTF-8 when the server does not declare a charset, so a
                // Chinese response body is not decoded with a Latin-1 default.
                String result = EntityUtils.toString(response.getEntity(),
                        java.nio.charset.StandardCharsets.UTF_8);
                System.out.println("状态码: " + code);
                System.out.println("响应: " + result);
            }
        }
    }
}

404
src/main/java/com/example/CtriScraper.java

@ -0,0 +1,404 @@
package com.example;
import com.fasterxml.jackson.databind.ObjectMapper;
import okhttp3.*;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.cookie.CookieStore;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.protocol.HttpClientContext;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.NameValuePair;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.Year;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CtriScraper {
// CTRI advanced-search form page (GET) and its submit endpoint (POST).
private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php";
private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php";
// Captures the text between the first pair of single quotes in a result link's href.
private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'");
// Kafka destination for scraped trial records.
private static final String TOPIC_NAME = "cliniTopic";
private static final String BOOTSTRAP_SERVERS = "node-01:19092";
private static KafkaProducer<String, String> producer;
private static ObjectMapper objectMapper = new ObjectMapper();
private static final Random random = new Random();

// Builds the shared Kafka producer once, when the class is loaded.
static {
    final String stringSerializer = "org.apache.kafka.common.serialization.StringSerializer";
    final Properties kafkaSettings = new Properties();
    kafkaSettings.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
    kafkaSettings.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, stringSerializer);
    kafkaSettings.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, stringSerializer);
    // Wait for acknowledgement from all replicas.
    kafkaSettings.put(ProducerConfig.ACKS_CONFIG, "all");
    // Number of send retries.
    kafkaSettings.put(ProducerConfig.RETRIES_CONFIG, 3);
    producer = new KafkaProducer<>(kafkaSettings);
}
/**
 * Runs the CTRI advanced search for one month and collects the detail-page URLs.
 *
 * <p>The method first GETs the search form to obtain the session cookies and the hidden
 * {@code csrf_token} / {@code __ncforminfo} fields, then POSTs the form with the given
 * month and year, and finally extracts the single-quoted target from every
 * {@code tr a} link on the result page.
 *
 * @param year  year to search
 * @param month month to search; sent zero-padded to two digits
 * @return the collected URLs (possibly empty), or {@code null} when any step fails
 */
public static List<String> getlink(Integer year, Integer month) {
    // Browser-like header values shared by the GET and the POST.
    final String browserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
    final String browserAccept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
    final String browserAcceptLanguage = "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7";

    // One cookie store shared by the client and the request context, so the POST
    // carries the cookies received on the GET.
    CookieStore sessionCookies = new BasicCookieStore();
    HttpClientContext sessionContext = HttpClientContext.create();
    sessionContext.setCookieStore(sessionCookies);

    try (CloseableHttpClient client = HttpClients.custom()
            .setDefaultCookieStore(sessionCookies)
            .build()) {

        // --- Steps 1 & 2: fetch the search form page ---
        HttpGet formRequest = new HttpGet(SEARCH_FORM_URL);
        formRequest.setHeader("User-Agent", browserUserAgent);
        formRequest.setHeader("Accept", browserAccept);
        formRequest.setHeader("Accept-Language", browserAcceptLanguage);

        String formHtml = null;
        try (CloseableHttpResponse formResponse = client.execute(formRequest, sessionContext)) {
            int formStatus = formResponse.getCode();
            if (formStatus != 200) {
                System.err.println("Error: GET request to form page failed with status code: " + formStatus);
                EntityUtils.consume(formResponse.getEntity()); // drain so the connection is released
                return null;
            }
            HttpEntity formEntity = formResponse.getEntity();
            if (formEntity == null) {
                System.err.println("Error: Failed to get form page entity.");
                return null;
            }
            formHtml = EntityUtils.toString(formEntity, StandardCharsets.UTF_8);
            EntityUtils.consume(formEntity);
        }

        // --- Steps 3 & 4: pull the hidden tokens out of the form ---
        Document formPage = Jsoup.parse(formHtml, SEARCH_FORM_URL); // base URI helps with relative paths
        Element csrfField = formPage.selectFirst("input[name=csrf_token][type=hidden]");
        Element ncFormInfoField = formPage.selectFirst("input[name=__ncforminfo][type=hidden]");
        if (csrfField == null) {
            System.err.println("Warning: Could not find csrf_token input field.");
            return null;
        }
        String csrfToken = csrfField.val();
        if (ncFormInfoField == null) {
            System.err.println("Warning: Could not find __ncforminfo input field.");
            return null;
        }
        String ncFormInfo = ncFormInfoField.val();
        // Belt-and-braces check kept from the original flow.
        if (csrfToken == null || ncFormInfo == null) {
            System.err.println("Error: Missing required tokens. Cannot proceed with POST request.");
            return null;
        }

        // --- Steps 5 & 6: build and submit the search form ---
        HttpPost searchRequest = new HttpPost(SEARCH_SUBMIT_URL);
        searchRequest.setHeader("User-Agent", browserUserAgent);
        searchRequest.setHeader("Referer", SEARCH_FORM_URL);
        searchRequest.setHeader("Origin", "https://ctri.nic.in");
        searchRequest.setHeader("Content-Type", "application/x-www-form-urlencoded");
        searchRequest.setHeader("Accept", browserAccept);
        searchRequest.setHeader("Accept-Language", browserAcceptLanguage);
        searchRequest.setHeader("Pragma", "no-cache");

        // Form payload in submission order; the two tokens come from the form page.
        String[][] formFields = {
            {"stid", "1"},
            {"csrf_token", csrfToken},
            {"pros", "1"},
            {"month", String.format("%02d", month)},
            {"year", String.valueOf(year)},
            {"study", "0"},
            {"sdid", "0"},
            {"phaseid", "0"},
            {"psponsor", "0"},
            {"recid", "0"},
            {"state", "0"},
            {"district", "0"},
            {"searchword", ""},
            {"T4", "anyvalue"}, // value is arbitrary
            {"btt", "Search"},
            {"__ncforminfo", ncFormInfo},
        };
        List<NameValuePair> payload = new ArrayList<>();
        for (String[] field : formFields) {
            payload.add(new BasicNameValuePair(field[0], field[1]));
        }
        searchRequest.setEntity(new UrlEncodedFormEntity(payload, StandardCharsets.UTF_8));

        try (CloseableHttpResponse searchResponse = client.execute(searchRequest, sessionContext)) {
            int searchStatus = searchResponse.getCode();
            if (searchStatus != 200) {
                System.err.println("Error: POST request to submit form failed with status code: " + searchStatus);
                EntityUtils.consume(searchResponse.getEntity());
                return null;
            }
            HttpEntity searchEntity = searchResponse.getEntity();
            if (searchEntity == null) {
                System.err.println("Error: Failed to get search results entity.");
                return null;
            }
            String resultsHtml = EntityUtils.toString(searchEntity, StandardCharsets.UTF_8);
            EntityUtils.consume(searchEntity);

            // --- Step 7: extract the detail links from the result table ---
            List<String> collected = new ArrayList<>();
            Document resultsPage = Jsoup.parse(resultsHtml, SEARCH_SUBMIT_URL);
            for (Element anchor : resultsPage.select("tr a")) {
                String href = anchor.attr("href");
                Matcher quoted = LINK_REGEX_PATTERN.matcher(href);
                if (!quoted.find()) {
                    // Not a quoted-target link; report and skip it.
                    System.err.println("Warning: Link does not match expected pattern: " + href);
                    continue;
                }
                collected.add("https://ctri.nic.in/Clinicaltrials/" + quoted.group(1));
            }
            return collected;
        }
    } catch (IOException e) {
        // Network / IO failures.
        System.err.println("Network or IO error during scraping:");
        e.printStackTrace();
        return null;
    } catch (Exception e) {
        // Anything else, e.g. parsing problems.
        System.err.println("An unexpected error occurred during scraping:");
        e.printStackTrace();
        return null;
    }
}
/**
 * Crawls CTRI month by month, from the current month back to January 2024, and
 * publishes every scraped trial record to Kafka.
 *
 * <p>For each month the detail links come from {@link #getlink(Integer, Integer)}; each
 * link is scraped with {@link #reslutData(String)}, serialised to JSON and sent to
 * {@code TOPIC_NAME} keyed by its registration number. A failure on one URL or one month
 * is logged and the crawl moves on. If the thread is interrupted the crawl stops early.
 * The producer is always closed before the method returns.
 *
 * @param args unused
 */
public static void main(String[] args) {
    boolean interrupted = false;
    try {
        crawl:
        for (int year = Year.now().getValue(); year >= 2024; year--) {
            int monthStart = (year == Year.now().getValue()) ? LocalDate.now().getMonthValue() : 12;
            for (int month = monthStart; month >= 1; month--) {
                try {
                    List<String> links = getlink(year, month);
                    if (links == null) {
                        System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!");
                        continue;
                    }
                    if (links.isEmpty()) {
                        System.out.println("年份 " + year + " 月份 " + month + " 无数据!");
                        continue;
                    }
                    // Pause of 3000-4000 ms between requests; drawn once per month.
                    int sleepTime = random.nextInt(1001) + 3000;
                    int count = 0;
                    for (String url : links) {
                        try {
                            Map<String, Object> result = reslutData(url);
                            result.put("crawlUrl", url);
                            String registNum = String.valueOf(result.get("registNum"));
                            String jsonValue = objectMapper.writeValueAsString(result);
                            ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
                            producer.send(record, (metadata, exception) -> {
                                if (exception == null) {
                                    System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url);
                                } else {
                                    System.err.println("❌ Kafka 发送失败: " + exception.getMessage());
                                }
                            });
                            Thread.sleep(sleepTime); // throttle
                            count++;
                        } catch (InterruptedException e) {
                            // Stop crawling instead of swallowing the interrupt. The flag is
                            // restored only after the producer is closed, so close() can
                            // still flush buffered records.
                            interrupted = true;
                            System.err.println("抓取被中断,提前结束。");
                            break crawl;
                        } catch (Exception e) {
                            System.err.println("抓取或发送失败: " + url);
                            e.printStackTrace();
                        }
                    }
                    System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。");
                } catch (Exception e) {
                    System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage());
                    e.printStackTrace();
                }
            }
        }
    } finally {
        // Close the producer on every exit path.
        producer.close();
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
// Shared client for detail-page requests, so connections are reused across URLs
// instead of building a new client (and connection pool) for every page.
private static final OkHttpClient DETAIL_HTTP_CLIENT = new OkHttpClient();

// Common prefix of every selector below: the rows of the 11th child table on the detail page.
private static final String DETAIL_ROWS_SELECTOR = "body > table > tbody > tr > td > table:nth-child(11) > tbody > ";

/**
 * Downloads one CTRI trial detail page and extracts its fields into a map.
 *
 * <p>Fields are located by fixed row positions in the detail table. Keys that this
 * scraper does not populate are present with an empty-string value.
 *
 * @param url detail-page URL
 * @return a mutable map of field name to value; {@code sponsor}, {@code disease} and
 *         {@code primaryOutcome} are nested maps
 * @throws IOException if the request fails, returns a non-2xx status, or has no body
 */
public static Map<String,Object> reslutData(String url) throws IOException {
    Document page = Jsoup.parse(fetchDetailHtml(url));

    String registTime = extractAndConvertDate(detailText(page, "tr:nth-child(1) > td:nth-child(2)"));

    Map<String,Object> sponsor = new HashMap<>();
    sponsor.put("Source of Monetary or Material Support",
            detailText(page, "tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td"));
    sponsor.put("Primary Sponsor",
            detailText(page, "tr:nth-child(15) > td:nth-child(2) > table > tbody"));

    Map<String,Object> disease = new HashMap<>();
    disease.put("healthType",
            detailText(page, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)"));
    disease.put("condition",
            detailText(page, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)"));

    Map<String,Object> primaryOutcome = new HashMap<>();
    primaryOutcome.put("firstOutcome",
            detailText(page, "tr:nth-child(28) > td:nth-child(2) > table > tbody"));
    primaryOutcome.put("secondOutcome",
            detailText(page, "tr:nth-child(29) > td:nth-child(2) > table > tbody"));

    Map<String,Object> resultData = new HashMap<>();
    resultData.put("disease", disease);
    resultData.put("primaryOutcome", primaryOutcome);
    resultData.put("intervention", detailText(page, "tr:nth-child(22) > td:nth-child(2) > table"));
    resultData.put("country", detailText(page, "tr:nth-child(17) > td:nth-child(2)"));
    resultData.put("enrollment", detailText(page, "tr:nth-child(30) > td:nth-child(2)"));
    resultData.put("exclusionCriteria",
            detailText(page, "tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)"));
    resultData.put("inclusionCriteria",
            detailText(page, "tr:nth-child(23) > td:nth-child(2) > table > tbody"));
    resultData.put("studyDesign", detailText(page, "tr:nth-child(6) > td:nth-child(2)"));
    resultData.put("sponsor", sponsor);
    resultData.put("title", detailText(page, "tr:nth-child(7) > td:nth-child(2)"));
    resultData.put("registNum", detailText(page, "tr:nth-child(1) > td:nth-child(2) > b"));
    resultData.put("registTime", registTime);
    resultData.put("studyType", detailText(page, "tr:nth-child(4) > td:nth-child(2)"));
    resultData.put("phase", detailText(page, "tr:nth-child(31) > td:nth-child(2)"));
    // Keys not populated by this scraper.
    resultData.put("registStatus", "");
    resultData.put("registTitle", "");
    resultData.put("fullTitle", "");
    resultData.put("sponsorPart", "");
    resultData.put("studyObjective", "");
    resultData.put("studyStartDate", "");
    resultData.put("currentStatus", "");
    resultData.put("tagTime", "");
    resultData.put("crawlTime", getCurrentTime());
    resultData.put("crawlUrl", url);
    resultData.put("postTime", registTime);
    resultData.put("content", "content");
    resultData.put("forwardcontent", "forwardcontent");
    resultData.put("cid", "Nctrinicin");
    return resultData;
}

// Fetches the page body as a string, always closing the response so the connection is released.
private static String fetchDetailHtml(String url) throws IOException {
    Request request = new Request.Builder()
            .url(url)
            .get()
            .build();
    try (Response response = DETAIL_HTTP_CLIENT.newCall(request).execute()) {
        if (!response.isSuccessful()) {
            // Do not parse an error page as if it were a trial record.
            throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
        }
        ResponseBody body = response.body();
        if (body == null) {
            throw new IOException("Empty response body for " + url);
        }
        return body.string();
    }
}

// Returns the combined text of the elements matched by the given selector under the detail table rows.
private static String detailText(Document page, String rowSelector) {
    return page.select(DETAIL_ROWS_SELECTOR + rowSelector).text();
}
/**
 * Returns the current local date and time formatted as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @return the formatted timestamp
 */
public static String getCurrentTime() {
    return LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
}
// Matches "[Registered on: dd/MM/yyyy]" (or the Chinese label) and captures the date.
private static final Pattern REGISTERED_ON_PATTERN =
        Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
// Strict parsing: impossible dates such as 31/02 are rejected instead of rolled forward.
private static final DateTimeFormatter REGISTERED_ON_INPUT_FORMAT =
        DateTimeFormatter.ofPattern("dd/MM/uuuu").withResolverStyle(java.time.format.ResolverStyle.STRICT);
private static final DateTimeFormatter REGISTERED_ON_OUTPUT_FORMAT =
        DateTimeFormatter.ofPattern("uuuu-MM-dd '00:00:00'");

/**
 * Extracts the registration date from text such as
 * {@code "... [Registered on: 05/01/2024] ..."} and reformats it as
 * {@code yyyy-MM-dd 00:00:00}.
 *
 * @param input text that may contain the bracketed registration date; may be {@code null}
 * @return the reformatted date, or {@code null} when {@code input} is {@code null}, the
 *         marker is absent, or the captured value is not a real calendar date
 */
public static String extractAndConvertDate(String input) {
    if (input == null) {
        return null;
    }
    Matcher matcher = REGISTERED_ON_PATTERN.matcher(input);
    if (!matcher.find()) {
        return null;
    }
    String dateStr = matcher.group(1);
    try {
        return java.time.LocalDate.parse(dateStr, REGISTERED_ON_INPUT_FORMAT)
                .format(REGISTERED_ON_OUTPUT_FORMAT);
    } catch (java.time.format.DateTimeParseException e) {
        System.err.println("Invalid registration date: " + dateStr);
        return null;
    }
}
}

121
src/main/java/com/example/CtriScraperContent.java

@ -0,0 +1,121 @@
package com.example;
import okhttp3.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CtriScraperContent {
// Common prefix of every selector below: the rows of the 11th child table on the detail page.
private static final String DETAIL_ROWS_SELECTOR = "body > table > tbody > tr > td > table:nth-child(11) > tbody > ";

/**
 * Downloads one hard-coded CTRI trial detail page, extracts its fields and prints the
 * resulting map.
 *
 * @param args unused
 * @throws IOException if the request fails, returns a non-2xx status, or has no body
 */
public static void main(String[] args) throws IOException {
    String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName=";
    Document page = Jsoup.parse(fetchDetailHtml(url));

    String registTime = extractAndConvertDate(detailText(page, "tr:nth-child(1) > td:nth-child(2)"));

    Map<String,Object> sponsor = new HashMap<>();
    sponsor.put("Source of Monetary or Material Support",
            detailText(page, "tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td"));
    sponsor.put("Primary Sponsor",
            detailText(page, "tr:nth-child(15) > td:nth-child(2) > table > tbody"));

    Map<String,Object> disease = new HashMap<>();
    disease.put("healthType",
            detailText(page, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)"));
    disease.put("condition",
            detailText(page, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)"));

    Map<String,Object> primaryOutcome = new HashMap<>();
    primaryOutcome.put("firstOutcome",
            detailText(page, "tr:nth-child(28) > td:nth-child(2) > table > tbody"));
    primaryOutcome.put("secondOutcome",
            detailText(page, "tr:nth-child(29) > td:nth-child(2) > table > tbody"));

    Map<String,Object> resultData = new HashMap<>();
    resultData.put("disease", disease);
    resultData.put("primaryOutcome", primaryOutcome);
    resultData.put("intervention", detailText(page, "tr:nth-child(22) > td:nth-child(2) > table"));
    resultData.put("country", detailText(page, "tr:nth-child(17) > td:nth-child(2)"));
    resultData.put("enrollment", detailText(page, "tr:nth-child(30) > td:nth-child(2)"));
    resultData.put("exclusionCriteria",
            detailText(page, "tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)"));
    resultData.put("inclusionCriteria",
            detailText(page, "tr:nth-child(23) > td:nth-child(2) > table > tbody"));
    resultData.put("studyDesign", detailText(page, "tr:nth-child(6) > td:nth-child(2)"));
    resultData.put("sponsor", sponsor);
    resultData.put("title", detailText(page, "tr:nth-child(7) > td:nth-child(2)"));
    resultData.put("registNum", detailText(page, "tr:nth-child(1) > td:nth-child(2) > b"));
    resultData.put("registTime", registTime);
    resultData.put("studyType", detailText(page, "tr:nth-child(4) > td:nth-child(2)"));
    resultData.put("phase", detailText(page, "tr:nth-child(31) > td:nth-child(2)"));
    // Keys not populated by this scraper.
    resultData.put("registStatus", "");
    resultData.put("registTitle", "");
    resultData.put("fullTitle", "");
    resultData.put("sponsorPart", "");
    resultData.put("studyObjective", "");
    resultData.put("studyStartDate", "");
    resultData.put("currentStatus", "");
    resultData.put("tagTime", "");
    resultData.put("crawlTime", getCurrentTime());
    resultData.put("crawlUrl", url);
    resultData.put("postTime", registTime);
    resultData.put("content", "content");
    resultData.put("forwardcontent", "forwardcontent");
    System.out.println(resultData);
}

// Fetches the page body as a string, always closing the response so the connection is released.
private static String fetchDetailHtml(String url) throws IOException {
    OkHttpClient client = new OkHttpClient().newBuilder()
            .build();
    Request request = new Request.Builder()
            .url(url)
            .get()
            .build();
    try (Response response = client.newCall(request).execute()) {
        if (!response.isSuccessful()) {
            // Do not parse an error page as if it were a trial record.
            throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
        }
        ResponseBody body = response.body();
        if (body == null) {
            throw new IOException("Empty response body for " + url);
        }
        return body.string();
    }
}

// Returns the combined text of the elements matched by the given selector under the detail table rows.
private static String detailText(Document page, String rowSelector) {
    return page.select(DETAIL_ROWS_SELECTOR + rowSelector).text();
}
/**
 * Returns the current local date and time formatted as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @return the formatted timestamp
 */
public static String getCurrentTime() {
    DateTimeFormatter timestampFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
    return timestampFormat.format(LocalDateTime.now());
}
// Matches "[Registered on: dd/MM/yyyy]" (or the Chinese label) and captures the date.
private static final Pattern REGISTERED_ON_PATTERN =
        Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
// Strict parsing: impossible dates such as 31/02 are rejected instead of rolled forward.
private static final DateTimeFormatter REGISTERED_ON_INPUT_FORMAT =
        DateTimeFormatter.ofPattern("dd/MM/uuuu").withResolverStyle(java.time.format.ResolverStyle.STRICT);
private static final DateTimeFormatter REGISTERED_ON_OUTPUT_FORMAT =
        DateTimeFormatter.ofPattern("uuuu-MM-dd '00:00:00'");

/**
 * Extracts the registration date from text such as
 * {@code "... [Registered on: 05/01/2024] ..."} and reformats it as
 * {@code yyyy-MM-dd 00:00:00}.
 *
 * @param input text that may contain the bracketed registration date; may be {@code null}
 * @return the reformatted date, or {@code null} when {@code input} is {@code null}, the
 *         marker is absent, or the captured value is not a real calendar date
 */
public static String extractAndConvertDate(String input) {
    if (input == null) {
        return null;
    }
    Matcher matcher = REGISTERED_ON_PATTERN.matcher(input);
    if (!matcher.find()) {
        return null;
    }
    String dateStr = matcher.group(1);
    try {
        return java.time.LocalDate.parse(dateStr, REGISTERED_ON_INPUT_FORMAT)
                .format(REGISTERED_ON_OUTPUT_FORMAT);
    } catch (java.time.format.DateTimeParseException e) {
        System.err.println("Invalid registration date: " + dateStr);
        return null;
    }
}
}

113
src/main/java/com/example/Inka.java

@ -0,0 +1,113 @@
package com.example;
import com.fasterxml.jackson.databind.ObjectMapper;
import okhttp3.*;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Inka {
// Disabled Kafka configuration, kept for reference.
// private static final String TOPIC_NAME = "patentTopic";
// private static final String BOOTSTRAP_SERVERS = "localhost:9092";
// private static KafkaProducer<String, String> producer;
// private static ObjectMapper objectMapper = new ObjectMapper();
// private static final Random random = new Random();
// Proxy pool of "host:port" entries. It stays empty while the loader in the
// commented-out static block below is disabled, so requests go out without a proxy.
private static List<String> proxyList = new ArrayList<>();
// Index into proxyList of the proxy currently in use.
private static int currentProxyIndex = 0;
// Disabled static initializer: built the Kafka producer and loaded proxy.txt into proxyList.
// static {
// Properties props = new Properties();
// props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
// props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
// props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
// props.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for acknowledgement from all replicas
// props.put(ProducerConfig.RETRIES_CONFIG, 3); // number of retries
// producer = new KafkaProducer<>(props);
// try {
// proxyList = Files.readAllLines(Paths.get("proxy.txt"));
// if (proxyList.isEmpty()) {
// System.out.println("警告: proxy.txt 为空,未加载任何代理");
// } else {
// System.out.println("成功加载 " + proxyList.size() + " 个代理");
// }
// } catch (IOException e) {
// System.err.println("读取 proxy.txt 失败: " + e.getMessage());
// }
// }
/**
 * Sends one hard-coded JSF partial-ajax search POST to WIPO Patentscope and prints the
 * response body and status.
 *
 * <p>The form payload, including the {@code javax.faces.ViewState} value and the
 * {@code _vid} in the URL, is a fixed captured value.
 *
 * @param args unused
 * @throws IOException          if the HTTP call fails
 * @throws InterruptedException declared by the signature; nothing in the body throws it
 */
public static void main(String[] args) throws IOException, InterruptedException {
    // Captured url-encoded form payload, one parameter per line.
    String formPayload = "javax.faces.partial.ajax=true"
            + "&javax.faces.source=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225"
            + "&javax.faces.partial.execute=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225+advancedSearchForm"
            + "&javax.faces.partial.render=advancedSearchForm+results-container+j_idt1272"
            + "&advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225"
            + "&advancedSearchForm=advancedSearchForm"
            + "&advancedSearchForm%3AadvancedSearchAssistant=on"
            + "&advancedSearchForm%3AadvancedSearchInput%3Ainput=rance10"
            + "&javax.faces.ViewState=-3602994148230912322%3A-6313250694718303467";
    // The result page is both the POST target and the Referer.
    String resultPageUrl = "https://patentscope.wipo.int/search/zh/result.jsf?_vid=P21-M9APK2-00815";
    String formContentType = "application/x-www-form-urlencoded; charset=UTF-8";

    OkHttpClient httpClient = createClientWithProxy();
    RequestBody requestBody = RequestBody.create(MediaType.parse(formContentType), formPayload);

    // Browser-like headers. A captured browser Cookie header used to be sent as well;
    // it is currently disabled.
    Request.Builder requestBuilder = new Request.Builder()
            .url(resultPageUrl)
            .method("POST", requestBody);
    requestBuilder.addHeader("Accept", "application/xml, text/xml, */*; q=0.01");
    requestBuilder.addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7");
    requestBuilder.addHeader("Cache-Control", "no-cache");
    requestBuilder.addHeader("Connection", "keep-alive");
    requestBuilder.addHeader("Content-Type", formContentType);
    requestBuilder.addHeader("Faces-Request", "partial/ajax");
    requestBuilder.addHeader("Host", "patentscope.wipo.int");
    requestBuilder.addHeader("Origin", "https://patentscope.wipo.int");
    requestBuilder.addHeader("Pragma", "no-cache");
    requestBuilder.addHeader("Referer", resultPageUrl);
    requestBuilder.addHeader("Sec-Ch-Ua", "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"");
    requestBuilder.addHeader("Sec-Ch-Ua-Mobile", "?0");
    requestBuilder.addHeader("Sec-Ch-Ua-Platform", "\"Windows\"");
    requestBuilder.addHeader("Sec-Fetch-Dest", "empty");
    requestBuilder.addHeader("Sec-Fetch-Mode", "cors");
    requestBuilder.addHeader("Sec-Fetch-Site", "same-origin");
    requestBuilder.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
    requestBuilder.addHeader("X-Requested-With", "XMLHttpRequest");
    Request searchRequest = requestBuilder.build();

    // Execute the request and print the outcome.
    try (Response searchResponse = httpClient.newCall(searchRequest).execute()) {
        if (!searchResponse.isSuccessful()) {
            System.out.println("Error: " + searchResponse.code() + " - " + searchResponse.message());
            System.out.println("Response Body: " + searchResponse.body().string());
        } else {
            System.out.println("Response: " + searchResponse.body().string() + searchResponse.code());
        }
    }
}
/**
 * Builds an OkHttp client with 30-second connect/read/write timeouts and, when the proxy
 * pool has an entry at {@code currentProxyIndex}, routes it through that HTTP proxy.
 *
 * <p>A proxy entry must have the form {@code host:port}. Entries that do not split into
 * exactly two parts, or whose port is not a valid port number, are skipped and the
 * client connects directly.
 *
 * @return the configured client
 */
private static OkHttpClient createClientWithProxy() {
    OkHttpClient.Builder builder = new OkHttpClient().newBuilder()
            .connectTimeout(30, TimeUnit.SECONDS)
            .readTimeout(30, TimeUnit.SECONDS)
            .writeTimeout(30, TimeUnit.SECONDS);
    if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) {
        String proxy = proxyList.get(currentProxyIndex);
        // Tolerate stray whitespace around entries read from a text file.
        String[] proxyParts = proxy == null ? new String[0] : proxy.trim().split(":");
        if (proxyParts.length == 2) {
            try {
                String proxyHost = proxyParts[0].trim();
                int proxyPort = Integer.parseInt(proxyParts[1].trim());
                builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP,
                        new java.net.InetSocketAddress(proxyHost, proxyPort)));
                System.out.println("使用代理: " + proxy);
            } catch (IllegalArgumentException e) {
                // Covers a non-numeric port (NumberFormatException) and an out-of-range
                // port; fall back to a direct connection instead of aborting.
                System.err.println("Invalid proxy entry, connecting directly: " + proxy);
            }
        }
    }
    return builder.build();
}
}

111
src/main/java/com/example/NSFAwardCrawler.java

@ -0,0 +1,111 @@
package com.example;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.openqa.selenium.NoSuchElementException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
/**
 * Selenium crawler that pages through the NSF award search results for the query "ebola" and
 * prints the {@code id} attribute of every {@code listview-item} element it encounters.
 *
 * <p>The ChromeDriver location can be supplied with {@code -Dwebdriver.chrome.driver=...};
 * the bundled Windows path is used only when that property is not set.
 */
public class NSFAwardCrawler {
    private static final int PAGE_SIZE = 30; // nominal number of results per page
    private static final String DRIVER_PROPERTY = "webdriver.chrome.driver";
    private static final String DEFAULT_DRIVER_PATH =
            "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe";

    /**
     * Runs the crawl: loads the first result page, collects ids, and follows the NEXT button
     * until a short page, a missing/disabled button, or an error stops the loop.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Do not clobber a driver path the caller already configured.
        if (System.getProperty(DRIVER_PROPERTY) == null) {
            System.setProperty(DRIVER_PROPERTY, DEFAULT_DRIVER_PATH);
        }
        ChromeOptions options = new ChromeOptions();
        WebDriver driver = new ChromeDriver(options);
        try {
            String url = "https://www.nsf.gov/awardsearch/simpleSearchResult?queryText=ebola&ActiveAwards=true";
            driver.get(url);
            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
            // Insertion-ordered set: de-duplicates in O(1) per id while keeping discovery order.
            java.util.Set<String> allAwardIds = new java.util.LinkedHashSet<>();
            int pageNumber = 1;
            while (true) {
                System.out.println("Processing page " + pageNumber);
                // Wait until at least one result item is present in the DOM.
                wait.until(ExpectedConditions.presenceOfElementLocated(By.className("listview-item")));
                List<WebElement> resultItems = driver.findElements(By.className("listview-item"));
                int currentPageSize = resultItems.size();
                System.out.println("Found " + currentPageSize + " items on page " + pageNumber);
                if (currentPageSize == 0) {
                    System.out.println("No items found on page " + pageNumber + ", stopping...");
                    break;
                }
                // Collect the award id of every item on this page.
                for (WebElement item : resultItems) {
                    try {
                        String awardId = item.getAttribute("id");
                        if (awardId != null && !awardId.isEmpty()) {
                            allAwardIds.add(awardId);
                        }
                    } catch (Exception e) {
                        System.out.println("Error processing item: " + e.getMessage());
                    }
                }
                // A page with fewer than PAGE_SIZE items is taken to be the last one.
                if (currentPageSize < PAGE_SIZE) {
                    System.out.println("Page " + pageNumber + " has less than " + PAGE_SIZE + " items (" + currentPageSize + "), assuming last page, stopping...");
                    break;
                }
                try {
                    WebElement nextButton = driver.findElement(By.name("NEXT"));
                    boolean isEnabled = nextButton.isEnabled();
                    System.out.println("Next button enabled: " + isEnabled);
                    if (!isEnabled) {
                        System.out.println("Next button is disabled, stopping...");
                        break;
                    }
                    nextButton.click();
                    Thread.sleep(2000); // crude wait for the next page to load
                    pageNumber++;
                } catch (NoSuchElementException e) {
                    System.out.println("Next button not found, stopping...");
                    break;
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to whoever owns this thread.
                    Thread.currentThread().interrupt();
                    System.out.println("Error clicking next button: " + e.getMessage());
                    break;
                } catch (Exception e) {
                    System.out.println("Error clicking next button: " + e.getMessage());
                    break;
                }
            }
            System.out.println("Found " + allAwardIds.size() + " award IDs across all pages:");
            int position = 1;
            for (String awardId : allAwardIds) {
                System.out.println(position + ". " + awardId);
                position++;
            }
        } catch (Exception e) {
            System.out.println("An error occurred: " + e.getMessage());
        } finally {
            try {
                Thread.sleep(2000); // leave the browser visible briefly before closing it
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            driver.quit();
        }
    }
}

130
src/main/java/com/example/PatentscopeSeleniumCrawler.java

@ -0,0 +1,130 @@
package com.example;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.Random;
/**
 * Selenium crawler for WIPO PATENTSCOPE: submits a fixed front-page query through the simple
 * search form (falling back to the advanced search form), then walks the result pages and logs
 * the title and number of each hit parsed with jsoup.
 *
 * <p>The ChromeDriver location can be supplied with {@code -Dwebdriver.chrome.driver=...};
 * the bundled Windows path is used only when that property is not set.
 */
public class PatentscopeSeleniumCrawler {
    private static final Logger LOGGER = LoggerFactory.getLogger(PatentscopeSeleniumCrawler.class);
    private static final String SEARCH_URL = "https://patentscope.wipo.int/search/en/search.jsf";
    private static final String SEARCH_INPUT_ID = "simpleSearchForm:fpSearch:input";
    private static final String SEARCH_BUTTON_ID = "simpleSearchForm:fpSearch:j_idt1319";
    private static final String SEARCH_QUERY = "FP:(fever)";
    private static final String DRIVER_PROPERTY = "webdriver.chrome.driver";
    private static final String DEFAULT_DRIVER_PATH = "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe";
    private static final Random RANDOM = new Random();

    /**
     * Runs the crawl end to end and always closes the browser.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Do not clobber a driver path the caller already configured.
        if (System.getProperty(DRIVER_PROPERTY) == null) {
            System.setProperty(DRIVER_PROPERTY, DEFAULT_DRIVER_PATH);
        }
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
        options.addArguments("--disable-blink-features=AutomationControlled");
        // Headed (non-headless) mode is kept to make debugging easier.
        WebDriver driver = null;
        try {
            driver = new ChromeDriver(options);
            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(15));

            // Step 1: open the search page.
            LOGGER.info("Navigating to {}", SEARCH_URL);
            driver.get(SEARCH_URL);
            Thread.sleep(2000 + RANDOM.nextInt(2000));

            // Step 2: type the query.
            LOGGER.info("Entering search query: {}", SEARCH_QUERY);
            WebElement searchInput = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_INPUT_ID)));
            searchInput.clear();
            searchInput.sendKeys(SEARCH_QUERY);
            Thread.sleep(500 + RANDOM.nextInt(1000));

            // Step 3: trigger the search. Only a failed click falls back to the Enter key; the
            // pause afterwards is outside the try so an interrupt cannot be mistaken for a
            // click failure.
            LOGGER.info("Attempting to trigger search...");
            try {
                WebElement searchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_BUTTON_ID)));
                LOGGER.info("Clicking search button");
                searchButton.click();
            } catch (RuntimeException e) {
                LOGGER.warn("Button click failed, trying Enter key: {}", e.getMessage());
                searchInput.sendKeys(Keys.ENTER);
            }
            Thread.sleep(3000 + RANDOM.nextInt(2000)); // wait for AJAX and navigation

            // Step 4: verify that the result page was reached; otherwise try advanced search.
            String currentUrl = driver.getCurrentUrl();
            LOGGER.info("Current URL: {}", currentUrl);
            if (!currentUrl.contains("result.jsf")) {
                LOGGER.error("Failed to redirect to result.jsf, trying advanced search...");
                driver.get("https://patentscope.wipo.int/search/en/search.jsf?advancedSearch=true");
                searchInput = wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:input")));
                searchInput.clear();
                searchInput.sendKeys(SEARCH_QUERY);
                WebElement advSearchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:j_idt1208")));
                advSearchButton.click();
                Thread.sleep(3000 + RANDOM.nextInt(2000));
                currentUrl = driver.getCurrentUrl();
                LOGGER.info("Advanced search URL: {}", currentUrl);
            }

            // Step 5: parse result pages until no enabled, visible "next" control remains.
            if (currentUrl.contains("result.jsf")) {
                LOGGER.info("Successfully reached result page");
                while (true) {
                    Document doc = Jsoup.parse(driver.getPageSource());
                    Elements results = doc.select("div.result-row"); // selector still to be confirmed
                    if (results.isEmpty()) {
                        LOGGER.warn("No results found, verify selector or query");
                    }
                    for (Element item : results) {
                        String title = item.select("a.result-title__text").text(); // to be confirmed
                        String patentId = item.select("div.result__number").text(); // to be confirmed
                        LOGGER.info("Title: {}", title.isEmpty() ? "N/A" : title);
                        LOGGER.info("Patent ID: {}", patentId.isEmpty() ? "N/A" : patentId);
                    }
                    WebElement nextPage = driver.findElements(By.cssSelector("a.paginator__button--next:not(.is-disabled)"))
                            .stream()
                            .filter(WebElement::isDisplayed)
                            .findFirst()
                            .orElse(null);
                    if (nextPage == null) {
                        LOGGER.info("No more pages");
                        break;
                    }
                    LOGGER.info("Navigating to next page");
                    nextPage.click();
                    Thread.sleep(3000 + RANDOM.nextInt(2000));
                }
            } else {
                LOGGER.error("Still not on result page, check query or network");
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            LOGGER.error("Error during crawling: {}", e.getMessage(), e);
        } catch (Exception e) {
            LOGGER.error("Error during crawling: {}", e.getMessage(), e);
        } finally {
            if (driver != null) {
                driver.quit();
                LOGGER.info("WebDriver closed");
            }
        }
    }
}

25
src/main/java/com/example/ProxyIPChecker.java

@ -0,0 +1,25 @@
package com.example;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Small diagnostic tool: asks httpbin.org which public IP the request came from and prints
 * the raw response, which shows whether outgoing traffic is going through a proxy.
 */
public class ProxyIPChecker {
    /** Connect and read timeout, so an unreachable endpoint cannot hang the tool forever. */
    private static final int TIMEOUT_MILLIS = 15000;

    /**
     * Performs the lookup and prints the response body.
     *
     * @param args unused
     * @throws Exception if the request cannot be sent or the response cannot be read
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://httpbin.org/ip");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        StringBuilder response = new StringBuilder();
        try {
            conn.setRequestMethod("GET");
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            // try-with-resources closes the reader even when reading fails midway.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    response.append(inputLine);
                }
            }
        } finally {
            conn.disconnect();
        }
        System.out.println("当前公网 IP 信息:");
        System.out.println(response.toString());
    }
}

496
src/main/java/com/example/ScraperWithCaptcha.java

@ -0,0 +1,496 @@
package com.example;// 修改为你的包名
import java.awt.image.BufferedImage;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.imageio.ImageIO;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
public class ScraperWithCaptcha {
// --- Constants that must be adapted to the target website ---
private static final String BASE_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php"; // *** URL of the page that contains the form and the CAPTCHA ***
private static final String FORM_SUBMIT_URL = BASE_URL; // *** URL the form is submitted to; usually the page itself or the URL in the form's action attribute ***
private static final String CAPTCHA_IMAGE_SRC_SUBSTRING = "captchasecurityimages.php"; // *** substring that identifies the CAPTCHA image's src ***
// CSS selector used to locate the CAPTCHA text input in the fetched page.
private static final String CAPTCHA_INPUT_SELECTOR = "input[name=T4]";
private static final String TARGET_FORM_SELECTOR = "form"; // *** if the page has several forms, use a more specific selector such as "#myFormId" ***
// --- Image pre-processing threshold; tune it to the CAPTCHA style ---
private static final int BINARY_THRESHOLD = 128; // binarisation threshold (0-255)
// --- Tesseract configuration (adapt to the local installation) ---
// Path of the Tesseract tessdata directory
private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata"; // *** must be changed to the actual local path ***
// --- Other general settings ---
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";
private Set<String> cookies = new HashSet<>(); // "name=value" cookie pairs collected from responses
/**
 * Runs one scrape attempt: fetch the form page, locate and download the CAPTCHA image,
 * pre-process and OCR it, fill the form with the recognised code, submit it, and report how
 * the server responded. Every failure is reported on stderr; nothing is rethrown.
 *
 * @param args unused
 */
public static void main(String[] args) {
    ScraperWithCaptcha scraper = new ScraperWithCaptcha();
    try {
        // 1. Fetch the page holding the form and the CAPTCHA (plain GET: no cookies, no body).
        PageInfo landing = scraper.fetchPage(BASE_URL, null, null, false);
        if (landing.htmlContent == null || landing.statusCode != HttpURLConnection.HTTP_OK) {
            System.err.println("Failed to fetch the initial page. Status code: " + landing.statusCode);
            return;
        }
        Document doc = Jsoup.parse(landing.htmlContent, BASE_URL);

        // Locate the CAPTCHA image; without it there is nothing to solve.
        String imageSelector = "img[src*=" + CAPTCHA_IMAGE_SRC_SUBSTRING + "]";
        Element captchaImg = doc.selectFirst(imageSelector);
        if (captchaImg == null) {
            System.err.println("CAPTCHA image not found using selector: " + imageSelector);
            return;
        }
        String captchaImageUrl = captchaImg.absUrl("src"); // absolute URL
        System.out.println("Found CAPTCHA image URL: " + captchaImageUrl);

        // Locate the CAPTCHA input; without it the code cannot be submitted.
        Element captchaInput = doc.selectFirst(CAPTCHA_INPUT_SELECTOR);
        if (captchaInput == null) {
            System.err.println("CAPTCHA input field not found using selector: " + CAPTCHA_INPUT_SELECTOR);
            return;
        }
        String captchaInputName = captchaInput.attr("name");
        System.out.println("Found CAPTCHA input field name: " + captchaInputName);

        // 2. Download the CAPTCHA image.
        BufferedImage originalCaptchaImage = scraper.downloadImage(captchaImageUrl);
        System.out.println("Captcha image downloaded.");

        // 3. Pre-process it.
        BufferedImage preprocessedImage = scraper.preprocessImage(originalCaptchaImage);
        System.out.println("Image preprocessed (saved as preprocessed_captcha.png).");

        // 4. Recognise the code.
        String captchaCode = scraper.recognizeCaptcha(preprocessedImage);
        if (captchaCode == null || captchaCode.isEmpty()) {
            System.err.println("CAPTCHA recognition failed. Cannot submit form.");
            // TODO: retry when recognition fails
            return;
        }
        System.out.println("Recognized CAPTCHA: " + captchaCode);

        // 5. Build the POST body from every form field plus the recognised code.
        Map<String, String> formData = scraper.buildFormDataMap(doc, captchaInputName, captchaCode);
        String postData = scraper.buildPostData(formData);
        System.out.println("Built POST data: " + postData);

        // 6. Submit the form as a regular (non-AJAX) POST.
        PageInfo postResponseInfo = scraper.fetchPage(FORM_SUBMIT_URL, postData, scraper.getCookieHeader(), false);
        System.out.println("Form submitted. Response status code: " + postResponseInfo.statusCode);
        System.out.println("POST Response Body (partial): " + previewOf(postResponseInfo.htmlContent));

        // 7. Inspect the response to judge whether the submission worked.
        reportSubmissionOutcome(postResponseInfo);
    } catch (IOException e) {
        e.printStackTrace();
        System.err.println("An I/O error occurred: " + e.getMessage());
    } catch (TesseractException e) {
        e.printStackTrace();
        System.err.println("A Tesseract OCR error occurred: " + e.getMessage());
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("An unexpected error occurred: " + e.getMessage());
    }
}

/** Returns the text unchanged, or its first 500 characters followed by "..." when longer. */
private static String previewOf(String text) {
    if (text != null && text.length() > 500) {
        return text.substring(0, 500) + "...";
    }
    return text;
}

/** Prints a diagnosis of the form submission based on the status code and the body. */
private static void reportSubmissionOutcome(PageInfo postResponseInfo) {
    switch (postResponseInfo.statusCode) {
        case HttpURLConnection.HTTP_MOVED_TEMP:
        case HttpURLConnection.HTTP_SEE_OTHER:
        case HttpURLConnection.HTTP_MOVED_PERM: {
            // A standard form submission usually answers with a redirect.
            String redirectUrl = postResponseInfo.redirectUrl;
            System.out.println("POST resulted in redirect. Location: " + redirectUrl);
            // TODO: if the redirect leads to a success page, continue crawling from there.
            // A redirect back to the form page means the submission was rejected
            // (wrong CAPTCHA or another reason); adjust this check to the real site.
            if (redirectUrl != null && redirectUrl.equals(BASE_URL)) {
                System.err.println("Submission failed, redirected back to the form page.");
                // TODO: retry (requires fetching the page and the CAPTCHA again)
            }
            break;
        }
        case HttpURLConnection.HTTP_OK: {
            System.out.println("POST returned OK (200). Analyzing response content...");
            // TODO: parse the body to decide success, e.g. look for a success marker or a
            // CAPTCHA error message. Both markers below are placeholders to be replaced.
            String html = postResponseInfo.htmlContent;
            if (html != null && html.contains("成功标志字符串")) {
                System.out.println("Form submission appears successful based on content.");
                // TODO: extract the wanted data from the response body
            } else if (html != null && html.contains("验证码错误提示字符串")) {
                System.err.println("CAPTCHA appears incorrect. Need to retry.");
                // TODO: retry (the page probably has to be re-fetched because the CAPTCHA changes)
            } else {
                System.out.println("POST returned 200, but content not clearly indicating success or failure.");
            }
            break;
        }
        default:
            System.err.println("POST request failed with status code: " + postResponseInfo.statusCode);
            break;
    }
}
/**
 * Sends one HTTP request (POST when {@code postData} is non-null, otherwise GET) without
 * following redirects, remembers the cookies set by the response, and returns the status
 * code, the redirect target (for 301/302/303) and the response body.
 *
 * <p>The body is read for 2xx responses and for 4xx responses that provide an error stream;
 * for every other status the returned content is empty. A failure while reading the body is
 * reported on stderr and whatever was read so far is returned.
 *
 * @param urlString    request URL
 * @param postData     URL-encoded request body, or {@code null} for a GET request
 * @param cookieHeader value for the {@code Cookie} request header; {@code null} or empty to send none
 * @param isAjaxPost   whether to add the AJAX request headers to a POST
 * @return the status code, redirect URL and body of the response
 * @throws IOException if the URL is malformed, the connection fails, or the request cannot be sent
 */
private PageInfo fetchPage(String urlString, String postData, String cookieHeader, boolean isAjaxPost) throws IOException {
    URL url = new URL(urlString);
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    try {
        boolean isPost = postData != null;
        conn.setRequestMethod(isPost ? "POST" : "GET");
        if (isPost) {
            conn.setDoOutput(true); // allow a request body
        }
        conn.setInstanceFollowRedirects(false);
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(20000);

        conn.setRequestProperty("User-Agent", USER_AGENT);
        if (cookieHeader != null && !cookieHeader.isEmpty()) {
            conn.setRequestProperty("Cookie", cookieHeader);
        }
        conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");

        if (isPost) {
            conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
            if (isAjaxPost) {
                conn.setRequestProperty("X-Requested-With", "XMLHttpRequest");
                conn.setRequestProperty("X-MicrosoftAjax", "Delta=true");
            }
            String origin = url.getProtocol() + "://" + url.getHost();
            conn.setRequestProperty("Referer", origin + url.getPath());
            conn.setRequestProperty("Origin", origin);

            // The body has to be written before the response is requested.
            try (OutputStream os = conn.getOutputStream()) {
                os.write(postData.getBytes(StandardCharsets.UTF_8));
            }
        }

        int statusCode = conn.getResponseCode();
        String redirectUrl = null;
        if (statusCode == HttpURLConnection.HTTP_MOVED_TEMP
                || statusCode == HttpURLConnection.HTTP_SEE_OTHER
                || statusCode == HttpURLConnection.HTTP_MOVED_PERM) {
            redirectUrl = conn.getHeaderField("Location");
        }

        // Header names are case-insensitive in HTTP, but the keys of this map are not
        // normalised, so match "Set-Cookie" ignoring case (the status line has a null key).
        for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
            if ("Set-Cookie".equalsIgnoreCase(header.getKey()) && header.getValue() != null) {
                for (String setCookie : header.getValue()) {
                    rememberCookie(setCookie);
                }
            }
        }

        PageInfo pageInfo = new PageInfo();
        pageInfo.statusCode = statusCode;
        pageInfo.redirectUrl = redirectUrl;
        pageInfo.htmlContent = readBody(conn, statusCode);
        return pageInfo;
    } finally {
        conn.disconnect();
    }
}

/**
 * Stores the {@code name=value} part of a Set-Cookie header value, replacing any previously
 * stored cookie of the same name so that stale values are not sent alongside fresh ones.
 */
private void rememberCookie(String setCookieValue) {
    if (setCookieValue == null) {
        return;
    }
    int attributesStart = setCookieValue.indexOf(';');
    String pair = (attributesStart >= 0 ? setCookieValue.substring(0, attributesStart) : setCookieValue).trim();
    if (pair.isEmpty()) {
        return;
    }
    int equalsAt = pair.indexOf('=');
    if (equalsAt > 0) {
        final String namePrefix = pair.substring(0, equalsAt + 1);
        this.cookies.removeIf(existing -> existing.startsWith(namePrefix));
    }
    this.cookies.add(pair);
}

/**
 * Reads the response body as UTF-8 text, one "\n" appended per line. Returns an empty string
 * for statuses outside 2xx/4xx or when the connection offers no stream to read.
 */
private static String readBody(HttpURLConnection conn, int statusCode) {
    boolean success = statusCode >= 200 && statusCode < 300;
    boolean clientError = statusCode >= 400 && statusCode < 500;
    StringBuilder content = new StringBuilder();
    if (!success && !clientError) {
        return content.toString();
    }
    try {
        // getErrorStream() returns null when the server sent no error body.
        InputStream raw = success ? conn.getInputStream() : conn.getErrorStream();
        if (raw != null) {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(raw, StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
        }
    } catch (IOException e) {
        System.err.println("Error reading response body for status " + statusCode + ": " + e.getMessage());
    }
    return content.toString();
}
/**
 * Collects the fields of the target form into a name/value map, in document order, and puts
 * the recognised CAPTCHA code into the CAPTCHA input.
 *
 * <p>Rules applied, following what a browser submits:
 * <ul>
 *   <li>controls without a {@code name} or with a {@code disabled} attribute are skipped;</li>
 *   <li>the {@code type} of an {@code input} is matched case-insensitively and defaults to
 *       {@code text} when absent;</li>
 *   <li>checkboxes and radio buttons are included only when {@code checked}, with
 *       {@code "on"} as the value when no {@code value} attribute is present;</li>
 *   <li>a {@code select} contributes its selected option, else its first option, else "";</li>
 *   <li>{@code submit}, {@code image}, {@code reset}, {@code button} and {@code file} inputs
 *       are not included.</li>
 * </ul>
 *
 * @param doc              parsed page containing the form
 * @param captchaInputName value of the CAPTCHA input's {@code name} attribute
 * @param captchaCode      recognised CAPTCHA text
 * @return the form fields; empty when the form cannot be found
 */
private Map<String, String> buildFormDataMap(Document doc, String captchaInputName, String captchaCode) {
    // Insertion-ordered so the POST body lists the fields in the same order as the form.
    Map<String, String> formData = new java.util.LinkedHashMap<>();
    Element form = doc.selectFirst(TARGET_FORM_SELECTOR);
    if (form == null) {
        System.err.println("Target form not found using selector: " + TARGET_FORM_SELECTOR);
        return formData;
    }

    for (Element element : form.select("input, select, textarea")) {
        String name = element.attr("name");
        if (name.isEmpty() || element.hasAttr("disabled")) {
            continue;
        }
        String tag = element.tagName().toLowerCase(java.util.Locale.ROOT);
        if ("select".equals(tag)) {
            Element chosen = element.selectFirst("option[selected]");
            if (chosen == null) {
                // No explicit selection: fall back to the first option, if any.
                chosen = element.selectFirst("option");
            }
            formData.put(name, chosen != null ? chosen.attr("value") : "");
            continue;
        }
        if ("textarea".equals(tag)) {
            formData.put(name, element.text());
            continue;
        }

        // <input>: jsoup returns "" (never null) for a missing attribute.
        String type = element.attr("type").trim().toLowerCase(java.util.Locale.ROOT);
        if (type.isEmpty()) {
            type = "text";
        }
        switch (type) {
            case "checkbox":
            case "radio":
                if (element.hasAttr("checked")) {
                    formData.put(name, element.hasAttr("value") ? element.attr("value") : "on");
                }
                break;
            case "submit":
            case "image":
            case "reset":
            case "button":
            case "file":
                // Buttons are only sent when they trigger the submission; see the TODO below.
                break;
            default:
                // text, hidden, password and the other value-carrying input types
                formData.put(name, name.equals(captchaInputName) ? captchaCode : element.attr("value"));
                break;
        }
    }
    // TODO: fields added or modified by JavaScript are not visible here and must be added manually.
    // TODO: if the server expects the clicked submit button (e.g. name="submitButton"), add its
    //       name=value pair to the returned map.
    return formData;
}
/**
 * Downloads an image, sending the cookies collected so far so that the image belongs to the
 * same server session as the form page.
 *
 * @param imageUrl absolute URL of the image
 * @return the decoded image, never {@code null}
 * @throws IOException if the server does not answer 200 or the content is not a readable image
 */
public BufferedImage downloadImage(String imageUrl) throws IOException {
    URL url = new URL(imageUrl);
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    try {
        conn.setRequestMethod("GET");
        // Same limits as fetchPage, so a stalled image request cannot hang the run.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(20000);
        conn.setRequestProperty("User-Agent", USER_AGENT);
        String cookieHeader = getCookieHeader();
        if (!cookieHeader.isEmpty()) {
            conn.setRequestProperty("Cookie", cookieHeader);
        }

        int responseCode = conn.getResponseCode();
        if (responseCode != HttpURLConnection.HTTP_OK) {
            throw new IOException("Failed to download image. HTTP error code: " + responseCode + " for URL: " + imageUrl);
        }
        try (InputStream is = conn.getInputStream()) {
            // ImageIO.read returns null when no registered reader understands the data.
            BufferedImage image = ImageIO.read(is);
            if (image == null) {
                throw new IOException("Failed to read image stream. Check image format or content for URL: " + imageUrl);
            }
            return image;
        }
    } finally {
        conn.disconnect();
    }
}
/**
 * Pre-processes a CAPTCHA image for OCR: converts it to grayscale and then to a two-level
 * image using {@code BINARY_THRESHOLD} (gray values below the threshold become black).
 *
 * <p>For debugging, the input is written to {@code original_captcha.png} and the result to
 * {@code preprocessed_captcha.png} in the working directory; a failure to write either file
 * is printed and otherwise ignored.
 *
 * @param originalImage image as downloaded
 * @return the binarised image
 */
public BufferedImage preprocessImage(BufferedImage originalImage) {
    // TODO: this basic pipeline usually needs tuning for the actual CAPTCHA style.
    try {
        ImageIO.write(originalImage, "png", new File("original_captcha.png"));
    } catch (IOException e) {
        e.printStackTrace();
    }

    int width = originalImage.getWidth();
    int height = originalImage.getHeight();

    BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY);
    java.awt.Graphics graphics = grayImage.getGraphics();
    try {
        graphics.drawImage(originalImage, 0, 0, null);
    } finally {
        graphics.dispose(); // release the graphics context promptly
    }

    BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY);
    java.awt.image.WritableRaster grayRaster = grayImage.getRaster();
    java.awt.image.WritableRaster binaryRaster = binaryImage.getRaster();
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int gray = grayRaster.getSample(x, y, 0);
            binaryRaster.setSample(x, y, 0, gray < BINARY_THRESHOLD ? 0 : 1); // 0 = black, 1 = white
        }
    }
    // TODO: more advanced steps could be added: noise and interference-line removal,
    //       character segmentation, skew correction, cropping of empty borders.

    try {
        File outputfile = new File("preprocessed_captcha.png");
        ImageIO.write(binaryImage, "png", outputfile);
        System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath());
    } catch (IOException e) {
        e.printStackTrace();
    }
    return binaryImage;
}
/**
 * Runs Tess4J OCR (language "eng") on the image and returns only the ASCII letters and digits
 * of the recognised text.
 *
 * @param image image to recognise, preferably already pre-processed
 * @return the cleaned text; an empty string when the OCR result is {@code null} or contains
 *         no letters or digits
 * @throws TesseractException if OCR fails
 */
public String recognizeCaptcha(BufferedImage image) throws TesseractException {
    Tesseract ocr = new Tesseract();

    // Point Tess4J at the tessdata directory when one is configured.
    boolean dataPathConfigured = TESSDATA_PATH != null && !TESSDATA_PATH.isEmpty();
    if (!dataPathConfigured) {
        System.err.println("WARNING: TESSDATA_PATH not set. Tess4J will try to find tessdata automatically.");
    } else {
        ocr.setDatapath(TESSDATA_PATH);
    }
    ocr.setLanguage("eng");
    // For digit-only CAPTCHAs a character whitelist may improve accuracy, e.g. the Tesseract
    // variable "tessedit_char_whitelist" = "0123456789" (check the Tess4J API for the setter).

    String recognised = ocr.doOCR(image);
    if (recognised == null) {
        return "";
    }
    // Drop whitespace, line breaks and any other non-alphanumeric noise.
    return recognised.trim().replaceAll("[^0-9a-zA-Z]", "");
}
/**
 * Serialises form fields as an {@code application/x-www-form-urlencoded} body:
 * UTF-8 URL-encoded {@code name=value} pairs joined with {@code &}, in the map's iteration
 * order. A {@code null} value is sent as an empty string.
 *
 * @param formDataMap form field names and values
 * @return the encoded request body; empty for an empty map
 * @throws IOException if the UTF-8 encoding is reported as unsupported
 */
private String buildPostData(Map<String, String> formDataMap) throws IOException {
    final String charsetName = StandardCharsets.UTF_8.name();
    StringBuilder body = new StringBuilder();
    for (Map.Entry<String, String> field : formDataMap.entrySet()) {
        // Every pair contributes at least "=", so a non-empty buffer means "not the first pair".
        if (body.length() > 0) {
            body.append("&");
        }
        String encodedName = URLEncoder.encode(field.getKey(), charsetName);
        String rawValue = field.getValue();
        String encodedValue = URLEncoder.encode(rawValue == null ? "" : rawValue, charsetName);
        body.append(encodedName).append("=").append(encodedValue);
    }
    return body.toString();
}
/**
 * Formats the stored cookies as the value of an HTTP {@code Cookie} request header:
 * the stored {@code name=value} pairs joined with {@code "; "}.
 *
 * @return the header value; an empty string when no cookie is stored
 */
private String getCookieHeader() {
    return String.join("; ", this.cookies);
}
// Helper class to hold information extracted from a page fetch
private static class PageInfo {
// HTTP status code of the response
int statusCode;
String redirectUrl; // value of the Location header when the response is a redirect, else null
String htmlContent; // response body
// Deliberately generic: no ASP.NET-specific fields are kept here
}
}

74
src/main/java/com/example/StringFieldExtractor.java

@ -0,0 +1,74 @@
package com.example;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parses a flat record of the form
 * {@code postTime:...,title:...,content:...,fileList:[url, url, ...]} and prints the four
 * fields. The record is taken from the first command-line argument when one is given;
 * otherwise a built-in sample is used.
 */
public class StringFieldExtractor {
    private static final String SAMPLE_INPUT = "postTime:05-06-2024 00:00:00,title:PT/013/2024,content:澳門大學-N21科研大樓六樓智慧城市物聯網國家重點實驗室(澳門大學)建造工程 OBRAS DE CONSTRUÇÃO DO LABORATÓRIO DE REFERÊNCIA DO ESTADO DE INTERNET DAS COISAS PARA A CIDADE INTELIGENTE (UNIVERSIDADE DE MACAU), LOCALIZADO NO 6.º ANDAR DO EDIFÍCIO DE INVESTIGAÇÃO CIENTÍFICA N21 DA UNIVERSIDADE DE MACAU,fileList:[https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-CHI.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-ENG-1.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/07/開標結果.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/11/判給結果-N21-6G.pdf###pdf]";
    private static final String FILE_LIST_MARKER = "fileList:[";
    /** A comma that is immediately followed by {@code key:}, i.e. a field boundary. */
    private static final Pattern FIELD_SEPARATOR = Pattern.compile(",(?=\\w+:)");
    /** A comma (plus optional whitespace) that is followed by the next URL. */
    private static final Pattern URL_SEPARATOR = Pattern.compile(",\\s*(?=https)");

    /**
     * Parses the record and prints {@code postTime}, {@code title}, {@code content} and
     * {@code fileList}; fields that are missing print as {@code null} (or {@code []}).
     *
     * @param args optional: {@code args[0]} is the record to parse
     */
    public static void main(String[] args) {
        String input = (args != null && args.length > 0) ? args[0] : SAMPLE_INPUT;
        try {
            String postTime = null;
            String title = null;
            String content = null;
            List<String> fileList = new ArrayList<>();

            // Step 1: cut off the fileList part first; its brackets and commas would otherwise
            // interfere with splitting the remaining fields.
            String listContent = null;
            int fileListStart = input.indexOf(FILE_LIST_MARKER);
            if (fileListStart != -1) {
                int fileListEnd = input.lastIndexOf("]");
                if (fileListEnd > fileListStart) {
                    listContent = input.substring(fileListStart + FILE_LIST_MARKER.length(), fileListEnd).trim();
                    // Keep everything before fileList, minus the separating comma if there is
                    // one (fileList may also be the very first field).
                    String head = input.substring(0, fileListStart);
                    input = head.endsWith(",") ? head.substring(0, head.length() - 1) : head;
                }
            }

            // Step 2: split into at most three key:value fields; the limit keeps commas inside
            // the last field (content) intact.
            for (String field : FIELD_SEPARATOR.split(input, 3)) {
                String[] keyValue = field.split(":", 2);
                if (keyValue.length != 2) {
                    continue;
                }
                String key = keyValue[0].trim();
                String value = keyValue[1].trim();
                switch (key) {
                    case "postTime":
                        postTime = value;
                        break;
                    case "title":
                        title = value;
                        break;
                    case "content":
                        content = value;
                        break;
                    default:
                        break;
                }
            }

            // Step 3: split the list on commas that precede a URL, so commas inside a URL
            // do not break it apart.
            if (listContent != null && !listContent.isEmpty()) {
                for (String url : URL_SEPARATOR.split(listContent)) {
                    fileList.add(url.trim());
                }
            }

            System.out.println("postTime: " + postTime);
            System.out.println("title: " + title);
            System.out.println("content: " + content);
            System.out.println("fileList: " + fileList);
        } catch (Exception e) {
            System.err.println("Parsing error: " + e.getMessage());
            e.printStackTrace();
        }
    }
}

60
src/main/java/com/example/WipoPatentsSelenium.java

@ -0,0 +1,60 @@
package com.example;
import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.util.List;
/**
 * Selenium crawler that opens a PATENTSCOPE result list for the query {@code FP:(AI)} and
 * prints publication number and title of every hit on the first {@code MAX_PAGES} pages.
 */
public class WipoPatentsSelenium {
    private static final int MAX_PAGES = 3;

    /**
     * Runs the crawl and always closes the browser.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for a page
     */
    public static void main(String[] args) throws InterruptedException {
        // Let WebDriverManager resolve and install a matching driver binary.
        WebDriverManager.chromedriver().setup();
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("https://patentscope.wipo.int/search/en/result.jsf?query=FP:(AI)");
            Thread.sleep(3000); // crude wait for the page to load

            for (int currentPage = 1; currentPage <= MAX_PAGES; currentPage++) {
                System.out.println("📄 当前第 " + currentPage + " 页:");
                for (WebElement result : driver.findElements(By.cssSelector(".resultitem"))) {
                    String title = textOf(result, ".resulttitle");
                    String pubNum = textOf(result, ".pubNumber");
                    System.out.println("🔹 " + pubNum + " - " + title);
                }

                // The last page to be read was just printed; do not load one more.
                if (currentPage == MAX_PAGES) {
                    break;
                }
                // findElements returns an empty list instead of throwing when nothing matches.
                List<WebElement> nextButtons = driver.findElements(By.cssSelector("a[title='Next']"));
                if (nextButtons.isEmpty()) {
                    System.out.println("✅ 已到最后一页或按钮未找到");
                    break;
                }
                WebElement nextButton = nextButtons.get(0);
                if (!nextButton.isDisplayed()) {
                    break;
                }
                nextButton.click();
                Thread.sleep(3000); // wait for the next page to load
            }
        } finally {
            driver.quit();
        }
    }

    /**
     * Returns the text of the first descendant matching the selector, or an empty string when
     * the row has no such element, so one incomplete row does not abort the whole crawl.
     */
    private static String textOf(WebElement parent, String cssSelector) {
        List<WebElement> matches = parent.findElements(By.cssSelector(cssSelector));
        return matches.isEmpty() ? "" : matches.get(0).getText();
    }
}

594
src/main/java/com/example/cliniTopic.java

@ -0,0 +1,594 @@
package com.example;
import com.fasterxml.jackson.databind.ObjectMapper;
import okhttp3.*;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class cliniTopic {
/** Kafka topic that receives the crawled records. */
private static final String TOPIC_NAME = "cliniTopic";
/** Kafka bootstrap server list. */
private static final String BOOTSTRAP_SERVERS = "localhost:9092";
/** Producer shared by all crawl tasks; created in the static initializer. */
private static KafkaProducer<String, String> producer;
/** JSON serializer for the crawled records. */
private static ObjectMapper objectMapper = new ObjectMapper();
private static final Random random = new Random();
private static List<String> proxyList = new ArrayList<>(); // proxy pool
private static int currentProxyIndex = 0; // index of the proxy currently in use

// Creates the Kafka producer and loads the proxy pool from proxy.txt. A missing or
// unreadable proxy file is reported and leaves the pool empty.
static {
    final String serializerClass = "org.apache.kafka.common.serialization.StringSerializer";
    Properties kafkaConfig = new Properties();
    kafkaConfig.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
    kafkaConfig.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, serializerClass);
    kafkaConfig.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serializerClass);
    kafkaConfig.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all in-sync replicas
    kafkaConfig.put(ProducerConfig.RETRIES_CONFIG, 3); // number of send retries
    producer = new KafkaProducer<>(kafkaConfig);

    try {
        List<String> loadedProxies = Files.readAllLines(Paths.get("proxy.txt"));
        proxyList = loadedProxies;
        String status = loadedProxies.isEmpty()
                ? "警告: proxy.txt 为空,未加载任何代理"
                : "成功加载 " + loadedProxies.size() + " 个代理";
        System.out.println(status);
    } catch (IOException e) {
        System.err.println("读取 proxy.txt 失败: " + e.getMessage());
    }
}
/**
 * Crawls up to seven result pages for every keyword in {@code keywords.txt} on a pool of four
 * threads and publishes each crawled record to Kafka, keyed by its registration number.
 *
 * <p>Only the text before the first comma of each line is used as the keyword; blank
 * keywords are skipped. Tasks still running after five hours are interrupted before the
 * producer is closed.
 *
 * @param args unused
 * @throws IOException          if {@code keywords.txt} cannot be read
 * @throws InterruptedException if the main thread is interrupted while waiting for the tasks
 */
public static void main(String[] args) throws IOException, InterruptedException {
    List<String> cleanedKeywords = new ArrayList<>();
    for (String line : Files.readAllLines(Paths.get("keywords.txt"))) {
        // indexOf instead of split(",")[0]: split returns an empty array for a line of commas.
        int comma = line.indexOf(',');
        String cleaned = (comma >= 0 ? line.substring(0, comma) : line).trim();
        if (!cleaned.isEmpty()) {
            cleanedKeywords.add(cleaned);
        }
    }

    ExecutorService executor = Executors.newFixedThreadPool(4);
    try {
        for (String keyword : cleanedKeywords) {
            executor.submit(() -> {
                try {
                    // One pause length per keyword: 30 to 31 seconds.
                    int sleepTime = random.nextInt(1001) + 30000;
                    for (int page = 1; page <= 7; page++) {
                        final int pageNum = page;
                        Map<String, Object> listing = list(keyword, page);
                        @SuppressWarnings("unchecked") // list() stores the result URLs as a List<String>
                        List<String> urls = (List<String>) listing.get("listUrl");
                        if (urls == null || urls.isEmpty()) {
                            System.out.println("没有关键词" + keyword + "检索结果");
                            break;
                        }
                        // Primitive ints: comparing boxed Integers with == compares references.
                        int count = Integer.parseInt(String.valueOf(listing.get("count")));
                        int totalPage = Integer.parseInt(String.valueOf(listing.get("totalPage")));
                        for (String url : urls) {
                            Map<String, Object> result = content(url);
                            Thread.sleep(sleepTime);
                            String registNum = String.valueOf(result.get("registNum"));
                            String crawlUrl = String.valueOf(result.get("crawlUrl"));
                            try {
                                String jsonValue = objectMapper.writeValueAsString(result);
                                ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
                                producer.send(record, (metadata, exception) -> {
                                    if (exception == null) {
                                        System.out.println("成功发送到Kafka - Partition: " + metadata.partition() +
                                                ", Offset: " + metadata.offset() + ", " + crawlUrl + ", " + keyword + " , " + pageNum);
                                    } else {
                                        System.err.println("发送到Kafka失败: " + exception.getMessage());
                                    }
                                });
                            } catch (Exception e) {
                                System.err.println("序列化或发送Kafka消息失败: " + e.getMessage());
                            }
                            Thread.sleep(sleepTime);
                        }
                        if (count < 10 || totalPage == page) {
                            System.out.println("关键词" + keyword + "已检索完毕");
                            break;
                        }
                    }
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the pool so shutdownNow() can stop the worker.
                    Thread.currentThread().interrupt();
                    System.err.println("处理 " + keyword + " 失败: " + e.getMessage());
                } catch (Exception e) {
                    System.err.println("处理 " + keyword + " 失败: " + e.getMessage());
                    e.printStackTrace();
                }
            });
        }
        executor.shutdown();
        if (!executor.awaitTermination(5, TimeUnit.HOURS)) {
            // Stop the remaining tasks so they do not keep using a producer that is being closed.
            System.err.println("等待任务结束超时, 正在中断剩余任务");
            executor.shutdownNow();
        }
    } finally {
        producer.close();
    }
}
// Host and browser-like header values shared by every request of the DRKS search flow.
private static final String DRKS_HOST_URL = "https://www.drks.de";
private static final String DRKS_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7";
private static final String DRKS_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";

/**
 * Runs the DRKS search for one keyword and returns the detail-page links of one result page.
 *
 * <p>The flow is: (1) GET the search form to collect cookies and the ViewState, (2) POST the
 * search form, (3) follow the 302 redirect to the results page and read its ViewState,
 * (4) POST the pagination form for the requested page and parse the returned HTML.
 *
 * @param keyword search term
 * @param page 1-based result page to fetch
 * @return a map with "listUrl" (List of detail URLs), "count" (number of links found),
 *         "keyword", and "totalPage" when the pager text matched. When the search or the
 *         pagination request does not answer as expected, an empty result ("listUrl" empty,
 *         "count" 0) is returned instead of null so callers can treat it as "no hits".
 * @throws Exception on connection or I/O failures
 */
private static Map<String,Object> list(String keyword,Integer page) throws Exception{
    String baseUrl = "https://www.drks.de/search/de";
    String cleanUrl = "https://www.drks.de/search/de/results";
    System.out.println("Pure URL: " + cleanUrl);
    System.out.println("Page Number: " + page);
    // Cookie store shared by all requests of this flow
    Set<String> cookieSet = new HashSet<>();

    // Step 1: initial GET request to obtain cookies and the ViewState
    String sessionId;
    String initialViewState;
    HttpURLConnection initialConn = openDrksConnection(baseUrl, "GET", cookieSet);
    try {
        sessionId = updateCookies(initialConn, cookieSet);
        System.out.println("Initial Cookies: " + cookieSet);
        System.out.println("Initial Session ID: " + sessionId);
        initialViewState = extractViewState(readDrksBody(initialConn, false));
    } finally {
        initialConn.disconnect();
    }
    System.out.println("Initial ViewState: " + initialViewState);

    // Step 2: send the search POST request
    int searchResponseCode;
    String redirectUrl;
    HttpURLConnection searchConn = openDrksConnection(baseUrl, "POST", cookieSet);
    try {
        sendDrksForm(searchConn, baseUrl, buildSearchPostData(initialViewState, keyword));
        String searchSessionId = updateCookies(searchConn, cookieSet);
        System.out.println("Search Cookies: " + cookieSet);
        System.out.println("Search Session ID: " + searchSessionId);
        searchResponseCode = searchConn.getResponseCode();
        System.out.println("Search Response Code: " + searchResponseCode);
        redirectUrl = searchConn.getHeaderField("Location");
    } finally {
        searchConn.disconnect();
    }
    if (searchResponseCode != 302 || redirectUrl == null) {
        System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode);
        return emptyDrksResult(keyword);
    }
    System.out.println("Redirect URL (raw): " + redirectUrl);
    redirectUrl = resolveDrksUrl(redirectUrl);
    System.out.println("Resolved Redirect URL: " + redirectUrl);

    // Step 3: follow the redirect with a GET request and read the results-page ViewState
    String viewState;
    String resultsSessionId;
    HttpURLConnection resultsConn = openDrksConnection(redirectUrl, "GET", cookieSet);
    try {
        resultsSessionId = updateCookies(resultsConn, cookieSet);
        System.out.println("Results Cookies: " + cookieSet);
        System.out.println("Results Session ID: " + resultsSessionId);
        viewState = extractViewState(readDrksBody(resultsConn, false));
    } finally {
        resultsConn.disconnect();
    }
    System.out.println("Results ViewState: " + viewState);
    // Warn when the session cookie seen here differs from the initial one
    if (sessionId != null && !sessionId.equals(resultsSessionId)) {
        System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId);
    }

    // Step 4: send the pagination request using POST, imitating a regular browser navigation
    Document parse;
    HttpURLConnection postConn = openDrksConnection(cleanUrl, "POST", cookieSet);
    try {
        postConn.setRequestProperty("Sec-Fetch-Dest", "document");
        postConn.setRequestProperty("Sec-Fetch-Mode", "navigate");
        sendDrksForm(postConn, cleanUrl, buildPostData(viewState, page));
        // The pagination response may carry new Set-Cookie headers
        String paginationSessionId = updateCookies(postConn, cookieSet);
        System.out.println("Pagination Cookies: " + cookieSet);
        System.out.println("Pagination Session ID: " + paginationSessionId);
        int responseCode = postConn.getResponseCode();
        System.out.println("Pagination Response Code: " + responseCode);
        String postContent = readDrksBody(postConn, responseCode >= 400);
        if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP
                || responseCode == HttpURLConnection.HTTP_MOVED_PERM
                || responseCode == HttpURLConnection.HTTP_SEE_OTHER) {
            String newUrl = postConn.getHeaderField("Location");
            System.out.println("Pagination Redirecting to: " + newUrl);
            if (newUrl == null) {
                System.err.println("Pagination redirect without Location header");
                return emptyDrksResult(keyword);
            }
            HttpURLConnection followConn = openDrksConnection(resolveDrksUrl(newUrl), "GET", cookieSet);
            // The original follow-up request used the default redirect handling; keep that.
            followConn.setInstanceFollowRedirects(true);
            try {
                parse = Jsoup.parse(readDrksBody(followConn, false));
            } finally {
                followConn.disconnect();
            }
        } else if (responseCode == 200) {
            parse = Jsoup.parse(postContent);
        } else {
            System.err.println("Unexpected Pagination Response Code: " + responseCode);
            return emptyDrksResult(keyword);
        }
    } finally {
        postConn.disconnect();
    }

    Map<String,Object> map = new HashMap<>();
    List<String> listUrl = new ArrayList<>();
    for (Element link : parse.select("div[data-label='Titel der Studie'] a")) {
        listUrl.add(resolveDrksUrl(link.attr("href")));
    }
    String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text();
    // Extract the number between "Seite" and "/" from the pager text.
    // NOTE(review): that number reads like the current page rather than the page total,
    // yet it is stored as "totalPage" — confirm against the live pager markup.
    Matcher matcher = Pattern.compile("Seite\\s*(\\d+)\\s*/").matcher(text);
    if (matcher.find()) {
        map.put("totalPage",matcher.group(1));
    }
    map.put("listUrl",listUrl);
    map.put("count",Integer.valueOf(listUrl.size()));
    map.put("keyword",keyword);
    return map;
}

// Opens a connection with the timeouts, browser-like headers and cookies every step shares.
private static HttpURLConnection openDrksConnection(String url, String method, Set<String> cookies) throws IOException {
    HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
    conn.setRequestMethod(method);
    conn.setInstanceFollowRedirects(false);
    conn.setConnectTimeout(10000);
    conn.setReadTimeout(10000);
    conn.setRequestProperty("Accept", DRKS_ACCEPT);
    conn.setRequestProperty("User-Agent", DRKS_USER_AGENT);
    if (!cookies.isEmpty()) {
        conn.setRequestProperty("Cookie", String.join("; ", cookies));
    }
    return conn;
}

// Adds the form-post headers to conn and writes the URL-encoded form body as UTF-8.
private static void sendDrksForm(HttpURLConnection conn, String referer, String form) throws IOException {
    conn.setDoOutput(true);
    conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
    conn.setRequestProperty("Origin", DRKS_HOST_URL);
    conn.setRequestProperty("Referer", referer);
    try (OutputStream os = conn.getOutputStream()) {
        os.write(form.getBytes(StandardCharsets.UTF_8));
    }
}

// Reads the response (or error) body as UTF-8, lines joined without separators; "" when there is no stream.
private static String readDrksBody(HttpURLConnection conn, boolean fromErrorStream) throws IOException {
    java.io.InputStream stream = fromErrorStream ? conn.getErrorStream() : conn.getInputStream();
    if (stream == null) {
        return "";
    }
    StringBuilder body = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            body.append(line);
        }
    }
    return body.toString();
}

// Turns a relative Location/href into an absolute URL on the DRKS host; absolute URLs pass through.
private static String resolveDrksUrl(String location) {
    if (location.startsWith("http")) {
        return location;
    }
    return DRKS_HOST_URL + (location.startsWith("/") ? location : "/" + location);
}

// Result returned when a step of the flow fails: no links, count 0.
private static Map<String,Object> emptyDrksResult(String keyword) {
    Map<String,Object> empty = new HashMap<>();
    empty.put("listUrl", new ArrayList<String>());
    empty.put("count", Integer.valueOf(0));
    empty.put("keyword", keyword);
    return empty;
}
/**
 * Merges the Set-Cookie headers of a response into the cookie store.
 *
 * <p>Each cookie is stored as its "name=value" pair (attributes after the first ';' are
 * dropped). A cookie that is issued again replaces the stored entry with the same name, so
 * a later Cookie request header never carries two values for one name.
 *
 * @param conn connection whose response headers are read (reading them performs the request)
 * @param cookieSet mutable store of "name=value" pairs, updated in place
 * @return the last "JSESSIONID=..." or "csfcfc=..." pair seen in this response, or null if none
 */
private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) {
    String sessionId = null;
    Map<String, List<String>> headerFields = conn.getHeaderFields();
    if (headerFields == null) {
        return null;
    }
    for (Map.Entry<String, List<String>> header : headerFields.entrySet()) {
        // Header names are case-insensitive; the status line is stored under a null key.
        if (header.getKey() == null || !"Set-Cookie".equalsIgnoreCase(header.getKey()) || header.getValue() == null) {
            continue;
        }
        for (String cookie : header.getValue()) {
            if (cookie == null) {
                continue;
            }
            int attributesStart = cookie.indexOf(';');
            String cookieValue = (attributesStart >= 0 ? cookie.substring(0, attributesStart) : cookie).trim();
            if (cookieValue.isEmpty()) {
                continue;
            }
            int equalsAt = cookieValue.indexOf('=');
            if (equalsAt > 0) {
                // Drop the stale value of a re-issued cookie before storing the new one.
                String namePrefix = cookieValue.substring(0, equalsAt + 1);
                cookieSet.removeIf(existing -> existing.startsWith(namePrefix));
            }
            cookieSet.add(cookieValue);
            if (cookieValue.startsWith("JSESSIONID=") || cookieValue.startsWith("csfcfc=")) {
                sessionId = cookieValue;
            }
        }
    }
    return sessionId;
}
/**
 * Returns the value of the faces ViewState hidden input found in the given HTML.
 * Both the jakarta.faces.ViewState and the javax.faces.ViewState field names are accepted.
 *
 * @param html page markup; may be null or empty
 * @return the value attribute of the first matching input, or "" when the markup is
 *         null/empty or contains no such input (a message is written to stderr in both cases)
 */
private static String extractViewState(String html) {
    boolean hasMarkup = html != null && !html.isEmpty();
    if (!hasMarkup) {
        System.err.println("HTML content is empty or null");
        return "";
    }
    Matcher viewStateInput = Pattern
            .compile("<input[^>]*name=[\"'](?:jakarta|javax)\\.faces\\.ViewState[\"'][^>]*value=[\"']([^\"']+)[\"']")
            .matcher(html);
    if (!viewStateInput.find()) {
        System.err.println("Failed to extract ViewState from HTML");
        return "";
    }
    return viewStateInput.group(1);
}
// One client for all detail-page requests, so connections and threads are reused
// instead of a fresh pool being created for every page.
private static final OkHttpClient DETAIL_CLIENT = new OkHttpClient();

/**
 * Downloads one trial detail page and extracts its fields into the flat record that is
 * published to Kafka.
 *
 * <p>The fields are located with position-based CSS selectors, so a field whose selector
 * does not match is stored as an empty string.
 *
 * @param url absolute URL of the detail page
 * @return the record; fields that are not extracted from the page are filled with fixed values
 * @throws IOException if the request fails or the server answers with a non-2xx status
 * @throws Exception on any other failure while fetching or parsing
 */
private static Map<String,Object> content(String url)throws Exception{
    Request request = new Request.Builder()
            .url(url)
            .get()
            .addHeader("Content-Type", "application/json")
            .build();
    String html;
    try (Response response = DETAIL_CLIENT.newCall(request).execute()) {
        // An error page would otherwise be parsed into an all-empty record.
        if (!response.isSuccessful() || response.body() == null) {
            throw new IOException("Unexpected response " + response.code() + " for " + url);
        }
        html = response.body().string();
    }
    // The second argument of Jsoup.parse is the base URI of the document, not a charset name.
    Document parse = Jsoup.parse(html, url);
    String title = parse.select(".title-bold").text();
    String registNum = parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(2)").text();
    String registTime = convertDate(parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(6)").text());
    String header = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-header > h4").text();
    String site = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(2) > div").text();
    String telefon = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(4) > span").text();
    String disease = parse.select("body > main > div.card-body > div:nth-child(6) > div.card-body > div > div:nth-child(2) > dl > dd:nth-child(2) > span").text();
    String studyType = parse.select("body > main > div.card-body > div:nth-child(3) > div.card-body > dl").text();
    String inclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(3) > div > div.card-body > div > div.col-12.mt-3 > dl > dd > span").text();
    String exclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(4) > div > div.card-body > p > span").text();
    String country = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(1) > div > div.card-body > dl > dd:nth-child(2)").text();
    String intervention = parse.select("body > main > div.card-body > div:nth-child(4) > div.card-body > dl").text();
    String primaryOutcome = parse.select("body > main > div.card-body > div:nth-child(5) > div.card-body > div > div > dl").text();
    String enrollment = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(2) > div > div.card-body > div > div:nth-child(5) > dl > dd > span").text();

    // Sponsor block: heading, address and phone number
    Map<String,Object> sponsor = new HashMap<>();
    sponsor.put("header",header);
    sponsor.put("site",site);
    sponsor.put("telefon",telefon);

    Map<String,Object> resultData = new HashMap<>();
    resultData.put("title", title);
    resultData.put("registNum",registNum);
    resultData.put("registTime",registTime);
    resultData.put("registStatus","");
    resultData.put("registTitle","");
    resultData.put("fullTitle","");
    resultData.put("sponsor",sponsor);
    resultData.put("sponsorPart","");
    resultData.put("studyType",studyType);
    resultData.put("phase","");
    resultData.put("disease",disease);
    resultData.put("studyDesign","");
    resultData.put("studyObjective","");
    resultData.put("studyStartDate","");
    resultData.put("inclusionCriteria",inclusionCriteria);
    resultData.put("exclusionCriteria",exclusionCriteria);
    resultData.put("currentStatus","");
    resultData.put("enrollment",enrollment);
    resultData.put("country",country);
    resultData.put("tagTime","");
    resultData.put("intervention",intervention);
    resultData.put("primaryOutcome",primaryOutcome);
    resultData.put("crawlTime",getCurrentTime());
    resultData.put("crawlUrl",url);
    resultData.put("postTime",registTime);
    resultData.put("content","content");
    resultData.put("forwardcontent","forwardcontent");
    resultData.put("cid","Ndrks");
    return resultData;
}
/**
 * Builds the URL-encoded form body of the search request.
 *
 * <p>Only the free-text field and the ViewState carry values; every extended-search field
 * is submitted empty.
 *
 * @param viewState ViewState taken from the search form page
 * @param keyword search term; URL-encoded here, null is sent as an empty term
 * @return the form body, or "" if encoding fails (for example when viewState is null)
 */
private static String buildSearchPostData(String viewState,String keyword) {
    try {
        String utf8 = StandardCharsets.UTF_8.name();
        // The keyword is user data: without encoding, spaces, '&', '=' or umlauts corrupt the form body.
        String encodedKeyword = URLEncoder.encode(keyword == null ? "" : keyword, utf8);
        // NOTE(review): extractViewState also accepts jakarta.faces.ViewState, but the value is
        // always posted as javax.faces.ViewState — confirm which name the server expects.
        return "searchForm=searchForm" +
                "&searchForm%3Aj_idt80=" + encodedKeyword +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" +
                "&searchForm%3Aj_idt287=" +
                "&javax.faces.ViewState=" + URLEncoder.encode(viewState, utf8);
    } catch (Exception e) {
        System.err.println("Error encoding search ViewState: " + e.getMessage());
        return "";
    }
}
/**
 * Assembles the URL-encoded form body of the pagination request.
 *
 * @param viewState ViewState read from the results page; URL-encoded here
 * @param page 1-based page number; the pager field name carries page - 1, its value carries page
 * @return the form body, or "" if encoding fails (the failure is reported on stderr)
 */
private static String buildPostData(String viewState, int page) {
    try {
        String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name());
        StringBuilder form = new StringBuilder("resultForm=resultForm");
        form.append("&resultForm%3Asorting%3ArowsPerPage=10");
        form.append("&resultForm%3ApaginationTop%3Aj_idt156%3A").append(page - 1).append("%3Aj_idt158=").append(page);
        form.append("&resultForm%3Asorting%3AsortingBy=SCORE");
        form.append("&resultForm%3Asorting%3Aj_idt141=true");
        form.append("&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation");
        form.append("&selectedType=JSON");
        form.append("&javax.faces.ViewState=").append(encodedViewState);
        return form.toString();
    } catch (Exception e) {
        System.err.println("Error encoding pagination ViewState: " + e.getMessage());
        return "";
    }
}
/**
 * Converts a date written as day.month.year (for example "05.03.2021") into
 * "yyyy-MM-dd HH:mm:ss" at midnight of that day.
 *
 * <p>Parsing is strict: a date that does not exist in the calendar (such as 31.02.2020) is
 * rejected instead of being rolled over into the next month. Surrounding whitespace and any
 * text after the date are ignored.
 *
 * @param inputDate date text; may be null
 * @return the reformatted date, or the literal "Invalid date format" when the input is null
 *         or does not start with a valid date
 */
public static String convertDate(String inputDate) {
    if (inputDate == null) {
        return "Invalid date format";
    }
    try {
        // "uuuu" rather than "yyyy": strict resolution of year-of-era would also require an era.
        DateTimeFormatter inputFormat = DateTimeFormatter.ofPattern("d.M.uuuu")
                .withResolverStyle(java.time.format.ResolverStyle.STRICT);
        // The ParsePosition overload parses a prefix, so trailing text after the date is tolerated.
        LocalDate date = LocalDate.from(inputFormat.parse(inputDate.trim(), new java.text.ParsePosition(0)));
        return date.atStartOfDay().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
    } catch (java.time.DateTimeException | IndexOutOfBoundsException e) {
        return "Invalid date format";
    }
}
/**
 * Returns the current local date and time formatted as "yyyy-MM-dd HH:mm:ss".
 *
 * @return the formatted timestamp
 */
public static String getCurrentTime() {
    return LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
}
/**
 * Executes the request and, while the server answers 403, switches to the next proxy and
 * tries again with a freshly built client.
 *
 * @param client client used for the first attempt
 * @param request request to execute
 * @param keyword not used by this method
 * @return the first response whose status is not 403
 * @throws IOException if the call fails, or if every attempt (one per proxy, a single one
 *         when the pool is empty) was answered with 403
 */
private static Response executeWithRetry(OkHttpClient client, Request request, String keyword) throws IOException {
    // One attempt per configured proxy; a single attempt when there is no proxy at all.
    int attemptBudget = proxyList.isEmpty() ? 1 : proxyList.size();
    for (int attemptsMade = 1; attemptsMade <= attemptBudget; attemptsMade++) {
        Response response = client.newCall(request).execute();
        if (response.code() != 403) {
            return response; // success, or any status other than 403
        }
        System.out.println("收到 403 状态码,尝试切换代理重试...");
        response.close();
        switchProxy();
        client = createClientWithProxy(); // rebuild the client on the new proxy
        if (attemptsMade == attemptBudget) {
            throw new IOException("所有代理尝试失败,仍然收到 403");
        }
    }
    throw new IOException("无法执行请求,未获取响应");
}
/**
 * Builds an OkHttp client with 30-second connect/read/write timeouts that routes through the
 * currently selected proxy.
 *
 * <p>Proxy entries are expected as "host:port". A malformed entry (wrong number of parts,
 * non-numeric or out-of-range port) is reported and the client is built without a proxy.
 *
 * @return the configured client
 */
private static OkHttpClient createClientWithProxy() {
    OkHttpClient.Builder builder = new OkHttpClient().newBuilder()
            .connectTimeout(30, TimeUnit.SECONDS)
            .readTimeout(30, TimeUnit.SECONDS)
            .writeTimeout(30, TimeUnit.SECONDS);
    // Read the index once so the bounds check and the lookup use the same value even if
    // switchProxy() runs on another thread in between.
    int index = currentProxyIndex;
    if (!proxyList.isEmpty() && index < proxyList.size()) {
        String proxy = proxyList.get(index).trim();
        String[] proxyParts = proxy.split(":");
        if (proxyParts.length == 2) {
            try {
                String proxyHost = proxyParts[0].trim();
                int proxyPort = Integer.parseInt(proxyParts[1].trim());
                builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP,
                        new java.net.InetSocketAddress(proxyHost, proxyPort)));
                System.out.println("使用代理: " + proxy);
            } catch (IllegalArgumentException e) {
                // Covers NumberFormatException (bad port text) and an out-of-range port.
                System.err.println("Ignoring malformed proxy entry: " + proxy);
            }
        } else {
            System.err.println("Ignoring malformed proxy entry: " + proxy);
        }
    }
    return builder.build();
}
// Advances the proxy index to the next pool entry, wrapping around at the end; no-op for an empty pool.
private static synchronized void switchProxy() {
    if (!proxyList.isEmpty()) {
        int nextIndex = currentProxyIndex + 1;
        currentProxyIndex = nextIndex % proxyList.size();
        System.out.println("切换到新代理: " + proxyList.get(currentProxyIndex));
    }
}
/**
 * Returns the '|'-separated payload with its fourth-from-last field increased by 30.
 *
 * <p>Trailing '|' characters are not counted as fields (as before) but are now kept in the
 * result, so a payload that ends with a delimiter is returned with that delimiter intact.
 *
 * @param originalPayload '|'-separated payload
 * @return the payload with the offset field replaced by its value plus 30
 * @throws IllegalArgumentException if the payload has fewer than four fields or the target
 *         field is not an integer
 */
public static String increaseOffsetBy30(String originalPayload) {
    // split() drops trailing empty fields; remember the trailing delimiters to restore them.
    int contentEnd = originalPayload.length();
    while (contentEnd > 0 && originalPayload.charAt(contentEnd - 1) == '|') {
        contentEnd--;
    }
    String trailingDelimiters = originalPayload.substring(contentEnd);
    String[] parts = originalPayload.split("\\|");
    if (parts.length < 4) {
        throw new IllegalArgumentException("载荷格式无效,元素不足");
    }
    // Position of the fourth field from the end
    int targetIndex = parts.length - 4;
    try {
        int currentOffset = Integer.parseInt(parts[targetIndex]);
        parts[targetIndex] = String.valueOf(currentOffset + 30);
        return String.join("|", parts) + trailingDelimiters;
    } catch (NumberFormatException e) {
        throw new IllegalArgumentException("倒数第 4 个元素不是有效数字: " + parts[targetIndex], e);
    }
}
}

438
src/main/java/com/example/drks.java

@ -0,0 +1,438 @@
package com.example;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class drks {
/**
 * Stand-alone debugging run of the DRKS search flow for the page named in targetUrl.
 * It performs the initial GET, the search POST, the redirect GET and the pagination POST,
 * printing cookies, ViewState values, response codes and the links found along the way.
 */
public static void main(String[] args) throws Exception {
String targetUrl = "https://www.drks.de/search/de/results?page=4";
String baseUrl = "https://www.drks.de/search/de";
String hostUrl = "https://www.drks.de";
// URL without the query string; used as the pagination POST target
String cleanUrl = targetUrl.split("\\?")[0];
System.out.println("Pure URL: " + cleanUrl);
// Page number taken from the "page=" query parameter, defaulting to 1
String pageNumber = targetUrl.contains("?page=") ? targetUrl.split("page=")[1] : "1";
int page = Integer.parseInt(pageNumber);
System.out.println("Page Number: " + page);
// Cookie store shared by all requests
Set<String> cookieSet = new HashSet<>();
String sessionId = null;
// Step 1: initial GET request to obtain cookies and the ViewState
System.out.println("\n--- Step 1: Initial GET Request ---");
URL initialUrl = new URL(baseUrl);
HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection();
initialConn.setRequestMethod("GET");
initialConn.setInstanceFollowRedirects(false);
initialConn.setConnectTimeout(10000);
initialConn.setReadTimeout(10000);
initialConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
// Capture cookies
sessionId = updateCookies(initialConn, cookieSet);
System.out.println("Initial Cookies: " + cookieSet);
System.out.println("Initial Session ID: " + sessionId);
// Read the response body to obtain the ViewState
BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream()));
StringBuilder content = new StringBuilder();
String inputLine;
while ((inputLine = in.readLine()) != null) {
content.append(inputLine);
}
in.close();
initialConn.disconnect();
// Extract the initial ViewState
String initialViewState = extractViewState(content.toString());
System.out.println("Initial ViewState: " + initialViewState);
// Step 2: send the search POST request
System.out.println("\n--- Step 2: Search POST Request ---");
HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection();
searchConn.setRequestMethod("POST");
searchConn.setInstanceFollowRedirects(false);
searchConn.setDoOutput(true);
searchConn.setConnectTimeout(10000);
searchConn.setReadTimeout(10000);
// Request headers for the search request
searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
searchConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
searchConn.setRequestProperty("Origin", "https://www.drks.de");
searchConn.setRequestProperty("Referer", baseUrl);
searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
// Build the POST data for the search request
String searchPostData = buildSearchPostData(initialViewState);
System.out.println("Search POST Data: " + searchPostData);
// Send the search POST request
try (OutputStream os = searchConn.getOutputStream()) {
byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8);
os.write(input, 0, input.length);
}
// Update cookies
String searchSessionId = updateCookies(searchConn, cookieSet);
System.out.println("Search Cookies: " + cookieSet);
System.out.println("Search Session ID: " + searchSessionId); // This is null in your output, which is a potential issue
// Handle the search response
int searchResponseCode = searchConn.getResponseCode();
System.out.println("Search Response Code: " + searchResponseCode);
if (searchResponseCode == 302) {
String redirectUrl = searchConn.getHeaderField("Location");
searchConn.disconnect();
if (redirectUrl == null) {
System.err.println("Search request returned 302 but no Location header found.");
return;
}
System.out.println("Redirect URL (raw): " + redirectUrl);
// Resolve a relative URL against the host
if (!redirectUrl.startsWith("http")) {
redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? redirectUrl : "/" + redirectUrl);
}
System.out.println("Resolved Redirect URL: " + redirectUrl);
// Step 3: follow the redirect with a GET request
System.out.println("\n--- Step 3: Follow Redirect (GET Request) ---");
URL resultsUrl = new URL(redirectUrl);
HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection();
resultsConn.setRequestMethod("GET");
resultsConn.setInstanceFollowRedirects(false);
resultsConn.setConnectTimeout(10000);
resultsConn.setReadTimeout(10000);
resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
// Update cookies
String resultsSessionId = updateCookies(resultsConn, cookieSet);
System.out.println("Results Cookies: " + cookieSet);
System.out.println("Results Session ID: " + resultsSessionId);
// Read the results page reached through the redirect
BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream()));
StringBuilder resultsContent = new StringBuilder();
while ((inputLine = resultsReader.readLine()) != null) {
resultsContent.append(inputLine);
}
resultsReader.close();
resultsConn.disconnect();
// Extract the ViewState from the page for use in the following request
String viewState = extractViewState(resultsContent.toString());
System.out.println("Results ViewState: " + viewState);
// Warn when the session ID differs from the initial one, i.e. the session may have been reset
if (sessionId != null && !sessionId.equals(resultsSessionId)) {
System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId);
}
// Step 4: send the pagination request using POST
System.out.println("\n--- Step 4: Pagination POST Request ---");
HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection();
postConn.setRequestMethod("POST");
postConn.setInstanceFollowRedirects(false);
postConn.setDoOutput(true);
postConn.setConnectTimeout(10000);
postConn.setReadTimeout(10000);
// Request headers for the pagination request, imitating a regular browser navigation
postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
postConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
postConn.setRequestProperty("Origin", "https://www.drks.de");
postConn.setRequestProperty("Referer", cleanUrl);
postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
postConn.setRequestProperty("Sec-Fetch-Dest", "document");
postConn.setRequestProperty("Sec-Fetch-Mode", "navigate");
// Build the pagination POST parameters, including the page number and the ViewState
String postData = buildPostData(viewState, page);
System.out.println("Pagination POST Data: " + postData);
// Send the pagination POST request
try (OutputStream os = postConn.getOutputStream()) {
byte[] input = postData.getBytes(StandardCharsets.UTF_8);
os.write(input, 0, input.length);
}
// Update cookies; the pagination response may return new Set-Cookie headers
String paginationSessionId = updateCookies(postConn, cookieSet);
System.out.println("Pagination Cookies: " + cookieSet);
System.out.println("Pagination Session ID: " + paginationSessionId);
// Handle the pagination response
int responseCode = postConn.getResponseCode();
System.out.println("Pagination Response Code: " + responseCode);
// Read and process the pagination response
StringBuilder postContent = new StringBuilder();
try (BufferedReader postReader = new BufferedReader(
new InputStreamReader(
responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream()))) {
while ((inputLine = postReader.readLine()) != null) {
postContent.append(inputLine);
}
}
// Parsed result page; set by the redirect branch or the 200 branch below
Document parse = null;
if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP
|| responseCode == HttpURLConnection.HTTP_MOVED_PERM
|| responseCode == HttpURLConnection.HTTP_SEE_OTHER) {
String newUrl = postConn.getHeaderField("Location");
System.out.println("Pagination Redirecting to: " + newUrl);
// Resolve a relative redirect address into a full URL
if (!newUrl.startsWith("http")) {
newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl);
}
// Follow the redirect
URL redirectConnUrl = new URL(newUrl);
HttpURLConnection followConn = (HttpURLConnection) redirectConnUrl.openConnection();
followConn.setRequestMethod("GET");
followConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream()));
StringBuilder redirectContent = new StringBuilder();
while ((inputLine = redirectReader.readLine()) != null) {
redirectContent.append(inputLine);
}
redirectReader.close();
followConn.disconnect();
System.out.println("Redirect Response: " + redirectContent);
parse = Jsoup.parse(String.valueOf(redirectContent));
} else if (responseCode == 200) {
System.out.println("Pagination Response: " + postContent);
parse = Jsoup.parse(String.valueOf(postContent));
} else {
System.err.println("Unexpected Pagination Response Code: " + responseCode);
// Optionally read and print error stream for non-200/3xx codes
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(postConn.getErrorStream()))) {
String errorLine;
System.err.println("Error Stream:");
while ((errorLine = errorReader.readLine()) != null) {
System.err.println(errorLine);
}
} catch (Exception e) {
System.err.println("Could not read error stream: " + e.getMessage());
}
return; // Exit if pagination fails unexpectedly
}
// Print the link target and title of every study in the result list
Elements links = parse.select("div[data-label='Titel der Studie'] a");
for (Element link : links) {
String href = link.attr("href");
String text = link.text();
System.out.println("链接: " + href);
System.out.println("标题: " + text);
}
String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text();
// Use a regular expression to extract the number between "Seite" and "/"
String regex = "Seite\\s*(\\d+)\\s*/";
Matcher matcher = Pattern.compile(regex).matcher(text);
if (matcher.find()) {
System.out.println("总共有"+matcher.group(1));// first capture group, i.e. the matched number
}
postConn.disconnect();
} else if (searchResponseCode == 200) {
System.out.println("Search request returned 200 OK. Reading response body:");
// Read and print the response body for debugging
try (BufferedReader searchReader = new BufferedReader(new InputStreamReader(searchConn.getInputStream()))) {
String line;
StringBuilder searchResponseBody = new StringBuilder();
while ((line = searchReader.readLine()) != null) {
searchResponseBody.append(line).append("\n");
}
System.out.println("Search Response Body:\n" + searchResponseBody.toString());
} catch (Exception e) {
System.err.println("Could not read search response body: " + e.getMessage());
} finally {
searchConn.disconnect();
}
System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode);
System.err.println("The website's search mechanism may have changed.");
} else {
// Handle other unexpected response codes for the search request
System.err.println("Unexpected Search Response Code: " + searchResponseCode);
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(searchConn.getErrorStream()))) {
String errorLine;
System.err.println("Error Stream:");
while ((errorLine = errorReader.readLine()) != null) {
System.err.println(errorLine);
}
} catch (Exception e) {
System.err.println("Could not read error stream for search response: " + e.getMessage());
}
searchConn.disconnect();
}
}
// Updates the cookie set from the connection's Set-Cookie headers and returns the session cookie (JSESSIONID preferred).
/**
 * Merges the {@code Set-Cookie} response headers of {@code conn} into {@code cookieSet}.
 *
 * <p>Only the leading {@code name=value} pair of each header is kept; attributes such as
 * {@code Path} or {@code HttpOnly} are dropped. When a cookie with the same name is already
 * in the set, the old entry is replaced so that a later {@code Cookie} request header does
 * not carry both the stale and the fresh value.
 *
 * @param conn      connection whose response headers are read
 * @param cookieSet mutable set of {@code name=value} strings, updated in place
 * @return the {@code JSESSIONID=...} pair if the response set one; otherwise the
 *         {@code csfcfc=...} pair if present; otherwise {@code null}
 */
private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) {
    String sessionId = null;
    Map<String, List<String>> headerFields = conn.getHeaderFields();
    if (headerFields == null) {
        return null;
    }
    for (Map.Entry<String, List<String>> header : headerFields.entrySet()) {
        // A plain get("Set-Cookie") is a case-sensitive map lookup and would miss a
        // header delivered as "set-cookie", so every key is compared ignoring case.
        String headerName = header.getKey();
        if (headerName == null || !"Set-Cookie".equalsIgnoreCase(headerName) || header.getValue() == null) {
            continue;
        }
        for (String cookie : header.getValue()) {
            if (cookie == null) {
                continue;
            }
            int attributesStart = cookie.indexOf(';');
            String cookieValue = (attributesStart >= 0 ? cookie.substring(0, attributesStart) : cookie).trim();
            if (cookieValue.isEmpty()) {
                continue;
            }
            int equalsAt = cookieValue.indexOf('=');
            if (equalsAt >= 0) {
                // Drop any previous value of the same cookie before storing the new one.
                String namePrefix = cookieValue.substring(0, equalsAt + 1);
                cookieSet.removeIf(existing -> existing.startsWith(namePrefix));
            }
            cookieSet.add(cookieValue);
            // JSESSIONID always wins; csfcfc is only used while no session id was seen yet.
            if (cookieValue.startsWith("JSESSIONID=")) {
                sessionId = cookieValue;
            } else if (cookieValue.startsWith("csfcfc=") && sessionId == null) {
                sessionId = cookieValue;
            }
        }
    }
    return sessionId;
}
// Extracts the value of the JSF ViewState hidden field (jakarta.faces.ViewState, falling back to javax.faces.ViewState).
/**
 * Extracts the JSF ViewState token from a page.
 *
 * <p>Lookup order: a regex for {@code name="jakarta.faces.ViewState" ... value="..."},
 * the same regex for {@code javax.faces.ViewState}, then a looser string search for each
 * field name followed by the next {@code value="..."} attribute.
 *
 * @param html page markup to search; {@code null} is treated like an empty page
 * @return the raw (not URL-encoded) ViewState value, or an empty string when none is found
 */
private static String extractViewState(String html) {
    if (html == null || html.isEmpty()) {
        System.err.println("Failed to extract ViewState from HTML");
        return "";
    }
    final String[] fieldNames = {"jakarta.faces.ViewState", "javax.faces.ViewState"};
    final String valueAttribute = "value=\"";

    // Strict pass: name attribute followed by a non-empty value attribute inside the same tag.
    for (String fieldName : fieldNames) {
        Matcher matcher = Pattern
                .compile("name=\"" + Pattern.quote(fieldName) + "\"[^>]*value=\"([^\"]+)\"")
                .matcher(html);
        if (matcher.find()) {
            return matcher.group(1);
        }
    }

    // Loose pass (less reliable): first value="..." that appears after the field name.
    for (String fieldName : fieldNames) {
        int nameIndex = html.indexOf(fieldName);
        if (nameIndex == -1) {
            continue;
        }
        // Check for "not found" BEFORE adding the attribute length; otherwise -1 turns
        // into a valid-looking offset and an unrelated substring is returned.
        int attributeIndex = html.indexOf(valueAttribute, nameIndex);
        if (attributeIndex == -1) {
            continue;
        }
        int valueStart = attributeIndex + valueAttribute.length();
        int valueEnd = html.indexOf('"', valueStart);
        if (valueEnd != -1) {
            return html.substring(valueStart, valueEnd);
        }
    }
    System.err.println("Failed to extract ViewState from HTML");
    return ""; // Return empty string if not found
}
// Builds the POST data for the search request.
/**
 * Builds the form body of the search request using the default search term "Midwifery".
 *
 * @param viewState raw ViewState token taken from the search page
 * @return the URL-encoded form body, or an empty string if encoding fails
 */
private static String buildSearchPostData(String viewState) {
    return buildSearchPostData(viewState, "Midwifery");
}

/**
 * Builds the form body of the search request for an arbitrary search term.
 *
 * <p>All extended-search fields are submitted empty; only the free-text field
 * ({@code searchForm:j_idt80}) and the ViewState carry values. Both are URL-encoded.
 *
 * @param viewState  raw ViewState token taken from the search page
 * @param searchTerm free-text search term; {@code null} is sent as an empty term
 * @return the URL-encoded form body, or an empty string if encoding fails
 *         (for example when {@code viewState} is {@code null})
 */
private static String buildSearchPostData(String viewState, String searchTerm) {
    try {
        String utf8 = StandardCharsets.UTF_8.name();
        String encodedViewState = URLEncoder.encode(viewState, utf8);
        String encodedTerm = URLEncoder.encode(searchTerm == null ? "" : searchTerm, utf8);
        return "searchForm=searchForm" +
                "&searchForm%3Aj_idt80=" + encodedTerm +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" +
                "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" +
                // NOTE(review): j_idt287 is presumably the search button's own parameter - confirm against browser traffic.
                "&searchForm%3Aj_idt287=" +
                "&jakarta.faces.ViewState=" + encodedViewState;
    } catch (Exception e) {
        System.err.println("Error encoding search ViewState: " + e.getMessage());
        return "";
    }
}
// Builds the POST data for the pagination request.
/**
 * Assembles the form body that requests one page of the result list.
 *
 * <p>The pagination button is addressed by a zero-based index inside the parameter
 * name, while the submitted value is the one-based page number.
 *
 * @param viewState raw ViewState token; it is URL-encoded before being appended
 * @param page      one-based page number to request
 * @return the form body, or an empty string when the ViewState cannot be encoded
 */
private static String buildPostData(String viewState, int page) {
    final String encodedState;
    try {
        encodedState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name());
    } catch (Exception encodingFailure) {
        System.err.println("Error encoding pagination ViewState: " + encodingFailure.getMessage());
        return "";
    }

    StringBuilder form = new StringBuilder("resultForm=resultForm");
    form.append("&resultForm%3Asorting%3ArowsPerPage=10");
    // NOTE(review): the j_idt* ids are generated by the server and may change - verify against browser traffic.
    form.append("&resultForm%3ApaginationTop%3Aj_idt156%3A")
            .append(page - 1)
            .append("%3Aj_idt158=")
            .append(page);
    form.append("&resultForm%3Asorting%3AsortingBy=SCORE");
    form.append("&resultForm%3Asorting%3Aj_idt141=true");
    form.append("&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation");
    form.append("&selectedType=JSON");
    form.append("&jakarta.faces.ViewState=").append(encodedState);
    return form.toString();
}
}

165
src/main/java/com/example/getInKa.java

@ -0,0 +1,165 @@
package com.example;
import org.apache.kafka.clients.producer.*;
import org.apache.kafka.common.serialization.StringSerializer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import java.io.*;
import java.util.*;
import java.util.concurrent.Future;
/**
 * Crawler that collects article links from the zyctd.com news listing, extracts the
 * title, date and body text of every article not seen in a previous run, and publishes
 * each article to a Kafka topic.
 */
public class getInKa {
    // Shared OkHttp client used for every HTTP request.
    private static final OkHttpClient httpClient = new OkHttpClient();
    // File that records the URLs already handled, one per line.
    private static final String PROCESSED_URLS_FILE = "processed_urls.txt";
    // Browser-like User-Agent sent with every request.
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0";

    /**
     * Entry point: collects the unprocessed article URLs, then extracts and publishes them.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            System.out.println("Starting URL collection...");
            List<String> urls = getUrls();
            System.out.println("Collected " + urls.size() + " URLs.");
            System.out.println("Starting news extraction...");
            getNews(urls);
            System.out.println("News extraction completed.");
        } catch (IOException e) {
            System.out.println("Error in main: " + e.getMessage());
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever runs this thread.
            Thread.currentThread().interrupt();
            System.out.println("Error in main: " + e.getMessage());
        }
    }

    /**
     * Walks listing pages 1 to 28 and returns every article URL that is not yet recorded
     * in {@link #PROCESSED_URLS_FILE}.
     *
     * <p>NOTE(review): the returned URLs are written to the processed file here, before
     * {@link #getNews(List)} has fetched or published them, so a URL whose extraction or
     * Kafka send fails later is not retried on the next run.
     *
     * @return article URLs found in this run that were not processed before
     * @throws IOException          if a listing page or the processed-URL file cannot be read or written
     * @throws InterruptedException if the thread is interrupted during the pause between pages
     */
    public static List<String> getUrls() throws IOException, InterruptedException {
        List<String> urls = new ArrayList<>();
        Set<String> processedUrls = loadProcessedUrls();
        for (int page = 1; page <= 28; page++) {
            String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html";
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", USER_AGENT)
                    .build();
            System.out.println("Fetching page " + page + ": " + url);
            try (Response response = httpClient.newCall(request).execute()) {
                if (response.isSuccessful() && response.body() != null) {
                    System.out.println("Successfully fetched page " + page);
                    Document doc = Jsoup.parse(response.body().string());
                    Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a");
                    List<String> projectIDs = links.eachAttr("href");
                    System.out.println("Found " + projectIDs.size() + " URLs on page " + page);
                    for (String projectUrl : projectIDs) {
                        // Set.add returns false for URLs already known, which also removes
                        // duplicates that appear on more than one listing page.
                        if (processedUrls.add(projectUrl)) {
                            urls.add(projectUrl);
                        }
                    }
                } else {
                    System.out.println("Failed to fetch page " + page + ": Status code " + response.code());
                }
            }
            Thread.sleep(1000); // pause between listing pages
        }
        saveProcessedUrls(processedUrls);
        return urls;
    }

    /**
     * Fetches every URL, extracts the article and hands it to {@link #saveData(Map)}.
     * A failure on one URL is logged and does not stop the remaining URLs.
     *
     * <p>If the thread is interrupted during the pause between articles, the interrupt
     * flag is restored and the method returns without processing the remaining URLs.
     *
     * @param urls article URLs to process
     * @throws IOException declared for callers; per-URL failures are caught and logged
     */
    public static void getNews(List<String> urls) throws IOException {
        for (int i = 0; i < urls.size(); i++) {
            String url = urls.get(i);
            System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url);
            try {
                // The request is built inside the try so that a malformed href only
                // fails this one URL instead of aborting the whole batch.
                fetchAndPublish(url);
            } catch (Exception e) {
                System.out.println("An error occurred while fetching " + url + ": " + e.getMessage());
            }
            try {
                Thread.sleep(5000); // pause 5 seconds between articles
            } catch (InterruptedException e) {
                System.out.println("Sleep interrupted: " + e.getMessage());
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    // Downloads one article page, extracts title/date/content and publishes it when all three are present.
    private static void fetchAndPublish(String url) throws IOException {
        Request request = new Request.Builder()
                .url(url)
                .addHeader("User-Agent", USER_AGENT)
                .build();
        try (Response response = httpClient.newCall(request).execute()) {
            if (!response.isSuccessful() || response.body() == null) {
                System.out.println("Failed to fetch news from " + url + ": Status code " + response.code());
                return;
            }
            System.out.println("Successfully fetched news from " + url);
            Document doc = Jsoup.parse(response.body().string());
            String title = doc.select("div.info-title.t-center > h1").text().trim();
            String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim();
            String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim();
            if (content.isEmpty()) {
                // Fallback selector for pages whose paragraphs are not wrapped in a div.
                content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim();
            }
            if (title.isEmpty() || date.isEmpty() || content.isEmpty()) {
                System.out.println("Failed to extract complete data from " + url);
                return;
            }
            Map<String, String> news = new HashMap<>();
            news.put("title", title);
            news.put("date", date);
            news.put("content", content);
            news.put("url", url);
            System.out.println("Extracted news: " + news.get("title"));
            saveData(news);
        }
    }

    /**
     * Sends one article to the Kafka topic {@code news-topic} on {@code localhost:9092}
     * and waits for the broker acknowledgement. The record key is the article title and
     * the value is {@code news.toString()}. Failures are logged, not rethrown.
     *
     * <p>NOTE(review): a new producer is created and closed for every call; consider
     * sharing one producer if throughput ever matters.
     *
     * @param news article fields; the {@code "title"} entry is used as record key
     */
    public static void saveData(Map<String, String> news) {
        Properties properties = new Properties();
        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        try (Producer<String, String> producer = new KafkaProducer<>(properties)) {
            String topic = "news-topic";
            String key = news.get("title");
            String value = news.toString();
            ProducerRecord<String, String> record = new ProducerRecord<>(topic, key, value);
            producer.send(record, (metadata, exception) -> {
                if (exception == null) {
                    System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic() +
                            ", partition=" + metadata.partition() + ", offset=" + metadata.offset());
                } else {
                    System.err.println("Failed to send data to Kafka: " + exception.getMessage());
                }
            }).get(); // block until the send is acknowledged or fails
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            System.err.println("Error while sending data to Kafka: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("Error while sending data to Kafka: " + e.getMessage());
        }
    }

    // Loads the URLs recorded by earlier runs; returns an empty set when the file does not exist.
    private static Set<String> loadProcessedUrls() throws IOException {
        Set<String> processedUrls = new HashSet<>();
        File file = new File(PROCESSED_URLS_FILE);
        if (file.exists()) {
            // Read with an explicit charset so the file does not depend on the platform default.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String trimmed = line.trim();
                    if (!trimmed.isEmpty()) { // ignore blank lines instead of storing "" as a URL
                        processedUrls.add(trimmed);
                    }
                }
            }
        }
        return processedUrls;
    }

    // Rewrites the processed-URL file with the given set, one URL per line.
    private static void saveProcessedUrls(Set<String> processedUrls) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(PROCESSED_URLS_FILE), java.nio.charset.StandardCharsets.UTF_8))) {
            for (String url : processedUrls) {
                writer.write(url);
                writer.newLine();
            }
        }
    }
}

47
src/main/java/com/example/jsonGetOk.java

@ -0,0 +1,47 @@
package com.example;
import okhttp3.*;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Scratch program that downloads one page-detail JSON document from the dsscu.gov.mo API
 * and prints the extracted fields as a map.
 */
public class jsonGetOk {
    // API endpoint of the page whose details are fetched.
    private static final String PAGE_DETAIL_URL =
            "https://www.dsscu.gov.mo/api/common/page_detail?PostType=page&EntityId=6654829e-8163-b801-0096-c02e09d690d1";

    /**
     * Fetches the document, parses it and prints {@code postTime}, {@code title},
     * {@code content}, {@code forwardcontent} and {@code fileList}.
     *
     * @param args unused
     * @throws IOException if the request fails, returns a non-2xx status or has no body
     */
    public static void main(String[] args) throws IOException {
        OkHttpClient client = new OkHttpClient().newBuilder()
                .build();
        Request request = new Request.Builder()
                .url(PAGE_DETAIL_URL)
                .get()
                .build();

        String responseBody;
        // The response holds a network connection and must be closed once the body is read.
        try (Response response = client.newCall(request).execute()) {
            ResponseBody payload = response.body();
            if (!response.isSuccessful() || payload == null) {
                throw new IOException("Unexpected response " + response.code() + " from " + PAGE_DETAIL_URL);
            }
            responseBody = payload.string();
        }

        // Parse the JSON payload.
        JSONObject jsonObject = new JSONObject(responseBody);
        JSONObject data = jsonObject.getJSONObject("data");
        String postTime = data.getString("onlineAt");
        JSONObject metas = data.getJSONObject("metas");
        String title = metas.getString("name");
        String summary = metas.getString("summary");
        // The summary is HTML; keep only its text.
        String content = Jsoup.parse(summary).text();
        String forwardcontent = responseBody;
        // File entries are stored as "<value>###<extension>".
        String fileList = metas.getString("biddersFile") + "###" + "pdf";

        Map<String, Object> map = new HashMap<>();
        map.put("postTime", postTime);
        map.put("title", title);
        map.put("content", content);
        map.put("forwardcontent", forwardcontent);
        map.put("fileList", fileList);
        System.out.println(map);
    }
}

256
src/main/java/com/example/ook.java

@ -0,0 +1,256 @@
package com.example;
import okhttp3.*;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Scratch program that downloads the WRAIR press-release listing through a local HTTP
 * proxy and prints the link of every listed item.
 */
public class ook {
    // Output format shared by all timestamp conversions; time of day is always 00:00:00.
    private static final DateTimeFormatter OUTPUT_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
    // Accepted input formats once the abbreviation dot has been removed.
    private static final DateTimeFormatter[] INPUT_FORMATS = {
            DateTimeFormatter.ofPattern("MMM d, yyyy", Locale.ENGLISH),
            DateTimeFormatter.ofPattern("MMMM d, yyyy", Locale.ENGLISH)
    };

    /**
     * Fetches the listing page via the proxy at 127.0.0.1:7897 and prints the
     * {@code href} of every {@code .title a} element.
     *
     * @param args unused
     * @throws Exception if the request fails, returns a non-2xx status or has no body
     */
    public static void main(String[] args) throws Exception {
        OkHttpClient client = new OkHttpClient().newBuilder()
                .connectTimeout(30, TimeUnit.SECONDS)
                .readTimeout(30, TimeUnit.SECONDS)
                .writeTimeout(30, TimeUnit.SECONDS)
                .proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897))) // local proxy on port 7897
                .build();
        // Browser-like headers. Accept-Encoding is left unset so that OkHttp
        // negotiates and decompresses the response itself.
        Request request = new Request.Builder()
                .url("https://wrair.health.mil/News-Media/Press-Releases/")
                .get()
                .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
                .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
                .addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8")
                .addHeader("Cache-Control", "no-cache")
                .addHeader("Pragma", "no-cache")
                .addHeader("Referer", "https://wrair.health.mil/News-Media/Press-Releases/")
                // NOTE(review): hard-coded cookies captured from a browser session; they presumably expire - confirm they are still needed.
                .addHeader("Cookie", "_ga=GA1.1.516170455.1740971326; .ASPXANONYMOUS=xUBztj4Ek1vHfBPe-1QqFJhd83I4bkB1k0_d-2QrQ7drfd7R7Y6eNsyyHVjSeffyIKzy_qm5tOKOCtbvst-s9ZGWThxifCGMdJE117EQlr1OZARa0; dnn_IsMobile=False; language=en-US; ARRAffinity=c30f7cdebcf208f7c5a996cb410451c36532afc64703669607f68f04a75f4b39; _ga_CSLL4ZEK4L=GS1.1.1742349582.4.1.1742350035.0.0.0")
                .addHeader("Upgrade-Insecure-Requests", "1")
                .addHeader("Sec-Fetch-Dest", "document")
                .addHeader("Sec-Fetch-Mode", "navigate")
                .addHeader("Sec-Fetch-Site", "same-origin")
                .addHeader("Sec-Fetch-User", "?1")
                .addHeader("Sec-Ch-Ua", "\"Chromium\";v=\"134\", \"Not:A-Brand\";v=\"24\", \"Google Chrome\";v=\"134\"")
                .addHeader("Sec-Ch-Ua-Mobile", "?0")
                .addHeader("Sec-Ch-Ua-Platform", "\"Windows\"")
                .addHeader("Priority", "u=0, i")
                .build();

        String html;
        // Close the response so the pooled connection is released.
        try (Response response = client.newCall(request).execute()) {
            ResponseBody payload = response.body();
            if (!response.isSuccessful() || payload == null) {
                throw new IOException("Unexpected response " + response.code() + " from " + request.url());
            }
            html = payload.string();
        }

        Document parse = Jsoup.parse(html);
        Elements elements = parse.select(".title a");
        for (Element element : elements) {
            String link = element.attr("href");
            System.out.println(link);
        }
    }

    /**
     * Creates an instance; the class only exposes static members.
     *
     * @throws IOException never thrown by the current implementation
     */
    public ook() throws IOException {
    }

    /**
     * Converts an English date such as {@code "Jan. 9, 2025"} to
     * {@code "yyyy-MM-dd HH:mm:ss"} with the time set to midnight.
     *
     * <p>Besides the abbreviated-with-dot form, months written without a dot
     * ({@code "May 9, 2025"}), in full ({@code "March 5, 2025"}) and the four-letter
     * abbreviation {@code "Sept."} are accepted.
     *
     * @param dateStr date text; surrounding whitespace is ignored
     * @return the formatted timestamp, or {@code null} when the text is {@code null} or
     *         matches none of the supported formats
     */
    public static String convertToTimestamp(String dateStr) {
        if (dateStr == null) {
            return null;
        }
        // Drop the abbreviation dot and map "Sept" to the "Sep" form the formatter knows.
        String normalized = dateStr.trim().replace(".", "").replaceFirst("^Sept\\b", "Sep");
        for (DateTimeFormatter inputFormat : INPUT_FORMATS) {
            try {
                LocalDate date = LocalDate.parse(normalized, inputFormat);
                return date.atStartOfDay().format(OUTPUT_FORMAT);
            } catch (DateTimeParseException ignored) {
                // Not this format; try the next one.
            }
        }
        System.err.println("Unparseable date: " + dateStr);
        return null;
    }

    /**
     * Requests the local service at {@code http://127.0.0.1:7897} and returns its body.
     *
     * <p>NOTE(review): presumably the body is JSON with an {@code "http"} entry holding a
     * proxy address such as {@code "http://host:port"} - confirm against the service.
     *
     * @return the response body as a string
     * @throws Exception if the request fails, returns a non-2xx status or has no body
     */
    private static String getProxyFromLocalService() throws Exception {
        OkHttpClient client = new OkHttpClient();
        Request request = new Request.Builder()
                .url("http://127.0.0.1:7897")
                .get()
                .build();
        try (Response response = client.newCall(request).execute()) {
            ResponseBody payload = response.body();
            if (response.isSuccessful() && payload != null) {
                return payload.string();
            }
            throw new IOException("获取代理失败,状态码: " + response.code());
        }
    }
}

524
src/main/java/com/example/oook.java

@ -0,0 +1,524 @@
package com.example;
import okhttp3.*;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.*;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class oook {
/**
 * Downloads one Iran International article and prints the extracted fields
 * ({@code postTime}, {@code title}, {@code content}, {@code forwardcontent},
 * {@code imgList}) as a map.
 *
 * @param args unused
 * @throws Exception if the request fails, returns a non-2xx status or has no body
 */
public static void main(String[] args) throws Exception {
    OkHttpClient client = new OkHttpClient().newBuilder()
            .connectTimeout(30, TimeUnit.SECONDS)
            .readTimeout(30, TimeUnit.SECONDS)
            .writeTimeout(30, TimeUnit.SECONDS)
            .build();
    String url = "https://www.iranintl.com/en/202504116060";
    Request request = new Request.Builder()
            .url(url)
            .get()
            .build();

    String html;
    // Close the response so the pooled connection is released.
    try (Response response = client.newCall(request).execute()) {
        ResponseBody payload = response.body();
        if (!response.isSuccessful() || payload == null) {
            throw new IOException("Unexpected response " + response.code() + " from " + url);
        }
        html = payload.string();
    }

    // Parse with the page URL as base URI so relative links can be resolved below.
    Document parse = Jsoup.parse(html, url);

    String postTime = convertIsoToTimestamp(parse.select(".WrittenContentBlock-module__9pvVhW__timeAgo time").attr("datetime"));
    String title = parse.select(".WrittenContentBlock-module__9pvVhW__headline").text();
    String content = parse.select(".WrittenContentBlock-module__9pvVhW__body p").text();
    String forwardcontent = parse.select(".page").html();

    List<String> imgList = new ArrayList<>();
    for (Element img : parse.select(".page img")) {
        // absUrl resolves relative, protocol-relative and http:// sources against the
        // base URI and yields "" when there is no usable src.
        String fullUrl = img.absUrl("src");
        if (fullUrl.isEmpty()) {
            continue;
        }
        // Image entries are stored as "<url>###<extension>".
        imgList.add(fullUrl + "###" + "avif");
    }

    Map<String, Object> map = new HashMap<>();
    map.put("postTime", postTime);
    map.put("title", title);
    map.put("content", content);
    map.put("forwardcontent", forwardcontent);
    map.put("imgList", imgList);
    System.out.println(map);
}
// No-argument constructor with an empty body; the only thing it adds is the declared IOException.
// NOTE(review): presumably needed because an instance field initializer of this class can throw
// IOException - confirm against the field declarations, which are outside this section.
public oook() throws IOException {
}
// public static String convertToTimestamp(String dateStr) {
// try {
// // 定义输入格式dd MMMM , yyyy例如 "28 February , 2025"
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM dd, yyyy", Locale.ENGLISH);
// // 定义输出格式yyyy-MM-dd HH:mm:ss
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
// dateStr = dateStr.replace("|", "").trim();
// // 解析输入日期
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
// // 转换为带时间的格式时间设为 00:00:00
// return date.atStartOfDay().format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null; // 或抛出异常根据需求调整
// }
// }
// public static String convertToTimestamp(String dateStr) {
// try {
// // 去掉 "Publié le" 前缀并清理多余字符
// dateStr = dateStr.replace("Publié le", "").trim();
//
// // 定义输入格式dd MMMM yyyy例如 "25 mars 2025"
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd MMMM yyyy", Locale.FRENCH);
//
// // 定义输出格式yyyy-MM-dd HH:mm:ss
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
//
// // 解析输入日期
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
//
// // 转换为带时间的格式时间设为 00:00:00
// return date.atStartOfDay().format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null; // 或抛出异常根据需求调整
// }
// }
// public static String convertToTimestamp(String dateStr) {
// try {
// // 定义输入格式yyyy MM dd
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MM-dd-yyyy", Locale.CHINESE);
// // 定义输出格式yyyy-MM-dd HH:mm:ss
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
//
// // 解析输入日期
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
// // 转换为带时间的格式时间设为 00:00:00
// return date.atStartOfDay().format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null; // 或抛出异常根据需求调整
// }
// }
// public static String convertToTimestamp(String dateStr) {
// try {
// // 定义输入格式
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy");
// // 定义输出格式
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
//
// // 解析输入字符串为 LocalDate
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
// // 转换为 LocalDateTime设置时间为 00:00:00
// LocalDateTime dateTime = date.atStartOfDay();
// // 格式化为目标字符串
// return dateTime.format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null; // 或者抛出异常根据需求调整
// }
// }
// public static String convertToTimestamp(String dateStr) {
// try {
// // 定义输入格式MMMM d, yyyy例如 "June 3, 2015"
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM, yyyy", Locale.ENGLISH);
// // 定义输出格式yyyy-MM-dd HH:mm:ss
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
//
// // 解析输入日期
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
// // 转换为带时间的格式时间设为 00:00:00
// return date.atStartOfDay().format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null; // 或抛出异常根据需求调整
// }
// }
// public static String convertToTimestamp(String input) {
// try {
// // 正则匹配 "d MMMM yyyy"
// Pattern pattern = Pattern.compile("\\d{1,2} [A-Za-z]+ \\d{4}");
// Matcher matcher = pattern.matcher(input);
// if (matcher.find()) {
// String dateStr = matcher.group();
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM yyyy", Locale.ENGLISH);
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
// return date.atStartOfDay().format(outputFormatter);
// } else {
// System.out.println("No date found in: " + input);
// return null;
// }
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// }
// public static String convertToTimestamp(String dateStr) {
// try {
// // Parse the ISO 8601 date string (e.g., "2025-03-17T12:37:33.033Z")
// ZonedDateTime zdt = ZonedDateTime.parse(dateStr, DateTimeFormatter.ISO_DATE_TIME);
//
// // Define the output format (yyyy-MM-dd hh:mm:ss)
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
//
// // Format the date to the desired output
// return zdt.format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null; // Or throw an exception, depending on your needs
// }
// }
// public static String convertToTimestamp(String dateStr) {
// try {
// // Parse "Jan. 9, 2025" (abbreviated month, dot, comma-separated)
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMM d, yyyy", Locale.ENGLISH);
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
//
// // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00)
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
// return date.atStartOfDay().format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// }
// public static String convertToTimestamp(String dateStr) {
// try {
// // 从文本中提取修改日期
// String modifiedDateStr = extractModifiedDate(dateStr);
// if (modifiedDateStr == null) {
// throw new IllegalArgumentException("无法找到修改日期");
// }
//
// // Parse "20/12/2024" (day/month/year format, Italian style)
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy", Locale.ITALIAN);
// LocalDate date = LocalDate.parse(modifiedDateStr, inputFormatter);
//
// // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00)
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
// return date.atStartOfDay().format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// }
/**
 * Converts an ISO-8601 timestamp into a {@code yyyy-MM-dd HH:mm:ss} string expressed in UTC.
 *
 * <p>Both the instant form ({@code 2025-03-17T12:37:33.033Z}) and the offset form
 * ({@code 2025-03-17T12:37:33+02:00}) are accepted; the result is always normalised to UTC,
 * never to the JVM's default time zone.
 *
 * @param dateStr the ISO-8601 text, typically the {@code datetime} attribute of a {@code <time>} element
 * @return the formatted UTC timestamp, or {@code null} when the input is null, blank or unparseable
 */
public static String convertIsoToTimestamp(String dateStr) {
    // A missing attribute arrives as null or "": there is nothing to parse, so skip quietly
    // instead of dumping a stack trace for every page without a date.
    if (dateStr == null || dateStr.trim().isEmpty()) {
        return null;
    }
    String text = dateStr.trim();
    try {
        Instant instant;
        try {
            instant = Instant.parse(text);
        } catch (java.time.format.DateTimeParseException notInstantForm) {
            // Older JDKs reject explicit offsets in Instant.parse; OffsetDateTime handles them.
            instant = java.time.OffsetDateTime.parse(text).toInstant();
        }
        LocalDateTime utcDateTime = LocalDateTime.ofInstant(instant, ZoneOffset.UTC);
        return utcDateTime.format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
    } catch (java.time.DateTimeException e) {
        System.err.println("Unparseable ISO-8601 timestamp '" + dateStr + "': " + e.getMessage());
        return null;
    }
}
// public static String convertToTimestamp(String dateStr) {
// try {
// // 创建捷克语的日期格式器解析 "27. listopadu 2024"
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d. MMMM yyyy", new Locale("cs", "CZ"));
// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
//
// // 转换为 "yyyy-MM-dd HH:mm:ss" 格式默认时间为 00:00:00
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
// return date.atStartOfDay().format(outputFormatter);
// } catch (Exception e) {
// e.printStackTrace();
// return null;
// }
// }
/**
 * Looks for a modification date in a block of text.
 *
 * <p>Only lines containing the word "Modificato" are inspected; within such a line the first
 * whitespace-separated token shaped like {@code dd/MM/yyyy} is returned.
 *
 * @param text the text to scan, lines separated by {@code \n}
 * @return the matching token (for example "20/12/2024"), or {@code null} when none is found
 */
private static String extractModifiedDate(String text) {
    for (String row : text.split("\n")) {
        if (!row.contains("Modificato")) {
            continue;
        }
        String[] tokens = row.split("\\s+");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens[i].matches("\\d{2}/\\d{2}/\\d{4}")) {
                return tokens[i];
            }
        }
    }
    return null;
}
/**
 * Sends a GET request to the local proxy service at {@code http://127.0.0.1:7897} and returns
 * the response body as text.
 *
 * @return the raw response body (the original author's note describes it as a JSON string)
 * @throws Exception when the service answers with a non-successful status code, or when the
 *                   HTTP call itself fails
 */
private static String getProxyFromLocalService() throws Exception {
    Request proxyRequest = new Request.Builder()
            .url("http://127.0.0.1:7897")
            .get()
            .build();
    OkHttpClient httpClient = new OkHttpClient();
    try (Response reply = httpClient.newCall(proxyRequest).execute()) {
        if (!reply.isSuccessful()) {
            throw new Exception("获取代理失败,状态码: " + reply.code());
        }
        return reply.body().string();
    }
}
/**
 * Builds the URL of the next listing page by incrementing the Liferay asset-publisher
 * {@code ..._cur=<n>} parameter found in the given URL.
 *
 * @param currentUrl the URL of the page currently being processed
 * @return the same URL with the first {@code cur} value increased by one, or {@code null} when
 *         the URL is null/blank, carries no {@code cur} parameter, or the page number does not
 *         fit in an {@code int}
 */
public static String getNextPageUrl(String currentUrl) {
    if (currentUrl == null || currentUrl.trim().isEmpty()) {
        return null;
    }
    Matcher pageMatcher = Pattern
            .compile("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=(\\d+)")
            .matcher(currentUrl);
    if (!pageMatcher.find()) {
        return null;
    }
    final int page;
    try {
        // group(1) is the digit run captured after "cur="
        page = Integer.parseInt(pageMatcher.group(1));
    } catch (NumberFormatException tooLarge) {
        return null;
    }
    int following = page + 1;
    return pageMatcher.replaceFirst("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=" + following);
}
/**
 * Derives the URL of the first listing page of the previous year.
 *
 * <p>The Liferay asset-publisher {@code ..._year=<yyyy>} parameter, when present, is decreased
 * by one. The {@code ..._cur=<n>} page parameter is then reset to 1; when the URL has no such
 * parameter it is appended as a proper query parameter (with {@code ?} or {@code &} as needed).
 *
 * @param url the URL of the page currently being processed
 * @return the rewritten URL, or {@code null} when {@code url} is null or blank
 */
public static String getPreviousYearUrl(String url) {
    if (url == null || url.trim().isEmpty()) {
        return null;
    }
    final String yearParam = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_year=";
    final String curParam = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=";

    String result = url;
    Matcher yearMatcher = Pattern.compile(yearParam + "(\\d{4})").matcher(result);
    if (yearMatcher.find()) {
        // Exactly four digits were captured, so parseInt cannot overflow here.
        int previousYear = Integer.parseInt(yearMatcher.group(1)) - 1;
        result = yearMatcher.replaceFirst(yearParam + previousYear);
    }

    Matcher pageMatcher = Pattern.compile(curParam + "(\\d+)").matcher(result);
    if (pageMatcher.find()) {
        return pageMatcher.replaceFirst(curParam + "1");
    }

    // No page parameter yet: add one. Gluing it straight onto the URL would corrupt the last
    // parameter value, so choose the separator from what the URL already ends with/contains.
    String separator;
    if (result.endsWith("?") || result.endsWith("&")) {
        separator = "";
    } else if (result.indexOf('?') >= 0) {
        separator = "&";
    } else {
        separator = "?";
    }
    return result + separator + curParam + "1";
}
}

403
src/main/java/com/example/projTopic.java

@ -0,0 +1,403 @@
package com.example;
import com.fasterxml.jackson.databind.ObjectMapper;
import okhttp3.*;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class projTopic {
// Kafka topic every crawled award record is published to.
private static final String TOPIC_NAME = "projTopic";
// Kafka bootstrap server address used by the shared producer.
private static final String BOOTSTRAP_SERVERS = "node-01:19092";
// Single producer shared by all worker tasks; created in the static initializer below.
private static KafkaProducer<String, String> producer;
// Serializes the result maps to JSON before they are sent to Kafka.
private static ObjectMapper objectMapper = new ObjectMapper();
// Source of the random jitter added to the per-request sleep time.
private static final Random random = new Random();
private static List<String> proxyList = new ArrayList<>(); // proxy pool, one entry per line of proxy.txt
private static int currentProxyIndex = 0; // index of the proxy currently in use
// NOTE(review): currentProxyIndex is written inside the synchronized switchProxy() but read
// without synchronization in createClientWithProxy() - confirm whether that is acceptable
// given that main() runs several worker threads.
// Class-load-time setup: builds the Kafka producer and loads the proxy pool from proxy.txt.
static {
Properties props = new Properties();
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
props.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all replicas to acknowledge
props.put(ProducerConfig.RETRIES_CONFIG, 3); // number of send retries
producer = new KafkaProducer<>(props);
try {
// A missing or unreadable proxy.txt is not fatal: the error is logged and the pool stays empty.
proxyList = Files.readAllLines(Paths.get("proxy.txt"));
if (proxyList.isEmpty()) {
System.out.println("警告: proxy.txt 为空,未加载任何代理");
} else {
System.out.println("成功加载 " + proxyList.size() + " 个代理");
}
} catch (IOException e) {
System.err.println("读取 proxy.txt 失败: " + e.getMessage());
}
}
/**
 * Entry point of the NSF award crawler.
 *
 * <p>Reads search keywords from keywords.txt, then processes them on a pool of 4 threads. For
 * each keyword the task repeatedly POSTs a GWT-RPC search payload to the NSF award search
 * endpoint, extracts award numbers from the response text with regular expressions, downloads
 * every award detail page, scrapes its fields with Jsoup selectors and publishes the result as
 * JSON to the Kafka topic, keyed by the award's project number. After each result page the
 * payload offset is advanced by 30. A keyword ends when a response yields no award numbers, or
 * when the first award URL is the hard-coded AWD_ID=2446604.
 *
 * <p>The method waits up to 5 hours for the pool to finish and then closes the Kafka producer.
 *
 * @param args unused
 * @throws IOException          if keywords.txt cannot be read
 * @throws InterruptedException if the wait for pool termination is interrupted
 */
public static void main(String[] args) throws IOException, InterruptedException {
List<String> keywords = Files.readAllLines(Paths.get("keywords.txt"));
List<String> cleanedKeywords = new ArrayList<>();
for (String keyword : keywords) {
String cleaned = keyword.split(",")[0].trim(); // keep the part before the first comma and trim it
cleaned = cleaned.replaceAll("\\s+", "+"); // replace every whitespace run with "+"
cleanedKeywords.add(cleaned);
}
ExecutorService executor = Executors.newFixedThreadPool(4); // 4 worker threads
for (String keyword : cleanedKeywords) {
executor.submit(() -> {
try {
// Pause used between requests: 30000-31000 ms, chosen once per keyword.
int sleepTime = random.nextInt(1001) + 30000;
// GWT-RPC request payload with the keyword spliced in as the QueryText value.
String load = "5|0|20|https://www.nsf.gov/awardsearch/jsp/gwt/search/|57BE5CA45E781DC0159F727F8A8205EB|gov.nsf.research.awardsearch.gwt.client.SearchAwardService|getAwards|gov.nsf.research.awardsearch.gwt.bean.SearchRequestBean/3930579236|com.extjs.gxt.ui.client.data.PagingLoadConfig|java.util.HashMap/962170901|java.lang.String/2004016611|QueryText|" + keyword + "|ActiveAwards|true|com.extjs.gxt.ui.client.data.BasePagingLoadConfig/2011366567|com.extjs.gxt.ui.client.data.RpcMap/3441186752|sortField|sortDir|com.extjs.gxt.ui.client.Style$SortDir/640452531|offset|java.lang.Integer/3438268394|limit|1|2|3|4|2|5|6|5|7|2|8|9|8|10|8|11|8|12|13|0|1|14|4|15|0|16|17|0|18|19|0|20|19|30|";
// One iteration per result page; the counter i is not read, the loop ends via break/return.
for(int i=0;;i++){
OkHttpClient client = createClientWithProxy();
MediaType mediaType = MediaType.parse("text/x-gwt-rpc; charset=UTF-8");
RequestBody body = RequestBody.create(mediaType, load);
Request request = new Request.Builder()
.url("https://www.nsf.gov/awardsearch/jsp/gwt/search/.searchaward")
.method("POST", body)
.addHeader("Content-Type", "text/x-gwt-rpc; charset=UTF-8")
.addHeader("X-GWT-Module-Base", "https://www.nsf.gov/awardsearch/jsp/gwt/search/")
.addHeader("X-GWT-Permutation", "368C3CF86AA4CD7DB2250B35B844C1C2")
// .addHeader("cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB")
.build();
Response response = executeWithRetry(client, request, keyword);
String content = response.body().string();
Pattern pattern = Pattern.compile("\"awdNumber\",\"(\\d+)\"");
Matcher matcher = pattern.matcher(content);
List<String> numbers = new ArrayList<>(); // award numbers found by the first pattern
// Collects the award numbers found by both patterns
List<String> additionalNumbers = new ArrayList<>();
List<String> urls = new ArrayList<>();
// Scan for "awdNumber","<digits>" pairs
while (matcher.find()) {
// store each captured number
numbers.add(matcher.group(1));
}
// Carry the captured numbers over, or report that none were found
if (numbers.isEmpty()) {
System.out.println("没找到awdNumber,继续下一种查找");
} else {
for (String number : numbers) {
additionalNumbers.add(number);
}
}
// Second pattern: a quoted value, one or two quoted dates/decimals, then a quoted digit run.
// NOTE(review): numbers matched by both patterns would be added twice - confirm whether
// duplicates can occur in real responses.
Pattern additionalPattern = Pattern.compile("\"[^\"]+\",\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\"(?:,\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\")?,\"(\\d+)\"");
Matcher additionalMatcher = additionalPattern.matcher(content);
while (additionalMatcher.find()) {
additionalNumbers.add(additionalMatcher.group(1));
}
if (additionalNumbers.isEmpty()) {
// Nothing found on this page: treat it as the end of the results for this keyword.
System.out.println("没找到下一页内容链接");
Thread.sleep(sleepTime);
break;
} else {
for (String number : additionalNumbers) {
String url = "https://www.nsf.gov/awardsearch/showAward?AWD_ID=" + number + "&HistoricalAwards=false";
urls.add(url);
}
}
// Hard-coded stop condition on one specific award id.
// NOTE(review): presumably this award marks a fallback/default result page - confirm.
if (!urls.isEmpty() && urls.get(0).equals("https://www.nsf.gov/awardsearch/showAward?AWD_ID=2446604&HistoricalAwards=false")) {
System.out.println("第一个 URL 是 AWD_ID=2446604,跳过关键词: " + keyword);
Thread.sleep(sleepTime);
return; // leave this task; the pool moves on to the next keyword
}
for(String url:urls){
OkHttpClient client2 = createClientWithProxy();
// NOTE(review): mediaType2 and body2 are never used - the request below is a plain GET.
MediaType mediaType2 = MediaType.parse("text/plain");
RequestBody body2 = RequestBody.create(mediaType2, "");
Request request2 = new Request.Builder()
.url(url)
.get()
// .addHeader("Cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB")
.build();
Response response2 = executeWithRetry(client2, request2, keyword);
System.out.println(response2.code());
String html = response2.body().string();
Document parse = Jsoup.parse(html);
// Each field is read from a fixed row of the award detail table (positional selectors).
String title = parse.select(".pageheadline").text();
String projectNum = parse.select(".clear tr:nth-child(5) .tabletext2:nth-child(2)").text();
String projectLeader = parse.select(".clear tr:nth-child(13) .tabletext2:nth-child(2)").text();
String projectStartTime = convertToTimestamp(parse.select(".clear tr:nth-child(8) .tabletext2:nth-child(2)").text());
String projectEndTime = convertToTimestamp2(parse.select(".clear tr:nth-child(9) .tabletext2:nth-child(2)").text());
String sponsorPart = parse.select(".clear tr:nth-child(2) .tabletext2:nth-child(2)").text();
String country = "USA";
String brief = parse.select(".clear.margintop25 span").text();
String sponsor = parse.select(".clear tr:nth-child(1) .tabletext2:nth-child(2)").text();
String projectFunding = parse.select(".clear tr:nth-child(12) .tabletext2:nth-child(2)").text();
String relatedProject = parse.select(".clear tr:nth-child(20) .tabletext2:nth-child(2)").text();
String awardInstrument = parse.select(".clear tr:nth-child(6) .tabletext2:nth-child(2)").text();
String programManager = parse.select(".clear tr:nth-child(7) .tabletext2:nth-child(2)").text();
String totalIntendedAwardAmount = parse.select(".clear tr:nth-child(10) .tabletext2:nth-child(2)").text();
String totalAwardedAmountToDate = parse.select(".clear tr:nth-child(11) .tabletext2:nth-child(2)").text();
String recipientSponsoredResearchOffice = parse.select(".clear tr:nth-child(14) .tabletext2:nth-child(2)").text();
String sponsorCongressionalDistrict = parse.select(".clear tr:nth-child(15) .tabletext2:nth-child(2)").text();
String primaryPlaceOfPerformance = parse.select(".clear tr:nth-child(16) .tabletext2:nth-child(2)").text();
String primaryPlaceOfPerformanceCongressionalDistrict = parse.select(".clear tr:nth-child(17) .tabletext2:nth-child(2)").text();
String uniqueEntityIdentifier = parse.select(".clear tr:nth-child(18) .tabletext2:nth-child(2)").text();
String parentUEI = parse.select(".clear tr:nth-child(19) .tabletext2:nth-child(2)").text();
String primaryProgramSource = parse.select(".clear tr:nth-child(21) .tabletext2:nth-child(2)").text();
String programReferenceCode = parse.select(".clear tr:nth-child(22) .tabletext2:nth-child(2)").text();
String programElementCode = parse.select(".clear tr:nth-child(23) .tabletext2:nth-child(2)").text();
String awardAgencyCode = parse.select(".clear tr:nth-child(24) .tabletext2:nth-child(2)").text();
String fundAgencyCode = parse.select(".clear tr:nth-child(25) .tabletext2:nth-child(2)").text();
String assistanceListingNumber = parse.select(".clear tr:nth-child(26) .tabletext2:nth-child(2)").text();
String initialAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(3) .tabletext2:nth-child(2)").text());
String latestAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(4) .tabletext2:nth-child(2)").text());
List<Map<String, Object>> citations = extractAllCitationInfo(html);
Map<String,Object> data = new HashMap<>();
data.put("title",title);
data.put("projectNum",projectNum);
data.put("projectLeader",projectLeader);
data.put("projectStartTime",projectStartTime);
data.put("projectEndTime",projectEndTime);
data.put("sponsorPart",sponsorPart);
data.put("country",country);
data.put("brief",brief);
data.put("sponsor",sponsor);
data.put("projectFunding",projectFunding);
data.put("relatedProject",relatedProject);
data.put("awardInstrument",awardInstrument);
data.put("programManager",programManager);
data.put("totalIntendedAwardAmount",totalIntendedAwardAmount);
data.put("totalAwardedAmountToDate",totalAwardedAmountToDate);
data.put("recipientSponsoredResearchOffice",recipientSponsoredResearchOffice);
data.put("sponsorCongressionalDistrict",sponsorCongressionalDistrict);
data.put("primaryPlaceOfPerformance",primaryPlaceOfPerformance);
data.put("primaryPlaceOfPerformanceCongressionalDistrict",primaryPlaceOfPerformanceCongressionalDistrict);
data.put("uniqueEntityIdentifier",uniqueEntityIdentifier);
data.put("parentUEI",parentUEI);
data.put("primaryProgramSource",primaryProgramSource);
data.put("programReferenceCode",programReferenceCode);
data.put("programElementCode",programElementCode);
data.put("awardAgencyCode",awardAgencyCode);
data.put("fundAgencyCode",fundAgencyCode);
data.put("assistanceListingNumber",assistanceListingNumber);
data.put("publications",citations);
data.put("initialAmendmentDate",initialAmendmentDate);
data.put("latestAmendmentDate",latestAmendmentDate);
data.put("crawlUrl",url);
data.put("crawlTime",localDateTime());
// Envelope: the search keyword plus the scraped record.
Map<String,Object> result = new HashMap<>();
result.put("keyword",keyword);
result.put("data",data);
try {
String jsonValue = objectMapper.writeValueAsString(result);
// The record is keyed by the scraped project number.
ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, projectNum, jsonValue);
// Asynchronous send; the callback only logs the outcome.
producer.send(record, (metadata, exception) -> {
if (exception == null) {
System.out.println("成功发送到Kafka - Partition: " + metadata.partition() +
", Offset: " + metadata.offset());
} else {
System.err.println("发送到Kafka失败: " + exception.getMessage());
}
});
} catch (Exception e) {
// A serialization/send failure is logged and the crawl continues with the next award.
System.err.println("序列化或发送Kafka消息失败: " + e.getMessage());
}
Thread.sleep(sleepTime);
}
// Advance the payload to the next page of results.
load = increaseOffsetBy30(load);
}
} catch (Exception e) {
// Any failure (including an interrupt during sleep) ends this keyword's task only.
System.err.println("处理 " + keyword + " 失败: " + e.getMessage());
e.printStackTrace();
}
});
}
executor.shutdown();
executor.awaitTermination(5, TimeUnit.HOURS);
producer.close();
}
/**
 * Converts an English long-form date such as {@code "January 9, 2025"} into
 * {@code "2025-01-09 00:00:00"}.
 *
 * <p>The month must be spelled out in full; abbreviated forms like {@code "Jan. 9, 2025"} are
 * not accepted by the {@code MMMM d, yyyy} pattern. Surrounding whitespace is ignored.
 *
 * @param dateStr the date text scraped from the page; may be null or blank when the field is missing
 * @return the date at start of day formatted as {@code yyyy-MM-dd HH:mm:ss}, or {@code null}
 *         when the input is null, blank or cannot be parsed
 */
public static String convertToTimestamp(String dateStr) {
    // A missing table cell yields "": that is an expected case, not worth a stack trace.
    if (dateStr == null || dateStr.trim().isEmpty()) {
        return null;
    }
    try {
        DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM d, yyyy", Locale.ENGLISH);
        LocalDate date = LocalDate.parse(dateStr.trim(), inputFormatter);
        // The source carries no time of day, so default to 00:00:00.
        DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        return date.atStartOfDay().format(outputFormatter);
    } catch (java.time.DateTimeException e) {
        System.err.println("Unparseable date '" + dateStr + "': " + e.getMessage());
        return null;
    }
}
/**
 * Converts an English long-form date that may carry an {@code " (Estimated)"} suffix, e.g.
 * {@code "June 30, 2025 (Estimated)"}, into {@code "2025-06-30 00:00:00"}.
 *
 * @param dateStr the date text scraped from the page; may be null or blank when the field is missing
 * @return the date at start of day formatted as {@code yyyy-MM-dd HH:mm:ss}, or {@code null}
 *         when the input is null, blank or cannot be parsed
 */
public static String convertToTimestamp2(String dateStr) {
    if (dateStr == null) {
        return null;
    }
    // Drop the "(Estimated)" marker before parsing.
    String cleanDateStr = dateStr.replace(" (Estimated)", "").trim();
    if (cleanDateStr.isEmpty()) {
        // A missing table cell is an expected case, not worth a stack trace.
        return null;
    }
    try {
        DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM d, yyyy", Locale.ENGLISH);
        LocalDate date = LocalDate.parse(cleanDateStr, inputFormatter);
        // The source carries no time of day, so default to 00:00:00.
        DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
        return date.atStartOfDay().format(outputFormatter);
    } catch (java.time.DateTimeException e) {
        System.err.println("Unparseable date '" + dateStr + "': " + e.getMessage());
        return null;
    }
}
/**
 * Extracts one citation entry per {@code .margintop15} element of the given HTML.
 *
 * <p>Each entry always contains {@code doiUrl} and {@code citationUrl} (empty strings when no
 * matching link exists). When the element has at least three direct {@code span} children the
 * entry also contains {@code authors}, {@code title} and {@code year}, taken from the first
 * three spans in that order. Link targets are read from {@code javascript:popwin('...')} hrefs;
 * the first link whose text contains "doi.org" becomes {@code doiUrl}, the first other link
 * whose text contains "引用详细信息" becomes {@code citationUrl}.
 *
 * @param html the award detail page markup
 * @return the citation entries in document order
 */
public static List<Map<String, Object>> extractAllCitationInfo(String html) {
    Pattern popwinPattern = Pattern.compile("javascript:popwin\\('(.*?)'\\)");
    List<Map<String, Object>> result = new ArrayList<>();

    for (Element block : Jsoup.parse(html).select(".margintop15")) {
        Map<String, Object> entry = new HashMap<>();

        // Authors / title / year live in the first three direct span children.
        Elements directSpans = block.select("> span");
        if (directSpans.size() >= 3) {
            entry.put("authors", directSpans.get(0).text());
            entry.put("title", directSpans.get(1).text());
            entry.put("year", directSpans.get(2).text());
        }

        String doiUrl = "";
        String citationUrl = "";
        for (Element anchor : block.select("a")) {
            Matcher hrefMatcher = popwinPattern.matcher(anchor.attr("href"));
            if (!hrefMatcher.find()) {
                continue;
            }
            String target = hrefMatcher.group(1);
            String label = anchor.text();
            // Only the first link of each kind is kept.
            if (label.contains("doi.org") && doiUrl.isEmpty()) {
                doiUrl = target;
            } else if (label.contains("引用详细信息") && citationUrl.isEmpty()) {
                citationUrl = target;
            }
        }
        entry.put("doiUrl", doiUrl);
        entry.put("citationUrl", citationUrl);

        result.add(entry);
    }
    return result;
}
/**
 * Returns the current local date-time formatted as {@code yyyy-MM-dd HH:mm:ss},
 * for example {@code 2025-04-08 13:45:30}.
 *
 * @return the formatted current timestamp
 */
public static String localDateTime() {
    return DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss").format(LocalDateTime.now());
}
/**
 * Returns a copy of a pipe-delimited request payload whose offset field is increased by 30.
 *
 * <p>The offset is the fourth-from-last non-empty-terminated token of the payload (trailing
 * {@code |} delimiters are not counted). Everything else, including any terminating {@code |}
 * characters, is preserved exactly, so the result can be fed back into this method for the
 * following page.
 *
 * @param originalPayload the payload to advance
 * @return the payload with the offset token replaced by {@code offset + 30}
 * @throws IllegalArgumentException if the payload has fewer than four tokens or the offset
 *                                  token is not an integer
 */
public static String increaseOffsetBy30(String originalPayload) {
    // split() silently drops trailing empty tokens, i.e. the payload's terminating '|'.
    String[] parts = originalPayload.split("\\|");
    if (parts.length < 4) {
        throw new IllegalArgumentException("载荷格式无效,元素不足");
    }
    // Remember what split() dropped so it can be re-attached after the edit; joining the
    // untouched tokens always reproduces a prefix of the original payload.
    String trailingDelimiters = originalPayload.substring(String.join("|", parts).length());

    int targetIndex = parts.length - 4;
    try {
        int currentOffset = Integer.parseInt(parts[targetIndex]);
        parts[targetIndex] = String.valueOf(currentOffset + 30);
    } catch (NumberFormatException e) {
        throw new IllegalArgumentException("倒数第 4 个元素不是有效数字: " + parts[targetIndex], e);
    }
    return String.join("|", parts) + trailingDelimiters;
}
/**
 * Executes a request and, on HTTP 403, rotates to the next proxy and tries again.
 *
 * <p>The number of attempts equals the size of the proxy pool (one attempt when the pool is
 * empty). Any response whose status is not 403 is returned to the caller as is.
 *
 * @param client  the client used for the first attempt; later attempts use freshly built clients
 * @param request the request to execute
 * @param keyword the keyword being processed (not used by this method)
 * @return the first response whose status code is not 403
 * @throws IOException if every attempt was answered with 403, or if the call itself fails
 */
private static Response executeWithRetry(OkHttpClient client, Request request, String keyword) throws IOException {
    final int attemptsAllowed = proxyList.isEmpty() ? 1 : proxyList.size();
    OkHttpClient activeClient = client;
    for (int rejected = 0; rejected < attemptsAllowed; ) {
        Response reply = activeClient.newCall(request).execute();
        if (reply.code() != 403) {
            // Success, or a failure that a different proxy is not expected to fix.
            return reply;
        }
        System.out.println("收到 403 状态码,尝试切换代理重试...");
        reply.close();
        switchProxy();
        activeClient = createClientWithProxy();
        rejected++;
        if (rejected == attemptsAllowed) {
            throw new IOException("所有代理尝试失败,仍然收到 403");
        }
    }
    throw new IOException("无法执行请求,未获取响应");
}
/**
 * Builds an HTTP client with 30-second connect/read/write timeouts that routes through the
 * currently selected proxy.
 *
 * <p>No proxy is configured when the pool is empty, when the current index is outside the
 * pool, or when the selected entry is not of the form {@code host:port}.
 *
 * @return the configured client
 */
private static OkHttpClient createClientWithProxy() {
    OkHttpClient.Builder clientBuilder = new OkHttpClient().newBuilder()
            .connectTimeout(30, TimeUnit.SECONDS)
            .readTimeout(30, TimeUnit.SECONDS)
            .writeTimeout(30, TimeUnit.SECONDS);

    boolean proxySelectable = !proxyList.isEmpty() && currentProxyIndex < proxyList.size();
    if (!proxySelectable) {
        return clientBuilder.build();
    }

    String proxy = proxyList.get(currentProxyIndex);
    String[] hostAndPort = proxy.split(":");
    if (hostAndPort.length == 2) {
        java.net.InetSocketAddress address =
                new java.net.InetSocketAddress(hostAndPort[0], Integer.parseInt(hostAndPort[1]));
        clientBuilder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, address));
        System.out.println("使用代理: " + proxy);
    }
    return clientBuilder.build();
}
/**
 * Advances the current proxy index to the next pool entry, wrapping around at the end.
 * Does nothing when the pool is empty.
 */
private static synchronized void switchProxy() {
    int poolSize = proxyList.size();
    if (poolSize == 0) {
        return;
    }
    currentProxyIndex = (currentProxyIndex + 1) % poolSize;
    System.out.println("切换到新代理: " + proxyList.get(currentProxyIndex));
}
}

122
src/main/java/com/example/saveInES.java

@ -0,0 +1,122 @@
package com.example;
import co.elastic.clients.elasticsearch.ElasticsearchClient;
import co.elastic.clients.elasticsearch.core.IndexRequest;
import co.elastic.clients.elasticsearch.core.IndexResponse;
import co.elastic.clients.json.jackson.JacksonJsonpMapper;
import co.elastic.clients.transport.ElasticsearchTransport;
import co.elastic.clients.transport.rest_client.RestClientTransport;
import org.apache.http.HttpHost;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.elasticsearch.client.RestClient;
import java.io.IOException;
import java.time.Duration;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
public class saveInES {
/**
 * Consumes messages from the Kafka topic "news-topic" and stores each one in Elasticsearch.
 *
 * <p>Offsets are committed manually after every polled batch has been handed to
 * {@code saveToElasticsearch}. The loop runs until an exception escapes it; the consumer and
 * the Elasticsearch transport are closed on the way out.
 *
 * @param args unused
 */
public static void main(String[] args) {
    ElasticsearchClient esClient = createElasticsearchClient();

    Properties consumerProps = new Properties();
    consumerProps.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
    consumerProps.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
    consumerProps.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
    consumerProps.put(ConsumerConfig.GROUP_ID_CONFIG, "news-consumer-group");
    consumerProps.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
    // Auto-commit is off; offsets are committed explicitly below.
    consumerProps.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");

    KafkaConsumer<String, String> consumer = new KafkaConsumer<>(consumerProps);
    consumer.subscribe(Collections.singletonList("news-topic"));

    try {
        for (;;) {
            ConsumerRecords<String, String> batch = consumer.poll(Duration.ofMillis(1000));
            for (ConsumerRecord<String, String> message : batch) {
                System.out.println("Received message: key=" + message.key() + ", value=" + message.value());
                saveToElasticsearch(esClient, message.value());
            }
            consumer.commitSync();
        }
    } finally {
        consumer.close();
        try {
            esClient._transport().close();
        } catch (IOException e) {
            System.err.println("Error closing Elasticsearch client: " + e.getMessage());
        }
    }
}
/**
 * Creates an Elasticsearch client that talks to {@code localhost:9200} and maps JSON with Jackson.
 *
 * @return the new client
 */
private static ElasticsearchClient createElasticsearchClient() {
    HttpHost node = new HttpHost("localhost", 9200);
    RestClient lowLevelClient = RestClient.builder(node).build();
    return new ElasticsearchClient(new RestClientTransport(lowLevelClient, new JacksonJsonpMapper()));
}
/**
 * Parses a message and indexes it into the Elasticsearch index "news".
 *
 * <p>The document id is the message's {@code url} field. Any failure is logged to stderr and
 * swallowed, so the caller never sees an exception from this method.
 *
 * @param esClient the Elasticsearch client
 * @param message  the raw message text
 */
private static void saveToElasticsearch(ElasticsearchClient esClient, String message) {
    try {
        Map<String, String> document = parseMessageToMap(message);
        String documentId = document.get("url");

        IndexRequest<Map<String, String>> indexRequest = IndexRequest.of(builder -> builder
                .index("news")
                .id(documentId)
                .document(document)
        );

        IndexResponse indexed = esClient.index(indexRequest);
        System.out.println("Data saved to Elasticsearch: ID=" + indexed.id());
    } catch (Exception e) {
        System.err.println("Failed to save data to Elasticsearch: " + e.getMessage());
    }
}
/**
 * Parses a message of the form {@code {key1=value1, key2=value2}} into a map.
 *
 * <p>The enclosing braces are removed, the remainder is split on commas, and each piece is
 * split at its FIRST {@code =} only, so values that themselves contain {@code =} (for example
 * URLs with query strings) are kept intact. Pieces without an {@code =} are ignored. Keys and
 * values are trimmed.
 *
 * <p>NOTE(review): this is a key=value parser, not a JSON parser, and values containing commas
 * are still split apart. If the producer actually sends JSON, replace this with a real JSON
 * parser.
 *
 * @param message the raw message text
 * @return the parsed key/value pairs; empty when the message contains none
 */
private static Map<String, String> parseMessageToMap(String message) {
    String body = message.trim();
    // Strip only the enclosing braces; braces inside values belong to the data.
    if (body.startsWith("{")) {
        body = body.substring(1);
    }
    if (body.endsWith("}")) {
        body = body.substring(0, body.length() - 1);
    }

    Map<String, String> map = new HashMap<>();
    for (String pair : body.split(",")) {
        // Limit 2: everything after the first '=' is the value.
        String[] keyValue = pair.split("=", 2);
        if (keyValue.length == 2) {
            map.put(keyValue[0].trim(), keyValue[1].trim());
        }
    }
    return map;
}
}

101
src/main/java/com/example/test.java

@ -0,0 +1,101 @@
package com.example;// 注意如果你使用手动设置路径就不需要导入 WebDriverManager
// import io.github.bonigarcia.wdm.WebDriverManager;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.NoSuchElementException;
import org.openqa.selenium.TimeoutException;
import java.time.Duration;
import java.util.List;
/**
 * Manual Selenium smoke test: opens a PATENTSCOPE result page directly (the query is part of
 * the URL) and prints the title and link of every result item on the first page.
 *
 * <p>All CSS selectors used here are placeholders that must be confirmed against the live page
 * with the browser developer tools.
 */
public class test {
    public static void main(String[] args) {
        // The ChromeDriver binary is configured by hand; replace this with the local
        // chromedriver path (or switch to WebDriverManager.chromedriver().setup()).
        System.setProperty("webdriver.chrome.driver", "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe");
        WebDriver driver = null;
        try {
            // Default Chrome options; "--headless" / "--disable-gpu" can be added here.
            ChromeOptions chromeOptions = new ChromeOptions();
            driver = new ChromeDriver(chromeOptions);
            // The URL already carries the search query, so no search box interaction is needed.
            driver.get("https://patentscope.wipo.int/search/en/result.jsf?query=FP:(AI)");
            // Block (up to 20 s) until the result list container is present.
            WebDriverWait resultWait = new WebDriverWait(driver, Duration.ofSeconds(20));
            resultWait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.ps-result-list")));
            List<WebElement> items = driver.findElements(By.cssSelector("div.ps-result-item"));
            System.out.println("Found " + items.size() + " results:");
            for (WebElement item : items) {
                printResult(item);
            }
            // Pagination is not handled: only the first result page is printed.
        } catch (TimeoutException e) {
            System.err.println("等待元素超时,可能页面结构发生变化或加载缓慢: " + e.getMessage());
        } catch (NoSuchElementException e) {
            System.err.println("未能找到指定的元素,请检查元素定位器是否正确: " + e.getMessage());
        } catch (Exception e) {
            System.err.println("发生其他错误: " + e.getMessage());
            e.printStackTrace();
        } finally {
            // Always shut the browser down, even after a failure.
            if (driver != null) {
                driver.quit();
                System.out.println("Browser closed.");
            }
        }
    }

    /**
     * Prints the title and link of one result item, or a notice when either element is missing.
     */
    private static void printResult(WebElement item) {
        try {
            String title = item.findElement(By.cssSelector("span.ps-field-value.ps-field-title")).getText().trim();
            String patentLink = item.findElement(By.tagName("a")).getAttribute("href");
            System.out.println("Title: " + title + ", Link: " + patentLink);
        } catch (NoSuchElementException e) {
            System.out.println("Could not find title or link for a result item in this result item.");
        }
    }
}

103
src/main/java/com/example/testContent.java

@ -0,0 +1,103 @@
package com.example;
import okhttp3.*;
import org.joda.time.DateTime;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
public class testContent {
/**
 * Fetches one DRKS trial detail page, extracts the trial fields with fixed CSS selectors and
 * prints the resulting record.
 *
 * <p>Fields for which no selector exists are filled with the placeholder {@code "无"}.
 *
 * @param args unused
 * @throws IOException if the HTTP request or reading the response body fails
 */
public static void main(String[] args) throws IOException {
    String url = "https://www.drks.de/search/de/trial/DRKS00036725/details";
    OkHttpClient client = new OkHttpClient().newBuilder()
            .build();
    Request request = new Request.Builder()
            .url(url)
            .get()
            .addHeader("Content-Type", "application/json")
            .build();
    String html;
    // try-with-resources guarantees the response is closed, as OkHttp requires.
    try (Response response = client.newCall(request).execute()) {
        html = response.body().string();
    }
    Document parse = Jsoup.parse(html);
    String title = parse.select(".title-bold").text();
    String registNum = parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(2)").text();
    // The page shows the registration date as dd.MM.yyyy; normalise it for storage.
    String registTime = convertDate(parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(6)").text());
    Map<String,Object> sponsor = new HashMap<>();
    String header = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-header > h4").text();
    String site = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(2) > div").text();
    String telefon = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(4) > span").text();
    String disease = parse.select("body > main > div.card-body > div:nth-child(6) > div.card-body > div > div:nth-child(2) > dl > dd:nth-child(2) > span").text();
    String studyType = parse.select("body > main > div.card-body > div:nth-child(3) > div.card-body > dl").text();
    String inclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(3) > div > div.card-body > div > div.col-12.mt-3 > dl > dd > span").text();
    String exclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(4) > div > div.card-body > p > span").text();
    String country = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(1) > div > div.card-body > dl > dd:nth-child(2)").text();
    String intervention = parse.select("body > main > div.card-body > div:nth-child(4) > div.card-body > dl").text();
    String primaryOutcome = parse.select("body > main > div.card-body > div:nth-child(5) > div.card-body > div > div > dl").text();
    String enrollment = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(2) > div > div.card-body > div > div:nth-child(5) > dl > dd > span").text();
    // Sponsor details are nested as their own object inside the record.
    sponsor.put("header",header);
    sponsor.put("site",site);
    sponsor.put("telefon",telefon);
    Map<String,Object> resultData = new HashMap<>();
    resultData.put("title", title);
    resultData.put("registNum",registNum);
    resultData.put("registTime",registTime);
    resultData.put("registStatus","无");
    resultData.put("registTitle","无");
    resultData.put("fullTitle","无");
    resultData.put("sponsor",sponsor);
    resultData.put("sponsorPart","无");
    resultData.put("studyType",studyType);
    resultData.put("phase","无");
    resultData.put("disease",disease);
    resultData.put("studyDesign","无");
    resultData.put("studyObjective","无");
    resultData.put("studyStartDate","无");
    resultData.put("inclusionCriteria",inclusionCriteria);
    resultData.put("exclusionCriteria",exclusionCriteria);
    resultData.put("currentStatus","无");
    resultData.put("enrollment",enrollment);
    resultData.put("country",country);
    resultData.put("tagTime","无");
    resultData.put("intervention",intervention);
    resultData.put("primaryOutcome",primaryOutcome);
    resultData.put("crawlTime",getCurrentTime());
    resultData.put("crawlUrl",url);
    // The registration date doubles as the post time.
    resultData.put("postTime",registTime);
    // NOTE(review): these two values are literal placeholders, not extracted content.
    resultData.put("content","content");
    resultData.put("forwardcontent","forwardcontent");
    System.out.println(resultData);
}
/**
 * Converts a {@code dd.MM.yyyy} date into {@code yyyy-MM-dd HH:mm:ss}; the time part is always
 * midnight.
 *
 * @param inputDate date text such as {@code 15.03.2024}; surrounding whitespace is ignored
 * @return the reformatted date, or the literal {@code "Invalid date format"} when the input is
 *         {@code null}, cannot be parsed, or is not a real calendar date
 */
public static String convertDate(String inputDate) {
    if (inputDate == null) {
        return "Invalid date format";
    }
    try {
        SimpleDateFormat inputFormat = new SimpleDateFormat("dd.MM.yyyy");
        // Strict parsing: in lenient mode 31.02.2024 would silently roll over to 02.03.2024.
        inputFormat.setLenient(false);
        SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        // Parse and format in one fixed zone so the result never depends on local DST rules.
        java.util.TimeZone utc = java.util.TimeZone.getTimeZone("UTC");
        inputFormat.setTimeZone(utc);
        outputFormat.setTimeZone(utc);
        Date date = inputFormat.parse(inputDate.trim());
        return outputFormat.format(date);
    } catch (ParseException e) {
        // Callers expect a sentinel string rather than an exception.
        return "Invalid date format";
    }
}
/**
 * Returns the current local date-time formatted as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @return the formatted timestamp of "now" in the system default time zone
 */
public static String getCurrentTime() {
    return LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
}
}

340
src/main/java/com/example/testList.java

@ -0,0 +1,340 @@
package com.example;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class testList {
/**
 * Replays the DRKS (JSF) search flow with plain {@code HttpURLConnection} calls and prints the
 * trial links found on one result page:
 * <ol>
 *   <li>GET the search page to collect cookies and the initial ViewState;</li>
 *   <li>POST the search form and expect a 302 redirect;</li>
 *   <li>GET the redirect target to obtain the result page's ViewState;</li>
 *   <li>POST the pagination form for the requested page and parse the returned HTML.</li>
 * </ol>
 *
 * @param args unused
 * @throws Exception on any network failure that is not handled inline
 */
public static void main(String[] args) throws Exception {
    String targetUrl = "https://www.drks.de/search/de/results?page=4";
    String baseUrl = "https://www.drks.de/search/de";
    String hostUrl = "https://www.drks.de";
    // One Accept / User-Agent pair shared by every request, so all requests of the session
    // present the same browser identity.
    String acceptHeader = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7";
    String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";

    String cleanUrl = targetUrl.split("\\?")[0];
    System.out.println("Pure URL: " + cleanUrl);
    String pageNumber = targetUrl.contains("?page=") ? targetUrl.split("page=")[1] : "1";
    int page = Integer.parseInt(pageNumber);
    System.out.println("Page Number: " + page);

    // Cookies collected across all requests of the session.
    Set<String> cookieSet = new HashSet<>();

    // Step 1: initial GET to obtain the session cookies and the first ViewState.
    HttpURLConnection initialConn = (HttpURLConnection) new URL(baseUrl).openConnection();
    initialConn.setRequestMethod("GET");
    initialConn.setInstanceFollowRedirects(false);
    initialConn.setConnectTimeout(10000);
    initialConn.setReadTimeout(10000);
    String sessionId = updateCookies(initialConn, cookieSet);
    System.out.println("Initial Cookies: " + cookieSet);
    System.out.println("Initial Session ID: " + sessionId);
    String initialHtml = readResponseBody(initialConn, false);
    initialConn.disconnect();
    String initialViewState = extractViewState(initialHtml);
    System.out.println("Initial ViewState: " + initialViewState);

    // Step 2: submit the search form.
    HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection();
    searchConn.setRequestMethod("POST");
    searchConn.setInstanceFollowRedirects(false);
    searchConn.setDoOutput(true);
    searchConn.setConnectTimeout(10000);
    searchConn.setReadTimeout(10000);
    searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
    searchConn.setRequestProperty("Accept", acceptHeader);
    searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
    searchConn.setRequestProperty("Origin", "https://www.drks.de");
    searchConn.setRequestProperty("Referer", baseUrl);
    searchConn.setRequestProperty("User-Agent", userAgent);
    String searchPostData = buildSearchPostData(initialViewState);
    System.out.println("Search POST Data: " + searchPostData);
    try (OutputStream os = searchConn.getOutputStream()) {
        os.write(searchPostData.getBytes(StandardCharsets.UTF_8));
    }
    String searchSessionId = updateCookies(searchConn, cookieSet);
    System.out.println("Search Cookies: " + cookieSet);
    System.out.println("Search Session ID: " + searchSessionId);
    int searchResponseCode = searchConn.getResponseCode();
    System.out.println("Search Response Code: " + searchResponseCode);
    String redirectUrl = searchConn.getHeaderField("Location");
    searchConn.disconnect();
    // The search is only considered successful when the server answers with a redirect.
    if (searchResponseCode != 302 || redirectUrl == null) {
        System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode);
        return;
    }
    System.out.println("Redirect URL (raw): " + redirectUrl);
    redirectUrl = toAbsoluteUrl(hostUrl, redirectUrl);
    System.out.println("Resolved Redirect URL: " + redirectUrl);

    // Step 3: follow the redirect manually so the cookies can be sent along.
    HttpURLConnection resultsConn = (HttpURLConnection) new URL(redirectUrl).openConnection();
    resultsConn.setRequestMethod("GET");
    resultsConn.setInstanceFollowRedirects(false);
    resultsConn.setConnectTimeout(10000);
    resultsConn.setReadTimeout(10000);
    resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
    resultsConn.setRequestProperty("Accept", acceptHeader);
    resultsConn.setRequestProperty("User-Agent", userAgent);
    String resultsSessionId = updateCookies(resultsConn, cookieSet);
    System.out.println("Results Cookies: " + cookieSet);
    System.out.println("Results Session ID: " + resultsSessionId);
    String resultsHtml = readResponseBody(resultsConn, false);
    resultsConn.disconnect();
    // The ViewState of the result page is required by the pagination request.
    String viewState = extractViewState(resultsHtml);
    System.out.println("Results ViewState: " + viewState);
    // Warn only when the server actually issued a different session cookie; a response that
    // sets no cookie at all keeps the existing session.
    if (sessionId != null && resultsSessionId != null && !sessionId.equals(resultsSessionId)) {
        System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId);
    }

    // Step 4: request the wanted page through the pagination form.
    HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection();
    postConn.setRequestMethod("POST");
    postConn.setInstanceFollowRedirects(false);
    postConn.setDoOutput(true);
    postConn.setConnectTimeout(10000);
    postConn.setReadTimeout(10000);
    // Headers imitate a regular browser form submission (not an AJAX call).
    postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
    postConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
    postConn.setRequestProperty("Accept", acceptHeader);
    postConn.setRequestProperty("Origin", "https://www.drks.de");
    postConn.setRequestProperty("Referer", cleanUrl);
    postConn.setRequestProperty("User-Agent", userAgent);
    postConn.setRequestProperty("Sec-Fetch-Dest", "document");
    postConn.setRequestProperty("Sec-Fetch-Mode", "navigate");
    String postData = buildPostData(viewState, page);
    System.out.println("Pagination POST Data: " + postData);
    try (OutputStream os = postConn.getOutputStream()) {
        os.write(postData.getBytes(StandardCharsets.UTF_8));
    }
    String paginationSessionId = updateCookies(postConn, cookieSet);
    System.out.println("Pagination Cookies: " + cookieSet);
    System.out.println("Pagination Session ID: " + paginationSessionId);
    int responseCode = postConn.getResponseCode();
    System.out.println("Pagination Response Code: " + responseCode);
    // For error statuses the body is delivered on the error stream.
    String postContent = readResponseBody(postConn, responseCode >= 400);
    String newUrl = postConn.getHeaderField("Location");
    postConn.disconnect();

    Document parse = null;
    if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP
            || responseCode == HttpURLConnection.HTTP_MOVED_PERM
            || responseCode == HttpURLConnection.HTTP_SEE_OTHER) {
        System.out.println("Pagination Redirecting to: " + newUrl);
        if (newUrl == null) {
            System.err.println("Pagination redirect without Location header. Response code: " + responseCode);
            return;
        }
        newUrl = toAbsoluteUrl(hostUrl, newUrl);
        HttpURLConnection followConn = (HttpURLConnection) new URL(newUrl).openConnection();
        followConn.setRequestMethod("GET");
        followConn.setConnectTimeout(10000);
        followConn.setReadTimeout(10000);
        followConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
        followConn.setRequestProperty("Accept", acceptHeader);
        followConn.setRequestProperty("User-Agent", userAgent);
        String redirectContent = readResponseBody(followConn, false);
        followConn.disconnect();
        System.out.println("Redirect Response: " + redirectContent);
        parse = Jsoup.parse(redirectContent);
    } else if (responseCode == 200) {
        System.out.println("Pagination Response: " + postContent);
        parse = Jsoup.parse(postContent);
    }
    if (parse == null) {
        // Any other status carries no result page; stop instead of failing on a null document.
        System.err.println("Unexpected pagination response code " + responseCode + "; nothing to parse.");
        return;
    }

    // Each result row exposes the trial title as a link.
    Elements links = parse.select("div[data-label='Titel der Studie'] a");
    for (Element link : links) {
        System.out.println("链接: " + link.attr("href"));
        System.out.println("标题: " + link.text());
    }
    // The pager text contains "Seite <n> /"; the number before the slash is printed.
    // NOTE(review): the printed label reads like a total, but the captured group is the
    // number before the slash - confirm which value is wanted.
    String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text();
    String regex = "Seite\\s*(\\d+)\\s*/";
    Matcher matcher = Pattern.compile(regex).matcher(text);
    if (matcher.find()) {
        System.out.println("总共有"+matcher.group(1));
    }
}

/**
 * Reads a response body as UTF-8 text, concatenating its lines without separators.
 *
 * @param conn           connection whose response is read
 * @param useErrorStream {@code true} to read the error stream (HTTP status 400 and above)
 * @return the body text, or an empty string when the error stream is requested but absent
 */
private static String readResponseBody(HttpURLConnection conn, boolean useErrorStream) throws java.io.IOException {
    if (useErrorStream && conn.getErrorStream() == null) {
        return "";
    }
    StringBuilder body = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            useErrorStream ? conn.getErrorStream() : conn.getInputStream(), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            body.append(line);
        }
    }
    return body.toString();
}

/**
 * Turns a {@code Location} header value into an absolute URL by prefixing the host when the
 * value does not already start with {@code http}.
 */
private static String toAbsoluteUrl(String hostUrl, String location) {
    if (location.startsWith("http")) {
        return location;
    }
    return hostUrl + (location.startsWith("/") ? location : "/" + location);
}
/**
 * Merges the {@code Set-Cookie} headers of a response into the cookie jar.
 *
 * <p>Only the leading {@code name=value} part of each cookie is kept. A cookie that is set
 * again replaces the stored entry with the same name, so a rotated session id is never sent
 * together with its stale predecessor.
 *
 * @param conn      connection whose response headers are inspected (this triggers the request
 *                  if it has not been sent yet)
 * @param cookieSet jar of {@code name=value} strings, updated in place
 * @return the last {@code JSESSIONID=...} or {@code csfcfc=...} pair seen in this response, or
 *         {@code null} when the response set neither
 */
private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) {
    String sessionId = null;
    for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
        // Header names are case-insensitive; comparing from the literal also tolerates a
        // null key.
        if (!"Set-Cookie".equalsIgnoreCase(header.getKey())) {
            continue;
        }
        for (String cookie : header.getValue()) {
            // Drop the attributes (Path, HttpOnly, ...) that follow the first ';'.
            int semi = cookie.indexOf(';');
            String cookieValue = semi < 0 ? cookie : cookie.substring(0, semi);
            if (cookieValue.isEmpty()) {
                continue;
            }
            int eq = cookieValue.indexOf('=');
            if (eq > 0) {
                // Replace any earlier value stored for the same cookie name.
                String namePrefix = cookieValue.substring(0, eq + 1);
                cookieSet.removeIf(existing -> existing.startsWith(namePrefix));
            }
            cookieSet.add(cookieValue);
            if (cookieValue.startsWith("JSESSIONID=") || cookieValue.startsWith("csfcfc=")) {
                sessionId = cookieValue;
            }
        }
    }
    return sessionId;
}
/**
 * Extracts the JSF ViewState token from a page.
 *
 * <p>Matches a hidden input named {@code jakarta.faces.ViewState} or
 * {@code javax.faces.ViewState} whose {@code value} attribute follows the {@code name}
 * attribute.
 *
 * @param html page markup, may be {@code null}
 * @return the token, or an empty string (after logging to stderr) when the markup is
 *         null/empty or contains no such input
 */
private static String extractViewState(String html) {
    boolean hasContent = html != null && !html.isEmpty();
    if (hasContent) {
        Matcher viewStateMatcher = Pattern
                .compile("<input[^>]*name=[\"'](?:jakarta|javax)\\.faces\\.ViewState[\"'][^>]*value=[\"']([^\"']+)[\"']")
                .matcher(html);
        if (viewStateMatcher.find()) {
            return viewStateMatcher.group(1);
        }
        System.err.println("Failed to extract ViewState from HTML");
        return "";
    }
    System.err.println("HTML content is empty or null");
    return "";
}
/**
 * Builds the URL-encoded body of the search form submission.
 *
 * <p>The search term is fixed to {@code Midwifery}; every extended-search field is sent empty.
 *
 * @param viewState ViewState token taken from the search page
 * @return the form body, or an empty string (after logging to stderr) if encoding the
 *         ViewState fails
 */
private static String buildSearchPostData(String viewState) {
    try {
        StringBuilder form = new StringBuilder();
        form.append("searchForm=searchForm");
        form.append("&searchForm%3Aj_idt80=Midwifery");
        // "characteristics" tab of the extended search
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=");
        // "recruitment" tab
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=");
        // "trialDesign" tab
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=");
        form.append("&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=");
        form.append("&searchForm%3Aj_idt287=");
        // The ViewState is the only dynamic value and must be URL-encoded.
        form.append("&javax.faces.ViewState=");
        form.append(URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()));
        return form.toString();
    } catch (Exception e) {
        System.err.println("Error encoding search ViewState: " + e.getMessage());
        return "";
    }
}
/**
 * Builds the URL-encoded body of the pagination form submission.
 *
 * @param viewState ViewState token taken from the result page
 * @param page      1-based page number to request; the pager component id in the form uses
 *                  the zero-based index {@code page - 1}
 * @return the form body, or an empty string (after logging to stderr) if encoding the
 *         ViewState fails
 */
private static String buildPostData(String viewState, int page) {
    int pagerIndex = page - 1;
    try {
        String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name());
        StringBuilder form = new StringBuilder();
        form.append("resultForm=resultForm");
        form.append("&resultForm%3Asorting%3ArowsPerPage=10");
        form.append("&resultForm%3ApaginationTop%3Aj_idt156%3A").append(pagerIndex)
                .append("%3Aj_idt158=").append(page);
        form.append("&resultForm%3Asorting%3AsortingBy=SCORE");
        form.append("&resultForm%3Asorting%3Aj_idt141=true");
        form.append("&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation");
        form.append("&selectedType=JSON");
        form.append("&javax.faces.ViewState=").append(encodedViewState);
        return form.toString();
    } catch (Exception e) {
        System.err.println("Error encoding pagination ViewState: " + e.getMessage());
        return "";
    }
}
}

22
src/main/java/com/example/umlistTest.java

@ -0,0 +1,22 @@
package com.example;
import okhttp3.*;
import java.io.IOException;
/**
 * Manual check: downloads one WHO publication page with OkHttp and prints the raw HTML.
 */
public class umlistTest {
    /**
     * @param args unused
     * @throws IOException if the request or reading the response body fails
     */
    public static void main(String[] args) throws IOException {
        OkHttpClient client = new OkHttpClient().newBuilder()
                .build();
        // Plain GET without a request body; a Cookie header can be added here if the site
        // starts requiring one.
        Request request = new Request.Builder()
                .url("http://who.int/westernpacific/publications/m/item/bi-weekly-covid-19-situation-update--11-april-2025")
                .get()
                .build();
        // try-with-resources guarantees the response is closed, as OkHttp requires.
        try (Response response = client.newCall(request).execute()) {
            String html = response.body().string();
            System.out.println(html);
        }
    }
}

12
src/main/resources/logback.xml

@ -0,0 +1,12 @@
<!-- Logback configuration: everything at INFO and above is written to the console. -->
<configuration>
<!-- Console appender named STDOUT. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Line layout: time, [thread], level, logger name, message, newline. -->
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<!-- Root logger: INFO threshold, attached to the STDOUT appender above. -->
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
</configuration>

BIN
target/classes/com/example/AusContent.class

BIN
target/classes/com/example/AusList.class

BIN
target/classes/com/example/CaptchaOCR.class

BIN
target/classes/com/example/CsAirScraper.class

BIN
target/classes/com/example/CtriScraper.class

BIN
target/classes/com/example/CtriScraperContent.class

BIN
target/classes/com/example/Inka.class

BIN
target/classes/com/example/NSFAwardCrawler.class

BIN
target/classes/com/example/PatentscopeSeleniumCrawler.class

BIN
target/classes/com/example/ProxyIPChecker.class

BIN
target/classes/com/example/ScraperWithCaptcha$1.class

BIN
target/classes/com/example/ScraperWithCaptcha$PageInfo.class

BIN
target/classes/com/example/ScraperWithCaptcha.class

BIN
target/classes/com/example/StringFieldExtractor.class

BIN
target/classes/com/example/WipoPatentsSelenium.class

BIN
target/classes/com/example/cliniTopic.class

BIN
target/classes/com/example/drks.class

BIN
target/classes/com/example/getInKa.class

BIN
target/classes/com/example/jsonGetOk.class

BIN
target/classes/com/example/ook.class

BIN
target/classes/com/example/oook.class

BIN
target/classes/com/example/projTopic.class

BIN
target/classes/com/example/saveInES.class

BIN
target/classes/com/example/test.class

BIN
target/classes/com/example/testContent.class

BIN
target/classes/com/example/testList.class

BIN
target/classes/com/example/umlistTest.class

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save