commit
86b867f541
108 changed files with 7781 additions and 0 deletions
-
8.idea/.gitignore
-
13.idea/compiler.xml
-
20.idea/jarRepositories.xml
-
14.idea/misc.xml
-
124.idea/uiDesigner.xml
-
23.project
-
BINNsantegouvListRe.jar
-
8bin/.idea/.gitignore
-
13bin/.idea/compiler.xml
-
20bin/.idea/jarRepositories.xml
-
14bin/.idea/misc.xml
-
23bin/.project
-
167bin/hs_err_pid15760.log
-
6bin/keywords.txt
-
138bin/pom.xml
-
281bin/processed_urls.txt
-
1bin/proxy.txt
-
BINbin/src/main/java/com/example/Inka.class
-
BINbin/src/main/java/com/example/NSFAwardCrawler.class
-
BINbin/src/main/java/com/example/PatentscopeSeleniumCrawler.class
-
BINbin/src/main/java/com/example/ProxyIPChecker.class
-
BINbin/src/main/java/com/example/StringFieldExtractor.class
-
BINbin/src/main/java/com/example/getInKa.class
-
BINbin/src/main/java/com/example/jsonGetOk.class
-
BINbin/src/main/java/com/example/ook.class
-
BINbin/src/main/java/com/example/oook.class
-
BINbin/src/main/java/com/example/projTopic.class
-
BINbin/src/main/java/com/example/saveInES.class
-
BINbin/src/main/java/com/example/test.class
-
BINbin/src/main/java/com/example/test2.class
-
BINbin/src/main/java/com/example/testContent.class
-
BINbin/src/main/java/com/example/umlistTest.class
-
4bin/target/classes/META-INF/MANIFEST.MF
-
BINbin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar
-
BINbin/target/es-crawler-1.0-SNAPSHOT.jar
-
5bin/target/maven-archiver/pom.properties
-
1bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
-
1bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
-
0bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst
-
0bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst
-
167hs_err_pid15760.log
-
1045keywords.txt
-
BINoriginal_captcha.png
-
150pom.xml
-
BINpreprocessed_captcha.png
-
281processed_urls.txt
-
1proxy.txt
-
119src/main/java/com/example/AusContent.java
-
200src/main/java/com/example/AusList.java
-
173src/main/java/com/example/CaptchaOCR.java
-
81src/main/java/com/example/CsAirScraper.java
-
404src/main/java/com/example/CtriScraper.java
-
121src/main/java/com/example/CtriScraperContent.java
-
113src/main/java/com/example/Inka.java
-
111src/main/java/com/example/NSFAwardCrawler.java
-
130src/main/java/com/example/PatentscopeSeleniumCrawler.java
-
25src/main/java/com/example/ProxyIPChecker.java
-
496src/main/java/com/example/ScraperWithCaptcha.java
-
74src/main/java/com/example/StringFieldExtractor.java
-
60src/main/java/com/example/WipoPatentsSelenium.java
-
594src/main/java/com/example/cliniTopic.java
-
438src/main/java/com/example/drks.java
-
165src/main/java/com/example/getInKa.java
-
47src/main/java/com/example/jsonGetOk.java
-
256src/main/java/com/example/ook.java
-
524src/main/java/com/example/oook.java
-
403src/main/java/com/example/projTopic.java
-
122src/main/java/com/example/saveInES.java
-
101src/main/java/com/example/test.java
-
103src/main/java/com/example/testContent.java
-
340src/main/java/com/example/testList.java
-
22src/main/java/com/example/umlistTest.java
-
12src/main/resources/logback.xml
-
BINtarget/classes/com/example/AusContent.class
-
BINtarget/classes/com/example/AusList.class
-
BINtarget/classes/com/example/CaptchaOCR.class
-
BINtarget/classes/com/example/CsAirScraper.class
-
BINtarget/classes/com/example/CtriScraper.class
-
BINtarget/classes/com/example/CtriScraperContent.class
-
BINtarget/classes/com/example/Inka.class
-
BINtarget/classes/com/example/NSFAwardCrawler.class
-
BINtarget/classes/com/example/PatentscopeSeleniumCrawler.class
-
BINtarget/classes/com/example/ProxyIPChecker.class
-
BINtarget/classes/com/example/ScraperWithCaptcha$1.class
-
BINtarget/classes/com/example/ScraperWithCaptcha$PageInfo.class
-
BINtarget/classes/com/example/ScraperWithCaptcha.class
-
BINtarget/classes/com/example/StringFieldExtractor.class
-
BINtarget/classes/com/example/WipoPatentsSelenium.class
-
BINtarget/classes/com/example/cliniTopic.class
-
BINtarget/classes/com/example/drks.class
-
BINtarget/classes/com/example/getInKa.class
-
BINtarget/classes/com/example/jsonGetOk.class
-
BINtarget/classes/com/example/ook.class
-
BINtarget/classes/com/example/oook.class
-
BINtarget/classes/com/example/projTopic.class
-
BINtarget/classes/com/example/saveInES.class
-
BINtarget/classes/com/example/test.class
-
BINtarget/classes/com/example/testContent.class
-
BINtarget/classes/com/example/testList.class
-
BINtarget/classes/com/example/umlistTest.class
@ -0,0 +1,8 @@ |
|||
# Default ignored files |
|||
/shelf/ |
|||
/workspace.xml |
|||
# Datasource local storage ignored files |
|||
/dataSources/ |
|||
/dataSources.local.xml |
|||
# Editor-based HTTP Client requests |
|||
/httpRequests/ |
@ -0,0 +1,13 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="CompilerConfiguration"> |
|||
<annotationProcessing> |
|||
<profile name="Maven default annotation processors profile" enabled="true"> |
|||
<sourceOutputDir name="target/generated-sources/annotations" /> |
|||
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" /> |
|||
<outputRelativeToContentRoot value="true" /> |
|||
<module name="DaKaES" /> |
|||
</profile> |
|||
</annotationProcessing> |
|||
</component> |
|||
</project> |
@ -0,0 +1,20 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="RemoteRepositoriesConfiguration"> |
|||
<remote-repository> |
|||
<option name="id" value="central" /> |
|||
<option name="name" value="Central Repository" /> |
|||
<option name="url" value="https://repo.maven.apache.org/maven2" /> |
|||
</remote-repository> |
|||
<remote-repository> |
|||
<option name="id" value="central" /> |
|||
<option name="name" value="Maven Central repository" /> |
|||
<option name="url" value="https://repo1.maven.org/maven2" /> |
|||
</remote-repository> |
|||
<remote-repository> |
|||
<option name="id" value="jboss.community" /> |
|||
<option name="name" value="JBoss Community repository" /> |
|||
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" /> |
|||
</remote-repository> |
|||
</component> |
|||
</project> |
@ -0,0 +1,14 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ExternalStorageConfigurationManager" enabled="true" /> |
|||
<component name="MavenProjectsManager"> |
|||
<option name="originalFiles"> |
|||
<list> |
|||
<option value="$PROJECT_DIR$/pom.xml" /> |
|||
</list> |
|||
</option> |
|||
</component> |
|||
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="corretto-1.8" project-jdk-type="JavaSDK"> |
|||
<output url="file://$PROJECT_DIR$/out" /> |
|||
</component> |
|||
</project> |
@ -0,0 +1,124 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="Palette2"> |
|||
<group name="Swing"> |
|||
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" /> |
|||
</item> |
|||
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" /> |
|||
</item> |
|||
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" /> |
|||
</item> |
|||
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.png" removable="false" auto-create-binding="false" can-attach-label="true"> |
|||
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" /> |
|||
</item> |
|||
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" /> |
|||
<initial-values> |
|||
<property name="text" value="Button" /> |
|||
</initial-values> |
|||
</item> |
|||
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" /> |
|||
<initial-values> |
|||
<property name="text" value="RadioButton" /> |
|||
</initial-values> |
|||
</item> |
|||
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" /> |
|||
<initial-values> |
|||
<property name="text" value="CheckBox" /> |
|||
</initial-values> |
|||
</item> |
|||
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" /> |
|||
<initial-values> |
|||
<property name="text" value="Label" /> |
|||
</initial-values> |
|||
</item> |
|||
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1"> |
|||
<preferred-size width="150" height="-1" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1"> |
|||
<preferred-size width="150" height="-1" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1"> |
|||
<preferred-size width="150" height="-1" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3"> |
|||
<preferred-size width="150" height="50" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3"> |
|||
<preferred-size width="150" height="50" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3"> |
|||
<preferred-size width="150" height="50" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" /> |
|||
</item> |
|||
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3"> |
|||
<preferred-size width="150" height="50" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3"> |
|||
<preferred-size width="150" height="50" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3"> |
|||
<preferred-size width="150" height="50" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3"> |
|||
<preferred-size width="200" height="200" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3"> |
|||
<preferred-size width="200" height="200" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.png" removable="false" auto-create-binding="true" can-attach-label="true"> |
|||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" /> |
|||
</item> |
|||
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" /> |
|||
</item> |
|||
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" /> |
|||
</item> |
|||
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" /> |
|||
</item> |
|||
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1"> |
|||
<preferred-size width="-1" height="20" /> |
|||
</default-constraints> |
|||
</item> |
|||
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.png" removable="false" auto-create-binding="false" can-attach-label="false"> |
|||
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" /> |
|||
</item> |
|||
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.png" removable="false" auto-create-binding="true" can-attach-label="false"> |
|||
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" /> |
|||
</item> |
|||
</group> |
|||
</component> |
|||
</project> |
@ -0,0 +1,23 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<projectDescription> |
|||
<name>DaKaES</name> |
|||
<comment></comment> |
|||
<projects> |
|||
</projects> |
|||
<buildSpec> |
|||
<buildCommand> |
|||
<name>org.eclipse.jdt.core.javabuilder</name> |
|||
<arguments> |
|||
</arguments> |
|||
</buildCommand> |
|||
<buildCommand> |
|||
<name>org.eclipse.m2e.core.maven2Builder</name> |
|||
<arguments> |
|||
</arguments> |
|||
</buildCommand> |
|||
</buildSpec> |
|||
<natures> |
|||
<nature>org.eclipse.jdt.core.javanature</nature> |
|||
<nature>org.eclipse.m2e.core.maven2Nature</nature> |
|||
</natures> |
|||
</projectDescription> |
@ -0,0 +1,8 @@ |
|||
# Default ignored files |
|||
/shelf/ |
|||
/workspace.xml |
|||
# Datasource local storage ignored files |
|||
/dataSources/ |
|||
/dataSources.local.xml |
|||
# Editor-based HTTP Client requests |
|||
/httpRequests/ |
@ -0,0 +1,13 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="CompilerConfiguration"> |
|||
<annotationProcessing> |
|||
<profile name="Maven default annotation processors profile" enabled="true"> |
|||
<sourceOutputDir name="target/generated-sources/annotations" /> |
|||
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" /> |
|||
<outputRelativeToContentRoot value="true" /> |
|||
<module name="DaKaES" /> |
|||
</profile> |
|||
</annotationProcessing> |
|||
</component> |
|||
</project> |
@ -0,0 +1,20 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="RemoteRepositoriesConfiguration"> |
|||
<remote-repository> |
|||
<option name="id" value="central" /> |
|||
<option name="name" value="Central Repository" /> |
|||
<option name="url" value="https://repo.maven.apache.org/maven2" /> |
|||
</remote-repository> |
|||
<remote-repository> |
|||
<option name="id" value="central" /> |
|||
<option name="name" value="Maven Central repository" /> |
|||
<option name="url" value="https://repo1.maven.org/maven2" /> |
|||
</remote-repository> |
|||
<remote-repository> |
|||
<option name="id" value="jboss.community" /> |
|||
<option name="name" value="JBoss Community repository" /> |
|||
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" /> |
|||
</remote-repository> |
|||
</component> |
|||
</project> |
@ -0,0 +1,14 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ExternalStorageConfigurationManager" enabled="true" /> |
|||
<component name="MavenProjectsManager"> |
|||
<option name="originalFiles"> |
|||
<list> |
|||
<option value="$PROJECT_DIR$/pom.xml" /> |
|||
</list> |
|||
</option> |
|||
</component> |
|||
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="corretto-1.8" project-jdk-type="JavaSDK"> |
|||
<output url="file://$PROJECT_DIR$/out" /> |
|||
</component> |
|||
</project> |
@ -0,0 +1,23 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<projectDescription> |
|||
<name>DaKaES</name> |
|||
<comment></comment> |
|||
<projects> |
|||
</projects> |
|||
<buildSpec> |
|||
<buildCommand> |
|||
<name>org.eclipse.jdt.core.javabuilder</name> |
|||
<arguments> |
|||
</arguments> |
|||
</buildCommand> |
|||
<buildCommand> |
|||
<name>org.eclipse.m2e.core.maven2Builder</name> |
|||
<arguments> |
|||
</arguments> |
|||
</buildCommand> |
|||
</buildSpec> |
|||
<natures> |
|||
<nature>org.eclipse.jdt.core.javanature</nature> |
|||
<nature>org.eclipse.m2e.core.maven2Nature</nature> |
|||
</natures> |
|||
</projectDescription> |
@ -0,0 +1,167 @@ |
|||
# |
|||
# There is insufficient memory for the Java Runtime Environment to continue. |
|||
# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap |
|||
# Possible reasons: |
|||
# The system is out of physical RAM or swap space |
|||
# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap |
|||
# Possible solutions: |
|||
# Reduce memory load on the system |
|||
# Increase physical memory or swap space |
|||
# Check if swap backing store is full |
|||
# Decrease Java heap size (-Xmx/-Xms) |
|||
# Decrease number of Java threads |
|||
# Decrease Java thread stack sizes (-Xss) |
|||
# Set larger code cache with -XX:ReservedCodeCacheSize= |
|||
# JVM is running with Zero Based Compressed Oops mode in which the Java heap is |
|||
# placed in the first 32GB address space. The Java Heap base address is the |
|||
# maximum limit for the native heap growth. Please use -XX:HeapBaseMinAddress |
|||
# to set the Java Heap base and to place the Java Heap above 32GB virtual address. |
|||
# This output file may be truncated or incomplete. |
|||
# |
|||
# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334 |
|||
# |
|||
# JRE version: (8.0_422-b05) (build ) |
|||
# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops) |
|||
# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows |
|||
# |
|||
|
|||
--------------- T H R E A D --------------- |
|||
|
|||
Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] |
|||
|
|||
Stack: [0x00000082a1500000,0x00000082a1600000] |
|||
[error occurred during error reporting (printing stack bounds), id 0xc0000005] |
|||
|
|||
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code) |
|||
|
|||
|
|||
--------------- P R O C E S S --------------- |
|||
|
|||
Java Threads: ( => current thread ) |
|||
|
|||
Other Threads: |
|||
|
|||
=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] |
|||
|
|||
VM state:not at safepoint (normal execution) |
|||
|
|||
VM Mutex/Monitor currently owned by a thread: None |
|||
|
|||
heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3 |
|||
Narrow klass base: 0x0000000000000000, Narrow klass shift: 3 |
|||
Compressed class space size: 1073741824 Address: 0x00000007c0000000 |
|||
|
|||
Heap: |
|||
PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000) |
|||
eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000) |
|||
from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000) |
|||
to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000) |
|||
ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000) |
|||
object space 172032K, 0% used [0x00000006c4000000,0x00000006c4000000,0x00000006ce800000) |
|||
Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K |
|||
class space used 76K, capacity 384K, committed 384K, reserved 1048576K |
|||
|
|||
Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000 |
|||
|
|||
Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0 |
|||
Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000) |
|||
End Bits: [0x00000271cd7a0000, 0x00000271d16a0000) |
|||
|
|||
Polling page: 0x00000271b7eb0000 |
|||
|
|||
CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb |
|||
bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000] |
|||
total_blobs=57 nmethods=0 adapters=38 |
|||
compilation: enabled |
|||
|
|||
Compilation events (0 events): |
|||
No events |
|||
|
|||
GC Heap History (0 events): |
|||
No events |
|||
|
|||
Deoptimization events (0 events): |
|||
No events |
|||
|
|||
Classes redefined (0 events): |
|||
No events |
|||
|
|||
Internal exceptions (0 events): |
|||
No events |
|||
|
|||
Events (10 events): |
|||
Event: 0.012 loading class java/lang/Short |
|||
Event: 0.013 loading class java/lang/Short done |
|||
Event: 0.013 loading class java/lang/Integer |
|||
Event: 0.013 loading class java/lang/Integer done |
|||
Event: 0.013 loading class java/lang/Long |
|||
Event: 0.013 loading class java/lang/Long done |
|||
Event: 0.013 loading class java/lang/NullPointerException |
|||
Event: 0.013 loading class java/lang/NullPointerException done |
|||
Event: 0.013 loading class java/lang/ArithmeticException |
|||
Event: 0.013 loading class java/lang/ArithmeticException done |
|||
|
|||
|
|||
Dynamic libraries: |
|||
0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe |
|||
0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll |
|||
0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL |
|||
0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll |
|||
0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll |
|||
0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll |
|||
0x00007ffa1cdf0000 - 0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll |
|||
0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll |
|||
0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll |
|||
0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll |
|||
0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll |
|||
0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll |
|||
0x00007ffa1ac80000 - 0x00007ffa1ad9a000 C:\Windows\System32\gdi32full.dll |
|||
0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll |
|||
0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll |
|||
0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll |
|||
0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL |
|||
0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll |
|||
0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll |
|||
0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll |
|||
0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL |
|||
0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll |
|||
0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll |
|||
0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll |
|||
0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll |
|||
0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll |
|||
0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll |
|||
0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll |
|||
0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll |
|||
0x00007ffa0af80000 - 0x00007ffa0af89000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll |
|||
0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll |
|||
0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll |
|||
|
|||
VM Arguments: |
|||
jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8 |
|||
java_command: com.example.saveInES |
|||
java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.jar;C:\Users\18264\.m2\repository\c |
|||
Launcher Type: SUN_STANDARD |
|||
|
|||
Environment Variables: |
|||
JAVA_HOME=E:\java |
|||
PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node |
|||
USERNAME=18264 |
|||
OS=Windows_NT |
|||
PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel |
|||
|
|||
|
|||
|
|||
--------------- S Y S T E M --------------- |
|||
|
|||
OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438) |
|||
|
|||
CPU:total 16 (initial active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx |
|||
|
|||
Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free) |
|||
|
|||
vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017) |
|||
|
|||
time: Tue Mar 4 14:31:48 2025 |
|||
timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel |
|||
elapsed time: 0.022707 seconds (0d 0h 0m 0s) |
|||
|
@ -0,0 +1,6 @@ |
|||
Montpellier Institute of Virology, France |
|||
Ontario Public Health Laboratory, Canada |
|||
University of Texas Biosafety Laboratory, USA |
|||
Korea National Institute of Infectious Diseases (KCDC) |
|||
Israel Institute of Life Sciences |
|||
Biosafety Laboratory, University of Basel, Switzerland |
@ -0,0 +1,138 @@ |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
<groupId>com.example</groupId> |
|||
<artifactId>es-crawler</artifactId> |
|||
<version>1.0-SNAPSHOT</version> |
|||
|
|||
<properties> |
|||
<maven.compiler.source>8</maven.compiler.source> |
|||
<maven.compiler.target>8</maven.compiler.target> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<!-- Elasticsearch High Level REST Client --> |
|||
<dependency> |
|||
<groupId>org.elasticsearch.client</groupId> |
|||
<artifactId>elasticsearch-rest-high-level-client</artifactId> |
|||
<version>7.17.0</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>co.elastic.clients</groupId> |
|||
<artifactId>elasticsearch-java</artifactId> |
|||
<version>7.17.15</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.core</groupId> |
|||
<artifactId>jackson-databind</artifactId> |
|||
<version>2.15.0</version> |
|||
</dependency> |
|||
|
|||
<!-- Jsoup HTML parser --> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.17.2</version> |
|||
</dependency> |
|||
|
|||
<!-- OkHttp --> |
|||
<dependency> |
|||
<groupId>com.squareup.okhttp3</groupId> |
|||
<artifactId>okhttp</artifactId> |
|||
<version>4.9.3</version> |
|||
</dependency> |
|||
|
|||
<!-- Logging --> |
|||
<dependency> |
|||
<groupId>org.slf4j</groupId> |
|||
<artifactId>slf4j-api</artifactId> |
|||
<version>1.7.36</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>ch.qos.logback</groupId> |
|||
<artifactId>logback-classic</artifactId> |
|||
<version>1.2.11</version> |
|||
</dependency> |
|||
|
|||
<!-- Kafka 客户端 --> |
|||
<dependency> |
|||
<groupId>org.apache.kafka</groupId> |
|||
<artifactId>kafka-clients</artifactId> |
|||
<version>3.9.0</version> |
|||
</dependency> |
|||
|
|||
<!-- Selenium Java --> |
|||
<dependency> |
|||
<groupId>org.seleniumhq.selenium</groupId> |
|||
<artifactId>selenium-java</artifactId> |
|||
<version>4.10.0</version> |
|||
</dependency> |
|||
|
|||
<!-- WebDriver Manager --> |
|||
<dependency> |
|||
<groupId>io.github.bonigarcia</groupId> |
|||
<artifactId>webdrivermanager</artifactId> |
|||
<version>5.6.2</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>org.json</groupId> |
|||
<artifactId>json</artifactId> |
|||
<version>20230227</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>com.google.code.gson</groupId> |
|||
<artifactId>gson</artifactId> |
|||
<version>2.10.1</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>net.sourceforge.htmlunit</groupId> |
|||
<artifactId>htmlunit</artifactId> |
|||
<version>2.61.0</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<!-- 编译插件,保持 Java 8 配置 --> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.8.1</version> |
|||
<configuration> |
|||
<source>8</source> |
|||
<target>8</target> |
|||
</configuration> |
|||
</plugin> |
|||
<!-- Assembly 插件,打包包含依赖的可执行 JAR --> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-assembly-plugin</artifactId> |
|||
<version>3.3.0</version> |
|||
<configuration> |
|||
<archive> |
|||
<manifest> |
|||
<mainClass>com.example.projTopic</mainClass> <!-- 替换为你的主类全路径 --> |
|||
</manifest> |
|||
</archive> |
|||
<descriptorRefs> |
|||
<descriptorRef>jar-with-dependencies</descriptorRef> |
|||
</descriptorRefs> |
|||
</configuration> |
|||
<executions> |
|||
<execution> |
|||
<id>make-assembly</id> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>single</goal> |
|||
</goals> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
@ -0,0 +1,281 @@ |
|||
|
|||
https://www.zyctd.com/zixun/201/1055143.html |
|||
https://www.zyctd.com/zixun/201/861786.html |
|||
https://www.zyctd.com/zixun/201/1053482.html |
|||
https://www.zyctd.com/zixun/201/269419.html |
|||
https://www.zyctd.com/zixun/201/1053149.html |
|||
https://www.zyctd.com/zixun/201/1023926.html |
|||
https://www.zyctd.com/zixun/201/435325.html |
|||
https://www.zyctd.com/zixun/201/1050302.html |
|||
https://www.zyctd.com/zixun/201/880441.html |
|||
https://www.zyctd.com/zixun/201/1019635.html |
|||
https://www.zyctd.com/zixun/201/970572.html |
|||
https://www.zyctd.com/zixun/201/912277.html |
|||
https://www.zyctd.com/zixun/201/372444.html |
|||
https://www.zyctd.com/zixun/201/1073629.html |
|||
https://www.zyctd.com/zixun/201/1069386.html |
|||
https://www.zyctd.com/zixun/201/730410.html |
|||
https://www.zyctd.com/zixun/201/953220.html |
|||
https://www.zyctd.com/zixun/201/1074339.html |
|||
https://www.zyctd.com/zixun/201/1072317.html |
|||
https://www.zyctd.com/zixun/201/294794.html |
|||
https://www.zyctd.com/zixun/201/267592.html |
|||
https://www.zyctd.com/zixun/201/979665.html |
|||
https://www.zyctd.com/zixun/201/869885.html |
|||
https://www.zyctd.com/zixun/201/1054064.html |
|||
https://www.zyctd.com/zixun/201/1049331.html |
|||
https://www.zyctd.com/zixun/201/442647.html |
|||
https://www.zyctd.com/zixun/201/285992.html |
|||
https://www.zyctd.com/zixun/201/1037972.html |
|||
https://www.zyctd.com/zixun/201/799801.html |
|||
https://www.zyctd.com/zixun/201/916078.html |
|||
https://www.zyctd.com/zixun/201/456647.html |
|||
https://www.zyctd.com/zixun/201/812121.html |
|||
https://www.zyctd.com/zixun/201/1042740.html |
|||
https://www.zyctd.com/zixun/201/1042708.html |
|||
https://www.zyctd.com/zixun/201/840450.html |
|||
https://www.zyctd.com/zixun/201/320749.html |
|||
https://www.zyctd.com/zixun/201/496106.html |
|||
https://www.zyctd.com/zixun/201/850201.html |
|||
https://www.zyctd.com/zixun/201/277145.html |
|||
https://www.zyctd.com/zixun/201/299091.html |
|||
https://www.zyctd.com/zixun/201/266080.html |
|||
https://www.zyctd.com/zixun/201/1051925.html |
|||
https://www.zyctd.com/zixun/201/898081.html |
|||
https://www.zyctd.com/zixun/201/873280.html |
|||
https://www.zyctd.com/zixun/201/703880.html |
|||
https://www.zyctd.com/zixun/201/873126.html |
|||
https://www.zyctd.com/zixun/201/887931.html |
|||
https://www.zyctd.com/zixun/201/432742.html |
|||
https://www.zyctd.com/zixun/201/1040431.html |
|||
https://www.zyctd.com/zixun/201/1040223.html |
|||
https://www.zyctd.com/zixun/201/858118.html |
|||
https://www.zyctd.com/zixun/201/971286.html |
|||
https://www.zyctd.com/zixun/201/458488.html |
|||
https://www.zyctd.com/zixun/201/1079381.html |
|||
https://www.zyctd.com/zixun/201/263578.html |
|||
https://www.zyctd.com/zixun/201/553513.html |
|||
https://www.zyctd.com/zixun/201/286229.html |
|||
https://www.zyctd.com/zixun/201/285365.html |
|||
https://www.zyctd.com/zixun/201/352921.html |
|||
https://www.zyctd.com/zixun/201/503267.html |
|||
https://www.zyctd.com/zixun/201/391337.html |
|||
https://www.zyctd.com/zixun/201/813052.html |
|||
https://www.zyctd.com/zixun/201/1053556.html |
|||
https://www.zyctd.com/zixun/201/1041197.html |
|||
https://www.zyctd.com/zixun/201/287420.html |
|||
https://www.zyctd.com/zixun/201/291563.html |
|||
https://www.zyctd.com/zixun/201/948250.html |
|||
https://www.zyctd.com/zixun/201/289034.html |
|||
https://www.zyctd.com/zixun/201/795965.html |
|||
https://www.zyctd.com/zixun/201/292962.html |
|||
https://www.zyctd.com/zixun/201/975850.html |
|||
https://www.zyctd.com/zixun/201/275335.html |
|||
https://www.zyctd.com/zixun/201/1031992.html |
|||
https://www.zyctd.com/zixun/201/1033886.html |
|||
https://www.zyctd.com/zixun/201/999510.html |
|||
https://www.zyctd.com/zixun/201/270144.html |
|||
https://www.zyctd.com/zixun/201/1055519.html |
|||
https://www.zyctd.com/zixun/201/272205.html |
|||
https://www.zyctd.com/zixun/201/526059.html |
|||
https://www.zyctd.com/zixun/201/456640.html |
|||
https://www.zyctd.com/zixun/201/267952.html |
|||
https://www.zyctd.com/zixun/201/803469.html |
|||
https://www.zyctd.com/zixun/201/270763.html |
|||
https://www.zyctd.com/zixun/201/1072987.html |
|||
https://www.zyctd.com/zixun/201/265176.html |
|||
https://www.zyctd.com/zixun/201/1022141.html |
|||
https://www.zyctd.com/zixun/201/290173.html |
|||
https://www.zyctd.com/zixun/201/269175.html |
|||
https://www.zyctd.com/zixun/201/744991.html |
|||
https://www.zyctd.com/zixun/201/1019131.html |
|||
https://www.zyctd.com/zixun/201/717054.html |
|||
https://www.zyctd.com/zixun/201/517358.html |
|||
https://www.zyctd.com/zixun/201/1058505.html |
|||
https://www.zyctd.com/zixun/201/905515.html |
|||
https://www.zyctd.com/zixun/201/287395.html |
|||
https://www.zyctd.com/zixun/201/934873.html |
|||
https://www.zyctd.com/zixun/201/1051317.html |
|||
https://www.zyctd.com/zixun/201/926018.html |
|||
https://www.zyctd.com/zixun/201/334511.html |
|||
https://www.zyctd.com/zixun/201/845896.html |
|||
https://www.zyctd.com/zixun/201/587785.html |
|||
https://www.zyctd.com/zixun/201/288376.html |
|||
https://www.zyctd.com/zixun/201/851405.html |
|||
https://www.zyctd.com/zixun/201/941404.html |
|||
https://www.zyctd.com/zixun/201/881855.html |
|||
https://www.zyctd.com/zixun/201/602632.html |
|||
https://www.zyctd.com/zixun/201/293601.html |
|||
https://www.zyctd.com/zixun/201/541809.html |
|||
https://www.zyctd.com/zixun/201/335120.html |
|||
https://www.zyctd.com/zixun/201/1031137.html |
|||
https://www.zyctd.com/zixun/201/960101.html |
|||
https://www.zyctd.com/zixun/201/1077142.html |
|||
https://www.zyctd.com/zixun/201/1063222.html |
|||
https://www.zyctd.com/zixun/201/681466.html |
|||
https://www.zyctd.com/zixun/201/1031130.html |
|||
https://www.zyctd.com/zixun/201/1073734.html |
|||
https://www.zyctd.com/zixun/201/1062186.html |
|||
https://www.zyctd.com/zixun/201/1046628.html |
|||
https://www.zyctd.com/zixun/201/358892.html |
|||
https://www.zyctd.com/zixun/201/285361.html |
|||
https://www.zyctd.com/zixun/201/1059889.html |
|||
https://www.zyctd.com/zixun/201/297824.html |
|||
https://www.zyctd.com/zixun/201/844307.html |
|||
https://www.zyctd.com/zixun/201/900524.html |
|||
https://www.zyctd.com/zixun/201/1057636.html |
|||
https://www.zyctd.com/zixun/201/1010080.html |
|||
https://www.zyctd.com/zixun/201/409152.html |
|||
https://www.zyctd.com/zixun/201/402782.html |
|||
https://www.zyctd.com/zixun/201/770296.html |
|||
https://www.zyctd.com/zixun/201/1040602.html |
|||
https://www.zyctd.com/zixun/201/606503.html |
|||
https://www.zyctd.com/zixun/201/784471.html |
|||
https://www.zyctd.com/zixun/201/466097.html |
|||
https://www.zyctd.com/zixun/201/1071160.html |
|||
https://www.zyctd.com/zixun/201/623226.html |
|||
https://www.zyctd.com/zixun/201/948264.html |
|||
https://www.zyctd.com/zixun/201/293462.html |
|||
https://www.zyctd.com/zixun/201/829348.html |
|||
https://www.zyctd.com/zixun/201/332369.html |
|||
https://www.zyctd.com/zixun/201/907461.html |
|||
https://www.zyctd.com/zixun/201/756555.html |
|||
https://www.zyctd.com/zixun/201/717915.html |
|||
https://www.zyctd.com/zixun/201/262203.html |
|||
https://www.zyctd.com/zixun/201/1055787.html |
|||
https://www.zyctd.com/zixun/201/432336.html |
|||
https://www.zyctd.com/zixun/201/907489.html |
|||
https://www.zyctd.com/zixun/201/1014686.html |
|||
https://www.zyctd.com/zixun/201/1053320.html |
|||
https://www.zyctd.com/zixun/201/480020.html |
|||
https://www.zyctd.com/zixun/201/287423.html |
|||
https://www.zyctd.com/zixun/201/385289.html |
|||
https://www.zyctd.com/zixun/201/1030421.html |
|||
https://www.zyctd.com/zixun/201/527648.html |
|||
https://www.zyctd.com/zixun/201/972959.html |
|||
https://www.zyctd.com/zixun/201/408767.html |
|||
https://www.zyctd.com/zixun/201/724887.html |
|||
https://www.zyctd.com/zixun/201/291480.html |
|||
https://www.zyctd.com/zixun/201/472544.html |
|||
https://www.zyctd.com/zixun/201/724873.html |
|||
https://www.zyctd.com/zixun/201/281751.html |
|||
https://www.zyctd.com/zixun/201/1049693.html |
|||
https://www.zyctd.com/zixun/201/869619.html |
|||
https://www.zyctd.com/zixun/201/355497.html |
|||
https://www.zyctd.com/zixun/201/341623.html |
|||
https://www.zyctd.com/zixun/201/450753.html |
|||
https://www.zyctd.com/zixun/201/1065837.html |
|||
https://www.zyctd.com/zixun/201/1031331.html |
|||
https://www.zyctd.com/zixun/201/669727.html |
|||
https://www.zyctd.com/zixun/201/1034010.html |
|||
https://www.zyctd.com/zixun/201/1054058.html |
|||
https://www.zyctd.com/zixun/201/954613.html |
|||
https://www.zyctd.com/zixun/201/715584.html |
|||
https://www.zyctd.com/zixun/201/1051110.html |
|||
https://www.zyctd.com/zixun/201/269963.html |
|||
https://www.zyctd.com/zixun/201/1048128.html |
|||
https://www.zyctd.com/zixun/201/793207.html |
|||
https://www.zyctd.com/zixun/201/284310.html |
|||
https://www.zyctd.com/zixun/201/282639.html |
|||
https://www.zyctd.com/zixun/201/1068138.html |
|||
https://www.zyctd.com/zixun/201/340678.html |
|||
https://www.zyctd.com/zixun/201/294371.html |
|||
https://www.zyctd.com/zixun/201/324277.html |
|||
https://www.zyctd.com/zixun/201/1048931.html |
|||
https://www.zyctd.com/zixun/201/851398.html |
|||
https://www.zyctd.com/zixun/201/263527.html |
|||
https://www.zyctd.com/zixun/201/919480.html |
|||
https://www.zyctd.com/zixun/201/685442.html |
|||
https://www.zyctd.com/zixun/201/428325.html |
|||
https://www.zyctd.com/zixun/201/1032698.html |
|||
https://www.zyctd.com/zixun/201/1003367.html |
|||
https://www.zyctd.com/zixun/201/852315.html |
|||
https://www.zyctd.com/zixun/201/283156.html |
|||
https://www.zyctd.com/zixun/201/262484.html |
|||
https://www.zyctd.com/zixun/201/1065225.html |
|||
https://www.zyctd.com/zixun/201/763331.html |
|||
https://www.zyctd.com/zixun/201/1066158.html |
|||
https://www.zyctd.com/zixun/201/1047744.html |
|||
https://www.zyctd.com/zixun/201/842795.html |
|||
https://www.zyctd.com/zixun/201/975374.html |
|||
https://www.zyctd.com/zixun/201/1055865.html |
|||
https://www.zyctd.com/zixun/201/1017367.html |
|||
https://www.zyctd.com/zixun/201/1057711.html |
|||
https://www.zyctd.com/zixun/201/1074295.html |
|||
https://www.zyctd.com/zixun/201/283647.html |
|||
https://www.zyctd.com/zixun/201/286896.html |
|||
https://www.zyctd.com/zixun/201/1043393.html |
|||
https://www.zyctd.com/zixun/201/305888.html |
|||
https://www.zyctd.com/zixun/201/487258.html |
|||
https://www.zyctd.com/zixun/201/1045652.html |
|||
https://www.zyctd.com/zixun/201/1064905.html |
|||
https://www.zyctd.com/zixun/201/515636.html |
|||
https://www.zyctd.com/zixun/201/1038609.html |
|||
https://www.zyctd.com/zixun/201/438083.html |
|||
https://www.zyctd.com/zixun/201/297327.html |
|||
https://www.zyctd.com/zixun/201/773537.html |
|||
https://www.zyctd.com/zixun/201/1043589.html |
|||
https://www.zyctd.com/zixun/201/815712.html |
|||
https://www.zyctd.com/zixun/201/698595.html |
|||
https://www.zyctd.com/zixun/201/269800.html |
|||
https://www.zyctd.com/zixun/201/1030332.html |
|||
https://www.zyctd.com/zixun/201/422676.html |
|||
https://www.zyctd.com/zixun/201/290130.html |
|||
https://www.zyctd.com/zixun/201/270359.html |
|||
https://www.zyctd.com/zixun/201/995604.html |
|||
https://www.zyctd.com/zixun/201/1074993.html |
|||
https://www.zyctd.com/zixun/201/1054825.html |
|||
https://www.zyctd.com/zixun/201/918577.html |
|||
https://www.zyctd.com/zixun/201/686527.html |
|||
https://www.zyctd.com/zixun/201/297509.html |
|||
https://www.zyctd.com/zixun/201/622708.html |
|||
https://www.zyctd.com/zixun/201/469870.html |
|||
https://www.zyctd.com/zixun/201/844328.html |
|||
https://www.zyctd.com/zixun/201/394508.html |
|||
https://www.zyctd.com/zixun/201/271744.html |
|||
https://www.zyctd.com/zixun/201/1054940.html |
|||
https://www.zyctd.com/zixun/201/732818.html |
|||
https://www.zyctd.com/zixun/201/1049547.html |
|||
https://www.zyctd.com/zixun/201/1059684.html |
|||
https://www.zyctd.com/zixun/201/1055301.html |
|||
https://www.zyctd.com/zixun/201/962068.html |
|||
https://www.zyctd.com/zixun/201/451355.html |
|||
https://www.zyctd.com/zixun/201/1056174.html |
|||
https://www.zyctd.com/zixun/201/930540.html |
|||
https://www.zyctd.com/zixun/201/871656.html |
|||
https://www.zyctd.com/zixun/201/363246.html |
|||
https://www.zyctd.com/zixun/201/845672.html |
|||
https://www.zyctd.com/zixun/201/452965.html |
|||
https://www.zyctd.com/zixun/201/1065920.html |
|||
https://www.zyctd.com/zixun/201/1058808.html |
|||
https://www.zyctd.com/zixun/201/986868.html |
|||
https://www.zyctd.com/zixun/201/489785.html |
|||
https://www.zyctd.com/zixun/201/307946.html |
|||
https://www.zyctd.com/zixun/201/833359.html |
|||
https://www.zyctd.com/zixun/201/806969.html |
|||
https://www.zyctd.com/zixun/201/1050812.html |
|||
https://www.zyctd.com/zixun/201/1033696.html |
|||
https://www.zyctd.com/zixun/201/501167.html |
|||
https://www.zyctd.com/zixun/201/1078919.html |
|||
https://www.zyctd.com/zixun/201/1036495.html |
|||
https://www.zyctd.com/zixun/201/1008736.html |
|||
https://www.zyctd.com/zixun/201/1054264.html |
|||
https://www.zyctd.com/zixun/201/493152.html |
|||
https://www.zyctd.com/zixun/201/685456.html |
|||
https://www.zyctd.com/zixun/201/995597.html |
|||
https://www.zyctd.com/zixun/201/905501.html |
|||
https://www.zyctd.com/zixun/201/347573.html |
|||
https://www.zyctd.com/zixun/201/1045494.html |
|||
https://www.zyctd.com/zixun/201/549775.html |
|||
https://www.zyctd.com/zixun/201/1037336.html |
|||
https://www.zyctd.com/zixun/201/1034972.html |
|||
https://www.zyctd.com/zixun/201/653046.html |
|||
https://www.zyctd.com/zixun/201/316612.html |
|||
https://www.zyctd.com/zixun/201/447064.html |
|||
https://www.zyctd.com/zixun/201/307603.html |
|||
https://www.zyctd.com/zixun/201/263437.html |
|||
https://www.zyctd.com/zixun/201/894490.html |
|||
https://www.zyctd.com/zixun/201/368629.html |
|||
https://www.zyctd.com/zixun/201/273285.html |
|||
https://www.zyctd.com/zixun/201/1059618.html |
|||
https://www.zyctd.com/zixun/201/459237.html |
@ -0,0 +1 @@ |
|||
127.0.0.1:7897 |
@ -0,0 +1,4 @@ |
|||
Manifest-Version: 1.0 |
|||
Build-Jdk-Spec: 22 |
|||
Created-By: Maven Integration for Eclipse |
|||
|
@ -0,0 +1,5 @@ |
|||
#Generated by Maven |
|||
#Fri Apr 18 18:29:46 CST 2025 |
|||
version=1.0-SNAPSHOT |
|||
groupId=com.example |
|||
artifactId=es-crawler |
@ -0,0 +1 @@ |
|||
com\example\projTopic.class |
@ -0,0 +1 @@ |
|||
F:\workTest\DaKaES\src\main\java\com\example\projTopic.java |
@ -0,0 +1,167 @@ |
|||
# |
|||
# There is insufficient memory for the Java Runtime Environment to continue. |
|||
# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap |
|||
# Possible reasons: |
|||
# The system is out of physical RAM or swap space |
|||
# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap |
|||
# Possible solutions: |
|||
# Reduce memory load on the system |
|||
# Increase physical memory or swap space |
|||
# Check if swap backing store is full |
|||
# Decrease Java heap size (-Xmx/-Xms) |
|||
# Decrease number of Java threads |
|||
# Decrease Java thread stack sizes (-Xss) |
|||
# Set larger code cache with -XX:ReservedCodeCacheSize= |
|||
# JVM is running with Zero Based Compressed Oops mode in which the Java heap is |
|||
# placed in the first 32GB address space. The Java Heap base address is the |
|||
# maximum limit for the native heap growth. Please use -XX:HeapBaseMinAddress |
|||
# to set the Java Heap base and to place the Java Heap above 32GB virtual address. |
|||
# This output file may be truncated or incomplete. |
|||
# |
|||
# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334 |
|||
# |
|||
# JRE version: (8.0_422-b05) (build ) |
|||
# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops) |
|||
# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows |
|||
# |
|||
|
|||
--------------- T H R E A D --------------- |
|||
|
|||
Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] |
|||
|
|||
Stack: [0x00000082a1500000,0x00000082a1600000] |
|||
[error occurred during error reporting (printing stack bounds), id 0xc0000005] |
|||
|
|||
Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code) |
|||
|
|||
|
|||
--------------- P R O C E S S --------------- |
|||
|
|||
Java Threads: ( => current thread ) |
|||
|
|||
Other Threads: |
|||
|
|||
=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] |
|||
|
|||
VM state:not at safepoint (normal execution) |
|||
|
|||
VM Mutex/Monitor currently owned by a thread: None |
|||
|
|||
heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3 |
|||
Narrow klass base: 0x0000000000000000, Narrow klass shift: 3 |
|||
Compressed class space size: 1073741824 Address: 0x00000007c0000000 |
|||
|
|||
Heap: |
|||
PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000) |
|||
eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000) |
|||
from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000) |
|||
to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000) |
|||
ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000) |
|||
object space 172032K, 0% used [0x00000006c4000000,0x00000006c4000000,0x00000006ce800000) |
|||
Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K |
|||
class space used 76K, capacity 384K, committed 384K, reserved 1048576K |
|||
|
|||
Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000 |
|||
|
|||
Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0 |
|||
Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000) |
|||
End Bits: [0x00000271cd7a0000, 0x00000271d16a0000) |
|||
|
|||
Polling page: 0x00000271b7eb0000 |
|||
|
|||
CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb |
|||
bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000] |
|||
total_blobs=57 nmethods=0 adapters=38 |
|||
compilation: enabled |
|||
|
|||
Compilation events (0 events): |
|||
No events |
|||
|
|||
GC Heap History (0 events): |
|||
No events |
|||
|
|||
Deoptimization events (0 events): |
|||
No events |
|||
|
|||
Classes redefined (0 events): |
|||
No events |
|||
|
|||
Internal exceptions (0 events): |
|||
No events |
|||
|
|||
Events (10 events): |
|||
Event: 0.012 loading class java/lang/Short |
|||
Event: 0.013 loading class java/lang/Short done |
|||
Event: 0.013 loading class java/lang/Integer |
|||
Event: 0.013 loading class java/lang/Integer done |
|||
Event: 0.013 loading class java/lang/Long |
|||
Event: 0.013 loading class java/lang/Long done |
|||
Event: 0.013 loading class java/lang/NullPointerException |
|||
Event: 0.013 loading class java/lang/NullPointerException done |
|||
Event: 0.013 loading class java/lang/ArithmeticException |
|||
Event: 0.013 loading class java/lang/ArithmeticException done |
|||
|
|||
|
|||
Dynamic libraries: |
|||
0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe |
|||
0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll |
|||
0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL |
|||
0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll |
|||
0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll |
|||
0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll |
|||
0x00007ffa1cdf0000 - 0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll |
|||
0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll |
|||
0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll |
|||
0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll |
|||
0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll |
|||
0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll |
|||
0x00007ffa1ac80000 - 0x00007ffa1ad9a000 C:\Windows\System32\gdi32full.dll |
|||
0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll |
|||
0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll |
|||
0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll |
|||
0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL |
|||
0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll |
|||
0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll |
|||
0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll |
|||
0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL |
|||
0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll |
|||
0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll |
|||
0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll |
|||
0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll |
|||
0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll |
|||
0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll |
|||
0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll |
|||
0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll |
|||
0x00007ffa0af80000 - 0x00007ffa0af89000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll |
|||
0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll |
|||
0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll |
|||
|
|||
VM Arguments: |
|||
jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8 |
|||
java_command: com.example.saveInES |
|||
java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.jar;C:\Users\18264\.m2\repository\c |
|||
Launcher Type: SUN_STANDARD |
|||
|
|||
Environment Variables: |
|||
JAVA_HOME=E:\java |
|||
PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node |
|||
USERNAME=18264 |
|||
OS=Windows_NT |
|||
PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel |
|||
|
|||
|
|||
|
|||
--------------- S Y S T E M --------------- |
|||
|
|||
OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438) |
|||
|
|||
CPU:total 16 (initial active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx |
|||
|
|||
Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free) |
|||
|
|||
vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017) |
|||
|
|||
time: Tue Mar 4 14:31:48 2025 |
|||
timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel |
|||
elapsed time: 0.022707 seconds (0d 0h 0m 0s) |
|||
|
1045
keywords.txt
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
After Width: 80 | Height: 30 | Size: 5.7 KiB |
@ -0,0 +1,150 @@ |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" |
|||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
<groupId>com.example</groupId> |
|||
<artifactId>es-crawler</artifactId> |
|||
<version>1.0-SNAPSHOT</version> |
|||
|
|||
<properties> |
|||
<maven.compiler.source>8</maven.compiler.source> |
|||
<maven.compiler.target>8</maven.compiler.target> |
|||
</properties> |
|||
|
|||
<dependencies> |
|||
<!-- Elasticsearch High Level REST Client --> |
|||
<dependency> |
|||
<groupId>org.elasticsearch.client</groupId> |
|||
<artifactId>elasticsearch-rest-high-level-client</artifactId> |
|||
<version>7.17.0</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>co.elastic.clients</groupId> |
|||
<artifactId>elasticsearch-java</artifactId> |
|||
<version>7.17.15</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.fasterxml.jackson.core</groupId> |
|||
<artifactId>jackson-databind</artifactId> |
|||
<version>2.15.0</version> |
|||
</dependency> |
|||
|
|||
<!-- Jsoup HTML parser --> |
|||
<dependency> |
|||
<groupId>org.jsoup</groupId> |
|||
<artifactId>jsoup</artifactId> |
|||
<version>1.17.2</version> |
|||
</dependency> |
|||
|
|||
<!-- OkHttp --> |
|||
<dependency> |
|||
<groupId>com.squareup.okhttp3</groupId> |
|||
<artifactId>okhttp</artifactId> |
|||
<version>4.9.3</version> |
|||
</dependency> |
|||
|
|||
<!-- Logging --> |
|||
<dependency> |
|||
<groupId>org.slf4j</groupId> |
|||
<artifactId>slf4j-api</artifactId> |
|||
<version>1.7.36</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>ch.qos.logback</groupId> |
|||
<artifactId>logback-classic</artifactId> |
|||
<version>1.2.11</version> |
|||
</dependency> |
|||
|
|||
<!-- Kafka client --> |
|||
<dependency> |
|||
<groupId>org.apache.kafka</groupId> |
|||
<artifactId>kafka-clients</artifactId> |
|||
<version>3.9.0</version> |
|||
</dependency> |
|||
|
|||
<!-- Selenium Java --> |
|||
<dependency> |
|||
<groupId>org.seleniumhq.selenium</groupId> |
|||
<artifactId>selenium-java</artifactId> |
|||
<version>4.10.0</version> |
|||
</dependency> |
|||
|
|||
<!-- WebDriver Manager --> |
|||
<dependency> |
|||
<groupId>io.github.bonigarcia</groupId> |
|||
<artifactId>webdrivermanager</artifactId> |
|||
<version>5.6.2</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>org.json</groupId> |
|||
<artifactId>json</artifactId> |
|||
<version>20230227</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>com.google.code.gson</groupId> |
|||
<artifactId>gson</artifactId> |
|||
<version>2.10.1</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>net.sourceforge.htmlunit</groupId> |
|||
<artifactId>htmlunit</artifactId> |
|||
<version>2.61.0</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>net.sourceforge.tess4j</groupId> |
|||
<artifactId>tess4j</artifactId> |
|||
<version>4.5.4</version> |
|||
</dependency> |
|||
|
|||
<dependency> |
|||
<groupId>org.apache.httpcomponents.client5</groupId> |
|||
<artifactId>httpclient5</artifactId> |
|||
<version>5.3.1</version> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<!-- Compiler plugin; keeps the Java 8 configuration --> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-compiler-plugin</artifactId> |
|||
<version>3.8.1</version> |
|||
<configuration> |
|||
<source>8</source> |
|||
<target>8</target> |
|||
</configuration> |
|||
</plugin> |
|||
<!-- Assembly plugin; packages an executable JAR that bundles all dependencies --> |
|||
<plugin> |
|||
<groupId>org.apache.maven.plugins</groupId> |
|||
<artifactId>maven-assembly-plugin</artifactId> |
|||
<version>3.3.0</version> |
|||
<configuration> |
|||
<archive> |
|||
<manifest> |
|||
<mainClass>com.example.CtriScraper</mainClass> <!-- Replace with the fully qualified name of your main class --> |
|||
</manifest> |
|||
</archive> |
|||
<descriptorRefs> |
|||
<descriptorRef>jar-with-dependencies</descriptorRef> |
|||
</descriptorRefs> |
|||
</configuration> |
|||
<executions> |
|||
<execution> |
|||
<id>make-assembly</id> |
|||
<phase>package</phase> |
|||
<goals> |
|||
<goal>single</goal> |
|||
</goals> |
|||
</execution> |
|||
</executions> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
</project> |
After Width: 80 | Height: 30 | Size: 398 B |
@ -0,0 +1,281 @@ |
|||
|
|||
https://www.zyctd.com/zixun/201/1055143.html |
|||
https://www.zyctd.com/zixun/201/861786.html |
|||
https://www.zyctd.com/zixun/201/1053482.html |
|||
https://www.zyctd.com/zixun/201/269419.html |
|||
https://www.zyctd.com/zixun/201/1053149.html |
|||
https://www.zyctd.com/zixun/201/1023926.html |
|||
https://www.zyctd.com/zixun/201/435325.html |
|||
https://www.zyctd.com/zixun/201/1050302.html |
|||
https://www.zyctd.com/zixun/201/880441.html |
|||
https://www.zyctd.com/zixun/201/1019635.html |
|||
https://www.zyctd.com/zixun/201/970572.html |
|||
https://www.zyctd.com/zixun/201/912277.html |
|||
https://www.zyctd.com/zixun/201/372444.html |
|||
https://www.zyctd.com/zixun/201/1073629.html |
|||
https://www.zyctd.com/zixun/201/1069386.html |
|||
https://www.zyctd.com/zixun/201/730410.html |
|||
https://www.zyctd.com/zixun/201/953220.html |
|||
https://www.zyctd.com/zixun/201/1074339.html |
|||
https://www.zyctd.com/zixun/201/1072317.html |
|||
https://www.zyctd.com/zixun/201/294794.html |
|||
https://www.zyctd.com/zixun/201/267592.html |
|||
https://www.zyctd.com/zixun/201/979665.html |
|||
https://www.zyctd.com/zixun/201/869885.html |
|||
https://www.zyctd.com/zixun/201/1054064.html |
|||
https://www.zyctd.com/zixun/201/1049331.html |
|||
https://www.zyctd.com/zixun/201/442647.html |
|||
https://www.zyctd.com/zixun/201/285992.html |
|||
https://www.zyctd.com/zixun/201/1037972.html |
|||
https://www.zyctd.com/zixun/201/799801.html |
|||
https://www.zyctd.com/zixun/201/916078.html |
|||
https://www.zyctd.com/zixun/201/456647.html |
|||
https://www.zyctd.com/zixun/201/812121.html |
|||
https://www.zyctd.com/zixun/201/1042740.html |
|||
https://www.zyctd.com/zixun/201/1042708.html |
|||
https://www.zyctd.com/zixun/201/840450.html |
|||
https://www.zyctd.com/zixun/201/320749.html |
|||
https://www.zyctd.com/zixun/201/496106.html |
|||
https://www.zyctd.com/zixun/201/850201.html |
|||
https://www.zyctd.com/zixun/201/277145.html |
|||
https://www.zyctd.com/zixun/201/299091.html |
|||
https://www.zyctd.com/zixun/201/266080.html |
|||
https://www.zyctd.com/zixun/201/1051925.html |
|||
https://www.zyctd.com/zixun/201/898081.html |
|||
https://www.zyctd.com/zixun/201/873280.html |
|||
https://www.zyctd.com/zixun/201/703880.html |
|||
https://www.zyctd.com/zixun/201/873126.html |
|||
https://www.zyctd.com/zixun/201/887931.html |
|||
https://www.zyctd.com/zixun/201/432742.html |
|||
https://www.zyctd.com/zixun/201/1040431.html |
|||
https://www.zyctd.com/zixun/201/1040223.html |
|||
https://www.zyctd.com/zixun/201/858118.html |
|||
https://www.zyctd.com/zixun/201/971286.html |
|||
https://www.zyctd.com/zixun/201/458488.html |
|||
https://www.zyctd.com/zixun/201/1079381.html |
|||
https://www.zyctd.com/zixun/201/263578.html |
|||
https://www.zyctd.com/zixun/201/553513.html |
|||
https://www.zyctd.com/zixun/201/286229.html |
|||
https://www.zyctd.com/zixun/201/285365.html |
|||
https://www.zyctd.com/zixun/201/352921.html |
|||
https://www.zyctd.com/zixun/201/503267.html |
|||
https://www.zyctd.com/zixun/201/391337.html |
|||
https://www.zyctd.com/zixun/201/813052.html |
|||
https://www.zyctd.com/zixun/201/1053556.html |
|||
https://www.zyctd.com/zixun/201/1041197.html |
|||
https://www.zyctd.com/zixun/201/287420.html |
|||
https://www.zyctd.com/zixun/201/291563.html |
|||
https://www.zyctd.com/zixun/201/948250.html |
|||
https://www.zyctd.com/zixun/201/289034.html |
|||
https://www.zyctd.com/zixun/201/795965.html |
|||
https://www.zyctd.com/zixun/201/292962.html |
|||
https://www.zyctd.com/zixun/201/975850.html |
|||
https://www.zyctd.com/zixun/201/275335.html |
|||
https://www.zyctd.com/zixun/201/1031992.html |
|||
https://www.zyctd.com/zixun/201/1033886.html |
|||
https://www.zyctd.com/zixun/201/999510.html |
|||
https://www.zyctd.com/zixun/201/270144.html |
|||
https://www.zyctd.com/zixun/201/1055519.html |
|||
https://www.zyctd.com/zixun/201/272205.html |
|||
https://www.zyctd.com/zixun/201/526059.html |
|||
https://www.zyctd.com/zixun/201/456640.html |
|||
https://www.zyctd.com/zixun/201/267952.html |
|||
https://www.zyctd.com/zixun/201/803469.html |
|||
https://www.zyctd.com/zixun/201/270763.html |
|||
https://www.zyctd.com/zixun/201/1072987.html |
|||
https://www.zyctd.com/zixun/201/265176.html |
|||
https://www.zyctd.com/zixun/201/1022141.html |
|||
https://www.zyctd.com/zixun/201/290173.html |
|||
https://www.zyctd.com/zixun/201/269175.html |
|||
https://www.zyctd.com/zixun/201/744991.html |
|||
https://www.zyctd.com/zixun/201/1019131.html |
|||
https://www.zyctd.com/zixun/201/717054.html |
|||
https://www.zyctd.com/zixun/201/517358.html |
|||
https://www.zyctd.com/zixun/201/1058505.html |
|||
https://www.zyctd.com/zixun/201/905515.html |
|||
https://www.zyctd.com/zixun/201/287395.html |
|||
https://www.zyctd.com/zixun/201/934873.html |
|||
https://www.zyctd.com/zixun/201/1051317.html |
|||
https://www.zyctd.com/zixun/201/926018.html |
|||
https://www.zyctd.com/zixun/201/334511.html |
|||
https://www.zyctd.com/zixun/201/845896.html |
|||
https://www.zyctd.com/zixun/201/587785.html |
|||
https://www.zyctd.com/zixun/201/288376.html |
|||
https://www.zyctd.com/zixun/201/851405.html |
|||
https://www.zyctd.com/zixun/201/941404.html |
|||
https://www.zyctd.com/zixun/201/881855.html |
|||
https://www.zyctd.com/zixun/201/602632.html |
|||
https://www.zyctd.com/zixun/201/293601.html |
|||
https://www.zyctd.com/zixun/201/541809.html |
|||
https://www.zyctd.com/zixun/201/335120.html |
|||
https://www.zyctd.com/zixun/201/1031137.html |
|||
https://www.zyctd.com/zixun/201/960101.html |
|||
https://www.zyctd.com/zixun/201/1077142.html |
|||
https://www.zyctd.com/zixun/201/1063222.html |
|||
https://www.zyctd.com/zixun/201/681466.html |
|||
https://www.zyctd.com/zixun/201/1031130.html |
|||
https://www.zyctd.com/zixun/201/1073734.html |
|||
https://www.zyctd.com/zixun/201/1062186.html |
|||
https://www.zyctd.com/zixun/201/1046628.html |
|||
https://www.zyctd.com/zixun/201/358892.html |
|||
https://www.zyctd.com/zixun/201/285361.html |
|||
https://www.zyctd.com/zixun/201/1059889.html |
|||
https://www.zyctd.com/zixun/201/297824.html |
|||
https://www.zyctd.com/zixun/201/844307.html |
|||
https://www.zyctd.com/zixun/201/900524.html |
|||
https://www.zyctd.com/zixun/201/1057636.html |
|||
https://www.zyctd.com/zixun/201/1010080.html |
|||
https://www.zyctd.com/zixun/201/409152.html |
|||
https://www.zyctd.com/zixun/201/402782.html |
|||
https://www.zyctd.com/zixun/201/770296.html |
|||
https://www.zyctd.com/zixun/201/1040602.html |
|||
https://www.zyctd.com/zixun/201/606503.html |
|||
https://www.zyctd.com/zixun/201/784471.html |
|||
https://www.zyctd.com/zixun/201/466097.html |
|||
https://www.zyctd.com/zixun/201/1071160.html |
|||
https://www.zyctd.com/zixun/201/623226.html |
|||
https://www.zyctd.com/zixun/201/948264.html |
|||
https://www.zyctd.com/zixun/201/293462.html |
|||
https://www.zyctd.com/zixun/201/829348.html |
|||
https://www.zyctd.com/zixun/201/332369.html |
|||
https://www.zyctd.com/zixun/201/907461.html |
|||
https://www.zyctd.com/zixun/201/756555.html |
|||
https://www.zyctd.com/zixun/201/717915.html |
|||
https://www.zyctd.com/zixun/201/262203.html |
|||
https://www.zyctd.com/zixun/201/1055787.html |
|||
https://www.zyctd.com/zixun/201/432336.html |
|||
https://www.zyctd.com/zixun/201/907489.html |
|||
https://www.zyctd.com/zixun/201/1014686.html |
|||
https://www.zyctd.com/zixun/201/1053320.html |
|||
https://www.zyctd.com/zixun/201/480020.html |
|||
https://www.zyctd.com/zixun/201/287423.html |
|||
https://www.zyctd.com/zixun/201/385289.html |
|||
https://www.zyctd.com/zixun/201/1030421.html |
|||
https://www.zyctd.com/zixun/201/527648.html |
|||
https://www.zyctd.com/zixun/201/972959.html |
|||
https://www.zyctd.com/zixun/201/408767.html |
|||
https://www.zyctd.com/zixun/201/724887.html |
|||
https://www.zyctd.com/zixun/201/291480.html |
|||
https://www.zyctd.com/zixun/201/472544.html |
|||
https://www.zyctd.com/zixun/201/724873.html |
|||
https://www.zyctd.com/zixun/201/281751.html |
|||
https://www.zyctd.com/zixun/201/1049693.html |
|||
https://www.zyctd.com/zixun/201/869619.html |
|||
https://www.zyctd.com/zixun/201/355497.html |
|||
https://www.zyctd.com/zixun/201/341623.html |
|||
https://www.zyctd.com/zixun/201/450753.html |
|||
https://www.zyctd.com/zixun/201/1065837.html |
|||
https://www.zyctd.com/zixun/201/1031331.html |
|||
https://www.zyctd.com/zixun/201/669727.html |
|||
https://www.zyctd.com/zixun/201/1034010.html |
|||
https://www.zyctd.com/zixun/201/1054058.html |
|||
https://www.zyctd.com/zixun/201/954613.html |
|||
https://www.zyctd.com/zixun/201/715584.html |
|||
https://www.zyctd.com/zixun/201/1051110.html |
|||
https://www.zyctd.com/zixun/201/269963.html |
|||
https://www.zyctd.com/zixun/201/1048128.html |
|||
https://www.zyctd.com/zixun/201/793207.html |
|||
https://www.zyctd.com/zixun/201/284310.html |
|||
https://www.zyctd.com/zixun/201/282639.html |
|||
https://www.zyctd.com/zixun/201/1068138.html |
|||
https://www.zyctd.com/zixun/201/340678.html |
|||
https://www.zyctd.com/zixun/201/294371.html |
|||
https://www.zyctd.com/zixun/201/324277.html |
|||
https://www.zyctd.com/zixun/201/1048931.html |
|||
https://www.zyctd.com/zixun/201/851398.html |
|||
https://www.zyctd.com/zixun/201/263527.html |
|||
https://www.zyctd.com/zixun/201/919480.html |
|||
https://www.zyctd.com/zixun/201/685442.html |
|||
https://www.zyctd.com/zixun/201/428325.html |
|||
https://www.zyctd.com/zixun/201/1032698.html |
|||
https://www.zyctd.com/zixun/201/1003367.html |
|||
https://www.zyctd.com/zixun/201/852315.html |
|||
https://www.zyctd.com/zixun/201/283156.html |
|||
https://www.zyctd.com/zixun/201/262484.html |
|||
https://www.zyctd.com/zixun/201/1065225.html |
|||
https://www.zyctd.com/zixun/201/763331.html |
|||
https://www.zyctd.com/zixun/201/1066158.html |
|||
https://www.zyctd.com/zixun/201/1047744.html |
|||
https://www.zyctd.com/zixun/201/842795.html |
|||
https://www.zyctd.com/zixun/201/975374.html |
|||
https://www.zyctd.com/zixun/201/1055865.html |
|||
https://www.zyctd.com/zixun/201/1017367.html |
|||
https://www.zyctd.com/zixun/201/1057711.html |
|||
https://www.zyctd.com/zixun/201/1074295.html |
|||
https://www.zyctd.com/zixun/201/283647.html |
|||
https://www.zyctd.com/zixun/201/286896.html |
|||
https://www.zyctd.com/zixun/201/1043393.html |
|||
https://www.zyctd.com/zixun/201/305888.html |
|||
https://www.zyctd.com/zixun/201/487258.html |
|||
https://www.zyctd.com/zixun/201/1045652.html |
|||
https://www.zyctd.com/zixun/201/1064905.html |
|||
https://www.zyctd.com/zixun/201/515636.html |
|||
https://www.zyctd.com/zixun/201/1038609.html |
|||
https://www.zyctd.com/zixun/201/438083.html |
|||
https://www.zyctd.com/zixun/201/297327.html |
|||
https://www.zyctd.com/zixun/201/773537.html |
|||
https://www.zyctd.com/zixun/201/1043589.html |
|||
https://www.zyctd.com/zixun/201/815712.html |
|||
https://www.zyctd.com/zixun/201/698595.html |
|||
https://www.zyctd.com/zixun/201/269800.html |
|||
https://www.zyctd.com/zixun/201/1030332.html |
|||
https://www.zyctd.com/zixun/201/422676.html |
|||
https://www.zyctd.com/zixun/201/290130.html |
|||
https://www.zyctd.com/zixun/201/270359.html |
|||
https://www.zyctd.com/zixun/201/995604.html |
|||
https://www.zyctd.com/zixun/201/1074993.html |
|||
https://www.zyctd.com/zixun/201/1054825.html |
|||
https://www.zyctd.com/zixun/201/918577.html |
|||
https://www.zyctd.com/zixun/201/686527.html |
|||
https://www.zyctd.com/zixun/201/297509.html |
|||
https://www.zyctd.com/zixun/201/622708.html |
|||
https://www.zyctd.com/zixun/201/469870.html |
|||
https://www.zyctd.com/zixun/201/844328.html |
|||
https://www.zyctd.com/zixun/201/394508.html |
|||
https://www.zyctd.com/zixun/201/271744.html |
|||
https://www.zyctd.com/zixun/201/1054940.html |
|||
https://www.zyctd.com/zixun/201/732818.html |
|||
https://www.zyctd.com/zixun/201/1049547.html |
|||
https://www.zyctd.com/zixun/201/1059684.html |
|||
https://www.zyctd.com/zixun/201/1055301.html |
|||
https://www.zyctd.com/zixun/201/962068.html |
|||
https://www.zyctd.com/zixun/201/451355.html |
|||
https://www.zyctd.com/zixun/201/1056174.html |
|||
https://www.zyctd.com/zixun/201/930540.html |
|||
https://www.zyctd.com/zixun/201/871656.html |
|||
https://www.zyctd.com/zixun/201/363246.html |
|||
https://www.zyctd.com/zixun/201/845672.html |
|||
https://www.zyctd.com/zixun/201/452965.html |
|||
https://www.zyctd.com/zixun/201/1065920.html |
|||
https://www.zyctd.com/zixun/201/1058808.html |
|||
https://www.zyctd.com/zixun/201/986868.html |
|||
https://www.zyctd.com/zixun/201/489785.html |
|||
https://www.zyctd.com/zixun/201/307946.html |
|||
https://www.zyctd.com/zixun/201/833359.html |
|||
https://www.zyctd.com/zixun/201/806969.html |
|||
https://www.zyctd.com/zixun/201/1050812.html |
|||
https://www.zyctd.com/zixun/201/1033696.html |
|||
https://www.zyctd.com/zixun/201/501167.html |
|||
https://www.zyctd.com/zixun/201/1078919.html |
|||
https://www.zyctd.com/zixun/201/1036495.html |
|||
https://www.zyctd.com/zixun/201/1008736.html |
|||
https://www.zyctd.com/zixun/201/1054264.html |
|||
https://www.zyctd.com/zixun/201/493152.html |
|||
https://www.zyctd.com/zixun/201/685456.html |
|||
https://www.zyctd.com/zixun/201/995597.html |
|||
https://www.zyctd.com/zixun/201/905501.html |
|||
https://www.zyctd.com/zixun/201/347573.html |
|||
https://www.zyctd.com/zixun/201/1045494.html |
|||
https://www.zyctd.com/zixun/201/549775.html |
|||
https://www.zyctd.com/zixun/201/1037336.html |
|||
https://www.zyctd.com/zixun/201/1034972.html |
|||
https://www.zyctd.com/zixun/201/653046.html |
|||
https://www.zyctd.com/zixun/201/316612.html |
|||
https://www.zyctd.com/zixun/201/447064.html |
|||
https://www.zyctd.com/zixun/201/307603.html |
|||
https://www.zyctd.com/zixun/201/263437.html |
|||
https://www.zyctd.com/zixun/201/894490.html |
|||
https://www.zyctd.com/zixun/201/368629.html |
|||
https://www.zyctd.com/zixun/201/273285.html |
|||
https://www.zyctd.com/zixun/201/1059618.html |
|||
https://www.zyctd.com/zixun/201/459237.html |
@ -0,0 +1 @@ |
|||
127.0.0.1:7897 |
@ -0,0 +1,119 @@ |
|||
package com.example; |
|||
|
|||
import okhttp3.*; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.io.IOException; |
|||
import java.text.ParseException; |
|||
import java.text.SimpleDateFormat; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.Date; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class AusContent { |
|||
public static void main(String[] args) throws IOException { |
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.build(); |
|||
MediaType mediaType = MediaType.parse("text/plain"); |
|||
RequestBody body = RequestBody.create(mediaType, ""); |
|||
Request request = new Request.Builder() |
|||
.url("https://www.anzctr.org.auTrial/Registration/TrialReview.aspx?id=389345&isReview=true") |
|||
.get() |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Document parse = Jsoup.parse(html); |
|||
String title = parse.select("#ctl00_body_CXSTUDYTITLE").text(); |
|||
String registNum = parse.select("#ctl00_body_CXACTRNUMBER").text(); |
|||
String registTime = convertDate(parse.select("#ctl00_body_CXAPPROVALDATE").text()); |
|||
String sponsor = parse.select("#ctl00_body_repeater_TXFUNDINGSOURCE_ctl00_CXTYPE").text(); |
|||
String studyType = parse.select("#ctl00_body_CXSTUDYTYPE").text(); |
|||
String phase = parse.select("#ctl00_body_CXPHASE").text(); |
|||
String disease = parse.select("#ctl00_body_repeater_TXHEALTHCONDITION_ctl00_CXHEALTHCONDITION").text(); |
|||
String SD1 = parse.select("#ctl00_body_CXPURPOSE").text(); |
|||
String SD2 = parse.select("#ctl00_body_CXALLOCATION").text(); |
|||
String SD3 = parse.select("#ctl00_body_CXCONCEALMENT").text(); |
|||
String SD4 = parse.select("#ctl00_body_CXSEQUENCE").text(); |
|||
String SD5 = parse.select("#ctl00_body_CXMASKING").text(); |
|||
String SD6 = parse.select("#ctl00_body_maskingdiv > div > div.review-element-content").text(); |
|||
String SD7 = parse.select("#ctl00_body_CXASSIGNMENT").text(); |
|||
String SD8 = parse.select("#ctl00_body_CXPHASE").text(); |
|||
String SD9 = parse.select("#ctl00_body_CXENDPOINT").text(); |
|||
String SD10 = parse.select("#ctl00_body_CXSTATISTICALMETHODS").text(); |
|||
String SD11 = parse.select("#ctl00_body_interventional_div > div:nth-child(8) > div > div.review-element-content").text(); |
|||
String studyObjective = parse.select("#ctl00_body_CXPURPOSE").text(); |
|||
String inclusionCriteria = parse.select("#ctl00_body_CXINCLUSIVECRITERIA").text(); |
|||
String exclusionCriteria = parse.select("#ctl00_body_CXEXCLUSIVECRITERIA").text(); |
|||
String currentStatus = parse.select("#ctl00_body_CXRECRUITMENTSTATUS").text(); |
|||
String enrollment = parse.select("#ctl00_body_CXSAMPLESIZE").text(); |
|||
String country = parse.select("#ctl00_body_repeater_TXCOUNTRYOUTSIDEAUSTRALIA_ctl01_CXCOUNTRY").text(); |
|||
String intervention = parse.select("#ctl00_body_trialDiv > div:nth-child(30) > div > div.review-element-content").text(); |
|||
Map<String,Object> studyDesign = new HashMap<>(); |
|||
studyDesign.put("Purpose of the study",SD1); |
|||
studyDesign.put("Allocation to intervention",SD2); |
|||
studyDesign.put("Procedure for enrolling a subject and allocating the treatment (allocation concealment procedures)",SD3); |
|||
studyDesign.put("Methods used to generate the sequence in which subjects will be randomised (sequence generation)",SD4); |
|||
studyDesign.put("Masking / blinding",SD5); |
|||
studyDesign.put("Who is / are masked / blinded?",SD6); |
|||
studyDesign.put("Intervention assignment",SD7); |
|||
studyDesign.put("Other design features",SD11); |
|||
studyDesign.put("Phase",SD8); |
|||
studyDesign.put("Type of endpoint/s",SD9); |
|||
studyDesign.put("Statistical methods / analysis",SD10); |
|||
Map<String,Object> resultData = new HashMap<>(); |
|||
resultData.put("title",title); |
|||
resultData.put("registNum",registNum); |
|||
resultData.put("registTime",registTime); |
|||
resultData.put("registStatus",""); |
|||
resultData.put("registTitle",""); |
|||
resultData.put("fullTitle",""); |
|||
resultData.put("sponsor",sponsor); |
|||
resultData.put("sponsorPart",""); |
|||
resultData.put("studyType",studyType); |
|||
resultData.put("phase",phase); |
|||
resultData.put("disease",disease); |
|||
resultData.put("studyDesign",studyDesign); |
|||
resultData.put("studyObjective",studyObjective); |
|||
resultData.put("studyStartDate",""); |
|||
resultData.put("inclusionCriteria",inclusionCriteria); |
|||
resultData.put("exclusionCriteria",exclusionCriteria); |
|||
resultData.put("currentStatus",currentStatus); |
|||
resultData.put("enrollment",enrollment); |
|||
resultData.put("country",country); |
|||
resultData.put("tagTime",""); |
|||
resultData.put("intervention",intervention); |
|||
resultData.put("primaryOutcome",""); |
|||
resultData.put("crawlTime",getCurrentTime()); |
|||
// resultData.put("crawlUrl",url); |
|||
resultData.put("postTime",registTime); |
|||
resultData.put("content","content"); |
|||
resultData.put("forwardcontent","forwardcontent"); |
|||
System.out.println(resultData); |
|||
} |
|||
public static String convertDate(String inputDate) { |
|||
try { |
|||
|
|||
SimpleDateFormat inputFormat = new SimpleDateFormat("d/MM/yyyy"); |
|||
|
|||
Date date = inputFormat.parse(inputDate); |
|||
|
|||
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); |
|||
|
|||
return outputFormat.format(date); |
|||
} catch (ParseException e) { |
|||
|
|||
return "Invalid date format"; |
|||
} |
|||
} |
|||
public static String getCurrentTime() { |
|||
// 创建 DateTimeFormatter,指定输出格式 |
|||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// 获取当前时间 |
|||
LocalDateTime now = LocalDateTime.now(); |
|||
// 格式化 |
|||
return now.format(formatter); |
|||
} |
|||
} |
@ -0,0 +1,200 @@ |
|||
package com.example; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.io.OutputStream; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.MalformedURLException; |
|||
import java.net.URL; |
|||
import java.net.URLEncoder; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.*; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class AusList { |
|||
public static void main(String[] args) throws Exception { |
|||
String targetUrl = "https://www.anzctr.org.au/TrialSearch.aspx?page=20"; |
|||
String baseUrl = "https://www.anzctr.org.au/TrialSearch.aspx"; |
|||
String postUrl = "https://www.anzctr.org.au/TrialSearch.aspx"; |
|||
String pageNumber = targetUrl.contains("?page=") ? targetUrl.split("page=")[1] : "1"; |
|||
int page = Integer.parseInt(pageNumber); |
|||
System.out.println("Page Number: " + page); |
|||
// 存储 cookies |
|||
Set<String> cookieSet = new HashSet<>(); |
|||
String sessionId = null; |
|||
|
|||
// 第一步:初始 GET 请求,获取 cookies 和 ViewState |
|||
URL initialUrl = new URL(baseUrl); |
|||
HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); |
|||
initialConn.setRequestMethod("GET"); |
|||
initialConn.setRequestProperty("User-Agent", |
|||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
initialConn.setRequestProperty("Accept", |
|||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
initialConn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7"); |
|||
initialConn.setRequestProperty("Cache-Control", "no-cache"); |
|||
initialConn.setRequestProperty("Pragma", "no-cache"); |
|||
initialConn.setRequestProperty("Upgrade-Insecure-Requests", "1"); |
|||
initialConn.setRequestProperty("Sec-Fetch-Dest", "document"); |
|||
initialConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); |
|||
initialConn.setRequestProperty("Sec-Fetch-Site", "same-origin"); |
|||
initialConn.setRequestProperty("Sec-Fetch-User", "?1"); |
|||
|
|||
initialConn.setRequestProperty("Sec-CH-UA", |
|||
"\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\""); |
|||
initialConn.setRequestProperty("Sec-CH-UA-Mobile", "?0"); |
|||
initialConn.setRequestProperty("Sec-CH-UA-Platform", "\"Windows\""); |
|||
|
|||
// initialConn.setRequestProperty("Cookie", |
|||
// "ASP.NET_SessionId=gkhw0unpeytexsa40v1sdjf1; __utma=2822752...; _ga=..."); |
|||
|
|||
initialConn.setInstanceFollowRedirects(false); |
|||
initialConn.setConnectTimeout(10000); |
|||
initialConn.setReadTimeout(10000); |
|||
|
|||
// 捕获 cookies |
|||
sessionId = updateCookies(initialConn, cookieSet); |
|||
|
|||
// 读取响应内容以获取 ViewState |
|||
BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); |
|||
StringBuilder content = new StringBuilder(); |
|||
String inputLine; |
|||
while ((inputLine = in.readLine()) != null) { |
|||
content.append(inputLine); |
|||
} |
|||
in.close(); |
|||
initialConn.disconnect(); |
|||
|
|||
// 提取初始 ViewState |
|||
Map<String, String> viewStateData = extractViewStateData(content.toString()); |
|||
String viewState = viewStateData.get("__VIEWSTATE"); |
|||
String viewStateGen = viewStateData.get("__VIEWSTATEGENERATOR"); |
|||
String eventValidation = viewStateData.get("__EVENTVALIDATION"); |
|||
String payload = buildPostData(viewState,eventValidation,viewStateGen,page,sessionId); |
|||
|
|||
HttpURLConnection conn = (HttpURLConnection) new URL(postUrl).openConnection(); |
|||
conn.setRequestMethod("POST"); |
|||
conn.setDoOutput(true); |
|||
conn.setInstanceFollowRedirects(false); |
|||
conn.setConnectTimeout(10000); |
|||
conn.setReadTimeout(10000); |
|||
|
|||
// 设置请求头(仿浏览器) |
|||
conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
conn.setRequestProperty("Accept", "*/*"); |
|||
conn.setRequestProperty("X-Requested-With", "XMLHttpRequest"); |
|||
conn.setRequestProperty("X-MicrosoftAjax", "Delta=true"); |
|||
conn.setRequestProperty("Referer", "https://www.anzctr.org.au/TrialSearch.aspx"); |
|||
conn.setRequestProperty("Origin", "https://www.anzctr.org.au"); |
|||
|
|||
|
|||
// 构建 POST 表单数据 |
|||
String postData = payload; |
|||
// 写入 POST 数据 |
|||
try (OutputStream os = conn.getOutputStream()) { |
|||
byte[] input = postData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input); |
|||
} |
|||
|
|||
// 读取响应 |
|||
BufferedReader re = new BufferedReader(new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8)); |
|||
StringBuilder response = new StringBuilder(); |
|||
String line; |
|||
while ((line = re.readLine()) != null) { |
|||
response.append(line); |
|||
} |
|||
String html = response.toString(); |
|||
Document parse = Jsoup.parse(html); |
|||
Elements elements =parse.select(".results-header-tools a"); |
|||
for (Element element:elements){ |
|||
String link = "https://www.anzctr.org.au" + element.attr("href"); |
|||
System.out.println(link); |
|||
} |
|||
re.close(); |
|||
conn.disconnect(); |
|||
} |
|||
|
|||
// 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 |
|||
private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) { |
|||
String sessionId = null; |
|||
Map<String, List<String>> headerFields = conn.getHeaderFields(); |
|||
List<String> cookiesHeader = headerFields.get("Set-Cookie"); |
|||
if (cookiesHeader != null) { |
|||
for (String cookie : cookiesHeader) { |
|||
String cookieValue = cookie.split(";")[0]; |
|||
cookieSet.add(cookieValue); |
|||
if (cookieValue.startsWith("ASP.NET_SessionId=") || cookieValue.startsWith("csfcfc=")) { |
|||
sessionId = cookieValue; |
|||
} |
|||
} |
|||
} |
|||
return sessionId; |
|||
} |
|||
// 提取 __VIEWSTATE 隐藏字段的值 |
|||
private static Map<String, String> extractViewStateData(String html) { |
|||
Map<String, String> stateMap = new HashMap<>(); |
|||
|
|||
// 使用三个独立正则提取三个字段 |
|||
extractHiddenField(html, "__VIEWSTATE", stateMap); |
|||
extractHiddenField(html, "__VIEWSTATEGENERATOR", stateMap); |
|||
extractHiddenField(html, "__EVENTVALIDATION", stateMap); |
|||
|
|||
if (!stateMap.containsKey("__VIEWSTATE")) { |
|||
System.err.println("Failed to extract __VIEWSTATE from HTML"); |
|||
} |
|||
if (!stateMap.containsKey("__EVENTVALIDATION")) { |
|||
System.err.println("Failed to extract __EVENTVALIDATION from HTML"); |
|||
} |
|||
if (!stateMap.containsKey("__VIEWSTATEGENERATOR")) { |
|||
System.err.println("Failed to extract __VIEWSTATEGENERATOR from HTML"); |
|||
} |
|||
return stateMap; |
|||
} |
|||
|
|||
private static void extractHiddenField(String html, String fieldName, Map<String, String> map) { |
|||
String regex = "(?i)<input[^>]*name=[\"']" + fieldName + "[\"'][^>]*value=[\"']([^\"']+)[\"']"; |
|||
Pattern pattern = Pattern.compile(regex); |
|||
Matcher matcher = pattern.matcher(html); |
|||
|
|||
if (matcher.find()) { |
|||
map.put(fieldName, matcher.group(1)); |
|||
} |
|||
} |
|||
|
|||
private static String buildPostData(String viewState, String eventValidation, String viewStateGen, int page, String sessionId) { |
|||
try { |
|||
// 按照真实请求体的顺序和字段进行构建 |
|||
String payload = ""; |
|||
payload += URLEncoder.encode("ctl00$body$tsmAJAXScriptManager", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("ctl00$body$tsmAJAXScriptManager|ctl00$body$tsmAJAXScriptManager", StandardCharsets.UTF_8.name()) + "&"; |
|||
payload += URLEncoder.encode("ctl00_body_tsmAJAXScriptManager_HiddenField", StandardCharsets.UTF_8.name()) + "=&"; // 添加缺失字段 |
|||
payload += URLEncoder.encode("__EVENTTARGET", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("ctl00$body$tsmAJAXScriptManager", StandardCharsets.UTF_8.name()) + "&"; |
|||
payload += URLEncoder.encode("__EVENTARGUMENT", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("conditionCode=&dateOfRegistrationFrom=&interventionDescription=&interventionCodeOperator=OR&primarySponsorType=&gender=&distance=&postcode=&pageSize=20&ageGroup=&recruitmentCountryOperator=OR&recruitmentRegion=ðicsReview=&countryOfRecruitment=®istry=&searchTxt=&studyType=&allocationToIntervention=&dateOfRegistrationTo=&recruitmentStatus=&interventionCode=&healthCondition=&healthyVolunteers=&page="+page+"&conditionCategory=&fundingSource=&trialStartDateTo=&trialStartDateFrom=&phase=", StandardCharsets.UTF_8.name()) + "&"; // 注意这里的参数字符串是完整的 |
|||
payload += URLEncoder.encode("__LASTFOCUS", StandardCharsets.UTF_8.name()) + "=&"; |
|||
payload += URLEncoder.encode("__VIEWSTATE", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()) + "&"; |
|||
payload += URLEncoder.encode("__VIEWSTATEGENERATOR", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode(viewStateGen, StandardCharsets.UTF_8.name()) + "&"; |
|||
payload += URLEncoder.encode("__SCROLLPOSITIONX", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("0", StandardCharsets.UTF_8.name()) + "&"; // 添加缺失字段 |
|||
payload += URLEncoder.encode("__SCROLLPOSITIONY", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("0", StandardCharsets.UTF_8.name()) + "&"; // 添加缺失字段 |
|||
payload += URLEncoder.encode("__EVENTVALIDATION", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode(eventValidation, StandardCharsets.UTF_8.name()) + "&"; |
|||
|
|||
// ... 添加并按顺序排列其他所有字段,确保名称、值、编码与真实请求体一致 ... |
|||
|
|||
// 确保最后一个字段后面没有 & |
|||
payload += URLEncoder.encode("__ASYNCPOST", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("true", StandardCharsets.UTF_8.name()); |
|||
|
|||
return payload; |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("Error building POST data: " + e.getMessage()); |
|||
return ""; |
|||
} |
|||
} |
|||
|
|||
} |
@ -0,0 +1,173 @@ |
|||
package com.example; |
|||
|
|||
import java.awt.image.BufferedImage; |
|||
import java.io.*; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import javax.imageio.ImageIO; |
|||
import net.sourceforge.tess4j.Tesseract; |
|||
import net.sourceforge.tess4j.TesseractException; |
|||
|
|||
// ... 其他必要的导入 ... |
|||
|
|||
public class CaptchaOCR { |
|||
|
|||
// Tesseract data path (the directory that holds the tessdata files). |
|||
// Windows example: "C:\\Program Files\\Tesseract-OCR\\tessdata" |
|||
// Linux/macOS: usually does not need to be set; Tess4J is expected to locate tessdata itself (TODO confirm). |
|||
private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata"; // machine-specific; change to match your installation |
|||
|
|||
/** |
|||
* 下载验证码图片 |
|||
* @param imageUrl 图片的完整 URL |
|||
* @return 图片的 BufferedImage 对象 |
|||
* @throws IOException 如果下载失败 |
|||
*/ |
|||
public static BufferedImage downloadImage(String imageUrl) throws IOException { |
|||
URL url = new URL(imageUrl); |
|||
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); |
|||
conn.setRequestMethod("GET"); |
|||
// 添加 User-Agent 等必要的请求头,模拟浏览器 |
|||
conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
// ... 其他头 ... |
|||
|
|||
int responseCode = conn.getResponseCode(); |
|||
if (responseCode == HttpURLConnection.HTTP_OK) { |
|||
try (InputStream is = conn.getInputStream()) { |
|||
// 将输入流读取到字节数组,ImageIO 从字节数组读取更稳定 |
|||
ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
|||
byte[] buffer = new byte[4096]; // 缓冲区大小,可以调整 |
|||
int bytesRead; |
|||
while ((bytesRead = is.read(buffer)) != -1) { |
|||
baos.write(buffer, 0, bytesRead); |
|||
} |
|||
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); |
|||
|
|||
BufferedImage image = ImageIO.read(bais); |
|||
|
|||
if (image == null) { |
|||
throw new IOException("Failed to read image from stream. Check image format."); |
|||
} |
|||
return image; |
|||
} |
|||
} else { |
|||
throw new IOException("Failed to download image. HTTP error code: " + responseCode); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 对验证码图片进行预处理 (基础示例:转灰度+二值化) |
|||
* 这是最关键的部分,需要根据验证码样式调整 |
|||
* @param originalImage 原始图片 |
|||
* @return 预处理后的图片 |
|||
*/ |
|||
public static BufferedImage preprocessImage(BufferedImage originalImage) { |
|||
// TODO: 这里是图像预处理的重点,需要根据实际验证码样式进行调整和优化 |
|||
// 基础处理:转灰度 -> 二值化 |
|||
int width = originalImage.getWidth(); |
|||
int height = originalImage.getHeight(); |
|||
BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY); |
|||
grayImage.getGraphics().drawImage(originalImage, 0, 0, null); |
|||
|
|||
BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); |
|||
// 二值化阈值,可能需要调整 (0-255) |
|||
int threshold = 128; |
|||
for (int y = 0; y < height; y++) { |
|||
for (int x = 0; x < width; x++) { |
|||
int gray = grayImage.getRaster().getSample(x, y, 0); |
|||
if (gray < threshold) { |
|||
binaryImage.getRaster().setSample(x, y, 0, 0); // 黑色 |
|||
} else { |
|||
binaryImage.getRaster().setSample(x, y, 0, 1); // 白色 |
|||
} |
|||
} |
|||
} |
|||
|
|||
// TODO: 更高级的预处理包括: |
|||
// - 去除干扰线、噪点 |
|||
// - 字符分割(如果字符粘连) |
|||
// - 倾斜校正 |
|||
// - 调整亮度和对比度等 |
|||
// 你可能需要引入更专业的图像处理库或算法 |
|||
|
|||
// 为了调试,可以将预处理后的图片保存下来查看效果 |
|||
try { |
|||
File outputfile = new File("preprocessed_captcha.png"); |
|||
ImageIO.write(binaryImage, "png", outputfile); |
|||
System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath()); |
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
|
|||
return binaryImage; // 返回预处理后的图片 |
|||
} |
|||
|
|||
/** |
|||
* 使用 Tess4J 识别图片中的文字 |
|||
* @param image 待识别的图片 (最好是预处理后的) |
|||
* @return 识别出的字符串 |
|||
*/ |
|||
public static String recognizeCaptcha(BufferedImage image) { |
|||
Tesseract tesseract = new Tesseract(); |
|||
|
|||
// 设置 tessdata 路径 (如果 TESSDATA_PATH 已正确设置且 Tesseract 安装正确,这行可能不是必需的,Tess4J 会自动查找) |
|||
// 但显式设置更保险 |
|||
if (TESSDATA_PATH != null && !TESSDATA_PATH.isEmpty()) { |
|||
tesseract.setDatapath(TESSDATA_PATH); |
|||
} else { |
|||
System.out.println("TESSDATA_PATH not set. Tess4J will try to find tessdata automatically."); |
|||
} |
|||
|
|||
|
|||
tesseract.setLanguage("eng"); // 设置识别语言为英文 (通常包含数字) |
|||
// 如果验证码只有数字,可以尝试设置仅识别数字 |
|||
// tesseract.setTessVariable("tessedit_char_whitelist", "0123456789"); |
|||
|
|||
try { |
|||
String result = tesseract.doOCR(image); |
|||
// 清理识别结果,去除空格或换行符等 |
|||
result = result.trim().replaceAll("[^0-9a-zA-Z]", ""); // 根据验证码内容调整清理规则 |
|||
System.out.println("OCR Result: " + result); |
|||
return result; |
|||
} catch (TesseractException e) { |
|||
System.err.println("Error during OCR: " + e.getMessage()); |
|||
return null; // 识别失败 |
|||
} |
|||
} |
|||
|
|||
// 示例如何在你的爬虫流程中使用 |
|||
public static void main(String[] args) { |
|||
String captchaImageUrl = "YOUR_CAPTCHA_IMAGE_URL"; // 从页面解析获取到的验证码图片 URL |
|||
|
|||
try { |
|||
// 1. 下载图片 |
|||
BufferedImage originalCaptchaImage = downloadImage(captchaImageUrl); |
|||
System.out.println("Image downloaded."); |
|||
|
|||
// 2. 预处理图片 |
|||
BufferedImage preprocessedImage = preprocessImage(originalCaptchaImage); |
|||
System.out.println("Image preprocessed."); |
|||
|
|||
// 3. 识别验证码 |
|||
String captchaCode = recognizeCaptcha(preprocessedImage); |
|||
|
|||
if (captchaCode != null && !captchaCode.isEmpty()) { |
|||
System.out.println("Recognized CAPTCHA: " + captchaCode); |
|||
// 4. 将 captchaCode 填入 POST 数据中,提交表单 |
|||
// ... (你的 ASP.NET WebForms POST 提交代码,将 captchaCode 放到对应的隐藏字段或输入框字段中) ... |
|||
// 例如:postData += "&captchaInputFieldName=" + URLEncoder.encode(captchaCode, StandardCharsets.UTF_8.name()); |
|||
// ... 提交 POST 请求 ... |
|||
|
|||
} else { |
|||
System.out.println("Failed to recognize CAPTCHA."); |
|||
// 5. 处理识别失败的情况,可能需要重试或记录日志 |
|||
} |
|||
|
|||
} catch (IOException e) { |
|||
System.err.println("Error downloading or processing image: " + e.getMessage()); |
|||
} |
|||
// catch (URISyntaxException e) { |
|||
// System.err.println("Invalid URL: " + e.getMessage()); |
|||
// } // 如果你的 downloadImage 方法 throws URISyntaxException |
|||
} |
|||
} |
@ -0,0 +1,81 @@ |
|||
package com.example; |
|||
|
|||
import io.github.bonigarcia.wdm.WebDriverManager; |
|||
import org.apache.hc.client5.http.classic.methods.HttpPost; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
|||
import org.apache.hc.client5.http.impl.classic.HttpClients; |
|||
import org.apache.hc.core5.http.io.entity.StringEntity; |
|||
import org.apache.hc.core5.http.io.entity.EntityUtils; |
|||
import org.openqa.selenium.Cookie; |
|||
import org.openqa.selenium.WebDriver; |
|||
import org.openqa.selenium.chrome.ChromeDriver; |
|||
|
|||
import java.util.Set; |
|||
import java.util.stream.Collectors; |
|||
|
|||
public class CsAirScraper { |
|||
|
|||
public static void main(String[] args) throws Exception { |
|||
// 1. 启动 Selenium,访问南航主站 |
|||
WebDriverManager.chromedriver().setup(); |
|||
WebDriver driver = new ChromeDriver(); |
|||
driver.get("https://b2c.csair.com/portal/main/flight/direct/query"); |
|||
|
|||
// 等待 Cookie 被 JS 设置(稍等几秒) |
|||
Thread.sleep(5000); // 可根据实际页面响应调整等待时间 |
|||
|
|||
// 2. 获取浏览器中所有 Cookie |
|||
Set<Cookie> seleniumCookies = driver.manage().getCookies(); |
|||
String cookieHeader = seleniumCookies.stream() |
|||
.map(c -> c.getName() + "=" + c.getValue()) |
|||
.collect(Collectors.joining("; ")); |
|||
|
|||
System.out.println("获取到 Cookie: " + cookieHeader); |
|||
driver.quit(); // 关闭浏览器 |
|||
|
|||
// 3. 准备 HttpClient 请求,携带 Cookie |
|||
try (CloseableHttpClient httpClient = HttpClients.createDefault()) { |
|||
HttpPost post = new HttpPost("https://b2c.csair.com/portal/main/flight/direct/query"); |
|||
|
|||
// 设置请求头 |
|||
post.setHeader("Content-Type", "application/json"); |
|||
post.setHeader("Cookie", cookieHeader); |
|||
post.setHeader("User-Agent", "Mozilla/5.0"); |
|||
|
|||
// 设置请求体(JSON) |
|||
String json = "{" |
|||
+ "\"action\": \"0\"," |
|||
+ "\"adultNum\": \"1\"," |
|||
+ "\"airLine\": 1," |
|||
+ "\"arrCity\": \"PKX\"," |
|||
+ "\"businessType\": \"COMMON\"," |
|||
+ "\"cabinOrder\": \"0\"," |
|||
+ "\"cache\": 0," |
|||
+ "\"childNum\": \"0\"," |
|||
+ "\"depCity\": \"CAN\"," |
|||
+ "\"flightDate\": \"20250514\"," |
|||
+ "\"flyType\": 0," |
|||
+ "\"infantNum\": \"0\"," |
|||
+ "\"international\": \"0\"," |
|||
+ "\"isMember\": \"\"," |
|||
+ "\"isMultipass\": 1," |
|||
+ "\"language\": \"zh\"," |
|||
+ "\"preUrl\": \"\"," |
|||
+ "\"segType\": \"1\"," |
|||
+ "\"tariffRules\": []" |
|||
+ "}"; |
|||
|
|||
|
|||
post.setEntity(new StringEntity(json)); |
|||
|
|||
// 4. 发请求 |
|||
try (CloseableHttpResponse response = httpClient.execute(post)) { |
|||
int code = response.getCode(); |
|||
String result = EntityUtils.toString(response.getEntity()); |
|||
System.out.println("状态码: " + code); |
|||
System.out.println("响应: " + result); |
|||
} |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,404 @@ |
|||
package com.example; |
|||
|
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import okhttp3.*; |
|||
import org.apache.hc.client5.http.cookie.BasicCookieStore; |
|||
import org.apache.hc.client5.http.cookie.CookieStore; |
|||
import org.apache.hc.client5.http.classic.methods.HttpGet; |
|||
import org.apache.hc.client5.http.classic.methods.HttpPost; |
|||
import org.apache.hc.client5.http.entity.UrlEncodedFormEntity; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; |
|||
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; |
|||
import org.apache.hc.client5.http.impl.classic.HttpClients; |
|||
import org.apache.hc.client5.http.protocol.HttpClientContext; |
|||
import org.apache.hc.core5.http.HttpEntity; |
|||
import org.apache.hc.core5.http.NameValuePair; |
|||
import org.apache.hc.core5.http.io.entity.EntityUtils; |
|||
import org.apache.hc.core5.http.message.BasicNameValuePair; |
|||
import org.apache.kafka.clients.producer.KafkaProducer; |
|||
import org.apache.kafka.clients.producer.ProducerConfig; |
|||
import org.apache.kafka.clients.producer.ProducerRecord; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Paths; |
|||
import java.text.ParseException; |
|||
import java.text.SimpleDateFormat; |
|||
import java.time.LocalDate; |
|||
import java.time.LocalDateTime; |
|||
import java.time.Month; |
|||
import java.time.Year; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.*; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class CtriScraper { |
|||
|
|||
/** Page that serves the advanced-search form (and the hidden tokens it carries). */
private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php";

/** Endpoint the advanced-search form is posted to. */
private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php";

/** Captures the text between the first pair of single quotes in a result link's href. */
private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'");

private static final String TOPIC_NAME = "cliniTopic";
private static final String BOOTSTRAP_SERVERS = "node-01:19092";
private static KafkaProducer<String, String> producer;
private static ObjectMapper objectMapper = new ObjectMapper();
private static final Random random = new Random();

// The Kafka producer is created once, when the class is initialized.
static {
    final String stringSerializer = "org.apache.kafka.common.serialization.StringSerializer";
    final Properties kafkaSettings = new Properties();
    kafkaSettings.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
    kafkaSettings.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, stringSerializer);
    kafkaSettings.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, stringSerializer);
    kafkaSettings.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all replicas to acknowledge
    kafkaSettings.put(ProducerConfig.RETRIES_CONFIG, 3);  // number of send retries
    producer = new KafkaProducer<>(kafkaSettings);
}
|||
public static List<String> getlink(Integer year, Integer month) { |
|||
List<String> linkList = new ArrayList<>(); // 用于存放提取到的链接 |
|||
// 用于存储和管理 Cookies |
|||
CookieStore cookieStore = new BasicCookieStore(); |
|||
// 用于在请求之间维护状态,特别是关联 CookieStore |
|||
HttpClientContext context = HttpClientContext.create(); |
|||
context.setCookieStore(cookieStore); |
|||
|
|||
// 使用 try-with-resources 确保 HttpClient 被正确关闭 |
|||
try (CloseableHttpClient httpClient = HttpClients.custom() |
|||
.setDefaultCookieStore(cookieStore) // 将cookie store绑定到client |
|||
.build()) { |
|||
|
|||
// --- Step 1 & 2: 发送 GET 请求获取表单页面并解析 --- |
|||
// System.out.println("Fetching search form page..."); // 调试信息可以按需保留或删除 |
|||
HttpGet getRequest = new HttpGet(SEARCH_FORM_URL); |
|||
// 添加一些伪装的 Headers 模拟浏览器访问 |
|||
getRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); |
|||
getRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); |
|||
getRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"); |
|||
|
|||
String formPageHtml = null; |
|||
try (CloseableHttpResponse response = httpClient.execute(getRequest, context)) { |
|||
int statusCode = response.getCode(); |
|||
// System.out.println("GET Response Status: " + statusCode); // 调试信息 |
|||
|
|||
if (statusCode != 200) { |
|||
System.err.println("Error: GET request to form page failed with status code: " + statusCode); |
|||
EntityUtils.consume(response.getEntity()); // 确保消费掉实体,释放连接 |
|||
return null; // 获取表单页面失败,返回 null |
|||
} |
|||
|
|||
HttpEntity entity = response.getEntity(); |
|||
if (entity != null) { |
|||
formPageHtml = EntityUtils.toString(entity, StandardCharsets.UTF_8); |
|||
EntityUtils.consume(entity); // 确保实体内容被完全消费 |
|||
} else { |
|||
System.err.println("Error: Failed to get form page entity."); |
|||
return null; // 获取页面内容失败,返回 null |
|||
} |
|||
} |
|||
// System.out.println("Form page fetched successfully."); // 调试信息 |
|||
|
|||
// --- Step 3 & 4: 解析 HTML 提取 csrf_token 和 __ncforminfo --- |
|||
Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL); // 传入 base URI 有助于处理相对路径 |
|||
|
|||
// 查找隐藏的输入字段 |
|||
Element csrfTokenInput = doc.selectFirst("input[name=csrf_token][type=hidden]"); |
|||
Element ncFormInfoInput = doc.selectFirst("input[name=__ncforminfo][type=hidden]"); |
|||
|
|||
String csrfToken = null; |
|||
String ncFormInfo = null; |
|||
|
|||
if (csrfTokenInput != null) { |
|||
csrfToken = csrfTokenInput.val(); |
|||
// System.out.println("Extracted csrf_token: " + csrfToken); // 调试信息 |
|||
} else { |
|||
System.err.println("Warning: Could not find csrf_token input field."); |
|||
return null; // 缺少关键 token,返回 null |
|||
} |
|||
|
|||
if (ncFormInfoInput != null) { |
|||
ncFormInfo = ncFormInfoInput.val(); |
|||
// System.out.println("Extracted __ncforminfo: " + ncFormInfo); // 调试信息 |
|||
} else { |
|||
System.err.println("Warning: Could not find __ncforminfo input field."); |
|||
return null; // 缺少关键 token,返回 null |
|||
} |
|||
|
|||
// 如果必要的 token 没有获取到,可能无法继续 (虽然上面的检查已经覆盖,这里作为双重保险) |
|||
if (csrfToken == null || ncFormInfo == null) { |
|||
System.err.println("Error: Missing required tokens. Cannot proceed with POST request."); |
|||
return null; |
|||
} |
|||
|
|||
// --- Step 5 & 6: 构建 POST 请求参数并发送 --- |
|||
// System.out.println("\nPreparing POST request..."); // 调试信息 |
|||
HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL); |
|||
// 添加 Headers 模拟浏览器提交表单 |
|||
postRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); |
|||
// 重要:设置 Referer Header |
|||
postRequest.setHeader("Referer", SEARCH_FORM_URL); |
|||
// 添加 Origin Header |
|||
postRequest.setHeader("Origin", "https://ctri.nic.in"); |
|||
postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded"); |
|||
postRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); |
|||
postRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"); |
|||
postRequest.setHeader("Pragma", "no-cache"); |
|||
|
|||
List<NameValuePair> params = new ArrayList<>(); |
|||
// 添加你之前分析的载荷中的所有参数,使用获取到的动态值 |
|||
params.add(new BasicNameValuePair("stid", "1")); // 注意 stid 之前有两个,这里用 1 |
|||
params.add(new BasicNameValuePair("csrf_token", csrfToken)); // 使用获取到的动态 token |
|||
params.add(new BasicNameValuePair("pros", "1")); |
|||
params.add(new BasicNameValuePair("month", String.format("%02d", month))); // 格式化月份为两位数 |
|||
params.add(new BasicNameValuePair("year", String.valueOf(year))); |
|||
params.add(new BasicNameValuePair("study", "0")); |
|||
params.add(new BasicNameValuePair("sdid", "0")); |
|||
params.add(new BasicNameValuePair("phaseid", "0")); |
|||
params.add(new BasicNameValuePair("psponsor", "0")); |
|||
params.add(new BasicNameValuePair("recid", "0")); |
|||
params.add(new BasicNameValuePair("state", "0")); |
|||
params.add(new BasicNameValuePair("district", "0")); |
|||
params.add(new BasicNameValuePair("searchword", "")); |
|||
params.add(new BasicNameValuePair("T4", "anyvalue")); // T4既然无效,随便填 |
|||
params.add(new BasicNameValuePair("btt", "Search")); |
|||
params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo)); // 使用获取到的动态值 |
|||
|
|||
// 将参数列表设置到请求体中 |
|||
postRequest.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8)); |
|||
|
|||
// System.out.println("Executing POST request to submit form..."); // 调试信息 |
|||
try (CloseableHttpResponse postResponse = httpClient.execute(postRequest, context)) { |
|||
int postStatusCode = postResponse.getCode(); |
|||
// System.out.println("POST Response Status: " + postStatusCode); // 打印状态码 |
|||
|
|||
if (postStatusCode != 200) { |
|||
System.err.println("Error: POST request to submit form failed with status code: " + postStatusCode); |
|||
EntityUtils.consume(postResponse.getEntity()); // 确保消费掉实体,释放连接 |
|||
return null; // 提交表单失败,返回 null |
|||
} |
|||
|
|||
|
|||
HttpEntity postEntity = postResponse.getEntity(); |
|||
|
|||
if (postEntity != null) { |
|||
String searchResultsHtml = EntityUtils.toString(postEntity, StandardCharsets.UTF_8); |
|||
EntityUtils.consume(postEntity); // 确保实体内容被完全消费 |
|||
|
|||
// --- Step 7: 处理搜索结果页面 --- |
|||
// System.out.println("\nParsing search results..."); // 调试信息 |
|||
|
|||
Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL); |
|||
|
|||
Elements links = resultsDoc.select("tr a"); |
|||
|
|||
for (Element linkElement : links) { |
|||
String rawLink = linkElement.attr("href"); |
|||
// System.out.println("Processing raw link: " + rawLink); // 调试信息 |
|||
|
|||
// 使用预编译的正则表达式 Pattern |
|||
Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink); |
|||
|
|||
// 查找匹配项 |
|||
if (matcher.find()) { |
|||
String extractedContent = matcher.group(1); // 提取单引号内的内容 |
|||
// 构建完整的链接 URL |
|||
String fullLink = "https://ctri.nic.in/Clinicaltrials/" + extractedContent; |
|||
linkList.add(fullLink); // 将完整链接添加到列表中 |
|||
// System.out.println("Added link: " + fullLink); // 调试信息 |
|||
} else { |
|||
// 如果链接不符合模式,打印警告并跳过 |
|||
System.err.println("Warning: Link does not match expected pattern: " + rawLink); |
|||
} |
|||
} |
|||
|
|||
// --- 返回提取到的链接列表 --- |
|||
// 循环结束后,返回收集到的所有链接 |
|||
// System.out.println("Finished link extraction. Returning list."); // 调试信息 |
|||
return linkList; |
|||
|
|||
|
|||
} else { |
|||
System.err.println("Error: Failed to get search results entity."); |
|||
return null; // 获取结果内容失败,返回 null |
|||
} |
|||
} |
|||
|
|||
} catch (IOException e) { |
|||
// 处理网络请求相关的异常 |
|||
System.err.println("Network or IO error during scraping:"); |
|||
e.printStackTrace(); |
|||
return null; // 发生 IO 错误,返回 null |
|||
} catch (Exception e) { |
|||
// 处理其他可能的异常,例如解析错误或 NPE |
|||
System.err.println("An unexpected error occurred during scraping:"); |
|||
e.printStackTrace(); |
|||
return null; // 发生其他错误,返回 null |
|||
} |
|||
} |
|||
public static void main(String[] args) { |
|||
for (Integer year = Year.now().getValue(); year >= 2024; year--) { |
|||
int monthStart = (year == Year.now().getValue()) ? LocalDate.now().getMonthValue() : 12; |
|||
|
|||
for (Integer month = monthStart; month >= 1; month--) { |
|||
try { |
|||
List<String> links = getlink(year, month); |
|||
if (links == null) { |
|||
System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!"); |
|||
continue; |
|||
} |
|||
|
|||
if (links.isEmpty()) { |
|||
System.out.println("年份 " + year + " 月份 " + month + " 无数据!"); |
|||
continue; |
|||
} |
|||
|
|||
int sleepTime = random.nextInt(1001) + 3000; |
|||
int count = 0; |
|||
|
|||
for (String url : links) { |
|||
try { |
|||
Map<String, Object> result = reslutData(url); |
|||
result.put("crawlUrl", url); |
|||
|
|||
String registNum = String.valueOf(result.get("registNum")); |
|||
String jsonValue = objectMapper.writeValueAsString(result); |
|||
|
|||
ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue); |
|||
|
|||
producer.send(record, (metadata, exception) -> { |
|||
if (exception == null) { |
|||
System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url); |
|||
} else { |
|||
System.err.println("❌ Kafka 发送失败: " + exception.getMessage()); |
|||
} |
|||
}); |
|||
|
|||
Thread.sleep(sleepTime); // 控制节奏 |
|||
count++; |
|||
} catch (Exception e) { |
|||
System.err.println("抓取或发送失败: " + url); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。"); |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
// 关闭 producer |
|||
producer.close(); |
|||
} |
|||
|
|||
public static Map<String,Object> reslutData(String url) throws IOException { |
|||
Map<String,Object> resultData = new HashMap<>(); |
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.build(); |
|||
MediaType mediaType = MediaType.parse("text/plain"); |
|||
RequestBody body = RequestBody.create(mediaType, ""); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.get() |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Document parse = Jsoup.parse(html); |
|||
String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text(); |
|||
String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text(); |
|||
String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text()); |
|||
Map<String,Object> sponsor = new HashMap<>(); |
|||
String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text(); |
|||
String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text(); |
|||
sponsor.put("Source of Monetary or Material Support",SMMS); |
|||
sponsor.put("Primary Sponsor",primarySponsor); |
|||
String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text(); |
|||
String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text(); |
|||
Map<String,Object> disease = new HashMap<>(); |
|||
String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text(); |
|||
String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text(); |
|||
disease.put("healthType",healthType); |
|||
disease.put("condition",condition); |
|||
String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text(); |
|||
String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text(); |
|||
String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text(); |
|||
String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text(); |
|||
String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text(); |
|||
String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text(); |
|||
Map<String,Object> primaryOutcome = new HashMap<>(); |
|||
String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text(); |
|||
String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text(); |
|||
primaryOutcome.put("firstOutcome",firstOutcome); |
|||
primaryOutcome.put("secondOutcome",secondOutcome); |
|||
|
|||
resultData.put("disease",disease); |
|||
resultData.put("primaryOutcome",primaryOutcome); |
|||
resultData.put("intervention",intervention); |
|||
resultData.put("country",country); |
|||
resultData.put("enrollment",enrollment); |
|||
resultData.put("exclusionCriteria",exclusionCriteria); |
|||
resultData.put("inclusionCriteria",inclusionCriteria); |
|||
resultData.put("studyDesign",studyDesign); |
|||
resultData.put("sponsor",sponsor); |
|||
resultData.put("title",title); |
|||
resultData.put("registNum",registNum); |
|||
resultData.put("registTime",registTime); |
|||
resultData.put("studyType",studyType); |
|||
resultData.put("phase",phase); |
|||
resultData.put("registStatus",""); |
|||
resultData.put("registTitle",""); |
|||
resultData.put("fullTitle",""); |
|||
resultData.put("sponsorPart",""); |
|||
resultData.put("studyObjective",""); |
|||
resultData.put("studyStartDate",""); |
|||
resultData.put("currentStatus",""); |
|||
resultData.put("tagTime",""); |
|||
resultData.put("crawlTime",getCurrentTime()); |
|||
resultData.put("crawlUrl",url); |
|||
resultData.put("postTime",registTime); |
|||
resultData.put("content","content"); |
|||
resultData.put("forwardcontent","forwardcontent"); |
|||
resultData.put("cid","Nctrinicin"); |
|||
return resultData; |
|||
} |
|||
public static String getCurrentTime() { |
|||
// 创建 DateTimeFormatter,指定输出格式 |
|||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// 获取当前时间 |
|||
LocalDateTime now = LocalDateTime.now(); |
|||
// 格式化 |
|||
return now.format(formatter); |
|||
} |
|||
public static String extractAndConvertDate(String input) { |
|||
// 定义正则表达式提取 dd/MM/yyyy 格式的日期 |
|||
Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]"); |
|||
Matcher matcher = pattern.matcher(input); |
|||
|
|||
if (matcher.find()) { |
|||
String dateStr = matcher.group(1); // 提取的日期字符串 |
|||
try { |
|||
// 解析成 Date 对象 |
|||
SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy"); |
|||
Date date = inputFormat.parse(dateStr); |
|||
|
|||
// 格式化为 yyyy:MM:dd 00:00:00 |
|||
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'"); |
|||
return outputFormat.format(date); |
|||
|
|||
} catch (ParseException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
return null; // 如果未匹配或转换失败 |
|||
} |
|||
} |
@ -0,0 +1,121 @@ |
|||
package com.example; |
|||
|
|||
import okhttp3.*; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.io.IOException; |
|||
import java.text.ParseException; |
|||
import java.text.SimpleDateFormat; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.Date; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class CtriScraperContent { |
|||
public static void main(String[] args) throws IOException { |
|||
Map<String,Object> resultData = new HashMap<>(); |
|||
String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName="; |
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.build(); |
|||
MediaType mediaType = MediaType.parse("text/plain"); |
|||
RequestBody body = RequestBody.create(mediaType, ""); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.get() |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Document parse = Jsoup.parse(html); |
|||
String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text(); |
|||
String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text(); |
|||
String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text()); |
|||
Map<String,Object> sponsor = new HashMap<>(); |
|||
String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text(); |
|||
String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text(); |
|||
sponsor.put("Source of Monetary or Material Support",SMMS); |
|||
sponsor.put("Primary Sponsor",primarySponsor); |
|||
String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text(); |
|||
String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text(); |
|||
Map<String,Object> disease = new HashMap<>(); |
|||
String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text(); |
|||
String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text(); |
|||
disease.put("healthType",healthType); |
|||
disease.put("condition",condition); |
|||
String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text(); |
|||
String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text(); |
|||
String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text(); |
|||
String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text(); |
|||
String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text(); |
|||
String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text(); |
|||
Map<String,Object> primaryOutcome = new HashMap<>(); |
|||
String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text(); |
|||
String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text(); |
|||
primaryOutcome.put("firstOutcome",firstOutcome); |
|||
primaryOutcome.put("secondOutcome",secondOutcome); |
|||
|
|||
resultData.put("disease",disease); |
|||
resultData.put("primaryOutcome",primaryOutcome); |
|||
resultData.put("intervention",intervention); |
|||
resultData.put("country",country); |
|||
resultData.put("enrollment",enrollment); |
|||
resultData.put("exclusionCriteria",exclusionCriteria); |
|||
resultData.put("inclusionCriteria",inclusionCriteria); |
|||
resultData.put("studyDesign",studyDesign); |
|||
resultData.put("sponsor",sponsor); |
|||
resultData.put("title",title); |
|||
resultData.put("registNum",registNum); |
|||
resultData.put("registTime",registTime); |
|||
resultData.put("studyType",studyType); |
|||
resultData.put("phase",phase); |
|||
resultData.put("registStatus",""); |
|||
resultData.put("registTitle",""); |
|||
resultData.put("fullTitle",""); |
|||
resultData.put("sponsorPart",""); |
|||
resultData.put("studyObjective",""); |
|||
resultData.put("studyStartDate",""); |
|||
resultData.put("currentStatus",""); |
|||
resultData.put("tagTime",""); |
|||
resultData.put("crawlTime",getCurrentTime()); |
|||
resultData.put("crawlUrl",url); |
|||
resultData.put("postTime",registTime); |
|||
resultData.put("content","content"); |
|||
resultData.put("forwardcontent","forwardcontent"); |
|||
|
|||
System.out.println(resultData); |
|||
} |
|||
public static String getCurrentTime() { |
|||
// 创建 DateTimeFormatter,指定输出格式 |
|||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// 获取当前时间 |
|||
LocalDateTime now = LocalDateTime.now(); |
|||
// 格式化 |
|||
return now.format(formatter); |
|||
} |
|||
public static String extractAndConvertDate(String input) { |
|||
// 定义正则表达式提取 dd/MM/yyyy 格式的日期 |
|||
Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]"); |
|||
Matcher matcher = pattern.matcher(input); |
|||
|
|||
if (matcher.find()) { |
|||
String dateStr = matcher.group(1); // 提取的日期字符串 |
|||
try { |
|||
// 解析成 Date 对象 |
|||
SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy"); |
|||
Date date = inputFormat.parse(dateStr); |
|||
|
|||
// 格式化为 yyyy:MM:dd 00:00:00 |
|||
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'"); |
|||
return outputFormat.format(date); |
|||
|
|||
} catch (ParseException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
return null; // 如果未匹配或转换失败 |
|||
} |
|||
} |
@ -0,0 +1,113 @@ |
|||
package com.example; |
|||
|
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import okhttp3.*; |
|||
import org.apache.kafka.clients.producer.KafkaProducer; |
|||
import org.apache.kafka.clients.producer.ProducerConfig; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
|
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Paths; |
|||
import java.time.LocalDate; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.*; |
|||
import java.util.concurrent.TimeUnit; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class Inka { |
|||
// private static final String TOPIC_NAME = "patentTopic"; |
|||
// private static final String BOOTSTRAP_SERVERS = "localhost:9092"; |
|||
// private static KafkaProducer<String, String> producer; |
|||
// private static ObjectMapper objectMapper = new ObjectMapper(); |
|||
// private static final Random random = new Random(); |
|||
private static List<String> proxyList = new ArrayList<>(); // 代理池 |
|||
private static int currentProxyIndex = 0; // 当前使用的代理索引 |
|||
// static { |
|||
// Properties props = new Properties(); |
|||
// props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS); |
|||
// props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); |
|||
// props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); |
|||
// props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认 |
|||
// props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数 |
|||
// producer = new KafkaProducer<>(props); |
|||
// try { |
|||
// proxyList = Files.readAllLines(Paths.get("proxy.txt")); |
|||
// if (proxyList.isEmpty()) { |
|||
// System.out.println("警告: proxy.txt 为空,未加载任何代理"); |
|||
// } else { |
|||
// System.out.println("成功加载 " + proxyList.size() + " 个代理"); |
|||
// } |
|||
// } catch (IOException e) { |
|||
// System.err.println("读取 proxy.txt 失败: " + e.getMessage()); |
|||
// } |
|||
// } |
|||
public static void main(String[] args) throws IOException, InterruptedException { |
|||
String load = "javax.faces.partial.ajax=true&javax.faces.source=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225&javax.faces.partial.execute=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225+advancedSearchForm&javax.faces.partial.render=advancedSearchForm+results-container+j_idt1272&advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225&advancedSearchForm=advancedSearchForm&advancedSearchForm%3AadvancedSearchAssistant=on&advancedSearchForm%3AadvancedSearchInput%3Ainput=rance10&javax.faces.ViewState=-3602994148230912322%3A-6313250694718303467"; |
|||
|
|||
OkHttpClient client = createClientWithProxy(); |
|||
|
|||
MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded; charset=UTF-8"); |
|||
RequestBody body = RequestBody.create(mediaType, load); |
|||
|
|||
// 构建请求 |
|||
Request request = new Request.Builder() |
|||
.url("https://patentscope.wipo.int/search/zh/result.jsf?_vid=P21-M9APK2-00815") // 更新为 Patentscope 的 URL |
|||
.method("POST", body) |
|||
.addHeader("Accept", "application/xml, text/xml, */*; q=0.01") |
|||
.addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7") |
|||
.addHeader("Cache-Control", "no-cache") |
|||
.addHeader("Connection", "keep-alive") |
|||
.addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8") |
|||
// .addHeader("Cookie", "JSESSIONID=F253B7B0920FFACB89354339F51E325C.wapp2nB; ABIW=balancer.cms41; _ga=GA1.1.33840258.1744249893; Hm_lvt_95e64d347633bfd0a2462e25c93606d6=1744249893; Hm_lpvt_95e64d347633bfd0a2462e25c93606d6=1744249893; HMACCOUNT=0388A9D4AC1C33F5; _pk_id.14.ec75=5aa7b2d46edf6083.1744249894.; cebs=1; _ce.clock_data=-923%2C212.87.194.3%2C1%2C33d0f257a817d1ca4c4381b87f8ad83f%2CChrome%2CJP; cebsp_=1; _pk_uid=0%3DNWFhN2IyZDQ2ZWRmNjA4Mw%3D%3D; _gcl_au=1.1.1245117354.1744249928; wipo-visitor-uunid=28f5a645185bc7b; _pk_ref.9.ec75=%5B%22%22%2C%22%22%2C1744249929%2C%22https%3A%2F%2Fwww.wipo.int%2F%22%5D; _pk_id.9.ec75=957af9d7ac871adb.1744249929.; _ga_15TSHJ0HWP=GS1.1.1744249893.1.1.1744250058.58.0.0; _ce.s=v~274adfa655dbaad3ae6a47724ee5bf89d205d10f~lcw~1744250058720~vir~new~lva~1744249893962~vpv~0~v11.cs~411929~v11.s~559ada70-15ae-11f0-a979-459b55a048ba~v11.sla~1744250058728~gtrk.la~m9apg5tj~v11.send~1744250058720~lcw~1744250058728; _pk_id.5.ec75=ab8529a634a38653.1744250080.; wipo_language=zh; _pk_ses.5.ec75=1") |
|||
.addHeader("Faces-Request", "partial/ajax") |
|||
.addHeader("Host", "patentscope.wipo.int") |
|||
.addHeader("Origin", "https://patentscope.wipo.int") |
|||
.addHeader("Pragma", "no-cache") |
|||
.addHeader("Referer", "https://patentscope.wipo.int/search/zh/result.jsf?_vid=P21-M9APK2-00815") |
|||
.addHeader("Sec-Ch-Ua", "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"") |
|||
.addHeader("Sec-Ch-Ua-Mobile", "?0") |
|||
.addHeader("Sec-Ch-Ua-Platform", "\"Windows\"") |
|||
.addHeader("Sec-Fetch-Dest", "empty") |
|||
.addHeader("Sec-Fetch-Mode", "cors") |
|||
.addHeader("Sec-Fetch-Site", "same-origin") |
|||
.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36") |
|||
.addHeader("X-Requested-With", "XMLHttpRequest") |
|||
.build(); |
|||
|
|||
// 执行请求并打印响应 |
|||
try (Response response = client.newCall(request).execute()) { |
|||
if (response.isSuccessful()) { |
|||
System.out.println("Response: " + response.body().string()+response.code()); |
|||
} else { |
|||
System.out.println("Error: " + response.code() + " - " + response.message()); |
|||
System.out.println("Response Body: " + response.body().string()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
private static OkHttpClient createClientWithProxy() { |
|||
OkHttpClient.Builder builder = new OkHttpClient().newBuilder() |
|||
.connectTimeout(30, TimeUnit.SECONDS) |
|||
.readTimeout(30, TimeUnit.SECONDS) |
|||
.writeTimeout(30, TimeUnit.SECONDS); |
|||
|
|||
if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) { |
|||
String proxy = proxyList.get(currentProxyIndex); |
|||
String[] proxyParts = proxy.split(":"); |
|||
if (proxyParts.length == 2) { |
|||
String proxyHost = proxyParts[0]; |
|||
int proxyPort = Integer.parseInt(proxyParts[1]); |
|||
builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, |
|||
new java.net.InetSocketAddress(proxyHost, proxyPort))); |
|||
System.out.println("使用代理: " + proxy); |
|||
} |
|||
} |
|||
return builder.build(); |
|||
} |
|||
} |
@ -0,0 +1,111 @@ |
|||
package com.example; |
|||
|
|||
import org.openqa.selenium.By; |
|||
import org.openqa.selenium.WebDriver; |
|||
import org.openqa.selenium.WebElement; |
|||
import org.openqa.selenium.chrome.ChromeDriver; |
|||
import org.openqa.selenium.chrome.ChromeOptions; |
|||
import org.openqa.selenium.support.ui.ExpectedConditions; |
|||
import org.openqa.selenium.support.ui.WebDriverWait; |
|||
import org.openqa.selenium.NoSuchElementException; |
|||
|
|||
import java.time.Duration; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
public class NSFAwardCrawler { |
|||
private static final int PAGE_SIZE = 30; // 每页基准条数 |
|||
|
|||
public static void main(String[] args) { |
|||
// 设置 ChromeDriver 路径 |
|||
System.setProperty("webdriver.chrome.driver", |
|||
"F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe"); |
|||
|
|||
ChromeOptions options = new ChromeOptions(); |
|||
WebDriver driver = new ChromeDriver(options); |
|||
|
|||
try { |
|||
String url = "https://www.nsf.gov/awardsearch/simpleSearchResult?queryText=ebola&ActiveAwards=true"; |
|||
driver.get(url); |
|||
|
|||
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); |
|||
List<String> allAwardIds = new ArrayList<>(); |
|||
int pageNumber = 1; |
|||
|
|||
while (true) { |
|||
System.out.println("Processing page " + pageNumber); |
|||
|
|||
// 等待页面加载完成 |
|||
wait.until(ExpectedConditions.presenceOfElementLocated(By.className("listview-item"))); |
|||
|
|||
// 获取当前页的结果项 |
|||
List<WebElement> resultItems = driver.findElements(By.className("listview-item")); |
|||
int currentPageSize = resultItems.size(); |
|||
System.out.println("Found " + currentPageSize + " items on page " + pageNumber); |
|||
|
|||
// 如果当前页没有结果,退出 |
|||
if (currentPageSize == 0) { |
|||
System.out.println("No items found on page " + pageNumber + ", stopping..."); |
|||
break; |
|||
} |
|||
|
|||
// 提取当前页的奖项 ID |
|||
for (WebElement item : resultItems) { |
|||
try { |
|||
String awardId = item.getAttribute("id"); |
|||
if (awardId != null && !awardId.isEmpty() && !allAwardIds.contains(awardId)) { |
|||
allAwardIds.add(awardId); |
|||
} |
|||
} catch (Exception e) { |
|||
System.out.println("Error processing item: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
// 判断是否需要分页:如果当前页条数小于 30,认为是最后一页 |
|||
if (currentPageSize < PAGE_SIZE) { |
|||
System.out.println("Page " + pageNumber + " has less than " + PAGE_SIZE + " items (" + currentPageSize + "), assuming last page, stopping..."); |
|||
break; |
|||
} |
|||
|
|||
// 检查下一页按钮 |
|||
try { |
|||
WebElement nextButton = driver.findElement(By.name("NEXT")); |
|||
boolean isEnabled = nextButton.isEnabled(); |
|||
System.out.println("Next button enabled: " + isEnabled); |
|||
|
|||
if (!isEnabled) { |
|||
System.out.println("Next button is disabled, stopping..."); |
|||
break; |
|||
} |
|||
|
|||
// 点击下一页 |
|||
nextButton.click(); |
|||
Thread.sleep(2000); // 等待页面加载 |
|||
pageNumber++; |
|||
} catch (NoSuchElementException e) { |
|||
System.out.println("Next button not found, stopping..."); |
|||
break; |
|||
} catch (Exception e) { |
|||
System.out.println("Error clicking next button: " + e.getMessage()); |
|||
break; |
|||
} |
|||
} |
|||
|
|||
// 打印所有结果 |
|||
System.out.println("Found " + allAwardIds.size() + " award IDs across all pages:"); |
|||
for (int i = 0; i < allAwardIds.size(); i++) { |
|||
System.out.println((i + 1) + ". " + allAwardIds.get(i)); |
|||
} |
|||
|
|||
} catch (Exception e) { |
|||
System.out.println("An error occurred: " + e.getMessage()); |
|||
} finally { |
|||
try { |
|||
Thread.sleep(2000); |
|||
} catch (InterruptedException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
driver.quit(); |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,130 @@ |
|||
package com.example; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
import org.openqa.selenium.By; |
|||
import org.openqa.selenium.Keys; |
|||
import org.openqa.selenium.WebDriver; |
|||
import org.openqa.selenium.WebElement; |
|||
import org.openqa.selenium.chrome.ChromeDriver; |
|||
import org.openqa.selenium.chrome.ChromeOptions; |
|||
import org.openqa.selenium.support.ui.ExpectedConditions; |
|||
import org.openqa.selenium.support.ui.WebDriverWait; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.time.Duration; |
|||
import java.util.Random; |
|||
|
|||
public class PatentscopeSeleniumCrawler { |
|||
private static final Logger LOGGER = LoggerFactory.getLogger(PatentscopeSeleniumCrawler.class); |
|||
private static final String SEARCH_URL = "https://patentscope.wipo.int/search/en/search.jsf"; |
|||
private static final String SEARCH_INPUT_ID = "simpleSearchForm:fpSearch:input"; |
|||
private static final String SEARCH_BUTTON_ID = "simpleSearchForm:fpSearch:j_idt1319"; |
|||
private static final Random RANDOM = new Random(); |
|||
|
|||
public static void main(String[] args) { |
|||
// 配置 ChromeDriver |
|||
System.setProperty("webdriver.chrome.driver", "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe"); |
|||
ChromeOptions options = new ChromeOptions(); |
|||
options.addArguments("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"); |
|||
options.addArguments("--disable-blink-features=AutomationControlled"); |
|||
// 非无头模式,便于调试 |
|||
WebDriver driver = null; |
|||
|
|||
try { |
|||
driver = new ChromeDriver(options); |
|||
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(15)); |
|||
|
|||
// Step 1: 访问搜索页面 |
|||
LOGGER.info("Navigating to {}", SEARCH_URL); |
|||
driver.get(SEARCH_URL); |
|||
Thread.sleep(2000 + RANDOM.nextInt(2000)); // 等待页面加载 |
|||
|
|||
// Step 2: 输入搜索关键词 |
|||
LOGGER.info("Entering search query: FP:(fever)"); |
|||
WebElement searchInput = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_INPUT_ID))); |
|||
searchInput.clear(); |
|||
searchInput.sendKeys("FP:(fever)"); |
|||
Thread.sleep(500 + RANDOM.nextInt(1000)); // 等待输入生效 |
|||
|
|||
// Step 3: 触发搜索 |
|||
LOGGER.info("Attempting to trigger search..."); |
|||
try { |
|||
// 方法 1: 点击搜索按钮 |
|||
WebElement searchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_BUTTON_ID))); |
|||
LOGGER.info("Clicking search button"); |
|||
searchButton.click(); |
|||
Thread.sleep(3000 + RANDOM.nextInt(2000)); // 等待 AJAX 和跳转 |
|||
} catch (Exception e) { |
|||
LOGGER.warn("Button click failed, trying Enter key: {}", e.getMessage()); |
|||
// 方法 2: 模拟回车 |
|||
searchInput.sendKeys(Keys.ENTER); |
|||
Thread.sleep(3000 + RANDOM.nextInt(2000)); |
|||
} |
|||
|
|||
// Step 4: 验证跳转 |
|||
String currentUrl = driver.getCurrentUrl(); |
|||
LOGGER.info("Current URL: {}", currentUrl); |
|||
if (!currentUrl.contains("result.jsf")) { |
|||
LOGGER.error("Failed to redirect to result.jsf, trying advanced search..."); |
|||
// 尝试高级搜索(备用) |
|||
driver.get("https://patentscope.wipo.int/search/en/search.jsf?advancedSearch=true"); |
|||
searchInput = wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:input"))); |
|||
searchInput.clear(); |
|||
searchInput.sendKeys("FP:(fever)"); |
|||
WebElement advSearchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:j_idt1208"))); |
|||
advSearchButton.click(); |
|||
Thread.sleep(3000 + RANDOM.nextInt(2000)); |
|||
currentUrl = driver.getCurrentUrl(); |
|||
LOGGER.info("Advanced search URL: {}", currentUrl); |
|||
} |
|||
|
|||
// Step 5: 解析结果页面 |
|||
if (currentUrl.contains("result.jsf")) { |
|||
LOGGER.info("Successfully reached result page"); |
|||
while (true) { |
|||
Document doc = Jsoup.parse(driver.getPageSource()); |
|||
Elements results = doc.select("div.result-row"); // 需确认选择器 |
|||
if (results.isEmpty()) { |
|||
LOGGER.warn("No results found, verify selector or query"); |
|||
} |
|||
|
|||
for (Element item : results) { |
|||
String title = item.select("a.result-title__text").text(); // 需确认 |
|||
String patentId = item.select("div.result__number").text(); // 需确认 |
|||
LOGGER.info("Title: {}", title.isEmpty() ? "N/A" : title); |
|||
LOGGER.info("Patent ID: {}", patentId.isEmpty() ? "N/A" : patentId); |
|||
} |
|||
|
|||
// 分页 |
|||
WebElement nextPage = driver.findElements(By.cssSelector("a.paginator__button--next:not(.is-disabled)")) |
|||
.stream() |
|||
.filter(WebElement::isDisplayed) |
|||
.findFirst() |
|||
.orElse(null); |
|||
if (nextPage == null) { |
|||
LOGGER.info("No more pages"); |
|||
break; |
|||
} |
|||
|
|||
LOGGER.info("Navigating to next page"); |
|||
nextPage.click(); |
|||
Thread.sleep(3000 + RANDOM.nextInt(2000)); |
|||
} |
|||
} else { |
|||
LOGGER.error("Still not on result page, check query or network"); |
|||
} |
|||
|
|||
} catch (Exception e) { |
|||
LOGGER.error("Error during crawling: {}", e.getMessage(), e); |
|||
} finally { |
|||
if (driver != null) { |
|||
driver.quit(); |
|||
LOGGER.info("WebDriver closed"); |
|||
} |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,25 @@ |
|||
package com.example; |
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
|
|||
/**
 * Prints the public IP information reported by {@code http://httpbin.org/ip}, which shows
 * the address the outside world sees for this JVM's outgoing HTTP connections.
 */
public class ProxyIPChecker {
    /**
     * Performs a single GET request and prints the response body.
     *
     * @param args unused
     * @throws Exception if the connection cannot be opened, times out, or reading fails
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://httpbin.org/ip");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        // Without timeouts an unreachable host or dead proxy would block forever.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(20000);

        StringBuilder response = new StringBuilder();
        // try-with-resources closes the reader even when reading fails part-way.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                response.append(inputLine);
            }
        } finally {
            conn.disconnect();
        }

        System.out.println("当前公网 IP 信息:");
        System.out.println(response.toString());
    }
}
@ -0,0 +1,496 @@ |
|||
package com.example;// 修改为你的包名 |
|||
|
|||
import java.awt.image.BufferedImage; |
|||
import java.io.*; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.net.URLEncoder; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.HashMap; |
|||
import java.util.HashSet; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
import java.util.Set; |
|||
import javax.imageio.ImageIO; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import net.sourceforge.tess4j.Tesseract; |
|||
import net.sourceforge.tess4j.TesseractException; |
|||
|
|||
public class ScraperWithCaptcha { |
|||
|
|||
// --- 需要根据目标网站修改的常量 --- |
|||
private static final String BASE_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php"; // *** 替换为目标网站包含表单和验证码的页面 URL *** |
|||
private static final String FORM_SUBMIT_URL = BASE_URL; // *** 表单提交的 URL,通常是页面本身或 action 属性指定的 URL *** |
|||
private static final String CAPTCHA_IMAGE_SRC_SUBSTRING = "captchasecurityimages.php"; // *** 验证码图片 src 中特有的字符串 *** |
|||
private static final String CAPTCHA_INPUT_SELECTOR = "input[name=T4]"; |
|||
private static final String TARGET_FORM_SELECTOR = "form"; // *** 如果页面有多个表单,指定目标表单的选择器,例如 "#myFormId" *** |
|||
|
|||
// --- 图像预处理相关的阈值,需要根据验证码样式调试 --- |
|||
private static final int BINARY_THRESHOLD = 128; // 二值化阈值 (0-255) |
|||
|
|||
// --- Tesseract 配置 (根据你的安装修改) --- |
|||
// Tesseract tessdata 文件夹的路径 |
|||
private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata"; // *** 请务必修改为你的实际路径 *** |
|||
|
|||
// --- 其他通用配置 --- |
|||
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"; |
|||
private Set<String> cookies = new HashSet<>(); // 存储 cookies |
|||
|
|||
public static void main(String[] args) { |
|||
ScraperWithCaptcha scraper = new ScraperWithCaptcha(); |
|||
try { |
|||
// 1. 获取包含表单和验证码的页面 |
|||
PageInfo pageInfo = scraper.fetchPage(BASE_URL, null, null, false); // 第一次 GET 不需要 Cookies 和 POST Data, 也不是 AJAX |
|||
|
|||
if (pageInfo.htmlContent == null || pageInfo.statusCode != HttpURLConnection.HTTP_OK) { |
|||
System.err.println("Failed to fetch the initial page. Status code: " + pageInfo.statusCode); |
|||
return; |
|||
} |
|||
|
|||
// 解析页面提取验证码信息和所有表单字段 |
|||
Document doc = Jsoup.parse(pageInfo.htmlContent, BASE_URL); |
|||
|
|||
// 提取验证码图片 URL |
|||
Element captchaImg = doc.selectFirst("img[src*=" + CAPTCHA_IMAGE_SRC_SUBSTRING + "]"); |
|||
String captchaImageUrl = null; |
|||
if (captchaImg != null) { |
|||
captchaImageUrl = captchaImg.absUrl("src"); // 获取绝对 URL |
|||
System.out.println("Found CAPTCHA image URL: " + captchaImageUrl); |
|||
} else { |
|||
System.err.println("CAPTCHA image not found using selector: img[src*=" + CAPTCHA_IMAGE_SRC_SUBSTRING + "]"); |
|||
// 如果找不到验证码,可能无法继续 |
|||
return; |
|||
} |
|||
|
|||
// 提取验证码输入框的 name |
|||
Element captchaInput = doc.selectFirst(CAPTCHA_INPUT_SELECTOR); |
|||
String captchaInputName = null; |
|||
if (captchaInput != null) { |
|||
captchaInputName = captchaInput.attr("name"); |
|||
System.out.println("Found CAPTCHA input field name: " + captchaInputName); |
|||
} else { |
|||
System.err.println("CAPTCHA input field not found using selector: " + CAPTCHA_INPUT_SELECTOR); |
|||
// 如果找不到输入框,也无法提交 |
|||
return; |
|||
} |
|||
|
|||
// 2. 下载验证码图片 |
|||
BufferedImage originalCaptchaImage = scraper.downloadImage(captchaImageUrl); |
|||
System.out.println("Captcha image downloaded."); |
|||
|
|||
// 3. 预处理图片 |
|||
BufferedImage preprocessedImage = scraper.preprocessImage(originalCaptchaImage); |
|||
System.out.println("Image preprocessed (saved as preprocessed_captcha.png)."); |
|||
|
|||
// 4. 识别验证码 |
|||
String captchaCode = scraper.recognizeCaptcha(preprocessedImage); |
|||
|
|||
if (captchaCode != null && !captchaCode.isEmpty()) { |
|||
System.out.println("Recognized CAPTCHA: " + captchaCode); |
|||
|
|||
// 5. 构建包含验证码的 POST 数据 |
|||
// 从页面表单中提取所有字段,并设置其值 |
|||
Map<String, String> formData = scraper.buildFormDataMap(doc, captchaInputName, captchaCode); |
|||
|
|||
String postData = scraper.buildPostData(formData); |
|||
System.out.println("Built POST data: " + postData); |
|||
|
|||
// 6. 提交表单 |
|||
// 通常是标准的 POST 请求 |
|||
PageInfo postResponseInfo = scraper.fetchPage(FORM_SUBMIT_URL, postData, scraper.getCookieHeader(), false); // 非 AJAX POST |
|||
|
|||
System.out.println("Form submitted. Response status code: " + postResponseInfo.statusCode); |
|||
System.out.println("POST Response Body (partial): " + (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.length() > 500 ? postResponseInfo.htmlContent.substring(0, 500) + "..." : postResponseInfo.htmlContent)); // 打印部分响应查看 |
|||
|
|||
// 7. 检查响应判断是否成功 |
|||
// 对于标准表单提交,成功通常是重定向 (302) 或返回新的页面 |
|||
if (postResponseInfo.statusCode == HttpURLConnection.HTTP_MOVED_TEMP || postResponseInfo.statusCode == HttpURLConnection.HTTP_SEE_OTHER || postResponseInfo.statusCode == HttpURLConnection.HTTP_MOVED_PERM) { |
|||
String redirectUrl = postResponseInfo.redirectUrl; |
|||
System.out.println("POST resulted in redirect. Location: " + redirectUrl); |
|||
// TODO: 如果重定向到成功页面,可以继续爬取该页面 |
|||
// 如果重定向回原页面或错误页,说明提交失败 (验证码错误或其他原因) |
|||
if (redirectUrl != null && redirectUrl.equals(BASE_URL)) { // <-- 检查是否重定向回原页面,需根据实际情况判断 |
|||
System.err.println("Submission failed, redirected back to the form page."); |
|||
// TODO: 实现重试逻辑 (需要重新获取页面和验证码) |
|||
} |
|||
|
|||
} else if (postResponseInfo.statusCode == HttpURLConnection.HTTP_OK) { |
|||
System.out.println("POST returned OK (200). Analyzing response content..."); |
|||
// TODO: 解析 postResponseInfo.htmlContent 来判断是否成功(例如查找成功标志,或检查是否有验证码错误提示) |
|||
if (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.contains("成功标志字符串")) { // <-- *** 根据实际成功响应的特征修改 *** |
|||
System.out.println("Form submission appears successful based on content."); |
|||
// TODO: 从 postResponseInfo.htmlContent 中提取你想要的数据 |
|||
} else if (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.contains("验证码错误提示字符串")) { // <-- *** 根据实际验证码错误提示修改 *** |
|||
System.err.println("CAPTCHA appears incorrect. Need to retry."); |
|||
// TODO: 实现重试逻辑 (可能需要重新获取页面,因为验证码会刷新) |
|||
} else { |
|||
System.out.println("POST returned 200, but content not clearly indicating success or failure."); |
|||
// 需要更详细地检查响应内容 |
|||
} |
|||
} |
|||
else { |
|||
System.err.println("POST request failed with status code: " + postResponseInfo.statusCode); |
|||
} |
|||
|
|||
|
|||
} else { |
|||
System.err.println("CAPTCHA recognition failed. Cannot submit form."); |
|||
// TODO: 实现识别失败的重试逻辑 |
|||
} |
|||
|
|||
|
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
System.err.println("An I/O error occurred: " + e.getMessage()); |
|||
} catch (TesseractException e) { |
|||
e.printStackTrace(); |
|||
System.err.println("A Tesseract OCR error occurred: " + e.getMessage()); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
System.err.println("An unexpected error occurred: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 发起 HTTP 请求 (GET 或 POST),获取页面内容和 Cookies。 |
|||
* |
|||
* @param urlString 请求 URL |
|||
* @param postData POST 请求体数据 (GET 请求时为 null) |
|||
* @param cookieHeader 请求头中的 Cookie 值 (第一次请求时为 null) |
|||
* @param isAjaxPost 是否是 AJAX POST 请求 (影响请求头设置) |
|||
* @return PageInfo 对象,包含响应信息和内容 |
|||
* @throws IOException |
|||
*/ |
|||
private PageInfo fetchPage(String urlString, String postData, String cookieHeader, boolean isAjaxPost) throws IOException { |
|||
URL url = new URL(urlString); |
|||
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); |
|||
|
|||
if (postData != null) { |
|||
conn.setRequestMethod("POST"); |
|||
conn.setDoOutput(true); // 允许写入 POST 数据 |
|||
} else { |
|||
conn.setRequestMethod("GET"); |
|||
} |
|||
|
|||
conn.setInstanceFollowRedirects(false); |
|||
conn.setConnectTimeout(10000); |
|||
conn.setReadTimeout(20000); |
|||
|
|||
// 设置请求头 (不包括 Cookie,Cookie 在后面统一处理) |
|||
conn.setRequestProperty("User-Agent", USER_AGENT); |
|||
if (cookieHeader != null) { |
|||
conn.setRequestProperty("Cookie", cookieHeader); |
|||
} |
|||
conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); |
|||
if (postData != null) { |
|||
conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
if(isAjaxPost) { |
|||
conn.setRequestProperty("X-Requested-With", "XMLHttpRequest"); |
|||
conn.setRequestProperty("X-MicrosoftAjax", "Delta=true"); |
|||
} |
|||
try { |
|||
conn.setRequestProperty("Referer", new URL(urlString).getProtocol() + "://" + new URL(urlString).getHost() + new URL(urlString).getPath()); |
|||
conn.setRequestProperty("Origin", new URL(urlString).getProtocol() + "://" + new URL(urlString).getHost()); |
|||
} catch (Exception e) { } |
|||
} |
|||
|
|||
// --- 写入 POST 数据 (如果是 POST 请求) --- |
|||
// 这一块必须在读取响应之前 |
|||
if (postData != null) { |
|||
try (OutputStream os = conn.getOutputStream()) { // 获取输出流,会触发连接 |
|||
byte[] input = postData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input, 0, input.length); |
|||
} // os.close() 在 try-with-resources 结束时自动调用,数据在这里被发送 |
|||
} |
|||
// --- End POST Data --- |
|||
|
|||
|
|||
// --- 现在可以获取响应信息了 --- |
|||
// 调用 getResponseCode() 会发送完整的请求 (包括头和体) 并接收响应头 |
|||
int statusCode = conn.getResponseCode(); |
|||
String redirectUrl = null; |
|||
if (statusCode == HttpURLConnection.HTTP_MOVED_TEMP || statusCode == HttpURLConnection.HTTP_SEE_OTHER || statusCode == HttpURLConnection.HTTP_MOVED_PERM) { |
|||
redirectUrl = conn.getHeaderField("Location"); |
|||
} |
|||
|
|||
// --- 处理 Cookies (从响应头读取) --- |
|||
// 这一块现在在获取响应码之后执行 |
|||
Map<String, List<String>> headerFields = conn.getHeaderFields(); |
|||
List<String> cookiesHeader = headerFields.get("Set-Cookie"); |
|||
if (cookiesHeader != null) { |
|||
for (String cookie : cookiesHeader) { |
|||
String cookieValue = cookie.split(";")[0]; |
|||
this.cookies.add(cookieValue); |
|||
} |
|||
} |
|||
// --- End Cookies --- |
|||
|
|||
|
|||
StringBuilder content = new StringBuilder(); |
|||
// 只有当状态码表示成功 (2xx) 或客户端错误 (4xx) 且有响应体时才读取 |
|||
if (statusCode >= 200 && statusCode < 300 || statusCode >= 400 && statusCode < 500 && conn.getContentLength() > 0) { |
|||
try (InputStream is = (statusCode >= 200 && statusCode < 300) ? conn.getInputStream() : conn.getErrorStream(); |
|||
BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
content.append(line).append("\n"); |
|||
} |
|||
} catch (IOException e) { |
|||
System.err.println("Error reading response body for status " + statusCode + ": " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
conn.disconnect(); |
|||
|
|||
PageInfo pageInfo = new PageInfo(); |
|||
pageInfo.statusCode = statusCode; |
|||
pageInfo.redirectUrl = redirectUrl; |
|||
pageInfo.htmlContent = content.toString(); |
|||
|
|||
return pageInfo; |
|||
} |
|||
|
|||
/** |
|||
* 从页面表单中提取所有字段,并设置验证码字段的值 |
|||
* @param doc Jsoup 解析后的 Document 对象 |
|||
* @param captchaInputName 验证码输入框的 name 属性值 |
|||
* @param captchaCode 识别出的验证码字符串 |
|||
* @return 包含所有表单字段名称和值的 Map |
|||
*/ |
|||
private Map<String, String> buildFormDataMap(Document doc, String captchaInputName, String captchaCode) { |
|||
Map<String, String> formData = new HashMap<>(); |
|||
Element form = doc.selectFirst(TARGET_FORM_SELECTOR); // 找到目标表单 |
|||
|
|||
if (form == null) { |
|||
System.err.println("Target form not found using selector: " + TARGET_FORM_SELECTOR); |
|||
return formData; // 返回空 Map |
|||
} |
|||
|
|||
Elements formElements = form.select("input, select, textarea"); // 查找表单内的所有输入元素 |
|||
|
|||
for (Element element : formElements) { |
|||
String name = element.attr("name"); |
|||
String type = element.attr("type"); // 获取 input 的类型 |
|||
String value = element.attr("value"); // 获取默认 value |
|||
|
|||
if (name == null || name.isEmpty()) { |
|||
continue; // 忽略没有 name 属性的元素 |
|||
} |
|||
|
|||
// 处理不同类型的输入元素 |
|||
if ("text".equals(type) || "hidden".equals(type) || "password".equals(type)) { |
|||
if (name.equals(captchaInputName)) { |
|||
// 这是验证码输入框,填入识别结果 |
|||
formData.put(name, captchaCode); |
|||
} else { |
|||
// 其他文本/隐藏字段,使用默认值或留空,取决于需求 |
|||
formData.put(name, value != null ? value : ""); // 通常爬取时这些是空的 |
|||
} |
|||
} else if ("checkbox".equals(type)) { |
|||
// 复选框,如果被勾选则添加到 formData |
|||
if (element.hasAttr("checked")) { |
|||
formData.put(name, value != null ? value : "on"); // 复选框的值通常是 "on" 或 value 属性的值 |
|||
} |
|||
} else if ("radio".equals(type)) { |
|||
// 单选按钮,如果被选中则添加到 formData |
|||
if (element.hasAttr("checked")) { |
|||
formData.put(name, value != null ? value : "on"); // 单选按钮的值通常是 value 属性的值 |
|||
} |
|||
} else if ("select".equals(element.tagName().toLowerCase())) { |
|||
// 下拉列表,找到被选中的 option 的值 |
|||
Element selectedOption = element.selectFirst("option[selected]"); |
|||
if (selectedOption != null) { |
|||
formData.put(name, selectedOption.attr("value")); |
|||
} else { |
|||
// 如果没有选中的项,可能需要根据网站逻辑选择第一个或默认项 |
|||
// 或者如果网站要求必须有值,这里需要更复杂的处理 |
|||
Element firstOption = element.selectFirst("option"); |
|||
if (firstOption != null) { |
|||
formData.put(name, firstOption.attr("value")); |
|||
} else { |
|||
formData.put(name, ""); // 没有选项,留空 |
|||
} |
|||
} |
|||
} else if ("textarea".equals(element.tagName().toLowerCase())) { |
|||
// 文本域,获取其文本内容 |
|||
formData.put(name, element.text()); |
|||
} |
|||
// TODO: 根据需要处理其他类型的 input,如 file, submit, image, reset 等 |
|||
// 注意:submit, image 类型的 input 通常只有在它们被点击时才会被包含在表单提交数据中,并且它们的值是按钮的值 |
|||
} |
|||
|
|||
// TODO: 如果网站通过 JavaScript 动态添加或修改了表单字段,你需要找到这些字段并手动添加到 formData 中。 |
|||
// TODO: 有些表单提交按钮本身会作为 POST 数据的一部分被发送(例如 name="submitButton" value="提交") |
|||
// 你可能需要确定哪个按钮触发了提交,并将它的 name=value 对添加到 formData 中。 |
|||
|
|||
return formData; |
|||
} |
|||
|
|||
|
|||
/** |
|||
* 下载验证码图片 (Java 8 兼容版本) |
|||
* @param imageUrl 图片的完整 URL |
|||
* @return 图片的 BufferedImage 对象 |
|||
* @throws IOException 如果下载失败 |
|||
*/ |
|||
public BufferedImage downloadImage(String imageUrl) throws IOException { |
|||
URL url = new URL(imageUrl); |
|||
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); |
|||
conn.setRequestMethod("GET"); |
|||
conn.setRequestProperty("User-Agent", USER_AGENT); |
|||
// 下载图片时通常也需要带上 cookies,确保会话一致性 |
|||
conn.setRequestProperty("Cookie", getCookieHeader()); |
|||
|
|||
|
|||
int responseCode = conn.getResponseCode(); |
|||
if (responseCode == HttpURLConnection.HTTP_OK) { |
|||
try (InputStream is = conn.getInputStream()) { |
|||
// --- 兼容 Java 8 及更早版本读取 InputStream --- |
|||
ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
|||
byte[] buffer = new byte[4096]; // 缓冲区大小 |
|||
int bytesRead; |
|||
while ((bytesRead = is.read(buffer)) != -1) { |
|||
baos.write(buffer, 0, bytesRead); |
|||
} |
|||
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); |
|||
// --- End 兼容代码 --- |
|||
|
|||
BufferedImage image = ImageIO.read(bais); |
|||
if (image == null) { |
|||
throw new IOException("Failed to read image stream. Check image format or content for URL: " + imageUrl); |
|||
} |
|||
return image; |
|||
} |
|||
} else { |
|||
throw new IOException("Failed to download image. HTTP error code: " + responseCode + " for URL: " + imageUrl); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 对验证码图片进行预处理 (基础示例:转灰度+二值化) |
|||
* 这是最关键的部分,需要根据验证码样式调整 |
|||
* @param originalImage 原始图片 |
|||
* @return 预处理后的图片 |
|||
*/ |
|||
public BufferedImage preprocessImage(BufferedImage originalImage) { |
|||
// TODO: 这是图像预处理的重点,需要根据实际验证码样式进行调整和优化 |
|||
// 保存原始图片方便对比 |
|||
try { |
|||
File originalFile = new File("original_captcha.png"); |
|||
ImageIO.write(originalImage, "png", originalFile); |
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
|
|||
// 基础处理:转灰度 -> 二值化 |
|||
int width = originalImage.getWidth(); |
|||
int height = originalImage.getHeight(); |
|||
BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY); |
|||
grayImage.getGraphics().drawImage(originalImage, 0, 0, null); |
|||
|
|||
BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); |
|||
// 二值化阈值,需要调整 (0-255) |
|||
for (int y = 0; y < height; y++) { |
|||
for (int x = 0; x < width; x++) { |
|||
int gray = grayImage.getRaster().getSample(x, y, 0); |
|||
if (gray < BINARY_THRESHOLD) { |
|||
binaryImage.getRaster().setSample(x, y, 0, 0); // 黑色 |
|||
} else { |
|||
binaryImage.getRaster().setSample(x, y, 0, 1); // 白色 |
|||
} |
|||
} |
|||
} |
|||
|
|||
// TODO: 更高级的预处理包括:去噪点、去干扰线、字符分割、倾斜校正等 |
|||
// 如果验证码只有数字,可以尝试裁剪掉图片上下左右的空白或干扰区域 |
|||
|
|||
// 为了调试,将预处理后的图片保存下来查看效果 |
|||
try { |
|||
File outputfile = new File("preprocessed_captcha.png"); |
|||
ImageIO.write(binaryImage, "png", outputfile); |
|||
System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath()); |
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
|
|||
return binaryImage; // 返回预处理后的图片 |
|||
} |
|||
|
|||
/** |
|||
* 使用 Tess4J 识别图片中的文字 |
|||
* @param image 待识别的图片 (最好是预处理后的) |
|||
* @return 识别出的字符串 (如果失败返回 null 或空字符串) |
|||
*/ |
|||
public String recognizeCaptcha(BufferedImage image) throws TesseractException { |
|||
Tesseract tesseract = new Tesseract(); |
|||
|
|||
// 设置 tessdata 路径 (如果 TESSDATA_PATH 已正确设置且 Tesseract 安装正确,这行可能不是必需的,Tess4J 会自动查找) |
|||
if (TESSDATA_PATH != null && !TESSDATA_PATH.isEmpty()) { |
|||
tesseract.setDatapath(TESSDATA_PATH); |
|||
} else { |
|||
System.err.println("WARNING: TESSDATA_PATH not set. Tess4J will try to find tessdata automatically."); |
|||
} |
|||
|
|||
tesseract.setLanguage("eng"); // 设置识别语言为英文 (通常包含数字) |
|||
// 如果验证码只有数字,可以尝试设置仅识别数字,这有助于提高准确率 |
|||
// tesseract.setTessVariable("tessedit_char_whitelist", "0123456789"); // 方法名请查阅 Tess4J 文档确认 |
|||
|
|||
String result = tesseract.doOCR(image); |
|||
// 清理识别结果,去除空格或换行符等 |
|||
result = result != null ? result.trim().replaceAll("[^0-9a-zA-Z]", "") : ""; // 根据验证码内容(数字、字母)调整清理规则 |
|||
|
|||
return result; |
|||
} |
|||
|
|||
/** |
|||
* 构建用于 POST 提交的表单数据字符串 |
|||
* @param formDataMap 包含所有表单字段名称和值的 Map |
|||
* @return URL 编码后的表单数据字符串 |
|||
* @throws IOException |
|||
*/ |
|||
private String buildPostData(Map<String, String> formDataMap) throws IOException { |
|||
StringBuilder postDataBuilder = new StringBuilder(); |
|||
boolean first = true; |
|||
// 遍历 Map 构建 POST 数据。如果需要特定顺序,使用 LinkedHashMap |
|||
for (Map.Entry<String, String> entry : formDataMap.entrySet()) { |
|||
if (!first) { |
|||
postDataBuilder.append("&"); |
|||
} |
|||
postDataBuilder.append(URLEncoder.encode(entry.getKey(), StandardCharsets.UTF_8.name())) |
|||
.append("=") |
|||
.append(URLEncoder.encode(entry.getValue() != null ? entry.getValue() : "", StandardCharsets.UTF_8.name())); |
|||
first = false; |
|||
} |
|||
return postDataBuilder.toString(); |
|||
} |
|||
|
|||
/** |
|||
* 将存储的 cookies 格式化为 HTTP 请求头部的 Cookie 字符串 |
|||
*/ |
|||
private String getCookieHeader() { |
|||
StringBuilder cookieHeaderBuilder = new StringBuilder(); |
|||
boolean first = true; |
|||
for (String cookie : this.cookies) { |
|||
if (!first) { |
|||
cookieHeaderBuilder.append("; "); |
|||
} |
|||
cookieHeaderBuilder.append(cookie); |
|||
first = false; |
|||
} |
|||
return cookieHeaderBuilder.toString(); |
|||
} |
|||
|
|||
|
|||
// Helper class to hold information extracted from a page fetch |
|||
private static class PageInfo { |
|||
int statusCode; |
|||
String redirectUrl; // 如果发生重定向 |
|||
String htmlContent; // 页面响应内容 |
|||
// 这里不再包含 ASP.NET 特有的字段,因为它是通用的 |
|||
} |
|||
} |
@ -0,0 +1,74 @@ |
|||
package com.example; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
/**
 * Splits a flattened record of the form
 * {@code postTime:...,title:...,content:...,fileList:[url, url, ...]} into its fields.
 */
public class StringFieldExtractor {

    /** Marker that opens the bracketed attachment list. */
    private static final String FILE_LIST_PREFIX = "fileList:[";

    /** A comma counts as a field separator only when a {@code key:} token follows it. */
    private static final Pattern FIELD_SEPARATOR = Pattern.compile(",(?=\\w+:)");

    /** List entries are separated by a comma directly in front of the next http(s) URL. */
    private static final Pattern URL_SEPARATOR = Pattern.compile(",\\s*(?=https?://)");

    /**
     * Parses a built-in sample record and prints the extracted fields.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String input = "postTime:05-06-2024 00:00:00,title:PT/013/2024,content:澳門大學-N21科研大樓六樓智慧城市物聯網國家重點實驗室(澳門大學)建造工程 OBRAS DE CONSTRUÇÃO DO LABORATÓRIO DE REFERÊNCIA DO ESTADO DE INTERNET DAS COISAS PARA A CIDADE INTELIGENTE (UNIVERSIDADE DE MACAU), LOCALIZADO NO 6.º ANDAR DO EDIFÍCIO DE INVESTIGAÇÃO CIENTÍFICA N21 DA UNIVERSIDADE DE MACAU,fileList:[https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-CHI.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-ENG-1.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/07/開標結果.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/11/判給結果-N21-6G.pdf###pdf]";

        try {
            // The list is handled first because its brackets and commas would otherwise
            // interfere with splitting the scalar fields.
            List<String> fileList = extractFileList(input);
            String[] fields = extractFields(stripFileList(input));

            System.out.println("postTime: " + fields[0]);
            System.out.println("title: " + fields[1]);
            System.out.println("content: " + fields[2]);
            System.out.println("fileList: " + fileList);
        } catch (Exception e) {
            System.err.println("Parsing error: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Returns the entries of the {@code fileList:[...]} section.
     *
     * @param input the full record
     * @return trimmed list entries; empty when the section is missing, unterminated or blank
     */
    static List<String> extractFileList(String input) {
        List<String> files = new ArrayList<>();
        int start = input.indexOf(FILE_LIST_PREFIX);
        int end = input.lastIndexOf(']');
        if (start == -1 || end <= start) {
            return files;
        }
        String listContent = input.substring(start + FILE_LIST_PREFIX.length(), end).trim();
        if (!listContent.isEmpty()) {
            for (String url : URL_SEPARATOR.split(listContent)) {
                files.add(url.trim());
            }
        }
        return files;
    }

    /**
     * Removes the {@code fileList:[...]} section and everything after it, together with the
     * comma that separates it from the preceding field.
     *
     * @param input the full record
     * @return the record without its file list; the input itself when there is no terminated
     *         list section
     */
    static String stripFileList(String input) {
        int start = input.indexOf(FILE_LIST_PREFIX);
        if (start == -1 || input.lastIndexOf(']') <= start) {
            return input;
        }
        String head = input.substring(0, start);
        // Only drop a separator that is actually there; the list may be the first field.
        return head.endsWith(",") ? head.substring(0, head.length() - 1) : head;
    }

    /**
     * Extracts the scalar fields from a record whose file list has already been removed.
     *
     * @param input {@code key:value} pairs separated by commas
     * @return {@code {postTime, title, content}}; an element is {@code null} when its key is absent
     */
    static String[] extractFields(String input) {
        String[] result = new String[3];
        // At most three pieces, so commas inside the third piece are left untouched.
        for (String field : FIELD_SEPARATOR.split(input, 3)) {
            String[] keyValue = field.split(":", 2);
            if (keyValue.length != 2) {
                continue;
            }
            String value = keyValue[1].trim();
            switch (keyValue[0].trim()) {
                case "postTime":
                    result[0] = value;
                    break;
                case "title":
                    result[1] = value;
                    break;
                case "content":
                    result[2] = value;
                    break;
                default:
                    break;
            }
        }
        return result;
    }
}
@ -0,0 +1,60 @@ |
|||
package com.example; |
|||
|
|||
import io.github.bonigarcia.wdm.WebDriverManager; |
|||
import org.openqa.selenium.By; |
|||
import org.openqa.selenium.WebDriver; |
|||
import org.openqa.selenium.WebElement; |
|||
import org.openqa.selenium.chrome.ChromeDriver; |
|||
|
|||
import java.util.List; |
|||
|
|||
public class WipoPatentsSelenium { |
|||
|
|||
public static void main(String[] args) throws InterruptedException { |
|||
// 自动管理驱动 |
|||
WebDriverManager.chromedriver().setup(); |
|||
WebDriver driver = new ChromeDriver(); |
|||
|
|||
try { |
|||
driver.get("https://patentscope.wipo.int/search/en/result.jsf?query=FP:(AI)"); |
|||
|
|||
// 等待页面加载(粗略等待) |
|||
Thread.sleep(3000); |
|||
|
|||
int maxPages = 3; |
|||
int currentPage = 1; |
|||
|
|||
while (currentPage <= maxPages) { |
|||
System.out.println("📄 当前第 " + currentPage + " 页:"); |
|||
|
|||
// 找到所有结果项 |
|||
List<WebElement> results = driver.findElements(By.cssSelector(".resultitem")); |
|||
|
|||
for (WebElement result : results) { |
|||
String title = result.findElement(By.cssSelector(".resulttitle")).getText(); |
|||
String pubNum = result.findElement(By.cssSelector(".pubNumber")).getText(); |
|||
System.out.println("🔹 " + pubNum + " - " + title); |
|||
} |
|||
|
|||
// 查找“下一页”按钮,点击 |
|||
WebElement nextButton = null; |
|||
try { |
|||
nextButton = driver.findElement(By.cssSelector("a[title='Next']")); |
|||
} catch (Exception e) { |
|||
System.out.println("✅ 已到最后一页或按钮未找到"); |
|||
break; |
|||
} |
|||
|
|||
if (nextButton != null && nextButton.isDisplayed()) { |
|||
nextButton.click(); |
|||
currentPage++; |
|||
Thread.sleep(3000); // 等待下一页加载 |
|||
} else { |
|||
break; |
|||
} |
|||
} |
|||
} finally { |
|||
driver.quit(); |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,594 @@ |
|||
package com.example; |
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import okhttp3.*; |
|||
import org.apache.kafka.clients.producer.KafkaProducer; |
|||
import org.apache.kafka.clients.producer.ProducerConfig; |
|||
import org.apache.kafka.clients.producer.ProducerRecord; |
|||
import org.apache.kafka.common.serialization.StringSerializer; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.IOException; |
|||
import java.io.InputStreamReader; |
|||
import java.io.OutputStream; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.net.URLEncoder; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Paths; |
|||
import java.text.ParseException; |
|||
import java.text.SimpleDateFormat; |
|||
import java.time.LocalDate; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.*; |
|||
import java.util.concurrent.ExecutorService; |
|||
import java.util.concurrent.Executors; |
|||
import java.util.concurrent.TimeUnit; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class cliniTopic { |
|||
private static final String TOPIC_NAME = "cliniTopic";
private static final String BOOTSTRAP_SERVERS = "localhost:9092";
private static KafkaProducer<String, String> producer;
private static ObjectMapper objectMapper = new ObjectMapper();
private static final Random random = new Random();
private static List<String> proxyList = new ArrayList<>(); // proxy pool
private static int currentProxyIndex = 0; // index of the proxy currently in use

// Creates the Kafka producer and loads the proxy pool from proxy.txt when the class is loaded.
static {
    final String stringSerializer = StringSerializer.class.getName();

    Properties kafkaSettings = new Properties();
    kafkaSettings.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
    kafkaSettings.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, stringSerializer);
    kafkaSettings.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, stringSerializer);
    kafkaSettings.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all replicas to acknowledge
    kafkaSettings.put(ProducerConfig.RETRIES_CONFIG, 3); // number of send retries
    producer = new KafkaProducer<>(kafkaSettings);

    // A missing or unreadable proxy.txt is only reported; the proxy pool then stays empty.
    try {
        proxyList = Files.readAllLines(Paths.get("proxy.txt"));
        String status = proxyList.isEmpty()
                ? "警告: proxy.txt 为空,未加载任何代理"
                : "成功加载 " + proxyList.size() + " 个代理";
        System.out.println(status);
    } catch (IOException e) {
        System.err.println("读取 proxy.txt 失败: " + e.getMessage());
    }
}
|||
|
|||
public static void main(String[] args) throws IOException, InterruptedException { |
|||
List<String> keywords = Files.readAllLines(Paths.get("keywords.txt")); |
|||
List<String> cleanedKeywords = new ArrayList<>(); |
|||
for (String keyword : keywords) { |
|||
String cleaned = keyword.split(",")[0].trim(); // 取逗号前的部分并去除首尾空格 |
|||
cleanedKeywords.add(cleaned); |
|||
} |
|||
ExecutorService executor = Executors.newFixedThreadPool(4); // 4 个线程 |
|||
for (String keyword : cleanedKeywords) { |
|||
executor.submit(() -> { |
|||
try { |
|||
int sleepTime = random.nextInt(1001) + 30000; |
|||
for (Integer i=1;i<=7;i++){ |
|||
final Integer pageNum = i; |
|||
Map list = list(keyword,i); |
|||
List<String> urls = (List<String>) list.get("listUrl"); |
|||
if (urls.isEmpty()){ |
|||
System.out.println("没有关键词"+keyword+"检索结果"); |
|||
break; |
|||
} |
|||
Integer count = Integer.parseInt(String.valueOf(list.get("count"))); |
|||
Integer totalPage = Integer.parseInt(String.valueOf(list.get("totalPage"))); |
|||
for(String url:urls){ |
|||
Map<String,Object> result = content(url); |
|||
Thread.sleep(sleepTime); |
|||
String registNum = String.valueOf(result.get("registNum")); |
|||
String crawlUrl = String.valueOf(result.get("crawlUrl")); |
|||
|
|||
try { |
|||
String jsonValue = objectMapper.writeValueAsString(result); |
|||
ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue); |
|||
|
|||
producer.send(record, (metadata, exception) -> { |
|||
if (exception == null) { |
|||
System.out.println("成功发送到Kafka - Partition: " + metadata.partition() + |
|||
", Offset: " + metadata.offset() + ", "+crawlUrl + ", "+ keyword + " , " + pageNum ); |
|||
} else { |
|||
System.err.println("发送到Kafka失败: " + exception.getMessage()); |
|||
} |
|||
}); |
|||
} catch (Exception e) { |
|||
System.err.println("序列化或发送Kafka消息失败: " + e.getMessage()); |
|||
} |
|||
Thread.sleep(sleepTime); |
|||
|
|||
} |
|||
if(count<10||totalPage==i){ |
|||
System.out.println("关键词"+keyword+"已检索完毕"); |
|||
break; |
|||
} |
|||
|
|||
} |
|||
} catch (Exception e) { |
|||
System.err.println("处理 " + keyword + " 失败: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
}); |
|||
} |
|||
executor.shutdown(); |
|||
executor.awaitTermination(5, TimeUnit.HOURS); |
|||
producer.close(); |
|||
} |
|||
|
|||
private static Map<String,Object> list(String keyword,Integer page) throws Exception{ |
|||
Map<String,Object> map = new HashMap<>(); |
|||
String baseUrl = "https://www.drks.de/search/de"; |
|||
String hostUrl = "https://www.drks.de"; |
|||
String cleanUrl = "https://www.drks.de/search/de/results"; |
|||
System.out.println("Pure URL: " + cleanUrl); |
|||
|
|||
System.out.println("Page Number: " + page); |
|||
|
|||
// 存储 cookies |
|||
Set<String> cookieSet = new HashSet<>(); |
|||
String sessionId = null; |
|||
|
|||
// 第一步:初始 GET 请求,获取 cookies 和 ViewState |
|||
URL initialUrl = new URL(baseUrl); |
|||
HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); |
|||
initialConn.setRequestMethod("GET"); |
|||
initialConn.setInstanceFollowRedirects(false); |
|||
initialConn.setConnectTimeout(10000); |
|||
initialConn.setReadTimeout(10000); |
|||
|
|||
// 捕获 cookies |
|||
sessionId = updateCookies(initialConn, cookieSet); |
|||
System.out.println("Initial Cookies: " + cookieSet); |
|||
System.out.println("Initial Session ID: " + sessionId); |
|||
|
|||
// 读取响应内容以获取 ViewState |
|||
BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); |
|||
StringBuilder content = new StringBuilder(); |
|||
String inputLine; |
|||
while ((inputLine = in.readLine()) != null) { |
|||
content.append(inputLine); |
|||
} |
|||
in.close(); |
|||
initialConn.disconnect(); |
|||
|
|||
// 提取初始 ViewState |
|||
String initialViewState = extractViewState(content.toString()); |
|||
System.out.println("Initial ViewState: " + initialViewState); |
|||
|
|||
// 第二步:发送搜索 POST 请求 |
|||
HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection(); |
|||
searchConn.setRequestMethod("POST"); |
|||
searchConn.setInstanceFollowRedirects(false); |
|||
searchConn.setDoOutput(true); |
|||
searchConn.setConnectTimeout(10000); |
|||
searchConn.setReadTimeout(10000); |
|||
|
|||
// 设置搜索请求的请求头 |
|||
searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
searchConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
searchConn.setRequestProperty("Origin", "https://www.drks.de"); |
|||
searchConn.setRequestProperty("Referer", baseUrl); |
|||
searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
// 构建搜索请求的 POST 数据 |
|||
String searchPostData = buildSearchPostData(initialViewState,keyword); |
|||
|
|||
// 发送搜索 POST 请求 |
|||
try (OutputStream os = searchConn.getOutputStream()) { |
|||
byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input, 0, input.length); |
|||
} |
|||
|
|||
// 更新 cookies |
|||
String searchSessionId = updateCookies(searchConn, cookieSet); |
|||
System.out.println("Search Cookies: " + cookieSet); |
|||
System.out.println("Search Session ID: " + searchSessionId); |
|||
|
|||
// 处理搜索响应 |
|||
int searchResponseCode = searchConn.getResponseCode(); |
|||
System.out.println("Search Response Code: " + searchResponseCode); |
|||
String redirectUrl = searchConn.getHeaderField("Location"); |
|||
searchConn.disconnect(); |
|||
|
|||
if (searchResponseCode != 302 || redirectUrl == null) { |
|||
System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode); |
|||
return null; |
|||
} |
|||
System.out.println("Redirect URL (raw): " + redirectUrl); |
|||
|
|||
// 解析相对 URL |
|||
if (!redirectUrl.startsWith("http")) { |
|||
redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? redirectUrl : "/" + redirectUrl); |
|||
} |
|||
System.out.println("Resolved Redirect URL: " + redirectUrl); |
|||
|
|||
// 第三步:跟随重定向(使用 GET 请求) |
|||
URL resultsUrl = new URL(redirectUrl); |
|||
HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection(); |
|||
resultsConn.setRequestMethod("GET"); |
|||
resultsConn.setInstanceFollowRedirects(false); |
|||
resultsConn.setConnectTimeout(10000); |
|||
resultsConn.setReadTimeout(10000); |
|||
resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64ек; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
// 更新 cookies |
|||
String resultsSessionId = updateCookies(resultsConn, cookieSet); |
|||
System.out.println("Results Cookies: " + cookieSet); |
|||
System.out.println("Results Session ID: " + resultsSessionId); |
|||
|
|||
// 读取重定向后的结果页面内容 |
|||
BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream())); |
|||
StringBuilder resultsContent = new StringBuilder(); |
|||
while ((inputLine = resultsReader.readLine()) != null) { |
|||
resultsContent.append(inputLine); |
|||
} |
|||
resultsReader.close(); |
|||
resultsConn.disconnect(); |
|||
|
|||
// 提取页面中的 ViewState(状态信息,用于后续请求) |
|||
String viewState = extractViewState(resultsContent.toString()); |
|||
System.out.println("Results ViewState: " + viewState); |
|||
|
|||
// 检查 Session ID 是否一致,确保会话未被重置 |
|||
if (sessionId != null && !sessionId.equals(resultsSessionId)) { |
|||
System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId); |
|||
} |
|||
|
|||
// Step 4: 第四步:发送分页请求(使用 POST) |
|||
HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection(); |
|||
postConn.setRequestMethod("POST"); |
|||
postConn.setInstanceFollowRedirects(false); |
|||
postConn.setDoOutput(true); |
|||
postConn.setConnectTimeout(10000); |
|||
postConn.setReadTimeout(10000); |
|||
|
|||
// 设置分页请求的请求头(非 AJAX,模拟浏览器常规请求) |
|||
postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
postConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
postConn.setRequestProperty("Origin", "https://www.drks.de"); |
|||
postConn.setRequestProperty("Referer", cleanUrl); |
|||
postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
postConn.setRequestProperty("Sec-Fetch-Dest", "document"); |
|||
postConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); |
|||
|
|||
// 构建分页请求的 POST 参数(包括页码和 ViewState 等) |
|||
String postData = buildPostData(viewState, page); |
|||
// 发送分页的 POST 请求 |
|||
try (OutputStream os = postConn.getOutputStream()) { |
|||
byte[] input = postData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input, 0, input.length); |
|||
} |
|||
|
|||
// 更新 cookies(分页响应可能返回新的 Set-Cookie) |
|||
String paginationSessionId = updateCookies(postConn, cookieSet); |
|||
System.out.println("Pagination Cookies: " + cookieSet); |
|||
System.out.println("Pagination Session ID: " + paginationSessionId); |
|||
|
|||
// 处理分页响应 |
|||
int responseCode = postConn.getResponseCode(); |
|||
System.out.println("Pagination Response Code: " + responseCode); |
|||
|
|||
// 读取分页响应的 HTML 内容 |
|||
StringBuilder postContent = new StringBuilder(); |
|||
try (BufferedReader postReader = new BufferedReader( |
|||
new InputStreamReader( |
|||
responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream()))) { |
|||
while ((inputLine = postReader.readLine()) != null) { |
|||
postContent.append(inputLine); |
|||
} |
|||
} |
|||
Document parse = null; |
|||
if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP |
|||
|| responseCode == HttpURLConnection.HTTP_MOVED_PERM |
|||
|| responseCode == HttpURLConnection.HTTP_SEE_OTHER) { |
|||
String newUrl = postConn.getHeaderField("Location"); |
|||
System.out.println("Pagination Redirecting to: " + newUrl); |
|||
|
|||
// 解析重定向中的相对地址为完整 URL(如果是相对路径) |
|||
if (!newUrl.startsWith("http")) { |
|||
newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl); |
|||
} |
|||
|
|||
// 重定向 |
|||
URL redirectConn = new URL(newUrl); |
|||
HttpURLConnection followConn = (HttpURLConnection) redirectConn.openConnection(); |
|||
followConn.setRequestMethod("GET"); |
|||
followConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream())); |
|||
StringBuilder redirectContent = new StringBuilder(); |
|||
while ((inputLine = redirectReader.readLine()) != null) { |
|||
redirectContent.append(inputLine); |
|||
} |
|||
redirectReader.close(); |
|||
followConn.disconnect(); |
|||
parse = Jsoup.parse(String.valueOf(redirectContent)); |
|||
} else if (responseCode == 200) { |
|||
parse = Jsoup.parse(String.valueOf(postContent)); |
|||
} |
|||
|
|||
|
|||
|
|||
Elements links = parse.select("div[data-label='Titel der Studie'] a"); |
|||
List<String> listUrl = new ArrayList(); |
|||
Integer count = 0; |
|||
for (Element link : links) { |
|||
String href = link.attr("href"); |
|||
String trueUrl = "https://www.drks.de/"+href; |
|||
listUrl.add(trueUrl); |
|||
count++; |
|||
} |
|||
String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text(); |
|||
// 使用正则表达式提取 "第" 和 "/" 之间的数字 |
|||
String regex = "Seite\\s*(\\d+)\\s*/"; |
|||
Matcher matcher = Pattern.compile(regex).matcher(text); |
|||
if (matcher.find()) { |
|||
map.put("totalPage",matcher.group(1));// 返回第一个捕获组,即数字 "1" |
|||
} |
|||
map.put("listUrl",listUrl); |
|||
map.put("count",count); |
|||
map.put("keyword",keyword); |
|||
postConn.disconnect(); |
|||
return map; |
|||
} |
|||
// 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 |
|||
private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) { |
|||
String sessionId = null; |
|||
Map<String, List<String>> headerFields = conn.getHeaderFields(); |
|||
List<String> cookiesHeader = headerFields.get("Set-Cookie"); |
|||
if (cookiesHeader != null) { |
|||
for (String cookie : cookiesHeader) { |
|||
String cookieValue = cookie.split(";")[0]; |
|||
cookieSet.add(cookieValue); |
|||
if (cookieValue.startsWith("JSESSIONID=") || cookieValue.startsWith("csfcfc=")) { |
|||
sessionId = cookieValue; |
|||
} |
|||
} |
|||
} |
|||
return sessionId; |
|||
} |
|||
// 提取 __VIEWSTATE 隐藏字段的值 |
|||
private static String extractViewState(String html) { |
|||
if (html == null || html.isEmpty()) { |
|||
System.err.println("HTML content is empty or null"); |
|||
return ""; |
|||
} |
|||
|
|||
// 兼容 jakarta.faces.ViewState 和 javax.faces.ViewState |
|||
String regex = "<input[^>]*name=[\"'](?:jakarta|javax)\\.faces\\.ViewState[\"'][^>]*value=[\"']([^\"']+)[\"']"; |
|||
Pattern pattern = Pattern.compile(regex); |
|||
Matcher matcher = pattern.matcher(html); |
|||
|
|||
if (matcher.find()) { |
|||
return matcher.group(1); |
|||
} |
|||
|
|||
System.err.println("Failed to extract ViewState from HTML"); |
|||
return ""; |
|||
} |
|||
|
|||
private static Map<String,Object> content(String url)throws Exception{ |
|||
|
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.build(); |
|||
MediaType mediaType = MediaType.parse("application/json"); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.get() |
|||
.addHeader("Content-Type", "application/json") |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Document parse = Jsoup.parse(html, "UTF-8"); |
|||
String title = parse.select(".title-bold").text(); |
|||
String registNum = parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(2)").text(); |
|||
String registTime = convertDate(parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(6)").text()); |
|||
Map<String,Object> sponsor = new HashMap<>(); |
|||
String header = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-header > h4").text(); |
|||
String site = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(2) > div").text(); |
|||
String telefon = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(4) > span").text(); |
|||
String disease = parse.select("body > main > div.card-body > div:nth-child(6) > div.card-body > div > div:nth-child(2) > dl > dd:nth-child(2) > span").text(); |
|||
String studyType = parse.select("body > main > div.card-body > div:nth-child(3) > div.card-body > dl").text(); |
|||
String inclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(3) > div > div.card-body > div > div.col-12.mt-3 > dl > dd > span").text(); |
|||
String exclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(4) > div > div.card-body > p > span").text(); |
|||
String country = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(1) > div > div.card-body > dl > dd:nth-child(2)").text(); |
|||
String intervention = parse.select("body > main > div.card-body > div:nth-child(4) > div.card-body > dl").text(); |
|||
String primaryOutcome = parse.select("body > main > div.card-body > div:nth-child(5) > div.card-body > div > div > dl").text(); |
|||
String enrollment = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(2) > div > div.card-body > div > div:nth-child(5) > dl > dd > span").text(); |
|||
sponsor.put("header",header); |
|||
sponsor.put("site",site); |
|||
sponsor.put("telefon",telefon); |
|||
Map<String,Object> resultData = new HashMap<>(); |
|||
resultData.put("title", title); |
|||
resultData.put("registNum",registNum); |
|||
resultData.put("registTime",registTime); |
|||
resultData.put("registStatus",""); |
|||
resultData.put("registTitle",""); |
|||
resultData.put("fullTitle",""); |
|||
resultData.put("sponsor",sponsor); |
|||
resultData.put("sponsorPart",""); |
|||
resultData.put("studyType",studyType); |
|||
resultData.put("phase",""); |
|||
resultData.put("disease",disease); |
|||
resultData.put("studyDesign",""); |
|||
resultData.put("studyObjective",""); |
|||
resultData.put("studyStartDate",""); |
|||
resultData.put("inclusionCriteria",inclusionCriteria); |
|||
resultData.put("exclusionCriteria",exclusionCriteria); |
|||
resultData.put("currentStatus",""); |
|||
resultData.put("enrollment",enrollment); |
|||
resultData.put("country",country); |
|||
resultData.put("tagTime",""); |
|||
resultData.put("intervention",intervention); |
|||
resultData.put("primaryOutcome",primaryOutcome); |
|||
resultData.put("crawlTime",getCurrentTime()); |
|||
resultData.put("crawlUrl",url); |
|||
resultData.put("postTime",registTime); |
|||
resultData.put("content","content"); |
|||
resultData.put("forwardcontent","forwardcontent"); |
|||
resultData.put("cid","Ndrks"); |
|||
return resultData; |
|||
} |
|||
// 生成搜索请求的 POST 数据 |
|||
private static String buildSearchPostData(String viewState,String keyword) { |
|||
try { |
|||
return "searchForm=searchForm" + |
|||
"&searchForm%3Aj_idt80=" + keyword + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" + |
|||
"&searchForm%3Aj_idt287=" + |
|||
"&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); |
|||
} catch (Exception e) { |
|||
System.err.println("Error encoding search ViewState: " + e.getMessage()); |
|||
return ""; |
|||
} |
|||
} |
|||
// 生成分页请求的 POST 数据 |
|||
private static String buildPostData(String viewState, int page) { |
|||
int adjustedPage = page - 1; |
|||
try { |
|||
return "resultForm=resultForm" + |
|||
"&resultForm%3Asorting%3ArowsPerPage=10" + |
|||
"&resultForm%3ApaginationTop%3Aj_idt156%3A"+ adjustedPage +"%3Aj_idt158=" + page + |
|||
"&resultForm%3Asorting%3AsortingBy=SCORE" + |
|||
"&resultForm%3Asorting%3Aj_idt141=true" + |
|||
"&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation" + |
|||
"&selectedType=JSON" + |
|||
"&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); |
|||
} catch (Exception e) { |
|||
System.err.println("Error encoding pagination ViewState: " + e.getMessage()); |
|||
return ""; |
|||
} |
|||
} |
|||
public static String convertDate(String inputDate) { |
|||
try { |
|||
// 输入格式:dd.MM.yyyy |
|||
SimpleDateFormat inputFormat = new SimpleDateFormat("dd.MM.yyyy"); |
|||
// 解析输入日期 |
|||
Date date = inputFormat.parse(inputDate); |
|||
// 输出格式:yyyy-MM-dd HH:mm:ss |
|||
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); |
|||
// 转换为目标格式 |
|||
return outputFormat.format(date); |
|||
} catch (ParseException e) { |
|||
// 处理解析异常 |
|||
return "Invalid date format"; |
|||
} |
|||
} |
|||
|
|||
public static String getCurrentTime() { |
|||
// 创建 DateTimeFormatter,指定输出格式 |
|||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// 获取当前时间 |
|||
LocalDateTime now = LocalDateTime.now(); |
|||
// 格式化 |
|||
return now.format(formatter); |
|||
} |
|||
private static Response executeWithRetry(OkHttpClient client, Request request, String keyword) throws IOException { |
|||
int maxRetries = proxyList.isEmpty() ? 1 : proxyList.size(); // 如果没有代理,只尝试一次 |
|||
int attempt = 0; |
|||
|
|||
while (attempt < maxRetries) { |
|||
Response response = client.newCall(request).execute(); |
|||
if (response.code() == 403) { |
|||
System.out.println("收到 403 状态码,尝试切换代理重试..."); |
|||
response.close(); |
|||
switchProxy(); |
|||
client = createClientWithProxy(); // 使用新代理重建客户端 |
|||
attempt++; |
|||
if (attempt == maxRetries) { |
|||
throw new IOException("所有代理尝试失败,仍然收到 403"); |
|||
} |
|||
continue; |
|||
} |
|||
return response; // 成功或非 403 状态码,直接返回 |
|||
} |
|||
throw new IOException("无法执行请求,未获取响应"); |
|||
} |
|||
private static OkHttpClient createClientWithProxy() { |
|||
OkHttpClient.Builder builder = new OkHttpClient().newBuilder() |
|||
.connectTimeout(30, TimeUnit.SECONDS) |
|||
.readTimeout(30, TimeUnit.SECONDS) |
|||
.writeTimeout(30, TimeUnit.SECONDS); |
|||
|
|||
if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) { |
|||
String proxy = proxyList.get(currentProxyIndex); |
|||
String[] proxyParts = proxy.split(":"); |
|||
if (proxyParts.length == 2) { |
|||
String proxyHost = proxyParts[0]; |
|||
int proxyPort = Integer.parseInt(proxyParts[1]); |
|||
builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, |
|||
new java.net.InetSocketAddress(proxyHost, proxyPort))); |
|||
System.out.println("使用代理: " + proxy); |
|||
} |
|||
} |
|||
return builder.build(); |
|||
} |
|||
private static synchronized void switchProxy() { |
|||
if (proxyList.isEmpty()) return; |
|||
currentProxyIndex = (currentProxyIndex + 1) % proxyList.size(); |
|||
System.out.println("切换到新代理: " + proxyList.get(currentProxyIndex)); |
|||
} |
|||
public static String increaseOffsetBy30(String originalPayload) { |
|||
// 以 "|" 分割载荷为数组 |
|||
String[] parts = originalPayload.split("\\|"); |
|||
|
|||
// 检查数组长度,确保有足够元素 |
|||
if (parts.length < 4) { |
|||
throw new IllegalArgumentException("载荷格式无效,元素不足"); |
|||
} |
|||
|
|||
// 找到倒数第 4 个元素的位置 |
|||
int targetIndex = parts.length - 4; |
|||
|
|||
try { |
|||
// 将倒数第 4 个数字解析为整数 |
|||
int currentOffset = Integer.parseInt(parts[targetIndex]); |
|||
// 增加 30 |
|||
int newOffset = currentOffset + 30; |
|||
// 将新值放回数组 |
|||
parts[targetIndex] = String.valueOf(newOffset); |
|||
// 重新拼接载荷 |
|||
return String.join("|", parts); |
|||
} catch (NumberFormatException e) { |
|||
throw new IllegalArgumentException("倒数第 4 个元素不是有效数字: " + parts[targetIndex]); |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,438 @@ |
|||
package com.example; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.io.OutputStream; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.net.URLEncoder; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.HashSet; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
import java.util.Set; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class drks { |
|||
public static void main(String[] args) throws Exception { |
|||
String targetUrl = "https://www.drks.de/search/de/results?page=4"; |
|||
String baseUrl = "https://www.drks.de/search/de"; |
|||
String hostUrl = "https://www.drks.de"; |
|||
String cleanUrl = targetUrl.split("\\?")[0]; |
|||
System.out.println("Pure URL: " + cleanUrl); |
|||
|
|||
|
|||
String pageNumber = targetUrl.contains("?page=") ? targetUrl.split("page=")[1] : "1"; |
|||
int page = Integer.parseInt(pageNumber); |
|||
System.out.println("Page Number: " + page); |
|||
|
|||
// 存储 cookies |
|||
Set<String> cookieSet = new HashSet<>(); |
|||
String sessionId = null; |
|||
|
|||
// 第一步:初始 GET 请求,获取 cookies 和 ViewState |
|||
System.out.println("\n--- Step 1: Initial GET Request ---"); |
|||
URL initialUrl = new URL(baseUrl); |
|||
HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); |
|||
initialConn.setRequestMethod("GET"); |
|||
initialConn.setInstanceFollowRedirects(false); |
|||
initialConn.setConnectTimeout(10000); |
|||
initialConn.setReadTimeout(10000); |
|||
initialConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
|
|||
// 捕获 cookies |
|||
sessionId = updateCookies(initialConn, cookieSet); |
|||
System.out.println("Initial Cookies: " + cookieSet); |
|||
System.out.println("Initial Session ID: " + sessionId); |
|||
|
|||
// 读取响应内容以获取 ViewState |
|||
BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); |
|||
StringBuilder content = new StringBuilder(); |
|||
String inputLine; |
|||
while ((inputLine = in.readLine()) != null) { |
|||
content.append(inputLine); |
|||
} |
|||
in.close(); |
|||
initialConn.disconnect(); |
|||
|
|||
// 提取初始 ViewState |
|||
String initialViewState = extractViewState(content.toString()); |
|||
System.out.println("Initial ViewState: " + initialViewState); |
|||
|
|||
// 第二步:发送搜索 POST 请求 |
|||
System.out.println("\n--- Step 2: Search POST Request ---"); |
|||
HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection(); |
|||
searchConn.setRequestMethod("POST"); |
|||
searchConn.setInstanceFollowRedirects(false); |
|||
searchConn.setDoOutput(true); |
|||
searchConn.setConnectTimeout(10000); |
|||
searchConn.setReadTimeout(10000); |
|||
|
|||
// 设置搜索请求的请求头 |
|||
searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
searchConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
searchConn.setRequestProperty("Origin", "https://www.drks.de"); |
|||
searchConn.setRequestProperty("Referer", baseUrl); |
|||
searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
// 构建搜索请求的 POST 数据 |
|||
String searchPostData = buildSearchPostData(initialViewState); |
|||
System.out.println("Search POST Data: " + searchPostData); |
|||
|
|||
// 发送搜索 POST 请求 |
|||
try (OutputStream os = searchConn.getOutputStream()) { |
|||
byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input, 0, input.length); |
|||
} |
|||
|
|||
// 更新 cookies |
|||
String searchSessionId = updateCookies(searchConn, cookieSet); |
|||
System.out.println("Search Cookies: " + cookieSet); |
|||
System.out.println("Search Session ID: " + searchSessionId); // This is null in your output, which is a potential issue |
|||
|
|||
// 处理搜索响应 |
|||
int searchResponseCode = searchConn.getResponseCode(); |
|||
System.out.println("Search Response Code: " + searchResponseCode); |
|||
|
|||
if (searchResponseCode == 302) { |
|||
String redirectUrl = searchConn.getHeaderField("Location"); |
|||
searchConn.disconnect(); |
|||
|
|||
if (redirectUrl == null) { |
|||
System.err.println("Search request returned 302 but no Location header found."); |
|||
return; |
|||
} |
|||
System.out.println("Redirect URL (raw): " + redirectUrl); |
|||
|
|||
// 解析相对 URL |
|||
if (!redirectUrl.startsWith("http")) { |
|||
redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? redirectUrl : "/" + redirectUrl); |
|||
} |
|||
System.out.println("Resolved Redirect URL: " + redirectUrl); |
|||
|
|||
// 第三步:跟随重定向(使用 GET 请求) |
|||
System.out.println("\n--- Step 3: Follow Redirect (GET Request) ---"); |
|||
URL resultsUrl = new URL(redirectUrl); |
|||
HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection(); |
|||
resultsConn.setRequestMethod("GET"); |
|||
resultsConn.setInstanceFollowRedirects(false); |
|||
resultsConn.setConnectTimeout(10000); |
|||
resultsConn.setReadTimeout(10000); |
|||
resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
// 更新 cookies |
|||
String resultsSessionId = updateCookies(resultsConn, cookieSet); |
|||
System.out.println("Results Cookies: " + cookieSet); |
|||
System.out.println("Results Session ID: " + resultsSessionId); |
|||
|
|||
// 读取重定向后的结果页面内容 |
|||
BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream())); |
|||
StringBuilder resultsContent = new StringBuilder(); |
|||
while ((inputLine = resultsReader.readLine()) != null) { |
|||
resultsContent.append(inputLine); |
|||
} |
|||
resultsReader.close(); |
|||
resultsConn.disconnect(); |
|||
|
|||
// 提取页面中的 ViewState(状态信息,用于后续请求) |
|||
String viewState = extractViewState(resultsContent.toString()); |
|||
System.out.println("Results ViewState: " + viewState); |
|||
|
|||
// 检查 Session ID 是否一致,确保会话未被重置 |
|||
if (sessionId != null && !sessionId.equals(resultsSessionId)) { |
|||
System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId); |
|||
} |
|||
|
|||
// Step 4: 第四步:发送分页请求(使用 POST) |
|||
System.out.println("\n--- Step 4: Pagination POST Request ---"); |
|||
HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection(); |
|||
postConn.setRequestMethod("POST"); |
|||
postConn.setInstanceFollowRedirects(false); |
|||
postConn.setDoOutput(true); |
|||
postConn.setConnectTimeout(10000); |
|||
postConn.setReadTimeout(10000); |
|||
|
|||
// 设置分页请求的请求头(非 AJAX,模拟浏览器常规请求) |
|||
postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
postConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
postConn.setRequestProperty("Origin", "https://www.drks.de"); |
|||
postConn.setRequestProperty("Referer", cleanUrl); |
|||
postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
postConn.setRequestProperty("Sec-Fetch-Dest", "document"); |
|||
postConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); |
|||
|
|||
// 构建分页请求的 POST 参数(包括页码和 ViewState 等) |
|||
String postData = buildPostData(viewState, page); |
|||
System.out.println("Pagination POST Data: " + postData); |
|||
|
|||
// 发送分页的 POST 请求 |
|||
try (OutputStream os = postConn.getOutputStream()) { |
|||
byte[] input = postData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input, 0, input.length); |
|||
} |
|||
|
|||
// 更新 cookies(分页响应可能返回新的 Set-Cookie) |
|||
String paginationSessionId = updateCookies(postConn, cookieSet); |
|||
System.out.println("Pagination Cookies: " + cookieSet); |
|||
System.out.println("Pagination Session ID: " + paginationSessionId); |
|||
|
|||
// 处理分页响应 |
|||
int responseCode = postConn.getResponseCode(); |
|||
System.out.println("Pagination Response Code: " + responseCode); |
|||
|
|||
// Read and process the pagination response |
|||
StringBuilder postContent = new StringBuilder(); |
|||
try (BufferedReader postReader = new BufferedReader( |
|||
new InputStreamReader( |
|||
responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream()))) { |
|||
while ((inputLine = postReader.readLine()) != null) { |
|||
postContent.append(inputLine); |
|||
} |
|||
} |
|||
|
|||
Document parse = null; |
|||
if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP |
|||
|| responseCode == HttpURLConnection.HTTP_MOVED_PERM |
|||
|| responseCode == HttpURLConnection.HTTP_SEE_OTHER) { |
|||
String newUrl = postConn.getHeaderField("Location"); |
|||
System.out.println("Pagination Redirecting to: " + newUrl); |
|||
|
|||
// 解析重定向中的相对地址为完整 URL(如果是相对路径) |
|||
if (!newUrl.startsWith("http")) { |
|||
newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl); |
|||
} |
|||
|
|||
// Follow the redirect |
|||
URL redirectConnUrl = new URL(newUrl); |
|||
HttpURLConnection followConn = (HttpURLConnection) redirectConnUrl.openConnection(); |
|||
followConn.setRequestMethod("GET"); |
|||
followConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream())); |
|||
StringBuilder redirectContent = new StringBuilder(); |
|||
while ((inputLine = redirectReader.readLine()) != null) { |
|||
redirectContent.append(inputLine); |
|||
} |
|||
redirectReader.close(); |
|||
followConn.disconnect(); |
|||
|
|||
System.out.println("Redirect Response: " + redirectContent); |
|||
parse = Jsoup.parse(String.valueOf(redirectContent)); |
|||
} else if (responseCode == 200) { |
|||
System.out.println("Pagination Response: " + postContent); |
|||
parse = Jsoup.parse(String.valueOf(postContent)); |
|||
} else { |
|||
System.err.println("Unexpected Pagination Response Code: " + responseCode); |
|||
// Optionally read and print error stream for non-200/3xx codes |
|||
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(postConn.getErrorStream()))) { |
|||
String errorLine; |
|||
System.err.println("Error Stream:"); |
|||
while ((errorLine = errorReader.readLine()) != null) { |
|||
System.err.println(errorLine); |
|||
} |
|||
} catch (Exception e) { |
|||
System.err.println("Could not read error stream: " + e.getMessage()); |
|||
} |
|||
return; // Exit if pagination fails unexpectedly |
|||
} |
|||
|
|||
Elements links = parse.select("div[data-label='Titel der Studie'] a"); |
|||
|
|||
for (Element link : links) { |
|||
String href = link.attr("href"); |
|||
String text = link.text(); |
|||
|
|||
System.out.println("链接: " + href); |
|||
System.out.println("标题: " + text); |
|||
} |
|||
String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text(); |
|||
// 使用正则表达式提取 "第" 和 "/" 之间的数字 |
|||
String regex = "Seite\\s*(\\d+)\\s*/"; |
|||
Matcher matcher = Pattern.compile(regex).matcher(text); |
|||
if (matcher.find()) { |
|||
System.out.println("总共有"+matcher.group(1));// 返回第一个捕获组,即数字 "1" |
|||
} |
|||
postConn.disconnect(); |
|||
|
|||
} else if (searchResponseCode == 200) { |
|||
System.out.println("Search request returned 200 OK. Reading response body:"); |
|||
// Read and print the response body for debugging |
|||
try (BufferedReader searchReader = new BufferedReader(new InputStreamReader(searchConn.getInputStream()))) { |
|||
String line; |
|||
StringBuilder searchResponseBody = new StringBuilder(); |
|||
while ((line = searchReader.readLine()) != null) { |
|||
searchResponseBody.append(line).append("\n"); |
|||
} |
|||
System.out.println("Search Response Body:\n" + searchResponseBody.toString()); |
|||
} catch (Exception e) { |
|||
System.err.println("Could not read search response body: " + e.getMessage()); |
|||
} finally { |
|||
searchConn.disconnect(); |
|||
} |
|||
|
|||
System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode); |
|||
System.err.println("The website's search mechanism may have changed."); |
|||
|
|||
} else { |
|||
// Handle other unexpected response codes for the search request |
|||
System.err.println("Unexpected Search Response Code: " + searchResponseCode); |
|||
try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(searchConn.getErrorStream()))) { |
|||
String errorLine; |
|||
System.err.println("Error Stream:"); |
|||
while ((errorLine = errorReader.readLine()) != null) { |
|||
System.err.println(errorLine); |
|||
} |
|||
} catch (Exception e) { |
|||
System.err.println("Could not read error stream for search response: " + e.getMessage()); |
|||
} |
|||
searchConn.disconnect(); |
|||
} |
|||
} |
|||
|
|||
// 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 |
|||
private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) { |
|||
String sessionId = null; |
|||
Map<String, List<String>> headerFields = conn.getHeaderFields(); |
|||
List<String> cookiesHeader = headerFields.get("Set-Cookie"); |
|||
if (cookiesHeader != null) { |
|||
for (String cookie : cookiesHeader) { |
|||
String cookieValue = cookie.split(";")[0]; |
|||
cookieSet.add(cookieValue); |
|||
// Prioritize JSESSIONID or csfcfc if present |
|||
if (cookieValue.startsWith("JSESSIONID=")) { |
|||
sessionId = cookieValue; |
|||
} else if (cookieValue.startsWith("csfcfc=") && sessionId == null) { |
|||
sessionId = cookieValue; |
|||
} |
|||
} |
|||
} |
|||
return sessionId; |
|||
} |
|||
|
|||
// 提取 __VIEWSTATE 隐藏字段的值 |
|||
private static String extractViewState(String html) { |
|||
// Try regex first for jakarta.faces.ViewState |
|||
String regexJakarta = "name=\"jakarta\\.faces\\.ViewState\"[^>]*value=\"([^\"]+)\""; |
|||
Pattern patternJakarta = Pattern.compile(regexJakarta); |
|||
Matcher matcherJakarta = patternJakarta.matcher(html); |
|||
|
|||
if (matcherJakarta.find()) { |
|||
return matcherJakarta.group(1); |
|||
} |
|||
|
|||
// Fallback to regex for javax.faces.ViewState (older versions or other parts of site) |
|||
String regexJavax = "name=\"javax\\.faces\\.ViewState\"[^>]*value=\"([^\"]+)\""; |
|||
Pattern patternJavax = Pattern.compile(regexJavax); |
|||
Matcher matcherJavax = patternJavax.matcher(html); |
|||
|
|||
if (matcherJavax.find()) { |
|||
return matcherJavax.group(1); |
|||
} |
|||
|
|||
// Fallback to string search if regex fails (less reliable) |
|||
String searchStringJakarta = "jakarta.faces.ViewState"; |
|||
int startIndexJakarta = html.indexOf(searchStringJakarta); |
|||
if (startIndexJakarta != -1) { |
|||
int valueStart = html.indexOf("value=\"", startIndexJakarta) + 7; |
|||
int valueEnd = html.indexOf("\"", valueStart); |
|||
if (valueStart != -1 && valueEnd != -1) { |
|||
return html.substring(valueStart, valueEnd); |
|||
} |
|||
} |
|||
|
|||
String searchStringJavax = "javax.faces.ViewState"; |
|||
int startIndexJavax = html.indexOf(searchStringJavax); |
|||
if (startIndexJavax != -1) { |
|||
int valueStart = html.indexOf("value=\"", startIndexJavax) + 7; |
|||
int valueEnd = html.indexOf("\"", valueStart); |
|||
if (valueStart != -1 && valueEnd != -1) { |
|||
return html.substring(valueStart, valueEnd); |
|||
} |
|||
} |
|||
|
|||
|
|||
System.err.println("Failed to extract ViewState from HTML"); |
|||
return ""; // Return empty string if not found |
|||
} |
|||
|
|||
// 生成搜索请求的 POST 数据 |
|||
private static String buildSearchPostData(String viewState) { |
|||
try { |
|||
// URL-encode the ViewState |
|||
String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); |
|||
|
|||
return "searchForm=searchForm" + |
|||
"&searchForm%3Aj_idt80=Midwifery" + // Assuming 'Midwifery' is the search term |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" + |
|||
"&searchForm%3Aj_idt287=" + // This parameter might be related to the search button click |
|||
"&jakarta.faces.ViewState=" + encodedViewState; // Changed to jakarta.faces.ViewState |
|||
} catch (Exception e) { |
|||
System.err.println("Error encoding search ViewState: " + e.getMessage()); |
|||
return ""; |
|||
} |
|||
} |
|||
|
|||
// 生成分页请求的 POST 数据 |
|||
private static String buildPostData(String viewState, int page) { |
|||
// The page parameter in the POST data might be 0-indexed or 1-indexed |
|||
// Let's assume it's 0-indexed for the parameter name and 1-indexed for the value based on your original code |
|||
int parameterPage = page - 1; |
|||
int valuePage = page; // The value sent in the form might be the actual page number |
|||
|
|||
try { |
|||
// URL-encode the ViewState |
|||
String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); |
|||
|
|||
return "resultForm=resultForm" + |
|||
"&resultForm%3Asorting%3ArowsPerPage=10" + |
|||
// The parameter name for pagination button might have changed |
|||
// Check browser network traffic for the exact parameter name for page buttons |
|||
"&resultForm%3ApaginationTop%3Aj_idt156%3A"+ parameterPage +"%3Aj_idt158=" + valuePage + |
|||
"&resultForm%3Asorting%3AsortingBy=SCORE" + |
|||
"&resultForm%3Asorting%3Aj_idt141=true" + // This might be for sorting direction |
|||
"&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation" + |
|||
"&selectedType=JSON" + // This might be for download format, potentially not needed for pagination |
|||
"&jakarta.faces.ViewState=" + encodedViewState; // Changed to jakarta.faces.ViewState |
|||
} catch (Exception e) { |
|||
System.err.println("Error encoding pagination ViewState: " + e.getMessage()); |
|||
return ""; |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,165 @@ |
|||
package com.example; |
|||
|
|||
import org.apache.kafka.clients.producer.*; |
|||
import org.apache.kafka.common.serialization.StringSerializer; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.select.Elements; |
|||
import okhttp3.OkHttpClient; |
|||
import okhttp3.Request; |
|||
import okhttp3.Response; |
|||
|
|||
import java.io.*; |
|||
import java.util.*; |
|||
import java.util.concurrent.Future; |
|||
|
|||
public class getInKa { |
|||
// 初始化 OkHttp 客户端,用于发送 HTTP 请求 |
|||
private static final OkHttpClient httpClient = new OkHttpClient(); |
|||
private static final String PROCESSED_URLS_FILE = "processed_urls.txt"; // 记录已处理的 URL 文件 |
|||
public static void main(String[] args) { |
|||
try { |
|||
// 获取目标 URL 列表 |
|||
System.out.println("Starting URL collection..."); |
|||
List<String> urls = getUrls(); |
|||
System.out.println("Collected " + urls.size() + " URLs."); |
|||
|
|||
// 从 URL 中提取新闻数据并保存到 kafka |
|||
System.out.println("Starting news extraction..."); |
|||
getNews(urls); |
|||
System.out.println("News extraction completed."); |
|||
} catch (IOException | InterruptedException e) { |
|||
System.out.println("Error in main: " + e.getMessage()); |
|||
} |
|||
} |
|||
public static List<String> getUrls() throws IOException, InterruptedException { |
|||
List<String> urls = new ArrayList<>(); |
|||
Set<String> processedUrls = loadProcessedUrls(); // 加载已处理的 URL |
|||
|
|||
for (int page = 1; page <= 28; page++) { |
|||
String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html"; |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0") |
|||
.build(); |
|||
|
|||
System.out.println("Fetching page " + page + ": " + url); |
|||
try (Response response = httpClient.newCall(request).execute()) { |
|||
if (response.isSuccessful() && response.body() != null) { |
|||
System.out.println("Successfully fetched page " + page); |
|||
String html = response.body().string(); |
|||
Document doc = Jsoup.parse(html); |
|||
Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a"); |
|||
List<String> projectIDs = links.eachAttr("href"); |
|||
System.out.println("Found " + projectIDs.size() + " URLs on page " + page); |
|||
|
|||
for (String projectUrl : projectIDs) { |
|||
if (!processedUrls.contains(projectUrl)) { // 检查是否已处理 |
|||
urls.add(projectUrl); |
|||
processedUrls.add(projectUrl); // 添加到已处理集合 |
|||
} |
|||
} |
|||
} else { |
|||
System.out.println("Failed to fetch page " + page + ": Status code " + response.code()); |
|||
} |
|||
} |
|||
Thread.sleep(1000); |
|||
} |
|||
saveProcessedUrls(processedUrls); // 保存已处理的 URL |
|||
return urls; |
|||
} |
|||
public static void getNews(List<String> urls) throws IOException { |
|||
for (int i = 0; i < urls.size(); i++) { |
|||
String url = urls.get(i); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0") |
|||
.build(); |
|||
|
|||
System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url); |
|||
try (Response response = httpClient.newCall(request).execute()) { |
|||
if (response.isSuccessful() && response.body() != null) { |
|||
System.out.println("Successfully fetched news from " + url); |
|||
String html = response.body().string(); |
|||
Document doc = Jsoup.parse(html); |
|||
String title = doc.select("div.info-title.t-center > h1").text().trim(); |
|||
String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim(); |
|||
String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim(); |
|||
if (content.isEmpty()) { |
|||
content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim(); |
|||
} |
|||
|
|||
if (!title.isEmpty() && !date.isEmpty() && !content.isEmpty()) { |
|||
Map<String, String> news = new HashMap<>(); |
|||
news.put("title", title); |
|||
news.put("date", date); |
|||
news.put("content", content); |
|||
news.put("url", url); |
|||
System.out.println("Extracted news: " + news.get("title")); |
|||
saveData(news); // 调用修改后的 saveData 方法 |
|||
} else { |
|||
System.out.println("Failed to extract complete data from " + url); |
|||
} |
|||
} else { |
|||
System.out.println("Failed to fetch news from " + url + ": Status code " + response.code()); |
|||
} |
|||
} catch (Exception e) { |
|||
System.out.println("An error occurred while fetching " + url + ": " + e.getMessage()); |
|||
} |
|||
try { |
|||
Thread.sleep(5000); // 休眠5秒 |
|||
} catch (InterruptedException e) { |
|||
System.out.println("Sleep interrupted: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
public static void saveData(Map<String, String> news) { |
|||
Properties properties = new Properties(); |
|||
properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); |
|||
properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); |
|||
properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); |
|||
|
|||
try (Producer<String, String> producer = new KafkaProducer<>(properties)) { |
|||
String topic = "news-topic"; |
|||
String key = news.get("title"); |
|||
String value = news.toString(); |
|||
ProducerRecord<String, String> record = new ProducerRecord<>(topic, key, value); |
|||
|
|||
producer.send(record, (metadata, exception) -> { |
|||
if (exception == null) { |
|||
System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic() + |
|||
", partition=" + metadata.partition() + ", offset=" + metadata.offset()); |
|||
} else { |
|||
System.err.println("Failed to send data to Kafka: " + exception.getMessage()); |
|||
} |
|||
}).get(); |
|||
} catch (Exception e) { |
|||
System.err.println("Error while sending data to Kafka: " + e.getMessage()); |
|||
} |
|||
} |
|||
// 加载已处理的 URL |
|||
private static Set<String> loadProcessedUrls() throws IOException { |
|||
Set<String> processedUrls = new HashSet<>(); |
|||
File file = new File(PROCESSED_URLS_FILE); |
|||
if (file.exists()) { |
|||
try (BufferedReader reader = new BufferedReader(new FileReader(file))) { |
|||
String line; |
|||
while ((line = reader.readLine()) != null) { |
|||
processedUrls.add(line.trim()); |
|||
} |
|||
} |
|||
} |
|||
return processedUrls; |
|||
} |
|||
|
|||
// 保存已处理的 URL |
|||
private static void saveProcessedUrls(Set<String> processedUrls) throws IOException { |
|||
try (BufferedWriter writer = new BufferedWriter(new FileWriter(PROCESSED_URLS_FILE))) { |
|||
for (String url : processedUrls) { |
|||
writer.write(url); |
|||
writer.newLine(); |
|||
} |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,47 @@ |
|||
package com.example; |
|||
|
|||
import okhttp3.*; |
|||
import org.json.JSONArray; |
|||
import org.json.JSONObject; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.io.IOException; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class jsonGetOk { |
|||
public static void main(String[] args) throws IOException { |
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.build(); |
|||
MediaType mediaType = MediaType.parse("text/plain"); |
|||
RequestBody body = RequestBody.create(mediaType, ""); |
|||
Request request = new Request.Builder() |
|||
.url("https://www.dsscu.gov.mo/api/common/page_detail?PostType=page&EntityId=6654829e-8163-b801-0096-c02e09d690d1") |
|||
.get() |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String responseBody = response.body().string(); |
|||
|
|||
// 解析 JSON |
|||
JSONObject jsonObject = new JSONObject(responseBody); |
|||
JSONObject data = jsonObject.getJSONObject("data"); |
|||
String postTime = data.getString("onlineAt"); |
|||
JSONObject metas = data.getJSONObject("metas"); |
|||
String title = metas.getString("name"); |
|||
String summary = metas.getString("summary"); |
|||
Document parse = Jsoup.parse(summary); |
|||
String content = parse.text(); |
|||
String forwardcontent = responseBody; |
|||
String fileList = metas.getString("biddersFile"); |
|||
fileList = fileList+"###"+"pdf"; |
|||
Map<String,Object> map = new HashMap<>(); |
|||
map.put("postTime",postTime); |
|||
map.put("title",title); |
|||
map.put("content",content); |
|||
map.put("forwardcontent",forwardcontent); |
|||
map.put("fileList",fileList); |
|||
System.out.println(map); |
|||
} |
|||
|
|||
} |
@ -0,0 +1,256 @@ |
|||
package com.example; |
|||
|
|||
import okhttp3.*; |
|||
import org.json.JSONObject; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.net.InetSocketAddress; |
|||
import java.net.Proxy; |
|||
import java.text.ParseException; |
|||
import java.text.SimpleDateFormat; |
|||
import java.time.LocalDate; |
|||
import java.time.LocalDateTime; |
|||
import java.time.ZonedDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.time.format.DateTimeParseException; |
|||
import java.util.Date; |
|||
import java.util.HashMap; |
|||
import java.util.Locale; |
|||
import java.util.Map; |
|||
import java.util.concurrent.TimeUnit; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class ook { |
|||
|
|||
|
|||
public static void main(String[] args) throws Exception { |
|||
// 1. 获取代理地址 |
|||
// String proxyJson = getProxyFromLocalService(); |
|||
// JSONObject proxyData = new JSONObject(proxyJson); |
|||
// String httpProxy = proxyData.getString("http"); // 例如 "http://proxy1:port" |
|||
// |
|||
// // 2. 解析代理地址 |
|||
// String[] proxyParts = httpProxy.replace("http://", "").split(":"); |
|||
// String proxyHost = proxyParts[0]; // proxy1 |
|||
// int proxyPort = Integer.parseInt(proxyParts[1]); // port |
|||
|
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.connectTimeout(30, TimeUnit.SECONDS) |
|||
.readTimeout(30, TimeUnit.SECONDS) |
|||
.writeTimeout(30, TimeUnit.SECONDS) |
|||
.proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897))) // 直接使用 7897 端口 |
|||
.build(); |
|||
|
|||
MediaType mediaType = MediaType.parse("text/plain"); |
|||
Request request = new Request.Builder() |
|||
.url("https://wrair.health.mil/News-Media/Press-Releases/") |
|||
.get() |
|||
// 添加关键请求头 |
|||
.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36") |
|||
.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7") |
|||
// .addHeader("Accept-Encoding", "gzip, deflate, br, zstd") |
|||
.addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8") |
|||
.addHeader("Cache-Control", "no-cache") |
|||
.addHeader("Pragma", "no-cache") |
|||
.addHeader("Referer", "https://wrair.health.mil/News-Media/Press-Releases/") |
|||
.addHeader("Cookie", "_ga=GA1.1.516170455.1740971326; .ASPXANONYMOUS=xUBztj4Ek1vHfBPe-1QqFJhd83I4bkB1k0_d-2QrQ7drfd7R7Y6eNsyyHVjSeffyIKzy_qm5tOKOCtbvst-s9ZGWThxifCGMdJE117EQlr1OZARa0; dnn_IsMobile=False; language=en-US; ARRAffinity=c30f7cdebcf208f7c5a996cb410451c36532afc64703669607f68f04a75f4b39; _ga_CSLL4ZEK4L=GS1.1.1742349582.4.1.1742350035.0.0.0") |
|||
.addHeader("Upgrade-Insecure-Requests", "1") |
|||
.addHeader("Sec-Fetch-Dest", "document") |
|||
.addHeader("Sec-Fetch-Mode", "navigate") |
|||
.addHeader("Sec-Fetch-Site", "same-origin") |
|||
.addHeader("Sec-Fetch-User", "?1") |
|||
.addHeader("Sec-Ch-Ua", "\"Chromium\";v=\"134\", \"Not:A-Brand\";v=\"24\", \"Google Chrome\";v=\"134\"") |
|||
.addHeader("Sec-Ch-Ua-Mobile", "?0") |
|||
.addHeader("Sec-Ch-Ua-Platform", "\"Windows\"") |
|||
.addHeader("Priority", "u=0, i") |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Document parse = Jsoup.parse(html); |
|||
// String url = "https://www.uu.se/nyheter/alla?newsResearch=researchtopic11%3Bresearchtopic7%3Bresearchtopic22%3Bresearchtopic10%3Bresearchtopic2&start=20"; |
|||
// // 定义正则表达式 |
|||
// String regex = "start=(\\d+)"; |
|||
// Pattern pattern = Pattern.compile(regex); |
|||
// Matcher matcher = pattern.matcher(url); |
|||
// Integer start = 0; |
|||
|
|||
|
|||
// String postTime = convertToTimestamp(parse.select(".mr10").text()); |
|||
// String title = parse.select(".hdg01").text(); |
|||
// String content = parse.select(".container01 p").text(); |
|||
// String forwardcontent = parse.select("#main").html(); |
|||
// Map<String,Object> map = new HashMap<>(); |
|||
|
|||
// if (matcher.find()) { |
|||
// start = Integer.parseInt(matcher.group(1)); |
|||
// System.out.println("Start: " + start); // start = 12 |
|||
// } |
|||
// |
|||
// Elements allLinks = new Elements(); |
|||
// Elements links = parse.select(".search-result-hit-text-container a"); |
|||
// allLinks.addAll(links); |
|||
// |
|||
// int totalLinks = allLinks.size(); |
|||
// int startIndex = Math.max(0, totalLinks - 10); |
|||
// for (int i = startIndex; i < totalLinks; i++) { |
|||
// Map<String, Object> task = new HashMap<String, Object>(16); |
|||
// task.put("link","https://www.uu.se"+allLinks.get(i).attr("href")); |
|||
// task.put("linktype", "newscontent"); // 設置鏈接類型為 "newscontent" |
|||
// |
|||
// System.out.println(task); |
|||
// } |
|||
Elements elements = parse.select(".title a"); |
|||
for (Element element : elements) { |
|||
String link = element.attr("href"); |
|||
System.out.println(link); |
|||
} |
|||
|
|||
|
|||
// map.put("postTime",postTime); |
|||
// map.put("title",title); |
|||
// map.put("content",content); |
|||
// map.put("forwardcontent",forwardcontent); |
|||
// System.out.println(map); |
|||
|
|||
} |
|||
/**
 * Creates an instance. The body is empty; the constructor only declares
 * {@code IOException} in its signature.
 *
 * @throws IOException declared but never thrown by this body
 */
public ook() throws IOException {
    // intentionally empty
}
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式:dd MMMM , yyyy(例如 "28 February , 2025") |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM dd, yyyy", Locale.ENGLISH); |
|||
// // 定义输出格式:yyyy-MM-dd HH:mm:ss |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入日期 |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为带时间的格式,时间设为 00:00:00 |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式:yyyy 年 MM 月 dd 日 |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MM-dd-yyyy", Locale.CHINESE); |
|||
// // 定义输出格式:yyyy-MM-dd HH:mm:ss |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入日期 |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为带时间的格式,时间设为 00:00:00 |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
|
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式 |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy"); |
|||
// // 定义输出格式 |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入字符串为 LocalDate |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为 LocalDateTime,设置时间为 00:00:00 |
|||
// LocalDateTime dateTime = date.atStartOfDay(); |
|||
// // 格式化为目标字符串 |
|||
// return dateTime.format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或者抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式:MMMM d, yyyy(例如 "June 3, 2015") |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM, yyyy", Locale.ENGLISH); |
|||
// // 定义输出格式:yyyy-MM-dd HH:mm:ss |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入日期 |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为带时间的格式,时间设为 00:00:00 |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String input) { |
|||
// try { |
|||
// // 正则匹配 "d MMMM yyyy" |
|||
// Pattern pattern = Pattern.compile("\\d{1,2} [A-Za-z]+ \\d{4}"); |
|||
// Matcher matcher = pattern.matcher(input); |
|||
// if (matcher.find()) { |
|||
// String dateStr = matcher.group(); |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM yyyy", Locale.ENGLISH); |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } else { |
|||
// System.out.println("No date found in: " + input); |
|||
// return null; |
|||
// } |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // Parse the ISO 8601 date string (e.g., "2025-03-17T12:37:33.033Z") |
|||
// ZonedDateTime zdt = ZonedDateTime.parse(dateStr, DateTimeFormatter.ISO_DATE_TIME); |
|||
// |
|||
// // Define the output format (yyyy-MM-dd hh:mm:ss) |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // Format the date to the desired output |
|||
// return zdt.format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // Or throw an exception, depending on your needs |
|||
// } |
|||
// } |
|||
public static String convertToTimestamp(String dateStr) { |
|||
try { |
|||
// Parse "Jan. 9, 2025" (abbreviated month, dot, comma-separated) |
|||
DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMM. d, yyyy", Locale.ENGLISH); |
|||
LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
|
|||
// Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00) |
|||
DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
return date.atStartOfDay().format(outputFormatter); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
return null; |
|||
} |
|||
} |
|||
// 调用本地代理服务获取代理地址 |
|||
private static String getProxyFromLocalService() throws Exception { |
|||
OkHttpClient client = new OkHttpClient(); |
|||
Request request = new Request.Builder() |
|||
.url("http://127.0.0.1:7897") |
|||
.get() |
|||
.build(); |
|||
|
|||
try (Response response = client.newCall(request).execute()) { |
|||
if (response.isSuccessful()) { |
|||
return response.body().string(); // 返回 JSON 字符串 |
|||
} else { |
|||
throw new Exception("获取代理失败,状态码: " + response.code()); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
@ -0,0 +1,524 @@ |
|||
package com.example; |
|||
|
|||
import okhttp3.*; |
|||
import org.json.JSONArray; |
|||
import org.json.JSONObject; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.net.InetSocketAddress; |
|||
import java.net.Proxy; |
|||
import java.text.ParseException; |
|||
import java.text.SimpleDateFormat; |
|||
import java.time.*; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.time.format.DateTimeParseException; |
|||
import java.util.*; |
|||
import java.util.concurrent.TimeUnit; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class oook { |
|||
|
|||
|
|||
public static void main(String[] args) throws Exception { |
|||
// 1. 获取代理地址 |
|||
// String proxyJson = getProxyFromLocalService(); |
|||
// JSONObject proxyData = new JSONObject(proxyJson); |
|||
// String httpProxy = proxyData.getString("http"); // 例如 "http://proxy1:port" |
|||
// |
|||
// // 2. 解析代理地址 |
|||
// String[] proxyParts = httpProxy.replace("http://", "").split(":"); |
|||
// String proxyHost = proxyParts[0]; // proxy1 |
|||
// int proxyPort = Integer.parseInt(proxyParts[1]); // port |
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.connectTimeout(30, TimeUnit.SECONDS) |
|||
.readTimeout(30, TimeUnit.SECONDS) |
|||
.writeTimeout(30, TimeUnit.SECONDS) |
|||
// .cookieJar(new CookieJar() { |
|||
// private final HashMap<String, List<Cookie>> cookieStore = new HashMap<>(); |
|||
// |
|||
// @Override |
|||
// public void saveFromResponse(HttpUrl url, List<Cookie> cookies) { |
|||
// cookieStore.put(url.host(), cookies); // 保存 Cookie |
|||
// } |
|||
// |
|||
// @Override |
|||
// public List<Cookie> loadForRequest(HttpUrl url) { |
|||
// List<Cookie> cookies = cookieStore.get(url.host()); |
|||
// return cookies != null ? cookies : new ArrayList<>(); |
|||
// } |
|||
// }) |
|||
// .followRedirects(true) // 自动处理重定向 |
|||
.build(); |
|||
|
|||
|
|||
// 发送目标请求,自动获取和使用 Cookie |
|||
// Request request = new Request.Builder() |
|||
// .url("https://thl.fi/aiheet/infektiotaudit-ja-rokotukset/ajankohtaista/infektio-ja-rokotusuutiset?p_p_id=com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_redirect=%2Faiheet%2Finfektiotaudit-ja-rokotukset%2Fajankohtaista%2Finfektio-ja-rokotusuutiset&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_delta=50&p_r_p_resetCur=false&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_cur=1") |
|||
// .get() |
|||
// .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36") |
|||
// .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") |
|||
// .addHeader("Accept-Language", "en-US,en;q=0.5") |
|||
// .addHeader("Cookie", "__cf_bm=HXf4OleH9DiJmEagV_4Wori6vFzyN4wf.CBVL57AQUI-1743471952-1.0.1.1-h0KqPKUW2_wblBJ1HWbn50Xi1EPDIxjvFhRyrkdPrAoRHNjlXk..tK_KDWGUs6f4Z1VbQUbJD1Vw3KTi9IYO5bx5af4ZqE2nABBXT.YpLKQ; _cfuvid=jdweOOZm.a8GWXZGqRHb.fiSFMKZuAppyOlkDBbafw0-1743471952167-0.0.1.1-604800000") .build(); |
|||
// OkHttpClient client = new OkHttpClient().newBuilder() |
|||
// .connectTimeout(30, TimeUnit.SECONDS) |
|||
// .readTimeout(30, TimeUnit.SECONDS) |
|||
// .writeTimeout(30, TimeUnit.SECONDS) |
|||
//// .proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897))) // 直接使用 7897 端口 |
|||
// .build(); |
|||
String url = "https://www.iranintl.com/en/202504116060"; |
|||
MediaType mediaType = MediaType.parse("text/plain"); |
|||
RequestBody body = RequestBody.create(mediaType, ""); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.get() |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Document parse = Jsoup.parse(html); |
|||
// String htmlData = null; |
|||
// JSONArray jsonArray = new JSONArray(html); |
|||
// for (int i = 0; i < jsonArray.length(); i++) { |
|||
// JSONObject obj = jsonArray.getJSONObject(i); |
|||
// if ("insert".equals(obj.optString("command")) && obj.has("data")) { |
|||
// htmlData = obj.getString("data"); |
|||
// break; |
|||
// } |
|||
// } |
|||
// Document doc = Jsoup.parse(htmlData); |
|||
// Elements rows = doc.select(".o-grid__item.col-1, .o-grid__item.col-2, .o-grid__item.col-3"); |
|||
// |
|||
// Set<String> uniqueHrefs = new HashSet<>(); |
|||
// |
|||
// for (Element row : rows) { |
|||
// Elements links = row.select("a[href]"); // 选择所有 a 标签 |
|||
// if (!links.isEmpty()) { |
|||
// // 只取第一个 href |
|||
// String href = links.first().attr("href"); |
|||
// uniqueHrefs.add(href); |
|||
// } |
|||
// } |
|||
//// |
|||
// for (String href : uniqueHrefs) { |
|||
// System.out.println("Href: " + href); |
|||
// } |
|||
// String next = getNextPageUrl(url); |
|||
// System.out.println(next); |
|||
|
|||
// JSONObject jsonObject = new JSONObject(html); |
|||
// JSONObject response1 = jsonObject.getJSONObject("response"); |
|||
// JSONArray docs = response1.getJSONArray("docs"); |
|||
// |
|||
// // 遍历 docs 数组,提取 permalink |
|||
// for (int i = 0; i < docs.length(); i++) { |
|||
// JSONObject doc = docs.getJSONObject(i); |
|||
// String permalink = doc.getString("permalink"); |
|||
// System.out.println("Permalink: " + permalink); |
|||
// } |
|||
|
|||
// String url = "https://www.uu.se/nyheter/alla?newsResearch=researchtopic11%3Bresearchtopic7%3Bresearchtopic22%3Bresearchtopic10%3Bresearchtopic2&start=20"; |
|||
// // 定义正则表达式 |
|||
// String regex = "start=(\\d+)"; |
|||
// Pattern pattern = Pattern.compile(regex); |
|||
// Matcher matcher = pattern.matcher(url); |
|||
// Integer start = 0; |
|||
// String postTime = convertToTimestamp( |
|||
// parse.select(".c-news-info__date.o-meta span.c-date").text().trim() + " " + |
|||
// parse.select(".c-news-info__date.o-meta span.c-year").text().trim() |
|||
// ); |
|||
// String postTime = parse.select("tr:nth-child(3) td:nth-child(3)").text()+" 00:00:00"; |
|||
String postTime = convertIsoToTimestamp(parse.select(".WrittenContentBlock-module__9pvVhW__timeAgo time").attr("datetime")); |
|||
String title = parse.select(".WrittenContentBlock-module__9pvVhW__headline").text(); |
|||
String content = parse.select(".WrittenContentBlock-module__9pvVhW__body p").text(); |
|||
String forwardcontent = parse.select(".page").html(); |
|||
Elements imgs = parse.select(".page img"); |
|||
// Elements pdfs = parse.select("tr:nth-child(3) td a"); |
|||
|
|||
String prefix = ""; |
|||
|
|||
List imgList = new ArrayList<String>(); |
|||
|
|||
for (Element img : imgs) { |
|||
String src = img.attr("src"); |
|||
if (src != null && !src.isEmpty()) { |
|||
// 判断是否以 https 开头 |
|||
String fullUrl; |
|||
if (!src.startsWith("https")) { |
|||
// 如果不以 https 开头,拼接前缀 |
|||
if (src.startsWith("/")) { |
|||
fullUrl = prefix + src; |
|||
} else { |
|||
fullUrl = prefix + "/" + src; |
|||
} |
|||
} else { |
|||
fullUrl = src; |
|||
} |
|||
// 拼接格式 |
|||
String imgUrl = fullUrl + "###" + "avif"; |
|||
imgList.add(imgUrl); |
|||
} |
|||
} |
|||
|
|||
// String prefix = ""; |
|||
// |
|||
// List<String> fileList = new ArrayList<String>(); |
|||
// |
|||
// for (Element pdf : pdfs) { |
|||
// String pdfUrl = pdf.attr("href"); |
|||
// if (pdfUrl != null && !pdfUrl.isEmpty()) { |
|||
// // 判断是否以 https 开头 |
|||
// String fullUrl; |
|||
// if (!pdfUrl.startsWith("https")) { |
|||
// // 如果不以 https 开头,拼接前缀 |
|||
// if (pdfUrl.startsWith("/")) { |
|||
// fullUrl = prefix + pdfUrl; |
|||
// } else { |
|||
// fullUrl = prefix + "/" + pdfUrl; |
|||
// } |
|||
// } else { |
|||
// fullUrl = pdfUrl; |
|||
// } |
|||
// // 拼接格式 |
|||
// String fileUrl = fullUrl + "###" + "pdf"; |
|||
// fileList.add(fileUrl); |
|||
// } |
|||
// } |
|||
// |
|||
|
|||
|
|||
// if (matcher.find()) { |
|||
// start = Integer.parseInt(matcher.group(1)); |
|||
// System.out.println("Start: " + start); // start = 12 |
|||
// } |
|||
|
|||
// Elements allLinks = new Elements(); |
|||
// Elements links = parse.select(".card-body a"); |
|||
// allLinks.addAll(links); |
|||
// |
|||
// int totalLinks = allLinks.size(); |
|||
// int startIndex = Math.max(0, totalLinks - 10); |
|||
// for (int i = startIndex; i < totalLinks; i++) { |
|||
// Map<String, Object> task = new HashMap<String, Object>(16); |
|||
// task.put("link","https://www.uu.se"+allLinks.get(i).attr("href")); |
|||
// task.put("linktype", "newscontent"); // 設置鏈接類型為 "newscontent" |
|||
// |
|||
// System.out.println(task); |
|||
// } |
|||
|
|||
// Elements elements = parse.select(".topic__grid__item a"); |
|||
// Integer count = elements.size(); |
|||
// for (Element element : elements) { |
|||
// String link = element.attr("href"); // 獲取新聞鏈接的 href 屬性 |
|||
// System.out.println(link); |
|||
// } |
|||
|
|||
// if(count <10){ |
|||
// String nextpageurl = getPreviousYearUrl(url); |
|||
// System.out.println(nextpageurl); |
|||
// }else { |
|||
// String nextpageurl = getNextPageUrl(url); |
|||
// System.out.println(nextpageurl); |
|||
// } |
|||
Map<String,Object> map = new HashMap<>(); |
|||
map.put("postTime",postTime); |
|||
map.put("title",title); |
|||
map.put("content",content); |
|||
map.put("forwardcontent",forwardcontent); |
|||
map.put("imgList",imgList); |
|||
// map.put("fileList",fileList); |
|||
System.out.println(map); |
|||
|
|||
} |
|||
/**
 * Creates an instance. The body is empty; the constructor only declares
 * {@code IOException} in its signature.
 *
 * @throws IOException declared but never thrown by this body
 */
public oook() throws IOException {
    // intentionally empty
}
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式:dd MMMM , yyyy(例如 "28 February , 2025") |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM dd, yyyy", Locale.ENGLISH); |
|||
// // 定义输出格式:yyyy-MM-dd HH:mm:ss |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// dateStr = dateStr.replace("|", "").trim(); |
|||
// // 解析输入日期 |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为带时间的格式,时间设为 00:00:00 |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 去掉 "Publié le" 前缀并清理多余字符 |
|||
// dateStr = dateStr.replace("Publié le", "").trim(); |
|||
// |
|||
// // 定义输入格式:dd MMMM yyyy(例如 "25 mars 2025") |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd MMMM yyyy", Locale.FRENCH); |
|||
// |
|||
// // 定义输出格式:yyyy-MM-dd HH:mm:ss |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入日期 |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// |
|||
// // 转换为带时间的格式,时间设为 00:00:00 |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式:yyyy 年 MM 月 dd 日 |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MM-dd-yyyy", Locale.CHINESE); |
|||
// // 定义输出格式:yyyy-MM-dd HH:mm:ss |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入日期 |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为带时间的格式,时间设为 00:00:00 |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
|
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式 |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy"); |
|||
// // 定义输出格式 |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入字符串为 LocalDate |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为 LocalDateTime,设置时间为 00:00:00 |
|||
// LocalDateTime dateTime = date.atStartOfDay(); |
|||
// // 格式化为目标字符串 |
|||
// return dateTime.format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或者抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 定义输入格式:MMMM d, yyyy(例如 "June 3, 2015") |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM, yyyy", Locale.ENGLISH); |
|||
// // 定义输出格式:yyyy-MM-dd HH:mm:ss |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // 解析输入日期 |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// // 转换为带时间的格式,时间设为 00:00:00 |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // 或抛出异常,根据需求调整 |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String input) { |
|||
// try { |
|||
// // 正则匹配 "d MMMM yyyy" |
|||
// Pattern pattern = Pattern.compile("\\d{1,2} [A-Za-z]+ \\d{4}"); |
|||
// Matcher matcher = pattern.matcher(input); |
|||
// if (matcher.find()) { |
|||
// String dateStr = matcher.group(); |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM yyyy", Locale.ENGLISH); |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } else { |
|||
// System.out.println("No date found in: " + input); |
|||
// return null; |
|||
// } |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // Parse the ISO 8601 date string (e.g., "2025-03-17T12:37:33.033Z") |
|||
// ZonedDateTime zdt = ZonedDateTime.parse(dateStr, DateTimeFormatter.ISO_DATE_TIME); |
|||
// |
|||
// // Define the output format (yyyy-MM-dd hh:mm:ss) |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// |
|||
// // Format the date to the desired output |
|||
// return zdt.format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; // Or throw an exception, depending on your needs |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // Parse "Jan. 9, 2025" (abbreviated month, dot, comma-separated) |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMM d, yyyy", Locale.ENGLISH); |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// |
|||
// // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00) |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; |
|||
// } |
|||
// } |
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 从文本中提取修改日期 |
|||
// String modifiedDateStr = extractModifiedDate(dateStr); |
|||
// if (modifiedDateStr == null) { |
|||
// throw new IllegalArgumentException("无法找到修改日期"); |
|||
// } |
|||
// |
|||
// // Parse "20/12/2024" (day/month/year format, Italian style) |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy", Locale.ITALIAN); |
|||
// LocalDate date = LocalDate.parse(modifiedDateStr, inputFormatter); |
|||
// |
|||
// // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00) |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; |
|||
// } |
|||
// } |
|||
public static String convertIsoToTimestamp(String dateStr) { |
|||
try { |
|||
// 解析 ISO 8601 格式的 UTC 时间为 Instant |
|||
Instant instant = Instant.parse(dateStr); |
|||
// 转为本地时间(系统默认时区),如果你不想转换时区,可以用 LocalDateTime.ofInstant |
|||
LocalDateTime localDateTime = LocalDateTime.ofInstant(instant, ZoneOffset.UTC); |
|||
// 定义输出格式 |
|||
DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
return localDateTime.format(outputFormatter); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
return null; |
|||
} |
|||
} |
|||
|
|||
// public static String convertToTimestamp(String dateStr) { |
|||
// try { |
|||
// // 创建捷克语的日期格式器,解析 "27. listopadu 2024" |
|||
// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d. MMMM yyyy", new Locale("cs", "CZ")); |
|||
// LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
// |
|||
// // 转换为 "yyyy-MM-dd HH:mm:ss" 格式,默认时间为 00:00:00 |
|||
// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// return date.atStartOfDay().format(outputFormatter); |
|||
// } catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
// return null; |
|||
// } |
|||
// } |
|||
// 提取修改日期的方法 |
|||
private static String extractModifiedDate(String text) { |
|||
String[] lines = text.split("\n"); |
|||
for (String line : lines) { |
|||
if (line.contains("Modificato")) { |
|||
// 提取 "Modificato" 后面的日期部分 |
|||
String[] parts = line.split("\\s+"); |
|||
for (String part : parts) { |
|||
if (part.matches("\\d{2}/\\d{2}/\\d{4}")) { |
|||
return part; // 返回 "20/12/2024" |
|||
} |
|||
} |
|||
} |
|||
} |
|||
return null; // 如果没找到修改日期,返回 null |
|||
} |
|||
// 调用本地代理服务获取代理地址 |
|||
private static String getProxyFromLocalService() throws Exception { |
|||
OkHttpClient client = new OkHttpClient(); |
|||
Request request = new Request.Builder() |
|||
.url("http://127.0.0.1:7897") |
|||
.get() |
|||
.build(); |
|||
|
|||
try (Response response = client.newCall(request).execute()) { |
|||
if (response.isSuccessful()) { |
|||
return response.body().string(); // 返回 JSON 字符串 |
|||
} else { |
|||
throw new Exception("获取代理失败,状态码: " + response.code()); |
|||
} |
|||
} |
|||
} |
|||
public static String getNextPageUrl(String currentUrl) { |
|||
if (currentUrl == null || currentUrl.trim().isEmpty()) { |
|||
return null; |
|||
} |
|||
|
|||
// // 定义基础 URL |
|||
// String baseUrl = "https://www.pasteur.dz/fr/espace-presse"; |
|||
// |
|||
// // 如果是基础 URL,默认第 1 页,下一页为 ?page=2 |
|||
// if (currentUrl.equals(baseUrl)) { |
|||
// return baseUrl + "?start=5"; |
|||
// } |
|||
|
|||
// 定义正则表达式,匹配 ?page=数字 |
|||
String regex = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=(\\d+)"; |
|||
Pattern pattern = Pattern.compile(regex); |
|||
Matcher matcher = pattern.matcher(currentUrl); |
|||
|
|||
// 如果找到 ?page=* |
|||
if (matcher.find()) { |
|||
// 提取页码(group(1) 是括号中的数字部分) |
|||
String pageNumStr = matcher.group(1); |
|||
try { |
|||
int currentPage = Integer.parseInt(pageNumStr); |
|||
// 替换旧页码为新页码 |
|||
return matcher.replaceFirst("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=" + (currentPage + 1)); |
|||
} catch (NumberFormatException e) { |
|||
return null; // 页码解析失败 |
|||
} |
|||
}else { |
|||
return null; |
|||
} |
|||
} |
|||
public static String getPreviousYearUrl(String url) { |
|||
if (url == null || url.trim().isEmpty()) { |
|||
return null; |
|||
} |
|||
|
|||
// 定义正则表达式匹配年份 |
|||
String yearRegex = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_year=(\\d{4})"; |
|||
Pattern yearPattern = Pattern.compile(yearRegex); |
|||
Matcher yearMatcher = yearPattern.matcher(url); |
|||
|
|||
// 如果找到年份 |
|||
if (yearMatcher.find()) { |
|||
String yearStr = yearMatcher.group(1); // 提取年份 |
|||
Integer currentYear = Integer.parseInt(yearStr); |
|||
Integer previousYear = currentYear - 1; // 计算上一年 |
|||
|
|||
// 替换年份 |
|||
url = yearMatcher.replaceFirst("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_year=" + previousYear); |
|||
} |
|||
|
|||
// 定义正则表达式匹配页码 |
|||
String pageRegex = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=(\\d+)"; |
|||
Pattern pagePattern = Pattern.compile(pageRegex); |
|||
Matcher pageMatcher = pagePattern.matcher(url); |
|||
|
|||
// 如果找到页码 |
|||
if (pageMatcher.find()) { |
|||
// 替换页码为 1 |
|||
return pageMatcher.replaceFirst("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=1"); |
|||
} else { |
|||
// 如果没有找到页码,默认页码为 1 |
|||
return url + "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=1"; |
|||
} |
|||
} |
|||
} |
|||
|
@ -0,0 +1,403 @@ |
|||
package com.example; |
|||
import com.fasterxml.jackson.databind.ObjectMapper; |
|||
import okhttp3.*; |
|||
import org.apache.kafka.clients.producer.KafkaProducer; |
|||
import org.apache.kafka.clients.producer.ProducerConfig; |
|||
import org.apache.kafka.clients.producer.ProducerRecord; |
|||
import org.apache.kafka.common.serialization.StringSerializer; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.IOException; |
|||
import java.nio.file.Files; |
|||
import java.nio.file.Paths; |
|||
import java.time.LocalDate; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.*; |
|||
import java.util.concurrent.ExecutorService; |
|||
import java.util.concurrent.Executors; |
|||
import java.util.concurrent.TimeUnit; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class projTopic { |
|||
/** Kafka topic that crawled award records are published to. */
private static final String TOPIC_NAME = "projTopic";

/** Kafka bootstrap server list. */
private static final String BOOTSTRAP_SERVERS = "node-01:19092";

/** Producer shared by all crawler tasks; created in the static initializer. */
private static KafkaProducer<String, String> producer;

/** JSON serializer for outgoing messages. */
private static ObjectMapper objectMapper = new ObjectMapper();

/** Source of the randomised delay between requests. */
private static final Random random = new Random();

/** Proxy pool: the lines of proxy.txt (parsed later as host:port). */
private static List<String> proxyList = new ArrayList<>();

/** Index into {@link #proxyList} of the proxy currently in use. */
private static int currentProxyIndex = 0;

static {
    producer = new KafkaProducer<>(producerSettings());
    loadProxyPool();
}

/** Builds the Kafka producer configuration: string serializers, acks=all, 3 retries. */
private static Properties producerSettings() {
    Properties settings = new Properties();
    settings.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
    settings.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
    settings.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
    settings.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all replicas to acknowledge
    settings.put(ProducerConfig.RETRIES_CONFIG, 3);
    return settings;
}

/** Reads proxy.txt into the proxy pool; on failure the pool stays empty and a message is logged. */
private static void loadProxyPool() {
    try {
        proxyList = Files.readAllLines(Paths.get("proxy.txt"));
        if (!proxyList.isEmpty()) {
            System.out.println("成功加载 " + proxyList.size() + " 个代理");
        } else {
            System.out.println("警告: proxy.txt 为空,未加载任何代理");
        }
    } catch (IOException e) {
        System.err.println("读取 proxy.txt 失败: " + e.getMessage());
    }
}
|||
|
|||
public static void main(String[] args) throws IOException, InterruptedException { |
|||
List<String> keywords = Files.readAllLines(Paths.get("keywords.txt")); |
|||
List<String> cleanedKeywords = new ArrayList<>(); |
|||
for (String keyword : keywords) { |
|||
String cleaned = keyword.split(",")[0].trim(); // 取逗号前的部分并去除首尾空格 |
|||
cleaned = cleaned.replaceAll("\\s+", "+"); // 替换所有空格为 + |
|||
cleanedKeywords.add(cleaned); |
|||
} |
|||
ExecutorService executor = Executors.newFixedThreadPool(4); // 4 个线程 |
|||
for (String keyword : cleanedKeywords) { |
|||
executor.submit(() -> { |
|||
try { |
|||
int sleepTime = random.nextInt(1001) + 30000; |
|||
String load = "5|0|20|https://www.nsf.gov/awardsearch/jsp/gwt/search/|57BE5CA45E781DC0159F727F8A8205EB|gov.nsf.research.awardsearch.gwt.client.SearchAwardService|getAwards|gov.nsf.research.awardsearch.gwt.bean.SearchRequestBean/3930579236|com.extjs.gxt.ui.client.data.PagingLoadConfig|java.util.HashMap/962170901|java.lang.String/2004016611|QueryText|" + keyword + "|ActiveAwards|true|com.extjs.gxt.ui.client.data.BasePagingLoadConfig/2011366567|com.extjs.gxt.ui.client.data.RpcMap/3441186752|sortField|sortDir|com.extjs.gxt.ui.client.Style$SortDir/640452531|offset|java.lang.Integer/3438268394|limit|1|2|3|4|2|5|6|5|7|2|8|9|8|10|8|11|8|12|13|0|1|14|4|15|0|16|17|0|18|19|0|20|19|30|"; |
|||
for(int i=0;;i++){ |
|||
OkHttpClient client = createClientWithProxy(); |
|||
MediaType mediaType = MediaType.parse("text/x-gwt-rpc; charset=UTF-8"); |
|||
RequestBody body = RequestBody.create(mediaType, load); |
|||
|
|||
Request request = new Request.Builder() |
|||
.url("https://www.nsf.gov/awardsearch/jsp/gwt/search/.searchaward") |
|||
.method("POST", body) |
|||
.addHeader("Content-Type", "text/x-gwt-rpc; charset=UTF-8") |
|||
.addHeader("X-GWT-Module-Base", "https://www.nsf.gov/awardsearch/jsp/gwt/search/") |
|||
.addHeader("X-GWT-Permutation", "368C3CF86AA4CD7DB2250B35B844C1C2") |
|||
// .addHeader("cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB") |
|||
.build(); |
|||
Response response = executeWithRetry(client, request, keyword); |
|||
String content = response.body().string(); |
|||
|
|||
Pattern pattern = Pattern.compile("\"awdNumber\",\"(\\d+)\""); |
|||
Matcher matcher = pattern.matcher(content); |
|||
|
|||
List<String> numbers = new ArrayList<>(); // 用于存储匹配的数字 |
|||
// 查找并提取数字 |
|||
List<String> additionalNumbers = new ArrayList<>(); |
|||
List<String> urls = new ArrayList<>(); |
|||
// 查找匹配项 |
|||
while (matcher.find()) { |
|||
// 获取捕获到的数字,并将其添加到列表中 |
|||
numbers.add(matcher.group(1)); |
|||
} |
|||
|
|||
// 输出捕获到的数字 |
|||
if (numbers.isEmpty()) { |
|||
System.out.println("没找到awdNumber,继续下一种查找"); |
|||
|
|||
} else { |
|||
for (String number : numbers) { |
|||
additionalNumbers.add(number); |
|||
} |
|||
} |
|||
|
|||
Pattern additionalPattern = Pattern.compile("\"[^\"]+\",\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\"(?:,\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\")?,\"(\\d+)\""); |
|||
Matcher additionalMatcher = additionalPattern.matcher(content); |
|||
|
|||
|
|||
while (additionalMatcher.find()) { |
|||
additionalNumbers.add(additionalMatcher.group(1)); |
|||
} |
|||
if (additionalNumbers.isEmpty()) { |
|||
System.out.println("没找到下一页内容链接"); |
|||
Thread.sleep(sleepTime); |
|||
break; |
|||
} else { |
|||
for (String number : additionalNumbers) { |
|||
String url = "https://www.nsf.gov/awardsearch/showAward?AWD_ID=" + number + "&HistoricalAwards=false"; |
|||
urls.add(url); |
|||
} |
|||
} |
|||
if (!urls.isEmpty() && urls.get(0).equals("https://www.nsf.gov/awardsearch/showAward?AWD_ID=2446604&HistoricalAwards=false")) { |
|||
System.out.println("第一个 URL 是 AWD_ID=2446604,跳过关键词: " + keyword); |
|||
Thread.sleep(sleepTime); |
|||
return; // 跳出当前任务,处理下一个关键词 |
|||
} |
|||
for(String url:urls){ |
|||
OkHttpClient client2 = createClientWithProxy(); |
|||
MediaType mediaType2 = MediaType.parse("text/plain"); |
|||
RequestBody body2 = RequestBody.create(mediaType2, ""); |
|||
Request request2 = new Request.Builder() |
|||
.url(url) |
|||
.get() |
|||
// .addHeader("Cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB") |
|||
.build(); |
|||
Response response2 = executeWithRetry(client2, request2, keyword); |
|||
System.out.println(response2.code()); |
|||
String html = response2.body().string(); |
|||
Document parse = Jsoup.parse(html); |
|||
String title = parse.select(".pageheadline").text(); |
|||
String projectNum = parse.select(".clear tr:nth-child(5) .tabletext2:nth-child(2)").text(); |
|||
String projectLeader = parse.select(".clear tr:nth-child(13) .tabletext2:nth-child(2)").text(); |
|||
String projectStartTime = convertToTimestamp(parse.select(".clear tr:nth-child(8) .tabletext2:nth-child(2)").text()); |
|||
String projectEndTime = convertToTimestamp2(parse.select(".clear tr:nth-child(9) .tabletext2:nth-child(2)").text()); |
|||
String sponsorPart = parse.select(".clear tr:nth-child(2) .tabletext2:nth-child(2)").text(); |
|||
String country = "USA"; |
|||
String brief = parse.select(".clear.margintop25 span").text(); |
|||
String sponsor = parse.select(".clear tr:nth-child(1) .tabletext2:nth-child(2)").text(); |
|||
String projectFunding = parse.select(".clear tr:nth-child(12) .tabletext2:nth-child(2)").text(); |
|||
String relatedProject = parse.select(".clear tr:nth-child(20) .tabletext2:nth-child(2)").text(); |
|||
|
|||
|
|||
|
|||
String awardInstrument = parse.select(".clear tr:nth-child(6) .tabletext2:nth-child(2)").text(); |
|||
String programManager = parse.select(".clear tr:nth-child(7) .tabletext2:nth-child(2)").text(); |
|||
String totalIntendedAwardAmount = parse.select(".clear tr:nth-child(10) .tabletext2:nth-child(2)").text(); |
|||
String totalAwardedAmountToDate = parse.select(".clear tr:nth-child(11) .tabletext2:nth-child(2)").text(); |
|||
String recipientSponsoredResearchOffice = parse.select(".clear tr:nth-child(14) .tabletext2:nth-child(2)").text(); |
|||
String sponsorCongressionalDistrict = parse.select(".clear tr:nth-child(15) .tabletext2:nth-child(2)").text(); |
|||
String primaryPlaceOfPerformance = parse.select(".clear tr:nth-child(16) .tabletext2:nth-child(2)").text(); |
|||
String primaryPlaceOfPerformanceCongressionalDistrict = parse.select(".clear tr:nth-child(17) .tabletext2:nth-child(2)").text(); |
|||
String uniqueEntityIdentifier = parse.select(".clear tr:nth-child(18) .tabletext2:nth-child(2)").text(); |
|||
String parentUEI = parse.select(".clear tr:nth-child(19) .tabletext2:nth-child(2)").text(); |
|||
String primaryProgramSource = parse.select(".clear tr:nth-child(21) .tabletext2:nth-child(2)").text(); |
|||
String programReferenceCode = parse.select(".clear tr:nth-child(22) .tabletext2:nth-child(2)").text(); |
|||
String programElementCode = parse.select(".clear tr:nth-child(23) .tabletext2:nth-child(2)").text(); |
|||
String awardAgencyCode = parse.select(".clear tr:nth-child(24) .tabletext2:nth-child(2)").text(); |
|||
String fundAgencyCode = parse.select(".clear tr:nth-child(25) .tabletext2:nth-child(2)").text(); |
|||
String assistanceListingNumber = parse.select(".clear tr:nth-child(26) .tabletext2:nth-child(2)").text(); |
|||
String initialAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(3) .tabletext2:nth-child(2)").text()); |
|||
String latestAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(4) .tabletext2:nth-child(2)").text()); |
|||
|
|||
List<Map<String, Object>> citations = extractAllCitationInfo(html); |
|||
Map<String,Object> data = new HashMap<>(); |
|||
data.put("title",title); |
|||
data.put("projectNum",projectNum); |
|||
data.put("projectLeader",projectLeader); |
|||
data.put("projectStartTime",projectStartTime); |
|||
data.put("projectEndTime",projectEndTime); |
|||
data.put("sponsorPart",sponsorPart); |
|||
data.put("country",country); |
|||
data.put("brief",brief); |
|||
data.put("sponsor",sponsor); |
|||
data.put("projectFunding",projectFunding); |
|||
data.put("relatedProject",relatedProject); |
|||
data.put("awardInstrument",awardInstrument); |
|||
data.put("programManager",programManager); |
|||
data.put("totalIntendedAwardAmount",totalIntendedAwardAmount); |
|||
data.put("totalAwardedAmountToDate",totalAwardedAmountToDate); |
|||
data.put("recipientSponsoredResearchOffice",recipientSponsoredResearchOffice); |
|||
data.put("sponsorCongressionalDistrict",sponsorCongressionalDistrict); |
|||
data.put("primaryPlaceOfPerformance",primaryPlaceOfPerformance); |
|||
data.put("primaryPlaceOfPerformanceCongressionalDistrict",primaryPlaceOfPerformanceCongressionalDistrict); |
|||
data.put("uniqueEntityIdentifier",uniqueEntityIdentifier); |
|||
data.put("parentUEI",parentUEI); |
|||
data.put("primaryProgramSource",primaryProgramSource); |
|||
data.put("programReferenceCode",programReferenceCode); |
|||
data.put("programElementCode",programElementCode); |
|||
data.put("awardAgencyCode",awardAgencyCode); |
|||
data.put("fundAgencyCode",fundAgencyCode); |
|||
data.put("assistanceListingNumber",assistanceListingNumber); |
|||
data.put("publications",citations); |
|||
data.put("initialAmendmentDate",initialAmendmentDate); |
|||
data.put("latestAmendmentDate",latestAmendmentDate); |
|||
data.put("crawlUrl",url); |
|||
data.put("crawlTime",localDateTime()); |
|||
Map<String,Object> result = new HashMap<>(); |
|||
result.put("keyword",keyword); |
|||
result.put("data",data); |
|||
try { |
|||
String jsonValue = objectMapper.writeValueAsString(result); |
|||
ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, projectNum, jsonValue); |
|||
|
|||
producer.send(record, (metadata, exception) -> { |
|||
if (exception == null) { |
|||
System.out.println("成功发送到Kafka - Partition: " + metadata.partition() + |
|||
", Offset: " + metadata.offset()); |
|||
} else { |
|||
System.err.println("发送到Kafka失败: " + exception.getMessage()); |
|||
} |
|||
}); |
|||
} catch (Exception e) { |
|||
System.err.println("序列化或发送Kafka消息失败: " + e.getMessage()); |
|||
|
|||
} |
|||
|
|||
Thread.sleep(sleepTime); |
|||
} |
|||
load = increaseOffsetBy30(load); |
|||
} |
|||
|
|||
} catch (Exception e) { |
|||
System.err.println("处理 " + keyword + " 失败: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
}); |
|||
} |
|||
executor.shutdown(); |
|||
executor.awaitTermination(5, TimeUnit.HOURS); |
|||
producer.close(); |
|||
} |
|||
|
|||
public static String convertToTimestamp(String dateStr) { |
|||
try { |
|||
// Parse "Jan. 9, 2025" (abbreviated month, dot, comma-separated) |
|||
DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM d, yyyy", Locale.ENGLISH); |
|||
LocalDate date = LocalDate.parse(dateStr, inputFormatter); |
|||
|
|||
// Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00) |
|||
DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
return date.atStartOfDay().format(outputFormatter); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
return null; |
|||
} |
|||
|
|||
} |
|||
public static String convertToTimestamp2(String dateStr) { |
|||
try { |
|||
// 移除 "(Estimated)" 部分 |
|||
String cleanDateStr = dateStr.replace(" (Estimated)", "").trim(); |
|||
|
|||
// Parse "June 30, 2025" (full month, day, comma-separated year) |
|||
DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM d, yyyy", Locale.ENGLISH); |
|||
LocalDate date = LocalDate.parse(cleanDateStr, inputFormatter); |
|||
|
|||
// Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00) |
|||
DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
return date.atStartOfDay().format(outputFormatter); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
return null; |
|||
} |
|||
} |
|||
public static List<Map<String, Object>> extractAllCitationInfo(String html) { |
|||
Document doc = Jsoup.parse(html); |
|||
List<Map<String, Object>> citations = new ArrayList<>(); |
|||
|
|||
// 选择所有 margintop15 |
|||
Elements marginDivs = doc.select(".margintop15"); |
|||
Pattern urlPattern = Pattern.compile("javascript:popwin\\('(.*?)'\\)"); |
|||
|
|||
for (Element div : marginDivs) { |
|||
Map<String, Object> info = new HashMap<>(); |
|||
|
|||
// 提取 span 中的文本 |
|||
Elements spans = div.select("> span"); |
|||
if (spans.size() >= 3) { |
|||
info.put("authors", spans.get(0).text()); |
|||
info.put("title", spans.get(1).text()); |
|||
info.put("year", spans.get(2).text()); |
|||
} |
|||
|
|||
// 提取链接 |
|||
Elements links = div.select("a"); |
|||
String doiUrl = ""; |
|||
String citationUrl = ""; |
|||
for (Element link : links) { |
|||
String href = link.attr("href"); |
|||
Matcher matcher = urlPattern.matcher(href); |
|||
if (matcher.find()) { |
|||
String url = matcher.group(1); |
|||
if (link.text().contains("doi.org") && doiUrl.isEmpty()) { |
|||
doiUrl = url; |
|||
} else if (link.text().contains("引用详细信息") && citationUrl.isEmpty()) { |
|||
citationUrl = url; |
|||
} |
|||
} |
|||
} |
|||
info.put("doiUrl", doiUrl); |
|||
info.put("citationUrl", citationUrl); |
|||
|
|||
// 添加到结果列表 |
|||
citations.add(info); |
|||
} |
|||
|
|||
return citations; |
|||
} |
|||
public static String localDateTime(){ |
|||
LocalDateTime dateTime = LocalDateTime.now(); |
|||
|
|||
// 创建 DateTimeFormatter,定义日期时间的格式 |
|||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
|
|||
// 使用 formatter 格式化 LocalDateTime |
|||
String formattedDateTime = dateTime.format(formatter); |
|||
|
|||
return formattedDateTime; // 输出类似: 2025-04-08 13:45:30 |
|||
} |
|||
public static String increaseOffsetBy30(String originalPayload) { |
|||
// 以 "|" 分割载荷为数组 |
|||
String[] parts = originalPayload.split("\\|"); |
|||
|
|||
// 检查数组长度,确保有足够元素 |
|||
if (parts.length < 4) { |
|||
throw new IllegalArgumentException("载荷格式无效,元素不足"); |
|||
} |
|||
|
|||
// 找到倒数第 4 个元素的位置 |
|||
int targetIndex = parts.length - 4; |
|||
|
|||
try { |
|||
// 将倒数第 4 个数字解析为整数 |
|||
int currentOffset = Integer.parseInt(parts[targetIndex]); |
|||
// 增加 30 |
|||
int newOffset = currentOffset + 30; |
|||
// 将新值放回数组 |
|||
parts[targetIndex] = String.valueOf(newOffset); |
|||
|
|||
// 重新拼接载荷 |
|||
return String.join("|", parts); |
|||
} catch (NumberFormatException e) { |
|||
throw new IllegalArgumentException("倒数第 4 个元素不是有效数字: " + parts[targetIndex]); |
|||
} |
|||
} |
|||
private static Response executeWithRetry(OkHttpClient client, Request request, String keyword) throws IOException { |
|||
int maxRetries = proxyList.isEmpty() ? 1 : proxyList.size(); // 如果没有代理,只尝试一次 |
|||
int attempt = 0; |
|||
|
|||
while (attempt < maxRetries) { |
|||
Response response = client.newCall(request).execute(); |
|||
if (response.code() == 403) { |
|||
System.out.println("收到 403 状态码,尝试切换代理重试..."); |
|||
response.close(); |
|||
switchProxy(); |
|||
client = createClientWithProxy(); // 使用新代理重建客户端 |
|||
attempt++; |
|||
if (attempt == maxRetries) { |
|||
throw new IOException("所有代理尝试失败,仍然收到 403"); |
|||
} |
|||
continue; |
|||
} |
|||
return response; // 成功或非 403 状态码,直接返回 |
|||
} |
|||
throw new IOException("无法执行请求,未获取响应"); |
|||
} |
|||
private static OkHttpClient createClientWithProxy() { |
|||
OkHttpClient.Builder builder = new OkHttpClient().newBuilder() |
|||
.connectTimeout(30, TimeUnit.SECONDS) |
|||
.readTimeout(30, TimeUnit.SECONDS) |
|||
.writeTimeout(30, TimeUnit.SECONDS); |
|||
|
|||
if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) { |
|||
String proxy = proxyList.get(currentProxyIndex); |
|||
String[] proxyParts = proxy.split(":"); |
|||
if (proxyParts.length == 2) { |
|||
String proxyHost = proxyParts[0]; |
|||
int proxyPort = Integer.parseInt(proxyParts[1]); |
|||
builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, |
|||
new java.net.InetSocketAddress(proxyHost, proxyPort))); |
|||
System.out.println("使用代理: " + proxy); |
|||
} |
|||
} |
|||
return builder.build(); |
|||
} |
|||
private static synchronized void switchProxy() { |
|||
if (proxyList.isEmpty()) return; |
|||
currentProxyIndex = (currentProxyIndex + 1) % proxyList.size(); |
|||
System.out.println("切换到新代理: " + proxyList.get(currentProxyIndex)); |
|||
} |
|||
} |
@ -0,0 +1,122 @@ |
|||
package com.example; |
|||
|
|||
import co.elastic.clients.elasticsearch.ElasticsearchClient; |
|||
import co.elastic.clients.elasticsearch.core.IndexRequest; |
|||
import co.elastic.clients.elasticsearch.core.IndexResponse; |
|||
import co.elastic.clients.json.jackson.JacksonJsonpMapper; |
|||
import co.elastic.clients.transport.ElasticsearchTransport; |
|||
import co.elastic.clients.transport.rest_client.RestClientTransport; |
|||
import org.apache.http.HttpHost; |
|||
import org.apache.kafka.clients.consumer.ConsumerConfig; |
|||
import org.apache.kafka.clients.consumer.ConsumerRecord; |
|||
import org.apache.kafka.clients.consumer.ConsumerRecords; |
|||
import org.apache.kafka.clients.consumer.KafkaConsumer; |
|||
import org.apache.kafka.common.serialization.StringDeserializer; |
|||
import org.elasticsearch.client.RestClient; |
|||
|
|||
import java.io.IOException; |
|||
import java.time.Duration; |
|||
import java.util.Collections; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
import java.util.Properties; |
|||
|
|||
public class saveInES { |
|||
public static void main(String[] args) { |
|||
ElasticsearchClient esClient = createElasticsearchClient(); |
|||
Properties properties = new Properties(); |
|||
properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); |
|||
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); |
|||
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); |
|||
properties.put(ConsumerConfig.GROUP_ID_CONFIG, "news-consumer-group"); |
|||
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); |
|||
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); // 关闭自动提交偏移量 |
|||
// 创建 Kafka 消费者 |
|||
KafkaConsumer<String, String> consumer = new KafkaConsumer<>(properties); |
|||
|
|||
// 订阅主题 |
|||
String topic = "news-topic"; // Kafka 主题 |
|||
consumer.subscribe(Collections.singletonList(topic)); |
|||
|
|||
// 消费消息 |
|||
try { |
|||
while (true) { |
|||
// 拉取消息 |
|||
ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(1000)); |
|||
|
|||
// 处理消息 |
|||
for (ConsumerRecord<String, String> record : records) { |
|||
System.out.println("Received message: key=" + record.key() + ", value=" + record.value()); |
|||
|
|||
// 将消息保存到 Elasticsearch |
|||
saveToElasticsearch(esClient, record.value()); |
|||
} |
|||
consumer.commitSync(); |
|||
} |
|||
} finally { |
|||
// 关闭消费者 |
|||
consumer.close(); |
|||
try { |
|||
esClient._transport().close(); |
|||
} catch (IOException e) { |
|||
System.err.println("Error closing Elasticsearch client: " + e.getMessage()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 初始化 Elasticsearch 客户端 |
|||
*/ |
|||
private static ElasticsearchClient createElasticsearchClient() { |
|||
RestClient restClient = RestClient.builder(new HttpHost("localhost", 9200)).build(); |
|||
ElasticsearchTransport transport = new RestClientTransport(restClient, new JacksonJsonpMapper()); |
|||
return new ElasticsearchClient(transport); |
|||
} |
|||
|
|||
/** |
|||
* 将消息保存到 Elasticsearch |
|||
* |
|||
* @param esClient Elasticsearch 客户端 |
|||
* @param message 消息内容(JSON 格式) |
|||
*/ |
|||
private static void saveToElasticsearch(ElasticsearchClient esClient, String message) { |
|||
try { |
|||
// 将消息解析为 Map(假设消息是 JSON 格式) |
|||
Map<String, String> news = parseMessageToMap(message); |
|||
String docId = news.get("url"); |
|||
// 创建索引请求 |
|||
IndexRequest<Map<String, String>> request = IndexRequest.of(b -> b |
|||
.index("news") // 索引名称 |
|||
.id(docId) |
|||
.document(news) // 要保存的数据 |
|||
); |
|||
|
|||
// 执行索引请求 |
|||
IndexResponse response = esClient.index(request); |
|||
System.out.println("Data saved to Elasticsearch: ID=" + response.id()); |
|||
} catch (Exception e) { |
|||
System.err.println("Failed to save data to Elasticsearch: " + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 将消息解析为 Map |
|||
* |
|||
* @param message 消息内容(JSON 格式) |
|||
* @return 解析后的 Map |
|||
*/ |
|||
private static Map<String, String> parseMessageToMap(String message) { |
|||
// 这里假设消息是 JSON 格式,例如:{"title":"...", "date":"...", "content":"...", "url":"..."} |
|||
// 可以使用 JSON 库(如 Jackson)解析消息 |
|||
// 这里简单地将消息按逗号分割并转换为 Map |
|||
Map<String, String> map = new HashMap<>(); |
|||
String[] pairs = message.replace("{", "").replace("}", "").split(","); |
|||
for (String pair : pairs) { |
|||
String[] keyValue = pair.split("="); |
|||
if (keyValue.length == 2) { |
|||
map.put(keyValue[0].trim(), keyValue[1].trim()); |
|||
} |
|||
} |
|||
return map; |
|||
} |
|||
} |
@ -0,0 +1,101 @@ |
|||
package com.example;// 注意:如果你使用手动设置路径,就不需要导入 WebDriverManager 了 |
|||
// import io.github.bonigarcia.wdm.WebDriverManager; |
|||
|
|||
import org.openqa.selenium.By; |
|||
import org.openqa.selenium.WebDriver; |
|||
import org.openqa.selenium.WebElement; |
|||
import org.openqa.selenium.chrome.ChromeDriver; |
|||
import org.openqa.selenium.chrome.ChromeOptions; |
|||
import org.openqa.selenium.support.ui.WebDriverWait; |
|||
import org.openqa.selenium.support.ui.ExpectedConditions; |
|||
import org.openqa.selenium.NoSuchElementException; |
|||
import org.openqa.selenium.TimeoutException; |
|||
|
|||
import java.time.Duration; |
|||
import java.util.List; |
|||
|
|||
public class test { // 更改类名以示区别 |
|||
|
|||
public static void main(String[] args) { |
|||
// 手动设置 ChromeDriver 的路径 (如果你选择手动方式的话) |
|||
// *** 将这里的路径替换为你实际的 chromedriver.exe 路径 *** |
|||
System.setProperty("webdriver.chrome.driver", "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe"); |
|||
|
|||
// 如果你选择使用 WebDriverManager,则使用以下代码替代上面的 System.setProperty: |
|||
// import io.github.bonigarcia.wdm.WebDriverManager; |
|||
// WebDriverManager.chromedriver().setup(); |
|||
|
|||
|
|||
WebDriver driver = null; |
|||
|
|||
try { |
|||
// 配置 Chrome 选项 (可选) |
|||
ChromeOptions options = new ChromeOptions(); |
|||
// options.addArguments("--headless"); // 启用无头模式 |
|||
// options.addArguments("--disable-gpu"); |
|||
|
|||
// 初始化 WebDriver |
|||
driver = new ChromeDriver(options); |
|||
|
|||
// 直接打开包含搜索条件的 URL |
|||
// 注意这里使用的 URL 已经包含了查询参数 |
|||
driver.get("https://patentscope.wipo.int/search/en/result.jsf?query=FP:(AI)"); |
|||
|
|||
// 设置一个显式等待 |
|||
WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(20)); |
|||
|
|||
// 由于直接打开了结果页,我们不再需要等待搜索框和点击按钮 |
|||
// 直接等待搜索结果列表加载 |
|||
// *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** |
|||
// "div.ps-result-list" 是一个可能的 CSS 选择器示例,你需要根据实际页面确认 |
|||
wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.ps-result-list"))); |
|||
|
|||
// --- 在这里添加提取搜索结果的代码 --- |
|||
// *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** |
|||
List<WebElement> resultItems = driver.findElements(By.cssSelector("div.ps-result-item")); // 定位每个结果项 |
|||
|
|||
System.out.println("Found " + resultItems.size() + " results:"); |
|||
|
|||
for (WebElement resultItem : resultItems) { |
|||
try { |
|||
// 提取标题 (示例选择器) |
|||
// *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** |
|||
WebElement titleElement = resultItem.findElement(By.cssSelector("span.ps-field-value.ps-field-title")); |
|||
String title = titleElement.getText().trim(); |
|||
|
|||
// 提取链接 (示例选择器) |
|||
// *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** |
|||
WebElement linkElement = resultItem.findElement(By.tagName("a")); |
|||
String patentLink = linkElement.getAttribute("href"); |
|||
|
|||
|
|||
System.out.println("Title: " + title + ", Link: " + patentLink); |
|||
|
|||
} catch (NoSuchElementException e) { |
|||
System.out.println("Could not find title or link for a result item in this result item."); |
|||
continue; |
|||
} |
|||
} |
|||
|
|||
// --- 处理分页(如果需要)--- |
|||
// 这部分逻辑与之前相同,你需要找到下一页按钮的定位器并实现循环点击和等待 |
|||
// 尽管是直接打开结果页,如果结果有多页,你仍然需要处理分页来获取所有结果。 |
|||
// ... |
|||
|
|||
|
|||
} catch (TimeoutException e) { |
|||
System.err.println("等待元素超时,可能页面结构发生变化或加载缓慢: " + e.getMessage()); |
|||
} catch (NoSuchElementException e) { |
|||
System.err.println("未能找到指定的元素,请检查元素定位器是否正确: " + e.getMessage()); |
|||
} catch (Exception e) { |
|||
System.err.println("发生其他错误: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} finally { |
|||
// 关闭浏览器 |
|||
if (driver != null) { |
|||
driver.quit(); |
|||
System.out.println("Browser closed."); |
|||
} |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,103 @@ |
|||
package com.example; |
|||
|
|||
import okhttp3.*; |
|||
import org.joda.time.DateTime; |
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
|
|||
import java.io.IOException; |
|||
import java.text.ParseException; |
|||
import java.text.SimpleDateFormat; |
|||
import java.time.LocalDateTime; |
|||
import java.time.format.DateTimeFormatter; |
|||
import java.util.Date; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
public class testContent { |
|||
public static void main(String[] args) throws IOException { |
|||
String url = "https://www.drks.de/search/de/trial/DRKS00036725/details"; |
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.build(); |
|||
MediaType mediaType = MediaType.parse("application/json"); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.get() |
|||
// .addHeader("Cookie", "JSESSIONID=F6B6320CBBC2A27482AEFC0EC641EBF8; JSESSIONID=D9A5D49C09D091E9791733727D8AF2F1") |
|||
.addHeader("Content-Type", "application/json") |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Document parse = Jsoup.parse(html); |
|||
String title = parse.select(".title-bold").text(); |
|||
String registNum = parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(2)").text(); |
|||
String registTime = convertDate(parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(6)").text()); |
|||
Map<String,Object> sponsor = new HashMap<>(); |
|||
String header = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-header > h4").text(); |
|||
String site = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(2) > div").text(); |
|||
String telefon = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(4) > span").text(); |
|||
String disease = parse.select("body > main > div.card-body > div:nth-child(6) > div.card-body > div > div:nth-child(2) > dl > dd:nth-child(2) > span").text(); |
|||
String studyType = parse.select("body > main > div.card-body > div:nth-child(3) > div.card-body > dl").text(); |
|||
String inclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(3) > div > div.card-body > div > div.col-12.mt-3 > dl > dd > span").text(); |
|||
String exclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(4) > div > div.card-body > p > span").text(); |
|||
String country = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(1) > div > div.card-body > dl > dd:nth-child(2)").text(); |
|||
String intervention = parse.select("body > main > div.card-body > div:nth-child(4) > div.card-body > dl").text(); |
|||
String primaryOutcome = parse.select("body > main > div.card-body > div:nth-child(5) > div.card-body > div > div > dl").text(); |
|||
String enrollment = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(2) > div > div.card-body > div > div:nth-child(5) > dl > dd > span").text(); |
|||
sponsor.put("header",header); |
|||
sponsor.put("site",site); |
|||
sponsor.put("telefon",telefon); |
|||
Map<String,Object> resultData = new HashMap<>(); |
|||
resultData.put("title", title); |
|||
resultData.put("registNum",registNum); |
|||
resultData.put("registTime",registTime); |
|||
resultData.put("registStatus","无"); |
|||
resultData.put("registTitle","无"); |
|||
resultData.put("fullTitle","无"); |
|||
resultData.put("sponsor",sponsor); |
|||
resultData.put("sponsorPart","无"); |
|||
resultData.put("studyType",studyType); |
|||
resultData.put("phase","无"); |
|||
resultData.put("disease",disease); |
|||
resultData.put("studyDesign","无"); |
|||
resultData.put("studyObjective","无"); |
|||
resultData.put("studyStartDate","无"); |
|||
resultData.put("inclusionCriteria",inclusionCriteria); |
|||
resultData.put("exclusionCriteria",exclusionCriteria); |
|||
resultData.put("currentStatus","无"); |
|||
resultData.put("enrollment",enrollment); |
|||
resultData.put("country",country); |
|||
resultData.put("tagTime","无"); |
|||
resultData.put("intervention",intervention); |
|||
resultData.put("primaryOutcome",primaryOutcome); |
|||
resultData.put("crawlTime",getCurrentTime()); |
|||
resultData.put("crawlUrl",url); |
|||
resultData.put("postTime",registTime); |
|||
resultData.put("content","content"); |
|||
resultData.put("forwardcontent","forwardcontent"); |
|||
System.out.println(resultData); |
|||
} |
|||
public static String convertDate(String inputDate) { |
|||
try { |
|||
// 输入格式:dd.MM.yyyy |
|||
SimpleDateFormat inputFormat = new SimpleDateFormat("dd.MM.yyyy"); |
|||
// 解析输入日期 |
|||
Date date = inputFormat.parse(inputDate); |
|||
// 输出格式:yyyy-MM-dd HH:mm:ss |
|||
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); |
|||
// 转换为目标格式 |
|||
return outputFormat.format(date); |
|||
} catch (ParseException e) { |
|||
// 处理解析异常 |
|||
return "Invalid date format"; |
|||
} |
|||
} |
|||
public static String getCurrentTime() { |
|||
// 创建 DateTimeFormatter,指定输出格式 |
|||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); |
|||
// 获取当前时间 |
|||
LocalDateTime now = LocalDateTime.now(); |
|||
// 格式化 |
|||
return now.format(formatter); |
|||
} |
|||
} |
@ -0,0 +1,340 @@ |
|||
package com.example; |
|||
|
|||
import org.jsoup.Jsoup; |
|||
import org.jsoup.nodes.Document; |
|||
import org.jsoup.nodes.Element; |
|||
import org.jsoup.select.Elements; |
|||
|
|||
import java.io.BufferedReader; |
|||
import java.io.InputStreamReader; |
|||
import java.io.OutputStream; |
|||
import java.net.HttpURLConnection; |
|||
import java.net.URL; |
|||
import java.net.URLEncoder; |
|||
import java.nio.charset.StandardCharsets; |
|||
import java.util.HashSet; |
|||
import java.util.List; |
|||
import java.util.Map; |
|||
import java.util.Set; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
public class testList { |
|||
public static void main(String[] args) throws Exception { |
|||
String targetUrl = "https://www.drks.de/search/de/results?page=4"; |
|||
String baseUrl = "https://www.drks.de/search/de"; |
|||
String hostUrl = "https://www.drks.de"; |
|||
String cleanUrl = targetUrl.split("\\?")[0]; |
|||
System.out.println("Pure URL: " + cleanUrl); |
|||
|
|||
|
|||
String pageNumber = targetUrl.contains("?page=") ? targetUrl.split("page=")[1] : "1"; |
|||
int page = Integer.parseInt(pageNumber); |
|||
System.out.println("Page Number: " + page); |
|||
|
|||
// 存储 cookies |
|||
Set<String> cookieSet = new HashSet<>(); |
|||
String sessionId = null; |
|||
|
|||
// 第一步:初始 GET 请求,获取 cookies 和 ViewState |
|||
URL initialUrl = new URL(baseUrl); |
|||
HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); |
|||
initialConn.setRequestMethod("GET"); |
|||
initialConn.setInstanceFollowRedirects(false); |
|||
initialConn.setConnectTimeout(10000); |
|||
initialConn.setReadTimeout(10000); |
|||
|
|||
// 捕获 cookies |
|||
sessionId = updateCookies(initialConn, cookieSet); |
|||
System.out.println("Initial Cookies: " + cookieSet); |
|||
System.out.println("Initial Session ID: " + sessionId); |
|||
|
|||
// 读取响应内容以获取 ViewState |
|||
BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); |
|||
StringBuilder content = new StringBuilder(); |
|||
String inputLine; |
|||
while ((inputLine = in.readLine()) != null) { |
|||
content.append(inputLine); |
|||
} |
|||
in.close(); |
|||
initialConn.disconnect(); |
|||
|
|||
// 提取初始 ViewState |
|||
String initialViewState = extractViewState(content.toString()); |
|||
System.out.println("Initial ViewState: " + initialViewState); |
|||
|
|||
// 第二步:发送搜索 POST 请求 |
|||
HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection(); |
|||
searchConn.setRequestMethod("POST"); |
|||
searchConn.setInstanceFollowRedirects(false); |
|||
searchConn.setDoOutput(true); |
|||
searchConn.setConnectTimeout(10000); |
|||
searchConn.setReadTimeout(10000); |
|||
|
|||
// 设置搜索请求的请求头 |
|||
searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
searchConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
searchConn.setRequestProperty("Origin", "https://www.drks.de"); |
|||
searchConn.setRequestProperty("Referer", baseUrl); |
|||
searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
// 构建搜索请求的 POST 数据 |
|||
String searchPostData = buildSearchPostData(initialViewState); |
|||
System.out.println("Search POST Data: " + searchPostData); |
|||
|
|||
// 发送搜索 POST 请求 |
|||
try (OutputStream os = searchConn.getOutputStream()) { |
|||
byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input, 0, input.length); |
|||
} |
|||
|
|||
// 更新 cookies |
|||
String searchSessionId = updateCookies(searchConn, cookieSet); |
|||
System.out.println("Search Cookies: " + cookieSet); |
|||
System.out.println("Search Session ID: " + searchSessionId); |
|||
|
|||
// 处理搜索响应 |
|||
int searchResponseCode = searchConn.getResponseCode(); |
|||
System.out.println("Search Response Code: " + searchResponseCode); |
|||
String redirectUrl = searchConn.getHeaderField("Location"); |
|||
searchConn.disconnect(); |
|||
|
|||
if (searchResponseCode != 302 || redirectUrl == null) { |
|||
System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode); |
|||
return; |
|||
} |
|||
System.out.println("Redirect URL (raw): " + redirectUrl); |
|||
|
|||
// 解析相对 URL |
|||
if (!redirectUrl.startsWith("http")) { |
|||
redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? redirectUrl : "/" + redirectUrl); |
|||
} |
|||
System.out.println("Resolved Redirect URL: " + redirectUrl); |
|||
|
|||
// 第三步:跟随重定向(使用 GET 请求) |
|||
URL resultsUrl = new URL(redirectUrl); |
|||
HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection(); |
|||
resultsConn.setRequestMethod("GET"); |
|||
resultsConn.setInstanceFollowRedirects(false); |
|||
resultsConn.setConnectTimeout(10000); |
|||
resultsConn.setReadTimeout(10000); |
|||
resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64ек; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
// 更新 cookies |
|||
String resultsSessionId = updateCookies(resultsConn, cookieSet); |
|||
System.out.println("Results Cookies: " + cookieSet); |
|||
System.out.println("Results Session ID: " + resultsSessionId); |
|||
|
|||
// 读取重定向后的结果页面内容 |
|||
BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream())); |
|||
StringBuilder resultsContent = new StringBuilder(); |
|||
while ((inputLine = resultsReader.readLine()) != null) { |
|||
resultsContent.append(inputLine); |
|||
} |
|||
resultsReader.close(); |
|||
resultsConn.disconnect(); |
|||
|
|||
// 提取页面中的 ViewState(状态信息,用于后续请求) |
|||
String viewState = extractViewState(resultsContent.toString()); |
|||
System.out.println("Results ViewState: " + viewState); |
|||
|
|||
// 检查 Session ID 是否一致,确保会话未被重置 |
|||
if (sessionId != null && !sessionId.equals(resultsSessionId)) { |
|||
System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId); |
|||
} |
|||
|
|||
// Step 4: 第四步:发送分页请求(使用 POST) |
|||
HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection(); |
|||
postConn.setRequestMethod("POST"); |
|||
postConn.setInstanceFollowRedirects(false); |
|||
postConn.setDoOutput(true); |
|||
postConn.setConnectTimeout(10000); |
|||
postConn.setReadTimeout(10000); |
|||
|
|||
// 设置分页请求的请求头(非 AJAX,模拟浏览器常规请求) |
|||
postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); |
|||
postConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
postConn.setRequestProperty("Origin", "https://www.drks.de"); |
|||
postConn.setRequestProperty("Referer", cleanUrl); |
|||
postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
postConn.setRequestProperty("Sec-Fetch-Dest", "document"); |
|||
postConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); |
|||
|
|||
// 构建分页请求的 POST 参数(包括页码和 ViewState 等) |
|||
String postData = buildPostData(viewState, page); |
|||
System.out.println("Pagination POST Data: " + postData); |
|||
|
|||
// 发送分页的 POST 请求 |
|||
try (OutputStream os = postConn.getOutputStream()) { |
|||
byte[] input = postData.getBytes(StandardCharsets.UTF_8); |
|||
os.write(input, 0, input.length); |
|||
} |
|||
|
|||
// 更新 cookies(分页响应可能返回新的 Set-Cookie) |
|||
String paginationSessionId = updateCookies(postConn, cookieSet); |
|||
System.out.println("Pagination Cookies: " + cookieSet); |
|||
System.out.println("Pagination Session ID: " + paginationSessionId); |
|||
|
|||
// 处理分页响应 |
|||
int responseCode = postConn.getResponseCode(); |
|||
System.out.println("Pagination Response Code: " + responseCode); |
|||
|
|||
// 读取分页响应的 HTML 内容 |
|||
StringBuilder postContent = new StringBuilder(); |
|||
try (BufferedReader postReader = new BufferedReader( |
|||
new InputStreamReader( |
|||
responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream()))) { |
|||
while ((inputLine = postReader.readLine()) != null) { |
|||
postContent.append(inputLine); |
|||
} |
|||
} |
|||
Document parse = null; |
|||
if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP |
|||
|| responseCode == HttpURLConnection.HTTP_MOVED_PERM |
|||
|| responseCode == HttpURLConnection.HTTP_SEE_OTHER) { |
|||
String newUrl = postConn.getHeaderField("Location"); |
|||
System.out.println("Pagination Redirecting to: " + newUrl); |
|||
|
|||
// 解析重定向中的相对地址为完整 URL(如果是相对路径) |
|||
if (!newUrl.startsWith("http")) { |
|||
newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl); |
|||
} |
|||
|
|||
// 重定向 |
|||
URL redirectConn = new URL(newUrl); |
|||
HttpURLConnection followConn = (HttpURLConnection) redirectConn.openConnection(); |
|||
followConn.setRequestMethod("GET"); |
|||
followConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); |
|||
followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); |
|||
followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); |
|||
|
|||
BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream())); |
|||
StringBuilder redirectContent = new StringBuilder(); |
|||
while ((inputLine = redirectReader.readLine()) != null) { |
|||
redirectContent.append(inputLine); |
|||
} |
|||
redirectReader.close(); |
|||
followConn.disconnect(); |
|||
|
|||
System.out.println("Redirect Response: " + redirectContent); |
|||
parse = Jsoup.parse(String.valueOf(redirectContent)); |
|||
} else if (responseCode == 200) { |
|||
System.out.println("Pagination Response: " + postContent); |
|||
parse = Jsoup.parse(String.valueOf(postContent)); |
|||
} |
|||
|
|||
|
|||
|
|||
Elements links = parse.select("div[data-label='Titel der Studie'] a"); |
|||
|
|||
for (Element link : links) { |
|||
String href = link.attr("href"); |
|||
String text = link.text(); |
|||
|
|||
System.out.println("链接: " + href); |
|||
System.out.println("标题: " + text); |
|||
} |
|||
String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text(); |
|||
// 使用正则表达式提取 "第" 和 "/" 之间的数字 |
|||
String regex = "Seite\\s*(\\d+)\\s*/"; |
|||
Matcher matcher = Pattern.compile(regex).matcher(text); |
|||
if (matcher.find()) { |
|||
System.out.println("总共有"+matcher.group(1));// 返回第一个捕获组,即数字 "1" |
|||
} |
|||
postConn.disconnect(); |
|||
} |
|||
// 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 |
|||
private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) { |
|||
String sessionId = null; |
|||
Map<String, List<String>> headerFields = conn.getHeaderFields(); |
|||
List<String> cookiesHeader = headerFields.get("Set-Cookie"); |
|||
if (cookiesHeader != null) { |
|||
for (String cookie : cookiesHeader) { |
|||
String cookieValue = cookie.split(";")[0]; |
|||
cookieSet.add(cookieValue); |
|||
if (cookieValue.startsWith("JSESSIONID=") || cookieValue.startsWith("csfcfc=")) { |
|||
sessionId = cookieValue; |
|||
} |
|||
} |
|||
} |
|||
return sessionId; |
|||
} |
|||
// 提取 __VIEWSTATE 隐藏字段的值 |
|||
private static String extractViewState(String html) { |
|||
if (html == null || html.isEmpty()) { |
|||
System.err.println("HTML content is empty or null"); |
|||
return ""; |
|||
} |
|||
|
|||
// 兼容 jakarta.faces.ViewState 和 javax.faces.ViewState |
|||
String regex = "<input[^>]*name=[\"'](?:jakarta|javax)\\.faces\\.ViewState[\"'][^>]*value=[\"']([^\"']+)[\"']"; |
|||
Pattern pattern = Pattern.compile(regex); |
|||
Matcher matcher = pattern.matcher(html); |
|||
|
|||
if (matcher.find()) { |
|||
return matcher.group(1); |
|||
} |
|||
|
|||
System.err.println("Failed to extract ViewState from HTML"); |
|||
return ""; |
|||
} |
|||
|
|||
// 生成搜索请求的 POST 数据 |
|||
private static String buildSearchPostData(String viewState) { |
|||
try { |
|||
return "searchForm=searchForm" + |
|||
"&searchForm%3Aj_idt80=Midwifery" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" + |
|||
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" + |
|||
"&searchForm%3Aj_idt287=" + |
|||
"&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); |
|||
} catch (Exception e) { |
|||
System.err.println("Error encoding search ViewState: " + e.getMessage()); |
|||
return ""; |
|||
} |
|||
} |
|||
// 生成分页请求的 POST 数据 |
|||
private static String buildPostData(String viewState, int page) { |
|||
int adjustedPage = page - 1; |
|||
try { |
|||
return "resultForm=resultForm" + |
|||
"&resultForm%3Asorting%3ArowsPerPage=10" + |
|||
"&resultForm%3ApaginationTop%3Aj_idt156%3A"+ adjustedPage +"%3Aj_idt158=" + page + |
|||
"&resultForm%3Asorting%3AsortingBy=SCORE" + |
|||
"&resultForm%3Asorting%3Aj_idt141=true" + |
|||
"&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation" + |
|||
"&selectedType=JSON" + |
|||
"&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); |
|||
} catch (Exception e) { |
|||
System.err.println("Error encoding pagination ViewState: " + e.getMessage()); |
|||
return ""; |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,22 @@ |
|||
package com.example; |
|||
|
|||
import okhttp3.*; |
|||
|
|||
import java.io.IOException; |
|||
|
|||
public class umlistTest { |
|||
public static void main(String[] args) throws IOException { |
|||
OkHttpClient client = new OkHttpClient().newBuilder() |
|||
.build(); |
|||
MediaType mediaType = MediaType.parse("text/plain"); |
|||
RequestBody body = RequestBody.create(mediaType, ""); |
|||
Request request = new Request.Builder() |
|||
.url("http://who.int/westernpacific/publications/m/item/bi-weekly-covid-19-situation-update--11-april-2025") |
|||
.get() |
|||
// .addHeader("Cookie", "_cfuvid=Y2mczEYT8OCAEN719Uv9vPTpARSDmHju6OjSUfxYbb4-1745207891947-0.0.1.1-604800000") |
|||
.build(); |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
System.out.println(html); |
|||
} |
|||
} |
@ -0,0 +1,12 @@ |
|||
<!-- Logback configuration: sends all log output at INFO level and above to the console. -->
<configuration> |
|||
<!-- Console appender named STDOUT. -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> |
|||
<encoder> |
|||
<!-- Line layout: time, thread, level padded to 5 chars, logger name shortened to 36 chars, message, newline. -->
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern> |
|||
</encoder> |
|||
</appender> |
|||
|
|||
<!-- Root logger: level INFO, attached to the STDOUT appender. -->
<root level="INFO"> |
|||
<appender-ref ref="STDOUT" /> |
|||
</root> |
|||
|
|||
</configuration> |
Some files were not shown because too many files changed in this diff
Write
Preview
Loading…
Cancel
Save
Reference in new issue