commit 86b867f54155800bb9111936a9b24acd7bb64837
Author: guanjz <1826473923@qq.com>
Date: Mon May 19 10:41:53 2025 +0800
本地部署
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/compiler.xml b/.idea/compiler.xml
new file mode 100644
index 0000000..e6b77de
--- /dev/null
+++ b/.idea/compiler.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml
new file mode 100644
index 0000000..712ab9d
--- /dev/null
+++ b/.idea/jarRepositories.xml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..ae9c995
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml
new file mode 100644
index 0000000..e96534f
--- /dev/null
+++ b/.idea/uiDesigner.xml
@@ -0,0 +1,124 @@
+
+
+
+
+ -
+
+
+ -
+
+
+ -
+
+
+ -
+
+
+ -
+
+
+
+
+
+ -
+
+
+
+
+
+ -
+
+
+
+
+
+ -
+
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+
+
+ -
+
+
+ -
+
+
+ -
+
+
+ -
+
+
+ -
+
+
+
+
+ -
+
+
+ -
+
+
+
+
+
\ No newline at end of file
diff --git a/.project b/.project
new file mode 100644
index 0000000..a8309f9
--- /dev/null
+++ b/.project
@@ -0,0 +1,23 @@
+
+
+ DaKaES
+
+
+
+
+
+ org.eclipse.jdt.core.javabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.jdt.core.javanature
+ org.eclipse.m2e.core.maven2Nature
+
+
diff --git a/NsantegouvListRe.jar b/NsantegouvListRe.jar
new file mode 100644
index 0000000..57433ef
Binary files /dev/null and b/NsantegouvListRe.jar differ
diff --git a/bin/.idea/.gitignore b/bin/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/bin/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/bin/.idea/compiler.xml b/bin/.idea/compiler.xml
new file mode 100644
index 0000000..e6b77de
--- /dev/null
+++ b/bin/.idea/compiler.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/bin/.idea/jarRepositories.xml b/bin/.idea/jarRepositories.xml
new file mode 100644
index 0000000..712ab9d
--- /dev/null
+++ b/bin/.idea/jarRepositories.xml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/bin/.idea/misc.xml b/bin/.idea/misc.xml
new file mode 100644
index 0000000..ae9c995
--- /dev/null
+++ b/bin/.idea/misc.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/bin/.project b/bin/.project
new file mode 100644
index 0000000..a8309f9
--- /dev/null
+++ b/bin/.project
@@ -0,0 +1,23 @@
+
+
+ DaKaES
+
+
+
+
+
+ org.eclipse.jdt.core.javabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.jdt.core.javanature
+ org.eclipse.m2e.core.maven2Nature
+
+
diff --git a/bin/hs_err_pid15760.log b/bin/hs_err_pid15760.log
new file mode 100644
index 0000000..3459d43
--- /dev/null
+++ b/bin/hs_err_pid15760.log
@@ -0,0 +1,167 @@
+#
+# There is insufficient memory for the Java Runtime Environment to continue.
+# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap
+# Possible reasons:
+# The system is out of physical RAM or swap space
+# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap
+# Possible solutions:
+# Reduce memory load on the system
+# Increase physical memory or swap space
+# Check if swap backing store is full
+# Decrease Java heap size (-Xmx/-Xms)
+# Decrease number of Java threads
+# Decrease Java thread stack sizes (-Xss)
+# Set larger code cache with -XX:ReservedCodeCacheSize=
+# JVM is running with Zero Based Compressed Oops mode in which the Java heap is
+# placed in the first 32GB address space. The Java Heap base address is the
+# maximum limit for the native heap growth. Please use -XX:HeapBaseMinAddress
+# to set the Java Heap base and to place the Java Heap above 32GB virtual address.
+# This output file may be truncated or incomplete.
+#
+# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334
+#
+# JRE version: (8.0_422-b05) (build )
+# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops)
+# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows
+#
+
+--------------- T H R E A D ---------------
+
+Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
+
+Stack: [0x00000082a1500000,0x00000082a1600000]
+[error occurred during error reporting (printing stack bounds), id 0xc0000005]
+
+Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
+
+
+--------------- P R O C E S S ---------------
+
+Java Threads: ( => current thread )
+
+Other Threads:
+
+=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
+
+VM state:not at safepoint (normal execution)
+
+VM Mutex/Monitor currently owned by a thread: None
+
+heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3
+Narrow klass base: 0x0000000000000000, Narrow klass shift: 3
+Compressed class space size: 1073741824 Address: 0x00000007c0000000
+
+Heap:
+ PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000)
+ eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000)
+ from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000)
+ to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000)
+ ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000)
+ object space 172032K, 0% used [0x00000006c4000000,0x00000006c4000000,0x00000006ce800000)
+ Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K
+ class space used 76K, capacity 384K, committed 384K, reserved 1048576K
+
+Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000
+
+Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0
+ Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000)
+ End Bits: [0x00000271cd7a0000, 0x00000271d16a0000)
+
+Polling page: 0x00000271b7eb0000
+
+CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb
+ bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000]
+ total_blobs=57 nmethods=0 adapters=38
+ compilation: enabled
+
+Compilation events (0 events):
+No events
+
+GC Heap History (0 events):
+No events
+
+Deoptimization events (0 events):
+No events
+
+Classes redefined (0 events):
+No events
+
+Internal exceptions (0 events):
+No events
+
+Events (10 events):
+Event: 0.012 loading class java/lang/Short
+Event: 0.013 loading class java/lang/Short done
+Event: 0.013 loading class java/lang/Integer
+Event: 0.013 loading class java/lang/Integer done
+Event: 0.013 loading class java/lang/Long
+Event: 0.013 loading class java/lang/Long done
+Event: 0.013 loading class java/lang/NullPointerException
+Event: 0.013 loading class java/lang/NullPointerException done
+Event: 0.013 loading class java/lang/ArithmeticException
+Event: 0.013 loading class java/lang/ArithmeticException done
+
+
+Dynamic libraries:
+0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe
+0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll
+0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL
+0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll
+0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll
+0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll
+0x00007ffa1cdf0000 - 0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll
+0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll
+0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll
+0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll
+0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll
+0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll
+0x00007ffa1ac80000 - 0x00007ffa1ad9a000 C:\Windows\System32\gdi32full.dll
+0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll
+0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll
+0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll
+0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL
+0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll
+0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll
+0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll
+0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL
+0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll
+0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll
+0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll
+0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll
+0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll
+0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll
+0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll
+0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll
+0x00007ffa0af80000 - 0x00007ffa0af89000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll
+0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll
+0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll
+
+VM Arguments:
+jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8
+java_command: com.example.saveInES
+java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.jar;C:\Users\18264\.m2\repository\c
+Launcher Type: SUN_STANDARD
+
+Environment Variables:
+JAVA_HOME=E:\java
+PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node
+USERNAME=18264
+OS=Windows_NT
+PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
+
+
+
+--------------- S Y S T E M ---------------
+
+OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438)
+
+CPU:total 16 (initial active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx
+
+Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free)
+
+vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017)
+
+time: Tue Mar 4 14:31:48 2025
+timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
+elapsed time: 0.022707 seconds (0d 0h 0m 0s)
+
diff --git a/bin/keywords.txt b/bin/keywords.txt
new file mode 100644
index 0000000..2358ab6
--- /dev/null
+++ b/bin/keywords.txt
@@ -0,0 +1,6 @@
+Montpellier Institute of Virology, France
+Ontario Public Health Laboratory, Canada
+University of Texas Biosafety Laboratory, USA
+Korea National Institute of Infectious Diseases (KCDC)
+Israel Institute of Life Sciences
+Biosafety Laboratory, University of Basel, Switzerland
\ No newline at end of file
diff --git a/bin/pom.xml b/bin/pom.xml
new file mode 100644
index 0000000..730bf94
--- /dev/null
+++ b/bin/pom.xml
@@ -0,0 +1,138 @@
+
+ 4.0.0
+ com.example
+ es-crawler
+ 1.0-SNAPSHOT
+
+
+ 8
+ 8
+
+
+
+
+
+ org.elasticsearch.client
+ elasticsearch-rest-high-level-client
+ 7.17.0
+
+
+
+ co.elastic.clients
+ elasticsearch-java
+ 7.17.15
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ 2.15.0
+
+
+
+
+ org.jsoup
+ jsoup
+ 1.17.2
+
+
+
+
+ com.squareup.okhttp3
+ okhttp
+ 4.9.3
+
+
+
+
+ org.slf4j
+ slf4j-api
+ 1.7.36
+
+
+ ch.qos.logback
+ logback-classic
+ 1.2.11
+
+
+
+
+ org.apache.kafka
+ kafka-clients
+ 3.9.0
+
+
+
+
+ org.seleniumhq.selenium
+ selenium-java
+ 4.10.0
+
+
+
+
+ io.github.bonigarcia
+ webdrivermanager
+ 5.6.2
+
+
+
+ org.json
+ json
+ 20230227
+
+
+
+ com.google.code.gson
+ gson
+ 2.10.1
+
+
+
+ net.sourceforge.htmlunit
+ htmlunit
+ 2.61.0
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+
+ 8
+ 8
+
+
+
+
+ org.apache.maven.plugins
+ maven-assembly-plugin
+ 3.3.0
+
+
+
+ com.example.projTopic
+
+
+
+ jar-with-dependencies
+
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/bin/processed_urls.txt b/bin/processed_urls.txt
new file mode 100644
index 0000000..f862ec3
--- /dev/null
+++ b/bin/processed_urls.txt
@@ -0,0 +1,281 @@
+
+https://www.zyctd.com/zixun/201/1055143.html
+https://www.zyctd.com/zixun/201/861786.html
+https://www.zyctd.com/zixun/201/1053482.html
+https://www.zyctd.com/zixun/201/269419.html
+https://www.zyctd.com/zixun/201/1053149.html
+https://www.zyctd.com/zixun/201/1023926.html
+https://www.zyctd.com/zixun/201/435325.html
+https://www.zyctd.com/zixun/201/1050302.html
+https://www.zyctd.com/zixun/201/880441.html
+https://www.zyctd.com/zixun/201/1019635.html
+https://www.zyctd.com/zixun/201/970572.html
+https://www.zyctd.com/zixun/201/912277.html
+https://www.zyctd.com/zixun/201/372444.html
+https://www.zyctd.com/zixun/201/1073629.html
+https://www.zyctd.com/zixun/201/1069386.html
+https://www.zyctd.com/zixun/201/730410.html
+https://www.zyctd.com/zixun/201/953220.html
+https://www.zyctd.com/zixun/201/1074339.html
+https://www.zyctd.com/zixun/201/1072317.html
+https://www.zyctd.com/zixun/201/294794.html
+https://www.zyctd.com/zixun/201/267592.html
+https://www.zyctd.com/zixun/201/979665.html
+https://www.zyctd.com/zixun/201/869885.html
+https://www.zyctd.com/zixun/201/1054064.html
+https://www.zyctd.com/zixun/201/1049331.html
+https://www.zyctd.com/zixun/201/442647.html
+https://www.zyctd.com/zixun/201/285992.html
+https://www.zyctd.com/zixun/201/1037972.html
+https://www.zyctd.com/zixun/201/799801.html
+https://www.zyctd.com/zixun/201/916078.html
+https://www.zyctd.com/zixun/201/456647.html
+https://www.zyctd.com/zixun/201/812121.html
+https://www.zyctd.com/zixun/201/1042740.html
+https://www.zyctd.com/zixun/201/1042708.html
+https://www.zyctd.com/zixun/201/840450.html
+https://www.zyctd.com/zixun/201/320749.html
+https://www.zyctd.com/zixun/201/496106.html
+https://www.zyctd.com/zixun/201/850201.html
+https://www.zyctd.com/zixun/201/277145.html
+https://www.zyctd.com/zixun/201/299091.html
+https://www.zyctd.com/zixun/201/266080.html
+https://www.zyctd.com/zixun/201/1051925.html
+https://www.zyctd.com/zixun/201/898081.html
+https://www.zyctd.com/zixun/201/873280.html
+https://www.zyctd.com/zixun/201/703880.html
+https://www.zyctd.com/zixun/201/873126.html
+https://www.zyctd.com/zixun/201/887931.html
+https://www.zyctd.com/zixun/201/432742.html
+https://www.zyctd.com/zixun/201/1040431.html
+https://www.zyctd.com/zixun/201/1040223.html
+https://www.zyctd.com/zixun/201/858118.html
+https://www.zyctd.com/zixun/201/971286.html
+https://www.zyctd.com/zixun/201/458488.html
+https://www.zyctd.com/zixun/201/1079381.html
+https://www.zyctd.com/zixun/201/263578.html
+https://www.zyctd.com/zixun/201/553513.html
+https://www.zyctd.com/zixun/201/286229.html
+https://www.zyctd.com/zixun/201/285365.html
+https://www.zyctd.com/zixun/201/352921.html
+https://www.zyctd.com/zixun/201/503267.html
+https://www.zyctd.com/zixun/201/391337.html
+https://www.zyctd.com/zixun/201/813052.html
+https://www.zyctd.com/zixun/201/1053556.html
+https://www.zyctd.com/zixun/201/1041197.html
+https://www.zyctd.com/zixun/201/287420.html
+https://www.zyctd.com/zixun/201/291563.html
+https://www.zyctd.com/zixun/201/948250.html
+https://www.zyctd.com/zixun/201/289034.html
+https://www.zyctd.com/zixun/201/795965.html
+https://www.zyctd.com/zixun/201/292962.html
+https://www.zyctd.com/zixun/201/975850.html
+https://www.zyctd.com/zixun/201/275335.html
+https://www.zyctd.com/zixun/201/1031992.html
+https://www.zyctd.com/zixun/201/1033886.html
+https://www.zyctd.com/zixun/201/999510.html
+https://www.zyctd.com/zixun/201/270144.html
+https://www.zyctd.com/zixun/201/1055519.html
+https://www.zyctd.com/zixun/201/272205.html
+https://www.zyctd.com/zixun/201/526059.html
+https://www.zyctd.com/zixun/201/456640.html
+https://www.zyctd.com/zixun/201/267952.html
+https://www.zyctd.com/zixun/201/803469.html
+https://www.zyctd.com/zixun/201/270763.html
+https://www.zyctd.com/zixun/201/1072987.html
+https://www.zyctd.com/zixun/201/265176.html
+https://www.zyctd.com/zixun/201/1022141.html
+https://www.zyctd.com/zixun/201/290173.html
+https://www.zyctd.com/zixun/201/269175.html
+https://www.zyctd.com/zixun/201/744991.html
+https://www.zyctd.com/zixun/201/1019131.html
+https://www.zyctd.com/zixun/201/717054.html
+https://www.zyctd.com/zixun/201/517358.html
+https://www.zyctd.com/zixun/201/1058505.html
+https://www.zyctd.com/zixun/201/905515.html
+https://www.zyctd.com/zixun/201/287395.html
+https://www.zyctd.com/zixun/201/934873.html
+https://www.zyctd.com/zixun/201/1051317.html
+https://www.zyctd.com/zixun/201/926018.html
+https://www.zyctd.com/zixun/201/334511.html
+https://www.zyctd.com/zixun/201/845896.html
+https://www.zyctd.com/zixun/201/587785.html
+https://www.zyctd.com/zixun/201/288376.html
+https://www.zyctd.com/zixun/201/851405.html
+https://www.zyctd.com/zixun/201/941404.html
+https://www.zyctd.com/zixun/201/881855.html
+https://www.zyctd.com/zixun/201/602632.html
+https://www.zyctd.com/zixun/201/293601.html
+https://www.zyctd.com/zixun/201/541809.html
+https://www.zyctd.com/zixun/201/335120.html
+https://www.zyctd.com/zixun/201/1031137.html
+https://www.zyctd.com/zixun/201/960101.html
+https://www.zyctd.com/zixun/201/1077142.html
+https://www.zyctd.com/zixun/201/1063222.html
+https://www.zyctd.com/zixun/201/681466.html
+https://www.zyctd.com/zixun/201/1031130.html
+https://www.zyctd.com/zixun/201/1073734.html
+https://www.zyctd.com/zixun/201/1062186.html
+https://www.zyctd.com/zixun/201/1046628.html
+https://www.zyctd.com/zixun/201/358892.html
+https://www.zyctd.com/zixun/201/285361.html
+https://www.zyctd.com/zixun/201/1059889.html
+https://www.zyctd.com/zixun/201/297824.html
+https://www.zyctd.com/zixun/201/844307.html
+https://www.zyctd.com/zixun/201/900524.html
+https://www.zyctd.com/zixun/201/1057636.html
+https://www.zyctd.com/zixun/201/1010080.html
+https://www.zyctd.com/zixun/201/409152.html
+https://www.zyctd.com/zixun/201/402782.html
+https://www.zyctd.com/zixun/201/770296.html
+https://www.zyctd.com/zixun/201/1040602.html
+https://www.zyctd.com/zixun/201/606503.html
+https://www.zyctd.com/zixun/201/784471.html
+https://www.zyctd.com/zixun/201/466097.html
+https://www.zyctd.com/zixun/201/1071160.html
+https://www.zyctd.com/zixun/201/623226.html
+https://www.zyctd.com/zixun/201/948264.html
+https://www.zyctd.com/zixun/201/293462.html
+https://www.zyctd.com/zixun/201/829348.html
+https://www.zyctd.com/zixun/201/332369.html
+https://www.zyctd.com/zixun/201/907461.html
+https://www.zyctd.com/zixun/201/756555.html
+https://www.zyctd.com/zixun/201/717915.html
+https://www.zyctd.com/zixun/201/262203.html
+https://www.zyctd.com/zixun/201/1055787.html
+https://www.zyctd.com/zixun/201/432336.html
+https://www.zyctd.com/zixun/201/907489.html
+https://www.zyctd.com/zixun/201/1014686.html
+https://www.zyctd.com/zixun/201/1053320.html
+https://www.zyctd.com/zixun/201/480020.html
+https://www.zyctd.com/zixun/201/287423.html
+https://www.zyctd.com/zixun/201/385289.html
+https://www.zyctd.com/zixun/201/1030421.html
+https://www.zyctd.com/zixun/201/527648.html
+https://www.zyctd.com/zixun/201/972959.html
+https://www.zyctd.com/zixun/201/408767.html
+https://www.zyctd.com/zixun/201/724887.html
+https://www.zyctd.com/zixun/201/291480.html
+https://www.zyctd.com/zixun/201/472544.html
+https://www.zyctd.com/zixun/201/724873.html
+https://www.zyctd.com/zixun/201/281751.html
+https://www.zyctd.com/zixun/201/1049693.html
+https://www.zyctd.com/zixun/201/869619.html
+https://www.zyctd.com/zixun/201/355497.html
+https://www.zyctd.com/zixun/201/341623.html
+https://www.zyctd.com/zixun/201/450753.html
+https://www.zyctd.com/zixun/201/1065837.html
+https://www.zyctd.com/zixun/201/1031331.html
+https://www.zyctd.com/zixun/201/669727.html
+https://www.zyctd.com/zixun/201/1034010.html
+https://www.zyctd.com/zixun/201/1054058.html
+https://www.zyctd.com/zixun/201/954613.html
+https://www.zyctd.com/zixun/201/715584.html
+https://www.zyctd.com/zixun/201/1051110.html
+https://www.zyctd.com/zixun/201/269963.html
+https://www.zyctd.com/zixun/201/1048128.html
+https://www.zyctd.com/zixun/201/793207.html
+https://www.zyctd.com/zixun/201/284310.html
+https://www.zyctd.com/zixun/201/282639.html
+https://www.zyctd.com/zixun/201/1068138.html
+https://www.zyctd.com/zixun/201/340678.html
+https://www.zyctd.com/zixun/201/294371.html
+https://www.zyctd.com/zixun/201/324277.html
+https://www.zyctd.com/zixun/201/1048931.html
+https://www.zyctd.com/zixun/201/851398.html
+https://www.zyctd.com/zixun/201/263527.html
+https://www.zyctd.com/zixun/201/919480.html
+https://www.zyctd.com/zixun/201/685442.html
+https://www.zyctd.com/zixun/201/428325.html
+https://www.zyctd.com/zixun/201/1032698.html
+https://www.zyctd.com/zixun/201/1003367.html
+https://www.zyctd.com/zixun/201/852315.html
+https://www.zyctd.com/zixun/201/283156.html
+https://www.zyctd.com/zixun/201/262484.html
+https://www.zyctd.com/zixun/201/1065225.html
+https://www.zyctd.com/zixun/201/763331.html
+https://www.zyctd.com/zixun/201/1066158.html
+https://www.zyctd.com/zixun/201/1047744.html
+https://www.zyctd.com/zixun/201/842795.html
+https://www.zyctd.com/zixun/201/975374.html
+https://www.zyctd.com/zixun/201/1055865.html
+https://www.zyctd.com/zixun/201/1017367.html
+https://www.zyctd.com/zixun/201/1057711.html
+https://www.zyctd.com/zixun/201/1074295.html
+https://www.zyctd.com/zixun/201/283647.html
+https://www.zyctd.com/zixun/201/286896.html
+https://www.zyctd.com/zixun/201/1043393.html
+https://www.zyctd.com/zixun/201/305888.html
+https://www.zyctd.com/zixun/201/487258.html
+https://www.zyctd.com/zixun/201/1045652.html
+https://www.zyctd.com/zixun/201/1064905.html
+https://www.zyctd.com/zixun/201/515636.html
+https://www.zyctd.com/zixun/201/1038609.html
+https://www.zyctd.com/zixun/201/438083.html
+https://www.zyctd.com/zixun/201/297327.html
+https://www.zyctd.com/zixun/201/773537.html
+https://www.zyctd.com/zixun/201/1043589.html
+https://www.zyctd.com/zixun/201/815712.html
+https://www.zyctd.com/zixun/201/698595.html
+https://www.zyctd.com/zixun/201/269800.html
+https://www.zyctd.com/zixun/201/1030332.html
+https://www.zyctd.com/zixun/201/422676.html
+https://www.zyctd.com/zixun/201/290130.html
+https://www.zyctd.com/zixun/201/270359.html
+https://www.zyctd.com/zixun/201/995604.html
+https://www.zyctd.com/zixun/201/1074993.html
+https://www.zyctd.com/zixun/201/1054825.html
+https://www.zyctd.com/zixun/201/918577.html
+https://www.zyctd.com/zixun/201/686527.html
+https://www.zyctd.com/zixun/201/297509.html
+https://www.zyctd.com/zixun/201/622708.html
+https://www.zyctd.com/zixun/201/469870.html
+https://www.zyctd.com/zixun/201/844328.html
+https://www.zyctd.com/zixun/201/394508.html
+https://www.zyctd.com/zixun/201/271744.html
+https://www.zyctd.com/zixun/201/1054940.html
+https://www.zyctd.com/zixun/201/732818.html
+https://www.zyctd.com/zixun/201/1049547.html
+https://www.zyctd.com/zixun/201/1059684.html
+https://www.zyctd.com/zixun/201/1055301.html
+https://www.zyctd.com/zixun/201/962068.html
+https://www.zyctd.com/zixun/201/451355.html
+https://www.zyctd.com/zixun/201/1056174.html
+https://www.zyctd.com/zixun/201/930540.html
+https://www.zyctd.com/zixun/201/871656.html
+https://www.zyctd.com/zixun/201/363246.html
+https://www.zyctd.com/zixun/201/845672.html
+https://www.zyctd.com/zixun/201/452965.html
+https://www.zyctd.com/zixun/201/1065920.html
+https://www.zyctd.com/zixun/201/1058808.html
+https://www.zyctd.com/zixun/201/986868.html
+https://www.zyctd.com/zixun/201/489785.html
+https://www.zyctd.com/zixun/201/307946.html
+https://www.zyctd.com/zixun/201/833359.html
+https://www.zyctd.com/zixun/201/806969.html
+https://www.zyctd.com/zixun/201/1050812.html
+https://www.zyctd.com/zixun/201/1033696.html
+https://www.zyctd.com/zixun/201/501167.html
+https://www.zyctd.com/zixun/201/1078919.html
+https://www.zyctd.com/zixun/201/1036495.html
+https://www.zyctd.com/zixun/201/1008736.html
+https://www.zyctd.com/zixun/201/1054264.html
+https://www.zyctd.com/zixun/201/493152.html
+https://www.zyctd.com/zixun/201/685456.html
+https://www.zyctd.com/zixun/201/995597.html
+https://www.zyctd.com/zixun/201/905501.html
+https://www.zyctd.com/zixun/201/347573.html
+https://www.zyctd.com/zixun/201/1045494.html
+https://www.zyctd.com/zixun/201/549775.html
+https://www.zyctd.com/zixun/201/1037336.html
+https://www.zyctd.com/zixun/201/1034972.html
+https://www.zyctd.com/zixun/201/653046.html
+https://www.zyctd.com/zixun/201/316612.html
+https://www.zyctd.com/zixun/201/447064.html
+https://www.zyctd.com/zixun/201/307603.html
+https://www.zyctd.com/zixun/201/263437.html
+https://www.zyctd.com/zixun/201/894490.html
+https://www.zyctd.com/zixun/201/368629.html
+https://www.zyctd.com/zixun/201/273285.html
+https://www.zyctd.com/zixun/201/1059618.html
+https://www.zyctd.com/zixun/201/459237.html
diff --git a/bin/proxy.txt b/bin/proxy.txt
new file mode 100644
index 0000000..199a16c
--- /dev/null
+++ b/bin/proxy.txt
@@ -0,0 +1 @@
+127.0.0.1:7897
\ No newline at end of file
diff --git a/bin/src/main/java/com/example/Inka.class b/bin/src/main/java/com/example/Inka.class
new file mode 100644
index 0000000..ac137ee
Binary files /dev/null and b/bin/src/main/java/com/example/Inka.class differ
diff --git a/bin/src/main/java/com/example/NSFAwardCrawler.class b/bin/src/main/java/com/example/NSFAwardCrawler.class
new file mode 100644
index 0000000..eb1e050
Binary files /dev/null and b/bin/src/main/java/com/example/NSFAwardCrawler.class differ
diff --git a/bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class b/bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class
new file mode 100644
index 0000000..ee2edab
Binary files /dev/null and b/bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class differ
diff --git a/bin/src/main/java/com/example/ProxyIPChecker.class b/bin/src/main/java/com/example/ProxyIPChecker.class
new file mode 100644
index 0000000..1b87f7c
Binary files /dev/null and b/bin/src/main/java/com/example/ProxyIPChecker.class differ
diff --git a/bin/src/main/java/com/example/StringFieldExtractor.class b/bin/src/main/java/com/example/StringFieldExtractor.class
new file mode 100644
index 0000000..d938b0b
Binary files /dev/null and b/bin/src/main/java/com/example/StringFieldExtractor.class differ
diff --git a/bin/src/main/java/com/example/getInKa.class b/bin/src/main/java/com/example/getInKa.class
new file mode 100644
index 0000000..a9baf48
Binary files /dev/null and b/bin/src/main/java/com/example/getInKa.class differ
diff --git a/bin/src/main/java/com/example/jsonGetOk.class b/bin/src/main/java/com/example/jsonGetOk.class
new file mode 100644
index 0000000..a9f68b9
Binary files /dev/null and b/bin/src/main/java/com/example/jsonGetOk.class differ
diff --git a/bin/src/main/java/com/example/ook.class b/bin/src/main/java/com/example/ook.class
new file mode 100644
index 0000000..8a7afb8
Binary files /dev/null and b/bin/src/main/java/com/example/ook.class differ
diff --git a/bin/src/main/java/com/example/oook.class b/bin/src/main/java/com/example/oook.class
new file mode 100644
index 0000000..a140aa5
Binary files /dev/null and b/bin/src/main/java/com/example/oook.class differ
diff --git a/bin/src/main/java/com/example/projTopic.class b/bin/src/main/java/com/example/projTopic.class
new file mode 100644
index 0000000..29af390
Binary files /dev/null and b/bin/src/main/java/com/example/projTopic.class differ
diff --git a/bin/src/main/java/com/example/saveInES.class b/bin/src/main/java/com/example/saveInES.class
new file mode 100644
index 0000000..1bf30e6
Binary files /dev/null and b/bin/src/main/java/com/example/saveInES.class differ
diff --git a/bin/src/main/java/com/example/test.class b/bin/src/main/java/com/example/test.class
new file mode 100644
index 0000000..95aa04c
Binary files /dev/null and b/bin/src/main/java/com/example/test.class differ
diff --git a/bin/src/main/java/com/example/test2.class b/bin/src/main/java/com/example/test2.class
new file mode 100644
index 0000000..6f03608
Binary files /dev/null and b/bin/src/main/java/com/example/test2.class differ
diff --git a/bin/src/main/java/com/example/testContent.class b/bin/src/main/java/com/example/testContent.class
new file mode 100644
index 0000000..16cc481
Binary files /dev/null and b/bin/src/main/java/com/example/testContent.class differ
diff --git a/bin/src/main/java/com/example/umlistTest.class b/bin/src/main/java/com/example/umlistTest.class
new file mode 100644
index 0000000..ccfdbcd
Binary files /dev/null and b/bin/src/main/java/com/example/umlistTest.class differ
diff --git a/bin/target/classes/META-INF/MANIFEST.MF b/bin/target/classes/META-INF/MANIFEST.MF
new file mode 100644
index 0000000..38f1f7e
--- /dev/null
+++ b/bin/target/classes/META-INF/MANIFEST.MF
@@ -0,0 +1,4 @@
+Manifest-Version: 1.0
+Build-Jdk-Spec: 22
+Created-By: Maven Integration for Eclipse
+
diff --git a/bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar b/bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar
new file mode 100644
index 0000000..041697a
Binary files /dev/null and b/bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar differ
diff --git a/bin/target/es-crawler-1.0-SNAPSHOT.jar b/bin/target/es-crawler-1.0-SNAPSHOT.jar
new file mode 100644
index 0000000..febbb6e
Binary files /dev/null and b/bin/target/es-crawler-1.0-SNAPSHOT.jar differ
diff --git a/bin/target/maven-archiver/pom.properties b/bin/target/maven-archiver/pom.properties
new file mode 100644
index 0000000..c35b816
--- /dev/null
+++ b/bin/target/maven-archiver/pom.properties
@@ -0,0 +1,5 @@
+#Generated by Maven
+#Fri Apr 18 18:29:46 CST 2025
+version=1.0-SNAPSHOT
+groupId=com.example
+artifactId=es-crawler
diff --git a/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
new file mode 100644
index 0000000..5f1323f
--- /dev/null
+++ b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
@@ -0,0 +1 @@
+com\example\projTopic.class
diff --git a/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
new file mode 100644
index 0000000..90e26e0
--- /dev/null
+++ b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
@@ -0,0 +1 @@
+F:\workTest\DaKaES\src\main\java\com\example\projTopic.java
diff --git a/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst b/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst
new file mode 100644
index 0000000..e69de29
diff --git a/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst b/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst
new file mode 100644
index 0000000..e69de29
diff --git a/hs_err_pid15760.log b/hs_err_pid15760.log
new file mode 100644
index 0000000..3459d43
--- /dev/null
+++ b/hs_err_pid15760.log
@@ -0,0 +1,167 @@
+#
+# There is insufficient memory for the Java Runtime Environment to continue.
+# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap
+# Possible reasons:
+# The system is out of physical RAM or swap space
+# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap
+# Possible solutions:
+# Reduce memory load on the system
+# Increase physical memory or swap space
+# Check if swap backing store is full
+# Decrease Java heap size (-Xmx/-Xms)
+# Decrease number of Java threads
+# Decrease Java thread stack sizes (-Xss)
+# Set larger code cache with -XX:ReservedCodeCacheSize=
+# JVM is running with Zero Based Compressed Oops mode in which the Java heap is
+# placed in the first 32GB address space. The Java Heap base address is the
+# maximum limit for the native heap growth. Please use -XX:HeapBaseMinAddress
+# to set the Java Heap base and to place the Java Heap above 32GB virtual address.
+# This output file may be truncated or incomplete.
+#
+# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334
+#
+# JRE version: (8.0_422-b05) (build )
+# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops)
+# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows
+#
+
+--------------- T H R E A D ---------------
+
+Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
+
+Stack: [0x00000082a1500000,0x00000082a1600000]
+[error occurred during error reporting (printing stack bounds), id 0xc0000005]
+
+Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code)
+
+
+--------------- P R O C E S S ---------------
+
+Java Threads: ( => current thread )
+
+Other Threads:
+
+=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)]
+
+VM state:not at safepoint (normal execution)
+
+VM Mutex/Monitor currently owned by a thread: None
+
+heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3
+Narrow klass base: 0x0000000000000000, Narrow klass shift: 3
+Compressed class space size: 1073741824 Address: 0x00000007c0000000
+
+Heap:
+ PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000)
+ eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000)
+ from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000)
+ to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000)
+ ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000)
+ object space 172032K, 0% used [0x00000006c4000000,0x00000006c4000000,0x00000006ce800000)
+ Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K
+ class space used 76K, capacity 384K, committed 384K, reserved 1048576K
+
+Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000
+
+Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0
+ Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000)
+ End Bits: [0x00000271cd7a0000, 0x00000271d16a0000)
+
+Polling page: 0x00000271b7eb0000
+
+CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb
+ bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000]
+ total_blobs=57 nmethods=0 adapters=38
+ compilation: enabled
+
+Compilation events (0 events):
+No events
+
+GC Heap History (0 events):
+No events
+
+Deoptimization events (0 events):
+No events
+
+Classes redefined (0 events):
+No events
+
+Internal exceptions (0 events):
+No events
+
+Events (10 events):
+Event: 0.012 loading class java/lang/Short
+Event: 0.013 loading class java/lang/Short done
+Event: 0.013 loading class java/lang/Integer
+Event: 0.013 loading class java/lang/Integer done
+Event: 0.013 loading class java/lang/Long
+Event: 0.013 loading class java/lang/Long done
+Event: 0.013 loading class java/lang/NullPointerException
+Event: 0.013 loading class java/lang/NullPointerException done
+Event: 0.013 loading class java/lang/ArithmeticException
+Event: 0.013 loading class java/lang/ArithmeticException done
+
+
+Dynamic libraries:
+0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe
+0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll
+0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL
+0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll
+0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll
+0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll
+0x00007ffa1cdf0000 - 0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll
+0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll
+0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll
+0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll
+0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll
+0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll
+0x00007ffa1ac80000 - 0x00007ffa1ad9a000 C:\Windows\System32\gdi32full.dll
+0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll
+0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll
+0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll
+0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL
+0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll
+0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll
+0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll
+0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL
+0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll
+0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll
+0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll
+0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll
+0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll
+0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll
+0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll
+0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll
+0x00007ffa0af80000 - 0x00007ffa0af89000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll
+0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll
+0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll
+
+VM Arguments:
+jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8
+java_command: com.example.saveInES
+java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.jar;C:\Users\18264\.m2\repository\c
+Launcher Type: SUN_STANDARD
+
+Environment Variables:
+JAVA_HOME=E:\java
+PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node
+USERNAME=18264
+OS=Windows_NT
+PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
+
+
+
+--------------- S Y S T E M ---------------
+
+OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438)
+
+CPU:total 16 (initial active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx
+
+Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free)
+
+vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017)
+
+time: Tue Mar 4 14:31:48 2025
+timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel
+elapsed time: 0.022707 seconds (0d 0h 0m 0s)
+
diff --git a/keywords.txt b/keywords.txt
new file mode 100644
index 0000000..51fc8fc
--- /dev/null
+++ b/keywords.txt
@@ -0,0 +1,1045 @@
+Zoonotic disease
+pandemic
+Emerging and re-emerging diseases
+biosafet
+biosecurit
+biodefen
+biological defen
+bioweapon
+biologicalweapon
+bioterroris
+biological terroris
+biowarfare
+biological warfare
+biosurveillan
+biological surveillan
+biohazard
+biological hazard
+bioincident
+biological incident
+biothreat
+biological threat
+bioagent
+biologicalagent
+biological protect
+bioprotect
+biological risk
+Bacillus anthracis
+Bacillus cereus Biovaranthracis
+Brucella abortus
+Brucella melitensis
+Brucella neotomae
+Brucella suis
+Brucellamelitensis biovar suis
+Burkholderia mallei
+Pseudomonas mallei
+Burkholderia pseudomallei
+Acinetobacter mallei
+Glanders bacillus
+Bacillus mallei
+Actinobacillus mallei
+Pfeifferella mallei
+Malleomyces mallei
+Loefferella mallei
+Chlamydophila psittaci
+Chlamydia psittaci
+Clostridium botulinum
+Coxiella burnetii
+Escherichia coli O157
+Escherichia coli
+E coli O157-H7
+Escherichia coli O157:H7
+Francisella tularensis
+Legionella pneumophila
+Mycoplasma capricolum
+Mycoplasma mycoides
+Rickettsia prowazekii
+Rickettsia rickettsii
+Salmonella enterica
+Salmonella choleraesuis
+Vibrio cholerae
+Vibrio comma
+Yersinia pestis
+Bacille de la peste
+Bacterium pestis
+Pasteurella pestis
+African horse sickness virus
+AHSV
+African Swine Fever Virus
+ASFV
+Wart-Hog Disease Virus
+Wart Hog Disease Virus
+Avian influenza virus
+Bluetongue virus
+Bluetongue Viruses
+Blue Tongue Virus
+Ovine Catarrhal Fever Virus
+Chapare virus
+Chapare viruses
+Chaparemammarenavirus
+Chikungunya virus
+CHIKV
+Classical Swine Fever Virus
+Hog CholeraVirus
+Pestivirus C
+CSFV
+Crimean-Congohaemorrhagic fever virus
+Crimean Congohemorrhagic fever virus
+Congo Virus
+denguevirus
+DENV
+Dengue Viruses
+BreakboneFever Virus
+Breakbone Fever Viruses
+EasternEquine Encephalomyelitis Virus
+Eastern EquineEncephalitis virus
+EEE Virus
+EEEV
+Ebolavirus
+Ebolaviruses
+Ebola Virus
+Ebola Viruses
+Ebola-like Viruses
+Ebola likeViruses
+Ebola-like Virus
+Foot-and-MouthDisease Virus
+Foot and Mouth Disease Virus
+Foot-and-Mouth Disease Viruses
+FMDV
+Goatpox virus
+Goatpox viruses
+Goat PoxVirus
+Goat Pox Viruses
+Guanarito virus
+Guanarito viruses
+Guanarito mammarenavirus
+GTOV
+Hantaan virus
+Korean HemorrhagicFever Virus
+Hantaan orthohantavirus
+Hemorrhagic Nephroso-Nephritis Virus
+Hemorrhagic Nephroso Nephritis Virus
+Hemorrhagic Nephroso-Nephritis Viruses
+Epidemic Hemorrhagic Fever Virus
+HFRS Viruses
+Hemorrhagic Fever Renal Syndrome Virus
+HTNV
+Hendra Virus
+HendraViruses
+Equine Morbillivirus
+EquineMorbilliviruses
+MojV
+Japanese B EncephalitisVirus
+Japanese Encephalitis Virus
+JEV
+Junin virus
+Argentinian mammarenavirus
+JUNV
+Kyasanur Forest disease virus
+KFDV
+Lassa virus
+Lassa fever virus
+Lassamammarenavirus
+LASV
+Lujo virus
+Lujomammarenavirus
+LUJV
+Lumpy skin diseasevirus
+Neethling Virus
+Machupo virus
+Machupo mammarenavirus
+MACV
+Marburgvirus
+Marburgviruses
+Marburg Virus
+Marburg Viruses
+Marburg-like Viruses
+Marburg like Viruses
+Marburg-like Virus
+Frankfurt-Marburg Syndrome Virus
+FrankfurtMarburg Syndrome Virus
+Monkeypox virus
+Monkeypox viruses
+Monkeypoxvirus
+Monkeypoxviruses
+Monkey Pox Virus
+Monkey Pox Viruses
+Newcastle disease virus
+NDV
+Nipah virus
+Nipah henipavirus
+Nipah Viruses
+Omskhemorrhagic fever virus
+OHFV
+Omskhaemorrhagic fever virus
+Peste-des-petits-ruminants virus
+Peste des petitsruminants virus
+Rabies virus
+Rabies lyssavirus
+Reconstructed 1918 Influenza virus
+RiftValley fever virus
+Rift Valley fever phlebovirus
+RVFV
+Rinderpest virus
+Rinderpestmorbillivirus
+Sabia virus
+SARS Virus
+Severe Acute Respiratory Syndrome Virus
+RSARS-Related Coronavirus
+SARS RelatedCoronavirus
+SARS-CoV
+SARS AssociatedCoronavirus
+SARS Coronavirus
+SARS-Associated Coronavirus
+Severe acuterespiratory syndrome related coronavirus
+Severeacute respiratory syndrome-related coronavirus
+Sheeppox virus
+Sheeppox viruses
+Sheep PoxVirus
+Sheep Pox Viruses
+Sin Nombre virus
+Muerto Canyon Virus
+Four Corners Virus
+Sin Nombre hantavirus
+Swine vesicular diseasevirus
+SVDV
+Tick-Borne Encephalitis Virus
+Tick Borne Encephalitis Virus
+Tick-BorneEncephalitis Viruses
+Tick Borne EncephalitisViruses
+TBEV
+Variola virus
+Variolaviruses
+Smallpox Virus
+Smallpox Viruses
+Poxvirus variolae
+Variola minor virus
+Variolamajor virus
+Alastrim
+Venezuelan equineencephalitis virus
+Venezuelan Equine EncephalitisViruses
+West Nile virus
+Egypt 101 virus
+Kunjin virus
+WNV
+WEE Virus
+WEEViruses
+Western Equine Encephalitis Viruses
+Western equine encephalitis virus
+WEEV
+Yellow fever virus
+Naegleria fowleri
+Naegleria fowlerus
+Fiji disease virus
+Ralstoniasolanacearum
+Rathayibacter toxicus
+Xanthomonas oryzae
+Erwinia amylovora
+Xanthomonas albilineans
+Dothistroma pini
+Dothistroma septosporum
+Scirrhia pini
+Tilletiaindica
+Coniothyrium glycines
+Phomaglycinicola
+Pyrenochaeta glycines
+Coccidioides immitis
+Histoplasma capsulatum
+Synchytrium endobioticum
+Colletotrichumcoffeanum
+Peronospora hyoscyami
+Peronosclerospora philippinensis
+Sclerophthorarayssiae
+Bacteriotoxins
+Botulinum toxins
+Clostridium perfringens toxins
+Staphylococcalenterotoxins
+Shigatoxins
+Anatoxins
+Ciguatoxins
+Saxitoxins
+Trichothecene toxins
+Abrins
+Ricin*
+recin
+Bungarotoxins
+Botulinum neurotoxin producing species ofClostridium
+Conotoxins
+T-2 toxin
+Tetrodotoxin
+Diacetoxyscirpeno
+SARS-COV-2
+COVID-19
+coronavirus disease 2019
+2019-nCov
+Alastrim virus
+Mpox virus
+Hypr virus
+Kumlinge virus
+Louping ill virus
+Hanzalova virus
+Omsk hemorrhagic fever virus
+St.Louis encephalitis virus
+Crimean-Congo hemorrhagic fever virus (Xinjiang hemorrhagic fever virus)
+Herpesvirus simiae
+Eastern equine encephalitis virus
+Venezuelan equine encephalitis virus
+Flexal virus
+Mopeia virus (and other Tacaribe viruses)
+Tacaribe virus
+Dabie bandavirus (SFTS Virus)
+Gordil virus
+Heartland bandavirus
+Itaituba virus
+Khasan virus
+Razdan virus
+Rift valley fever virus
+Garba virus
+Rabies virus (street virus)
+Rochambeau virus
+Inhangapi virus
+Middle East Respiratory Syndrome coronavirus (MERS-CoV)
+Severe acute respiratory syndrome coronavirus (SARS-CoV)
+Severe acute respiratory syndrome coronavirus 2, (SARS-CoV-2)
+Hantaviruses causing pulmonary syndrome
+Hantaviruses causing hemorrhagic fever with renal syndrome
+Murray valley encephalitis virus
+Negishi virus
+Powassan virus
+Rocio virus
+Sepik virus
+Issyk-Kul virus
+Nairobi sheep disease virus
+Sapphire orthonairovirus (Paramushir virus)
+Tamdy virus
+Human immunodeficiency virus (HIV) (Type 1 and 2 virus)
+Simian immunodeficiency virus (SIV)
+Everglades virus
+Kyzylagach virus
+Mayaro virus
+Middelburg virus
+Mucambo virus
+Ndumu virus
+Sagiyama virus
+Lymphocytic choriomeningitis (neurotropic) virus
+Polio virus
+Dhori virus
+High pathogenic avian influenza virus
+California encephalitis virus
+Germiston virus
+Inini virus (Simbu orthobunyavirus)
+Oropouche virus
+Sandfly fever virus
+Norovirus
+Sapovirus
+Flanders virus
+Hart Park virus
+Rabies virus (fixed virus)
+Vesicular stomatitis virus
+Buffalopox virus
+Camelpox virus
+Cowpox virus
+Molluscum contagiosum virus
+Orf virus
+Pseudocowpox virus (Milker‘s nodule virus)
+Rabbitpox virus
+Tanapox virus
+Vaccinia virus
+Polyoma virus
+Simian virus 40
+Metapneumovirus
+Respiratory syncytial virus
+Rubivirus (Rubella)
+Measles virus
+Mumps virus
+Parainfluenza virus
+Sendai virus (murine parainfluenza virus type 1)
+Coronavirus (low pathogenicity to human)
+Coltivirus
+Rotavirus
+Dengue virus
+Flaviviruses,other known non-highly pathogenic
+Hepatitis C virus
+Langat virus
+Saumarez reef virus
+Yellow fever virus, (vaccine strain, 17D)
+Zika Virus
+Hazara virus
+Human T- lymphotropic virus (HTLV)
+Lentivirus (Non highly pathogenic)
+Cytomegalovirus
+Epstein-Barr virus
+Herpes simplex virus
+Herpesvirus saimiri
+Human herpes virus-6
+Human herpes virus-7
+Human herpes virus-8
+Varicella-Zoster virus
+Alphaviruses, other known non-highly pathogenic
+Barmah forest virus
+Bebaru virus
+Getah virus
+O’nyong-nyong virus
+Ross river virus
+Semliki forest virus
+Sindbis virus
+Papillomavirus (human)
+Lymphocytic choriomeningitis virus
+Hepatitis B virus
+Hepatitis D virus
+Hepatitis E virus
+Adeno-associated virus
+Bocavirus
+Parvovirus B19
+Adenovirus
+Cardiovirus
+Coxsakie virus
+ECHO virus
+Enterovirus
+Enterovirus A-71
+Hepatitis A virus
+Human Cosavirus
+Kobuvirus
+Parechovirus
+Rhinovirus
+Astrovirus
+Influenza virus
+Guaratuba virus
+La Crosse virus
+Tahyna orthobunyavirus
+Tensaw virus
+Turlock virus
+Hamster leukemia virus
+Mouse leukemia virus
+Mouse mammary tumor virus
+Rat leukemia virus
+Guinea pig herpes virus
+Bovine spongiform encephalopathy (BSE)
+Creutzfeldt-Jakob disease (CJD)
+Fatal familian insomnia (FFI)
+Gerstmann- Sträussler -Scheinker syndrome(GSS)
+Kuru disease
+Variant Creutzfeldt-Jakob disease (vCJD)
+Scrapie
+Phagophilic cells without form
+Brucella genus
+Mycobacterium bovis
+Mycobacterium tuberculosis
+Rickettsia belongs to the spotted fever group
+Rickettsia Mori
+Przewalski's Rickettsia
+Eastern body of scrub typhus
+Lutheran rickettsia
+Siberian Rickettsia
+Tarasawich rickettsia
+Goat shapeless
+Acinetobacter baumannii
+Acinetobacter lwoffii
+Madura actinomycetes
+Bai Lejie Madura actinomycete
+Bovine actinomycete
+Actinomyces granulosus
+Yi's actinomycetes
+Nei's actinomycetes
+Other species of actinomycetes
+Aeromonas hydrophila
+Spotted Aeromonas
+Other species of Aeromonas genus
+Afipota genus
+Actinobacteria agglomerating bacteria
+Arachnia propionica
+Arcanobacterium equi
+Hemolytic Cryptococcus
+Bacillus cereus
+Fragile pseudomonas
+Rod like Bartonella
+Klebsiella pneumoniae
+Duoshi Bartonella
+Elizabethan Bartonella
+Guillain Barr é body
+bartonella henselae
+Kochia Bartonella
+5-Day Heat Bartonella Body
+Tribal Bartonella
+Wens Bartonella Wens subspecies
+Botrytis bronchiolitis
+Bordetella pertussis
+Borrelia burgdorferi
+Dashi sparse spiral body
+Returning to the heat sparse spiral body
+Fensenshu spirochete
+Short spiral bacteria genus
+Granuloma sheath bacteria
+Campylobacter coli
+Fetal Campylobacter
+Campylobacter jejuni
+Salivary Campylobacter
+Other species of Campylobacter genus
+Chlamydia pneumoniae
+Chlamydia parrot
+Chlamydia trachomatis
+Difficult Clostridium difficile
+Fusarium oxysporum
+Hemolytic Clostridium
+Clostridium novyi
+Clostridium perfringens
+Tetanus Clostridium
+Lactobacillus bovis
+Corynebacterium diphtheriae
+Corynebacterium minutissimum
+Fake Mycobacterium tuberculosis
+Corynebacterium striatum
+Acinetobacter canker
+Congo Pichia
+edwardsiella tarda
+Yifei Erich's body
+Eikenella corrodens
+Gas producing Escherichia coli
+Enterobacter cloacae
+Other species of Escherichia coli
+Adenothermic rickettsia
+Porcine red spot erysipelas fungus
+Dandelion fungus genus
+Burkholderia meningoseptica
+Bozeman's Legionella
+The new subspecies of the killer of the Tula Francisella fungus
+Fusobacterium necrophorum
+gardnerella vaginalis
+Hemophilus ducreyi
+Haemophilus influenzae
+Helicobacter pylori
+Kingella Kingae
+Klebsiella oxytoca
+Question mark Leptospira
+Listeria ivanovii
+Listeria monocytogenes
+Polymorphic small bacteria
+Morganella morganii
+African mycobacteria
+Goat mycobacteria
+Field mouse mycobacteria
+Mycobacterium asiaticum
+Mycobacterium avium
+Occasional mycobacteria
+Kansas mycobacteria
+Mycobacterium leprae
+Mycobacterium malmoense
+Mycobacterium avium subsp. paratuberculosis
+Mycobacterium scrotum
+Mycobacterium hominis
+Mycobacterium szulgai
+Ulcerative mycobacteria
+Other species of Mycobacterium genus
+mycoplasma pneumoniae
+neisseria gonorrhoeae
+Neisseria meningitidis
+Nocardia asteroides
+Nocardia brasiliensis
+Nocardia botulinum
+Nocardia pyogenes
+New Nocardia
+Nocardia in guinea pig ear inflammation
+Delancewanorca bacteria
+Clostridium sporogenes
+Pasteurella multocida
+Rodent bacteria invading the lungs
+Pathogenic Escherichia coli
+Other pathogenic Escherichia coli genera
+Anaerobic digestion streptococcus
+Plesiomonas shigelloides
+Prevotella genus
+Proteus mirabilis
+Proteus penneri
+Ordinary Proteobacteria
+Propionibacterium prolifera producing alkali
+Prevotella reinhardtii
+Pseudomonas aeruginosa
+Autotrophic false Nocardia
+Staphylococcus aureus
+Bongor Salmonella
+Serratia liquefaciens
+Fading Salmonella
+Shigella dysenteriae
+Shigella flexneri
+Shigella boydii
+Shigella Songnei
+Staphylococcus epidermidis
+Candida albicans
+Streptococcus pneumoniae
+Streptococcus pyogenes
+Streptococcus genus
+streptococcus suis
+Treponema carateum
+Treponema pallidum (syphilis)
+Treponema pertenue
+Wen's density spiral body
+Ureaplasma urealyticum
+Vibrio vulnificus
+Vibrio parahaemolyticus
+River Vibrio
+Vibrio alginolyticus
+Other species of Vibrio genus
+Yersinia enterocolitica
+Yersinia pseudotuberculosis
+mycoplasma genitalium
+Cronobacter genus
+Citrobacter genus
+Photobacterium damselae
+Shiwanju genus
+Seafood Deformable Fungi
+Defective autotrophic bacteria
+Carbon dioxide fiber eating bacteria genus
+Chromobacterium genus
+Golden rod genus
+Short chain Streptococcus genus
+Dermatitis budding bacteria
+Coarse ball spore fungus
+Posadas spore forming bacteria
+Capsular tissue cytoplasmic bacteria
+Histoplasma bacteria and other pathogenic diseases
+Brazilian Azospirillum
+Other pathogenic diseases of the genus Ascomycota
+Cladosporium genus
+Rhizopus genus
+Alternaria alternata
+Infecting Alternaria
+Other pathogenic diseases of the genus Alternaria
+Scale mold genus
+Genus Fusarium
+Arthrobacter genus
+Aspergillus flavus complex
+Aspergillus fumigatus complex group
+Aspergillus terreus
+Short stem mold genus
+Solid spore frog manure mold
+Frog manure mold belongs to other pathogenic diseases
+Beauveria genus
+candida dubliniensis
+Smooth Candida complex
+Ji Yemeng Candida complex
+Ximulong Candida complex
+Candida krusei
+Near smooth Candida complex
+Tropical Candida
+Candida auricula
+Candida and other pathogenic diseases
+Cephalosporin genus
+Genus Trichoderma
+Golden spore fungus genus
+Curly mold genus
+Botrytis cinerea
+Other pathogenic diseases of Aspergillus genus
+Saccharomyces genus
+Trichoderma genus
+Crown ear mold
+Conidiobolus incongruus
+Ear mold belongs to other pathogenic diseases
+False black powdery mildew genus
+Kashi cola rod mold
+Other pathogenic diseases of Colletotrichum genus
+Gert Cryptococcus complex
+Cryptococcus neoformans complex
+Cryptococcus and other pathogenic diseases
+Cunninghamella bertholletiae
+Xiaoke Yinhan mold belongs to other pathogenic diseases
+Hawaiian curved fungus
+Babendorf's curved fungus
+Suiform curved fungus
+Curvularia genus
+Corydalis genus
+Interstitial shell genus
+The genus Bifidobacterium
+Aemonas genus
+Trichophyton flocs
+Dermatitis external bottle mold
+Zhen's external bottle mold complex group
+Spinous external bottle mold
+External bottle mold belongs to other pathogenic diseases
+Magnum's navel mold
+Beaked navel mold
+Monofer coloring mold
+Pei's coloring mold
+Nubica coloring mold
+Other pathogenic diseases of the genus Trichoderma
+Fusarium oxysporum complex
+Fusarium complex of eggplant disease
+Other pathogenic diseases of Fusarium genus
+Geotrichum genus
+Genus Mucomycota
+Venetobacter baumannii
+Half new pillar top spore
+Lasiodiplodia theobromae
+Umbrella branch transverse stem mold
+Multi branch transverse stem mold
+Other pathogenic diseases of Streptomyces genus
+Polyphenophore Spore
+Gray Madura fungus
+Podomycosis Madura bacteria
+Madura bacteria and other pathogenic diseases
+Malassezia furfur
+Spherical Malassezia
+Malassezia and other pathogenic diseases
+Microsporidia canis
+Rust colored microsporidia
+Other pathogenic diseases of the genus Microsporidia
+Aspergillus genus
+Fusarium complex group
+Irregular mold
+Mucor racemosa
+Other pathogenic diseases of Mucor genus
+Gypsum Neisseria
+Neosatobacter genus
+New genus of balanoposthitis
+Black spore fungus genus
+Ochromycetes genus
+Xufang yeast genus
+Wan's Penicillium
+Dark colored Cladosporium genus
+Dark colored Cyclosporidium genus
+Pingge bacteria genus
+Single spore bottle mold genus
+American bottle mold
+Verrucous bottle mold
+Bottle mold belongs to other pathogenic diseases
+Stem point mold genus
+Hedermann nodule fungus
+The genus of crooked mouth shell
+Wickham without green algae
+Zufei no green algae
+No other pathogenic diseases of the green algae genus
+Conomycota genus
+Rhizopus microsporus
+rhizopus arrhizus
+Rhizopus and other pathogenic diseases
+Red yeast genus
+Broomycota genus
+Sharp tip Sedosporium
+Other pathogenic diseases of the genus Zygomycota
+Schizophyllum genus
+Short broom mold
+Other pathogenic diseases of the broom mold genus
+Spheroidal sporophytes
+Schenker Sporothrix fungus
+Sporothrix bacteria and other pathogenic diseases
+Copium genus
+Marlini's basket shaped bacteria
+Trichophyton complex
+Red Trichophyton complex
+Trichophyton schoenleinii
+Trichophyton interruptus
+Purple Trichophyton
+Trichophyton genus and other pathogenic diseases
+trichosporon asahii
+Other pathogenic diseases of the genus Trichosporum
+Wheat stalk mold genus
+Monogramma genus
+Viranthus genus
+Verrucosporium genus
+Ameba
+Wuchereria bancrofti
+Hydatid
+Whipworm
+Lung fluke
+Liver fluke
+Toxoplasma
+Hookworm
+Ascaris
+Giardia
+Scabies
+Pinworm
+Malaria
+Plasmodium
+Filarial worm
+Taenia
+Microsporidia
+Schistosome
+Cryptosporidium
+Porcine tapeworm
+Q hot
+Ebola hemorrhagic fever
+Bacillus subtilis
+Brucella bacteria
+Actinomycetes
+Eperythrocytic disease
+Para tuberculosis
+tox
+Leptospirasis
+Echinococcosis
+tuberculosis
+Old World spiral maggot disease
+Crimean Congo hemorrhagic fever
+foot-and-mouth disease
+rabies
+Pseudomallei
+Rift Valley fever
+Nipah's disease
+Japanese encephalitis
+Schmallenberg disease
+Vesicular stomatitis
+anthrax
+Pseudorabies
+Siniro fever
+Heart water disease
+New World spiral maggot disease
+Clostridium perfringens infections
+Infection with Trichinella spp
+Tularemia
+Trypanosoma Evansi infection
+Leishmaniasis
+Infection with epizootic haemorrhagicdis-ease
+Filariasis
+Staphylococcosis
+Schistosomiasis
+Nipah virus Encephalitis
+Rotavirus infection
+Clostridum Perfringens
+Salmonella disease
+Listeriosis
+Hemolytic brucellosis
+Mycoplasma disease
+Chlamydia disease
+Eastern schistosomiasis
+Clonorchiasis sinensis
+Cysticercosis
+Fasciola hepatica
+Blood Spear Nematode Disease
+Cryptosporidiosis
+Akabane disease
+Hemorrhagic sepsis
+Ibaraki disease
+Bovine leukemia
+Bovine viral diarrhea
+Bovine infectious rhinotracheitis
+Bovine contagious pleuropneumonia
+malignant catarrhal fever
+Bovine spongiform encephalopathy
+Bovine nodular dermatitis
+Cattle popularity trend
+Bovine hypodermatid myiasis
+Bovine non plasma disease
+Bovine mucosal disease
+Zhongshan disease
+Infectious bo-vine rhinotracheitis/Infectious pustular vulvovaginitis
+Bovine genital campylobacteriosis
+Bovine viral diarrhoea/Mucosal disease
+Bovine babesiosis
+Theileriosis
+Trichomonosis
+Dermatophilosis
+Local epidemic bovine leukemia
+Bovine coronavirus infection
+Bovine pear shaped insect disease
+African horse plague
+Hendra's disease
+Ulcerative lymphangitis
+Equine glanders
+Equine disease toxic arteritis
+Equine infectious anemia
+Equine infectious uterine inflammation
+equine paratyphoid
+Horse mating disease
+Equine influenza
+Equine epidemic lymphangitis
+Horse gland disease
+Venezuelan equine encephalomyelitis
+Infection with equid herpesvirus-1
+Equine encephalomyelitis (East-ern and Western)
+Horse flu
+Equine nosed pneumonia
+equine piroplasmosis
+african swine fever
+Seneca virus disease
+Porcine infectious gastroenteritis
+porcine contagious pleuropneumonia
+Pig erysipelas
+Porcine Reproductive and Respiratory Syndrome
+Porcine paratyphoid fever
+Porcine Epidemic Diarrhea
+Swine influenza
+Swine dysentery
+Porcine vesicular disease
+Porcine Tetreovirus induced encephalomyelitis
+Atrophic rhinitis in pigs
+swine fever
+Mycoplasma hyopneumoniae pneumonia in pigs
+Porcine parvovirus infection
+Swine streptococosis
+Porcine circovirus infection
+Glaesser’s disease(Haemoph-ilus parasuis)
+Infection with Taenia solium(Porcine cysticercosis)
+Porcine deltacorona virus(PDCoV)
+Porcine brucellosis
+Porcine Circovirus Disease
+Glaser's disease
+swine flu
+Porcine Coronavirus Infection
+Porcine Seneca virus infection
+Piglet dysentery
+Porcine dysentery
+Porcine proliferative intestinal disease
+Infectious rhinitis
+Infectious bursal disease
+Low pathogenic avian influenza
+Highly pathogenic avian influenza
+turkey rhinotracheitis
+Chicken white diarrhea
+Chicken viral arthritis
+Chicken egg production decline syndrome
+Infectious laryngotracheitis in chickens
+Infectious bronchitis in chickens
+Marek's disease
+Avian leukemia
+Avian infectious encephalomyelitis
+Avian pox
+Avian paratyphoid fever
+Avian spirochete disease
+Avian typhoid fever
+Avian nephritis
+Avian reticuloendothelial hyperplasia
+avian chlamydiosis
+Avian mycoplasmosis
+Newcastle disease
+Duck viral hepatitis
+Leucocytozoonosis
+Goose parvovirus infection
+Duck virus enteritis
+Avian coccidiosis
+Riemerella anatipestifer infection
+Duck plague
+Gosling plague
+Avian Infectious Laryngotracheitis
+avian infectious bronchitis
+Marek’s Disease
+egg drop syndrome
+Duck serositis
+Avian reticuloendothelial tissue proliferation disease
+Chicken infectious rhinitis
+Infection with avian Tembusu virus
+Avian adenovirus infection
+Chicken infectious anemia
+Infection of avian influenza virus
+Chicken red mite disease
+necrotic enteritis
+Duck reovirus infection
+Boundary disease
+Infectious azoospermia
+Caseous lymphadenitis
+Blue tongue disease
+Medi Visna disease
+enzootic abortion of ewes
+Sheep pox and goat pox
+Nairobi sheep disease
+Contagious pleuropneumonia in goats
+Goat encephalitis
+Small ruminant plague
+Sheep infectious pustular dermatitis
+ovine pulmonary adenomatosis
+Itchy disease
+Caprine arthritis/encephalitis
+Salmonellosis(S.abortusovis)
+Sheep lung adenomatous disease
+Sheep pear shaped worm disease
+Sheep without plasma disease
+Crayfish plague
+Vitiligo syndrome
+Spotted catfish viral disease
+Viral hemorrhagic sepsis
+Viral neuronecrosis disease
+Infectious muscle necrosis disease
+Infectious subcutaneous and hematopoietic organ necrosis disease
+Infectious splenic and renal necrosis disease
+Infectious Hematopoietic Organ Necrosis
+Bacterial sepsis in freshwater fish
+Salmon infectious anemia
+Necrotizing liver pancreatitis
+Huangtou disease
+Catfish intestinal sepsis
+Acute liver and pancreas necrosis
+Koi herpesvirus disease
+Carp spring viremia
+Carp edema virus disease
+Epidemic Ulcer Syndrome
+epizootic haematopoietic necrosis
+Tilapia Lake Virus Disease
+White tail disease
+Taura syndrome
+Bacterial nephropathy
+Red snapper rainbow virus disease
+Infection with Gyrodactylus Salaris
+Infection with abalone herpesvirus
+Infection with Bonamia Ostreae
+Infection with Bonamia Exitiosa
+Infection with Marteilia Refringens
+Infection with Perkinsus Olseni
+Infection with Perkinsus Marinus
+Infection with Xenohaliotis Californiensis
+Infection with Batrachochytrium Dendrobatidis
+Infection with Ranavirus species
+Anisakiasis
+Cryptocaryoniasis
+Edwardsiellasis
+Fish streptococcosis
+Chryseobacterium meningsepticum of frog (Rana spp)
+Infection with salmonid alphavirus
+Infection with Batrachochytrium salamandrivorans
+Infection with Decapod iridescent virus 1
+Grass carp hemorrhagic disease
+Necrosis of hematopoietic organs in crucian carp
+Carp float disease
+Shrimp liver intestinal worm disease
+schistosomiasis japonica
+Infectious pancreatic necrosis disease
+Paralichthys olivaceus virus disease
+Fish Edwardellosis
+Streptococcal disease
+Salmon killing Aeromonas disease
+Small melon worm disease
+Myxosporidiosis
+Third generation insect disease
+Ringworm disease
+Crab snail pathogen disease
+Bao herpesvirus disease
+Oyster herpesvirus disease
+Beehive Beetle
+american foul brood
+Bee chalky disease
+Bee shield mite disease
+Honey bee bright heat mite disease
+Bee mite disease
+european foul brood
+Small hive beetle infestation(Aethina tumida)
+Nosemosis of honey bees
+Bombyx mori polyhedrosis
+Bright and hot mite disease
+chalkbrood
+white muscardine
+Silkworm microsporidia
+Rabbit hemorrhagic disease
+Rabbit myxomatosis
+Rabbit coccidiosis
+Rabbit brucellosis
+Feline panleukopenia
+Canine infectious hepatitis
+canine distemper
+Canine parvovirus infection
+Canine parvovirus disease
+Cat cupping virus infection
+Feline infectious peritonitis
+canine babesiosis
+Amphibian frog iridovirus disease
+Turtle parotitis disease
+Frog meningitis sepsis
+Monkey viral immunodeficiency syndrome
+Monkeypox
+Lymphocytic choroidal meningitis
+Chronic wasting disease
+Camel pox
+Marburg Hemorrhagic Fever
+Rat pox
+Mink Aleutian disease
+Mink viral enteritis
+Mouse hepatitis
+Cercopithecine Herpesvirus Type I(B virus)infectious diseases
+Sendai virus infectious disease
+Infectious subcutaneous and hematopoietic tissue necrosis disease
+Acute Hepatopancreatic Necrosis
diff --git a/original_captcha.png b/original_captcha.png
new file mode 100644
index 0000000..6a588a1
Binary files /dev/null and b/original_captcha.png differ
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..12f0a88
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,150 @@
+
+ 4.0.0
+ com.example
+ es-crawler
+ 1.0-SNAPSHOT
+
+
+ 8
+ 8
+
+
+
+
+
+ org.elasticsearch.client
+ elasticsearch-rest-high-level-client
+ 7.17.0
+
+
+
+ co.elastic.clients
+ elasticsearch-java
+ 7.17.15
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ 2.15.0
+
+
+
+
+ org.jsoup
+ jsoup
+ 1.17.2
+
+
+
+
+ com.squareup.okhttp3
+ okhttp
+ 4.9.3
+
+
+
+
+ org.slf4j
+ slf4j-api
+ 1.7.36
+
+
+ ch.qos.logback
+ logback-classic
+ 1.2.11
+
+
+
+
+ org.apache.kafka
+ kafka-clients
+ 3.9.0
+
+
+
+
+ org.seleniumhq.selenium
+ selenium-java
+ 4.10.0
+
+
+
+
+ io.github.bonigarcia
+ webdrivermanager
+ 5.6.2
+
+
+
+ org.json
+ json
+ 20230227
+
+
+
+ com.google.code.gson
+ gson
+ 2.10.1
+
+
+
+ net.sourceforge.htmlunit
+ htmlunit
+ 2.61.0
+
+
+
+ net.sourceforge.tess4j
+ tess4j
+ 4.5.4
+
+
+
+ org.apache.httpcomponents.client5
+ httpclient5
+ 5.3.1
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.8.1
+
+ 8
+ 8
+
+
+
+
+ org.apache.maven.plugins
+ maven-assembly-plugin
+ 3.3.0
+
+
+
+ com.example.CtriScraper
+
+
+
+ jar-with-dependencies
+
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/preprocessed_captcha.png b/preprocessed_captcha.png
new file mode 100644
index 0000000..20329de
Binary files /dev/null and b/preprocessed_captcha.png differ
diff --git a/processed_urls.txt b/processed_urls.txt
new file mode 100644
index 0000000..f862ec3
--- /dev/null
+++ b/processed_urls.txt
@@ -0,0 +1,281 @@
+
+https://www.zyctd.com/zixun/201/1055143.html
+https://www.zyctd.com/zixun/201/861786.html
+https://www.zyctd.com/zixun/201/1053482.html
+https://www.zyctd.com/zixun/201/269419.html
+https://www.zyctd.com/zixun/201/1053149.html
+https://www.zyctd.com/zixun/201/1023926.html
+https://www.zyctd.com/zixun/201/435325.html
+https://www.zyctd.com/zixun/201/1050302.html
+https://www.zyctd.com/zixun/201/880441.html
+https://www.zyctd.com/zixun/201/1019635.html
+https://www.zyctd.com/zixun/201/970572.html
+https://www.zyctd.com/zixun/201/912277.html
+https://www.zyctd.com/zixun/201/372444.html
+https://www.zyctd.com/zixun/201/1073629.html
+https://www.zyctd.com/zixun/201/1069386.html
+https://www.zyctd.com/zixun/201/730410.html
+https://www.zyctd.com/zixun/201/953220.html
+https://www.zyctd.com/zixun/201/1074339.html
+https://www.zyctd.com/zixun/201/1072317.html
+https://www.zyctd.com/zixun/201/294794.html
+https://www.zyctd.com/zixun/201/267592.html
+https://www.zyctd.com/zixun/201/979665.html
+https://www.zyctd.com/zixun/201/869885.html
+https://www.zyctd.com/zixun/201/1054064.html
+https://www.zyctd.com/zixun/201/1049331.html
+https://www.zyctd.com/zixun/201/442647.html
+https://www.zyctd.com/zixun/201/285992.html
+https://www.zyctd.com/zixun/201/1037972.html
+https://www.zyctd.com/zixun/201/799801.html
+https://www.zyctd.com/zixun/201/916078.html
+https://www.zyctd.com/zixun/201/456647.html
+https://www.zyctd.com/zixun/201/812121.html
+https://www.zyctd.com/zixun/201/1042740.html
+https://www.zyctd.com/zixun/201/1042708.html
+https://www.zyctd.com/zixun/201/840450.html
+https://www.zyctd.com/zixun/201/320749.html
+https://www.zyctd.com/zixun/201/496106.html
+https://www.zyctd.com/zixun/201/850201.html
+https://www.zyctd.com/zixun/201/277145.html
+https://www.zyctd.com/zixun/201/299091.html
+https://www.zyctd.com/zixun/201/266080.html
+https://www.zyctd.com/zixun/201/1051925.html
+https://www.zyctd.com/zixun/201/898081.html
+https://www.zyctd.com/zixun/201/873280.html
+https://www.zyctd.com/zixun/201/703880.html
+https://www.zyctd.com/zixun/201/873126.html
+https://www.zyctd.com/zixun/201/887931.html
+https://www.zyctd.com/zixun/201/432742.html
+https://www.zyctd.com/zixun/201/1040431.html
+https://www.zyctd.com/zixun/201/1040223.html
+https://www.zyctd.com/zixun/201/858118.html
+https://www.zyctd.com/zixun/201/971286.html
+https://www.zyctd.com/zixun/201/458488.html
+https://www.zyctd.com/zixun/201/1079381.html
+https://www.zyctd.com/zixun/201/263578.html
+https://www.zyctd.com/zixun/201/553513.html
+https://www.zyctd.com/zixun/201/286229.html
+https://www.zyctd.com/zixun/201/285365.html
+https://www.zyctd.com/zixun/201/352921.html
+https://www.zyctd.com/zixun/201/503267.html
+https://www.zyctd.com/zixun/201/391337.html
+https://www.zyctd.com/zixun/201/813052.html
+https://www.zyctd.com/zixun/201/1053556.html
+https://www.zyctd.com/zixun/201/1041197.html
+https://www.zyctd.com/zixun/201/287420.html
+https://www.zyctd.com/zixun/201/291563.html
+https://www.zyctd.com/zixun/201/948250.html
+https://www.zyctd.com/zixun/201/289034.html
+https://www.zyctd.com/zixun/201/795965.html
+https://www.zyctd.com/zixun/201/292962.html
+https://www.zyctd.com/zixun/201/975850.html
+https://www.zyctd.com/zixun/201/275335.html
+https://www.zyctd.com/zixun/201/1031992.html
+https://www.zyctd.com/zixun/201/1033886.html
+https://www.zyctd.com/zixun/201/999510.html
+https://www.zyctd.com/zixun/201/270144.html
+https://www.zyctd.com/zixun/201/1055519.html
+https://www.zyctd.com/zixun/201/272205.html
+https://www.zyctd.com/zixun/201/526059.html
+https://www.zyctd.com/zixun/201/456640.html
+https://www.zyctd.com/zixun/201/267952.html
+https://www.zyctd.com/zixun/201/803469.html
+https://www.zyctd.com/zixun/201/270763.html
+https://www.zyctd.com/zixun/201/1072987.html
+https://www.zyctd.com/zixun/201/265176.html
+https://www.zyctd.com/zixun/201/1022141.html
+https://www.zyctd.com/zixun/201/290173.html
+https://www.zyctd.com/zixun/201/269175.html
+https://www.zyctd.com/zixun/201/744991.html
+https://www.zyctd.com/zixun/201/1019131.html
+https://www.zyctd.com/zixun/201/717054.html
+https://www.zyctd.com/zixun/201/517358.html
+https://www.zyctd.com/zixun/201/1058505.html
+https://www.zyctd.com/zixun/201/905515.html
+https://www.zyctd.com/zixun/201/287395.html
+https://www.zyctd.com/zixun/201/934873.html
+https://www.zyctd.com/zixun/201/1051317.html
+https://www.zyctd.com/zixun/201/926018.html
+https://www.zyctd.com/zixun/201/334511.html
+https://www.zyctd.com/zixun/201/845896.html
+https://www.zyctd.com/zixun/201/587785.html
+https://www.zyctd.com/zixun/201/288376.html
+https://www.zyctd.com/zixun/201/851405.html
+https://www.zyctd.com/zixun/201/941404.html
+https://www.zyctd.com/zixun/201/881855.html
+https://www.zyctd.com/zixun/201/602632.html
+https://www.zyctd.com/zixun/201/293601.html
+https://www.zyctd.com/zixun/201/541809.html
+https://www.zyctd.com/zixun/201/335120.html
+https://www.zyctd.com/zixun/201/1031137.html
+https://www.zyctd.com/zixun/201/960101.html
+https://www.zyctd.com/zixun/201/1077142.html
+https://www.zyctd.com/zixun/201/1063222.html
+https://www.zyctd.com/zixun/201/681466.html
+https://www.zyctd.com/zixun/201/1031130.html
+https://www.zyctd.com/zixun/201/1073734.html
+https://www.zyctd.com/zixun/201/1062186.html
+https://www.zyctd.com/zixun/201/1046628.html
+https://www.zyctd.com/zixun/201/358892.html
+https://www.zyctd.com/zixun/201/285361.html
+https://www.zyctd.com/zixun/201/1059889.html
+https://www.zyctd.com/zixun/201/297824.html
+https://www.zyctd.com/zixun/201/844307.html
+https://www.zyctd.com/zixun/201/900524.html
+https://www.zyctd.com/zixun/201/1057636.html
+https://www.zyctd.com/zixun/201/1010080.html
+https://www.zyctd.com/zixun/201/409152.html
+https://www.zyctd.com/zixun/201/402782.html
+https://www.zyctd.com/zixun/201/770296.html
+https://www.zyctd.com/zixun/201/1040602.html
+https://www.zyctd.com/zixun/201/606503.html
+https://www.zyctd.com/zixun/201/784471.html
+https://www.zyctd.com/zixun/201/466097.html
+https://www.zyctd.com/zixun/201/1071160.html
+https://www.zyctd.com/zixun/201/623226.html
+https://www.zyctd.com/zixun/201/948264.html
+https://www.zyctd.com/zixun/201/293462.html
+https://www.zyctd.com/zixun/201/829348.html
+https://www.zyctd.com/zixun/201/332369.html
+https://www.zyctd.com/zixun/201/907461.html
+https://www.zyctd.com/zixun/201/756555.html
+https://www.zyctd.com/zixun/201/717915.html
+https://www.zyctd.com/zixun/201/262203.html
+https://www.zyctd.com/zixun/201/1055787.html
+https://www.zyctd.com/zixun/201/432336.html
+https://www.zyctd.com/zixun/201/907489.html
+https://www.zyctd.com/zixun/201/1014686.html
+https://www.zyctd.com/zixun/201/1053320.html
+https://www.zyctd.com/zixun/201/480020.html
+https://www.zyctd.com/zixun/201/287423.html
+https://www.zyctd.com/zixun/201/385289.html
+https://www.zyctd.com/zixun/201/1030421.html
+https://www.zyctd.com/zixun/201/527648.html
+https://www.zyctd.com/zixun/201/972959.html
+https://www.zyctd.com/zixun/201/408767.html
+https://www.zyctd.com/zixun/201/724887.html
+https://www.zyctd.com/zixun/201/291480.html
+https://www.zyctd.com/zixun/201/472544.html
+https://www.zyctd.com/zixun/201/724873.html
+https://www.zyctd.com/zixun/201/281751.html
+https://www.zyctd.com/zixun/201/1049693.html
+https://www.zyctd.com/zixun/201/869619.html
+https://www.zyctd.com/zixun/201/355497.html
+https://www.zyctd.com/zixun/201/341623.html
+https://www.zyctd.com/zixun/201/450753.html
+https://www.zyctd.com/zixun/201/1065837.html
+https://www.zyctd.com/zixun/201/1031331.html
+https://www.zyctd.com/zixun/201/669727.html
+https://www.zyctd.com/zixun/201/1034010.html
+https://www.zyctd.com/zixun/201/1054058.html
+https://www.zyctd.com/zixun/201/954613.html
+https://www.zyctd.com/zixun/201/715584.html
+https://www.zyctd.com/zixun/201/1051110.html
+https://www.zyctd.com/zixun/201/269963.html
+https://www.zyctd.com/zixun/201/1048128.html
+https://www.zyctd.com/zixun/201/793207.html
+https://www.zyctd.com/zixun/201/284310.html
+https://www.zyctd.com/zixun/201/282639.html
+https://www.zyctd.com/zixun/201/1068138.html
+https://www.zyctd.com/zixun/201/340678.html
+https://www.zyctd.com/zixun/201/294371.html
+https://www.zyctd.com/zixun/201/324277.html
+https://www.zyctd.com/zixun/201/1048931.html
+https://www.zyctd.com/zixun/201/851398.html
+https://www.zyctd.com/zixun/201/263527.html
+https://www.zyctd.com/zixun/201/919480.html
+https://www.zyctd.com/zixun/201/685442.html
+https://www.zyctd.com/zixun/201/428325.html
+https://www.zyctd.com/zixun/201/1032698.html
+https://www.zyctd.com/zixun/201/1003367.html
+https://www.zyctd.com/zixun/201/852315.html
+https://www.zyctd.com/zixun/201/283156.html
+https://www.zyctd.com/zixun/201/262484.html
+https://www.zyctd.com/zixun/201/1065225.html
+https://www.zyctd.com/zixun/201/763331.html
+https://www.zyctd.com/zixun/201/1066158.html
+https://www.zyctd.com/zixun/201/1047744.html
+https://www.zyctd.com/zixun/201/842795.html
+https://www.zyctd.com/zixun/201/975374.html
+https://www.zyctd.com/zixun/201/1055865.html
+https://www.zyctd.com/zixun/201/1017367.html
+https://www.zyctd.com/zixun/201/1057711.html
+https://www.zyctd.com/zixun/201/1074295.html
+https://www.zyctd.com/zixun/201/283647.html
+https://www.zyctd.com/zixun/201/286896.html
+https://www.zyctd.com/zixun/201/1043393.html
+https://www.zyctd.com/zixun/201/305888.html
+https://www.zyctd.com/zixun/201/487258.html
+https://www.zyctd.com/zixun/201/1045652.html
+https://www.zyctd.com/zixun/201/1064905.html
+https://www.zyctd.com/zixun/201/515636.html
+https://www.zyctd.com/zixun/201/1038609.html
+https://www.zyctd.com/zixun/201/438083.html
+https://www.zyctd.com/zixun/201/297327.html
+https://www.zyctd.com/zixun/201/773537.html
+https://www.zyctd.com/zixun/201/1043589.html
+https://www.zyctd.com/zixun/201/815712.html
+https://www.zyctd.com/zixun/201/698595.html
+https://www.zyctd.com/zixun/201/269800.html
+https://www.zyctd.com/zixun/201/1030332.html
+https://www.zyctd.com/zixun/201/422676.html
+https://www.zyctd.com/zixun/201/290130.html
+https://www.zyctd.com/zixun/201/270359.html
+https://www.zyctd.com/zixun/201/995604.html
+https://www.zyctd.com/zixun/201/1074993.html
+https://www.zyctd.com/zixun/201/1054825.html
+https://www.zyctd.com/zixun/201/918577.html
+https://www.zyctd.com/zixun/201/686527.html
+https://www.zyctd.com/zixun/201/297509.html
+https://www.zyctd.com/zixun/201/622708.html
+https://www.zyctd.com/zixun/201/469870.html
+https://www.zyctd.com/zixun/201/844328.html
+https://www.zyctd.com/zixun/201/394508.html
+https://www.zyctd.com/zixun/201/271744.html
+https://www.zyctd.com/zixun/201/1054940.html
+https://www.zyctd.com/zixun/201/732818.html
+https://www.zyctd.com/zixun/201/1049547.html
+https://www.zyctd.com/zixun/201/1059684.html
+https://www.zyctd.com/zixun/201/1055301.html
+https://www.zyctd.com/zixun/201/962068.html
+https://www.zyctd.com/zixun/201/451355.html
+https://www.zyctd.com/zixun/201/1056174.html
+https://www.zyctd.com/zixun/201/930540.html
+https://www.zyctd.com/zixun/201/871656.html
+https://www.zyctd.com/zixun/201/363246.html
+https://www.zyctd.com/zixun/201/845672.html
+https://www.zyctd.com/zixun/201/452965.html
+https://www.zyctd.com/zixun/201/1065920.html
+https://www.zyctd.com/zixun/201/1058808.html
+https://www.zyctd.com/zixun/201/986868.html
+https://www.zyctd.com/zixun/201/489785.html
+https://www.zyctd.com/zixun/201/307946.html
+https://www.zyctd.com/zixun/201/833359.html
+https://www.zyctd.com/zixun/201/806969.html
+https://www.zyctd.com/zixun/201/1050812.html
+https://www.zyctd.com/zixun/201/1033696.html
+https://www.zyctd.com/zixun/201/501167.html
+https://www.zyctd.com/zixun/201/1078919.html
+https://www.zyctd.com/zixun/201/1036495.html
+https://www.zyctd.com/zixun/201/1008736.html
+https://www.zyctd.com/zixun/201/1054264.html
+https://www.zyctd.com/zixun/201/493152.html
+https://www.zyctd.com/zixun/201/685456.html
+https://www.zyctd.com/zixun/201/995597.html
+https://www.zyctd.com/zixun/201/905501.html
+https://www.zyctd.com/zixun/201/347573.html
+https://www.zyctd.com/zixun/201/1045494.html
+https://www.zyctd.com/zixun/201/549775.html
+https://www.zyctd.com/zixun/201/1037336.html
+https://www.zyctd.com/zixun/201/1034972.html
+https://www.zyctd.com/zixun/201/653046.html
+https://www.zyctd.com/zixun/201/316612.html
+https://www.zyctd.com/zixun/201/447064.html
+https://www.zyctd.com/zixun/201/307603.html
+https://www.zyctd.com/zixun/201/263437.html
+https://www.zyctd.com/zixun/201/894490.html
+https://www.zyctd.com/zixun/201/368629.html
+https://www.zyctd.com/zixun/201/273285.html
+https://www.zyctd.com/zixun/201/1059618.html
+https://www.zyctd.com/zixun/201/459237.html
diff --git a/proxy.txt b/proxy.txt
new file mode 100644
index 0000000..199a16c
--- /dev/null
+++ b/proxy.txt
@@ -0,0 +1 @@
+127.0.0.1:7897
\ No newline at end of file
diff --git a/src/main/java/com/example/AusContent.java b/src/main/java/com/example/AusContent.java
new file mode 100644
index 0000000..f71c2d8
--- /dev/null
+++ b/src/main/java/com/example/AusContent.java
@@ -0,0 +1,119 @@
+package com.example;
+
+import okhttp3.*;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.io.IOException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+public class AusContent {
+    /** ANZCTR trial review page fetched by {@link #main}; the path starts with '/' right after the host. */
+    private static final String TRIAL_URL =
+            "https://www.anzctr.org.au/Trial/Registration/TrialReview.aspx?id=389345&isReview=true";
+
+    /**
+     * Downloads one trial review page, reads its fields by element id and prints the collected map.
+     *
+     * @param args unused
+     * @throws IOException if the request fails or the server answers with a non-2xx status
+     */
+    public static void main(String[] args) throws IOException {
+        OkHttpClient client = new OkHttpClient().newBuilder().build();
+        Request request = new Request.Builder()
+                .url(TRIAL_URL)
+                .get()
+                .build();
+        String html;
+        // try-with-resources closes the Response, which releases its connection.
+        try (Response response = client.newCall(request).execute()) {
+            if (!response.isSuccessful()) {
+                throw new IOException("Unexpected HTTP status " + response.code() + " for " + TRIAL_URL);
+            }
+            html = response.body().string();
+        }
+        Document parse = Jsoup.parse(html);
+        String title = parse.select("#ctl00_body_CXSTUDYTITLE").text();
+        String registNum = parse.select("#ctl00_body_CXACTRNUMBER").text();
+        String registTime = convertDate(parse.select("#ctl00_body_CXAPPROVALDATE").text());
+        String sponsor = parse.select("#ctl00_body_repeater_TXFUNDINGSOURCE_ctl00_CXTYPE").text();
+        String studyType = parse.select("#ctl00_body_CXSTUDYTYPE").text();
+        String phase = parse.select("#ctl00_body_CXPHASE").text();
+        String disease = parse.select("#ctl00_body_repeater_TXHEALTHCONDITION_ctl00_CXHEALTHCONDITION").text();
+        String studyObjective = parse.select("#ctl00_body_CXPURPOSE").text();
+        String inclusionCriteria = parse.select("#ctl00_body_CXINCLUSIVECRITERIA").text();
+        String exclusionCriteria = parse.select("#ctl00_body_CXEXCLUSIVECRITERIA").text();
+        String currentStatus = parse.select("#ctl00_body_CXRECRUITMENTSTATUS").text();
+        String enrollment = parse.select("#ctl00_body_CXSAMPLESIZE").text();
+        String country = parse.select("#ctl00_body_repeater_TXCOUNTRYOUTSIDEAUSTRALIA_ctl01_CXCOUNTRY").text();
+        String intervention = parse.select("#ctl00_body_trialDiv > div:nth-child(30) > div > div.review-element-content").text();
+        Map<String, String> studyDesign = new HashMap<>();
+        studyDesign.put("Purpose of the study", studyObjective);
+        studyDesign.put("Allocation to intervention", parse.select("#ctl00_body_CXALLOCATION").text());
+        studyDesign.put("Procedure for enrolling a subject and allocating the treatment (allocation concealment procedures)", parse.select("#ctl00_body_CXCONCEALMENT").text());
+        studyDesign.put("Methods used to generate the sequence in which subjects will be randomised (sequence generation)", parse.select("#ctl00_body_CXSEQUENCE").text());
+        studyDesign.put("Masking / blinding", parse.select("#ctl00_body_CXMASKING").text());
+        studyDesign.put("Who is / are masked / blinded?", parse.select("#ctl00_body_maskingdiv > div > div.review-element-content").text());
+        studyDesign.put("Intervention assignment", parse.select("#ctl00_body_CXASSIGNMENT").text());
+        studyDesign.put("Other design features", parse.select("#ctl00_body_interventional_div > div:nth-child(8) > div > div.review-element-content").text());
+        studyDesign.put("Phase", phase);
+        studyDesign.put("Type of endpoint/s", parse.select("#ctl00_body_CXENDPOINT").text());
+        studyDesign.put("Statistical methods / analysis", parse.select("#ctl00_body_CXSTATISTICALMETHODS").text());
+        Map<String, Object> resultData = new HashMap<>();
+        resultData.put("title", title);
+        resultData.put("registNum", registNum);
+        resultData.put("registTime", registTime);
+        resultData.put("registStatus", "");
+        resultData.put("registTitle", "");
+        resultData.put("fullTitle", "");
+        resultData.put("sponsor", sponsor);
+        resultData.put("sponsorPart", "");
+        resultData.put("studyType", studyType);
+        resultData.put("phase", phase);
+        resultData.put("disease", disease);
+        resultData.put("studyDesign", studyDesign);
+        resultData.put("studyObjective", studyObjective);
+        resultData.put("studyStartDate", "");
+        resultData.put("inclusionCriteria", inclusionCriteria);
+        resultData.put("exclusionCriteria", exclusionCriteria);
+        resultData.put("currentStatus", currentStatus);
+        resultData.put("enrollment", enrollment);
+        resultData.put("country", country);
+        resultData.put("tagTime", "");
+        resultData.put("intervention", intervention);
+        resultData.put("primaryOutcome", "");
+        resultData.put("crawlTime", getCurrentTime());
+        resultData.put("postTime", registTime);
+        resultData.put("content", "content");
+        resultData.put("forwardcontent", "forwardcontent");
+        System.out.println(resultData);
+    }
+
+    /**
+     * Converts a day/month/year date such as {@code 5/03/2021} to {@code yyyy-MM-dd HH:mm:ss} form.
+     *
+     * @param inputDate date text in {@code d/MM/yyyy} form
+     * @return the reformatted date, or the literal {@code "Invalid date format"} when parsing fails
+     */
+    public static String convertDate(String inputDate) {
+        try {
+            Date date = new SimpleDateFormat("d/MM/yyyy").parse(inputDate);
+            return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(date);
+        } catch (ParseException e) {
+            // main() stores the returned text directly, so the sentinel string is kept unchanged.
+            return "Invalid date format";
+        }
+    }
+
+    /** Returns the current local date-time formatted as {@code yyyy-MM-dd HH:mm:ss}. */
+    public static String getCurrentTime() {
+        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+        return LocalDateTime.now().format(formatter);
+    }
+}
diff --git a/src/main/java/com/example/AusList.java b/src/main/java/com/example/AusList.java
new file mode 100644
index 0000000..eaae8f9
--- /dev/null
+++ b/src/main/java/com/example/AusList.java
@@ -0,0 +1,200 @@
+package com.example;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class AusList {
+    /**
+     * Fetches one page of the ANZCTR trial search and prints the link of every trial listed on it.
+     *
+     * <p>An initial GET collects the cookies and the ASP.NET hidden form fields; those fields are then
+     * sent back in an AJAX-style POST for the page number parsed from {@code targetUrl}.
+     *
+     * @param args unused
+     * @throws Exception on any network failure or a non-numeric page number
+     */
+    public static void main(String[] args) throws Exception {
+        String targetUrl = "https://www.anzctr.org.au/TrialSearch.aspx?page=20";
+        String baseUrl = "https://www.anzctr.org.au/TrialSearch.aspx";
+        String postUrl = "https://www.anzctr.org.au/TrialSearch.aspx";
+        String pageNumber = targetUrl.contains("?page=") ? targetUrl.split("page=")[1] : "1";
+        int page = Integer.parseInt(pageNumber);
+        System.out.println("Page Number: " + page);
+        // Cookies collected from the initial response.
+        Set<String> cookieSet = new HashSet<>();
+
+        // Step 1: initial GET to obtain the cookies and the ViewState fields.
+        HttpURLConnection initialConn = (HttpURLConnection) new URL(baseUrl).openConnection();
+        initialConn.setRequestMethod("GET");
+        initialConn.setRequestProperty("User-Agent",
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+        initialConn.setRequestProperty("Accept",
+                "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+        initialConn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7");
+        initialConn.setRequestProperty("Cache-Control", "no-cache");
+        initialConn.setRequestProperty("Pragma", "no-cache");
+        initialConn.setRequestProperty("Upgrade-Insecure-Requests", "1");
+        initialConn.setRequestProperty("Sec-Fetch-Dest", "document");
+        initialConn.setRequestProperty("Sec-Fetch-Mode", "navigate");
+        initialConn.setRequestProperty("Sec-Fetch-Site", "same-origin");
+        initialConn.setRequestProperty("Sec-Fetch-User", "?1");
+        initialConn.setRequestProperty("Sec-CH-UA",
+                "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"");
+        initialConn.setRequestProperty("Sec-CH-UA-Mobile", "?0");
+        initialConn.setRequestProperty("Sec-CH-UA-Platform", "\"Windows\"");
+        initialConn.setInstanceFollowRedirects(false);
+        initialConn.setConnectTimeout(10000);
+        initialConn.setReadTimeout(10000);
+
+        StringBuilder content = new StringBuilder();
+        String sessionId;
+        try {
+            // updateCookies reads the response headers, so the cookies are captured before the body.
+            sessionId = updateCookies(initialConn, cookieSet);
+            try (BufferedReader in = new BufferedReader(
+                    new InputStreamReader(initialConn.getInputStream(), StandardCharsets.UTF_8))) {
+                String inputLine;
+                while ((inputLine = in.readLine()) != null) {
+                    content.append(inputLine);
+                }
+            }
+        } finally {
+            initialConn.disconnect();
+        }
+
+        // Pull the hidden form fields out of the page so they can be echoed back in the POST.
+        Map<String, String> viewStateData = extractViewStateData(content.toString());
+        String viewState = viewStateData.get("__VIEWSTATE");
+        String viewStateGen = viewStateData.get("__VIEWSTATEGENERATOR");
+        String eventValidation = viewStateData.get("__EVENTVALIDATION");
+        String payload = buildPostData(viewState,eventValidation,viewStateGen,page,sessionId);
+
+        // Step 2: POST the form for the requested page.
+        // NOTE(review): cookieSet is never sent as a Cookie header here; confirm sessionId in the payload suffices.
+        HttpURLConnection conn = (HttpURLConnection) new URL(postUrl).openConnection();
+        conn.setRequestMethod("POST");
+        conn.setDoOutput(true);
+        conn.setInstanceFollowRedirects(false);
+        conn.setConnectTimeout(10000);
+        conn.setReadTimeout(10000);
+        // Request headers imitating a browser AJAX call.
+        conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
+        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+        conn.setRequestProperty("Accept", "*/*");
+        conn.setRequestProperty("X-Requested-With", "XMLHttpRequest");
+        conn.setRequestProperty("X-MicrosoftAjax", "Delta=true");
+        conn.setRequestProperty("Referer", "https://www.anzctr.org.au/TrialSearch.aspx");
+        conn.setRequestProperty("Origin", "https://www.anzctr.org.au");
+
+        StringBuilder response = new StringBuilder();
+        try {
+            try (OutputStream os = conn.getOutputStream()) {
+                os.write(payload.getBytes(StandardCharsets.UTF_8));
+            }
+            try (BufferedReader re = new BufferedReader(
+                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
+                String line;
+                while ((line = re.readLine()) != null) {
+                    response.append(line);
+                }
+            }
+        } finally {
+            conn.disconnect();
+        }
+        Document parse = Jsoup.parse(response.toString());
+        Elements elements = parse.select(".results-header-tools a");
+        for (Element element : elements) {
+            System.out.println("https://www.anzctr.org.au" + element.attr("href"));
+        }
+    }
+
+ /**
+  * Collects the cookies announced by a response and returns the session cookie, if any.
+  *
+  * <p>Every {@code Set-Cookie} header is reduced to its leading {@code name=value} pair
+  * (attributes such as {@code Path} are dropped) and added to {@code cookieSet}. A pair
+  * whose name is {@code ASP.NET_SessionId} or {@code csfcfc} is remembered as the session
+  * cookie; when several match, the last one wins.
+  *
+  * @param conn      connection whose response headers are inspected
+  * @param cookieSet receives every {@code name=value} pair found
+  * @return the session cookie as {@code name=value}, or {@code null} when none was set
+  */
+ private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) {
+     String sessionId = null;
+     for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
+         // Header names are case-insensitive, and the status line is stored under a null key.
+         if (!"Set-Cookie".equalsIgnoreCase(header.getKey()) || header.getValue() == null) {
+             continue;
+         }
+         for (String cookie : header.getValue()) {
+             if (cookie == null) {
+                 continue;
+             }
+             // indexOf instead of split(";")[0]: split yields an empty array for a lone ";".
+             int attributesStart = cookie.indexOf(';');
+             String cookieValue = (attributesStart < 0 ? cookie : cookie.substring(0, attributesStart)).trim();
+             if (cookieValue.isEmpty()) {
+                 continue;
+             }
+             cookieSet.add(cookieValue);
+             if (cookieValue.startsWith("ASP.NET_SessionId=") || cookieValue.startsWith("csfcfc=")) {
+                 sessionId = cookieValue;
+             }
+         }
+     }
+     return sessionId;
+ }
+ /**
+  * Extracts the hidden state fields {@code __VIEWSTATE}, {@code __VIEWSTATEGENERATOR} and
+  * {@code __EVENTVALIDATION} from a page.
+  *
+  * <p>A field that cannot be found is reported on {@code System.err} and is simply absent
+  * from the returned map; no exception is thrown.
+  *
+  * @param html page markup to search
+  * @return map from field name to its value, containing only the fields that were found
+  */
+ private static Map<String, String> extractViewStateData(String html) {
+     Map<String, String> stateMap = new HashMap<>();
+
+     // Each field is looked up independently with its own regular expression.
+     String[] extractionOrder = {"__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"};
+     for (String fieldName : extractionOrder) {
+         extractHiddenField(html, fieldName, stateMap);
+     }
+
+     // Missing fields are reported in a fixed order.
+     String[] reportOrder = {"__VIEWSTATE", "__EVENTVALIDATION", "__VIEWSTATEGENERATOR"};
+     for (String fieldName : reportOrder) {
+         if (!stateMap.containsKey(fieldName)) {
+             System.err.println("Failed to extract " + fieldName + " from HTML");
+         }
+     }
+     return stateMap;
+ }
+
+ /**
+  * Looks for an {@code <input>} tag whose {@code name} attribute equals {@code fieldName}
+  * and, when found, stores its {@code value} attribute in {@code map}.
+  *
+  * <p>The match is case-insensitive, requires {@code name} to appear before {@code value}
+  * inside the same tag, and requires a non-empty value. Nothing is stored when there is no
+  * match or when {@code html} or {@code fieldName} is {@code null}.
+  *
+  * @param html      page markup to search
+  * @param fieldName exact name of the hidden field
+  * @param map       receives {@code fieldName -> value} on success
+  */
+ private static void extractHiddenField(String html, String fieldName, Map<String, String> map) {
+     if (html == null || fieldName == null) {
+         return;
+     }
+     // Pattern.quote keeps regex metacharacters in the field name (e.g. '$') literal.
+     String regex = "(?i)<input[^>]*name=[\"']" + Pattern.quote(fieldName)
+             + "[\"'][^>]*value=[\"']([^\"']+)[\"']";
+     Matcher matcher = Pattern.compile(regex).matcher(html);
+
+     if (matcher.find()) {
+         map.put(fieldName, matcher.group(1));
+     }
+ }
+
+ /**
+  * Builds the {@code application/x-www-form-urlencoded} body of the asynchronous search
+  * post-back for one result page.
+  *
+  * <p>Fields are emitted in a fixed order, each name and value URL-encoded as UTF-8 and
+  * joined with {@code &}; there is no trailing {@code &}.
+  *
+  * @param viewState       value of {@code __VIEWSTATE}
+  * @param eventValidation value of {@code __EVENTVALIDATION}
+  * @param viewStateGen    value of {@code __VIEWSTATEGENERATOR}
+  * @param page            page number placed in the {@code page=} part of the event argument
+  * @param sessionId       not used when building the body; kept for signature compatibility
+  * @return the encoded body, or an empty string when a state field is {@code null} or
+  *         encoding fails (the reason is printed to {@code System.err})
+  */
+ private static String buildPostData(String viewState, String eventValidation, String viewStateGen, int page, String sessionId) {
+     if (viewState == null || eventValidation == null || viewStateGen == null) {
+         // Previously this surfaced only as a swallowed NullPointerException with a null message.
+         System.err.println("Error building POST data: missing __VIEWSTATE, __VIEWSTATEGENERATOR or __EVENTVALIDATION");
+         return "";
+     }
+
+     // The literal is split right after '&' in two places so that "&eth..." and "&reg..."
+     // cannot be mistaken for HTML entities by tooling that processes this source.
+     String eventArgument = "conditionCode=&dateOfRegistrationFrom=&interventionDescription="
+             + "&interventionCodeOperator=OR&primarySponsorType=&gender=&distance=&postcode="
+             + "&pageSize=20&ageGroup=&recruitmentCountryOperator=OR&recruitmentRegion=&"
+             + "ethicsReview=&countryOfRecruitment=&"
+             + "registry=&searchTxt=&studyType=&allocationToIntervention=&dateOfRegistrationTo="
+             + "&recruitmentStatus=&interventionCode=&healthCondition=&healthyVolunteers="
+             + "&page=" + page
+             + "&conditionCategory=&fundingSource=&trialStartDateTo=&trialStartDateFrom=&phase=";
+
+     // Name/value pairs in the order they are written to the body.
+     String[][] fields = {
+         {"ctl00$body$tsmAJAXScriptManager", "ctl00$body$tsmAJAXScriptManager|ctl00$body$tsmAJAXScriptManager"},
+         {"ctl00_body_tsmAJAXScriptManager_HiddenField", ""},
+         {"__EVENTTARGET", "ctl00$body$tsmAJAXScriptManager"},
+         {"__EVENTARGUMENT", eventArgument},
+         {"__LASTFOCUS", ""},
+         {"__VIEWSTATE", viewState},
+         {"__VIEWSTATEGENERATOR", viewStateGen},
+         {"__SCROLLPOSITIONX", "0"},
+         {"__SCROLLPOSITIONY", "0"},
+         {"__EVENTVALIDATION", eventValidation},
+         {"__ASYNCPOST", "true"},
+     };
+
+     try {
+         String utf8 = StandardCharsets.UTF_8.name();
+         StringBuilder payload = new StringBuilder();
+         for (String[] field : fields) {
+             if (payload.length() > 0) {
+                 payload.append('&');
+             }
+             payload.append(URLEncoder.encode(field[0], utf8))
+                    .append('=')
+                    .append(URLEncoder.encode(field[1], utf8));
+         }
+         return payload.toString();
+     } catch (java.io.UnsupportedEncodingException e) {
+         System.err.println("Error building POST data: " + e.getMessage());
+         return "";
+     }
+ }
+
+}
diff --git a/src/main/java/com/example/CaptchaOCR.java b/src/main/java/com/example/CaptchaOCR.java
new file mode 100644
index 0000000..f9f6c53
--- /dev/null
+++ b/src/main/java/com/example/CaptchaOCR.java
@@ -0,0 +1,173 @@
+package com.example;
+
+import java.awt.image.BufferedImage;
+import java.io.*;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import javax.imageio.ImageIO;
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+
+// ... 其他必要的导入 ...
+
+public class CaptchaOCR {
+
+    /**
+     * Default directory containing the Tesseract {@code tessdata} files.
+     * Windows example: "C:\\Program Files\\Tesseract-OCR\\tessdata".
+     * Can be overridden at launch with {@code -Dtessdata.path=...}.
+     */
+    private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata";
+
+    /** System property that, when set and non-empty, replaces {@link #TESSDATA_PATH}. */
+    private static final String TESSDATA_PROPERTY = "tessdata.path";
+
+    /** Connect and read timeout for the image download, in milliseconds. */
+    private static final int TIMEOUT_MILLIS = 10000;
+
+    private static final String USER_AGENT =
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";
+
+    /**
+     * Downloads a CAPTCHA image.
+     *
+     * @param imageUrl full URL of the image
+     * @return the decoded image
+     * @throws IOException if the server does not answer 200, the transfer fails or times out,
+     *                     or the payload cannot be decoded as an image
+     */
+    public static BufferedImage downloadImage(String imageUrl) throws IOException {
+        HttpURLConnection conn = (HttpURLConnection) new URL(imageUrl).openConnection();
+        try {
+            conn.setRequestMethod("GET");
+            // Without timeouts a stalled server would block the caller indefinitely.
+            conn.setConnectTimeout(TIMEOUT_MILLIS);
+            conn.setReadTimeout(TIMEOUT_MILLIS);
+            // Browser-like User-Agent; add further headers here if the site requires them.
+            conn.setRequestProperty("User-Agent", USER_AGENT);
+
+            int responseCode = conn.getResponseCode();
+            if (responseCode != HttpURLConnection.HTTP_OK) {
+                throw new IOException("Failed to download image. HTTP error code: " + responseCode);
+            }
+
+            try (InputStream is = conn.getInputStream()) {
+                // Buffer the whole payload first so ImageIO decodes from memory, not the socket.
+                ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                byte[] buffer = new byte[4096];
+                int bytesRead;
+                while ((bytesRead = is.read(buffer)) != -1) {
+                    baos.write(buffer, 0, bytesRead);
+                }
+
+                BufferedImage image = ImageIO.read(new ByteArrayInputStream(baos.toByteArray()));
+                if (image == null) {
+                    // ImageIO.read returns null when no registered reader understands the data.
+                    throw new IOException("Failed to read image from stream. Check image format.");
+                }
+                return image;
+            }
+        } finally {
+            conn.disconnect();
+        }
+    }
+
+    /**
+     * Pre-processes a CAPTCHA image: converts it to grayscale, then binarizes it with a
+     * fixed threshold of 128. The result is also written to {@code preprocessed_captcha.png}
+     * in the working directory as a debugging aid; a failure to write that file is reported
+     * on {@code System.err} and does not affect the returned image.
+     *
+     * <p>This is only a baseline; real CAPTCHAs usually need tuning (noise and line removal,
+     * character segmentation, deskewing, contrast adjustment).
+     *
+     * @param originalImage the image to process
+     * @return a {@code TYPE_BYTE_BINARY} image of the same size
+     */
+    public static BufferedImage preprocessImage(BufferedImage originalImage) {
+        int width = originalImage.getWidth();
+        int height = originalImage.getHeight();
+
+        BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY);
+        java.awt.Graphics2D graphics = grayImage.createGraphics();
+        try {
+            graphics.drawImage(originalImage, 0, 0, null);
+        } finally {
+            graphics.dispose(); // release the native drawing context
+        }
+
+        BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY);
+        // Fetch the rasters once instead of once per pixel.
+        java.awt.image.WritableRaster grayRaster = grayImage.getRaster();
+        java.awt.image.WritableRaster binaryRaster = binaryImage.getRaster();
+
+        // Binarization threshold (0-255); may need tuning per CAPTCHA style.
+        int threshold = 128;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                int gray = grayRaster.getSample(x, y, 0);
+                // 0 = black, 1 = white
+                binaryRaster.setSample(x, y, 0, gray < threshold ? 0 : 1);
+            }
+        }
+
+        // Save the pre-processed image so the effect of the tuning can be inspected.
+        try {
+            File outputfile = new File("preprocessed_captcha.png");
+            ImageIO.write(binaryImage, "png", outputfile);
+            System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath());
+        } catch (IOException e) {
+            System.err.println("Could not save preprocessed image: " + e.getMessage());
+        }
+
+        return binaryImage;
+    }
+
+    /**
+     * Recognizes the text in an image with Tess4J.
+     *
+     * @param image image to recognize, preferably already pre-processed
+     * @return the recognized text with everything except ASCII letters and digits removed,
+     *         or {@code null} when {@code image} is {@code null} or OCR fails
+     */
+    public static String recognizeCaptcha(BufferedImage image) {
+        if (image == null) {
+            System.err.println("Error during OCR: image is null");
+            return null;
+        }
+
+        Tesseract tesseract = new Tesseract();
+
+        String configured = System.getProperty(TESSDATA_PROPERTY);
+        String dataPath = (configured != null && !configured.isEmpty()) ? configured : TESSDATA_PATH;
+        if (dataPath != null && !dataPath.isEmpty()) {
+            tesseract.setDatapath(dataPath);
+        } else {
+            System.out.println("TESSDATA_PATH not set. Tess4J will try to find tessdata automatically.");
+        }
+
+        tesseract.setLanguage("eng");
+        // For digit-only CAPTCHAs a whitelist can help:
+        // tesseract.setTessVariable("tessedit_char_whitelist", "0123456789");
+
+        try {
+            String result = tesseract.doOCR(image);
+            // Strip whitespace, line breaks and anything else that is not a letter or digit.
+            result = result.trim().replaceAll("[^0-9a-zA-Z]", "");
+            System.out.println("OCR Result: " + result);
+            return result;
+        } catch (TesseractException e) {
+            System.err.println("Error during OCR: " + e.getMessage());
+            return null;
+        }
+    }
+
+    /** Example of how the three steps fit into a crawler flow. */
+    public static void main(String[] args) {
+        // URL of the CAPTCHA image, normally parsed from the page being scraped.
+        String captchaImageUrl = "YOUR_CAPTCHA_IMAGE_URL";
+
+        try {
+            // 1. Download the image.
+            BufferedImage originalCaptchaImage = downloadImage(captchaImageUrl);
+            System.out.println("Image downloaded.");
+
+            // 2. Pre-process it.
+            BufferedImage preprocessedImage = preprocessImage(originalCaptchaImage);
+            System.out.println("Image preprocessed.");
+
+            // 3. Run OCR.
+            String captchaCode = recognizeCaptcha(preprocessedImage);
+
+            if (captchaCode != null && !captchaCode.isEmpty()) {
+                System.out.println("Recognized CAPTCHA: " + captchaCode);
+                // 4. Put captchaCode into the form data and submit the POST request, e.g.
+                //    postData += "&captchaInputFieldName=" + URLEncoder.encode(captchaCode, StandardCharsets.UTF_8.name());
+            } else {
+                System.out.println("Failed to recognize CAPTCHA.");
+                // 5. Recognition failed: retry or log as appropriate.
+            }
+
+        } catch (IOException e) {
+            System.err.println("Error downloading or processing image: " + e.getMessage());
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/example/CsAirScraper.java b/src/main/java/com/example/CsAirScraper.java
new file mode 100644
index 0000000..b47cf8d
--- /dev/null
+++ b/src/main/java/com/example/CsAirScraper.java
@@ -0,0 +1,81 @@
+package com.example;
+
+import io.github.bonigarcia.wdm.WebDriverManager;
+import org.apache.hc.client5.http.classic.methods.HttpPost;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.core5.http.io.entity.StringEntity;
+import org.apache.hc.core5.http.io.entity.EntityUtils;
+import org.openqa.selenium.Cookie;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class CsAirScraper {
+
+    private static final String QUERY_URL = "https://b2c.csair.com/portal/main/flight/direct/query";
+
+    /** Flight date (yyyyMMdd) used when none is given on the command line. */
+    private static final String DEFAULT_FLIGHT_DATE = "20250514";
+
+    /**
+     * Opens the China Southern query page in Chrome to let its scripts set cookies, then
+     * replays those cookies in a plain HTTP POST and prints the status code and body.
+     *
+     * @param args optional; {@code args[0]} is the flight date as eight digits (yyyyMMdd)
+     * @throws Exception if the browser, the wait or the HTTP exchange fails
+     */
+    public static void main(String[] args) throws Exception {
+        String flightDate = args.length > 0 ? args[0] : DEFAULT_FLIGHT_DATE;
+        // The value is spliced into a JSON string, so only accept plain digits.
+        if (!flightDate.matches("\\d{8}")) {
+            throw new IllegalArgumentException("flightDate must be 8 digits (yyyyMMdd): " + flightDate);
+        }
+
+        // 1-2. Start Selenium, visit the site and collect the cookies set by its JavaScript.
+        String cookieHeader = fetchCookieHeader();
+        System.out.println("获取到 Cookie: " + cookieHeader);
+
+        // 3. Prepare the HttpClient request carrying those cookies.
+        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
+            HttpPost post = new HttpPost(QUERY_URL);
+
+            post.setHeader("Content-Type", "application/json");
+            post.setHeader("Cookie", cookieHeader);
+            post.setHeader("User-Agent", "Mozilla/5.0");
+
+            // JSON request body.
+            String json = "{"
+                    + "\"action\": \"0\","
+                    + "\"adultNum\": \"1\","
+                    + "\"airLine\": 1,"
+                    + "\"arrCity\": \"PKX\","
+                    + "\"businessType\": \"COMMON\","
+                    + "\"cabinOrder\": \"0\","
+                    + "\"cache\": 0,"
+                    + "\"childNum\": \"0\","
+                    + "\"depCity\": \"CAN\","
+                    + "\"flightDate\": \"" + flightDate + "\","
+                    + "\"flyType\": 0,"
+                    + "\"infantNum\": \"0\","
+                    + "\"international\": \"0\","
+                    + "\"isMember\": \"\","
+                    + "\"isMultipass\": 1,"
+                    + "\"language\": \"zh\","
+                    + "\"preUrl\": \"\","
+                    + "\"segType\": \"1\","
+                    + "\"tariffRules\": []"
+                    + "}";
+
+            // Explicit UTF-8 rather than the entity's default charset.
+            post.setEntity(new StringEntity(json, java.nio.charset.StandardCharsets.UTF_8));
+
+            // 4. Send the request.
+            try (CloseableHttpResponse response = httpClient.execute(post)) {
+                int code = response.getCode();
+                String result = EntityUtils.toString(response.getEntity(), java.nio.charset.StandardCharsets.UTF_8);
+                System.out.println("状态码: " + code);
+                System.out.println("响应: " + result);
+            }
+        }
+    }
+
+    /**
+     * Loads the query page in a fresh Chrome session, waits five seconds for scripts to set
+     * their cookies and returns them formatted as a {@code Cookie} request header value.
+     * The browser is closed even when loading or waiting fails.
+     */
+    private static String fetchCookieHeader() throws InterruptedException {
+        WebDriverManager.chromedriver().setup();
+        WebDriver driver = new ChromeDriver();
+        try {
+            driver.get(QUERY_URL);
+
+            // Fixed wait for the page's JavaScript to set cookies; adjust to the page's real timing.
+            Thread.sleep(5000);
+
+            Set<Cookie> seleniumCookies = driver.manage().getCookies();
+            return seleniumCookies.stream()
+                    .map(c -> c.getName() + "=" + c.getValue())
+                    .collect(Collectors.joining("; "));
+        } finally {
+            driver.quit();
+        }
+    }
+}
diff --git a/src/main/java/com/example/CtriScraper.java b/src/main/java/com/example/CtriScraper.java
new file mode 100644
index 0000000..3ff578f
--- /dev/null
+++ b/src/main/java/com/example/CtriScraper.java
@@ -0,0 +1,404 @@
+package com.example;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import okhttp3.*;
+import org.apache.hc.client5.http.cookie.BasicCookieStore;
+import org.apache.hc.client5.http.cookie.CookieStore;
+import org.apache.hc.client5.http.classic.methods.HttpGet;
+import org.apache.hc.client5.http.classic.methods.HttpPost;
+import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
+import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
+import org.apache.hc.client5.http.impl.classic.HttpClients;
+import org.apache.hc.client5.http.protocol.HttpClientContext;
+import org.apache.hc.core5.http.HttpEntity;
+import org.apache.hc.core5.http.NameValuePair;
+import org.apache.hc.core5.http.io.entity.EntityUtils;
+import org.apache.hc.core5.http.message.BasicNameValuePair;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.Month;
+import java.time.Year;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Crawls the CTRI clinical-trial registry (ctri.nic.in): submits the advanced-search form
+ * month by month, fetches every trial detail page found and publishes one JSON record per
+ * trial to Kafka.
+ *
+ * <p>Loading this class creates the Kafka producer in a static initializer.
+ */
+public class CtriScraper {
+
+    private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php";
+
+    private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php";
+
+    private static final String DETAIL_BASE_URL = "https://ctri.nic.in/Clinicaltrials/";
+
+    private static final String USER_AGENT =
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
+    private static final String ACCEPT =
+            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
+    private static final String ACCEPT_LANGUAGE = "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7";
+
+    /** Captures the text between the first pair of single quotes in a result link's href. */
+    private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'");
+
+    /** Captures the dd/MM/yyyy date from a "[Registered on: ...]" marker. */
+    private static final Pattern REGISTERED_ON_PATTERN =
+            Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
+
+    private static final DateTimeFormatter SITE_DATE_FORMAT = DateTimeFormatter.ofPattern("dd/MM/yyyy");
+    private static final DateTimeFormatter REGIST_TIME_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd '00:00:00'");
+    private static final DateTimeFormatter CRAWL_TIME_FORMAT = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+
+    /** Selector prefix shared by every field read from a detail page. */
+    private static final String DETAIL_TABLE = "body > table > tbody > tr > td > table:nth-child(11) > tbody > ";
+
+    private static final String TOPIC_NAME = "cliniTopic";
+    private static final String BOOTSTRAP_SERVERS = "node-01:19092";
+    private static final KafkaProducer<String, String> producer;
+    private static final ObjectMapper objectMapper = new ObjectMapper();
+    /** One shared client for all detail-page requests instead of a new one per request. */
+    private static final OkHttpClient DETAIL_CLIENT = new OkHttpClient();
+    private static final Random random = new Random();
+
+    static {
+        Properties props = new Properties();
+        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
+        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
+        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
+        props.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all replicas to acknowledge
+        props.put(ProducerConfig.RETRIES_CONFIG, 3);  // number of retries
+        producer = new KafkaProducer<>(props);
+    }
+
+    /**
+     * Runs the advanced search for one month and returns the detail-page URLs found.
+     *
+     * <p>The form page is fetched first to obtain the {@code csrf_token} and
+     * {@code __ncforminfo} hidden fields, which are then posted back together with the
+     * search criteria using the same cookie store.
+     *
+     * @param year  year to search
+     * @param month month to search (1-12); sent zero-padded to two digits
+     * @return the detail URLs (possibly empty), or {@code null} when any step fails. Callers
+     *         rely on {@code null} to tell a failed crawl from a month without data.
+     */
+    public static List<String> getlink(Integer year, Integer month) {
+        List<String> linkList = new ArrayList<>();
+        // Cookies are shared between the GET and the POST through this store.
+        CookieStore cookieStore = new BasicCookieStore();
+        HttpClientContext context = HttpClientContext.create();
+        context.setCookieStore(cookieStore);
+
+        try (CloseableHttpClient httpClient = HttpClients.custom()
+                .setDefaultCookieStore(cookieStore)
+                .build()) {
+
+            // --- Steps 1-2: fetch the form page ---
+            HttpGet getRequest = new HttpGet(SEARCH_FORM_URL);
+            getRequest.setHeader("User-Agent", USER_AGENT);
+            getRequest.setHeader("Accept", ACCEPT);
+            getRequest.setHeader("Accept-Language", ACCEPT_LANGUAGE);
+
+            String formPageHtml;
+            try (CloseableHttpResponse response = httpClient.execute(getRequest, context)) {
+                int statusCode = response.getCode();
+                if (statusCode != 200) {
+                    System.err.println("Error: GET request to form page failed with status code: " + statusCode);
+                    EntityUtils.consume(response.getEntity()); // drain so the connection can be released
+                    return null;
+                }
+
+                HttpEntity entity = response.getEntity();
+                if (entity == null) {
+                    System.err.println("Error: Failed to get form page entity.");
+                    return null;
+                }
+                formPageHtml = EntityUtils.toString(entity, StandardCharsets.UTF_8);
+            }
+
+            // --- Steps 3-4: extract csrf_token and __ncforminfo ---
+            Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL);
+
+            Element csrfTokenInput = doc.selectFirst("input[name=csrf_token][type=hidden]");
+            if (csrfTokenInput == null) {
+                System.err.println("Warning: Could not find csrf_token input field.");
+                return null;
+            }
+            Element ncFormInfoInput = doc.selectFirst("input[name=__ncforminfo][type=hidden]");
+            if (ncFormInfoInput == null) {
+                System.err.println("Warning: Could not find __ncforminfo input field.");
+                return null;
+            }
+            String csrfToken = csrfTokenInput.val();
+            String ncFormInfo = ncFormInfoInput.val();
+
+            // --- Steps 5-6: build and send the POST ---
+            HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL);
+            postRequest.setHeader("User-Agent", USER_AGENT);
+            postRequest.setHeader("Referer", SEARCH_FORM_URL);
+            postRequest.setHeader("Origin", "https://ctri.nic.in");
+            postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded");
+            postRequest.setHeader("Accept", ACCEPT);
+            postRequest.setHeader("Accept-Language", ACCEPT_LANGUAGE);
+            postRequest.setHeader("Pragma", "no-cache");
+
+            List<NameValuePair> params = new ArrayList<>();
+            params.add(new BasicNameValuePair("stid", "1"));
+            params.add(new BasicNameValuePair("csrf_token", csrfToken));
+            params.add(new BasicNameValuePair("pros", "1"));
+            params.add(new BasicNameValuePair("month", String.format("%02d", month)));
+            params.add(new BasicNameValuePair("year", String.valueOf(year)));
+            params.add(new BasicNameValuePair("study", "0"));
+            params.add(new BasicNameValuePair("sdid", "0"));
+            params.add(new BasicNameValuePair("phaseid", "0"));
+            params.add(new BasicNameValuePair("psponsor", "0"));
+            params.add(new BasicNameValuePair("recid", "0"));
+            params.add(new BasicNameValuePair("state", "0"));
+            params.add(new BasicNameValuePair("district", "0"));
+            params.add(new BasicNameValuePair("searchword", ""));
+            params.add(new BasicNameValuePair("T4", "anyvalue")); // arbitrary filler value
+            params.add(new BasicNameValuePair("btt", "Search"));
+            params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo));
+
+            postRequest.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));
+
+            try (CloseableHttpResponse postResponse = httpClient.execute(postRequest, context)) {
+                int postStatusCode = postResponse.getCode();
+                if (postStatusCode != 200) {
+                    System.err.println("Error: POST request to submit form failed with status code: " + postStatusCode);
+                    EntityUtils.consume(postResponse.getEntity());
+                    return null;
+                }
+
+                HttpEntity postEntity = postResponse.getEntity();
+                if (postEntity == null) {
+                    System.err.println("Error: Failed to get search results entity.");
+                    return null;
+                }
+                String searchResultsHtml = EntityUtils.toString(postEntity, StandardCharsets.UTF_8);
+
+                // --- Step 7: pull the detail links out of the result table ---
+                Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL);
+                for (Element linkElement : resultsDoc.select("tr a")) {
+                    String rawLink = linkElement.attr("href");
+                    Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink);
+                    if (matcher.find()) {
+                        // The quoted part of the href is the path relative to the site folder.
+                        linkList.add(DETAIL_BASE_URL + matcher.group(1));
+                    } else {
+                        System.err.println("Warning: Link does not match expected pattern: " + rawLink);
+                    }
+                }
+                return linkList;
+            }
+
+        } catch (IOException e) {
+            System.err.println("Network or IO error during scraping:");
+            e.printStackTrace();
+            return null;
+        } catch (Exception e) {
+            // Anything else, e.g. a response that cannot be parsed.
+            System.err.println("An unexpected error occurred during scraping:");
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    /**
+     * Crawls from the current month back to January 2024, newest first, and always closes
+     * the Kafka producer on the way out. An interrupt stops the crawl.
+     */
+    public static void main(String[] args) {
+        final int currentYear = Year.now().getValue();
+        try {
+            for (int year = currentYear; year >= 2024; year--) {
+                int monthStart = (year == currentYear) ? LocalDate.now().getMonthValue() : 12;
+
+                for (int month = monthStart; month >= 1; month--) {
+                    try {
+                        crawlMonth(year, month);
+                    } catch (InterruptedException e) {
+                        // Restore the flag and stop instead of carrying on with the next URL.
+                        Thread.currentThread().interrupt();
+                        System.err.println("Interrupted while crawling " + year + "-" + month + ", stopping.");
+                        return;
+                    } catch (Exception e) {
+                        System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage());
+                        e.printStackTrace();
+                    }
+                }
+            }
+        } finally {
+            // close() also flushes records that are still buffered.
+            producer.close();
+        }
+    }
+
+    /**
+     * Fetches every trial of one month and sends each as a JSON record keyed by its
+     * registration number. A failure on one URL is logged and does not stop the month.
+     */
+    private static void crawlMonth(int year, int month) throws InterruptedException {
+        List<String> links = getlink(year, month);
+        if (links == null) {
+            System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!");
+            return;
+        }
+        if (links.isEmpty()) {
+            System.out.println("年份 " + year + " 月份 " + month + " 无数据!");
+            return;
+        }
+
+        int count = 0;
+        for (String url : links) {
+            try {
+                Map<String, Object> result = reslutData(url);
+                result.put("crawlUrl", url);
+
+                String registNum = String.valueOf(result.get("registNum"));
+                String jsonValue = objectMapper.writeValueAsString(result);
+
+                ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
+
+                producer.send(record, (metadata, exception) -> {
+                    if (exception == null) {
+                        System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url);
+                    } else {
+                        System.err.println("❌ Kafka 发送失败: " + exception.getMessage());
+                    }
+                });
+
+                // Pace the crawl: 3000-4000 ms, drawn per request so the delay actually varies.
+                Thread.sleep(random.nextInt(1001) + 3000);
+                count++;
+            } catch (InterruptedException e) {
+                throw e;
+            } catch (Exception e) {
+                System.err.println("抓取或发送失败: " + url);
+                e.printStackTrace();
+            }
+        }
+
+        System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。");
+    }
+
+    /**
+     * Downloads one trial detail page and maps its table cells to the record fields.
+     *
+     * <p>Cells are addressed by fixed row positions; a cell that is not found yields an
+     * empty string, and {@code registTime}/{@code postTime} are {@code null} when no
+     * registration date is found.
+     *
+     * @param url detail page URL
+     * @return mutable map holding the record
+     * @throws IOException if the request fails or the server answers with a non-2xx status
+     */
+    public static Map<String, Object> reslutData(String url) throws IOException {
+        Request request = new Request.Builder()
+                .url(url)
+                .get()
+                .build();
+
+        String html;
+        // The response must be closed, otherwise its connection is leaked.
+        try (Response response = DETAIL_CLIENT.newCall(request).execute()) {
+            if (!response.isSuccessful()) {
+                throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
+            }
+            html = response.body().string();
+        }
+        Document parse = Jsoup.parse(html);
+
+        String title = cell(parse, "tr:nth-child(7) > td:nth-child(2)");
+        String registNum = cell(parse, "tr:nth-child(1) > td:nth-child(2) > b");
+        String registTime = extractAndConvertDate(cell(parse, "tr:nth-child(1) > td:nth-child(2)"));
+
+        Map<String, String> sponsor = new HashMap<>();
+        sponsor.put("Source of Monetary or Material Support",
+                cell(parse, "tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td"));
+        sponsor.put("Primary Sponsor",
+                cell(parse, "tr:nth-child(15) > td:nth-child(2) > table > tbody"));
+
+        String studyType = cell(parse, "tr:nth-child(4) > td:nth-child(2)");
+        String phase = cell(parse, "tr:nth-child(31) > td:nth-child(2)");
+
+        Map<String, String> disease = new HashMap<>();
+        disease.put("healthType",
+                cell(parse, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)"));
+        disease.put("condition",
+                cell(parse, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)"));
+
+        String studyDesign = cell(parse, "tr:nth-child(6) > td:nth-child(2)");
+        String inclusionCriteria = cell(parse, "tr:nth-child(23) > td:nth-child(2) > table > tbody");
+        String exclusionCriteria = cell(parse, "tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)");
+        String enrollment = cell(parse, "tr:nth-child(30) > td:nth-child(2)");
+        String country = cell(parse, "tr:nth-child(17) > td:nth-child(2)");
+        String intervention = cell(parse, "tr:nth-child(22) > td:nth-child(2) > table");
+
+        Map<String, String> primaryOutcome = new HashMap<>();
+        primaryOutcome.put("firstOutcome", cell(parse, "tr:nth-child(28) > td:nth-child(2) > table > tbody"));
+        primaryOutcome.put("secondOutcome", cell(parse, "tr:nth-child(29) > td:nth-child(2) > table > tbody"));
+
+        Map<String, Object> resultData = new HashMap<>();
+        resultData.put("disease", disease);
+        resultData.put("primaryOutcome", primaryOutcome);
+        resultData.put("intervention", intervention);
+        resultData.put("country", country);
+        resultData.put("enrollment", enrollment);
+        resultData.put("exclusionCriteria", exclusionCriteria);
+        resultData.put("inclusionCriteria", inclusionCriteria);
+        resultData.put("studyDesign", studyDesign);
+        resultData.put("sponsor", sponsor);
+        resultData.put("title", title);
+        resultData.put("registNum", registNum);
+        resultData.put("registTime", registTime);
+        resultData.put("studyType", studyType);
+        resultData.put("phase", phase);
+        // Fields that are always emitted as empty strings for this source.
+        resultData.put("registStatus", "");
+        resultData.put("registTitle", "");
+        resultData.put("fullTitle", "");
+        resultData.put("sponsorPart", "");
+        resultData.put("studyObjective", "");
+        resultData.put("studyStartDate", "");
+        resultData.put("currentStatus", "");
+        resultData.put("tagTime", "");
+        resultData.put("crawlTime", getCurrentTime());
+        resultData.put("crawlUrl", url);
+        resultData.put("postTime", registTime);
+        resultData.put("content", "content");
+        resultData.put("forwardcontent", "forwardcontent");
+        resultData.put("cid", "Nctrinicin");
+        return resultData;
+    }
+
+    /** Returns the combined text of the detail-table elements matched by {@code rowSelector}. */
+    private static String cell(Document page, String rowSelector) {
+        return page.select(DETAIL_TABLE + rowSelector).text();
+    }
+
+    /**
+     * @return the current local date-time formatted as {@code yyyy-MM-dd HH:mm:ss}
+     */
+    public static String getCurrentTime() {
+        return LocalDateTime.now().format(CRAWL_TIME_FORMAT);
+    }
+
+    /**
+     * Finds a {@code [Registered on: dd/MM/yyyy]} marker (the label may also be
+     * {@code 注册于}) and converts its date to {@code yyyy-MM-dd 00:00:00}.
+     *
+     * @param input text that may contain the marker
+     * @return the converted date, or {@code null} when {@code input} is {@code null}, the
+     *         marker is absent, or the date cannot be parsed
+     */
+    public static String extractAndConvertDate(String input) {
+        if (input == null) {
+            return null;
+        }
+        Matcher matcher = REGISTERED_ON_PATTERN.matcher(input);
+        if (!matcher.find()) {
+            return null;
+        }
+        try {
+            return LocalDate.parse(matcher.group(1), SITE_DATE_FORMAT).format(REGIST_TIME_FORMAT);
+        } catch (java.time.format.DateTimeParseException e) {
+            System.err.println("Unparseable registration date: " + matcher.group(1));
+            return null;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/example/CtriScraperContent.java b/src/main/java/com/example/CtriScraperContent.java
new file mode 100644
index 0000000..9840b40
--- /dev/null
+++ b/src/main/java/com/example/CtriScraperContent.java
@@ -0,0 +1,121 @@
+package com.example;
+
+import okhttp3.*;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.io.IOException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class CtriScraperContent { // One-shot CTRI scraper: downloads a single trial-detail page and prints the extracted fields.
+    public static void main(String[] args) throws IOException {
+        Map<String, Object> resultData = new HashMap<>();
+        String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName=";
+        OkHttpClient client = new OkHttpClient().newBuilder()
+                .build();
+        Request request = new Request.Builder()
+                .url(url)
+                .get()
+                .build();
+        String html;
+        try (Response response = client.newCall(request).execute()) { // close the response even if reading the body fails
+            html = response.body().string();
+        }
+        Document parse = Jsoup.parse(html);
+        String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text();
+        String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text();
+        String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text());
+        Map<String, String> sponsor = new HashMap<>();
+        String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text();
+        String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text();
+        sponsor.put("Source of Monetary or Material Support",SMMS);
+        sponsor.put("Primary Sponsor",primarySponsor);
+        String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text();
+        String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text();
+        Map<String, String> disease = new HashMap<>();
+        String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text();
+        String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text();
+        disease.put("healthType",healthType);
+        disease.put("condition",condition);
+        String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text();
+        String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text();
+        String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text();
+        String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text();
+        String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text();
+        String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text();
+        Map<String, String> primaryOutcome = new HashMap<>();
+        String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text();
+        String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text();
+        primaryOutcome.put("firstOutcome",firstOutcome);
+        primaryOutcome.put("secondOutcome",secondOutcome);
+        // Assemble the output record; fields with no extraction rule yet are written as empty strings.
+        resultData.put("disease",disease);
+        resultData.put("primaryOutcome",primaryOutcome);
+        resultData.put("intervention",intervention);
+        resultData.put("country",country);
+        resultData.put("enrollment",enrollment);
+        resultData.put("exclusionCriteria",exclusionCriteria);
+        resultData.put("inclusionCriteria",inclusionCriteria);
+        resultData.put("studyDesign",studyDesign);
+        resultData.put("sponsor",sponsor);
+        resultData.put("title",title);
+        resultData.put("registNum",registNum);
+        resultData.put("registTime",registTime);
+        resultData.put("studyType",studyType);
+        resultData.put("phase",phase);
+        resultData.put("registStatus","");
+        resultData.put("registTitle","");
+        resultData.put("fullTitle","");
+        resultData.put("sponsorPart","");
+        resultData.put("studyObjective","");
+        resultData.put("studyStartDate","");
+        resultData.put("currentStatus","");
+        resultData.put("tagTime","");
+        resultData.put("crawlTime",getCurrentTime());
+        resultData.put("crawlUrl",url);
+        resultData.put("postTime",registTime);
+        resultData.put("content","content");
+        resultData.put("forwardcontent","forwardcontent");
+        // Print the record; registTime/postTime are null when no registration date could be extracted.
+        System.out.println(resultData);
+    }
+    public static String getCurrentTime() {
+        // Output layout for the crawl timestamp
+        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+        // Current local date-time (JVM default time zone)
+        LocalDateTime now = LocalDateTime.now();
+        // Render it with the layout above
+        return now.format(formatter);
+    }
+    public static String extractAndConvertDate(String input) {
+        // Finds the first "[Registered on: dd/MM/yyyy]" tag (or the Chinese-label variant the regex also accepts)
+        // and returns that date as "yyyy-MM-dd 00:00:00"; returns null for null input, no tag, or an invalid date.
+        if (input == null) { return null; } // Pattern.matcher(null) would throw NullPointerException
+        Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
+        Matcher matcher = pattern.matcher(input);
+        if (matcher.find()) {
+            String dateStr = matcher.group(1); // the dd/MM/yyyy text captured by the regex
+            try {
+                SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy");
+                // Lenient parsing would silently roll 31/02/2020 over to 02/03/2020; reject such dates instead.
+                inputFormat.setLenient(false);
+                Date date = inputFormat.parse(dateStr);
+
+                // Re-emit as yyyy-MM-dd followed by a literal midnight time.
+                SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'");
+                return outputFormat.format(date);
+            } catch (ParseException e) {
+                System.err.println("Invalid registration date '" + dateStr + "': " + e.getMessage());
+            }
+        }
+        return null; // no tag matched, or the matched text was not a real calendar date
+    }
+}
diff --git a/src/main/java/com/example/Inka.java b/src/main/java/com/example/Inka.java
new file mode 100644
index 0000000..c60f147
--- /dev/null
+++ b/src/main/java/com/example/Inka.java
@@ -0,0 +1,113 @@
+package com.example;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import okhttp3.*;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class Inka { // Experimental client that replays one captured Patentscope JSF search POST and prints the response.
+// private static final String TOPIC_NAME = "patentTopic";
+// private static final String BOOTSTRAP_SERVERS = "localhost:9092";
+// private static KafkaProducer producer;
+// private static ObjectMapper objectMapper = new ObjectMapper();
+// private static final Random random = new Random();
+    private static List<String> proxyList = new ArrayList<>(); // proxy pool of "host:port" entries; stays empty while the loader below is disabled
+    private static int currentProxyIndex = 0; // index of the proxy currently in use
+// static {
+// Properties props = new Properties();
+// props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
+// props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
+// props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
+// props.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all replicas to acknowledge
+// props.put(ProducerConfig.RETRIES_CONFIG, 3); // retry count
+// producer = new KafkaProducer<>(props);
+// try {
+// proxyList = Files.readAllLines(Paths.get("proxy.txt"));
+// if (proxyList.isEmpty()) {
+// System.out.println("警告: proxy.txt 为空,未加载任何代理");
+// } else {
+// System.out.println("成功加载 " + proxyList.size() + " 个代理");
+// }
+// } catch (IOException e) {
+// System.err.println("读取 proxy.txt 失败: " + e.getMessage());
+// }
+// }
+    public static void main(String[] args) throws IOException, InterruptedException {
+        String load = "javax.faces.partial.ajax=true&javax.faces.source=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225&javax.faces.partial.execute=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225+advancedSearchForm&javax.faces.partial.render=advancedSearchForm+results-container+j_idt1272&advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225&advancedSearchForm=advancedSearchForm&advancedSearchForm%3AadvancedSearchAssistant=on&advancedSearchForm%3AadvancedSearchInput%3Ainput=rance10&javax.faces.ViewState=-3602994148230912322%3A-6313250694718303467";
+
+        OkHttpClient client = createClientWithProxy();
+        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded; charset=UTF-8");
+        RequestBody body = RequestBody.create(mediaType, load);
+        // Build the request (headers mirror a captured browser AJAX call)
+        Request request = new Request.Builder()
+                .url("https://patentscope.wipo.int/search/zh/result.jsf?_vid=P21-M9APK2-00815") // Patentscope result endpoint
+                .method("POST", body)
+                .addHeader("Accept", "application/xml, text/xml, */*; q=0.01")
+                .addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7")
+                .addHeader("Cache-Control", "no-cache")
+                .addHeader("Connection", "keep-alive")
+                .addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
+// .addHeader("Cookie", "JSESSIONID=F253B7B0920FFACB89354339F51E325C.wapp2nB; ABIW=balancer.cms41; _ga=GA1.1.33840258.1744249893; Hm_lvt_95e64d347633bfd0a2462e25c93606d6=1744249893; Hm_lpvt_95e64d347633bfd0a2462e25c93606d6=1744249893; HMACCOUNT=0388A9D4AC1C33F5; _pk_id.14.ec75=5aa7b2d46edf6083.1744249894.; cebs=1; _ce.clock_data=-923%2C212.87.194.3%2C1%2C33d0f257a817d1ca4c4381b87f8ad83f%2CChrome%2CJP; cebsp_=1; _pk_uid=0%3DNWFhN2IyZDQ2ZWRmNjA4Mw%3D%3D; _gcl_au=1.1.1245117354.1744249928; wipo-visitor-uunid=28f5a645185bc7b; _pk_ref.9.ec75=%5B%22%22%2C%22%22%2C1744249929%2C%22https%3A%2F%2Fwww.wipo.int%2F%22%5D; _pk_id.9.ec75=957af9d7ac871adb.1744249929.; _ga_15TSHJ0HWP=GS1.1.1744249893.1.1.1744250058.58.0.0; _ce.s=v~274adfa655dbaad3ae6a47724ee5bf89d205d10f~lcw~1744250058720~vir~new~lva~1744249893962~vpv~0~v11.cs~411929~v11.s~559ada70-15ae-11f0-a979-459b55a048ba~v11.sla~1744250058728~gtrk.la~m9apg5tj~v11.send~1744250058720~lcw~1744250058728; _pk_id.5.ec75=ab8529a634a38653.1744250080.; wipo_language=zh; _pk_ses.5.ec75=1")
+                .addHeader("Faces-Request", "partial/ajax")
+                .addHeader("Host", "patentscope.wipo.int")
+                .addHeader("Origin", "https://patentscope.wipo.int")
+                .addHeader("Pragma", "no-cache")
+                .addHeader("Referer", "https://patentscope.wipo.int/search/zh/result.jsf?_vid=P21-M9APK2-00815")
+                .addHeader("Sec-Ch-Ua", "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"")
+                .addHeader("Sec-Ch-Ua-Mobile", "?0")
+                .addHeader("Sec-Ch-Ua-Platform", "\"Windows\"")
+                .addHeader("Sec-Fetch-Dest", "empty")
+                .addHeader("Sec-Fetch-Mode", "cors")
+                .addHeader("Sec-Fetch-Site", "same-origin")
+                .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36")
+                .addHeader("X-Requested-With", "XMLHttpRequest")
+                .build();
+
+        // Execute the request and print the response; try-with-resources closes the response body
+        try (Response response = client.newCall(request).execute()) {
+            if (response.isSuccessful()) {
+                System.out.println("Response: " + response.body().string()+response.code());
+            } else {
+                System.out.println("Error: " + response.code() + " - " + response.message());
+                System.out.println("Response Body: " + response.body().string());
+            }
+        }
+    }
+
+    private static OkHttpClient createClientWithProxy() { // 30 s timeouts; routes through proxyList[currentProxyIndex] when one is usable
+        OkHttpClient.Builder builder = new OkHttpClient().newBuilder()
+                .connectTimeout(30, TimeUnit.SECONDS)
+                .readTimeout(30, TimeUnit.SECONDS)
+                .writeTimeout(30, TimeUnit.SECONDS);
+        if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) {
+            String proxy = proxyList.get(currentProxyIndex);
+            String[] proxyParts = proxy.split(":");
+            if (proxyParts.length == 2) {
+                try {
+                    int proxyPort = Integer.parseInt(proxyParts[1]);
+                    builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP,
+                            new java.net.InetSocketAddress(proxyParts[0], proxyPort)));
+                    System.out.println("使用代理: " + proxy);
+                } catch (NumberFormatException e) { // malformed entry: fall back to a direct connection instead of crashing
+                    System.err.println("Skipping proxy with non-numeric port: " + proxy);
+                }
+            }
+        }
+        return builder.build();
+    }
+}
diff --git a/src/main/java/com/example/NSFAwardCrawler.java b/src/main/java/com/example/NSFAwardCrawler.java
new file mode 100644
index 0000000..bc3100e
--- /dev/null
+++ b/src/main/java/com/example/NSFAwardCrawler.java
@@ -0,0 +1,111 @@
+package com.example;
+
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.openqa.selenium.support.ui.ExpectedConditions;
+import org.openqa.selenium.support.ui.WebDriverWait;
+import org.openqa.selenium.NoSuchElementException;
+
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+
+public class NSFAwardCrawler { // Selenium crawler: pages through NSF award search results and prints the collected element ids.
+    private static final int PAGE_SIZE = 30; // number of results expected on a full page
+
+    public static void main(String[] args) {
+        // Location of the ChromeDriver binary (machine-specific path)
+        System.setProperty("webdriver.chrome.driver",
+                "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe");
+
+        ChromeOptions options = new ChromeOptions();
+        WebDriver driver = new ChromeDriver(options);
+
+        try {
+            String url = "https://www.nsf.gov/awardsearch/simpleSearchResult?queryText=ebola&ActiveAwards=true";
+            driver.get(url);
+
+            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
+            List<String> allAwardIds = new ArrayList<>();
+            int pageNumber = 1;
+
+            while (true) {
+                System.out.println("Processing page " + pageNumber);
+                // Wait (up to the 10 s configured above) until at least one result item is present
+                wait.until(ExpectedConditions.presenceOfElementLocated(By.className("listview-item")));
+                // Result items on the current page
+                List<WebElement> resultItems = driver.findElements(By.className("listview-item"));
+                int currentPageSize = resultItems.size();
+                System.out.println("Found " + currentPageSize + " items on page " + pageNumber);
+
+                // Stop when the current page has no results
+                if (currentPageSize == 0) {
+                    System.out.println("No items found on page " + pageNumber + ", stopping...");
+                    break;
+                }
+
+                // Collect the id attribute of each item, skipping blanks and duplicates
+                for (WebElement item : resultItems) {
+                    try {
+                        String awardId = item.getAttribute("id");
+                        if (awardId != null && !awardId.isEmpty() && !allAwardIds.contains(awardId)) {
+                            allAwardIds.add(awardId);
+                        }
+                    } catch (Exception e) {
+                        System.out.println("Error processing item: " + e.getMessage());
+                    }
+                }
+
+                // A page with fewer than PAGE_SIZE items is treated as the last page
+                if (currentPageSize < PAGE_SIZE) {
+                    System.out.println("Page " + pageNumber + " has less than " + PAGE_SIZE + " items (" + currentPageSize + "), assuming last page, stopping...");
+                    break;
+                }
+
+                // Look for the "next page" button
+                try {
+                    WebElement nextButton = driver.findElement(By.name("NEXT"));
+                    boolean isEnabled = nextButton.isEnabled();
+                    System.out.println("Next button enabled: " + isEnabled);
+                    if (!isEnabled) {
+                        System.out.println("Next button is disabled, stopping...");
+                        break;
+                    }
+                    // Go to the next page and give it a fixed 2 s to load
+                    nextButton.click();
+                    Thread.sleep(2000);
+                    pageNumber++;
+                } catch (NoSuchElementException e) {
+                    System.out.println("Next button not found, stopping...");
+                    break;
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt(); // restore the flag instead of swallowing it in the generic handler
+                    System.out.println("Interrupted while waiting for the next page, stopping...");
+                    break;
+                } catch (Exception e) {
+                    System.out.println("Error clicking next button: " + e.getMessage());
+                    break;
+                }
+            }
+
+            // Print everything that was collected
+            System.out.println("Found " + allAwardIds.size() + " award IDs across all pages:");
+            for (int i = 0; i < allAwardIds.size(); i++) {
+                System.out.println((i + 1) + ". " + allAwardIds.get(i));
+            }
+
+        } catch (Exception e) {
+            System.out.println("An error occurred: " + e.getMessage());
+        } finally {
+            try {
+                Thread.sleep(2000);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt(); // keep the interrupt status; the browser is still closed below
+            }
+            driver.quit();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/com/example/PatentscopeSeleniumCrawler.java b/src/main/java/com/example/PatentscopeSeleniumCrawler.java
new file mode 100644
index 0000000..4edb842
--- /dev/null
+++ b/src/main/java/com/example/PatentscopeSeleniumCrawler.java
@@ -0,0 +1,130 @@
+package com.example;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.openqa.selenium.By;
+import org.openqa.selenium.Keys;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.openqa.selenium.support.ui.ExpectedConditions;
+import org.openqa.selenium.support.ui.WebDriverWait;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.Duration;
+import java.util.Random;
+
+public class PatentscopeSeleniumCrawler { // Selenium crawler: runs a fixed "FP:(fever)" Patentscope search and logs title / patent id of each result row.
+ private static final Logger LOGGER = LoggerFactory.getLogger(PatentscopeSeleniumCrawler.class);
+ private static final String SEARCH_URL = "https://patentscope.wipo.int/search/en/search.jsf";
+ private static final String SEARCH_INPUT_ID = "simpleSearchForm:fpSearch:input";
+ private static final String SEARCH_BUTTON_ID = "simpleSearchForm:fpSearch:j_idt1319";
+ private static final Random RANDOM = new Random();
+
+ public static void main(String[] args) {
+ // Configure ChromeDriver (machine-specific binary path)
+ System.setProperty("webdriver.chrome.driver", "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe");
+ ChromeOptions options = new ChromeOptions();
+ options.addArguments("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36");
+ options.addArguments("--disable-blink-features=AutomationControlled");
+ // Not headless: the browser window stays visible to ease debugging
+ WebDriver driver = null;
+
+ try {
+ driver = new ChromeDriver(options);
+ WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(15));
+
+ // Step 1: open the search page
+ LOGGER.info("Navigating to {}", SEARCH_URL);
+ driver.get(SEARCH_URL);
+ Thread.sleep(2000 + RANDOM.nextInt(2000)); // randomized 2-4 s pause while the page loads
+
+ // Step 2: type the search query
+ LOGGER.info("Entering search query: FP:(fever)");
+ WebElement searchInput = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_INPUT_ID)));
+ searchInput.clear();
+ searchInput.sendKeys("FP:(fever)");
+ Thread.sleep(500 + RANDOM.nextInt(1000)); // let the input take effect
+
+ // Step 3: trigger the search
+ LOGGER.info("Attempting to trigger search...");
+ try {
+ // Option 1: click the search button
+ WebElement searchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_BUTTON_ID)));
+ LOGGER.info("Clicking search button");
+ searchButton.click();
+ Thread.sleep(3000 + RANDOM.nextInt(2000)); // wait for the AJAX call and the navigation
+ } catch (Exception e) {
+ LOGGER.warn("Button click failed, trying Enter key: {}", e.getMessage());
+ // Option 2: simulate pressing Enter in the input field
+ searchInput.sendKeys(Keys.ENTER);
+ Thread.sleep(3000 + RANDOM.nextInt(2000));
+ }
+
+ // Step 4: verify that the browser reached the result page
+ String currentUrl = driver.getCurrentUrl();
+ LOGGER.info("Current URL: {}", currentUrl);
+ if (!currentUrl.contains("result.jsf")) {
+ LOGGER.error("Failed to redirect to result.jsf, trying advanced search...");
+ // Fallback: retry through the advanced search form
+ driver.get("https://patentscope.wipo.int/search/en/search.jsf?advancedSearch=true");
+ searchInput = wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:input")));
+ searchInput.clear();
+ searchInput.sendKeys("FP:(fever)");
+ WebElement advSearchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:j_idt1208")));
+ advSearchButton.click();
+ Thread.sleep(3000 + RANDOM.nextInt(2000));
+ currentUrl = driver.getCurrentUrl();
+ LOGGER.info("Advanced search URL: {}", currentUrl);
+ }
+
+ // Step 5: parse the result pages
+ if (currentUrl.contains("result.jsf")) {
+ LOGGER.info("Successfully reached result page");
+ while (true) {
+ Document doc = Jsoup.parse(driver.getPageSource());
+ Elements results = doc.select("div.result-row"); // selector still needs to be confirmed against the live page
+ if (results.isEmpty()) {
+ LOGGER.warn("No results found, verify selector or query");
+ }
+
+ for (Element item : results) {
+ String title = item.select("a.result-title__text").text(); // selector to be confirmed
+ String patentId = item.select("div.result__number").text(); // selector to be confirmed
+ LOGGER.info("Title: {}", title.isEmpty() ? "N/A" : title);
+ LOGGER.info("Patent ID: {}", patentId.isEmpty() ? "N/A" : patentId);
+ }
+
+ // Pagination: first displayed "next" link that is not marked disabled, or null when there is none
+ WebElement nextPage = driver.findElements(By.cssSelector("a.paginator__button--next:not(.is-disabled)"))
+ .stream()
+ .filter(WebElement::isDisplayed)
+ .findFirst()
+ .orElse(null);
+ if (nextPage == null) {
+ LOGGER.info("No more pages");
+ break;
+ }
+
+ LOGGER.info("Navigating to next page");
+ nextPage.click();
+ Thread.sleep(3000 + RANDOM.nextInt(2000));
+ }
+ } else {
+ LOGGER.error("Still not on result page, check query or network");
+ }
+
+ } catch (Exception e) {
+ LOGGER.error("Error during crawling: {}", e.getMessage(), e);
+ } finally {
+ if (driver != null) {
+ driver.quit();
+ LOGGER.info("WebDriver closed");
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/example/ProxyIPChecker.java b/src/main/java/com/example/ProxyIPChecker.java
new file mode 100644
index 0000000..8d027f2
--- /dev/null
+++ b/src/main/java/com/example/ProxyIPChecker.java
@@ -0,0 +1,25 @@
+package com.example;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+
+public class ProxyIPChecker { // Prints the caller's public IP as reported by http://httpbin.org/ip.
+    public static void main(String[] args) throws Exception {
+        URL url = new URL("http://httpbin.org/ip");
+        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+        conn.setRequestMethod("GET");
+        StringBuilder response = new StringBuilder();
+        try (BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) { // closed even if reading throws
+            String inputLine;
+            while ((inputLine = in.readLine()) != null) {
+                response.append(inputLine);
+            }
+        } finally {
+            conn.disconnect(); // release the connection on success and on failure
+        }
+
+        System.out.println("当前公网 IP 信息:");
+        System.out.println(response.toString());
+    }
+}
diff --git a/src/main/java/com/example/ScraperWithCaptcha.java b/src/main/java/com/example/ScraperWithCaptcha.java
new file mode 100644
index 0000000..f171732
--- /dev/null
+++ b/src/main/java/com/example/ScraperWithCaptcha.java
@@ -0,0 +1,496 @@
+package com.example;// 修改为你的包名
+
+import java.awt.image.BufferedImage;
+import java.io.*;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import javax.imageio.ImageIO;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+
+public class ScraperWithCaptcha {
+
+ // --- Constants to adapt to the target website ---
+ private static final String BASE_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php"; // *** URL of the page that contains the form and the CAPTCHA ***
+ private static final String FORM_SUBMIT_URL = BASE_URL; // *** URL the form is submitted to: usually the page itself or the form's action attribute ***
+ private static final String CAPTCHA_IMAGE_SRC_SUBSTRING = "captchasecurityimages.php"; // *** substring that identifies the CAPTCHA image's src ***
+ private static final String CAPTCHA_INPUT_SELECTOR = "input[name=T4]";
+ private static final String TARGET_FORM_SELECTOR = "form"; // *** if the page has several forms, use a more specific selector, e.g. "#myFormId" ***
+
+ // --- Image preprocessing threshold; tune it to the CAPTCHA style ---
+ private static final int BINARY_THRESHOLD = 128; // binarization threshold (0-255)
+
+ // --- Tesseract configuration (adjust to the local installation) ---
+ // Path of the Tesseract tessdata folder
+ private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata"; // *** must be changed to the actual local path ***
+
+ // --- Other general settings ---
+ private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";
+ private Set cookies = new HashSet<>(); // cookies collected from responses
+
+ public static void main(String[] args) { // Runs one attempt: fetch form page, OCR the CAPTCHA, submit the form, report the outcome.
+ ScraperWithCaptcha scraper = new ScraperWithCaptcha();
+ try {
+ // 1. Fetch the page that contains the form and the CAPTCHA
+ PageInfo pageInfo = scraper.fetchPage(BASE_URL, null, null, false); // initial GET: no cookies, no POST data, not AJAX
+
+ if (pageInfo.htmlContent == null || pageInfo.statusCode != HttpURLConnection.HTTP_OK) {
+ System.err.println("Failed to fetch the initial page. Status code: " + pageInfo.statusCode);
+ return;
+ }
+
+ // Parse the page to locate the CAPTCHA and the form fields
+ Document doc = Jsoup.parse(pageInfo.htmlContent, BASE_URL);
+
+ // Extract the CAPTCHA image URL
+ Element captchaImg = doc.selectFirst("img[src*=" + CAPTCHA_IMAGE_SRC_SUBSTRING + "]");
+ String captchaImageUrl = null;
+ if (captchaImg != null) {
+ captchaImageUrl = captchaImg.absUrl("src"); // resolve to an absolute URL
+ System.out.println("Found CAPTCHA image URL: " + captchaImageUrl);
+ } else {
+ System.err.println("CAPTCHA image not found using selector: img[src*=" + CAPTCHA_IMAGE_SRC_SUBSTRING + "]");
+ // Without the CAPTCHA image there is nothing to solve, so stop here
+ return;
+ }
+
+ // Extract the name of the CAPTCHA input field
+ Element captchaInput = doc.selectFirst(CAPTCHA_INPUT_SELECTOR);
+ String captchaInputName = null;
+ if (captchaInput != null) {
+ captchaInputName = captchaInput.attr("name");
+ System.out.println("Found CAPTCHA input field name: " + captchaInputName);
+ } else {
+ System.err.println("CAPTCHA input field not found using selector: " + CAPTCHA_INPUT_SELECTOR);
+ // Without the input field the form cannot be submitted either
+ return;
+ }
+
+ // 2. Download the CAPTCHA image
+ BufferedImage originalCaptchaImage = scraper.downloadImage(captchaImageUrl);
+ System.out.println("Captcha image downloaded.");
+
+ // 3. Preprocess the image
+ BufferedImage preprocessedImage = scraper.preprocessImage(originalCaptchaImage);
+ System.out.println("Image preprocessed (saved as preprocessed_captcha.png).");
+
+ // 4. Recognize the CAPTCHA text
+ String captchaCode = scraper.recognizeCaptcha(preprocessedImage);
+
+ if (captchaCode != null && !captchaCode.isEmpty()) {
+ System.out.println("Recognized CAPTCHA: " + captchaCode);
+
+ // 5. Build the POST data, including the CAPTCHA
+ // Collect every field of the page's form and set its value
+ Map formData = scraper.buildFormDataMap(doc, captchaInputName, captchaCode);
+
+ String postData = scraper.buildPostData(formData);
+ System.out.println("Built POST data: " + postData);
+
+ // 6. Submit the form
+ // Usually a standard POST request
+ PageInfo postResponseInfo = scraper.fetchPage(FORM_SUBMIT_URL, postData, scraper.getCookieHeader(), false); // non-AJAX POST
+
+ System.out.println("Form submitted. Response status code: " + postResponseInfo.statusCode);
+ System.out.println("POST Response Body (partial): " + (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.length() > 500 ? postResponseInfo.htmlContent.substring(0, 500) + "..." : postResponseInfo.htmlContent)); // print at most the first 500 characters for inspection
+
+ // 7. Inspect the response to decide whether the submission succeeded
+ // For a standard form submission, success is usually a redirect (302) or a new page
+ if (postResponseInfo.statusCode == HttpURLConnection.HTTP_MOVED_TEMP || postResponseInfo.statusCode == HttpURLConnection.HTTP_SEE_OTHER || postResponseInfo.statusCode == HttpURLConnection.HTTP_MOVED_PERM) {
+ String redirectUrl = postResponseInfo.redirectUrl;
+ System.out.println("POST resulted in redirect. Location: " + redirectUrl);
+ // TODO: if the redirect leads to a success page, continue by crawling that page
+ // A redirect back to the form page or to an error page means the submission failed (wrong CAPTCHA or another reason)
+ if (redirectUrl != null && redirectUrl.equals(BASE_URL)) { // <-- checks for a redirect back to the form page; adjust to the site's real behavior
+ System.err.println("Submission failed, redirected back to the form page.");
+ // TODO: implement retry logic (the page and the CAPTCHA must be fetched again)
+ }
+
+ } else if (postResponseInfo.statusCode == HttpURLConnection.HTTP_OK) {
+ System.out.println("POST returned OK (200). Analyzing response content...");
+ // TODO: parse postResponseInfo.htmlContent to decide success (e.g. look for a success marker or a CAPTCHA error message)
+ if (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.contains("成功标志字符串")) { // <-- *** placeholder literal: replace with a marker from the real success response ***
+ System.out.println("Form submission appears successful based on content.");
+ // TODO: extract the wanted data from postResponseInfo.htmlContent
+ } else if (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.contains("验证码错误提示字符串")) { // <-- *** placeholder literal: replace with the site's real CAPTCHA error text ***
+ System.err.println("CAPTCHA appears incorrect. Need to retry.");
+ // TODO: implement retry logic (the page may need to be fetched again because the CAPTCHA refreshes)
+ } else {
+ System.out.println("POST returned 200, but content not clearly indicating success or failure.");
+ // The response content needs a closer look
+ }
+ }
+ else {
+ System.err.println("POST request failed with status code: " + postResponseInfo.statusCode);
+ }
+
+
+ } else {
+ System.err.println("CAPTCHA recognition failed. Cannot submit form.");
+ // TODO: implement retry logic for failed recognition
+ }
+
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.err.println("An I/O error occurred: " + e.getMessage());
+ } catch (TesseractException e) {
+ e.printStackTrace();
+ System.err.println("A Tesseract OCR error occurred: " + e.getMessage());
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.err.println("An unexpected error occurred: " + e.getMessage());
+ }
+ }
+
+    /**
+     * Sends one HTTP request (POST when {@code postData} is non-null, otherwise GET) and captures the response.
+     * Redirects are not followed automatically, and the leading "name=value" part of every Set-Cookie
+     * response header is added to {@code this.cookies}.
+     *
+     * @param urlString    URL to request
+     * @param postData     request body for POST; {@code null} selects GET
+     * @param cookieHeader value for the Cookie request header; {@code null} sends no Cookie header
+     * @param isAjaxPost   when posting, also send the X-Requested-With / X-MicrosoftAjax headers
+     * @return a PageInfo holding the status code, the Location header (301/302/303 only) and the body text
+     * @throws IOException if the URL is malformed or connecting, writing the body, or reading the status fails
+     */
+    private PageInfo fetchPage(String urlString, String postData, String cookieHeader, boolean isAjaxPost) throws IOException {
+        URL url = new URL(urlString);
+        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+        try {
+            if (postData != null) {
+                conn.setRequestMethod("POST");
+                conn.setDoOutput(true); // allow a request body to be written
+            } else {
+                conn.setRequestMethod("GET");
+            }
+
+            conn.setInstanceFollowRedirects(false);
+            conn.setConnectTimeout(10000);
+            conn.setReadTimeout(20000);
+
+            // Request headers.
+            conn.setRequestProperty("User-Agent", USER_AGENT);
+            if (cookieHeader != null) {
+                conn.setRequestProperty("Cookie", cookieHeader);
+            }
+            conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
+            if (postData != null) {
+                conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
+                if (isAjaxPost) {
+                    conn.setRequestProperty("X-Requested-With", "XMLHttpRequest");
+                    conn.setRequestProperty("X-MicrosoftAjax", "Delta=true");
+                }
+                // Derive Referer/Origin from the URL parsed above instead of re-parsing urlString inside an empty catch.
+                String origin = url.getProtocol() + "://" + url.getHost();
+                conn.setRequestProperty("Referer", origin + url.getPath());
+                conn.setRequestProperty("Origin", origin);
+            }
+
+            // The POST body has to be written before any response data is requested.
+            if (postData != null) {
+                try (OutputStream os = conn.getOutputStream()) {
+                    byte[] input = postData.getBytes(StandardCharsets.UTF_8);
+                    os.write(input, 0, input.length);
+                }
+            }
+
+            // Asking for the status code completes the request and makes the response headers available.
+            int statusCode = conn.getResponseCode();
+            String redirectUrl = null;
+            if (statusCode == HttpURLConnection.HTTP_MOVED_TEMP || statusCode == HttpURLConnection.HTTP_SEE_OTHER || statusCode == HttpURLConnection.HTTP_MOVED_PERM) {
+                redirectUrl = conn.getHeaderField("Location");
+            }
+
+            // Collect cookies. HTTP header names are case-insensitive, so compare with equalsIgnoreCase;
+            // the comparison is written constant-first because the status line is stored under a null key.
+            for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
+                if ("Set-Cookie".equalsIgnoreCase(header.getKey())) {
+                    for (String cookie : header.getValue()) {
+                        this.cookies.add(cookie.split(";")[0]); // keep only the text before the first ';'
+                    }
+                }
+            }
+
+            StringBuilder content = new StringBuilder();
+            // Read the body only for 2xx, or for 4xx responses that report a positive Content-Length.
+            boolean success = statusCode >= 200 && statusCode < 300;
+            boolean clientErrorWithBody = statusCode >= 400 && statusCode < 500 && conn.getContentLength() > 0;
+            if (success || clientErrorWithBody) {
+                try (InputStream is = success ? conn.getInputStream() : conn.getErrorStream()) {
+                    if (is != null) { // getErrorStream() may return null; try-with-resources tolerates a null resource
+                        BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+                        String line;
+                        while ((line = reader.readLine()) != null) {
+                            content.append(line).append("\n");
+                        }
+                    }
+                } catch (IOException e) {
+                    System.err.println("Error reading response body for status " + statusCode + ": " + e.getMessage());
+                }
+            }
+
+            PageInfo pageInfo = new PageInfo();
+            pageInfo.statusCode = statusCode;
+            pageInfo.redirectUrl = redirectUrl;
+            pageInfo.htmlContent = content.toString();
+
+            return pageInfo;
+        } finally {
+            conn.disconnect(); // runs on every path, including when a step above throws
+        }
+    }
+
+ /**
+  * Collects the submittable fields of the target form and fills in the captcha answer.
+  *
+  * Handled controls: text-like inputs (text, hidden, password, and inputs without a
+  * type attribute, which HTML treats as text), checked checkboxes and radio buttons,
+  * selects and textareas. Other input types (submit, image, file, reset, ...) are skipped;
+  * a submit button is normally only sent when it is the one that was clicked.
+  * Fields added or changed by page JavaScript are not visible here and must be added
+  * by the caller.
+  *
+  * @param doc parsed page that should contain the form matched by TARGET_FORM_SELECTOR
+  * @param captchaInputName name attribute of the captcha input box
+  * @param captchaCode recognised captcha text to submit in that input
+  * @return field name to value map; empty when the target form is not found
+  */
+ private Map<String, String> buildFormDataMap(Document doc, String captchaInputName, String captchaCode) {
+     Map<String, String> formData = new HashMap<>();
+     Element form = doc.selectFirst(TARGET_FORM_SELECTOR);
+
+     if (form == null) {
+         System.err.println("Target form not found using selector: " + TARGET_FORM_SELECTOR);
+         return formData;
+     }
+
+     for (Element element : form.select("input, select, textarea")) {
+         // jsoup's attr() returns "" (never null) for a missing attribute.
+         String name = element.attr("name");
+         if (name.isEmpty()) {
+             continue; // controls without a name are never submitted
+         }
+
+         if ("select".equalsIgnoreCase(element.tagName())) {
+             // Use the selected option, falling back to the first one.
+             Element chosen = element.selectFirst("option[selected]");
+             if (chosen == null) {
+                 chosen = element.selectFirst("option");
+             }
+             if (chosen == null) {
+                 formData.put(name, "");
+             } else {
+                 // An option without a value attribute submits its text.
+                 formData.put(name, chosen.hasAttr("value") ? chosen.attr("value") : chosen.text());
+             }
+         } else if ("textarea".equalsIgnoreCase(element.tagName())) {
+             formData.put(name, element.text());
+         } else {
+             String type = element.attr("type");
+             String value = element.attr("value");
+             boolean textLike = type.isEmpty() || "text".equals(type)
+                     || "hidden".equals(type) || "password".equals(type);
+             if (textLike) {
+                 // The captcha box gets the OCR result; everything else keeps its default value.
+                 formData.put(name, name.equals(captchaInputName) ? captchaCode : value);
+             } else if ("checkbox".equals(type) || "radio".equals(type)) {
+                 if (element.hasAttr("checked")) {
+                     // Browsers submit "on" when a checked box has no value attribute.
+                     formData.put(name, element.hasAttr("value") ? value : "on");
+                 }
+             }
+             // TODO: handle further input types (file, submit, image, reset, ...) if the site needs them.
+         }
+     }
+
+     return formData;
+ }
+
+
+ /**
+  * Downloads the captcha image, sending the stored session cookies with the request.
+  *
+  * @param imageUrl absolute URL of the image
+  * @return the decoded image
+  * @throws IOException if the server does not answer 200 OK, the transfer fails or times out,
+  *         or the payload cannot be decoded as an image
+  */
+ public BufferedImage downloadImage(String imageUrl) throws IOException {
+     HttpURLConnection conn = (HttpURLConnection) new URL(imageUrl).openConnection();
+     try {
+         conn.setRequestMethod("GET");
+         // Without timeouts a stalled server would block the caller forever.
+         conn.setConnectTimeout(10000);
+         conn.setReadTimeout(20000);
+         conn.setRequestProperty("User-Agent", USER_AGENT);
+
+         // The captcha is bound to the session, so reuse the cookies collected so far;
+         // skip the header entirely when there are none instead of sending "Cookie: ".
+         String cookieHeader = getCookieHeader();
+         if (cookieHeader != null && !cookieHeader.isEmpty()) {
+             conn.setRequestProperty("Cookie", cookieHeader);
+         }
+
+         int responseCode = conn.getResponseCode();
+         if (responseCode != HttpURLConnection.HTTP_OK) {
+             throw new IOException("Failed to download image. HTTP error code: " + responseCode + " for URL: " + imageUrl);
+         }
+
+         try (InputStream is = conn.getInputStream()) {
+             // ImageIO.read returns null when no registered reader understands the data.
+             BufferedImage image = ImageIO.read(is);
+             if (image == null) {
+                 throw new IOException("Failed to read image stream. Check image format or content for URL: " + imageUrl);
+             }
+             return image;
+         }
+     } finally {
+         conn.disconnect();
+     }
+ }
+
+ /**
+  * Pre-processes a captcha image for OCR: converts it to grayscale, then to a
+  * black/white image using BINARY_THRESHOLD.
+  *
+  * For debugging, the input is written to original_captcha.png and the result to
+  * preprocessed_captcha.png in the working directory; write failures are only logged.
+  *
+  * @param originalImage image as downloaded
+  * @return binarized image of the same size
+  */
+ public BufferedImage preprocessImage(BufferedImage originalImage) {
+     // TODO: tune this step for the actual captcha style (denoising, line removal, cropping, deskewing).
+     // Keep a copy of the input so it can be compared with the result.
+     try {
+         File originalFile = new File("original_captcha.png");
+         ImageIO.write(originalImage, "png", originalFile);
+     } catch (IOException e) {
+         e.printStackTrace();
+     }
+
+     int width = originalImage.getWidth();
+     int height = originalImage.getHeight();
+
+     // Drawing onto a TYPE_BYTE_GRAY image performs the grayscale conversion.
+     BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY);
+     java.awt.Graphics graphics = grayImage.getGraphics();
+     try {
+         graphics.drawImage(originalImage, 0, 0, null);
+     } finally {
+         graphics.dispose(); // release the native drawing context
+     }
+
+     BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY);
+     java.awt.image.WritableRaster grayRaster = grayImage.getRaster();
+     java.awt.image.WritableRaster binaryRaster = binaryImage.getRaster();
+     for (int y = 0; y < height; y++) {
+         for (int x = 0; x < width; x++) {
+             // Below the threshold -> 0 (black), otherwise 1 (white).
+             int gray = grayRaster.getSample(x, y, 0);
+             binaryRaster.setSample(x, y, 0, gray < BINARY_THRESHOLD ? 0 : 1);
+         }
+     }
+
+     // Save the result so the effect of the threshold can be inspected.
+     try {
+         File outputfile = new File("preprocessed_captcha.png");
+         ImageIO.write(binaryImage, "png", outputfile);
+         System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath());
+     } catch (IOException e) {
+         e.printStackTrace();
+     }
+
+     return binaryImage;
+ }
+
+ /**
+ * 使用 Tess4J 识别图片中的文字
+ * @param image 待识别的图片 (最好是预处理后的)
+ * @return 识别出的字符串 (如果失败返回 null 或空字符串)
+ */
+ public String recognizeCaptcha(BufferedImage image) throws TesseractException {
+ Tesseract tesseract = new Tesseract();
+
+ // 设置 tessdata 路径 (如果 TESSDATA_PATH 已正确设置且 Tesseract 安装正确,这行可能不是必需的,Tess4J 会自动查找)
+ if (TESSDATA_PATH != null && !TESSDATA_PATH.isEmpty()) {
+ tesseract.setDatapath(TESSDATA_PATH);
+ } else {
+ System.err.println("WARNING: TESSDATA_PATH not set. Tess4J will try to find tessdata automatically.");
+ }
+
+ tesseract.setLanguage("eng"); // 设置识别语言为英文 (通常包含数字)
+ // 如果验证码只有数字,可以尝试设置仅识别数字,这有助于提高准确率
+ // tesseract.setTessVariable("tessedit_char_whitelist", "0123456789"); // 方法名请查阅 Tess4J 文档确认
+
+ String result = tesseract.doOCR(image);
+ // 清理识别结果,去除空格或换行符等
+ result = result != null ? result.trim().replaceAll("[^0-9a-zA-Z]", "") : ""; // 根据验证码内容(数字、字母)调整清理规则
+
+ return result;
+ }
+
+ /**
+  * Serialises form fields as an application/x-www-form-urlencoded request body.
+  *
+  * Pairs are emitted in the map's iteration order; pass a LinkedHashMap if the server
+  * depends on field order. A null value is sent as an empty string.
+  *
+  * @param formDataMap field names mapped to their values
+  * @return "name=value" pairs, UTF-8 URL-encoded and joined with "&"
+  * @throws IOException if the UTF-8 encoding is reported as unsupported
+  */
+ private String buildPostData(Map<String, String> formDataMap) throws IOException {
+     String charset = StandardCharsets.UTF_8.name();
+     StringBuilder postDataBuilder = new StringBuilder();
+     for (Map.Entry<String, String> entry : formDataMap.entrySet()) {
+         // Every pair appends at least "=", so a non-empty builder means a pair precedes this one.
+         if (postDataBuilder.length() > 0) {
+             postDataBuilder.append("&");
+         }
+         String value = entry.getValue() != null ? entry.getValue() : "";
+         postDataBuilder.append(URLEncoder.encode(entry.getKey(), charset))
+                 .append("=")
+                 .append(URLEncoder.encode(value, charset));
+     }
+     return postDataBuilder.toString();
+ }
+
+ /**
+  * Formats the stored cookies as the value of an HTTP Cookie request header.
+  *
+  * @return the stored "name=value" entries joined with "; ", or an empty string when none are stored
+  */
+ private String getCookieHeader() {
+     return String.join("; ", this.cookies);
+ }
+
+
+ /** Plain holder for the outcome of one page fetch; deliberately free of ASP.NET-specific fields. */
+ private static class PageInfo {
+     /** HTTP status code of the response. */
+     int statusCode;
+     /** Redirect target, when the response was a redirect. */
+     String redirectUrl;
+     /** Response body of the page. */
+     String htmlContent;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/example/StringFieldExtractor.java b/src/main/java/com/example/StringFieldExtractor.java
new file mode 100644
index 0000000..9a36144
--- /dev/null
+++ b/src/main/java/com/example/StringFieldExtractor.java
@@ -0,0 +1,74 @@
+package com.example;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Demo parser for a flat record of the form
+ * {@code postTime:...,title:...,content:...,fileList:[url, url, ...]}.
+ */
+public class StringFieldExtractor {
+    /** Marker that opens the trailing file list, including its opening bracket. */
+    private static final String FILE_LIST_MARKER = "fileList:[";
+
+    /**
+     * Parses the built-in sample record and prints the extracted fields to standard output.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        // Sample input
+        String input = "postTime:05-06-2024 00:00:00,title:PT/013/2024,content:澳門大學-N21科研大樓六樓智慧城市物聯網國家重點實驗室(澳門大學)建造工程 OBRAS DE CONSTRUÇÃO DO LABORATÓRIO DE REFERÊNCIA DO ESTADO DE INTERNET DAS COISAS PARA A CIDADE INTELIGENTE (UNIVERSIDADE DE MACAU), LOCALIZADO NO 6.º ANDAR DO EDIFÍCIO DE INVESTIGAÇÃO CIENTÍFICA N21 DA UNIVERSIDADE DE MACAU,fileList:[https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-CHI.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-ENG-1.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/07/開標結果.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/11/判給結果-N21-6G.pdf###pdf]";
+
+        try {
+            String postTime = null;
+            String title = null;
+            String content = null;
+            List<String> fileList = new ArrayList<>();
+
+            // Step 1: cut off the bracketed fileList first so its commas and colons
+            // cannot interfere with splitting the other fields.
+            String fileListStr = null;
+            int fileListStart = input.indexOf(FILE_LIST_MARKER);
+            if (fileListStart != -1) {
+                int fileListEnd = input.lastIndexOf("]");
+                if (fileListEnd > fileListStart) {
+                    // Keep the brackets: "[...]"
+                    fileListStr = input.substring(fileListStart + FILE_LIST_MARKER.length() - 1, fileListEnd + 1);
+                    // Drop the separating comma only when there is one; the record may start with fileList.
+                    int headEnd = fileListStart;
+                    if (headEnd > 0 && input.charAt(headEnd - 1) == ',') {
+                        headEnd--;
+                    }
+                    input = input.substring(0, headEnd);
+                }
+            }
+
+            // Step 2: split into at most three "key:value" fields, only at commas that are
+            // directly followed by a key name, so commas inside the content stay intact.
+            String[] fields = input.split(",(?=\\w+:)", 3);
+            for (String field : fields) {
+                String[] keyValue = field.split(":", 2);
+                if (keyValue.length == 2) {
+                    String key = keyValue[0].trim();
+                    String value = keyValue[1].trim();
+                    switch (key) {
+                        case "postTime":
+                            postTime = value;
+                            break;
+                        case "title":
+                            title = value;
+                            break;
+                        case "content":
+                            content = value;
+                            break;
+                        default:
+                            break; // unknown keys are ignored
+                    }
+                }
+            }
+
+            // Step 3: split the list at commas that precede a URL scheme, so commas
+            // inside a URL do not break an entry apart.
+            if (fileListStr != null && fileListStr.startsWith("[") && fileListStr.endsWith("]")) {
+                String listContent = fileListStr.substring(1, fileListStr.length() - 1).trim();
+                if (!listContent.isEmpty()) {
+                    for (String url : listContent.split(",\\s*(?=https?://)")) {
+                        fileList.add(url.trim());
+                    }
+                }
+            }
+
+            System.out.println("postTime: " + postTime);
+            System.out.println("title: " + title);
+            System.out.println("content: " + content);
+            System.out.println("fileList: " + fileList);
+
+        } catch (Exception e) {
+            System.err.println("Parsing error: " + e.getMessage());
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/src/main/java/com/example/WipoPatentsSelenium.java b/src/main/java/com/example/WipoPatentsSelenium.java
new file mode 100644
index 0000000..5f933a3
--- /dev/null
+++ b/src/main/java/com/example/WipoPatentsSelenium.java
@@ -0,0 +1,60 @@
+package com.example;
+
+import io.github.bonigarcia.wdm.WebDriverManager;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.chrome.ChromeDriver;
+
+import java.util.List;
+
+/**
+ * Small Selenium demo that prints publication number and title of the first result
+ * pages of a PATENTSCOPE search.
+ */
+public class WipoPatentsSelenium {
+
+    /** Number of result pages to print. */
+    private static final int MAX_PAGES = 3;
+    /** Fixed pause after each navigation; a coarse substitute for an explicit wait. */
+    private static final long PAGE_LOAD_WAIT_MS = 3000L;
+
+    /**
+     * Opens the search in Chrome, prints the results of up to MAX_PAGES pages and quits the browser.
+     *
+     * @param args unused
+     * @throws InterruptedException if the thread is interrupted while waiting for a page
+     */
+    public static void main(String[] args) throws InterruptedException {
+        // Let WebDriverManager resolve a matching chromedriver binary.
+        WebDriverManager.chromedriver().setup();
+        WebDriver driver = new ChromeDriver();
+
+        try {
+            driver.get("https://patentscope.wipo.int/search/en/result.jsf?query=FP:(AI)");
+            Thread.sleep(PAGE_LOAD_WAIT_MS);
+
+            for (int currentPage = 1; currentPage <= MAX_PAGES; currentPage++) {
+                System.out.println("📄 当前第 " + currentPage + " 页:");
+
+                // Elements are looked up again on every page, so no stale references are reused.
+                List<WebElement> results = driver.findElements(By.cssSelector(".resultitem"));
+                for (WebElement result : results) {
+                    String title = result.findElement(By.cssSelector(".resulttitle")).getText();
+                    String pubNum = result.findElement(By.cssSelector(".pubNumber")).getText();
+                    System.out.println("🔹 " + pubNum + " - " + title);
+                }
+
+                if (currentPage == MAX_PAGES) {
+                    break; // nothing left to print, so skip the extra click and wait
+                }
+
+                // findElements returns an empty list instead of throwing when nothing matches.
+                List<WebElement> nextButtons = driver.findElements(By.cssSelector("a[title='Next']"));
+                if (nextButtons.isEmpty()) {
+                    System.out.println("✅ 已到最后一页或按钮未找到");
+                    break;
+                }
+                WebElement nextButton = nextButtons.get(0);
+                if (!nextButton.isDisplayed()) {
+                    break;
+                }
+                nextButton.click();
+                Thread.sleep(PAGE_LOAD_WAIT_MS);
+            }
+        } finally {
+            driver.quit();
+        }
+    }
+}
diff --git a/src/main/java/com/example/cliniTopic.java b/src/main/java/com/example/cliniTopic.java
new file mode 100644
index 0000000..3142e1f
--- /dev/null
+++ b/src/main/java/com/example/cliniTopic.java
@@ -0,0 +1,594 @@
+package com.example;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import okhttp3.*;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.apache.kafka.common.serialization.StringSerializer;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class cliniTopic {
+ private static final String TOPIC_NAME = "cliniTopic";
+ private static final String BOOTSTRAP_SERVERS = "localhost:9092";
+ private static KafkaProducer producer;
+ private static ObjectMapper objectMapper = new ObjectMapper();
+ private static final Random random = new Random();
+ private static List proxyList = new ArrayList<>(); // 代理池
+ private static int currentProxyIndex = 0; // 当前使用的代理索引
+ static {
+ Properties props = new Properties();
+ props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
+ props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
+ props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
+ props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认
+ props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数
+ producer = new KafkaProducer<>(props);
+ try {
+ proxyList = Files.readAllLines(Paths.get("proxy.txt"));
+ if (proxyList.isEmpty()) {
+ System.out.println("警告: proxy.txt 为空,未加载任何代理");
+ } else {
+ System.out.println("成功加载 " + proxyList.size() + " 个代理");
+ }
+ } catch (IOException e) {
+ System.err.println("读取 proxy.txt 失败: " + e.getMessage());
+ }
+ }
+
+ /**
+  * Reads search keywords from keywords.txt (text before the first comma of each line),
+  * crawls each keyword on a pool of 4 threads and publishes every study record to Kafka.
+  *
+  * @param args unused
+  * @throws IOException if keywords.txt cannot be read
+  * @throws InterruptedException if interrupted while waiting for the crawl to finish
+  */
+ public static void main(String[] args) throws IOException, InterruptedException {
+     List<String> cleanedKeywords = new ArrayList<>();
+     for (String line : Files.readAllLines(Paths.get("keywords.txt"))) {
+         // Keep the part before the first comma; indexOf avoids the empty array that
+         // split(",") returns for a line consisting only of commas.
+         int comma = line.indexOf(',');
+         String cleaned = (comma >= 0 ? line.substring(0, comma) : line).trim();
+         if (!cleaned.isEmpty()) {
+             cleanedKeywords.add(cleaned);
+         }
+     }
+
+     ExecutorService executor = Executors.newFixedThreadPool(4);
+     try {
+         for (String keyword : cleanedKeywords) {
+             executor.submit(() -> crawlKeyword(keyword));
+         }
+         executor.shutdown();
+         if (!executor.awaitTermination(5, TimeUnit.HOURS)) {
+             // Stop stragglers so they do not keep sending to a closed producer.
+             System.err.println("Crawl did not finish within 5 hours; cancelling remaining tasks");
+             executor.shutdownNow();
+         }
+     } finally {
+         producer.close();
+     }
+ }
+
+ /** Crawls up to 7 result pages for one keyword and sends each study record to Kafka. */
+ private static void crawlKeyword(String keyword) {
+     try {
+         // One randomised pause (30-31 s) per keyword, applied after every fetch and every send.
+         int sleepTime = random.nextInt(1001) + 30000;
+         for (int page = 1; page <= 7; page++) {
+             final int pageNum = page;
+             Map<String, Object> listing = list(keyword, page);
+             if (listing == null) {
+                 System.err.println("处理 " + keyword + " 失败: list() returned no result for page " + page);
+                 break;
+             }
+             @SuppressWarnings("unchecked") // list() stores a List of String under "listUrl"
+             List<String> urls = (List<String>) listing.get("listUrl");
+             if (urls == null || urls.isEmpty()) {
+                 System.out.println("没有关键词" + keyword + "检索结果");
+                 break;
+             }
+             int count = Integer.parseInt(String.valueOf(listing.get("count")));
+             // "totalPage" is absent when the page indicator was not found; -1 never equals a page number.
+             Object totalPageValue = listing.get("totalPage");
+             int totalPage = totalPageValue == null ? -1 : Integer.parseInt(String.valueOf(totalPageValue));
+
+             for (String url : urls) {
+                 Map<String, Object> result = content(url);
+                 Thread.sleep(sleepTime);
+                 String registNum = String.valueOf(result.get("registNum"));
+                 String crawlUrl = String.valueOf(result.get("crawlUrl"));
+
+                 try {
+                     String jsonValue = objectMapper.writeValueAsString(result);
+                     ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
+
+                     producer.send(record, (metadata, exception) -> {
+                         if (exception == null) {
+                             System.out.println("成功发送到Kafka - Partition: " + metadata.partition() +
+                                     ", Offset: " + metadata.offset() + ", " + crawlUrl + ", " + keyword + " , " + pageNum);
+                         } else {
+                             System.err.println("发送到Kafka失败: " + exception.getMessage());
+                         }
+                     });
+                 } catch (Exception e) {
+                     System.err.println("序列化或发送Kafka消息失败: " + e.getMessage());
+                 }
+                 Thread.sleep(sleepTime);
+             }
+
+             // Compared as primitives: == on boxed Integer values compares references.
+             if (count < 10 || totalPage == page) {
+                 System.out.println("关键词" + keyword + "已检索完毕");
+                 break;
+             }
+         }
+     } catch (InterruptedException e) {
+         // Restore the flag so the pool can observe the interruption.
+         Thread.currentThread().interrupt();
+         System.err.println("处理 " + keyword + " 失败: " + e.getMessage());
+     } catch (Exception e) {
+         System.err.println("处理 " + keyword + " 失败: " + e.getMessage());
+         e.printStackTrace();
+     }
+ }
+
+ private static Map list(String keyword,Integer page) throws Exception{
+ Map map = new HashMap<>();
+ String baseUrl = "https://www.drks.de/search/de";
+ String hostUrl = "https://www.drks.de";
+ String cleanUrl = "https://www.drks.de/search/de/results";
+ System.out.println("Pure URL: " + cleanUrl);
+
+ System.out.println("Page Number: " + page);
+
+ // 存储 cookies
+ Set cookieSet = new HashSet<>();
+ String sessionId = null;
+
+ // 第一步:初始 GET 请求,获取 cookies 和 ViewState
+ URL initialUrl = new URL(baseUrl);
+ HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection();
+ initialConn.setRequestMethod("GET");
+ initialConn.setInstanceFollowRedirects(false);
+ initialConn.setConnectTimeout(10000);
+ initialConn.setReadTimeout(10000);
+
+ // 捕获 cookies
+ sessionId = updateCookies(initialConn, cookieSet);
+ System.out.println("Initial Cookies: " + cookieSet);
+ System.out.println("Initial Session ID: " + sessionId);
+
+ // 读取响应内容以获取 ViewState
+ BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream()));
+ StringBuilder content = new StringBuilder();
+ String inputLine;
+ while ((inputLine = in.readLine()) != null) {
+ content.append(inputLine);
+ }
+ in.close();
+ initialConn.disconnect();
+
+ // 提取初始 ViewState
+ String initialViewState = extractViewState(content.toString());
+ System.out.println("Initial ViewState: " + initialViewState);
+
+ // 第二步:发送搜索 POST 请求
+ HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection();
+ searchConn.setRequestMethod("POST");
+ searchConn.setInstanceFollowRedirects(false);
+ searchConn.setDoOutput(true);
+ searchConn.setConnectTimeout(10000);
+ searchConn.setReadTimeout(10000);
+
+ // 设置搜索请求的请求头
+ searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
+ searchConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+ searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+ searchConn.setRequestProperty("Origin", "https://www.drks.de");
+ searchConn.setRequestProperty("Referer", baseUrl);
+ searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+
+ // 构建搜索请求的 POST 数据
+ String searchPostData = buildSearchPostData(initialViewState,keyword);
+
+ // 发送搜索 POST 请求
+ try (OutputStream os = searchConn.getOutputStream()) {
+ byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8);
+ os.write(input, 0, input.length);
+ }
+
+ // 更新 cookies
+ String searchSessionId = updateCookies(searchConn, cookieSet);
+ System.out.println("Search Cookies: " + cookieSet);
+ System.out.println("Search Session ID: " + searchSessionId);
+
+ // 处理搜索响应
+ int searchResponseCode = searchConn.getResponseCode();
+ System.out.println("Search Response Code: " + searchResponseCode);
+ String redirectUrl = searchConn.getHeaderField("Location");
+ searchConn.disconnect();
+
+ if (searchResponseCode != 302 || redirectUrl == null) {
+ System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode);
+ return null;
+ }
+ System.out.println("Redirect URL (raw): " + redirectUrl);
+
+ // 解析相对 URL
+ if (!redirectUrl.startsWith("http")) {
+ redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? redirectUrl : "/" + redirectUrl);
+ }
+ System.out.println("Resolved Redirect URL: " + redirectUrl);
+
+ // 第三步:跟随重定向(使用 GET 请求)
+ URL resultsUrl = new URL(redirectUrl);
+ HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection();
+ resultsConn.setRequestMethod("GET");
+ resultsConn.setInstanceFollowRedirects(false);
+ resultsConn.setConnectTimeout(10000);
+ resultsConn.setReadTimeout(10000);
+ resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+ resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+ resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64ек; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+
+ // 更新 cookies
+ String resultsSessionId = updateCookies(resultsConn, cookieSet);
+ System.out.println("Results Cookies: " + cookieSet);
+ System.out.println("Results Session ID: " + resultsSessionId);
+
+ // 读取重定向后的结果页面内容
+ BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream()));
+ StringBuilder resultsContent = new StringBuilder();
+ while ((inputLine = resultsReader.readLine()) != null) {
+ resultsContent.append(inputLine);
+ }
+ resultsReader.close();
+ resultsConn.disconnect();
+
+ // 提取页面中的 ViewState(状态信息,用于后续请求)
+ String viewState = extractViewState(resultsContent.toString());
+ System.out.println("Results ViewState: " + viewState);
+
+ // 检查 Session ID 是否一致,确保会话未被重置
+ if (sessionId != null && !sessionId.equals(resultsSessionId)) {
+ System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId);
+ }
+
+ // Step 4: 第四步:发送分页请求(使用 POST)
+ HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection();
+ postConn.setRequestMethod("POST");
+ postConn.setInstanceFollowRedirects(false);
+ postConn.setDoOutput(true);
+ postConn.setConnectTimeout(10000);
+ postConn.setReadTimeout(10000);
+
+ // 设置分页请求的请求头(非 AJAX,模拟浏览器常规请求)
+ postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
+ postConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+ postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+ postConn.setRequestProperty("Origin", "https://www.drks.de");
+ postConn.setRequestProperty("Referer", cleanUrl);
+ postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+ postConn.setRequestProperty("Sec-Fetch-Dest", "document");
+ postConn.setRequestProperty("Sec-Fetch-Mode", "navigate");
+
+ // 构建分页请求的 POST 参数(包括页码和 ViewState 等)
+ String postData = buildPostData(viewState, page);
+ // 发送分页的 POST 请求
+ try (OutputStream os = postConn.getOutputStream()) {
+ byte[] input = postData.getBytes(StandardCharsets.UTF_8);
+ os.write(input, 0, input.length);
+ }
+
+ // 更新 cookies(分页响应可能返回新的 Set-Cookie)
+ String paginationSessionId = updateCookies(postConn, cookieSet);
+ System.out.println("Pagination Cookies: " + cookieSet);
+ System.out.println("Pagination Session ID: " + paginationSessionId);
+
+ // 处理分页响应
+ int responseCode = postConn.getResponseCode();
+ System.out.println("Pagination Response Code: " + responseCode);
+
+ // 读取分页响应的 HTML 内容
+ StringBuilder postContent = new StringBuilder();
+ try (BufferedReader postReader = new BufferedReader(
+ new InputStreamReader(
+ responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream()))) {
+ while ((inputLine = postReader.readLine()) != null) {
+ postContent.append(inputLine);
+ }
+ }
+ Document parse = null;
+ if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP
+ || responseCode == HttpURLConnection.HTTP_MOVED_PERM
+ || responseCode == HttpURLConnection.HTTP_SEE_OTHER) {
+ String newUrl = postConn.getHeaderField("Location");
+ System.out.println("Pagination Redirecting to: " + newUrl);
+
+ // 解析重定向中的相对地址为完整 URL(如果是相对路径)
+ if (!newUrl.startsWith("http")) {
+ newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl);
+ }
+
+ // 重定向
+ URL redirectConn = new URL(newUrl);
+ HttpURLConnection followConn = (HttpURLConnection) redirectConn.openConnection();
+ followConn.setRequestMethod("GET");
+ followConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+ followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+ followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+
+ BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream()));
+ StringBuilder redirectContent = new StringBuilder();
+ while ((inputLine = redirectReader.readLine()) != null) {
+ redirectContent.append(inputLine);
+ }
+ redirectReader.close();
+ followConn.disconnect();
+ parse = Jsoup.parse(String.valueOf(redirectContent));
+ } else if (responseCode == 200) {
+ parse = Jsoup.parse(String.valueOf(postContent));
+ }
+
+
+
+ Elements links = parse.select("div[data-label='Titel der Studie'] a");
+ List listUrl = new ArrayList();
+ Integer count = 0;
+ for (Element link : links) {
+ String href = link.attr("href");
+ String trueUrl = "https://www.drks.de/"+href;
+ listUrl.add(trueUrl);
+ count++;
+ }
+ String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text();
+ // 使用正则表达式提取 "第" 和 "/" 之间的数字
+ String regex = "Seite\\s*(\\d+)\\s*/";
+ Matcher matcher = Pattern.compile(regex).matcher(text);
+ if (matcher.find()) {
+ map.put("totalPage",matcher.group(1));// 返回第一个捕获组,即数字 "1"
+ }
+ map.put("listUrl",listUrl);
+ map.put("count",count);
+ map.put("keyword",keyword);
+ postConn.disconnect();
+ return map;
+ }
+ /**
+  * Merges the Set-Cookie headers of a response into the cookie set.
+  *
+  * Only the leading "name=value" part of each header is kept. A cookie that is set again
+  * replaces the stored entry with the same name, so stale values are not sent back.
+  * Reading the headers forces the connection to send the request if it has not been sent yet.
+  *
+  * @param conn connection whose response headers are read
+  * @param cookieSet "name=value" entries, updated in place
+  * @return the last JSESSIONID or csfcfc entry ("name=value") set by this response, or null if none
+  */
+ private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) {
+     String sessionId = null;
+     for (Map.Entry<String, List<String>> header : conn.getHeaderFields().entrySet()) {
+         // Header names are case-insensitive; the status line is stored under a null key.
+         if (!"Set-Cookie".equalsIgnoreCase(header.getKey())) {
+             continue;
+         }
+         for (String cookie : header.getValue()) {
+             // Drop the attributes (Path, HttpOnly, ...) after the first ';'.
+             int attributesStart = cookie.indexOf(';');
+             String cookieValue = (attributesStart >= 0 ? cookie.substring(0, attributesStart) : cookie).trim();
+             if (cookieValue.isEmpty()) {
+                 continue;
+             }
+             int equalsAt = cookieValue.indexOf('=');
+             if (equalsAt > 0) {
+                 String namePrefix = cookieValue.substring(0, equalsAt + 1);
+                 cookieSet.removeIf(existing -> existing.startsWith(namePrefix));
+             }
+             cookieSet.add(cookieValue);
+             if (cookieValue.startsWith("JSESSIONID=") || cookieValue.startsWith("csfcfc=")) {
+                 sessionId = cookieValue;
+             }
+         }
+     }
+     return sessionId;
+ }
+ // 提取 __VIEWSTATE 隐藏字段的值
+ private static String extractViewState(String html) {
+ if (html == null || html.isEmpty()) {
+ System.err.println("HTML content is empty or null");
+ return "";
+ }
+
+ // 兼容 jakarta.faces.ViewState 和 javax.faces.ViewState
+ String regex = "]*name=[\"'](?:jakarta|javax)\\.faces\\.ViewState[\"'][^>]*value=[\"']([^\"']+)[\"']";
+ Pattern pattern = Pattern.compile(regex);
+ Matcher matcher = pattern.matcher(html);
+
+ if (matcher.find()) {
+ return matcher.group(1);
+ }
+
+ System.err.println("Failed to extract ViewState from HTML");
+ return "";
+ }
+
+ private static Map content(String url)throws Exception{
+
+ OkHttpClient client = new OkHttpClient().newBuilder()
+ .build();
+ MediaType mediaType = MediaType.parse("application/json");
+ Request request = new Request.Builder()
+ .url(url)
+ .get()
+ .addHeader("Content-Type", "application/json")
+ .build();
+ Response response = client.newCall(request).execute();
+ String html = response.body().string();
+ Document parse = Jsoup.parse(html, "UTF-8");
+ String title = parse.select(".title-bold").text();
+ String registNum = parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(2)").text();
+ String registTime = convertDate(parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(6)").text());
+ Map sponsor = new HashMap<>();
+ String header = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-header > h4").text();
+ String site = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(2) > div").text();
+ String telefon = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(4) > span").text();
+ String disease = parse.select("body > main > div.card-body > div:nth-child(6) > div.card-body > div > div:nth-child(2) > dl > dd:nth-child(2) > span").text();
+ String studyType = parse.select("body > main > div.card-body > div:nth-child(3) > div.card-body > dl").text();
+ String inclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(3) > div > div.card-body > div > div.col-12.mt-3 > dl > dd > span").text();
+ String exclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(4) > div > div.card-body > p > span").text();
+ String country = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(1) > div > div.card-body > dl > dd:nth-child(2)").text();
+ String intervention = parse.select("body > main > div.card-body > div:nth-child(4) > div.card-body > dl").text();
+ String primaryOutcome = parse.select("body > main > div.card-body > div:nth-child(5) > div.card-body > div > div > dl").text();
+ String enrollment = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(2) > div > div.card-body > div > div:nth-child(5) > dl > dd > span").text();
+ sponsor.put("header",header);
+ sponsor.put("site",site);
+ sponsor.put("telefon",telefon);
+ Map resultData = new HashMap<>();
+ resultData.put("title", title);
+ resultData.put("registNum",registNum);
+ resultData.put("registTime",registTime);
+ resultData.put("registStatus","");
+ resultData.put("registTitle","");
+ resultData.put("fullTitle","");
+ resultData.put("sponsor",sponsor);
+ resultData.put("sponsorPart","");
+ resultData.put("studyType",studyType);
+ resultData.put("phase","");
+ resultData.put("disease",disease);
+ resultData.put("studyDesign","");
+ resultData.put("studyObjective","");
+ resultData.put("studyStartDate","");
+ resultData.put("inclusionCriteria",inclusionCriteria);
+ resultData.put("exclusionCriteria",exclusionCriteria);
+ resultData.put("currentStatus","");
+ resultData.put("enrollment",enrollment);
+ resultData.put("country",country);
+ resultData.put("tagTime","");
+ resultData.put("intervention",intervention);
+ resultData.put("primaryOutcome",primaryOutcome);
+ resultData.put("crawlTime",getCurrentTime());
+ resultData.put("crawlUrl",url);
+ resultData.put("postTime",registTime);
+ resultData.put("content","content");
+ resultData.put("forwardcontent","forwardcontent");
+ resultData.put("cid","Ndrks");
+ return resultData;
+ }
+    /** Builds the form-encoded body of the DRKS search POST; keyword and ViewState are URL-encoded as UTF-8, and "" is returned if encoding fails. */
+    private static String buildSearchPostData(String viewState,String keyword) {
+        try {
+            return "searchForm=searchForm" +
+                    "&searchForm%3Aj_idt80=" + URLEncoder.encode(keyword == null ? "" : keyword, StandardCharsets.UTF_8.name()) + // a raw keyword breaks the body on spaces, '&', '=' or non-ASCII text
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" +
+                    "&searchForm%3Aj_idt287=" +
+                    "&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name());
+        } catch (Exception e) {
+            System.err.println("Error encoding search ViewState: " + e.getMessage());
+            return "";
+        }
+    }
+    /** Builds the form-encoded body of the pagination POST for the given page; returns "" if the ViewState cannot be encoded. */
+    private static String buildPostData(String viewState, int page) {
+        String encodedState;
+        try {
+            encodedState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name());
+        } catch (Exception e) {
+            System.err.println("Error encoding pagination ViewState: " + e.getMessage());
+            return "";
+        }
+        // The pager field name carries page - 1, while the submitted value carries the page number itself.
+        return new StringBuilder("resultForm=resultForm")
+                .append("&resultForm%3Asorting%3ArowsPerPage=10")
+                .append("&resultForm%3ApaginationTop%3Aj_idt156%3A").append(page - 1).append("%3Aj_idt158=").append(page)
+                .append("&resultForm%3Asorting%3AsortingBy=SCORE").append("&resultForm%3Asorting%3Aj_idt141=true")
+                .append("&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation").append("&selectedType=JSON")
+                .append("&javax.faces.ViewState=").append(encodedState).toString();
+    }
+    /**
+     * Converts a {@code dd.MM.yyyy} date to {@code yyyy-MM-dd HH:mm:ss}; null or impossible dates yield "Invalid date format".
+     */
+    public static String convertDate(String inputDate) {
+        if (inputDate == null) {
+            return "Invalid date format";
+        }
+        SimpleDateFormat inputFormat = new SimpleDateFormat("dd.MM.yyyy");
+        inputFormat.setLenient(false); // strict: 31.02.2024 must be rejected instead of silently rolling over into March
+        try {
+            return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(inputFormat.parse(inputDate));
+        } catch (ParseException e) {
+            return "Invalid date format";
+        }
+    }
+
+    /**
+     * Returns the current local date-time (as reported by {@code LocalDateTime.now()}) formatted as {@code yyyy-MM-dd HH:mm:ss}.
+     */
+    public static String getCurrentTime() {
+        final String pattern = "yyyy-MM-dd HH:mm:ss";
+        // Format "now" through the formatter directly instead of going through LocalDateTime#format.
+        return DateTimeFormatter.ofPattern(pattern).format(LocalDateTime.now());
+    }
+ private static Response executeWithRetry(OkHttpClient client, Request request, String keyword) throws IOException {
+ int maxRetries = proxyList.isEmpty() ? 1 : proxyList.size(); // 如果没有代理,只尝试一次
+ int attempt = 0;
+
+ while (attempt < maxRetries) {
+ Response response = client.newCall(request).execute();
+ if (response.code() == 403) {
+ System.out.println("收到 403 状态码,尝试切换代理重试...");
+ response.close();
+ switchProxy();
+ client = createClientWithProxy(); // 使用新代理重建客户端
+ attempt++;
+ if (attempt == maxRetries) {
+ throw new IOException("所有代理尝试失败,仍然收到 403");
+ }
+ continue;
+ }
+ return response; // 成功或非 403 状态码,直接返回
+ }
+ throw new IOException("无法执行请求,未获取响应");
+ }
+    /** Builds a client with 30-second connect/read/write timeouts, routed through the currently selected "host:port" proxy when one is usable. */
+    private static OkHttpClient createClientWithProxy() {
+        OkHttpClient.Builder clientBuilder = new OkHttpClient().newBuilder();
+        clientBuilder.connectTimeout(30, TimeUnit.SECONDS);
+        clientBuilder.readTimeout(30, TimeUnit.SECONDS);
+        clientBuilder.writeTimeout(30, TimeUnit.SECONDS);
+        // No proxies configured, or the cursor points past the list: build a client without a proxy.
+        if (proxyList.isEmpty() || currentProxyIndex >= proxyList.size()) {
+            return clientBuilder.build();
+        }
+        String entry = proxyList.get(currentProxyIndex);
+        String[] hostAndPort = entry.split(":");
+        if (hostAndPort.length == 2) { // entries that are not exactly "host:port" are ignored and no proxy is set
+            java.net.InetSocketAddress address = new java.net.InetSocketAddress(hostAndPort[0], Integer.parseInt(hostAndPort[1]));
+            clientBuilder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, address));
+            System.out.println("使用代理: " + entry);
+        }
+        return clientBuilder.build();
+    }
+ private static synchronized void switchProxy() { // moves the shared proxy cursor to the next entry of proxyList
+ if (proxyList.isEmpty()) return; // nothing to rotate through
+ currentProxyIndex = (currentProxyIndex + 1) % proxyList.size(); // advance by one, wrapping back to index 0 after the last entry
+ System.out.println("切换到新代理: " + proxyList.get(currentProxyIndex));
+ }
+    /**
+     * Adds 30 to the fourth-from-last '|'-separated field of a payload and returns the re-joined payload.
+     * NOTE(review): String.split drops trailing empty fields, so a payload ending in '|' comes back without
+     * that trailing delimiter and the target index is counted from the last non-empty field - confirm intended.
+     *
+     * @param originalPayload '|'-delimited payload whose fourth-from-last field is an integer offset
+     * @return the payload with that field increased by 30
+     * @throws IllegalArgumentException if there are fewer than four fields or the target field is not an integer;
+     *         in the latter case the underlying {@link NumberFormatException} is attached as the cause
+     */
+    public static String increaseOffsetBy30(String originalPayload) {
+        String[] parts = originalPayload.split("\\|");
+        if (parts.length < 4) {
+            throw new IllegalArgumentException("载荷格式无效,元素不足");
+        }
+        // Fourth element counted from the end of the split result.
+        int targetIndex = parts.length - 4;
+        try {
+            parts[targetIndex] = String.valueOf(Integer.parseInt(parts[targetIndex]) + 30);
+        } catch (NumberFormatException e) {
+            // Chain the parse failure so the stack trace still shows why the field was rejected.
+            throw new IllegalArgumentException("倒数第 4 个元素不是有效数字: " + parts[targetIndex], e);
+        }
+        return String.join("|", parts);
+    }
+}
diff --git a/src/main/java/com/example/drks.java b/src/main/java/com/example/drks.java
new file mode 100644
index 0000000..379d7f2
--- /dev/null
+++ b/src/main/java/com/example/drks.java
@@ -0,0 +1,438 @@
+package com.example;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Stand-alone probe for the DRKS (www.drks.de) trial search: issues the initial GET, the search POST, the
+ * redirect GET and one pagination POST over {@link HttpURLConnection}, carrying cookies and the JSF ViewState
+ * between steps, and prints the result links and pager text of the requested page to stdout.
+ */
+public class drks {
+    /**
+     * Runs the four-step flow for the hard-coded {@code targetUrl}; the page to request is read from its
+     * {@code page} query parameter and defaults to 1.
+     */
+    public static void main(String[] args) throws Exception {
+        String targetUrl = "https://www.drks.de/search/de/results?page=4";
+        String baseUrl = "https://www.drks.de/search/de";
+        String hostUrl = "https://www.drks.de";
+        String cleanUrl = targetUrl.split("\\?")[0];
+        System.out.println("Pure URL: " + cleanUrl);
+
+        String pageNumber = targetUrl.contains("?page=") ? targetUrl.split("page=")[1] : "1";
+        int page = Integer.parseInt(pageNumber);
+        System.out.println("Page Number: " + page);
+
+        // Cookies collected across all requests, stored as "name=value" entries.
+        Set<String> cookieSet = new HashSet<>();
+        String sessionId = null;
+
+        // Step 1: initial GET request to obtain cookies and the ViewState.
+        System.out.println("\n--- Step 1: Initial GET Request ---");
+        URL initialUrl = new URL(baseUrl);
+        HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection();
+        initialConn.setRequestMethod("GET");
+        initialConn.setInstanceFollowRedirects(false);
+        initialConn.setConnectTimeout(10000);
+        initialConn.setReadTimeout(10000);
+        initialConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+
+        // Capture cookies.
+        sessionId = updateCookies(initialConn, cookieSet);
+        System.out.println("Initial Cookies: " + cookieSet);
+        System.out.println("Initial Session ID: " + sessionId);
+
+        // Read the response body to obtain the ViewState (decoded as UTF-8, reader closed even on failure).
+        StringBuilder content = new StringBuilder();
+        String inputLine;
+        try (BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream(), StandardCharsets.UTF_8))) {
+            while ((inputLine = in.readLine()) != null) {
+                content.append(inputLine).append('\n');
+            }
+        }
+        initialConn.disconnect();
+
+        // Extract the initial ViewState.
+        String initialViewState = extractViewState(content.toString());
+        System.out.println("Initial ViewState: " + initialViewState);
+
+        // Step 2: send the search POST request.
+        System.out.println("\n--- Step 2: Search POST Request ---");
+        HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection();
+        searchConn.setRequestMethod("POST");
+        searchConn.setInstanceFollowRedirects(false);
+        searchConn.setDoOutput(true);
+        searchConn.setConnectTimeout(10000);
+        searchConn.setReadTimeout(10000);
+
+        // Request headers of the search request.
+        searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
+        searchConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+        searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+        searchConn.setRequestProperty("Origin", "https://www.drks.de");
+        searchConn.setRequestProperty("Referer", baseUrl);
+        searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+
+        // Build the POST data of the search request.
+        String searchPostData = buildSearchPostData(initialViewState);
+        System.out.println("Search POST Data: " + searchPostData);
+
+        // Send the search POST request.
+        try (OutputStream os = searchConn.getOutputStream()) {
+            byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8);
+            os.write(input, 0, input.length);
+        }
+
+        // Update cookies.
+        String searchSessionId = updateCookies(searchConn, cookieSet);
+        System.out.println("Search Cookies: " + cookieSet);
+        System.out.println("Search Session ID: " + searchSessionId); // null when this response sets no session cookie
+
+        // Handle the search response.
+        int searchResponseCode = searchConn.getResponseCode();
+        System.out.println("Search Response Code: " + searchResponseCode);
+
+        if (searchResponseCode == 302) {
+            String redirectUrl = searchConn.getHeaderField("Location");
+            searchConn.disconnect();
+
+            if (redirectUrl == null) {
+                System.err.println("Search request returned 302 but no Location header found.");
+                return;
+            }
+            System.out.println("Redirect URL (raw): " + redirectUrl);
+
+            // Resolve a relative URL against the host.
+            if (!redirectUrl.startsWith("http")) {
+                redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? redirectUrl : "/" + redirectUrl);
+            }
+            System.out.println("Resolved Redirect URL: " + redirectUrl);
+
+            // Step 3: follow the redirect with a GET request.
+            System.out.println("\n--- Step 3: Follow Redirect (GET Request) ---");
+            URL resultsUrl = new URL(redirectUrl);
+            HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection();
+            resultsConn.setRequestMethod("GET");
+            resultsConn.setInstanceFollowRedirects(false);
+            resultsConn.setConnectTimeout(10000);
+            resultsConn.setReadTimeout(10000);
+            resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+            resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+            resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+
+            // Update cookies.
+            String resultsSessionId = updateCookies(resultsConn, cookieSet);
+            System.out.println("Results Cookies: " + cookieSet);
+            System.out.println("Results Session ID: " + resultsSessionId);
+
+            // Read the result page reached through the redirect.
+            StringBuilder resultsContent = new StringBuilder();
+            try (BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream(), StandardCharsets.UTF_8))) {
+                while ((inputLine = resultsReader.readLine()) != null) {
+                    resultsContent.append(inputLine).append('\n');
+                }
+            }
+            resultsConn.disconnect();
+
+            // Extract the page's ViewState (state token required by the follow-up request).
+            String viewState = extractViewState(resultsContent.toString());
+            System.out.println("Results ViewState: " + viewState);
+
+            // Warn when the session id differs from the initial one (the session may have been reset).
+            if (sessionId != null && !sessionId.equals(resultsSessionId)) {
+                System.out.println("Warning: Session ID changed. Initial: " + sessionId + ", Results: " + resultsSessionId);
+            }
+
+            // Step 4: send the pagination request (POST).
+            System.out.println("\n--- Step 4: Pagination POST Request ---");
+            HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection();
+            postConn.setRequestMethod("POST");
+            postConn.setInstanceFollowRedirects(false);
+            postConn.setDoOutput(true);
+            postConn.setConnectTimeout(10000);
+            postConn.setReadTimeout(10000);
+
+            // Request headers of the pagination request (non-AJAX, mimics a regular browser navigation).
+            postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
+            postConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+            postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+            postConn.setRequestProperty("Origin", "https://www.drks.de");
+            postConn.setRequestProperty("Referer", cleanUrl);
+            postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+            postConn.setRequestProperty("Sec-Fetch-Dest", "document");
+            postConn.setRequestProperty("Sec-Fetch-Mode", "navigate");
+
+            // Build the POST parameters of the pagination request (page number, ViewState, ...).
+            String postData = buildPostData(viewState, page);
+            System.out.println("Pagination POST Data: " + postData);
+
+            // Send the pagination POST request.
+            try (OutputStream os = postConn.getOutputStream()) {
+                byte[] input = postData.getBytes(StandardCharsets.UTF_8);
+                os.write(input, 0, input.length);
+            }
+
+            // Update cookies (the pagination response may carry new Set-Cookie headers).
+            String paginationSessionId = updateCookies(postConn, cookieSet);
+            System.out.println("Pagination Cookies: " + cookieSet);
+            System.out.println("Pagination Session ID: " + paginationSessionId);
+
+            // Handle the pagination response.
+            int responseCode = postConn.getResponseCode();
+            System.out.println("Pagination Response Code: " + responseCode);
+
+            // Read the pagination response body exactly once; it is reused by every branch below.
+            StringBuilder postContent = new StringBuilder();
+            java.io.InputStream postBody = responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream();
+            if (postBody != null) { // getErrorStream() returns null when the server sent no error body
+                try (BufferedReader postReader = new BufferedReader(new InputStreamReader(postBody, StandardCharsets.UTF_8))) {
+                    while ((inputLine = postReader.readLine()) != null) {
+                        postContent.append(inputLine).append('\n');
+                    }
+                }
+            }
+
+            Document parse = null;
+            if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP
+                    || responseCode == HttpURLConnection.HTTP_MOVED_PERM
+                    || responseCode == HttpURLConnection.HTTP_SEE_OTHER) {
+                String newUrl = postConn.getHeaderField("Location");
+                System.out.println("Pagination Redirecting to: " + newUrl);
+                if (newUrl == null) {
+                    System.err.println("Pagination request returned a redirect but no Location header found.");
+                    return;
+                }
+                // Resolve a relative redirect target into a full URL.
+                if (!newUrl.startsWith("http")) {
+                    newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl);
+                }
+                // Follow the redirect
+                URL redirectConnUrl = new URL(newUrl);
+                HttpURLConnection followConn = (HttpURLConnection) redirectConnUrl.openConnection();
+                followConn.setRequestMethod("GET");
+                followConn.setRequestProperty("Cookie", String.join("; ", cookieSet));
+                followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
+                followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36");
+
+                StringBuilder redirectContent = new StringBuilder();
+                try (BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream(), StandardCharsets.UTF_8))) {
+                    while ((inputLine = redirectReader.readLine()) != null) {
+                        redirectContent.append(inputLine).append('\n');
+                    }
+                }
+                followConn.disconnect();
+
+                System.out.println("Redirect Response: " + redirectContent);
+                parse = Jsoup.parse(String.valueOf(redirectContent));
+            } else if (responseCode == 200) {
+                System.out.println("Pagination Response: " + postContent);
+                parse = Jsoup.parse(String.valueOf(postContent));
+            } else {
+                System.err.println("Unexpected Pagination Response Code: " + responseCode);
+                // The body was already drained into postContent above; reading the stream again would yield nothing.
+                System.err.println("Error Stream:");
+                System.err.println(postContent);
+                return; // Exit if pagination fails unexpectedly
+            }
+
+            Elements links = parse.select("div[data-label='Titel der Studie'] a");
+
+            for (Element link : links) {
+                String href = link.attr("href");
+                String text = link.text();
+
+                System.out.println("链接: " + href);
+                System.out.println("标题: " + text);
+            }
+            String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text();
+            // Use a regex to pull out the number between "Seite" and "/" in the pager text.
+            String regex = "Seite\\s*(\\d+)\\s*/";
+            Matcher matcher = Pattern.compile(regex).matcher(text);
+            if (matcher.find()) {
+                System.out.println("总共有"+matcher.group(1)); // first capture group: the number following "Seite"
+            }
+            postConn.disconnect();
+
+        } else if (searchResponseCode == 200) {
+            System.out.println("Search request returned 200 OK. Reading response body:");
+            // Read and print the response body for debugging
+            try (BufferedReader searchReader = new BufferedReader(new InputStreamReader(searchConn.getInputStream(), StandardCharsets.UTF_8))) {
+                String line;
+                StringBuilder searchResponseBody = new StringBuilder();
+                while ((line = searchReader.readLine()) != null) {
+                    searchResponseBody.append(line).append("\n");
+                }
+                System.out.println("Search Response Body:\n" + searchResponseBody.toString());
+            } catch (Exception e) {
+                System.err.println("Could not read search response body: " + e.getMessage());
+            } finally {
+                searchConn.disconnect();
+            }
+
+            System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode);
+            System.err.println("The website's search mechanism may have changed.");
+
+        } else {
+            // Handle other unexpected response codes for the search request
+            System.err.println("Unexpected Search Response Code: " + searchResponseCode);
+            try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(searchConn.getErrorStream(), StandardCharsets.UTF_8))) {
+                String errorLine;
+                System.err.println("Error Stream:");
+                while ((errorLine = errorReader.readLine()) != null) {
+                    System.err.println(errorLine);
+                }
+            } catch (Exception e) {
+                System.err.println("Could not read error stream for search response: " + e.getMessage());
+            }
+            searchConn.disconnect();
+        }
+    }
+
+    /**
+     * Merges the response's Set-Cookie headers into {@code cookieSet} as "name=value" entries (a re-issued cookie
+     * replaces its earlier value) and returns the JSESSIONID entry, else the csfcfc entry, else {@code null}.
+     */
+    private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) {
+        String sessionId = null;
+        Map<String, List<String>> headerFields = conn.getHeaderFields();
+        List<String> cookiesHeader = headerFields.get("Set-Cookie");
+        if (cookiesHeader != null) {
+            for (String cookie : cookiesHeader) {
+                String cookieValue = cookie.split(";")[0];
+                String cookieName = cookieValue.split("=", 2)[0];
+                cookieSet.removeIf(existing -> existing.startsWith(cookieName + "=")); // drop the stale value first
+                cookieSet.add(cookieValue);
+                // Prioritize JSESSIONID or csfcfc if present
+                if (cookieValue.startsWith("JSESSIONID=")) {
+                    sessionId = cookieValue;
+                } else if (cookieValue.startsWith("csfcfc=") && sessionId == null) {
+                    sessionId = cookieValue;
+                }
+            }
+        }
+        return sessionId;
+    }
+
+    /**
+     * Extracts the JSF ViewState token from a page, preferring the {@code jakarta.faces.ViewState} field and
+     * falling back to {@code javax.faces.ViewState}.
+     *
+     * @param html page markup to search
+     * @return the field's value attribute as written in the markup, or "" (after logging) when none is found
+     */
+    private static String extractViewState(String html) {
+        String[] fieldNames = {"jakarta.faces.ViewState", "javax.faces.ViewState"};
+        // First pass: a name="..." attribute followed by value="..." inside the same tag.
+        for (String fieldName : fieldNames) {
+            Matcher matcher = Pattern.compile("name=\"" + Pattern.quote(fieldName) + "\"[^>]*value=\"([^\"]+)\"").matcher(html);
+            if (matcher.find()) {
+                return matcher.group(1);
+            }
+        }
+        // Second pass (less reliable): the first value="..." after any occurrence of the field name.
+        for (String fieldName : fieldNames) {
+            int nameIndex = html.indexOf(fieldName);
+            if (nameIndex == -1) {
+                continue;
+            }
+            int marker = html.indexOf("value=\"", nameIndex);
+            if (marker == -1) {
+                continue; // test for "not found" before adding the marker length, otherwise -1 can never be seen
+            }
+            int valueStart = marker + "value=\"".length();
+            int valueEnd = html.indexOf('"', valueStart);
+            if (valueEnd != -1) {
+                return html.substring(valueStart, valueEnd);
+            }
+        }
+        System.err.println("Failed to extract ViewState from HTML");
+        return ""; // Return empty string if not found
+    }
+
+    /** Builds the form-encoded body of the search POST for the fixed keyword "Midwifery"; every other search
+     *  field is sent empty. Returns "" if the ViewState cannot be encoded. */
+    private static String buildSearchPostData(String viewState) {
+        try {
+            // URL-encode the ViewState
+            String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name());
+
+            return "searchForm=searchForm" +
+                    "&searchForm%3Aj_idt80=Midwifery" + // Assuming 'Midwifery' is the search term
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" +
+                    "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" +
+                    "&searchForm%3Aj_idt287=" + // This parameter might be related to the search button click
+                    "&jakarta.faces.ViewState=" + encodedViewState; // Changed to jakarta.faces.ViewState
+        } catch (Exception e) {
+            System.err.println("Error encoding search ViewState: " + e.getMessage());
+            return "";
+        }
+    }
+
+    /** Builds the form-encoded body of the pagination POST for the given page number.
+     *  Returns "" if the ViewState cannot be encoded. */
+    private static String buildPostData(String viewState, int page) {
+        // The page parameter in the POST data might be 0-indexed or 1-indexed
+        // Let's assume it's 0-indexed for the parameter name and 1-indexed for the value based on your original code
+        int parameterPage = page - 1;
+        int valuePage = page; // The value sent in the form might be the actual page number
+
+        try {
+            // URL-encode the ViewState
+            String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name());
+
+            return "resultForm=resultForm" +
+                    "&resultForm%3Asorting%3ArowsPerPage=10" +
+                    // The parameter name for pagination button might have changed
+                    // Check browser network traffic for the exact parameter name for page buttons
+                    "&resultForm%3ApaginationTop%3Aj_idt156%3A"+ parameterPage +"%3Aj_idt158=" + valuePage +
+                    "&resultForm%3Asorting%3AsortingBy=SCORE" +
+                    "&resultForm%3Asorting%3Aj_idt141=true" + // This might be for sorting direction
+                    "&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation" +
+                    "&selectedType=JSON" + // This might be for download format, potentially not needed for pagination
+                    "&jakarta.faces.ViewState=" + encodedViewState; // Changed to jakarta.faces.ViewState
+        } catch (Exception e) {
+            System.err.println("Error encoding pagination ViewState: " + e.getMessage());
+            return "";
+        }
+    }
+}
diff --git a/src/main/java/com/example/getInKa.java b/src/main/java/com/example/getInKa.java
new file mode 100644
index 0000000..c18d4e5
--- /dev/null
+++ b/src/main/java/com/example/getInKa.java
@@ -0,0 +1,165 @@
+package com.example;
+
+import org.apache.kafka.clients.producer.*;
+import org.apache.kafka.common.serialization.StringSerializer;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.Response;
+
+import java.io.*;
+import java.util.*;
+import java.util.concurrent.Future;
+
+public class getInKa {
+ // 初始化 OkHttp 客户端,用于发送 HTTP 请求
+ private static final OkHttpClient httpClient = new OkHttpClient();
+ private static final String PROCESSED_URLS_FILE = "processed_urls.txt"; // 记录已处理的 URL 文件
+ public static void main(String[] args) {
+ try {
+ // 获取目标 URL 列表
+ System.out.println("Starting URL collection...");
+ List urls = getUrls();
+ System.out.println("Collected " + urls.size() + " URLs.");
+
+ // 从 URL 中提取新闻数据并保存到 kafka
+ System.out.println("Starting news extraction...");
+ getNews(urls);
+ System.out.println("News extraction completed.");
+ } catch (IOException | InterruptedException e) {
+ System.out.println("Error in main: " + e.getMessage());
+ }
+ }
+ public static List getUrls() throws IOException, InterruptedException {
+ List urls = new ArrayList<>();
+ Set processedUrls = loadProcessedUrls(); // 加载已处理的 URL
+
+ for (int page = 1; page <= 28; page++) {
+ String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html";
+ Request request = new Request.Builder()
+ .url(url)
+ .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
+ .build();
+
+ System.out.println("Fetching page " + page + ": " + url);
+ try (Response response = httpClient.newCall(request).execute()) {
+ if (response.isSuccessful() && response.body() != null) {
+ System.out.println("Successfully fetched page " + page);
+ String html = response.body().string();
+ Document doc = Jsoup.parse(html);
+ Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a");
+ List projectIDs = links.eachAttr("href");
+ System.out.println("Found " + projectIDs.size() + " URLs on page " + page);
+
+ for (String projectUrl : projectIDs) {
+ if (!processedUrls.contains(projectUrl)) { // 检查是否已处理
+ urls.add(projectUrl);
+ processedUrls.add(projectUrl); // 添加到已处理集合
+ }
+ }
+ } else {
+ System.out.println("Failed to fetch page " + page + ": Status code " + response.code());
+ }
+ }
+ Thread.sleep(1000);
+ }
+ saveProcessedUrls(processedUrls); // 保存已处理的 URL
+ return urls;
+ }
+ public static void getNews(List urls) throws IOException {
+ for (int i = 0; i < urls.size(); i++) {
+ String url = urls.get(i);
+ Request request = new Request.Builder()
+ .url(url)
+ .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
+ .build();
+
+ System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url);
+ try (Response response = httpClient.newCall(request).execute()) {
+ if (response.isSuccessful() && response.body() != null) {
+ System.out.println("Successfully fetched news from " + url);
+ String html = response.body().string();
+ Document doc = Jsoup.parse(html);
+ String title = doc.select("div.info-title.t-center > h1").text().trim();
+ String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim();
+ String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim();
+ if (content.isEmpty()) {
+ content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim();
+ }
+
+ if (!title.isEmpty() && !date.isEmpty() && !content.isEmpty()) {
+ Map news = new HashMap<>();
+ news.put("title", title);
+ news.put("date", date);
+ news.put("content", content);
+ news.put("url", url);
+ System.out.println("Extracted news: " + news.get("title"));
+ saveData(news); // 调用修改后的 saveData 方法
+ } else {
+ System.out.println("Failed to extract complete data from " + url);
+ }
+ } else {
+ System.out.println("Failed to fetch news from " + url + ": Status code " + response.code());
+ }
+ } catch (Exception e) {
+ System.out.println("An error occurred while fetching " + url + ": " + e.getMessage());
+ }
+ try {
+ Thread.sleep(5000); // 休眠5秒
+ } catch (InterruptedException e) {
+ System.out.println("Sleep interrupted: " + e.getMessage());
+ }
+ }
+ }
+ public static void saveData(Map news) {
+ Properties properties = new Properties();
+ properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
+ properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
+ properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
+
+ try (Producer producer = new KafkaProducer<>(properties)) {
+ String topic = "news-topic";
+ String key = news.get("title");
+ String value = news.toString();
+ ProducerRecord record = new ProducerRecord<>(topic, key, value);
+
+ producer.send(record, (metadata, exception) -> {
+ if (exception == null) {
+ System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic() +
+ ", partition=" + metadata.partition() + ", offset=" + metadata.offset());
+ } else {
+ System.err.println("Failed to send data to Kafka: " + exception.getMessage());
+ }
+ }).get();
+ } catch (Exception e) {
+ System.err.println("Error while sending data to Kafka: " + e.getMessage());
+ }
+ }
+ // 加载已处理的 URL
+ private static Set loadProcessedUrls() throws IOException {
+ Set processedUrls = new HashSet<>();
+ File file = new File(PROCESSED_URLS_FILE);
+ if (file.exists()) {
+ try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ processedUrls.add(line.trim());
+ }
+ }
+ }
+ return processedUrls;
+ }
+
+ // 保存已处理的 URL
+ private static void saveProcessedUrls(Set processedUrls) throws IOException {
+ try (BufferedWriter writer = new BufferedWriter(new FileWriter(PROCESSED_URLS_FILE))) {
+ for (String url : processedUrls) {
+ writer.write(url);
+ writer.newLine();
+ }
+ }
+ }
+}
diff --git a/src/main/java/com/example/jsonGetOk.java b/src/main/java/com/example/jsonGetOk.java
new file mode 100644
index 0000000..ced112b
--- /dev/null
+++ b/src/main/java/com/example/jsonGetOk.java
@@ -0,0 +1,47 @@
+package com.example;
+
+import okhttp3.*;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+public class jsonGetOk { // Ad-hoc probe: downloads one page-detail JSON document and prints the fields pulled from it.
+    public static void main(String[] args) throws IOException {
+        OkHttpClient client = new OkHttpClient().newBuilder()
+                .build();
+        Request request = new Request.Builder() // plain GET, so no request body or media type is needed
+                .url("https://www.dsscu.gov.mo/api/common/page_detail?PostType=page&EntityId=6654829e-8163-b801-0096-c02e09d690d1")
+                .get()
+                .build();
+        String responseBody;
+        try (Response response = client.newCall(request).execute()) { // closing the response releases the connection
+            responseBody = response.body().string();
+        }
+
+        // Parse the JSON payload.
+        JSONObject jsonObject = new JSONObject(responseBody);
+        JSONObject data = jsonObject.getJSONObject("data");
+        String postTime = data.getString("onlineAt");
+        JSONObject metas = data.getJSONObject("metas");
+        String title = metas.getString("name");
+        String summary = metas.getString("summary"); // HTML fragment; reduced to plain text below
+        Document parse = Jsoup.parse(summary);
+        String content = parse.text();
+        String forwardcontent = responseBody; // the raw response is kept as-is
+        String fileList = metas.getString("biddersFile");
+        fileList = fileList+"###"+"pdf"; // entry format: <value>###<type>
+        Map<String, String> map = new HashMap<>();
+        map.put("postTime",postTime);
+        map.put("title",title);
+        map.put("content",content);
+        map.put("forwardcontent",forwardcontent);
+        map.put("fileList",fileList);
+        System.out.println(map);
+    }
+
+}
diff --git a/src/main/java/com/example/ook.java b/src/main/java/com/example/ook.java
new file mode 100644
index 0000000..2d67ed9
--- /dev/null
+++ b/src/main/java/com/example/ook.java
@@ -0,0 +1,256 @@
+package com.example;
+
+import okhttp3.*;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.Proxy;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class ook {
+
+
+ public static void main(String[] args) throws Exception { // Ad-hoc probe: prints the href of each ".title a" link on one listing page.
+     // 1. Fetch the proxy address (disabled; the proxy is hard-coded on the client below)
+// String proxyJson = getProxyFromLocalService();
+// JSONObject proxyData = new JSONObject(proxyJson);
+// String httpProxy = proxyData.getString("http"); // e.g. "http://proxy1:port"
+//
+// // 2. Split the proxy address into host and port
+// String[] proxyParts = httpProxy.replace("http://", "").split(":");
+// String proxyHost = proxyParts[0]; // proxy1
+// int proxyPort = Integer.parseInt(proxyParts[1]); // port
+
+     OkHttpClient client = new OkHttpClient().newBuilder()
+             .connectTimeout(30, TimeUnit.SECONDS)
+             .readTimeout(30, TimeUnit.SECONDS)
+             .writeTimeout(30, TimeUnit.SECONDS)
+             .proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897))) // use local port 7897 directly
+             .build();
+
+     Request request = new Request.Builder() // plain GET, so no media type or request body is needed
+             .url("https://wrair.health.mil/News-Media/Press-Releases/")
+             .get()
+             // Browser-like request headers
+             .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
+             .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
+// .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
+             .addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8")
+             .addHeader("Cache-Control", "no-cache")
+             .addHeader("Pragma", "no-cache")
+             .addHeader("Referer", "https://wrair.health.mil/News-Media/Press-Releases/")
+             .addHeader("Cookie", "_ga=GA1.1.516170455.1740971326; .ASPXANONYMOUS=xUBztj4Ek1vHfBPe-1QqFJhd83I4bkB1k0_d-2QrQ7drfd7R7Y6eNsyyHVjSeffyIKzy_qm5tOKOCtbvst-s9ZGWThxifCGMdJE117EQlr1OZARa0; dnn_IsMobile=False; language=en-US; ARRAffinity=c30f7cdebcf208f7c5a996cb410451c36532afc64703669607f68f04a75f4b39; _ga_CSLL4ZEK4L=GS1.1.1742349582.4.1.1742350035.0.0.0")
+             .addHeader("Upgrade-Insecure-Requests", "1")
+             .addHeader("Sec-Fetch-Dest", "document")
+             .addHeader("Sec-Fetch-Mode", "navigate")
+             .addHeader("Sec-Fetch-Site", "same-origin")
+             .addHeader("Sec-Fetch-User", "?1")
+             .addHeader("Sec-Ch-Ua", "\"Chromium\";v=\"134\", \"Not:A-Brand\";v=\"24\", \"Google Chrome\";v=\"134\"")
+             .addHeader("Sec-Ch-Ua-Mobile", "?0")
+             .addHeader("Sec-Ch-Ua-Platform", "\"Windows\"")
+             .addHeader("Priority", "u=0, i")
+             .build();
+     String html;
+     try (Response response = client.newCall(request).execute()) { // closing the response releases the connection
+         html = response.body().string();
+     }
+     Document parse = Jsoup.parse(html);
+// String url = "https://www.uu.se/nyheter/alla?newsResearch=researchtopic11%3Bresearchtopic7%3Bresearchtopic22%3Bresearchtopic10%3Bresearchtopic2&start=20";
+// // define the regular expression
+// String regex = "start=(\\d+)";
+// Pattern pattern = Pattern.compile(regex);
+// Matcher matcher = pattern.matcher(url);
+// Integer start = 0;
+
+// String postTime = convertToTimestamp(parse.select(".mr10").text());
+// String title = parse.select(".hdg01").text();
+// String content = parse.select(".container01 p").text();
+// String forwardcontent = parse.select("#main").html();
+// Map map = new HashMap<>();
+
+// if (matcher.find()) {
+// start = Integer.parseInt(matcher.group(1));
+// System.out.println("Start: " + start); // start = 12
+// }
+//
+// Elements allLinks = new Elements();
+// Elements links = parse.select(".search-result-hit-text-container a");
+// allLinks.addAll(links);
+//
+// int totalLinks = allLinks.size();
+// int startIndex = Math.max(0, totalLinks - 10);
+// for (int i = startIndex; i < totalLinks; i++) {
+// Map task = new HashMap(16);
+// task.put("link","https://www.uu.se"+allLinks.get(i).attr("href"));
+// task.put("linktype", "newscontent"); // set the link type to "newscontent"
+//
+// System.out.println(task);
+// }
+     Elements elements = parse.select(".title a");
+     for (Element element : elements) {
+         String link = element.attr("href");
+         System.out.println(link);
+     }
+
+
+// map.put("postTime",postTime);
+// map.put("title",title);
+// map.put("content",content);
+// map.put("forwardcontent",forwardcontent);
+// System.out.println(map);
+
+ }
+ public ook() throws IOException { // No-op constructor; it declares IOException although its body is empty.
+ }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式:dd MMMM , yyyy(例如 "28 February , 2025")
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM dd, yyyy", Locale.ENGLISH);
+// // 定义输出格式:yyyy-MM-dd HH:mm:ss
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入日期
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为带时间的格式,时间设为 00:00:00
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或抛出异常,根据需求调整
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式:yyyy 年 MM 月 dd 日
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MM-dd-yyyy", Locale.CHINESE);
+// // 定义输出格式:yyyy-MM-dd HH:mm:ss
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入日期
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为带时间的格式,时间设为 00:00:00
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或抛出异常,根据需求调整
+// }
+// }
+
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy");
+// // 定义输出格式
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入字符串为 LocalDate
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为 LocalDateTime,设置时间为 00:00:00
+// LocalDateTime dateTime = date.atStartOfDay();
+// // 格式化为目标字符串
+// return dateTime.format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或者抛出异常,根据需求调整
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式:MMMM d, yyyy(例如 "June 3, 2015")
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM, yyyy", Locale.ENGLISH);
+// // 定义输出格式:yyyy-MM-dd HH:mm:ss
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入日期
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为带时间的格式,时间设为 00:00:00
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或抛出异常,根据需求调整
+// }
+// }
+// public static String convertToTimestamp(String input) {
+// try {
+// // 正则匹配 "d MMMM yyyy"
+// Pattern pattern = Pattern.compile("\\d{1,2} [A-Za-z]+ \\d{4}");
+// Matcher matcher = pattern.matcher(input);
+// if (matcher.find()) {
+// String dateStr = matcher.group();
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM yyyy", Locale.ENGLISH);
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// return date.atStartOfDay().format(outputFormatter);
+// } else {
+// System.out.println("No date found in: " + input);
+// return null;
+// }
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null;
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // Parse the ISO 8601 date string (e.g., "2025-03-17T12:37:33.033Z")
+// ZonedDateTime zdt = ZonedDateTime.parse(dateStr, DateTimeFormatter.ISO_DATE_TIME);
+//
+// // Define the output format (yyyy-MM-dd hh:mm:ss)
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // Format the date to the desired output
+// return zdt.format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // Or throw an exception, depending on your needs
+// }
+// }
+ public static String convertToTimestamp(String dateStr) {
+     // Input looks like "Jan. 9, 2025": English short month, a period, day of month, a comma, then the year.
+     final String sourcePattern = "MMM. d, yyyy";
+     // Output is "yyyy-MM-dd HH:mm:ss"; the time of day is always midnight.
+     final String targetPattern = "yyyy-MM-dd HH:mm:ss";
+     try {
+         LocalDateTime midnight = LocalDate
+                 .parse(dateStr, DateTimeFormatter.ofPattern(sourcePattern, Locale.ENGLISH)).atStartOfDay();
+         return DateTimeFormatter.ofPattern(targetPattern).format(midnight);
+     } catch (Exception e) { // any failure, including a null argument, is logged and mapped to null
+         e.printStackTrace();
+         return null;
+     }
+ }
+ // Calls the local proxy service to obtain a proxy address.
+ private static String getProxyFromLocalService() throws Exception {
+     OkHttpClient httpClient = new OkHttpClient();
+     Request proxyRequest = new Request.Builder().url("http://127.0.0.1:7897").get().build();
+
+     // try-with-resources closes the response whether we return or throw.
+     try (Response proxyResponse = httpClient.newCall(proxyRequest).execute()) {
+         // Guard clause: any unsuccessful status aborts with the status code in the message.
+         if (!proxyResponse.isSuccessful()) {
+             throw new Exception("获取代理失败，状态码: " + proxyResponse.code());
+         }
+         // The response body is handed back verbatim as a string.
+         String payload = proxyResponse.body().string();
+         return payload;
+     }
+ }
+}
+
diff --git a/src/main/java/com/example/oook.java b/src/main/java/com/example/oook.java
new file mode 100644
index 0000000..d8c24d9
--- /dev/null
+++ b/src/main/java/com/example/oook.java
@@ -0,0 +1,524 @@
+package com.example;
+
+import okhttp3.*;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.Proxy;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.*;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class oook {
+
+
+ public static void main(String[] args) throws Exception { // Ad-hoc probe: fetches one article page and prints the fields extracted from it.
+     // 1. Fetch the proxy address (disabled)
+// String proxyJson = getProxyFromLocalService();
+// JSONObject proxyData = new JSONObject(proxyJson);
+// String httpProxy = proxyData.getString("http"); // e.g. "http://proxy1:port"
+//
+// // 2. Split the proxy address into host and port
+// String[] proxyParts = httpProxy.replace("http://", "").split(":");
+// String proxyHost = proxyParts[0]; // proxy1
+// int proxyPort = Integer.parseInt(proxyParts[1]); // port
+     OkHttpClient client = new OkHttpClient().newBuilder()
+             .connectTimeout(30, TimeUnit.SECONDS)
+             .readTimeout(30, TimeUnit.SECONDS)
+             .writeTimeout(30, TimeUnit.SECONDS)
+// .cookieJar(new CookieJar() {
+// private final HashMap<String, List<Cookie>> cookieStore = new HashMap<>();
+//
+// @Override
+// public void saveFromResponse(HttpUrl url, List<Cookie> cookies) {
+// cookieStore.put(url.host(), cookies); // store the cookies
+// }
+//
+// @Override
+// public List<Cookie> loadForRequest(HttpUrl url) {
+// List<Cookie> cookies = cookieStore.get(url.host());
+// return cookies != null ? cookies : new ArrayList<>();
+// }
+// })
+// .followRedirects(true) // follow redirects automatically
+             .build();
+
+
+     // Send the target request, picking up and reusing cookies automatically
+// Request request = new Request.Builder()
+// .url("https://thl.fi/aiheet/infektiotaudit-ja-rokotukset/ajankohtaista/infektio-ja-rokotusuutiset?p_p_id=com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_redirect=%2Faiheet%2Finfektiotaudit-ja-rokotukset%2Fajankohtaista%2Finfektio-ja-rokotusuutiset&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_delta=50&p_r_p_resetCur=false&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_cur=1")
+// .get()
+// .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
+// .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
+// .addHeader("Accept-Language", "en-US,en;q=0.5")
+// .addHeader("Cookie", "__cf_bm=HXf4OleH9DiJmEagV_4Wori6vFzyN4wf.CBVL57AQUI-1743471952-1.0.1.1-h0KqPKUW2_wblBJ1HWbn50Xi1EPDIxjvFhRyrkdPrAoRHNjlXk..tK_KDWGUs6f4Z1VbQUbJD1Vw3KTi9IYO5bx5af4ZqE2nABBXT.YpLKQ; _cfuvid=jdweOOZm.a8GWXZGqRHb.fiSFMKZuAppyOlkDBbafw0-1743471952167-0.0.1.1-604800000") .build();
+// OkHttpClient client = new OkHttpClient().newBuilder()
+// .connectTimeout(30, TimeUnit.SECONDS)
+// .readTimeout(30, TimeUnit.SECONDS)
+// .writeTimeout(30, TimeUnit.SECONDS)
+//// .proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897))) // use port 7897 directly
+// .build();
+     String url = "https://www.iranintl.com/en/202504116060";
+     Request request = new Request.Builder() // plain GET, so no media type or request body is needed
+             .url(url)
+             .get()
+             .build();
+     String html;
+     try (Response response = client.newCall(request).execute()) { // closing the response releases the connection
+         html = response.body().string();
+     }
+     Document parse = Jsoup.parse(html);
+// String htmlData = null;
+// JSONArray jsonArray = new JSONArray(html);
+// for (int i = 0; i < jsonArray.length(); i++) {
+// JSONObject obj = jsonArray.getJSONObject(i);
+// if ("insert".equals(obj.optString("command")) && obj.has("data")) {
+// htmlData = obj.getString("data");
+// break;
+// }
+// }
+// Document doc = Jsoup.parse(htmlData);
+// Elements rows = doc.select(".o-grid__item.col-1, .o-grid__item.col-2, .o-grid__item.col-3");
+//
+// Set<String> uniqueHrefs = new HashSet<>();
+//
+// for (Element row : rows) {
+// Elements links = row.select("a[href]"); // select every a tag
+// if (!links.isEmpty()) {
+// // keep only the first href
+// String href = links.first().attr("href");
+// uniqueHrefs.add(href);
+// }
+// }
+////
+// for (String href : uniqueHrefs) {
+// System.out.println("Href: " + href);
+// }
+// String next = getNextPageUrl(url);
+// System.out.println(next);
+
+// JSONObject jsonObject = new JSONObject(html);
+// JSONObject response1 = jsonObject.getJSONObject("response");
+// JSONArray docs = response1.getJSONArray("docs");
+//
+// // walk the docs array and pull out each permalink
+// for (int i = 0; i < docs.length(); i++) {
+// JSONObject doc = docs.getJSONObject(i);
+// String permalink = doc.getString("permalink");
+// System.out.println("Permalink: " + permalink);
+// }
+
+// String url = "https://www.uu.se/nyheter/alla?newsResearch=researchtopic11%3Bresearchtopic7%3Bresearchtopic22%3Bresearchtopic10%3Bresearchtopic2&start=20";
+// // define the regular expression
+// String regex = "start=(\\d+)";
+// Pattern pattern = Pattern.compile(regex);
+// Matcher matcher = pattern.matcher(url);
+// Integer start = 0;
+// String postTime = convertToTimestamp(
+// parse.select(".c-news-info__date.o-meta span.c-date").text().trim() + " " +
+// parse.select(".c-news-info__date.o-meta span.c-year").text().trim()
+// );
+// String postTime = parse.select("tr:nth-child(3) td:nth-child(3)").text()+" 00:00:00";
+     String postTime = convertIsoToTimestamp(parse.select(".WrittenContentBlock-module__9pvVhW__timeAgo time").attr("datetime"));
+     String title = parse.select(".WrittenContentBlock-module__9pvVhW__headline").text();
+     String content = parse.select(".WrittenContentBlock-module__9pvVhW__body p").text();
+     String forwardcontent = parse.select(".page").html();
+     Elements imgs = parse.select(".page img");
+// Elements pdfs = parse.select("tr:nth-child(3) td a");
+
+     String prefix = ""; // site prefix for relative image paths; empty here
+
+     List<String> imgList = new ArrayList<>();
+
+     for (Element img : imgs) {
+         String src = img.attr("src");
+         if (src != null && !src.isEmpty()) {
+             // Absolute URLs (http:// or https://) are used as they are
+             String fullUrl;
+             if (!src.startsWith("http://") && !src.startsWith("https://")) {
+                 // Anything else is joined onto the prefix, adding a slash when src lacks a leading one
+                 if (src.startsWith("/")) {
+                     fullUrl = prefix + src;
+                 } else {
+                     fullUrl = prefix + "/" + src;
+                 }
+             } else {
+                 fullUrl = src;
+             }
+             // Entry format: <url>###<type>
+             String imgUrl = fullUrl + "###" + "avif";
+             imgList.add(imgUrl);
+         }
+     }
+
+// String prefix = "";
+//
+// List<String> fileList = new ArrayList<>();
+//
+// for (Element pdf : pdfs) {
+// String pdfUrl = pdf.attr("href");
+// if (pdfUrl != null && !pdfUrl.isEmpty()) {
+// // check whether it starts with https
+// String fullUrl;
+// if (!pdfUrl.startsWith("https")) {
+// // not starting with https: join it onto the prefix
+// if (pdfUrl.startsWith("/")) {
+// fullUrl = prefix + pdfUrl;
+// } else {
+// fullUrl = prefix + "/" + pdfUrl;
+// }
+// } else {
+// fullUrl = pdfUrl;
+// }
+// // entry format
+// String fileUrl = fullUrl + "###" + "pdf";
+// fileList.add(fileUrl);
+// }
+// }
+//
+
+
+// if (matcher.find()) {
+// start = Integer.parseInt(matcher.group(1));
+// System.out.println("Start: " + start); // start = 12
+// }
+
+// Elements allLinks = new Elements();
+// Elements links = parse.select(".card-body a");
+// allLinks.addAll(links);
+//
+// int totalLinks = allLinks.size();
+// int startIndex = Math.max(0, totalLinks - 10);
+// for (int i = startIndex; i < totalLinks; i++) {
+// Map task = new HashMap(16);
+// task.put("link","https://www.uu.se"+allLinks.get(i).attr("href"));
+// task.put("linktype", "newscontent"); // set the link type to "newscontent"
+//
+// System.out.println(task);
+// }
+
+// Elements elements = parse.select(".topic__grid__item a");
+// Integer count = elements.size();
+// for (Element element : elements) {
+// String link = element.attr("href"); // read the href attribute of the news link
+// System.out.println(link);
+// }
+
+// if(count <10){
+// String nextpageurl = getPreviousYearUrl(url);
+// System.out.println(nextpageurl);
+// }else {
+// String nextpageurl = getNextPageUrl(url);
+// System.out.println(nextpageurl);
+// }
+     Map<String, Object> map = new HashMap<>(); // values are strings plus the image list
+     map.put("postTime",postTime);
+     map.put("title",title);
+     map.put("content",content);
+     map.put("forwardcontent",forwardcontent);
+     map.put("imgList",imgList);
+// map.put("fileList",fileList);
+     System.out.println(map);
+
+ }
+ public oook() throws IOException { // No-op constructor; it declares IOException although its body is empty.
+ }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式:dd MMMM , yyyy(例如 "28 February , 2025")
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM dd, yyyy", Locale.ENGLISH);
+// // 定义输出格式:yyyy-MM-dd HH:mm:ss
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+// dateStr = dateStr.replace("|", "").trim();
+// // 解析输入日期
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为带时间的格式,时间设为 00:00:00
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或抛出异常,根据需求调整
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 去掉 "Publié le" 前缀并清理多余字符
+// dateStr = dateStr.replace("Publié le", "").trim();
+//
+// // 定义输入格式:dd MMMM yyyy(例如 "25 mars 2025")
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd MMMM yyyy", Locale.FRENCH);
+//
+// // 定义输出格式:yyyy-MM-dd HH:mm:ss
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入日期
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//
+// // 转换为带时间的格式,时间设为 00:00:00
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或抛出异常,根据需求调整
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式:yyyy 年 MM 月 dd 日
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MM-dd-yyyy", Locale.CHINESE);
+// // 定义输出格式:yyyy-MM-dd HH:mm:ss
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入日期
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为带时间的格式,时间设为 00:00:00
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或抛出异常,根据需求调整
+// }
+// }
+
+ // public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy");
+// // 定义输出格式
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入字符串为 LocalDate
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为 LocalDateTime,设置时间为 00:00:00
+// LocalDateTime dateTime = date.atStartOfDay();
+// // 格式化为目标字符串
+// return dateTime.format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或者抛出异常,根据需求调整
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 定义输入格式:MMMM d, yyyy(例如 "June 3, 2015")
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM, yyyy", Locale.ENGLISH);
+// // 定义输出格式:yyyy-MM-dd HH:mm:ss
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // 解析输入日期
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// // 转换为带时间的格式,时间设为 00:00:00
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // 或抛出异常,根据需求调整
+// }
+// }
+// public static String convertToTimestamp(String input) {
+// try {
+// // 正则匹配 "d MMMM yyyy"
+// Pattern pattern = Pattern.compile("\\d{1,2} [A-Za-z]+ \\d{4}");
+// Matcher matcher = pattern.matcher(input);
+// if (matcher.find()) {
+// String dateStr = matcher.group();
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM yyyy", Locale.ENGLISH);
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+// return date.atStartOfDay().format(outputFormatter);
+// } else {
+// System.out.println("No date found in: " + input);
+// return null;
+// }
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null;
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // Parse the ISO 8601 date string (e.g., "2025-03-17T12:37:33.033Z")
+// ZonedDateTime zdt = ZonedDateTime.parse(dateStr, DateTimeFormatter.ISO_DATE_TIME);
+//
+// // Define the output format (yyyy-MM-dd hh:mm:ss)
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+// // Format the date to the desired output
+// return zdt.format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null; // Or throw an exception, depending on your needs
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // Parse "Jan. 9, 2025" (abbreviated month, dot, comma-separated)
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMM d, yyyy", Locale.ENGLISH);
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//
+// // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00)
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null;
+// }
+// }
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 从文本中提取修改日期
+// String modifiedDateStr = extractModifiedDate(dateStr);
+// if (modifiedDateStr == null) {
+// throw new IllegalArgumentException("无法找到修改日期");
+// }
+//
+// // Parse "20/12/2024" (day/month/year format, Italian style)
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy", Locale.ITALIAN);
+// LocalDate date = LocalDate.parse(modifiedDateStr, inputFormatter);
+//
+// // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00)
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null;
+// }
+// }
+ public static String convertIsoToTimestamp(String dateStr) { // ISO-8601 instant -> "yyyy-MM-dd HH:mm:ss" at UTC, or null when unusable
+     // A missing attribute arrives as null or an empty string; treat it as "no date" rather than as an error.
+     if (dateStr == null || dateStr.trim().isEmpty()) {
+         return null;
+     }
+     try {
+         // The instant is rendered at UTC (ZoneOffset.UTC), not in the system default time zone.
+         LocalDateTime utcDateTime = LocalDateTime.ofInstant(Instant.parse(dateStr.trim()), ZoneOffset.UTC);
+         return utcDateTime.format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
+     } catch (DateTimeException e) { // covers DateTimeParseException as well as out-of-range instants
+         System.err.println("Unparseable ISO-8601 timestamp '" + dateStr + "': " + e.getMessage());
+         return null;
+     }
+ }
+
+// public static String convertToTimestamp(String dateStr) {
+// try {
+// // 创建捷克语的日期格式器,解析 "27. listopadu 2024"
+// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d. MMMM yyyy", new Locale("cs", "CZ"));
+// LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//
+// // 转换为 "yyyy-MM-dd HH:mm:ss" 格式,默认时间为 00:00:00
+// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+// return date.atStartOfDay().format(outputFormatter);
+// } catch (Exception e) {
+// e.printStackTrace();
+// return null;
+// }
+// }
+ // Finds the modification date inside a block of text.
+ private static String extractModifiedDate(String text) {
+     // Only lines mentioning "Modificato" are searched; the first dd/MM/yyyy-shaped token found wins.
+     for (String candidateLine : text.split("\n")) {
+         if (!candidateLine.contains("Modificato")) {
+             continue;
+         }
+         for (String token : candidateLine.split("\\s+")) {
+             boolean looksLikeDate = token.matches("\\d{2}/\\d{2}/\\d{4}");
+             if (looksLikeDate) {
+                 return token; // e.g. "20/12/2024"
+             }
+         }
+     }
+     return null; // no date-shaped token on any "Modificato" line
+ }
+ // Calls the local proxy service to obtain a proxy address.
+ private static String getProxyFromLocalService() throws Exception {
+     OkHttpClient httpClient = new OkHttpClient();
+     Request proxyRequest = new Request.Builder().url("http://127.0.0.1:7897").get().build();
+
+     // try-with-resources closes the response whether we return or throw.
+     try (Response proxyResponse = httpClient.newCall(proxyRequest).execute()) {
+         // Guard clause: any unsuccessful status aborts with the status code in the message.
+         if (!proxyResponse.isSuccessful()) {
+             throw new Exception("获取代理失败，状态码: " + proxyResponse.code());
+         }
+         // The response body is handed back verbatim as a string.
+         String payload = proxyResponse.body().string();
+         return payload;
+     }
+ }
+ public static String getNextPageUrl(String currentUrl) {
+     // Returns currentUrl with its Liferay "..._cur=<n>" page parameter advanced to n + 1, or null
+     // when the URL is null/blank, carries no such parameter, or the digits do not parse as an int.
+     boolean nothingToWorkWith = currentUrl == null || currentUrl.trim().isEmpty();
+     if (nothingToWorkWith) {
+         return null;
+     }
+
+// // Base URL of an earlier target site (disabled)
+// String baseUrl = "https://www.pasteur.dz/fr/espace-presse";
+//
+// // Given the bare base URL, go straight to its next page
+// if (currentUrl.equals(baseUrl)) {
+// return baseUrl + "?start=5";
+// }
+
+     // Capture group 1 holds the digits of the current page number.
+     Matcher pageMatcher = Pattern
+             .compile("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=(\\d+)")
+             .matcher(currentUrl);
+     if (!pageMatcher.find()) {
+         return null; // no page parameter in the URL
+     }
+
+     int nextPage;
+     try {
+         nextPage = Integer.parseInt(pageMatcher.group(1)) + 1;
+     } catch (NumberFormatException e) {
+         return null; // the page number could not be parsed
+     }
+     // Only the first occurrence of the parameter is rewritten.
+     return pageMatcher.replaceFirst("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=" + nextPage);
+ }
+ public static String getPreviousYearUrl(String url) {
+     // Returns url with its Liferay "..._year=<yyyy>" parameter (if present) moved back one year and its
+     // "..._cur=<n>" page parameter reset to 1. A missing page parameter is appended as a proper query
+     // parameter. Null or blank input gives null.
+     if (url == null || url.trim().isEmpty()) {
+         return null;
+     }
+
+     // Step 1: decrement the four-digit year when the URL carries one.
+     String yearParam = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_year=";
+     Matcher yearMatcher = Pattern.compile(yearParam + "(\\d{4})").matcher(url);
+     if (yearMatcher.find()) {
+         int previousYear = Integer.parseInt(yearMatcher.group(1)) - 1; // four digits always fit an int
+         url = yearMatcher.replaceFirst(yearParam + previousYear);
+     }
+
+     // Step 2: restart paging at page 1.
+     String pageParam = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=";
+     Matcher pageMatcher = Pattern.compile(pageParam + "(\\d+)").matcher(url);
+     if (pageMatcher.find()) {
+         return pageMatcher.replaceFirst(pageParam + "1");
+     }
+
+     // No page parameter yet: append one with the right query separator instead of gluing it onto the URL.
+     String separator;
+     if (url.endsWith("?") || url.endsWith("&")) {
+         separator = "";
+     } else if (url.contains("?")) {
+         separator = "&";
+     } else {
+         separator = "?";
+     }
+     return url + separator + pageParam + "1";
+ }
+}
+
diff --git a/src/main/java/com/example/projTopic.java b/src/main/java/com/example/projTopic.java
new file mode 100644
index 0000000..f2377a7
--- /dev/null
+++ b/src/main/java/com/example/projTopic.java
@@ -0,0 +1,403 @@
+package com.example;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import okhttp3.*;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.apache.kafka.common.serialization.StringSerializer;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class projTopic {
+ private static final String TOPIC_NAME = "projTopic";
+ private static final String BOOTSTRAP_SERVERS = "node-01:19092"; // value of the producer's BOOTSTRAP_SERVERS_CONFIG
+ private static KafkaProducer<String, String> producer; // assigned in the static initializer; keys and values use the String serializer
+ private static ObjectMapper objectMapper = new ObjectMapper();
+ private static final Random random = new Random();
+ private static List<String> proxyList = new ArrayList<>(); // proxy pool, one entry per line of proxy.txt
+ private static int currentProxyIndex = 0; // index of the proxy currently in use
+ // Class initializer: builds the shared Kafka producer, then loads the proxy pool from proxy.txt.
+ static {
+     final String stringSerializer = "org.apache.kafka.common.serialization.StringSerializer";
+     Properties kafkaSettings = new Properties();
+     kafkaSettings.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
+     kafkaSettings.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, stringSerializer);
+     kafkaSettings.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, stringSerializer);
+     kafkaSettings.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for acknowledgement from all replicas
+     kafkaSettings.put(ProducerConfig.RETRIES_CONFIG, 3); // number of retries
+     producer = new KafkaProducer<>(kafkaSettings);
+     try {
+         proxyList = Files.readAllLines(Paths.get("proxy.txt"));
+         // An empty file only produces a warning; otherwise the number of loaded lines is reported.
+         System.out.println(proxyList.isEmpty() ? "警告: proxy.txt 为空,未加载任何代理" : "成功加载 " + proxyList.size() + " 个代理");
+     } catch (IOException readFailure) {
+         // A read failure is reported on stderr and proxyList is left unchanged.
+         System.err.println("读取 proxy.txt 失败: " + readFailure.getMessage());
+     }
+ }
+
+ public static void main(String[] args) throws IOException, InterruptedException {
+ List keywords = Files.readAllLines(Paths.get("keywords.txt"));
+ List cleanedKeywords = new ArrayList<>();
+ for (String keyword : keywords) {
+ String cleaned = keyword.split(",")[0].trim(); // 取逗号前的部分并去除首尾空格
+ cleaned = cleaned.replaceAll("\\s+", "+"); // 替换所有空格为 +
+ cleanedKeywords.add(cleaned);
+ }
+ ExecutorService executor = Executors.newFixedThreadPool(4); // 4 个线程
+ for (String keyword : cleanedKeywords) {
+ executor.submit(() -> {
+ try {
+ int sleepTime = random.nextInt(1001) + 30000;
+ String load = "5|0|20|https://www.nsf.gov/awardsearch/jsp/gwt/search/|57BE5CA45E781DC0159F727F8A8205EB|gov.nsf.research.awardsearch.gwt.client.SearchAwardService|getAwards|gov.nsf.research.awardsearch.gwt.bean.SearchRequestBean/3930579236|com.extjs.gxt.ui.client.data.PagingLoadConfig|java.util.HashMap/962170901|java.lang.String/2004016611|QueryText|" + keyword + "|ActiveAwards|true|com.extjs.gxt.ui.client.data.BasePagingLoadConfig/2011366567|com.extjs.gxt.ui.client.data.RpcMap/3441186752|sortField|sortDir|com.extjs.gxt.ui.client.Style$SortDir/640452531|offset|java.lang.Integer/3438268394|limit|1|2|3|4|2|5|6|5|7|2|8|9|8|10|8|11|8|12|13|0|1|14|4|15|0|16|17|0|18|19|0|20|19|30|";
+ for(int i=0;;i++){
+ OkHttpClient client = createClientWithProxy();
+ MediaType mediaType = MediaType.parse("text/x-gwt-rpc; charset=UTF-8");
+ RequestBody body = RequestBody.create(mediaType, load);
+
+ Request request = new Request.Builder()
+ .url("https://www.nsf.gov/awardsearch/jsp/gwt/search/.searchaward")
+ .method("POST", body)
+ .addHeader("Content-Type", "text/x-gwt-rpc; charset=UTF-8")
+ .addHeader("X-GWT-Module-Base", "https://www.nsf.gov/awardsearch/jsp/gwt/search/")
+ .addHeader("X-GWT-Permutation", "368C3CF86AA4CD7DB2250B35B844C1C2")
+// .addHeader("cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB")
+ .build();
+ Response response = executeWithRetry(client, request, keyword);
+ String content = response.body().string();
+
+ Pattern pattern = Pattern.compile("\"awdNumber\",\"(\\d+)\"");
+ Matcher matcher = pattern.matcher(content);
+
+ List numbers = new ArrayList<>(); // 用于存储匹配的数字
+ // 查找并提取数字
+ List additionalNumbers = new ArrayList<>();
+ List urls = new ArrayList<>();
+ // 查找匹配项
+ while (matcher.find()) {
+ // 获取捕获到的数字,并将其添加到列表中
+ numbers.add(matcher.group(1));
+ }
+
+ // 输出捕获到的数字
+ if (numbers.isEmpty()) {
+ System.out.println("没找到awdNumber,继续下一种查找");
+
+ } else {
+ for (String number : numbers) {
+ additionalNumbers.add(number);
+ }
+ }
+
+ Pattern additionalPattern = Pattern.compile("\"[^\"]+\",\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\"(?:,\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\")?,\"(\\d+)\"");
+ Matcher additionalMatcher = additionalPattern.matcher(content);
+
+
+ while (additionalMatcher.find()) {
+ additionalNumbers.add(additionalMatcher.group(1));
+ }
+ if (additionalNumbers.isEmpty()) {
+ System.out.println("没找到下一页内容链接");
+ Thread.sleep(sleepTime);
+ break;
+ } else {
+ for (String number : additionalNumbers) {
+ String url = "https://www.nsf.gov/awardsearch/showAward?AWD_ID=" + number + "&HistoricalAwards=false";
+ urls.add(url);
+ }
+ }
+ if (!urls.isEmpty() && urls.get(0).equals("https://www.nsf.gov/awardsearch/showAward?AWD_ID=2446604&HistoricalAwards=false")) {
+ System.out.println("第一个 URL 是 AWD_ID=2446604,跳过关键词: " + keyword);
+ Thread.sleep(sleepTime);
+ return; // 跳出当前任务,处理下一个关键词
+ }
+ for(String url:urls){
+ OkHttpClient client2 = createClientWithProxy();
+ MediaType mediaType2 = MediaType.parse("text/plain");
+ RequestBody body2 = RequestBody.create(mediaType2, "");
+ Request request2 = new Request.Builder()
+ .url(url)
+ .get()
+// .addHeader("Cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB")
+ .build();
+ Response response2 = executeWithRetry(client2, request2, keyword);
+ System.out.println(response2.code());
+ String html = response2.body().string();
+ Document parse = Jsoup.parse(html);
+ String title = parse.select(".pageheadline").text();
+ String projectNum = parse.select(".clear tr:nth-child(5) .tabletext2:nth-child(2)").text();
+ String projectLeader = parse.select(".clear tr:nth-child(13) .tabletext2:nth-child(2)").text();
+ String projectStartTime = convertToTimestamp(parse.select(".clear tr:nth-child(8) .tabletext2:nth-child(2)").text());
+ String projectEndTime = convertToTimestamp2(parse.select(".clear tr:nth-child(9) .tabletext2:nth-child(2)").text());
+ String sponsorPart = parse.select(".clear tr:nth-child(2) .tabletext2:nth-child(2)").text();
+ String country = "USA";
+ String brief = parse.select(".clear.margintop25 span").text();
+ String sponsor = parse.select(".clear tr:nth-child(1) .tabletext2:nth-child(2)").text();
+ String projectFunding = parse.select(".clear tr:nth-child(12) .tabletext2:nth-child(2)").text();
+ String relatedProject = parse.select(".clear tr:nth-child(20) .tabletext2:nth-child(2)").text();
+
+
+
+ String awardInstrument = parse.select(".clear tr:nth-child(6) .tabletext2:nth-child(2)").text();
+ String programManager = parse.select(".clear tr:nth-child(7) .tabletext2:nth-child(2)").text();
+ String totalIntendedAwardAmount = parse.select(".clear tr:nth-child(10) .tabletext2:nth-child(2)").text();
+ String totalAwardedAmountToDate = parse.select(".clear tr:nth-child(11) .tabletext2:nth-child(2)").text();
+ String recipientSponsoredResearchOffice = parse.select(".clear tr:nth-child(14) .tabletext2:nth-child(2)").text();
+ String sponsorCongressionalDistrict = parse.select(".clear tr:nth-child(15) .tabletext2:nth-child(2)").text();
+ String primaryPlaceOfPerformance = parse.select(".clear tr:nth-child(16) .tabletext2:nth-child(2)").text();
+ String primaryPlaceOfPerformanceCongressionalDistrict = parse.select(".clear tr:nth-child(17) .tabletext2:nth-child(2)").text();
+ String uniqueEntityIdentifier = parse.select(".clear tr:nth-child(18) .tabletext2:nth-child(2)").text();
+ String parentUEI = parse.select(".clear tr:nth-child(19) .tabletext2:nth-child(2)").text();
+ String primaryProgramSource = parse.select(".clear tr:nth-child(21) .tabletext2:nth-child(2)").text();
+ String programReferenceCode = parse.select(".clear tr:nth-child(22) .tabletext2:nth-child(2)").text();
+ String programElementCode = parse.select(".clear tr:nth-child(23) .tabletext2:nth-child(2)").text();
+ String awardAgencyCode = parse.select(".clear tr:nth-child(24) .tabletext2:nth-child(2)").text();
+ String fundAgencyCode = parse.select(".clear tr:nth-child(25) .tabletext2:nth-child(2)").text();
+ String assistanceListingNumber = parse.select(".clear tr:nth-child(26) .tabletext2:nth-child(2)").text();
+ String initialAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(3) .tabletext2:nth-child(2)").text());
+ String latestAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(4) .tabletext2:nth-child(2)").text());
+
+ List