commit 86b867f54155800bb9111936a9b24acd7bb64837 Author: guanjz <1826473923@qq.com> Date: Mon May 19 10:41:53 2025 +0800 本地部署 diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..e6b77de --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 0000000..712ab9d --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..ae9c995 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000..e96534f --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.project b/.project new file mode 100644 index 0000000..a8309f9 --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + DaKaES + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/NsantegouvListRe.jar b/NsantegouvListRe.jar new file mode 100644 index 0000000..57433ef Binary files /dev/null and b/NsantegouvListRe.jar differ diff --git 
a/bin/.idea/.gitignore b/bin/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/bin/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/bin/.idea/compiler.xml b/bin/.idea/compiler.xml new file mode 100644 index 0000000..e6b77de --- /dev/null +++ b/bin/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/bin/.idea/jarRepositories.xml b/bin/.idea/jarRepositories.xml new file mode 100644 index 0000000..712ab9d --- /dev/null +++ b/bin/.idea/jarRepositories.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/bin/.idea/misc.xml b/bin/.idea/misc.xml new file mode 100644 index 0000000..ae9c995 --- /dev/null +++ b/bin/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/bin/.project b/bin/.project new file mode 100644 index 0000000..a8309f9 --- /dev/null +++ b/bin/.project @@ -0,0 +1,23 @@ + + + DaKaES + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/bin/hs_err_pid15760.log b/bin/hs_err_pid15760.log new file mode 100644 index 0000000..3459d43 --- /dev/null +++ b/bin/hs_err_pid15760.log @@ -0,0 +1,167 @@ +# +# There is insufficient memory for the Java Runtime Environment to continue. 
+# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap +# Possible reasons: +# The system is out of physical RAM or swap space +# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap +# Possible solutions: +# Reduce memory load on the system +# Increase physical memory or swap space +# Check if swap backing store is full +# Decrease Java heap size (-Xmx/-Xms) +# Decrease number of Java threads +# Decrease Java thread stack sizes (-Xss) +# Set larger code cache with -XX:ReservedCodeCacheSize= +# JVM is running with Zero Based Compressed Oops mode in which the Java heap is +# placed in the first 32GB address space. The Java Heap base address is the +# maximum limit for the native heap growth. Please use -XX:HeapBaseMinAddress +# to set the Java Heap base and to place the Java Heap above 32GB virtual address. +# This output file may be truncated or incomplete. +# +# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334 +# +# JRE version: (8.0_422-b05) (build ) +# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops) +# Failed to write core dump. 
Minidumps are not enabled by default on client versions of Windows +# + +--------------- T H R E A D --------------- + +Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] + +Stack: [0x00000082a1500000,0x00000082a1600000] +[error occurred during error reporting (printing stack bounds), id 0xc0000005] + +Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code) + + +--------------- P R O C E S S --------------- + +Java Threads: ( => current thread ) + +Other Threads: + +=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] + +VM state:not at safepoint (normal execution) + +VM Mutex/Monitor currently owned by a thread: None + +heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3 +Narrow klass base: 0x0000000000000000, Narrow klass shift: 3 +Compressed class space size: 1073741824 Address: 0x00000007c0000000 + +Heap: + PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000) + eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000) + from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000) + to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000) + ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000) + object space 172032K, 0% used [0x00000006c4000000,0x00000006c4000000,0x00000006ce800000) + Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K + class space used 76K, capacity 384K, committed 384K, reserved 1048576K + +Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000 + +Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0 + Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000) + End Bits: [0x00000271cd7a0000, 
0x00000271d16a0000) + +Polling page: 0x00000271b7eb0000 + +CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb + bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000] + total_blobs=57 nmethods=0 adapters=38 + compilation: enabled + +Compilation events (0 events): +No events + +GC Heap History (0 events): +No events + +Deoptimization events (0 events): +No events + +Classes redefined (0 events): +No events + +Internal exceptions (0 events): +No events + +Events (10 events): +Event: 0.012 loading class java/lang/Short +Event: 0.013 loading class java/lang/Short done +Event: 0.013 loading class java/lang/Integer +Event: 0.013 loading class java/lang/Integer done +Event: 0.013 loading class java/lang/Long +Event: 0.013 loading class java/lang/Long done +Event: 0.013 loading class java/lang/NullPointerException +Event: 0.013 loading class java/lang/NullPointerException done +Event: 0.013 loading class java/lang/ArithmeticException +Event: 0.013 loading class java/lang/ArithmeticException done + + +Dynamic libraries: +0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe +0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll +0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL +0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll +0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll +0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll +0x00007ffa1cdf0000 - 0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll +0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll +0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll +0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll +0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll +0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll +0x00007ffa1ac80000 - 0x00007ffa1ad9a000 
C:\Windows\System32\gdi32full.dll +0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll +0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll +0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll +0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL +0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll +0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll +0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll +0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL +0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll +0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll +0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll +0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll +0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll +0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll +0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll +0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll +0x00007ffa0af80000 - 0x00007ffa0af89000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll +0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll +0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll + +VM Arguments: +jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8 
+java_command: com.example.saveInES +java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.
jar;C:\Users\18264\.m2\repository\c +Launcher Type: SUN_STANDARD + +Environment Variables: +JAVA_HOME=E:\java +PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node +USERNAME=18264 +OS=Windows_NT +PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel + + + +--------------- S Y S T E M --------------- + +OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438) + +CPU:total 16 (initial active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx + +Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free) + +vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017) + +time: 
Tue Mar 4 14:31:48 2025 +timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel +elapsed time: 0.022707 seconds (0d 0h 0m 0s) + diff --git a/bin/keywords.txt b/bin/keywords.txt new file mode 100644 index 0000000..2358ab6 --- /dev/null +++ b/bin/keywords.txt @@ -0,0 +1,6 @@ +Montpellier Institute of Virology, France +Ontario Public Health Laboratory, Canada +University of Texas Biosafety Laboratory, USA +Korea National Institute of Infectious Diseases (KCDC) +Israel Institute of Life Sciences +Biosafety Laboratory, University of Basel, Switzerland \ No newline at end of file diff --git a/bin/pom.xml b/bin/pom.xml new file mode 100644 index 0000000..730bf94 --- /dev/null +++ b/bin/pom.xml @@ -0,0 +1,138 @@ + + 4.0.0 + com.example + es-crawler + 1.0-SNAPSHOT + + + 8 + 8 + + + + + + org.elasticsearch.client + elasticsearch-rest-high-level-client + 7.17.0 + + + + co.elastic.clients + elasticsearch-java + 7.17.15 + + + com.fasterxml.jackson.core + jackson-databind + 2.15.0 + + + + + org.jsoup + jsoup + 1.17.2 + + + + + com.squareup.okhttp3 + okhttp + 4.9.3 + + + + + org.slf4j + slf4j-api + 1.7.36 + + + ch.qos.logback + logback-classic + 1.2.11 + + + + + org.apache.kafka + kafka-clients + 3.9.0 + + + + + org.seleniumhq.selenium + selenium-java + 4.10.0 + + + + + io.github.bonigarcia + webdrivermanager + 5.6.2 + + + + org.json + json + 20230227 + + + + com.google.code.gson + gson + 2.10.1 + + + + net.sourceforge.htmlunit + htmlunit + 2.61.0 + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 8 + 8 + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + + com.example.projTopic + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + \ No newline at end of file diff --git a/bin/processed_urls.txt b/bin/processed_urls.txt new file mode 100644 index 0000000..f862ec3 --- /dev/null +++ b/bin/processed_urls.txt @@ -0,0 +1,281 @@ + +https://www.zyctd.com/zixun/201/1055143.html 
+https://www.zyctd.com/zixun/201/861786.html +https://www.zyctd.com/zixun/201/1053482.html +https://www.zyctd.com/zixun/201/269419.html +https://www.zyctd.com/zixun/201/1053149.html +https://www.zyctd.com/zixun/201/1023926.html +https://www.zyctd.com/zixun/201/435325.html +https://www.zyctd.com/zixun/201/1050302.html +https://www.zyctd.com/zixun/201/880441.html +https://www.zyctd.com/zixun/201/1019635.html +https://www.zyctd.com/zixun/201/970572.html +https://www.zyctd.com/zixun/201/912277.html +https://www.zyctd.com/zixun/201/372444.html +https://www.zyctd.com/zixun/201/1073629.html +https://www.zyctd.com/zixun/201/1069386.html +https://www.zyctd.com/zixun/201/730410.html +https://www.zyctd.com/zixun/201/953220.html +https://www.zyctd.com/zixun/201/1074339.html +https://www.zyctd.com/zixun/201/1072317.html +https://www.zyctd.com/zixun/201/294794.html +https://www.zyctd.com/zixun/201/267592.html +https://www.zyctd.com/zixun/201/979665.html +https://www.zyctd.com/zixun/201/869885.html +https://www.zyctd.com/zixun/201/1054064.html +https://www.zyctd.com/zixun/201/1049331.html +https://www.zyctd.com/zixun/201/442647.html +https://www.zyctd.com/zixun/201/285992.html +https://www.zyctd.com/zixun/201/1037972.html +https://www.zyctd.com/zixun/201/799801.html +https://www.zyctd.com/zixun/201/916078.html +https://www.zyctd.com/zixun/201/456647.html +https://www.zyctd.com/zixun/201/812121.html +https://www.zyctd.com/zixun/201/1042740.html +https://www.zyctd.com/zixun/201/1042708.html +https://www.zyctd.com/zixun/201/840450.html +https://www.zyctd.com/zixun/201/320749.html +https://www.zyctd.com/zixun/201/496106.html +https://www.zyctd.com/zixun/201/850201.html +https://www.zyctd.com/zixun/201/277145.html +https://www.zyctd.com/zixun/201/299091.html +https://www.zyctd.com/zixun/201/266080.html +https://www.zyctd.com/zixun/201/1051925.html +https://www.zyctd.com/zixun/201/898081.html +https://www.zyctd.com/zixun/201/873280.html +https://www.zyctd.com/zixun/201/703880.html 
+https://www.zyctd.com/zixun/201/873126.html +https://www.zyctd.com/zixun/201/887931.html +https://www.zyctd.com/zixun/201/432742.html +https://www.zyctd.com/zixun/201/1040431.html +https://www.zyctd.com/zixun/201/1040223.html +https://www.zyctd.com/zixun/201/858118.html +https://www.zyctd.com/zixun/201/971286.html +https://www.zyctd.com/zixun/201/458488.html +https://www.zyctd.com/zixun/201/1079381.html +https://www.zyctd.com/zixun/201/263578.html +https://www.zyctd.com/zixun/201/553513.html +https://www.zyctd.com/zixun/201/286229.html +https://www.zyctd.com/zixun/201/285365.html +https://www.zyctd.com/zixun/201/352921.html +https://www.zyctd.com/zixun/201/503267.html +https://www.zyctd.com/zixun/201/391337.html +https://www.zyctd.com/zixun/201/813052.html +https://www.zyctd.com/zixun/201/1053556.html +https://www.zyctd.com/zixun/201/1041197.html +https://www.zyctd.com/zixun/201/287420.html +https://www.zyctd.com/zixun/201/291563.html +https://www.zyctd.com/zixun/201/948250.html +https://www.zyctd.com/zixun/201/289034.html +https://www.zyctd.com/zixun/201/795965.html +https://www.zyctd.com/zixun/201/292962.html +https://www.zyctd.com/zixun/201/975850.html +https://www.zyctd.com/zixun/201/275335.html +https://www.zyctd.com/zixun/201/1031992.html +https://www.zyctd.com/zixun/201/1033886.html +https://www.zyctd.com/zixun/201/999510.html +https://www.zyctd.com/zixun/201/270144.html +https://www.zyctd.com/zixun/201/1055519.html +https://www.zyctd.com/zixun/201/272205.html +https://www.zyctd.com/zixun/201/526059.html +https://www.zyctd.com/zixun/201/456640.html +https://www.zyctd.com/zixun/201/267952.html +https://www.zyctd.com/zixun/201/803469.html +https://www.zyctd.com/zixun/201/270763.html +https://www.zyctd.com/zixun/201/1072987.html +https://www.zyctd.com/zixun/201/265176.html +https://www.zyctd.com/zixun/201/1022141.html +https://www.zyctd.com/zixun/201/290173.html +https://www.zyctd.com/zixun/201/269175.html +https://www.zyctd.com/zixun/201/744991.html 
+https://www.zyctd.com/zixun/201/1019131.html +https://www.zyctd.com/zixun/201/717054.html +https://www.zyctd.com/zixun/201/517358.html +https://www.zyctd.com/zixun/201/1058505.html +https://www.zyctd.com/zixun/201/905515.html +https://www.zyctd.com/zixun/201/287395.html +https://www.zyctd.com/zixun/201/934873.html +https://www.zyctd.com/zixun/201/1051317.html +https://www.zyctd.com/zixun/201/926018.html +https://www.zyctd.com/zixun/201/334511.html +https://www.zyctd.com/zixun/201/845896.html +https://www.zyctd.com/zixun/201/587785.html +https://www.zyctd.com/zixun/201/288376.html +https://www.zyctd.com/zixun/201/851405.html +https://www.zyctd.com/zixun/201/941404.html +https://www.zyctd.com/zixun/201/881855.html +https://www.zyctd.com/zixun/201/602632.html +https://www.zyctd.com/zixun/201/293601.html +https://www.zyctd.com/zixun/201/541809.html +https://www.zyctd.com/zixun/201/335120.html +https://www.zyctd.com/zixun/201/1031137.html +https://www.zyctd.com/zixun/201/960101.html +https://www.zyctd.com/zixun/201/1077142.html +https://www.zyctd.com/zixun/201/1063222.html +https://www.zyctd.com/zixun/201/681466.html +https://www.zyctd.com/zixun/201/1031130.html +https://www.zyctd.com/zixun/201/1073734.html +https://www.zyctd.com/zixun/201/1062186.html +https://www.zyctd.com/zixun/201/1046628.html +https://www.zyctd.com/zixun/201/358892.html +https://www.zyctd.com/zixun/201/285361.html +https://www.zyctd.com/zixun/201/1059889.html +https://www.zyctd.com/zixun/201/297824.html +https://www.zyctd.com/zixun/201/844307.html +https://www.zyctd.com/zixun/201/900524.html +https://www.zyctd.com/zixun/201/1057636.html +https://www.zyctd.com/zixun/201/1010080.html +https://www.zyctd.com/zixun/201/409152.html +https://www.zyctd.com/zixun/201/402782.html +https://www.zyctd.com/zixun/201/770296.html +https://www.zyctd.com/zixun/201/1040602.html +https://www.zyctd.com/zixun/201/606503.html +https://www.zyctd.com/zixun/201/784471.html +https://www.zyctd.com/zixun/201/466097.html 
+https://www.zyctd.com/zixun/201/1071160.html +https://www.zyctd.com/zixun/201/623226.html +https://www.zyctd.com/zixun/201/948264.html +https://www.zyctd.com/zixun/201/293462.html +https://www.zyctd.com/zixun/201/829348.html +https://www.zyctd.com/zixun/201/332369.html +https://www.zyctd.com/zixun/201/907461.html +https://www.zyctd.com/zixun/201/756555.html +https://www.zyctd.com/zixun/201/717915.html +https://www.zyctd.com/zixun/201/262203.html +https://www.zyctd.com/zixun/201/1055787.html +https://www.zyctd.com/zixun/201/432336.html +https://www.zyctd.com/zixun/201/907489.html +https://www.zyctd.com/zixun/201/1014686.html +https://www.zyctd.com/zixun/201/1053320.html +https://www.zyctd.com/zixun/201/480020.html +https://www.zyctd.com/zixun/201/287423.html +https://www.zyctd.com/zixun/201/385289.html +https://www.zyctd.com/zixun/201/1030421.html +https://www.zyctd.com/zixun/201/527648.html +https://www.zyctd.com/zixun/201/972959.html +https://www.zyctd.com/zixun/201/408767.html +https://www.zyctd.com/zixun/201/724887.html +https://www.zyctd.com/zixun/201/291480.html +https://www.zyctd.com/zixun/201/472544.html +https://www.zyctd.com/zixun/201/724873.html +https://www.zyctd.com/zixun/201/281751.html +https://www.zyctd.com/zixun/201/1049693.html +https://www.zyctd.com/zixun/201/869619.html +https://www.zyctd.com/zixun/201/355497.html +https://www.zyctd.com/zixun/201/341623.html +https://www.zyctd.com/zixun/201/450753.html +https://www.zyctd.com/zixun/201/1065837.html +https://www.zyctd.com/zixun/201/1031331.html +https://www.zyctd.com/zixun/201/669727.html +https://www.zyctd.com/zixun/201/1034010.html +https://www.zyctd.com/zixun/201/1054058.html +https://www.zyctd.com/zixun/201/954613.html +https://www.zyctd.com/zixun/201/715584.html +https://www.zyctd.com/zixun/201/1051110.html +https://www.zyctd.com/zixun/201/269963.html +https://www.zyctd.com/zixun/201/1048128.html +https://www.zyctd.com/zixun/201/793207.html +https://www.zyctd.com/zixun/201/284310.html 
+https://www.zyctd.com/zixun/201/282639.html +https://www.zyctd.com/zixun/201/1068138.html +https://www.zyctd.com/zixun/201/340678.html +https://www.zyctd.com/zixun/201/294371.html +https://www.zyctd.com/zixun/201/324277.html +https://www.zyctd.com/zixun/201/1048931.html +https://www.zyctd.com/zixun/201/851398.html +https://www.zyctd.com/zixun/201/263527.html +https://www.zyctd.com/zixun/201/919480.html +https://www.zyctd.com/zixun/201/685442.html +https://www.zyctd.com/zixun/201/428325.html +https://www.zyctd.com/zixun/201/1032698.html +https://www.zyctd.com/zixun/201/1003367.html +https://www.zyctd.com/zixun/201/852315.html +https://www.zyctd.com/zixun/201/283156.html +https://www.zyctd.com/zixun/201/262484.html +https://www.zyctd.com/zixun/201/1065225.html +https://www.zyctd.com/zixun/201/763331.html +https://www.zyctd.com/zixun/201/1066158.html +https://www.zyctd.com/zixun/201/1047744.html +https://www.zyctd.com/zixun/201/842795.html +https://www.zyctd.com/zixun/201/975374.html +https://www.zyctd.com/zixun/201/1055865.html +https://www.zyctd.com/zixun/201/1017367.html +https://www.zyctd.com/zixun/201/1057711.html +https://www.zyctd.com/zixun/201/1074295.html +https://www.zyctd.com/zixun/201/283647.html +https://www.zyctd.com/zixun/201/286896.html +https://www.zyctd.com/zixun/201/1043393.html +https://www.zyctd.com/zixun/201/305888.html +https://www.zyctd.com/zixun/201/487258.html +https://www.zyctd.com/zixun/201/1045652.html +https://www.zyctd.com/zixun/201/1064905.html +https://www.zyctd.com/zixun/201/515636.html +https://www.zyctd.com/zixun/201/1038609.html +https://www.zyctd.com/zixun/201/438083.html +https://www.zyctd.com/zixun/201/297327.html +https://www.zyctd.com/zixun/201/773537.html +https://www.zyctd.com/zixun/201/1043589.html +https://www.zyctd.com/zixun/201/815712.html +https://www.zyctd.com/zixun/201/698595.html +https://www.zyctd.com/zixun/201/269800.html +https://www.zyctd.com/zixun/201/1030332.html +https://www.zyctd.com/zixun/201/422676.html 
+https://www.zyctd.com/zixun/201/290130.html +https://www.zyctd.com/zixun/201/270359.html +https://www.zyctd.com/zixun/201/995604.html +https://www.zyctd.com/zixun/201/1074993.html +https://www.zyctd.com/zixun/201/1054825.html +https://www.zyctd.com/zixun/201/918577.html +https://www.zyctd.com/zixun/201/686527.html +https://www.zyctd.com/zixun/201/297509.html +https://www.zyctd.com/zixun/201/622708.html +https://www.zyctd.com/zixun/201/469870.html +https://www.zyctd.com/zixun/201/844328.html +https://www.zyctd.com/zixun/201/394508.html +https://www.zyctd.com/zixun/201/271744.html +https://www.zyctd.com/zixun/201/1054940.html +https://www.zyctd.com/zixun/201/732818.html +https://www.zyctd.com/zixun/201/1049547.html +https://www.zyctd.com/zixun/201/1059684.html +https://www.zyctd.com/zixun/201/1055301.html +https://www.zyctd.com/zixun/201/962068.html +https://www.zyctd.com/zixun/201/451355.html +https://www.zyctd.com/zixun/201/1056174.html +https://www.zyctd.com/zixun/201/930540.html +https://www.zyctd.com/zixun/201/871656.html +https://www.zyctd.com/zixun/201/363246.html +https://www.zyctd.com/zixun/201/845672.html +https://www.zyctd.com/zixun/201/452965.html +https://www.zyctd.com/zixun/201/1065920.html +https://www.zyctd.com/zixun/201/1058808.html +https://www.zyctd.com/zixun/201/986868.html +https://www.zyctd.com/zixun/201/489785.html +https://www.zyctd.com/zixun/201/307946.html +https://www.zyctd.com/zixun/201/833359.html +https://www.zyctd.com/zixun/201/806969.html +https://www.zyctd.com/zixun/201/1050812.html +https://www.zyctd.com/zixun/201/1033696.html +https://www.zyctd.com/zixun/201/501167.html +https://www.zyctd.com/zixun/201/1078919.html +https://www.zyctd.com/zixun/201/1036495.html +https://www.zyctd.com/zixun/201/1008736.html +https://www.zyctd.com/zixun/201/1054264.html +https://www.zyctd.com/zixun/201/493152.html +https://www.zyctd.com/zixun/201/685456.html +https://www.zyctd.com/zixun/201/995597.html +https://www.zyctd.com/zixun/201/905501.html 
+https://www.zyctd.com/zixun/201/347573.html +https://www.zyctd.com/zixun/201/1045494.html +https://www.zyctd.com/zixun/201/549775.html +https://www.zyctd.com/zixun/201/1037336.html +https://www.zyctd.com/zixun/201/1034972.html +https://www.zyctd.com/zixun/201/653046.html +https://www.zyctd.com/zixun/201/316612.html +https://www.zyctd.com/zixun/201/447064.html +https://www.zyctd.com/zixun/201/307603.html +https://www.zyctd.com/zixun/201/263437.html +https://www.zyctd.com/zixun/201/894490.html +https://www.zyctd.com/zixun/201/368629.html +https://www.zyctd.com/zixun/201/273285.html +https://www.zyctd.com/zixun/201/1059618.html +https://www.zyctd.com/zixun/201/459237.html diff --git a/bin/proxy.txt b/bin/proxy.txt new file mode 100644 index 0000000..199a16c --- /dev/null +++ b/bin/proxy.txt @@ -0,0 +1 @@ +127.0.0.1:7897 \ No newline at end of file diff --git a/bin/src/main/java/com/example/Inka.class b/bin/src/main/java/com/example/Inka.class new file mode 100644 index 0000000..ac137ee Binary files /dev/null and b/bin/src/main/java/com/example/Inka.class differ diff --git a/bin/src/main/java/com/example/NSFAwardCrawler.class b/bin/src/main/java/com/example/NSFAwardCrawler.class new file mode 100644 index 0000000..eb1e050 Binary files /dev/null and b/bin/src/main/java/com/example/NSFAwardCrawler.class differ diff --git a/bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class b/bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class new file mode 100644 index 0000000..ee2edab Binary files /dev/null and b/bin/src/main/java/com/example/PatentscopeSeleniumCrawler.class differ diff --git a/bin/src/main/java/com/example/ProxyIPChecker.class b/bin/src/main/java/com/example/ProxyIPChecker.class new file mode 100644 index 0000000..1b87f7c Binary files /dev/null and b/bin/src/main/java/com/example/ProxyIPChecker.class differ diff --git a/bin/src/main/java/com/example/StringFieldExtractor.class b/bin/src/main/java/com/example/StringFieldExtractor.class new file 
mode 100644 index 0000000..d938b0b Binary files /dev/null and b/bin/src/main/java/com/example/StringFieldExtractor.class differ diff --git a/bin/src/main/java/com/example/getInKa.class b/bin/src/main/java/com/example/getInKa.class new file mode 100644 index 0000000..a9baf48 Binary files /dev/null and b/bin/src/main/java/com/example/getInKa.class differ diff --git a/bin/src/main/java/com/example/jsonGetOk.class b/bin/src/main/java/com/example/jsonGetOk.class new file mode 100644 index 0000000..a9f68b9 Binary files /dev/null and b/bin/src/main/java/com/example/jsonGetOk.class differ diff --git a/bin/src/main/java/com/example/ook.class b/bin/src/main/java/com/example/ook.class new file mode 100644 index 0000000..8a7afb8 Binary files /dev/null and b/bin/src/main/java/com/example/ook.class differ diff --git a/bin/src/main/java/com/example/oook.class b/bin/src/main/java/com/example/oook.class new file mode 100644 index 0000000..a140aa5 Binary files /dev/null and b/bin/src/main/java/com/example/oook.class differ diff --git a/bin/src/main/java/com/example/projTopic.class b/bin/src/main/java/com/example/projTopic.class new file mode 100644 index 0000000..29af390 Binary files /dev/null and b/bin/src/main/java/com/example/projTopic.class differ diff --git a/bin/src/main/java/com/example/saveInES.class b/bin/src/main/java/com/example/saveInES.class new file mode 100644 index 0000000..1bf30e6 Binary files /dev/null and b/bin/src/main/java/com/example/saveInES.class differ diff --git a/bin/src/main/java/com/example/test.class b/bin/src/main/java/com/example/test.class new file mode 100644 index 0000000..95aa04c Binary files /dev/null and b/bin/src/main/java/com/example/test.class differ diff --git a/bin/src/main/java/com/example/test2.class b/bin/src/main/java/com/example/test2.class new file mode 100644 index 0000000..6f03608 Binary files /dev/null and b/bin/src/main/java/com/example/test2.class differ diff --git a/bin/src/main/java/com/example/testContent.class 
b/bin/src/main/java/com/example/testContent.class new file mode 100644 index 0000000..16cc481 Binary files /dev/null and b/bin/src/main/java/com/example/testContent.class differ diff --git a/bin/src/main/java/com/example/umlistTest.class b/bin/src/main/java/com/example/umlistTest.class new file mode 100644 index 0000000..ccfdbcd Binary files /dev/null and b/bin/src/main/java/com/example/umlistTest.class differ diff --git a/bin/target/classes/META-INF/MANIFEST.MF b/bin/target/classes/META-INF/MANIFEST.MF new file mode 100644 index 0000000..38f1f7e --- /dev/null +++ b/bin/target/classes/META-INF/MANIFEST.MF @@ -0,0 +1,4 @@ +Manifest-Version: 1.0 +Build-Jdk-Spec: 22 +Created-By: Maven Integration for Eclipse + diff --git a/bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar b/bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar new file mode 100644 index 0000000..041697a Binary files /dev/null and b/bin/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar differ diff --git a/bin/target/es-crawler-1.0-SNAPSHOT.jar b/bin/target/es-crawler-1.0-SNAPSHOT.jar new file mode 100644 index 0000000..febbb6e Binary files /dev/null and b/bin/target/es-crawler-1.0-SNAPSHOT.jar differ diff --git a/bin/target/maven-archiver/pom.properties b/bin/target/maven-archiver/pom.properties new file mode 100644 index 0000000..c35b816 --- /dev/null +++ b/bin/target/maven-archiver/pom.properties @@ -0,0 +1,5 @@ +#Generated by Maven +#Fri Apr 18 18:29:46 CST 2025 +version=1.0-SNAPSHOT +groupId=com.example +artifactId=es-crawler diff --git a/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 0000000..5f1323f --- /dev/null +++ b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst @@ -0,0 +1 @@ +com\example\projTopic.class diff --git 
a/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 0000000..90e26e0 --- /dev/null +++ b/bin/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1 @@ +F:\workTest\DaKaES\src\main\java\com\example\projTopic.java diff --git a/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst b/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst new file mode 100644 index 0000000..e69de29 diff --git a/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst b/bin/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst new file mode 100644 index 0000000..e69de29 diff --git a/hs_err_pid15760.log b/hs_err_pid15760.log new file mode 100644 index 0000000..3459d43 --- /dev/null +++ b/hs_err_pid15760.log @@ -0,0 +1,167 @@ +# +# There is insufficient memory for the Java Runtime Environment to continue. +# Native memory allocation (malloc) failed to allocate 1048576 bytes for AllocateHeap +# Possible reasons: +# The system is out of physical RAM or swap space +# The process is running with CompressedOops enabled, and the Java Heap may be blocking the growth of the native heap +# Possible solutions: +# Reduce memory load on the system +# Increase physical memory or swap space +# Check if swap backing store is full +# Decrease Java heap size (-Xmx/-Xms) +# Decrease number of Java threads +# Decrease Java thread stack sizes (-Xss) +# Set larger code cache with -XX:ReservedCodeCacheSize= +# JVM is running with Zero Based Compressed Oops mode in which the Java heap is +# placed in the first 32GB address space. The Java Heap base address is the +# maximum limit for the native heap growth. 
Please use -XX:HeapBaseMinAddress +# to set the Java Heap base and to place the Java Heap above 32GB virtual address. +# This output file may be truncated or incomplete. +# +# Out of Memory Error (memory/allocation.inline.hpp:61), pid=15760, tid=0x0000000000003334 +# +# JRE version: (8.0_422-b05) (build ) +# Java VM: OpenJDK 64-Bit Server VM (25.422-b05 mixed mode windows-amd64 compressed oops) +# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows +# + +--------------- T H R E A D --------------- + +Current thread (0x00000271b7d7d800): JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] + +Stack: [0x00000082a1500000,0x00000082a1600000] +[error occurred during error reporting (printing stack bounds), id 0xc0000005] + +Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code) + + +--------------- P R O C E S S --------------- + +Java Threads: ( => current thread ) + +Other Threads: + +=>0x00000271b7d7d800 (exited) JavaThread "Unknown thread" [_thread_in_vm, id=13108, stack(0x00000082a1500000,0x00000082a1600000)] + +VM state:not at safepoint (normal execution) + +VM Mutex/Monitor currently owned by a thread: None + +heap address: 0x00000006c4000000, size: 4032 MB, Compressed Oops mode: Zero based, Oop shift amount: 3 +Narrow klass base: 0x0000000000000000, Narrow klass shift: 3 +Compressed class space size: 1073741824 Address: 0x00000007c0000000 + +Heap: + PSYoungGen total 75264K, used 1290K [0x000000076c000000, 0x0000000771400000, 0x00000007c0000000) + eden space 64512K, 2% used [0x000000076c000000,0x000000076c142900,0x000000076ff00000) + from space 10752K, 0% used [0x0000000770980000,0x0000000770980000,0x0000000771400000) + to space 10752K, 0% used [0x000000076ff00000,0x000000076ff00000,0x0000000770980000) + ParOldGen total 172032K, used 0K [0x00000006c4000000, 0x00000006ce800000, 0x000000076c000000) + object space 172032K, 0% used 
[0x00000006c4000000,0x00000006c4000000,0x00000006ce800000) + Metaspace used 790K, capacity 4480K, committed 4480K, reserved 1056768K + class space used 76K, capacity 384K, committed 384K, reserved 1048576K + +Card table byte_map: [0x00000271c8b70000,0x00000271c9360000] byte_map_base: 0x00000271c5550000 + +Marking Bits: (ParMarkBitMap*) 0x00000000521f38d0 + Begin Bits: [0x00000271c98a0000, 0x00000271cd7a0000) + End Bits: [0x00000271cd7a0000, 0x00000271d16a0000) + +Polling page: 0x00000271b7eb0000 + +CodeCache: size=245760Kb used=328Kb max_used=328Kb free=245431Kb + bounds [0x00000271b97b0000, 0x00000271b9a20000, 0x00000271c87b0000] + total_blobs=57 nmethods=0 adapters=38 + compilation: enabled + +Compilation events (0 events): +No events + +GC Heap History (0 events): +No events + +Deoptimization events (0 events): +No events + +Classes redefined (0 events): +No events + +Internal exceptions (0 events): +No events + +Events (10 events): +Event: 0.012 loading class java/lang/Short +Event: 0.013 loading class java/lang/Short done +Event: 0.013 loading class java/lang/Integer +Event: 0.013 loading class java/lang/Integer done +Event: 0.013 loading class java/lang/Long +Event: 0.013 loading class java/lang/Long done +Event: 0.013 loading class java/lang/NullPointerException +Event: 0.013 loading class java/lang/NullPointerException done +Event: 0.013 loading class java/lang/ArithmeticException +Event: 0.013 loading class java/lang/ArithmeticException done + + +Dynamic libraries: +0x00007ff7d7590000 - 0x00007ff7d75d6000 C:\Users\18264\.jdks\corretto-1.8.0_422\bin\java.exe +0x00007ffa1d0b0000 - 0x00007ffa1d2a8000 C:\Windows\SYSTEM32\ntdll.dll +0x00007ffa1ce90000 - 0x00007ffa1cf52000 C:\Windows\System32\KERNEL32.DLL +0x00007ffa1add0000 - 0x00007ffa1b0cf000 C:\Windows\System32\KERNELBASE.dll +0x00007ffa1c470000 - 0x00007ffa1c51f000 C:\Windows\System32\ADVAPI32.dll +0x00007ffa1cf60000 - 0x00007ffa1cffe000 C:\Windows\System32\msvcrt.dll +0x00007ffa1cdf0000 - 
0x00007ffa1ce8f000 C:\Windows\System32\sechost.dll +0x00007ffa1c580000 - 0x00007ffa1c6a3000 C:\Windows\System32\RPCRT4.dll +0x00007ffa1ada0000 - 0x00007ffa1adc7000 C:\Windows\System32\bcrypt.dll +0x00007ffa1be50000 - 0x00007ffa1bfed000 C:\Windows\System32\USER32.dll +0x00007ffa1a7a0000 - 0x00007ffa1a7c2000 C:\Windows\System32\win32u.dll +0x00007ffa1bff0000 - 0x00007ffa1c01b000 C:\Windows\System32\GDI32.dll +0x00007ffa1ac80000 - 0x00007ffa1ad9a000 C:\Windows\System32\gdi32full.dll +0x00007ffa1aaa0000 - 0x00007ffa1ab3d000 C:\Windows\System32\msvcp_win.dll +0x00007ffa1a9a0000 - 0x00007ffa1aaa0000 C:\Windows\System32\ucrtbase.dll +0x00007ffa00e00000 - 0x00007ffa0109a000 C:\Windows\WinSxS\amd64_microsoft.windows.common-controls_6595b64144ccf1df_6.0.19041.4355_none_60b8b9eb71f62e16\COMCTL32.dll +0x00007ffa1c030000 - 0x00007ffa1c05f000 C:\Windows\System32\IMM32.DLL +0x00007ffa10f70000 - 0x00007ffa10f85000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\vcruntime140.dll +0x00007ff9ceb10000 - 0x00007ff9cebab000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\msvcp140.dll +0x0000000051a10000 - 0x000000005226c000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\server\jvm.dll +0x00007ffa1c020000 - 0x00007ffa1c028000 C:\Windows\System32\PSAPI.DLL +0x00007ff9fac50000 - 0x00007ff9fac59000 C:\Windows\SYSTEM32\WSOCK32.dll +0x00007ffa0d800000 - 0x00007ffa0d827000 C:\Windows\SYSTEM32\WINMM.dll +0x00007ffa0ff90000 - 0x00007ffa0ff9a000 C:\Windows\SYSTEM32\VERSION.dll +0x00007ffa1c060000 - 0x00007ffa1c0cb000 C:\Windows\System32\WS2_32.dll +0x00007ffa18f70000 - 0x00007ffa18f82000 C:\Windows\SYSTEM32\kernel.appcore.dll +0x00007ffa10fc0000 - 0x00007ffa10fd0000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\verify.dll +0x00007ffa0aec0000 - 0x00007ffa0aeeb000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\java.dll +0x00007ff9ca260000 - 0x00007ff9ca296000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\jdwp.dll +0x00007ffa0af80000 - 0x00007ffa0af89000 
C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\npt.dll +0x00007ff9c1ab0000 - 0x00007ff9c1ae2000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\instrument.dll +0x00007ffa008e0000 - 0x00007ffa008f8000 C:\Users\18264\.jdks\corretto-1.8.0_422\jre\bin\zip.dll + +VM Arguments: +jvm_args: -agentlib:jdwp=transport=dt_socket,address=127.0.0.1:56727,suspend=y,server=n -javaagent:C:\Users\18264\AppData\Local\JetBrains\IntelliJIdea2021.1\captureAgent\debugger-agent.jar -Dfile.encoding=UTF-8 +java_command: com.example.saveInES +java_class_path (initial): C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\charsets.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\access-bridge-64.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\cldrdata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\dnsns.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jaccess.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\jfxrt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\localedata.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\nashorn.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunec.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunjce_provider.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunmscapi.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\sunpkcs11.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\ext\zipfs.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jce.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfr.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jfxswt.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\jsse.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\management-agent.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\resources.jar;C:\Users\18264\.jdks\corretto-1.8.0_422\jre\lib\rt.jar;F:\workTest\DaKaES\target\classes;C:\Users\18264\.m2\repository\org\elasticsearch\client\elasticsearch-rest-high-level-client\7.17.0\elasticsearch-rest-high-level-client-7.17.0.jar;C:\Users\
18264\.m2\repository\org\elasticsearch\elasticsearch\7.17.0\elasticsearch-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-core\7.17.0\elasticsearch-core-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-secure-sm\7.17.0\elasticsearch-secure-sm-7.17.0.jar;C:\Users\18264\.m2\repository\org\elasticsearch\elasticsearch-x-content\7.17.0\elasticsearch-x-content-7.17.0.jar;C:\Users\18264\.m2\repository\org\yaml\snakeyaml\1.26\snakeyaml-1.26.jar;C:\Users\18264\.m2\repository\c +Launcher Type: SUN_STANDARD + +Environment Variables: +JAVA_HOME=E:\java +PATH=C:\Program Files\Common Files\Oracle\Java\javapath;D:\vm\bin\;E:\app\18264\product\11.2.0\dbhome_1\bin;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\NVIDIA Corporation\NVIDIA NvDLISR;C:\Windows\system32;C:\Windows;C:\Windows\System32\Wbem;C:\Windows\System32\WindowsPowerShell\v1.0\;C:\Windows\System32\OpenSSH\;E:\java\bin;F:\mysql\mysql-5.7.37-winx64\mysql-5.7.37-winx64\bin;D:\matlab\Matlab R2022a\bin;C:\Program Files (x86)\dotnet\;C:\Program Files\dotnet\;D:\winscp\WinSCP\;F:\javaAbout\apache-maven-3.6.3\bin;C:\Program Files\Git\cmd;F:\tool\nvm\nvm;F:\tool\node;C:\Users\18264\AppData\Local\Programs\Python\Python311\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python311\;C:\Users\18264\AppData\Local\Programs\Python\Python37\Scripts\;C:\Users\18264\AppData\Local\Programs\Python\Python37\;C:\Users\18264\AppData\Local\Programs\Python\Launcher\;C:\Users\18264\AppData\Local\Microsoft\WindowsApps;D:\Microsoft VS Code\bin;F:\idea\IntelliJ IDEA 2021.1.3\bin;;F:\tool\nvm\nvm;F:\tool\node +USERNAME=18264 +OS=Windows_NT +PROCESSOR_IDENTIFIER=Intel64 Family 6 Model 141 Stepping 1, GenuineIntel + + + +--------------- S Y S T E M --------------- + +OS: Windows 10 , 64 bit Build 19041 (10.0.19041.5438) + +CPU:total 16 (initial 
active 16) (8 cores per cpu, 2 threads per core) family 6 model 141 stepping 1, cmov, cx8, fxsr, mmx, sse, sse2, sse3, ssse3, sse4.1, sse4.2, popcnt, avx, avx2, aes, clmul, erms, 3dnowpref, lzcnt, ht, tsc, tscinvbit, bmi1, bmi2, adx + +Memory: 4k page, physical 16509736k(919328k free), swap 36170532k(5620k free) + +vm_info: OpenJDK 64-Bit Server VM (25.422-b05) for windows-amd64 JRE (1.8.0_422-b05), built on Jul 11 2024 17:20:01 by "Administrator" with MS VC++ 15.9 (VS2017) + +time: Tue Mar 4 14:31:48 2025 +timezone: Intel64 Family 6 Model 141 Stepping 1, GenuineIntel +elapsed time: 0.022707 seconds (0d 0h 0m 0s) + diff --git a/keywords.txt b/keywords.txt new file mode 100644 index 0000000..51fc8fc --- /dev/null +++ b/keywords.txt @@ -0,0 +1,1045 @@ +Zoonotic disease +pandemic +Emerging and re-emerging diseases +biosafet +biosecurit +biodefen +biological defen +bioweapon +biologicalweapon +bioterroris +biological terroris +biowarfare +biological warfare +biosurveillan +biological surveillan +biohazard +biological hazard +bioincident +biological incident +biothreat +biological threat +bioagent +biologicalagent +biological protect +bioprotect +biological risk +Bacillus anthracis +Bacillus cereus Biovaranthracis +Brucella abortus +Brucella melitensis +Brucella neotomae +Brucella suis +Brucellamelitensis biovar suis +Burkholderia mallei +Pseudomonas mallei +Burkholderia pseudomallei +Acinetobacter mallei +Glanders bacillus +Bacillus mallei +Actinobacillus mallei +Pfeifferella mallei +Malleomyces mallei +Loefferella mallei +Chlamydophila psittaci +Chlamydia psittaci +Clostridium botulinum +Coxiella burnetii +Escherichia coli O157 +Escherichia coli +E coli O157-H7 +Escherichia coli O157:H7 +Francisella tularensis +Legionella pneumophila +Mycoplasma capricolum +Mycoplasma mycoides +Rickettsia prowazekii +Rickettsia rickettsii +Salmonella enterica +Salmonella choleraesuis +Vibrio cholerae +Vibrio comma +Yersinia pestis +Bacille de la peste +Bacterium pestis +Pasteurella 
pestis +African horse sickness virus +AHSV +African Swine Fever Virus +ASFV +Wart-Hog Disease Virus +Wart Hog Disease Virus +Avian influenza virus +Bluetongue virus +Bluetongue Viruses +Blue Tongue Virus +Ovine Catarrhal Fever Virus +Chapare virus +Chapare viruses +Chaparemammarenavirus +Chikungunya virus +CHIKV +Classical Swine Fever Virus +Hog CholeraVirus +Pestivirus C +CSFV +Crimean-Congohaemorrhagic fever virus +Crimean Congohemorrhagic fever virus +Congo Virus +denguevirus +DENV +Dengue Viruses +BreakboneFever Virus +Breakbone Fever Viruses +EasternEquine Encephalomyelitis Virus +Eastern EquineEncephalitis virus +EEE Virus +EEEV +Ebolavirus +Ebolaviruses +Ebola Virus +Ebola Viruses +Ebola-like Viruses +Ebola likeViruses +Ebola-like Virus +Foot-and-MouthDisease Virus +Foot and Mouth Disease Virus +Foot-and-Mouth Disease Viruses +FMDV +Goatpox virus +Goatpox viruses +Goat PoxVirus +Goat Pox Viruses +Guanarito virus +Guanarito viruses +Guanarito mammarenavirus +GTOV +Hantaan virus +Korean HemorrhagicFever Virus +Hantaan orthohantavirus +Hemorrhagic Nephroso-Nephritis Virus +Hemorrhagic Nephroso Nephritis Virus +Hemorrhagic Nephroso-Nephritis Viruses +Epidemic Hemorrhagic Fever Virus +HFRS Viruses +Hemorrhagic Fever Renal Syndrome Virus +HTNV +Hendra Virus +HendraViruses +Equine Morbillivirus +EquineMorbilliviruses +MojV +Japanese B EncephalitisVirus +Japanese Encephalitis Virus +JEV +Junin virus +Argentinian mammarenavirus +JUNV +Kyasanur Forest disease virus +KFDV +Lassa virus +Lassa fever virus +Lassamammarenavirus +LASV +Lujo virus +Lujomammarenavirus +LUJV +Lumpy skin diseasevirus +Neethling Virus +Machupo virus +Machupo mammarenavirus +MACV +Marburgvirus +Marburgviruses +Marburg Virus +Marburg Viruses +Marburg-like Viruses +Marburg like Viruses +Marburg-like Virus +Frankfurt-Marburg Syndrome Virus +FrankfurtMarburg Syndrome Virus +Monkeypox virus +Monkeypox viruses +Monkeypoxvirus +Monkeypoxviruses +Monkey Pox Virus +Monkey Pox Viruses +Newcastle disease 
virus +NDV +Nipah virus +Nipah henipavirus +Nipah Viruses +Omskhemorrhagic fever virus +OHFV +Omskhaemorrhagic fever virus +Peste-des-petits-ruminants virus +Peste des petitsruminants virus +Rabies virus +Rabies lyssavirus +Reconstructed 1918 Influenza virus +RiftValley fever virus +Rift Valley fever phlebovirus +RVFV +Rinderpest virus +Rinderpestmorbillivirus +Sabia virus +SARS Virus +Severe Acute Respiratory Syndrome Virus +RSARS-Related Coronavirus +SARS RelatedCoronavirus +SARS-CoV +SARS AssociatedCoronavirus +SARS Coronavirus +SARS-Associated Coronavirus +Severe acuterespiratory syndrome related coronavirus +Severeacute respiratory syndrome-related coronavirus +Sheeppox virus +Sheeppox viruses +Sheep PoxVirus +Sheep Pox Viruses +Sin Nombre virus +Muerto Canyon Virus +Four Corners Virus +Sin Nombre hantavirus +Swine vesicular diseasevirus +SVDV +Tick-Borne Encephalitis Virus +Tick Borne Encephalitis Virus +Tick-BorneEncephalitis Viruses +Tick Borne EncephalitisViruses +TBEV +Variola virus +Variolaviruses +Smallpox Virus +Smallpox Viruses +Poxvirus variolae +Variola minor virus +Variolamajor virus +Alastrim +Venezuelan equineencephalitis virus +Venezuelan Equine EncephalitisViruses +West Nile virus +Egypt 101 virus +Kunjin virus +WNV +WEE Virus +WEEViruses +Western Equine Encephalitis Viruses +Western equine encephalitis virus +WEEV +Yellow fever virus +Naegleria fowleri +Naegleria fowlerus +Fiji disease virus +Ralstoniasolanacearum +Rathayibacter toxicus +Xanthomonas oryzae +Erwinia amylovora +Xanthomonas albilineans +Dothistroma pini +Dothistroma septosporum +Scirrhia pini +Tilletiaindica +Coniothyrium glycines +Phomaglycinicola +Pyrenochaeta glycines +Coccidioides immitis +Histoplasma capsulatum +Synchytrium endobioticum +Colletotrichumcoffeanum +Peronospora hyoscyami +Peronosclerospora philippinensis +Sclerophthorarayssiae +Bacteriotoxins +Botulinum toxins +Clostridium perfringens toxins +Staphylococcalenterotoxins +Shigatoxins +Anatoxins +Ciguatoxins 
+Saxitoxins +Trichothecene toxins +Abrins +Ricin* +recin +Bungarotoxins +Botulinum neurotoxin producing species ofClostridium +Conotoxins +T-2 toxin +Tetrodotoxin +Diacetoxyscirpeno +SARS-COV-2 +COVID-19 +coronavirus disease 2019 +2019-nCov +Alastrim virus +Mpox virus +Hypr virus +Kumlinge virus +Louping ill virus +Hanzalova virus +Omsk hemorrhagic fever virus +St.Louis encephalitis virus +Crimean-Congo hemorrhagic fever virus (Xinjiang hemorrhagic fever virus) +Herpesvirus simiae +Eastern equine encephalitis virus +Venezuelan equine encephalitis virus +Flexal virus +Mopeia virus (and other Tacaribe viruses) +Tacaribe virus +Dabie bandavirus (SFTS Virus) +Gordil virus +Heartland bandavirus +Itaituba virus +Khasan virus +Razdan virus +Rift valley fever virus +Garba virus +Rabies virus (street virus) +Rochambeau virus +Inhangapi virus +Middle East Respiratory Syndrome coronavirus (MERS-CoV) +Severe acute respiratory syndrome coronavirus (SARS-CoV) +Severe acute respiratory syndrome coronavirus 2, (SARS-CoV-2) +Hantaviruses causing pulmonary syndrome +Hantaviruses causing hemorrhagic fever with renal syndrome +Murray valley encephalitis virus +Negishi virus +Powassan virus +Rocio virus +Sepik virus +Issyk-Kul virus +Nairobi sheep disease virus +Sapphire orthonairovirus (Paramushir virus) +Tamdy virus +Human immunodeficiency virus (HIV) (Type 1 and 2 virus) +Simian immunodeficiency virus (SIV) +Everglades virus +Kyzylagach virus +Mayaro virus +Middelburg virus +Mucambo virus +Ndumu virus +Sagiyama virus +Lymphocytic choriomeningitis (neurotropic) virus +Polio virus +Dhori virus +High pathogenic avian influenza virus +California encephalitis virus +Germiston virus +Inini virus (Simbu orthobunyavirus) +Oropouche virus +Sandfly fever virus +Norovirus +Sapovirus +Flanders virus +Hart Park virus +Rabies virus (fixed virus) +Vesicular stomatitis virus +Buffalopox virus +Camelpox virus +Cowpox virus +Molluscum contagiosum virus +Orf virus +Pseudocowpox virus (Milker‘s nodule 
virus) +Rabbitpox virus +Tanapox virus +Vaccinia virus +Polyoma virus +Simian virus 40 +Metapneumovirus +Respiratory syncytial virus +Rubivirus (Rubella) +Measles virus +Mumps virus +Parainfluenza virus +Sendai virus (murine parainfluenza virus type 1) +Coronavirus (low pathogenicity to human) +Coltivirus +Rotavirus +Dengue virus +Flaviviruses,other known non-highly pathogenic +Hepatitis C virus +Langat virus +Saumarez reef virus +Yellow fever virus, (vaccine strain, 17D) +Zika Virus +Hazara virus +Human T- lymphotropic virus (HTLV) +Lentivirus (Non highly pathogenic) +Cytomegalovirus +Epstein-Barr virus +Herpes simplex virus +Herpesvirus saimiri +Human herpes virus-6 +Human herpes virus-7 +Human herpes virus-8 +Varicella-Zoster virus +Alphaviruses, other known non-highly pathogenic +Barmah forest virus +Bebaru virus +Getah virus +O’nyong-nyong virus +Ross river virus +Semliki forest virus +Sindbis virus +Papillomavirus (human) +Lymphocytic choriomeningitis virus +Hepatitis B virus +Hepatitis D virus +Hepatitis E virus +Adeno-associated virus +Bocavirus +Parvovirus B19 +Adenovirus +Cardiovirus +Coxsakie virus +ECHO virus +Enterovirus +Enterovirus A-71 +Hepatitis A virus +Human Cosavirus +Kobuvirus +Parechovirus +Rhinovirus +Astrovirus +Influenza virus +Guaratuba virus +La Crosse virus +Tahyna orthobunyavirus +Tensaw virus +Turlock virus +Hamster leukemia virus +Mouse leukemia virus +Mouse mammary tumor virus +Rat leukemia virus +Guinea pig herpes virus +Bovine spongiform encephalopathy (BSE) +Creutzfeldt-Jakob disease (CJD) +Fatal familian insomnia (FFI) +Gerstmann- Sträussler -Scheinker syndrome(GSS) +Kuru disease +Variant Creutzfeldt-Jakob disease (vCJD) +Scrapie +Phagophilic cells without form +Brucella genus +Mycobacterium bovis +Mycobacterium tuberculosis +Rickettsia belongs to the spotted fever group +Rickettsia Mori +Przewalski's Rickettsia +Eastern body of scrub typhus +Lutheran rickettsia +Siberian Rickettsia +Tarasawich rickettsia +Goat shapeless 
+Acinetobacter baumannii +Acinetobacter lwoffii +Madura actinomycetes +Bai Lejie Madura actinomycete +Bovine actinomycete +Actinomyces granulosus +Yi's actinomycetes +Nei's actinomycetes +Other species of actinomycetes +Aeromonas hydrophila +Spotted Aeromonas +Other species of Aeromonas genus +Afipota genus +Actinobacteria agglomerating bacteria +Arachnia propionica +Arcanobacterium equi +Hemolytic Cryptococcus +Bacillus cereus +Fragile pseudomonas +Rod like Bartonella +Klebsiella pneumoniae +Duoshi Bartonella +Elizabethan Bartonella +Guillain Barr é body +bartonella henselae +Kochia Bartonella +5-Day Heat Bartonella Body +Tribal Bartonella +Wens Bartonella Wens subspecies +Botrytis bronchiolitis +Bordetella pertussis +Borrelia burgdorferi +Dashi sparse spiral body +Returning to the heat sparse spiral body +Fensenshu spirochete +Short spiral bacteria genus +Granuloma sheath bacteria +Campylobacter coli +Fetal Campylobacter +Campylobacter jejuni +Salivary Campylobacter +Other species of Campylobacter genus +Chlamydia pneumoniae +Chlamydia parrot +Chlamydia trachomatis +Difficult Clostridium difficile +Fusarium oxysporum +Hemolytic Clostridium +Clostridium novyi +Clostridium perfringens +Tetanus Clostridium +Lactobacillus bovis +Corynebacterium diphtheriae +Corynebacterium minutissimum +Fake Mycobacterium tuberculosis +Corynebacterium striatum +Acinetobacter canker +Congo Pichia +edwardsiella tarda +Yifei Erich's body +Eikenella corrodens +Gas producing Escherichia coli +Enterobacter cloacae +Other species of Escherichia coli +Adenothermic rickettsia +Porcine red spot erysipelas fungus +Dandelion fungus genus +Burkholderia meningoseptica +Bozeman's Legionella +The new subspecies of the killer of the Tula Francisella fungus +Fusobacterium necrophorum +gardnerella vaginalis +Hemophilus ducreyi +Haemophilus influenzae +Helicobacter pylori +Kingella Kingae +Klebsiella oxytoca +Question mark Leptospira +Listeria ivanovii +Listeria monocytogenes +Polymorphic small bacteria 
+Morganella morganii +African mycobacteria +Goat mycobacteria +Field mouse mycobacteria +Mycobacterium asiaticum +Mycobacterium avium +Occasional mycobacteria +Kansas mycobacteria +Mycobacterium leprae +Mycobacterium malmoense +Mycobacterium avium subsp. paratuberculosis +Mycobacterium scrotum +Mycobacterium hominis +Mycobacterium szulgai +Ulcerative mycobacteria +Other species of Mycobacterium genus +mycoplasma pneumoniae +neisseria gonorrhoeae +Neisseria meningitidis +Nocardia asteroides +Nocardia brasiliensis +Nocardia botulinum +Nocardia pyogenes +New Nocardia +Nocardia in guinea pig ear inflammation +Delancewanorca bacteria +Clostridium sporogenes +Pasteurella multocida +Rodent bacteria invading the lungs +Pathogenic Escherichia coli +Other pathogenic Escherichia coli genera +Anaerobic digestion streptococcus +Plesiomonas shigelloides +Prevotella genus +Proteus mirabilis +Proteus penneri +Ordinary Proteobacteria +Propionibacterium prolifera producing alkali +Prevotella reinhardtii +Pseudomonas aeruginosa +Autotrophic false Nocardia +Staphylococcus aureus +Bongor Salmonella +Serratia liquefaciens +Fading Salmonella +Shigella dysenteriae +Shigella flexneri +Shigella boydii +Shigella Songnei +Staphylococcus epidermidis +Candida albicans +Streptococcus pneumoniae +Streptococcus pyogenes +Streptococcus genus +streptococcus suis +Treponema carateum +Treponema pallidum (syphilis) +Treponema pertenue +Wen's density spiral body +Ureaplasma urealyticum +Vibrio vulnificus +Vibrio parahaemolyticus +River Vibrio +Vibrio alginolyticus +Other species of Vibrio genus +Yersinia enterocolitica +Yersinia pseudotuberculosis +mycoplasma genitalium +Cronobacter genus +Citrobacter genus +Photobacterium damselae +Shiwanju genus +Seafood Deformable Fungi +Defective autotrophic bacteria +Carbon dioxide fiber eating bacteria genus +Chromobacterium genus +Golden rod genus +Short chain Streptococcus genus +Dermatitis budding bacteria +Coarse ball spore fungus +Posadas spore forming 
bacteria +Capsular tissue cytoplasmic bacteria +Histoplasma bacteria and other pathogenic diseases +Brazilian Azospirillum +Other pathogenic diseases of the genus Ascomycota +Cladosporium genus +Rhizopus genus +Alternaria alternata +Infecting Alternaria +Other pathogenic diseases of the genus Alternaria +Scale mold genus +Genus Fusarium +Arthrobacter genus +Aspergillus flavus complex +Aspergillus fumigatus complex group +Aspergillus terreus +Short stem mold genus +Solid spore frog manure mold +Frog manure mold belongs to other pathogenic diseases +Beauveria genus +candida dubliniensis +Smooth Candida complex +Ji Yemeng Candida complex +Ximulong Candida complex +Candida krusei +Near smooth Candida complex +Tropical Candida +Candida auricula +Candida and other pathogenic diseases +Cephalosporin genus +Genus Trichoderma +Golden spore fungus genus +Curly mold genus +Botrytis cinerea +Other pathogenic diseases of Aspergillus genus +Saccharomyces genus +Trichoderma genus +Crown ear mold +Conidiobolus incongruus +Ear mold belongs to other pathogenic diseases +False black powdery mildew genus +Kashi cola rod mold +Other pathogenic diseases of Colletotrichum genus +Gert Cryptococcus complex +Cryptococcus neoformans complex +Cryptococcus and other pathogenic diseases +Cunninghamella bertholletiae +Xiaoke Yinhan mold belongs to other pathogenic diseases +Hawaiian curved fungus +Babendorf's curved fungus +Suiform curved fungus +Curvularia genus +Corydalis genus +Interstitial shell genus +The genus Bifidobacterium +Aemonas genus +Trichophyton flocs +Dermatitis external bottle mold +Zhen's external bottle mold complex group +Spinous external bottle mold +External bottle mold belongs to other pathogenic diseases +Magnum's navel mold +Beaked navel mold +Monofer coloring mold +Pei's coloring mold +Nubica coloring mold +Other pathogenic diseases of the genus Trichoderma +Fusarium oxysporum complex +Fusarium complex of eggplant disease +Other pathogenic diseases of Fusarium genus 
+Geotrichum genus +Genus Mucomycota +Venetobacter baumannii +Half new pillar top spore +Lasiodiplodia theobromae +Umbrella branch transverse stem mold +Multi branch transverse stem mold +Other pathogenic diseases of Streptomyces genus +Polyphenophore Spore +Gray Madura fungus +Podomycosis Madura bacteria +Madura bacteria and other pathogenic diseases +Malassezia furfur +Spherical Malassezia +Malassezia and other pathogenic diseases +Microsporidia canis +Rust colored microsporidia +Other pathogenic diseases of the genus Microsporidia +Aspergillus genus +Fusarium complex group +Irregular mold +Mucor racemosa +Other pathogenic diseases of Mucor genus +Gypsum Neisseria +Neosatobacter genus +New genus of balanoposthitis +Black spore fungus genus +Ochromycetes genus +Xufang yeast genus +Wan's Penicillium +Dark colored Cladosporium genus +Dark colored Cyclosporidium genus +Pingge bacteria genus +Single spore bottle mold genus +American bottle mold +Verrucous bottle mold +Bottle mold belongs to other pathogenic diseases +Stem point mold genus +Hedermann nodule fungus +The genus of crooked mouth shell +Wickham without green algae +Zufei no green algae +No other pathogenic diseases of the green algae genus +Conomycota genus +Rhizopus microsporus +rhizopus arrhizus +Rhizopus and other pathogenic diseases +Red yeast genus +Broomycota genus +Sharp tip Sedosporium +Other pathogenic diseases of the genus Zygomycota +Schizophyllum genus +Short broom mold +Other pathogenic diseases of the broom mold genus +Spheroidal sporophytes +Schenker Sporothrix fungus +Sporothrix bacteria and other pathogenic diseases +Copium genus +Marlini's basket shaped bacteria +Trichophyton complex +Red Trichophyton complex +Trichophyton schoenleinii +Trichophyton interruptus +Purple Trichophyton +Trichophyton genus and other pathogenic diseases +trichosporon asahii +Other pathogenic diseases of the genus Trichosporum +Wheat stalk mold genus +Monogramma genus +Viranthus genus +Verrucosporium genus +Ameba 
+Wuchereria bancrofti +Hydatid +Whipworm +Lung fluke +Liver fluke +Toxoplasma +Hookworm +Ascaris +Giardia +Scabies +Pinworm +Malaria +Plasmodium +Filarial worm +Taenia +Microsporidia +Schistosome +Cryptosporidium +Porcine tapeworm +Q hot +Ebola hemorrhagic fever +Bacillus subtilis +Brucella bacteria +Actinomycetes +Eperythrocytic disease +Para tuberculosis +tox +Leptospirasis +Echinococcosis +tuberculosis +Old World spiral maggot disease +Crimean Congo hemorrhagic fever +foot-and-mouth disease +rabies +Pseudomallei +Rift Valley fever +Nipah's disease +Japanese encephalitis +Schmallenberg disease +Vesicular stomatitis +anthrax +Pseudorabies +Siniro fever +Heart water disease +New World spiral maggot disease +Clostridium perfringens infections +Infection with Trichinella spp +Tularemia +Trypanosoma Evansi infection +Leishmaniasis +Infection with epizootic haemorrhagicdis-ease +Filariasis +Staphylococcosis +Schistosomiasis +Nipah virus Encephalitis +Rotavirus infection +Clostridum Perfringens +Salmonella disease +Listeriosis +Hemolytic brucellosis +Mycoplasma disease +Chlamydia disease +Eastern schistosomiasis +Clonorchiasis sinensis +Cysticercosis +Fasciola hepatica +Blood Spear Nematode Disease +Cryptosporidiosis +Akabane disease +Hemorrhagic sepsis +Ibaraki disease +Bovine leukemia +Bovine viral diarrhea +Bovine infectious rhinotracheitis +Bovine contagious pleuropneumonia +malignant catarrhal fever +Bovine spongiform encephalopathy +Bovine nodular dermatitis +Cattle popularity trend +Bovine hypodermatid myiasis +Bovine non plasma disease +Bovine mucosal disease +Zhongshan disease +Infectious bo-vine rhinotracheitis/Infectious pustular vulvovaginitis +Bovine genital campylobacteriosis +Bovine viral diarrhoea/Mucosal disease +Bovine babesiosis +Theileriosis +Trichomonosis +Dermatophilosis +Local epidemic bovine leukemia +Bovine coronavirus infection +Bovine pear shaped insect disease +African horse plague +Hendra's disease +Ulcerative lymphangitis +Equine glanders 
+Equine disease toxic arteritis +Equine infectious anemia +Equine infectious uterine inflammation +equine paratyphoid +Horse mating disease +Equine influenza +Equine epidemic lymphangitis +Horse gland disease +Venezuelan equine encephalomyelitis +Infection with equid herpesvirus-1 +Equine encephalomyelitis (East-ern and Western) +Horse flu +Equine nosed pneumonia +equine piroplasmosis +african swine fever +Seneca virus disease +Porcine infectious gastroenteritis +porcine contagious pleuropneumonia +Pig erysipelas +Porcine Reproductive and Respiratory Syndrome +Porcine paratyphoid fever +Porcine Epidemic Diarrhea +Swine influenza +Swine dysentery +Porcine vesicular disease +Porcine Tetreovirus induced encephalomyelitis +Atrophic rhinitis in pigs +swine fever +Mycoplasma hyopneumoniae pneumonia in pigs +Porcine parvovirus infection +Swine streptococosis +Porcine circovirus infection +Glaesser’s disease(Haemoph-ilus parasuis) +Infection with Taenia solium(Porcine cysticercosis) +Porcine deltacorona virus(PDCoV) +Porcine brucellosis +Porcine Circovirus Disease +Glaser's disease +swine flu +Porcine Coronavirus Infection +Porcine Seneca virus infection +Piglet dysentery +Porcine dysentery +Porcine proliferative intestinal disease +Infectious rhinitis +Infectious bursal disease +Low pathogenic avian influenza +Highly pathogenic avian influenza +turkey rhinotracheitis +Chicken white diarrhea +Chicken viral arthritis +Chicken egg production decline syndrome +Infectious laryngotracheitis in chickens +Infectious bronchitis in chickens +Marek's disease +Avian leukemia +Avian infectious encephalomyelitis +Avian pox +Avian paratyphoid fever +Avian spirochete disease +Avian typhoid fever +Avian nephritis +Avian reticuloendothelial hyperplasia +avian chlamydiosis +Avian mycoplasmosis +Newcastle disease +Duck viral hepatitis +Leucocytozoonosis +Goose parvovirus infection +Duck virus enteritis +Avian coccidiosis +Riemerella anatipestifer infection +Duck plague +Gosling plague +Avian 
Infectious Laryngotracheitis +avian infectious bronchitis +Marek’s Disease +egg drop syndrome +Duck serositis +Avian reticuloendothelial tissue proliferation disease +Chicken infectious rhinitis +Infection with avian Tembusu virus +Avian adenovirus infection +Chicken infectious anemia +Infection of avian influenza virus +Chicken red mite disease +necrotic enteritis +Duck reovirus infection +Boundary disease +Infectious azoospermia +Caseous lymphadenitis +Blue tongue disease +Medi Visna disease +enzootic abortion of ewes +Sheep pox and goat pox +Nairobi sheep disease +Contagious pleuropneumonia in goats +Goat encephalitis +Small ruminant plague +Sheep infectious pustular dermatitis +ovine pulmonary adenomatosis +Itchy disease +Caprine arthritis/encephalitis +Salmonellosis(S.abortusovis) +Sheep lung adenomatous disease +Sheep pear shaped worm disease +Sheep without plasma disease +Crayfish plague +Vitiligo syndrome +Spotted catfish viral disease +Viral hemorrhagic sepsis +Viral neuronecrosis disease +Infectious muscle necrosis disease +Infectious subcutaneous and hematopoietic organ necrosis disease +Infectious splenic and renal necrosis disease +Infectious Hematopoietic Organ Necrosis +Bacterial sepsis in freshwater fish +Salmon infectious anemia +Necrotizing liver pancreatitis +Huangtou disease +Catfish intestinal sepsis +Acute liver and pancreas necrosis +Koi herpesvirus disease +Carp spring viremia +Carp edema virus disease +Epidemic Ulcer Syndrome +epizootic haematopoietic necrosis +Tilapia Lake Virus Disease +White tail disease +Taura syndrome +Bacterial nephropathy +Red snapper rainbow virus disease +Infection with Gyrodactylus Salaris +Infection with abalone herpesvirus +Infection with Bonamia Ostreae +Infection with Bonamia Exitiosa +Infection with Marteilia Refringens +Infection with Perkinsus Olseni +Infection with Perkinsus Marinus +Infection with Xenohaliotis Californiensis +Infection with Batrachochytrium Dendrobatidis +Infection with Ranavirus species 
+Anisakiasis +Cryptocaryoniasis +Edwardsiellasis +Fish streptococcosis +Chryseobacterium meningsepticum of frog (Rana spp) +Infection with salmonid alphavirus +Infection with Batrachochytrium salamandrivorans +Infection with Decapod iridescent virus 1 +Grass carp hemorrhagic disease +Necrosis of hematopoietic organs in crucian carp +Carp float disease +Shrimp liver intestinal worm disease +schistosomiasis japonica +Infectious pancreatic necrosis disease +Paralichthys olivaceus virus disease +Fish Edwardellosis +Streptococcal disease +Salmon killing Aeromonas disease +Small melon worm disease +Myxosporidiosis +Third generation insect disease +Ringworm disease +Crab snail pathogen disease +Bao herpesvirus disease +Oyster herpesvirus disease +Beehive Beetle +american foul brood +Bee chalky disease +Bee shield mite disease +Honey bee bright heat mite disease +Bee mite disease +european foul brood +Small hive beetle infestation(Aethina tumida) +Nosemosis of honey bees +Bombyx mori polyhedrosis +Bright and hot mite disease +chalkbrood +white muscardine +Silkworm microsporidia +Rabbit hemorrhagic disease +Rabbit myxomatosis +Rabbit coccidiosis +Rabbit brucellosis +Feline panleukopenia +Canine infectious hepatitis +canine distemper +Canine parvovirus infection +Canine parvovirus disease +Cat cupping virus infection +Feline infectious peritonitis +canine babesiosis +Amphibian frog iridovirus disease +Turtle parotitis disease +Frog meningitis sepsis +Monkey viral immunodeficiency syndrome +Monkeypox +Lymphocytic choroidal meningitis +Chronic wasting disease +Camel pox +Marburg Hemorrhagic Fever +Rat pox +Mink Aleutian disease +Mink viral enteritis +Mouse hepatitis +Cercopithecine Herpesvirus Type I(B virus)infectious diseases +Sendai virus infectious disease +Infectious subcutaneous and hematopoietic tissue necrosis disease +Acute Hepatopancreatic Necrosis diff --git a/original_captcha.png b/original_captcha.png new file mode 100644 index 0000000..6a588a1 Binary files 
/dev/null and b/original_captcha.png differ diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..12f0a88 --- /dev/null +++ b/pom.xml @@ -0,0 +1,150 @@ + + 4.0.0 + com.example + es-crawler + 1.0-SNAPSHOT + + + 8 + 8 + + + + + + org.elasticsearch.client + elasticsearch-rest-high-level-client + 7.17.0 + + + + co.elastic.clients + elasticsearch-java + 7.17.15 + + + com.fasterxml.jackson.core + jackson-databind + 2.15.0 + + + + + org.jsoup + jsoup + 1.17.2 + + + + + com.squareup.okhttp3 + okhttp + 4.9.3 + + + + + org.slf4j + slf4j-api + 1.7.36 + + + ch.qos.logback + logback-classic + 1.2.11 + + + + + org.apache.kafka + kafka-clients + 3.9.0 + + + + + org.seleniumhq.selenium + selenium-java + 4.10.0 + + + + + io.github.bonigarcia + webdrivermanager + 5.6.2 + + + + org.json + json + 20230227 + + + + com.google.code.gson + gson + 2.10.1 + + + + net.sourceforge.htmlunit + htmlunit + 2.61.0 + + + + net.sourceforge.tess4j + tess4j + 4.5.4 + + + + org.apache.httpcomponents.client5 + httpclient5 + 5.3.1 + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 8 + 8 + + + + + org.apache.maven.plugins + maven-assembly-plugin + 3.3.0 + + + + com.example.CtriScraper + + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + \ No newline at end of file diff --git a/preprocessed_captcha.png b/preprocessed_captcha.png new file mode 100644 index 0000000..20329de Binary files /dev/null and b/preprocessed_captcha.png differ diff --git a/processed_urls.txt b/processed_urls.txt new file mode 100644 index 0000000..f862ec3 --- /dev/null +++ b/processed_urls.txt @@ -0,0 +1,281 @@ + +https://www.zyctd.com/zixun/201/1055143.html +https://www.zyctd.com/zixun/201/861786.html +https://www.zyctd.com/zixun/201/1053482.html +https://www.zyctd.com/zixun/201/269419.html +https://www.zyctd.com/zixun/201/1053149.html +https://www.zyctd.com/zixun/201/1023926.html +https://www.zyctd.com/zixun/201/435325.html 
+https://www.zyctd.com/zixun/201/1050302.html +https://www.zyctd.com/zixun/201/880441.html +https://www.zyctd.com/zixun/201/1019635.html +https://www.zyctd.com/zixun/201/970572.html +https://www.zyctd.com/zixun/201/912277.html +https://www.zyctd.com/zixun/201/372444.html +https://www.zyctd.com/zixun/201/1073629.html +https://www.zyctd.com/zixun/201/1069386.html +https://www.zyctd.com/zixun/201/730410.html +https://www.zyctd.com/zixun/201/953220.html +https://www.zyctd.com/zixun/201/1074339.html +https://www.zyctd.com/zixun/201/1072317.html +https://www.zyctd.com/zixun/201/294794.html +https://www.zyctd.com/zixun/201/267592.html +https://www.zyctd.com/zixun/201/979665.html +https://www.zyctd.com/zixun/201/869885.html +https://www.zyctd.com/zixun/201/1054064.html +https://www.zyctd.com/zixun/201/1049331.html +https://www.zyctd.com/zixun/201/442647.html +https://www.zyctd.com/zixun/201/285992.html +https://www.zyctd.com/zixun/201/1037972.html +https://www.zyctd.com/zixun/201/799801.html +https://www.zyctd.com/zixun/201/916078.html +https://www.zyctd.com/zixun/201/456647.html +https://www.zyctd.com/zixun/201/812121.html +https://www.zyctd.com/zixun/201/1042740.html +https://www.zyctd.com/zixun/201/1042708.html +https://www.zyctd.com/zixun/201/840450.html +https://www.zyctd.com/zixun/201/320749.html +https://www.zyctd.com/zixun/201/496106.html +https://www.zyctd.com/zixun/201/850201.html +https://www.zyctd.com/zixun/201/277145.html +https://www.zyctd.com/zixun/201/299091.html +https://www.zyctd.com/zixun/201/266080.html +https://www.zyctd.com/zixun/201/1051925.html +https://www.zyctd.com/zixun/201/898081.html +https://www.zyctd.com/zixun/201/873280.html +https://www.zyctd.com/zixun/201/703880.html +https://www.zyctd.com/zixun/201/873126.html +https://www.zyctd.com/zixun/201/887931.html +https://www.zyctd.com/zixun/201/432742.html +https://www.zyctd.com/zixun/201/1040431.html +https://www.zyctd.com/zixun/201/1040223.html +https://www.zyctd.com/zixun/201/858118.html 
+https://www.zyctd.com/zixun/201/971286.html +https://www.zyctd.com/zixun/201/458488.html +https://www.zyctd.com/zixun/201/1079381.html +https://www.zyctd.com/zixun/201/263578.html +https://www.zyctd.com/zixun/201/553513.html +https://www.zyctd.com/zixun/201/286229.html +https://www.zyctd.com/zixun/201/285365.html +https://www.zyctd.com/zixun/201/352921.html +https://www.zyctd.com/zixun/201/503267.html +https://www.zyctd.com/zixun/201/391337.html +https://www.zyctd.com/zixun/201/813052.html +https://www.zyctd.com/zixun/201/1053556.html +https://www.zyctd.com/zixun/201/1041197.html +https://www.zyctd.com/zixun/201/287420.html +https://www.zyctd.com/zixun/201/291563.html +https://www.zyctd.com/zixun/201/948250.html +https://www.zyctd.com/zixun/201/289034.html +https://www.zyctd.com/zixun/201/795965.html +https://www.zyctd.com/zixun/201/292962.html +https://www.zyctd.com/zixun/201/975850.html +https://www.zyctd.com/zixun/201/275335.html +https://www.zyctd.com/zixun/201/1031992.html +https://www.zyctd.com/zixun/201/1033886.html +https://www.zyctd.com/zixun/201/999510.html +https://www.zyctd.com/zixun/201/270144.html +https://www.zyctd.com/zixun/201/1055519.html +https://www.zyctd.com/zixun/201/272205.html +https://www.zyctd.com/zixun/201/526059.html +https://www.zyctd.com/zixun/201/456640.html +https://www.zyctd.com/zixun/201/267952.html +https://www.zyctd.com/zixun/201/803469.html +https://www.zyctd.com/zixun/201/270763.html +https://www.zyctd.com/zixun/201/1072987.html +https://www.zyctd.com/zixun/201/265176.html +https://www.zyctd.com/zixun/201/1022141.html +https://www.zyctd.com/zixun/201/290173.html +https://www.zyctd.com/zixun/201/269175.html +https://www.zyctd.com/zixun/201/744991.html +https://www.zyctd.com/zixun/201/1019131.html +https://www.zyctd.com/zixun/201/717054.html +https://www.zyctd.com/zixun/201/517358.html +https://www.zyctd.com/zixun/201/1058505.html +https://www.zyctd.com/zixun/201/905515.html +https://www.zyctd.com/zixun/201/287395.html 
+https://www.zyctd.com/zixun/201/934873.html +https://www.zyctd.com/zixun/201/1051317.html +https://www.zyctd.com/zixun/201/926018.html +https://www.zyctd.com/zixun/201/334511.html +https://www.zyctd.com/zixun/201/845896.html +https://www.zyctd.com/zixun/201/587785.html +https://www.zyctd.com/zixun/201/288376.html +https://www.zyctd.com/zixun/201/851405.html +https://www.zyctd.com/zixun/201/941404.html +https://www.zyctd.com/zixun/201/881855.html +https://www.zyctd.com/zixun/201/602632.html +https://www.zyctd.com/zixun/201/293601.html +https://www.zyctd.com/zixun/201/541809.html +https://www.zyctd.com/zixun/201/335120.html +https://www.zyctd.com/zixun/201/1031137.html +https://www.zyctd.com/zixun/201/960101.html +https://www.zyctd.com/zixun/201/1077142.html +https://www.zyctd.com/zixun/201/1063222.html +https://www.zyctd.com/zixun/201/681466.html +https://www.zyctd.com/zixun/201/1031130.html +https://www.zyctd.com/zixun/201/1073734.html +https://www.zyctd.com/zixun/201/1062186.html +https://www.zyctd.com/zixun/201/1046628.html +https://www.zyctd.com/zixun/201/358892.html +https://www.zyctd.com/zixun/201/285361.html +https://www.zyctd.com/zixun/201/1059889.html +https://www.zyctd.com/zixun/201/297824.html +https://www.zyctd.com/zixun/201/844307.html +https://www.zyctd.com/zixun/201/900524.html +https://www.zyctd.com/zixun/201/1057636.html +https://www.zyctd.com/zixun/201/1010080.html +https://www.zyctd.com/zixun/201/409152.html +https://www.zyctd.com/zixun/201/402782.html +https://www.zyctd.com/zixun/201/770296.html +https://www.zyctd.com/zixun/201/1040602.html +https://www.zyctd.com/zixun/201/606503.html +https://www.zyctd.com/zixun/201/784471.html +https://www.zyctd.com/zixun/201/466097.html +https://www.zyctd.com/zixun/201/1071160.html +https://www.zyctd.com/zixun/201/623226.html +https://www.zyctd.com/zixun/201/948264.html +https://www.zyctd.com/zixun/201/293462.html +https://www.zyctd.com/zixun/201/829348.html +https://www.zyctd.com/zixun/201/332369.html 
+https://www.zyctd.com/zixun/201/907461.html +https://www.zyctd.com/zixun/201/756555.html +https://www.zyctd.com/zixun/201/717915.html +https://www.zyctd.com/zixun/201/262203.html +https://www.zyctd.com/zixun/201/1055787.html +https://www.zyctd.com/zixun/201/432336.html +https://www.zyctd.com/zixun/201/907489.html +https://www.zyctd.com/zixun/201/1014686.html +https://www.zyctd.com/zixun/201/1053320.html +https://www.zyctd.com/zixun/201/480020.html +https://www.zyctd.com/zixun/201/287423.html +https://www.zyctd.com/zixun/201/385289.html +https://www.zyctd.com/zixun/201/1030421.html +https://www.zyctd.com/zixun/201/527648.html +https://www.zyctd.com/zixun/201/972959.html +https://www.zyctd.com/zixun/201/408767.html +https://www.zyctd.com/zixun/201/724887.html +https://www.zyctd.com/zixun/201/291480.html +https://www.zyctd.com/zixun/201/472544.html +https://www.zyctd.com/zixun/201/724873.html +https://www.zyctd.com/zixun/201/281751.html +https://www.zyctd.com/zixun/201/1049693.html +https://www.zyctd.com/zixun/201/869619.html +https://www.zyctd.com/zixun/201/355497.html +https://www.zyctd.com/zixun/201/341623.html +https://www.zyctd.com/zixun/201/450753.html +https://www.zyctd.com/zixun/201/1065837.html +https://www.zyctd.com/zixun/201/1031331.html +https://www.zyctd.com/zixun/201/669727.html +https://www.zyctd.com/zixun/201/1034010.html +https://www.zyctd.com/zixun/201/1054058.html +https://www.zyctd.com/zixun/201/954613.html +https://www.zyctd.com/zixun/201/715584.html +https://www.zyctd.com/zixun/201/1051110.html +https://www.zyctd.com/zixun/201/269963.html +https://www.zyctd.com/zixun/201/1048128.html +https://www.zyctd.com/zixun/201/793207.html +https://www.zyctd.com/zixun/201/284310.html +https://www.zyctd.com/zixun/201/282639.html +https://www.zyctd.com/zixun/201/1068138.html +https://www.zyctd.com/zixun/201/340678.html +https://www.zyctd.com/zixun/201/294371.html +https://www.zyctd.com/zixun/201/324277.html +https://www.zyctd.com/zixun/201/1048931.html 
+https://www.zyctd.com/zixun/201/851398.html +https://www.zyctd.com/zixun/201/263527.html +https://www.zyctd.com/zixun/201/919480.html +https://www.zyctd.com/zixun/201/685442.html +https://www.zyctd.com/zixun/201/428325.html +https://www.zyctd.com/zixun/201/1032698.html +https://www.zyctd.com/zixun/201/1003367.html +https://www.zyctd.com/zixun/201/852315.html +https://www.zyctd.com/zixun/201/283156.html +https://www.zyctd.com/zixun/201/262484.html +https://www.zyctd.com/zixun/201/1065225.html +https://www.zyctd.com/zixun/201/763331.html +https://www.zyctd.com/zixun/201/1066158.html +https://www.zyctd.com/zixun/201/1047744.html +https://www.zyctd.com/zixun/201/842795.html +https://www.zyctd.com/zixun/201/975374.html +https://www.zyctd.com/zixun/201/1055865.html +https://www.zyctd.com/zixun/201/1017367.html +https://www.zyctd.com/zixun/201/1057711.html +https://www.zyctd.com/zixun/201/1074295.html +https://www.zyctd.com/zixun/201/283647.html +https://www.zyctd.com/zixun/201/286896.html +https://www.zyctd.com/zixun/201/1043393.html +https://www.zyctd.com/zixun/201/305888.html +https://www.zyctd.com/zixun/201/487258.html +https://www.zyctd.com/zixun/201/1045652.html +https://www.zyctd.com/zixun/201/1064905.html +https://www.zyctd.com/zixun/201/515636.html +https://www.zyctd.com/zixun/201/1038609.html +https://www.zyctd.com/zixun/201/438083.html +https://www.zyctd.com/zixun/201/297327.html +https://www.zyctd.com/zixun/201/773537.html +https://www.zyctd.com/zixun/201/1043589.html +https://www.zyctd.com/zixun/201/815712.html +https://www.zyctd.com/zixun/201/698595.html +https://www.zyctd.com/zixun/201/269800.html +https://www.zyctd.com/zixun/201/1030332.html +https://www.zyctd.com/zixun/201/422676.html +https://www.zyctd.com/zixun/201/290130.html +https://www.zyctd.com/zixun/201/270359.html +https://www.zyctd.com/zixun/201/995604.html +https://www.zyctd.com/zixun/201/1074993.html +https://www.zyctd.com/zixun/201/1054825.html +https://www.zyctd.com/zixun/201/918577.html 
+https://www.zyctd.com/zixun/201/686527.html +https://www.zyctd.com/zixun/201/297509.html +https://www.zyctd.com/zixun/201/622708.html +https://www.zyctd.com/zixun/201/469870.html +https://www.zyctd.com/zixun/201/844328.html +https://www.zyctd.com/zixun/201/394508.html +https://www.zyctd.com/zixun/201/271744.html +https://www.zyctd.com/zixun/201/1054940.html +https://www.zyctd.com/zixun/201/732818.html +https://www.zyctd.com/zixun/201/1049547.html +https://www.zyctd.com/zixun/201/1059684.html +https://www.zyctd.com/zixun/201/1055301.html +https://www.zyctd.com/zixun/201/962068.html +https://www.zyctd.com/zixun/201/451355.html +https://www.zyctd.com/zixun/201/1056174.html +https://www.zyctd.com/zixun/201/930540.html +https://www.zyctd.com/zixun/201/871656.html +https://www.zyctd.com/zixun/201/363246.html +https://www.zyctd.com/zixun/201/845672.html +https://www.zyctd.com/zixun/201/452965.html +https://www.zyctd.com/zixun/201/1065920.html +https://www.zyctd.com/zixun/201/1058808.html +https://www.zyctd.com/zixun/201/986868.html +https://www.zyctd.com/zixun/201/489785.html +https://www.zyctd.com/zixun/201/307946.html +https://www.zyctd.com/zixun/201/833359.html +https://www.zyctd.com/zixun/201/806969.html +https://www.zyctd.com/zixun/201/1050812.html +https://www.zyctd.com/zixun/201/1033696.html +https://www.zyctd.com/zixun/201/501167.html +https://www.zyctd.com/zixun/201/1078919.html +https://www.zyctd.com/zixun/201/1036495.html +https://www.zyctd.com/zixun/201/1008736.html +https://www.zyctd.com/zixun/201/1054264.html +https://www.zyctd.com/zixun/201/493152.html +https://www.zyctd.com/zixun/201/685456.html +https://www.zyctd.com/zixun/201/995597.html +https://www.zyctd.com/zixun/201/905501.html +https://www.zyctd.com/zixun/201/347573.html +https://www.zyctd.com/zixun/201/1045494.html +https://www.zyctd.com/zixun/201/549775.html +https://www.zyctd.com/zixun/201/1037336.html +https://www.zyctd.com/zixun/201/1034972.html +https://www.zyctd.com/zixun/201/653046.html 
+https://www.zyctd.com/zixun/201/316612.html +https://www.zyctd.com/zixun/201/447064.html +https://www.zyctd.com/zixun/201/307603.html +https://www.zyctd.com/zixun/201/263437.html +https://www.zyctd.com/zixun/201/894490.html +https://www.zyctd.com/zixun/201/368629.html +https://www.zyctd.com/zixun/201/273285.html +https://www.zyctd.com/zixun/201/1059618.html +https://www.zyctd.com/zixun/201/459237.html diff --git a/proxy.txt b/proxy.txt new file mode 100644 index 0000000..199a16c --- /dev/null +++ b/proxy.txt @@ -0,0 +1 @@ +127.0.0.1:7897 \ No newline at end of file diff --git a/src/main/java/com/example/AusContent.java b/src/main/java/com/example/AusContent.java new file mode 100644 index 0000000..f71c2d8 --- /dev/null +++ b/src/main/java/com/example/AusContent.java @@ -0,0 +1,119 @@ +package com.example; + +import okhttp3.*; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.IOException; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; + +public class AusContent { + public static void main(String[] args) throws IOException { + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("text/plain"); + RequestBody body = RequestBody.create(mediaType, ""); + Request request = new Request.Builder() + .url("https://www.anzctr.org.auTrial/Registration/TrialReview.aspx?id=389345&isReview=true") + .get() + .build(); + Response response = client.newCall(request).execute(); + String html = response.body().string(); + Document parse = Jsoup.parse(html); + String title = parse.select("#ctl00_body_CXSTUDYTITLE").text(); + String registNum = parse.select("#ctl00_body_CXACTRNUMBER").text(); + String registTime = convertDate(parse.select("#ctl00_body_CXAPPROVALDATE").text()); + String sponsor = 
parse.select("#ctl00_body_repeater_TXFUNDINGSOURCE_ctl00_CXTYPE").text(); + String studyType = parse.select("#ctl00_body_CXSTUDYTYPE").text(); + String phase = parse.select("#ctl00_body_CXPHASE").text(); + String disease = parse.select("#ctl00_body_repeater_TXHEALTHCONDITION_ctl00_CXHEALTHCONDITION").text(); + String SD1 = parse.select("#ctl00_body_CXPURPOSE").text(); + String SD2 = parse.select("#ctl00_body_CXALLOCATION").text(); + String SD3 = parse.select("#ctl00_body_CXCONCEALMENT").text(); + String SD4 = parse.select("#ctl00_body_CXSEQUENCE").text(); + String SD5 = parse.select("#ctl00_body_CXMASKING").text(); + String SD6 = parse.select("#ctl00_body_maskingdiv > div > div.review-element-content").text(); + String SD7 = parse.select("#ctl00_body_CXASSIGNMENT").text(); + String SD8 = parse.select("#ctl00_body_CXPHASE").text(); + String SD9 = parse.select("#ctl00_body_CXENDPOINT").text(); + String SD10 = parse.select("#ctl00_body_CXSTATISTICALMETHODS").text(); + String SD11 = parse.select("#ctl00_body_interventional_div > div:nth-child(8) > div > div.review-element-content").text(); + String studyObjective = parse.select("#ctl00_body_CXPURPOSE").text(); + String inclusionCriteria = parse.select("#ctl00_body_CXINCLUSIVECRITERIA").text(); + String exclusionCriteria = parse.select("#ctl00_body_CXEXCLUSIVECRITERIA").text(); + String currentStatus = parse.select("#ctl00_body_CXRECRUITMENTSTATUS").text(); + String enrollment = parse.select("#ctl00_body_CXSAMPLESIZE").text(); + String country = parse.select("#ctl00_body_repeater_TXCOUNTRYOUTSIDEAUSTRALIA_ctl01_CXCOUNTRY").text(); + String intervention = parse.select("#ctl00_body_trialDiv > div:nth-child(30) > div > div.review-element-content").text(); + Map studyDesign = new HashMap<>(); + studyDesign.put("Purpose of the study",SD1); + studyDesign.put("Allocation to intervention",SD2); + studyDesign.put("Procedure for enrolling a subject and allocating the treatment (allocation concealment procedures)",SD3); + 
studyDesign.put("Methods used to generate the sequence in which subjects will be randomised (sequence generation)",SD4); + studyDesign.put("Masking / blinding",SD5); + studyDesign.put("Who is / are masked / blinded?",SD6); + studyDesign.put("Intervention assignment",SD7); + studyDesign.put("Other design features",SD11); + studyDesign.put("Phase",SD8); + studyDesign.put("Type of endpoint/s",SD9); + studyDesign.put("Statistical methods / analysis",SD10); + Map resultData = new HashMap<>(); + resultData.put("title",title); + resultData.put("registNum",registNum); + resultData.put("registTime",registTime); + resultData.put("registStatus",""); + resultData.put("registTitle",""); + resultData.put("fullTitle",""); + resultData.put("sponsor",sponsor); + resultData.put("sponsorPart",""); + resultData.put("studyType",studyType); + resultData.put("phase",phase); + resultData.put("disease",disease); + resultData.put("studyDesign",studyDesign); + resultData.put("studyObjective",studyObjective); + resultData.put("studyStartDate",""); + resultData.put("inclusionCriteria",inclusionCriteria); + resultData.put("exclusionCriteria",exclusionCriteria); + resultData.put("currentStatus",currentStatus); + resultData.put("enrollment",enrollment); + resultData.put("country",country); + resultData.put("tagTime",""); + resultData.put("intervention",intervention); + resultData.put("primaryOutcome",""); + resultData.put("crawlTime",getCurrentTime()); +// resultData.put("crawlUrl",url); + resultData.put("postTime",registTime); + resultData.put("content","content"); + resultData.put("forwardcontent","forwardcontent"); + System.out.println(resultData); + } + public static String convertDate(String inputDate) { + try { + + SimpleDateFormat inputFormat = new SimpleDateFormat("d/MM/yyyy"); + + Date date = inputFormat.parse(inputDate); + + SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + + return outputFormat.format(date); + } catch (ParseException e) { + + return 
"Invalid date format"; + } + } + public static String getCurrentTime() { + // 创建 DateTimeFormatter,指定输出格式 + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + // 获取当前时间 + LocalDateTime now = LocalDateTime.now(); + // 格式化 + return now.format(formatter); + } +} diff --git a/src/main/java/com/example/AusList.java b/src/main/java/com/example/AusList.java new file mode 100644 index 0000000..eaae8f9 --- /dev/null +++ b/src/main/java/com/example/AusList.java @@ -0,0 +1,200 @@ +package com.example; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class AusList { + public static void main(String[] args) throws Exception { + String targetUrl = "https://www.anzctr.org.au/TrialSearch.aspx?page=20"; + String baseUrl = "https://www.anzctr.org.au/TrialSearch.aspx"; + String postUrl = "https://www.anzctr.org.au/TrialSearch.aspx"; + String pageNumber = targetUrl.contains("?page=") ? 
targetUrl.split("page=")[1] : "1"; + int page = Integer.parseInt(pageNumber); + System.out.println("Page Number: " + page); + // 存储 cookies + Set<String> cookieSet = new HashSet<>(); + String sessionId = null; + + // 第一步:初始 GET 请求,获取 cookies 和 ViewState + URL initialUrl = new URL(baseUrl); + HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); + initialConn.setRequestMethod("GET"); + initialConn.setRequestProperty("User-Agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + initialConn.setRequestProperty("Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + initialConn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7"); + initialConn.setRequestProperty("Cache-Control", "no-cache"); + initialConn.setRequestProperty("Pragma", "no-cache"); + initialConn.setRequestProperty("Upgrade-Insecure-Requests", "1"); + initialConn.setRequestProperty("Sec-Fetch-Dest", "document"); + initialConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); + initialConn.setRequestProperty("Sec-Fetch-Site", "same-origin"); + initialConn.setRequestProperty("Sec-Fetch-User", "?1"); + + initialConn.setRequestProperty("Sec-CH-UA", + "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\""); + initialConn.setRequestProperty("Sec-CH-UA-Mobile", "?0"); + initialConn.setRequestProperty("Sec-CH-UA-Platform", "\"Windows\""); + +// initialConn.setRequestProperty("Cookie", +// "ASP.NET_SessionId=gkhw0unpeytexsa40v1sdjf1; __utma=2822752...; _ga=..."); + + initialConn.setInstanceFollowRedirects(false); + initialConn.setConnectTimeout(10000); + initialConn.setReadTimeout(10000); + + // 捕获 cookies + sessionId = updateCookies(initialConn, cookieSet); + + // 读取响应内容以获取 ViewState + BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); +
StringBuilder content = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + content.append(inputLine); + } + in.close(); + initialConn.disconnect(); + + // 提取初始 ViewState + Map<String, String> viewStateData = extractViewStateData(content.toString()); + String viewState = viewStateData.get("__VIEWSTATE"); + String viewStateGen = viewStateData.get("__VIEWSTATEGENERATOR"); + String eventValidation = viewStateData.get("__EVENTVALIDATION"); + String payload = buildPostData(viewState,eventValidation,viewStateGen,page,sessionId); + + HttpURLConnection conn = (HttpURLConnection) new URL(postUrl).openConnection(); + conn.setRequestMethod("POST"); + conn.setDoOutput(true); + conn.setInstanceFollowRedirects(false); + conn.setConnectTimeout(10000); + conn.setReadTimeout(10000); + + // 设置请求头(仿浏览器) + conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + conn.setRequestProperty("Accept", "*/*"); + conn.setRequestProperty("X-Requested-With", "XMLHttpRequest"); + conn.setRequestProperty("X-MicrosoftAjax", "Delta=true"); + conn.setRequestProperty("Referer", "https://www.anzctr.org.au/TrialSearch.aspx"); + conn.setRequestProperty("Origin", "https://www.anzctr.org.au"); + + + // 构建 POST 表单数据 + String postData = payload; + // 写入 POST 数据 + try (OutputStream os = conn.getOutputStream()) { + byte[] input = postData.getBytes(StandardCharsets.UTF_8); + os.write(input); + } + + // 读取响应 + BufferedReader re = new BufferedReader(new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8)); + StringBuilder response = new StringBuilder(); + String line; + while ((line = re.readLine()) != null) { + response.append(line); + } + String html = response.toString(); + Document parse = Jsoup.parse(html); + Elements elements =parse.select(".results-header-tools a"); + for
(Element element:elements){ + String link = "https://www.anzctr.org.au" + element.attr("href"); + System.out.println(link); + } + re.close(); + conn.disconnect(); + } + + // 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 + private static String updateCookies(HttpURLConnection conn, Set<String> cookieSet) { + String sessionId = null; + Map<String, List<String>> headerFields = conn.getHeaderFields(); + List<String> cookiesHeader = headerFields.get("Set-Cookie"); + if (cookiesHeader != null) { + for (String cookie : cookiesHeader) { + String cookieValue = cookie.split(";")[0]; + cookieSet.add(cookieValue); + if (cookieValue.startsWith("ASP.NET_SessionId=") || cookieValue.startsWith("csfcfc=")) { + sessionId = cookieValue; + } + } + } + return sessionId; + } + // 提取 __VIEWSTATE 隐藏字段的值 + private static Map<String, String> extractViewStateData(String html) { + Map<String, String> stateMap = new HashMap<>(); + + // 使用三个独立正则提取三个字段 + extractHiddenField(html, "__VIEWSTATE", stateMap); + extractHiddenField(html, "__VIEWSTATEGENERATOR", stateMap); + extractHiddenField(html, "__EVENTVALIDATION", stateMap); + + if (!stateMap.containsKey("__VIEWSTATE")) { + System.err.println("Failed to extract __VIEWSTATE from HTML"); + } + if (!stateMap.containsKey("__EVENTVALIDATION")) { + System.err.println("Failed to extract __EVENTVALIDATION from HTML"); + } + if (!stateMap.containsKey("__VIEWSTATEGENERATOR")) { + System.err.println("Failed to extract __VIEWSTATEGENERATOR from HTML"); + } + return stateMap; + } + + private static void extractHiddenField(String html, String fieldName, Map<String, String> map) { + String regex = "(?i)<input[^>]*name=[\"']" + fieldName + "[\"'][^>]*value=[\"']([^\"']+)[\"']"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(html); + + if (matcher.find()) { + map.put(fieldName, matcher.group(1)); + } + } + + private static String buildPostData(String viewState, String eventValidation, String viewStateGen, int page, String sessionId) { + try { + // 按照真实请求体的顺序和字段进行构建 + String payload = ""; + payload +=
URLEncoder.encode("ctl00$body$tsmAJAXScriptManager", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("ctl00$body$tsmAJAXScriptManager|ctl00$body$tsmAJAXScriptManager", StandardCharsets.UTF_8.name()) + "&"; + payload += URLEncoder.encode("ctl00_body_tsmAJAXScriptManager_HiddenField", StandardCharsets.UTF_8.name()) + "=&"; // 添加缺失字段 + payload += URLEncoder.encode("__EVENTTARGET", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("ctl00$body$tsmAJAXScriptManager", StandardCharsets.UTF_8.name()) + "&"; + payload += URLEncoder.encode("__EVENTARGUMENT", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("conditionCode=&dateOfRegistrationFrom=&interventionDescription=&interventionCodeOperator=OR&primarySponsorType=&gender=&distance=&postcode=&pageSize=20&ageGroup=&recruitmentCountryOperator=OR&recruitmentRegion=&ethicsReview=&countryOfRecruitment=&registry=&searchTxt=&studyType=&allocationToIntervention=&dateOfRegistrationTo=&recruitmentStatus=&interventionCode=&healthCondition=&healthyVolunteers=&page="+page+"&conditionCategory=&fundingSource=&trialStartDateTo=&trialStartDateFrom=&phase=", StandardCharsets.UTF_8.name()) + "&"; // 注意这里的参数字符串是完整的 + payload += URLEncoder.encode("__LASTFOCUS", StandardCharsets.UTF_8.name()) + "=&"; + payload += URLEncoder.encode("__VIEWSTATE", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()) + "&"; + payload += URLEncoder.encode("__VIEWSTATEGENERATOR", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode(viewStateGen, StandardCharsets.UTF_8.name()) + "&"; + payload += URLEncoder.encode("__SCROLLPOSITIONX", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("0", StandardCharsets.UTF_8.name()) + "&"; // 添加缺失字段 + payload += URLEncoder.encode("__SCROLLPOSITIONY", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("0", StandardCharsets.UTF_8.name()) + "&"; // 添加缺失字段 + payload += URLEncoder.encode("__EVENTVALIDATION", StandardCharsets.UTF_8.name()) + "=" +
URLEncoder.encode(eventValidation, StandardCharsets.UTF_8.name()) + "&"; + + // ... 添加并按顺序排列其他所有字段,确保名称、值、编码与真实请求体一致 ... + + // 确保最后一个字段后面没有 & + payload += URLEncoder.encode("__ASYNCPOST", StandardCharsets.UTF_8.name()) + "=" + URLEncoder.encode("true", StandardCharsets.UTF_8.name()); + + return payload; + + } catch (Exception e) { + System.err.println("Error building POST data: " + e.getMessage()); + return ""; + } + } + +} diff --git a/src/main/java/com/example/CaptchaOCR.java b/src/main/java/com/example/CaptchaOCR.java new file mode 100644 index 0000000..f9f6c53 --- /dev/null +++ b/src/main/java/com/example/CaptchaOCR.java @@ -0,0 +1,173 @@ +package com.example; + +import java.awt.image.BufferedImage; +import java.io.*; +import java.net.HttpURLConnection; +import java.net.URL; +import javax.imageio.ImageIO; +import net.sourceforge.tess4j.Tesseract; +import net.sourceforge.tess4j.TesseractException; + +// ... 其他必要的导入 ... + +public class CaptchaOCR { + + // Tesseract data 路径 (tessdata 文件夹所在目录) + // Windows 示例: "C:\\Program Files\\Tesseract-OCR\\tessdata" + // Linux/macOS 示例: 通常不需要设置,Tess4J 会自动查找 + private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata"; // 根据你的安装路径修改 + + /** + * 下载验证码图片 + * @param imageUrl 图片的完整 URL + * @return 图片的 BufferedImage 对象 + * @throws IOException 如果下载失败 + */ + public static BufferedImage downloadImage(String imageUrl) throws IOException { + URL url = new URL(imageUrl); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("GET"); + // 添加 User-Agent 等必要的请求头,模拟浏览器 + conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + // ... 其他头 ... 
+ + int responseCode = conn.getResponseCode(); + if (responseCode == HttpURLConnection.HTTP_OK) { + try (InputStream is = conn.getInputStream()) { + // 将输入流读取到字节数组,ImageIO 从字节数组读取更稳定 + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[4096]; // 缓冲区大小,可以调整 + int bytesRead; + while ((bytesRead = is.read(buffer)) != -1) { + baos.write(buffer, 0, bytesRead); + } + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + + BufferedImage image = ImageIO.read(bais); + + if (image == null) { + throw new IOException("Failed to read image from stream. Check image format."); + } + return image; + } + } else { + throw new IOException("Failed to download image. HTTP error code: " + responseCode); + } + } + + /** + * 对验证码图片进行预处理 (基础示例:转灰度+二值化) + * 这是最关键的部分,需要根据验证码样式调整 + * @param originalImage 原始图片 + * @return 预处理后的图片 + */ + public static BufferedImage preprocessImage(BufferedImage originalImage) { + // TODO: 这里是图像预处理的重点,需要根据实际验证码样式进行调整和优化 + // 基础处理:转灰度 -> 二值化 + int width = originalImage.getWidth(); + int height = originalImage.getHeight(); + BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY); + grayImage.getGraphics().drawImage(originalImage, 0, 0, null); + + BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + // 二值化阈值,可能需要调整 (0-255) + int threshold = 128; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + int gray = grayImage.getRaster().getSample(x, y, 0); + if (gray < threshold) { + binaryImage.getRaster().setSample(x, y, 0, 0); // 黑色 + } else { + binaryImage.getRaster().setSample(x, y, 0, 1); // 白色 + } + } + } + + // TODO: 更高级的预处理包括: + // - 去除干扰线、噪点 + // - 字符分割(如果字符粘连) + // - 倾斜校正 + // - 调整亮度和对比度等 + // 你可能需要引入更专业的图像处理库或算法 + + // 为了调试,可以将预处理后的图片保存下来查看效果 + try { + File outputfile = new File("preprocessed_captcha.png"); + ImageIO.write(binaryImage, "png", outputfile); + System.out.println("Preprocessed image saved to " + 
outputfile.getAbsolutePath()); + } catch (IOException e) { + e.printStackTrace(); + } + + return binaryImage; // 返回预处理后的图片 + } + + /** + * 使用 Tess4J 识别图片中的文字 + * @param image 待识别的图片 (最好是预处理后的) + * @return 识别出的字符串 + */ + public static String recognizeCaptcha(BufferedImage image) { + Tesseract tesseract = new Tesseract(); + + // 设置 tessdata 路径 (如果 TESSDATA_PATH 已正确设置且 Tesseract 安装正确,这行可能不是必需的,Tess4J 会自动查找) + // 但显式设置更保险 + if (TESSDATA_PATH != null && !TESSDATA_PATH.isEmpty()) { + tesseract.setDatapath(TESSDATA_PATH); + } else { + System.out.println("TESSDATA_PATH not set. Tess4J will try to find tessdata automatically."); + } + + + tesseract.setLanguage("eng"); // 设置识别语言为英文 (通常包含数字) + // 如果验证码只有数字,可以尝试设置仅识别数字 + // tesseract.setTessVariable("tessedit_char_whitelist", "0123456789"); + + try { + String result = tesseract.doOCR(image); + // 清理识别结果,去除空格或换行符等 + result = result.trim().replaceAll("[^0-9a-zA-Z]", ""); // 根据验证码内容调整清理规则 + System.out.println("OCR Result: " + result); + return result; + } catch (TesseractException e) { + System.err.println("Error during OCR: " + e.getMessage()); + return null; // 识别失败 + } + } + + // 示例如何在你的爬虫流程中使用 + public static void main(String[] args) { + String captchaImageUrl = "YOUR_CAPTCHA_IMAGE_URL"; // 从页面解析获取到的验证码图片 URL + + try { + // 1. 下载图片 + BufferedImage originalCaptchaImage = downloadImage(captchaImageUrl); + System.out.println("Image downloaded."); + + // 2. 预处理图片 + BufferedImage preprocessedImage = preprocessImage(originalCaptchaImage); + System.out.println("Image preprocessed."); + + // 3. 识别验证码 + String captchaCode = recognizeCaptcha(preprocessedImage); + + if (captchaCode != null && !captchaCode.isEmpty()) { + System.out.println("Recognized CAPTCHA: " + captchaCode); + // 4. 将 captchaCode 填入 POST 数据中,提交表单 + // ... (你的 ASP.NET WebForms POST 提交代码,将 captchaCode 放到对应的隐藏字段或输入框字段中) ... + // 例如:postData += "&captchaInputFieldName=" + URLEncoder.encode(captchaCode, StandardCharsets.UTF_8.name()); + // ... 提交 POST 请求 ... 
+ + } else { + System.out.println("Failed to recognize CAPTCHA."); + // 5. 处理识别失败的情况,可能需要重试或记录日志 + } + + } catch (IOException e) { + System.err.println("Error downloading or processing image: " + e.getMessage()); + } + // catch (URISyntaxException e) { + // System.err.println("Invalid URL: " + e.getMessage()); + // } // 如果你的 downloadImage 方法 throws URISyntaxException + } +} \ No newline at end of file diff --git a/src/main/java/com/example/CsAirScraper.java b/src/main/java/com/example/CsAirScraper.java new file mode 100644 index 0000000..b47cf8d --- /dev/null +++ b/src/main/java/com/example/CsAirScraper.java @@ -0,0 +1,81 @@ +package com.example; + +import io.github.bonigarcia.wdm.WebDriverManager; +import org.apache.hc.client5.http.classic.methods.HttpPost; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.core5.http.io.entity.StringEntity; +import org.apache.hc.core5.http.io.entity.EntityUtils; +import org.openqa.selenium.Cookie; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; + +import java.util.Set; +import java.util.stream.Collectors; + +public class CsAirScraper { + + public static void main(String[] args) throws Exception { + // 1. 启动 Selenium,访问南航主站 + WebDriverManager.chromedriver().setup(); + WebDriver driver = new ChromeDriver(); + driver.get("https://b2c.csair.com/portal/main/flight/direct/query"); + + // 等待 Cookie 被 JS 设置(稍等几秒) + Thread.sleep(5000); // 可根据实际页面响应调整等待时间 + + // 2. 获取浏览器中所有 Cookie + Set seleniumCookies = driver.manage().getCookies(); + String cookieHeader = seleniumCookies.stream() + .map(c -> c.getName() + "=" + c.getValue()) + .collect(Collectors.joining("; ")); + + System.out.println("获取到 Cookie: " + cookieHeader); + driver.quit(); // 关闭浏览器 + + // 3. 
准备 HttpClient 请求,携带 Cookie + try (CloseableHttpClient httpClient = HttpClients.createDefault()) { + HttpPost post = new HttpPost("https://b2c.csair.com/portal/main/flight/direct/query"); + + // 设置请求头 + post.setHeader("Content-Type", "application/json"); + post.setHeader("Cookie", cookieHeader); + post.setHeader("User-Agent", "Mozilla/5.0"); + + // 设置请求体(JSON) + String json = "{" + + "\"action\": \"0\"," + + "\"adultNum\": \"1\"," + + "\"airLine\": 1," + + "\"arrCity\": \"PKX\"," + + "\"businessType\": \"COMMON\"," + + "\"cabinOrder\": \"0\"," + + "\"cache\": 0," + + "\"childNum\": \"0\"," + + "\"depCity\": \"CAN\"," + + "\"flightDate\": \"20250514\"," + + "\"flyType\": 0," + + "\"infantNum\": \"0\"," + + "\"international\": \"0\"," + + "\"isMember\": \"\"," + + "\"isMultipass\": 1," + + "\"language\": \"zh\"," + + "\"preUrl\": \"\"," + + "\"segType\": \"1\"," + + "\"tariffRules\": []" + + "}"; + + + post.setEntity(new StringEntity(json)); + + // 4. 发请求 + try (CloseableHttpResponse response = httpClient.execute(post)) { + int code = response.getCode(); + String result = EntityUtils.toString(response.getEntity()); + System.out.println("状态码: " + code); + System.out.println("响应: " + result); + } + } + } +} diff --git a/src/main/java/com/example/CtriScraper.java b/src/main/java/com/example/CtriScraper.java new file mode 100644 index 0000000..3ff578f --- /dev/null +++ b/src/main/java/com/example/CtriScraper.java @@ -0,0 +1,404 @@ +package com.example; + +import com.fasterxml.jackson.databind.ObjectMapper; +import okhttp3.*; +import org.apache.hc.client5.http.cookie.BasicCookieStore; +import org.apache.hc.client5.http.cookie.CookieStore; +import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.classic.methods.HttpPost; +import org.apache.hc.client5.http.entity.UrlEncodedFormEntity; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import 
org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.client5.http.protocol.HttpClientContext; +import org.apache.hc.core5.http.HttpEntity; +import org.apache.hc.core5.http.NameValuePair; +import org.apache.hc.core5.http.io.entity.EntityUtils; +import org.apache.hc.core5.http.message.BasicNameValuePair; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.Month; +import java.time.Year; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.nio.charset.StandardCharsets; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class CtriScraper { + + private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php"; + + private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php"; + + private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'"); + + private static final String TOPIC_NAME = "cliniTopic"; + private static final String BOOTSTRAP_SERVERS = "node-01:19092"; + private static KafkaProducer producer; + private static ObjectMapper objectMapper = new ObjectMapper(); + private static final Random random = new Random(); + + static { + Properties props = new Properties(); + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 
"org.apache.kafka.common.serialization.StringSerializer"); + props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认 + props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数 + producer = new KafkaProducer<>(props); + + } + public static List getlink(Integer year, Integer month) { + List linkList = new ArrayList<>(); // 用于存放提取到的链接 + // 用于存储和管理 Cookies + CookieStore cookieStore = new BasicCookieStore(); + // 用于在请求之间维护状态,特别是关联 CookieStore + HttpClientContext context = HttpClientContext.create(); + context.setCookieStore(cookieStore); + + // 使用 try-with-resources 确保 HttpClient 被正确关闭 + try (CloseableHttpClient httpClient = HttpClients.custom() + .setDefaultCookieStore(cookieStore) // 将cookie store绑定到client + .build()) { + + // --- Step 1 & 2: 发送 GET 请求获取表单页面并解析 --- + // System.out.println("Fetching search form page..."); // 调试信息可以按需保留或删除 + HttpGet getRequest = new HttpGet(SEARCH_FORM_URL); + // 添加一些伪装的 Headers 模拟浏览器访问 + getRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); + getRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); + getRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"); + + String formPageHtml = null; + try (CloseableHttpResponse response = httpClient.execute(getRequest, context)) { + int statusCode = response.getCode(); + // System.out.println("GET Response Status: " + statusCode); // 调试信息 + + if (statusCode != 200) { + System.err.println("Error: GET request to form page failed with status code: " + statusCode); + EntityUtils.consume(response.getEntity()); // 确保消费掉实体,释放连接 + return null; // 获取表单页面失败,返回 null + } + + HttpEntity entity = response.getEntity(); + if (entity != null) { + formPageHtml = EntityUtils.toString(entity, StandardCharsets.UTF_8); + EntityUtils.consume(entity); // 确保实体内容被完全消费 + } else { + 
System.err.println("Error: Failed to get form page entity."); + return null; // 获取页面内容失败,返回 null + } + } + // System.out.println("Form page fetched successfully."); // 调试信息 + + // --- Step 3 & 4: 解析 HTML 提取 csrf_token 和 __ncforminfo --- + Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL); // 传入 base URI 有助于处理相对路径 + + // 查找隐藏的输入字段 + Element csrfTokenInput = doc.selectFirst("input[name=csrf_token][type=hidden]"); + Element ncFormInfoInput = doc.selectFirst("input[name=__ncforminfo][type=hidden]"); + + String csrfToken = null; + String ncFormInfo = null; + + if (csrfTokenInput != null) { + csrfToken = csrfTokenInput.val(); + // System.out.println("Extracted csrf_token: " + csrfToken); // 调试信息 + } else { + System.err.println("Warning: Could not find csrf_token input field."); + return null; // 缺少关键 token,返回 null + } + + if (ncFormInfoInput != null) { + ncFormInfo = ncFormInfoInput.val(); + // System.out.println("Extracted __ncforminfo: " + ncFormInfo); // 调试信息 + } else { + System.err.println("Warning: Could not find __ncforminfo input field."); + return null; // 缺少关键 token,返回 null + } + + // 如果必要的 token 没有获取到,可能无法继续 (虽然上面的检查已经覆盖,这里作为双重保险) + if (csrfToken == null || ncFormInfo == null) { + System.err.println("Error: Missing required tokens. 
Cannot proceed with POST request."); + return null; + } + + // --- Step 5 & 6: 构建 POST 请求参数并发送 --- + // System.out.println("\nPreparing POST request..."); // 调试信息 + HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL); + // 添加 Headers 模拟浏览器提交表单 + postRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); + // 重要:设置 Referer Header + postRequest.setHeader("Referer", SEARCH_FORM_URL); + // 添加 Origin Header + postRequest.setHeader("Origin", "https://ctri.nic.in"); + postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded"); + postRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); + postRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"); + postRequest.setHeader("Pragma", "no-cache"); + + List params = new ArrayList<>(); + // 添加你之前分析的载荷中的所有参数,使用获取到的动态值 + params.add(new BasicNameValuePair("stid", "1")); // 注意 stid 之前有两个,这里用 1 + params.add(new BasicNameValuePair("csrf_token", csrfToken)); // 使用获取到的动态 token + params.add(new BasicNameValuePair("pros", "1")); + params.add(new BasicNameValuePair("month", String.format("%02d", month))); // 格式化月份为两位数 + params.add(new BasicNameValuePair("year", String.valueOf(year))); + params.add(new BasicNameValuePair("study", "0")); + params.add(new BasicNameValuePair("sdid", "0")); + params.add(new BasicNameValuePair("phaseid", "0")); + params.add(new BasicNameValuePair("psponsor", "0")); + params.add(new BasicNameValuePair("recid", "0")); + params.add(new BasicNameValuePair("state", "0")); + params.add(new BasicNameValuePair("district", "0")); + params.add(new BasicNameValuePair("searchword", "")); + params.add(new BasicNameValuePair("T4", "anyvalue")); // T4既然无效,随便填 + params.add(new BasicNameValuePair("btt", "Search")); + params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo)); // 使用获取到的动态值 
+ + // 将参数列表设置到请求体中 + postRequest.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8)); + + // System.out.println("Executing POST request to submit form..."); // 调试信息 + try (CloseableHttpResponse postResponse = httpClient.execute(postRequest, context)) { + int postStatusCode = postResponse.getCode(); + // System.out.println("POST Response Status: " + postStatusCode); // 打印状态码 + + if (postStatusCode != 200) { + System.err.println("Error: POST request to submit form failed with status code: " + postStatusCode); + EntityUtils.consume(postResponse.getEntity()); // 确保消费掉实体,释放连接 + return null; // 提交表单失败,返回 null + } + + + HttpEntity postEntity = postResponse.getEntity(); + + if (postEntity != null) { + String searchResultsHtml = EntityUtils.toString(postEntity, StandardCharsets.UTF_8); + EntityUtils.consume(postEntity); // 确保实体内容被完全消费 + + // --- Step 7: 处理搜索结果页面 --- + // System.out.println("\nParsing search results..."); // 调试信息 + + Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL); + + Elements links = resultsDoc.select("tr a"); + + for (Element linkElement : links) { + String rawLink = linkElement.attr("href"); + // System.out.println("Processing raw link: " + rawLink); // 调试信息 + + // 使用预编译的正则表达式 Pattern + Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink); + + // 查找匹配项 + if (matcher.find()) { + String extractedContent = matcher.group(1); // 提取单引号内的内容 + // 构建完整的链接 URL + String fullLink = "https://ctri.nic.in/Clinicaltrials/" + extractedContent; + linkList.add(fullLink); // 将完整链接添加到列表中 + // System.out.println("Added link: " + fullLink); // 调试信息 + } else { + // 如果链接不符合模式,打印警告并跳过 + System.err.println("Warning: Link does not match expected pattern: " + rawLink); + } + } + + // --- 返回提取到的链接列表 --- + // 循环结束后,返回收集到的所有链接 + // System.out.println("Finished link extraction. 
Returning list."); // 调试信息 + return linkList; + + + } else { + System.err.println("Error: Failed to get search results entity."); + return null; // 获取结果内容失败,返回 null + } + } + + } catch (IOException e) { + // 处理网络请求相关的异常 + System.err.println("Network or IO error during scraping:"); + e.printStackTrace(); + return null; // 发生 IO 错误,返回 null + } catch (Exception e) { + // 处理其他可能的异常,例如解析错误或 NPE + System.err.println("An unexpected error occurred during scraping:"); + e.printStackTrace(); + return null; // 发生其他错误,返回 null + } + } + public static void main(String[] args) { + for (Integer year = Year.now().getValue(); year >= 2024; year--) { + int monthStart = (year == Year.now().getValue()) ? LocalDate.now().getMonthValue() : 12; + + for (Integer month = monthStart; month >= 1; month--) { + try { + List links = getlink(year, month); + if (links == null) { + System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!"); + continue; + } + + if (links.isEmpty()) { + System.out.println("年份 " + year + " 月份 " + month + " 无数据!"); + continue; + } + + int sleepTime = random.nextInt(1001) + 3000; + int count = 0; + + for (String url : links) { + try { + Map result = reslutData(url); + result.put("crawlUrl", url); + + String registNum = String.valueOf(result.get("registNum")); + String jsonValue = objectMapper.writeValueAsString(result); + + ProducerRecord record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue); + + producer.send(record, (metadata, exception) -> { + if (exception == null) { + System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url); + } else { + System.err.println("❌ Kafka 发送失败: " + exception.getMessage()); + } + }); + + Thread.sleep(sleepTime); // 控制节奏 + count++; + } catch (Exception e) { + System.err.println("抓取或发送失败: " + url); + e.printStackTrace(); + } + } + + System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。"); + + } catch (Exception e) { + System.err.println("处理年份 " + year + " 月份 " 
+ month + " 失败: " + e.getMessage()); + e.printStackTrace(); + } + } + } + // 关闭 producer + producer.close(); + } + + public static Map reslutData(String url) throws IOException { + Map resultData = new HashMap<>(); + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("text/plain"); + RequestBody body = RequestBody.create(mediaType, ""); + Request request = new Request.Builder() + .url(url) + .get() + .build(); + Response response = client.newCall(request).execute(); + String html = response.body().string(); + Document parse = Jsoup.parse(html); + String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text(); + String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text(); + String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text()); + Map sponsor = new HashMap<>(); + String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text(); + String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text(); + sponsor.put("Source of Monetary or Material Support",SMMS); + sponsor.put("Primary Sponsor",primarySponsor); + String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text(); + String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text(); + Map disease = new HashMap<>(); + String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > 
tr:nth-child(2) > td:nth-child(1)").text(); + String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text(); + disease.put("healthType",healthType); + disease.put("condition",condition); + String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text(); + String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text(); + String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text(); + String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text(); + String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text(); + String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text(); + Map primaryOutcome = new HashMap<>(); + String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text(); + String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text(); + primaryOutcome.put("firstOutcome",firstOutcome); + primaryOutcome.put("secondOutcome",secondOutcome); + + resultData.put("disease",disease); + resultData.put("primaryOutcome",primaryOutcome); + resultData.put("intervention",intervention); + resultData.put("country",country); + resultData.put("enrollment",enrollment); + resultData.put("exclusionCriteria",exclusionCriteria); 
+ resultData.put("inclusionCriteria",inclusionCriteria); + resultData.put("studyDesign",studyDesign); + resultData.put("sponsor",sponsor); + resultData.put("title",title); + resultData.put("registNum",registNum); + resultData.put("registTime",registTime); + resultData.put("studyType",studyType); + resultData.put("phase",phase); + resultData.put("registStatus",""); + resultData.put("registTitle",""); + resultData.put("fullTitle",""); + resultData.put("sponsorPart",""); + resultData.put("studyObjective",""); + resultData.put("studyStartDate",""); + resultData.put("currentStatus",""); + resultData.put("tagTime",""); + resultData.put("crawlTime",getCurrentTime()); + resultData.put("crawlUrl",url); + resultData.put("postTime",registTime); + resultData.put("content","content"); + resultData.put("forwardcontent","forwardcontent"); + resultData.put("cid","Nctrinicin"); + return resultData; + } + public static String getCurrentTime() { + // 创建 DateTimeFormatter,指定输出格式 + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + // 获取当前时间 + LocalDateTime now = LocalDateTime.now(); + // 格式化 + return now.format(formatter); + } + public static String extractAndConvertDate(String input) { + // 定义正则表达式提取 dd/MM/yyyy 格式的日期 + Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]"); + Matcher matcher = pattern.matcher(input); + + if (matcher.find()) { + String dateStr = matcher.group(1); // 提取的日期字符串 + try { + // 解析成 Date 对象 + SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy"); + Date date = inputFormat.parse(dateStr); + + // 格式化为 yyyy:MM:dd 00:00:00 + SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'"); + return outputFormat.format(date); + + } catch (ParseException e) { + e.printStackTrace(); + } + } + + return null; // 如果未匹配或转换失败 + } +} \ No newline at end of file diff --git a/src/main/java/com/example/CtriScraperContent.java b/src/main/java/com/example/CtriScraperContent.java new 
file mode 100644 index 0000000..9840b40 --- /dev/null +++ b/src/main/java/com/example/CtriScraperContent.java @@ -0,0 +1,121 @@ +package com.example; + +import okhttp3.*; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.IOException; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class CtriScraperContent { + public static void main(String[] args) throws IOException { + Map resultData = new HashMap<>(); + String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName="; + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("text/plain"); + RequestBody body = RequestBody.create(mediaType, ""); + Request request = new Request.Builder() + .url(url) + .get() + .build(); + Response response = client.newCall(request).execute(); + String html = response.body().string(); + Document parse = Jsoup.parse(html); + String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text(); + String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text(); + String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text()); + Map sponsor = new HashMap<>(); + String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text(); + String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text(); + 
sponsor.put("Source of Monetary or Material Support",SMMS); + sponsor.put("Primary Sponsor",primarySponsor); + String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text(); + String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text(); + Map disease = new HashMap<>(); + String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text(); + String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text(); + disease.put("healthType",healthType); + disease.put("condition",condition); + String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text(); + String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text(); + String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text(); + String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text(); + String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text(); + String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text(); + Map primaryOutcome = new HashMap<>(); + String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > 
td:nth-child(2) > table > tbody").text(); + String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text(); + primaryOutcome.put("firstOutcome",firstOutcome); + primaryOutcome.put("secondOutcome",secondOutcome); + + resultData.put("disease",disease); + resultData.put("primaryOutcome",primaryOutcome); + resultData.put("intervention",intervention); + resultData.put("country",country); + resultData.put("enrollment",enrollment); + resultData.put("exclusionCriteria",exclusionCriteria); + resultData.put("inclusionCriteria",inclusionCriteria); + resultData.put("studyDesign",studyDesign); + resultData.put("sponsor",sponsor); + resultData.put("title",title); + resultData.put("registNum",registNum); + resultData.put("registTime",registTime); + resultData.put("studyType",studyType); + resultData.put("phase",phase); + resultData.put("registStatus",""); + resultData.put("registTitle",""); + resultData.put("fullTitle",""); + resultData.put("sponsorPart",""); + resultData.put("studyObjective",""); + resultData.put("studyStartDate",""); + resultData.put("currentStatus",""); + resultData.put("tagTime",""); + resultData.put("crawlTime",getCurrentTime()); + resultData.put("crawlUrl",url); + resultData.put("postTime",registTime); + resultData.put("content","content"); + resultData.put("forwardcontent","forwardcontent"); + + System.out.println(resultData); + } + public static String getCurrentTime() { + // 创建 DateTimeFormatter,指定输出格式 + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + // 获取当前时间 + LocalDateTime now = LocalDateTime.now(); + // 格式化 + return now.format(formatter); + } + public static String extractAndConvertDate(String input) { + // 定义正则表达式提取 dd/MM/yyyy 格式的日期 + Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]"); + Matcher matcher = pattern.matcher(input); + + if (matcher.find()) { + String dateStr = 
matcher.group(1); // 提取的日期字符串 + try { + // 解析成 Date 对象 + SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy"); + Date date = inputFormat.parse(dateStr); + + // 格式化为 yyyy:MM:dd 00:00:00 + SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'"); + return outputFormat.format(date); + + } catch (ParseException e) { + e.printStackTrace(); + } + } + + return null; // 如果未匹配或转换失败 + } +} diff --git a/src/main/java/com/example/Inka.java b/src/main/java/com/example/Inka.java new file mode 100644 index 0000000..c60f147 --- /dev/null +++ b/src/main/java/com/example/Inka.java @@ -0,0 +1,113 @@ +package com.example; + +import com.fasterxml.jackson.databind.ObjectMapper; +import okhttp3.*; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class Inka { +// private static final String TOPIC_NAME = "patentTopic"; +// private static final String BOOTSTRAP_SERVERS = "localhost:9092"; +// private static KafkaProducer producer; +// private static ObjectMapper objectMapper = new ObjectMapper(); +// private static final Random random = new Random(); + private static List proxyList = new ArrayList<>(); // 代理池 + private static int currentProxyIndex = 0; // 当前使用的代理索引 +// static { +// Properties props = new Properties(); +// props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS); +// props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); +// props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 
"org.apache.kafka.common.serialization.StringSerializer"); +// props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认 +// props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数 +// producer = new KafkaProducer<>(props); +// try { +// proxyList = Files.readAllLines(Paths.get("proxy.txt")); +// if (proxyList.isEmpty()) { +// System.out.println("警告: proxy.txt 为空,未加载任何代理"); +// } else { +// System.out.println("成功加载 " + proxyList.size() + " 个代理"); +// } +// } catch (IOException e) { +// System.err.println("读取 proxy.txt 失败: " + e.getMessage()); +// } +// } + public static void main(String[] args) throws IOException, InterruptedException { + String load = "javax.faces.partial.ajax=true&javax.faces.source=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225&javax.faces.partial.execute=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225+advancedSearchForm&javax.faces.partial.render=advancedSearchForm+results-container+j_idt1272&advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225=advancedSearchForm%3AadvancedSearchInput%3Aj_idt1225&advancedSearchForm=advancedSearchForm&advancedSearchForm%3AadvancedSearchAssistant=on&advancedSearchForm%3AadvancedSearchInput%3Ainput=rance10&javax.faces.ViewState=-3602994148230912322%3A-6313250694718303467"; + + OkHttpClient client = createClientWithProxy(); + + MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded; charset=UTF-8"); + RequestBody body = RequestBody.create(mediaType, load); + + // 构建请求 + Request request = new Request.Builder() + .url("https://patentscope.wipo.int/search/zh/result.jsf?_vid=P21-M9APK2-00815") // 更新为 Patentscope 的 URL + .method("POST", body) + .addHeader("Accept", "application/xml, text/xml, */*; q=0.01") + .addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8,en;q=0.7") + .addHeader("Cache-Control", "no-cache") + .addHeader("Connection", "keep-alive") + .addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8") +// .addHeader("Cookie", 
"JSESSIONID=F253B7B0920FFACB89354339F51E325C.wapp2nB; ABIW=balancer.cms41; _ga=GA1.1.33840258.1744249893; Hm_lvt_95e64d347633bfd0a2462e25c93606d6=1744249893; Hm_lpvt_95e64d347633bfd0a2462e25c93606d6=1744249893; HMACCOUNT=0388A9D4AC1C33F5; _pk_id.14.ec75=5aa7b2d46edf6083.1744249894.; cebs=1; _ce.clock_data=-923%2C212.87.194.3%2C1%2C33d0f257a817d1ca4c4381b87f8ad83f%2CChrome%2CJP; cebsp_=1; _pk_uid=0%3DNWFhN2IyZDQ2ZWRmNjA4Mw%3D%3D; _gcl_au=1.1.1245117354.1744249928; wipo-visitor-uunid=28f5a645185bc7b; _pk_ref.9.ec75=%5B%22%22%2C%22%22%2C1744249929%2C%22https%3A%2F%2Fwww.wipo.int%2F%22%5D; _pk_id.9.ec75=957af9d7ac871adb.1744249929.; _ga_15TSHJ0HWP=GS1.1.1744249893.1.1.1744250058.58.0.0; _ce.s=v~274adfa655dbaad3ae6a47724ee5bf89d205d10f~lcw~1744250058720~vir~new~lva~1744249893962~vpv~0~v11.cs~411929~v11.s~559ada70-15ae-11f0-a979-459b55a048ba~v11.sla~1744250058728~gtrk.la~m9apg5tj~v11.send~1744250058720~lcw~1744250058728; _pk_id.5.ec75=ab8529a634a38653.1744250080.; wipo_language=zh; _pk_ses.5.ec75=1") + .addHeader("Faces-Request", "partial/ajax") + .addHeader("Host", "patentscope.wipo.int") + .addHeader("Origin", "https://patentscope.wipo.int") + .addHeader("Pragma", "no-cache") + .addHeader("Referer", "https://patentscope.wipo.int/search/zh/result.jsf?_vid=P21-M9APK2-00815") + .addHeader("Sec-Ch-Ua", "\"Google Chrome\";v=\"135\", \"Not-A.Brand\";v=\"8\", \"Chromium\";v=\"135\"") + .addHeader("Sec-Ch-Ua-Mobile", "?0") + .addHeader("Sec-Ch-Ua-Platform", "\"Windows\"") + .addHeader("Sec-Fetch-Dest", "empty") + .addHeader("Sec-Fetch-Mode", "cors") + .addHeader("Sec-Fetch-Site", "same-origin") + .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36") + .addHeader("X-Requested-With", "XMLHttpRequest") + .build(); + + // 执行请求并打印响应 + try (Response response = client.newCall(request).execute()) { + if (response.isSuccessful()) { + System.out.println("Response: " + 
response.body().string()+response.code()); + } else { + System.out.println("Error: " + response.code() + " - " + response.message()); + System.out.println("Response Body: " + response.body().string()); + } + } + } + + private static OkHttpClient createClientWithProxy() { + OkHttpClient.Builder builder = new OkHttpClient().newBuilder() + .connectTimeout(30, TimeUnit.SECONDS) + .readTimeout(30, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS); + + if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) { + String proxy = proxyList.get(currentProxyIndex); + String[] proxyParts = proxy.split(":"); + if (proxyParts.length == 2) { + String proxyHost = proxyParts[0]; + int proxyPort = Integer.parseInt(proxyParts[1]); + builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, + new java.net.InetSocketAddress(proxyHost, proxyPort))); + System.out.println("使用代理: " + proxy); + } + } + return builder.build(); + } +} diff --git a/src/main/java/com/example/NSFAwardCrawler.java b/src/main/java/com/example/NSFAwardCrawler.java new file mode 100644 index 0000000..bc3100e --- /dev/null +++ b/src/main/java/com/example/NSFAwardCrawler.java @@ -0,0 +1,111 @@ +package com.example; + +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.openqa.selenium.support.ui.WebDriverWait; +import org.openqa.selenium.NoSuchElementException; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +public class NSFAwardCrawler { + private static final int PAGE_SIZE = 30; // 每页基准条数 + + public static void main(String[] args) { + // 设置 ChromeDriver 路径 + System.setProperty("webdriver.chrome.driver", + "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe"); + + 
ChromeOptions options = new ChromeOptions(); + WebDriver driver = new ChromeDriver(options); + + try { + String url = "https://www.nsf.gov/awardsearch/simpleSearchResult?queryText=ebola&ActiveAwards=true"; + driver.get(url); + + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10)); + List allAwardIds = new ArrayList<>(); + int pageNumber = 1; + + while (true) { + System.out.println("Processing page " + pageNumber); + + // 等待页面加载完成 + wait.until(ExpectedConditions.presenceOfElementLocated(By.className("listview-item"))); + + // 获取当前页的结果项 + List resultItems = driver.findElements(By.className("listview-item")); + int currentPageSize = resultItems.size(); + System.out.println("Found " + currentPageSize + " items on page " + pageNumber); + + // 如果当前页没有结果,退出 + if (currentPageSize == 0) { + System.out.println("No items found on page " + pageNumber + ", stopping..."); + break; + } + + // 提取当前页的奖项 ID + for (WebElement item : resultItems) { + try { + String awardId = item.getAttribute("id"); + if (awardId != null && !awardId.isEmpty() && !allAwardIds.contains(awardId)) { + allAwardIds.add(awardId); + } + } catch (Exception e) { + System.out.println("Error processing item: " + e.getMessage()); + } + } + + // 判断是否需要分页:如果当前页条数小于 30,认为是最后一页 + if (currentPageSize < PAGE_SIZE) { + System.out.println("Page " + pageNumber + " has less than " + PAGE_SIZE + " items (" + currentPageSize + "), assuming last page, stopping..."); + break; + } + + // 检查下一页按钮 + try { + WebElement nextButton = driver.findElement(By.name("NEXT")); + boolean isEnabled = nextButton.isEnabled(); + System.out.println("Next button enabled: " + isEnabled); + + if (!isEnabled) { + System.out.println("Next button is disabled, stopping..."); + break; + } + + // 点击下一页 + nextButton.click(); + Thread.sleep(2000); // 等待页面加载 + pageNumber++; + } catch (NoSuchElementException e) { + System.out.println("Next button not found, stopping..."); + break; + } catch (Exception e) { + System.out.println("Error 
clicking next button: " + e.getMessage()); + break; + } + } + + // 打印所有结果 + System.out.println("Found " + allAwardIds.size() + " award IDs across all pages:"); + for (int i = 0; i < allAwardIds.size(); i++) { + System.out.println((i + 1) + ". " + allAwardIds.get(i)); + } + + } catch (Exception e) { + System.out.println("An error occurred: " + e.getMessage()); + } finally { + try { + Thread.sleep(2000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + driver.quit(); + } + } +} \ No newline at end of file diff --git a/src/main/java/com/example/PatentscopeSeleniumCrawler.java b/src/main/java/com/example/PatentscopeSeleniumCrawler.java new file mode 100644 index 0000000..4edb842 --- /dev/null +++ b/src/main/java/com/example/PatentscopeSeleniumCrawler.java @@ -0,0 +1,130 @@ +package com.example; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.openqa.selenium.By; +import org.openqa.selenium.Keys; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.openqa.selenium.support.ui.WebDriverWait; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.Duration; +import java.util.Random; + +public class PatentscopeSeleniumCrawler { + private static final Logger LOGGER = LoggerFactory.getLogger(PatentscopeSeleniumCrawler.class); + private static final String SEARCH_URL = "https://patentscope.wipo.int/search/en/search.jsf"; + private static final String SEARCH_INPUT_ID = "simpleSearchForm:fpSearch:input"; + private static final String SEARCH_BUTTON_ID = "simpleSearchForm:fpSearch:j_idt1319"; + private static final Random RANDOM = new Random(); + + public static void main(String[] args) { + // 配置 ChromeDriver + System.setProperty("webdriver.chrome.driver", 
"F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe"); + ChromeOptions options = new ChromeOptions(); + options.addArguments("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"); + options.addArguments("--disable-blink-features=AutomationControlled"); + // 非无头模式,便于调试 + WebDriver driver = null; + + try { + driver = new ChromeDriver(options); + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(15)); + + // Step 1: 访问搜索页面 + LOGGER.info("Navigating to {}", SEARCH_URL); + driver.get(SEARCH_URL); + Thread.sleep(2000 + RANDOM.nextInt(2000)); // 等待页面加载 + + // Step 2: 输入搜索关键词 + LOGGER.info("Entering search query: FP:(fever)"); + WebElement searchInput = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_INPUT_ID))); + searchInput.clear(); + searchInput.sendKeys("FP:(fever)"); + Thread.sleep(500 + RANDOM.nextInt(1000)); // 等待输入生效 + + // Step 3: 触发搜索 + LOGGER.info("Attempting to trigger search..."); + try { + // 方法 1: 点击搜索按钮 + WebElement searchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id(SEARCH_BUTTON_ID))); + LOGGER.info("Clicking search button"); + searchButton.click(); + Thread.sleep(3000 + RANDOM.nextInt(2000)); // 等待 AJAX 和跳转 + } catch (Exception e) { + LOGGER.warn("Button click failed, trying Enter key: {}", e.getMessage()); + // 方法 2: 模拟回车 + searchInput.sendKeys(Keys.ENTER); + Thread.sleep(3000 + RANDOM.nextInt(2000)); + } + + // Step 4: 验证跳转 + String currentUrl = driver.getCurrentUrl(); + LOGGER.info("Current URL: {}", currentUrl); + if (!currentUrl.contains("result.jsf")) { + LOGGER.error("Failed to redirect to result.jsf, trying advanced search..."); + // 尝试高级搜索(备用) + driver.get("https://patentscope.wipo.int/search/en/search.jsf?advancedSearch=true"); + searchInput = 
wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:input"))); + searchInput.clear(); + searchInput.sendKeys("FP:(fever)"); + WebElement advSearchButton = wait.until(ExpectedConditions.elementToBeClickable(By.id("advancedSearchForm:advancedSearchInput:j_idt1208"))); + advSearchButton.click(); + Thread.sleep(3000 + RANDOM.nextInt(2000)); + currentUrl = driver.getCurrentUrl(); + LOGGER.info("Advanced search URL: {}", currentUrl); + } + + // Step 5: 解析结果页面 + if (currentUrl.contains("result.jsf")) { + LOGGER.info("Successfully reached result page"); + while (true) { + Document doc = Jsoup.parse(driver.getPageSource()); + Elements results = doc.select("div.result-row"); // 需确认选择器 + if (results.isEmpty()) { + LOGGER.warn("No results found, verify selector or query"); + } + + for (Element item : results) { + String title = item.select("a.result-title__text").text(); // 需确认 + String patentId = item.select("div.result__number").text(); // 需确认 + LOGGER.info("Title: {}", title.isEmpty() ? "N/A" : title); + LOGGER.info("Patent ID: {}", patentId.isEmpty() ? 
"N/A" : patentId); + } + + // 分页 + WebElement nextPage = driver.findElements(By.cssSelector("a.paginator__button--next:not(.is-disabled)")) + .stream() + .filter(WebElement::isDisplayed) + .findFirst() + .orElse(null); + if (nextPage == null) { + LOGGER.info("No more pages"); + break; + } + + LOGGER.info("Navigating to next page"); + nextPage.click(); + Thread.sleep(3000 + RANDOM.nextInt(2000)); + } + } else { + LOGGER.error("Still not on result page, check query or network"); + } + + } catch (Exception e) { + LOGGER.error("Error during crawling: {}", e.getMessage(), e); + } finally { + if (driver != null) { + driver.quit(); + LOGGER.info("WebDriver closed"); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/com/example/ProxyIPChecker.java b/src/main/java/com/example/ProxyIPChecker.java new file mode 100644 index 0000000..8d027f2 --- /dev/null +++ b/src/main/java/com/example/ProxyIPChecker.java @@ -0,0 +1,25 @@ +package com.example; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; + +public class ProxyIPChecker { + public static void main(String[] args) throws Exception { + URL url = new URL("http://httpbin.org/ip"); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("GET"); + + BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8")); + String inputLine; + StringBuilder response = new StringBuilder(); + + while ((inputLine = in.readLine()) != null) { + response.append(inputLine); + } + in.close(); + + System.out.println("当前公网 IP 信息:"); + System.out.println(response.toString()); + } +} diff --git a/src/main/java/com/example/ScraperWithCaptcha.java b/src/main/java/com/example/ScraperWithCaptcha.java new file mode 100644 index 0000000..f171732 --- /dev/null +++ b/src/main/java/com/example/ScraperWithCaptcha.java @@ -0,0 +1,496 @@ +package com.example;// 修改为你的包名 + +import 
java.awt.image.BufferedImage; +import java.io.*; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import javax.imageio.ImageIO; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import net.sourceforge.tess4j.Tesseract; +import net.sourceforge.tess4j.TesseractException; + +public class ScraperWithCaptcha { + + // --- 需要根据目标网站修改的常量 --- + private static final String BASE_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php"; // *** 替换为目标网站包含表单和验证码的页面 URL *** + private static final String FORM_SUBMIT_URL = BASE_URL; // *** 表单提交的 URL,通常是页面本身或 action 属性指定的 URL *** + private static final String CAPTCHA_IMAGE_SRC_SUBSTRING = "captchasecurityimages.php"; // *** 验证码图片 src 中特有的字符串 *** + private static final String CAPTCHA_INPUT_SELECTOR = "input[name=T4]"; + private static final String TARGET_FORM_SELECTOR = "form"; // *** 如果页面有多个表单,指定目标表单的选择器,例如 "#myFormId" *** + + // --- 图像预处理相关的阈值,需要根据验证码样式调试 --- + private static final int BINARY_THRESHOLD = 128; // 二值化阈值 (0-255) + + // --- Tesseract 配置 (根据你的安装修改) --- + // Tesseract tessdata 文件夹的路径 + private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata"; // *** 请务必修改为你的实际路径 *** + + // --- 其他通用配置 --- + private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"; + private Set cookies = new HashSet<>(); // 存储 cookies + + public static void main(String[] args) { + ScraperWithCaptcha scraper = new ScraperWithCaptcha(); + try { + // 1. 
获取包含表单和验证码的页面 + PageInfo pageInfo = scraper.fetchPage(BASE_URL, null, null, false); // 第一次 GET 不需要 Cookies 和 POST Data, 也不是 AJAX + + if (pageInfo.htmlContent == null || pageInfo.statusCode != HttpURLConnection.HTTP_OK) { + System.err.println("Failed to fetch the initial page. Status code: " + pageInfo.statusCode); + return; + } + + // 解析页面提取验证码信息和所有表单字段 + Document doc = Jsoup.parse(pageInfo.htmlContent, BASE_URL); + + // 提取验证码图片 URL + Element captchaImg = doc.selectFirst("img[src*=" + CAPTCHA_IMAGE_SRC_SUBSTRING + "]"); + String captchaImageUrl = null; + if (captchaImg != null) { + captchaImageUrl = captchaImg.absUrl("src"); // 获取绝对 URL + System.out.println("Found CAPTCHA image URL: " + captchaImageUrl); + } else { + System.err.println("CAPTCHA image not found using selector: img[src*=" + CAPTCHA_IMAGE_SRC_SUBSTRING + "]"); + // 如果找不到验证码,可能无法继续 + return; + } + + // 提取验证码输入框的 name + Element captchaInput = doc.selectFirst(CAPTCHA_INPUT_SELECTOR); + String captchaInputName = null; + if (captchaInput != null) { + captchaInputName = captchaInput.attr("name"); + System.out.println("Found CAPTCHA input field name: " + captchaInputName); + } else { + System.err.println("CAPTCHA input field not found using selector: " + CAPTCHA_INPUT_SELECTOR); + // 如果找不到输入框,也无法提交 + return; + } + + // 2. 下载验证码图片 + BufferedImage originalCaptchaImage = scraper.downloadImage(captchaImageUrl); + System.out.println("Captcha image downloaded."); + + // 3. 预处理图片 + BufferedImage preprocessedImage = scraper.preprocessImage(originalCaptchaImage); + System.out.println("Image preprocessed (saved as preprocessed_captcha.png)."); + + // 4. 识别验证码 + String captchaCode = scraper.recognizeCaptcha(preprocessedImage); + + if (captchaCode != null && !captchaCode.isEmpty()) { + System.out.println("Recognized CAPTCHA: " + captchaCode); + + // 5. 
构建包含验证码的 POST 数据 + // 从页面表单中提取所有字段,并设置其值 + Map formData = scraper.buildFormDataMap(doc, captchaInputName, captchaCode); + + String postData = scraper.buildPostData(formData); + System.out.println("Built POST data: " + postData); + + // 6. 提交表单 + // 通常是标准的 POST 请求 + PageInfo postResponseInfo = scraper.fetchPage(FORM_SUBMIT_URL, postData, scraper.getCookieHeader(), false); // 非 AJAX POST + + System.out.println("Form submitted. Response status code: " + postResponseInfo.statusCode); + System.out.println("POST Response Body (partial): " + (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.length() > 500 ? postResponseInfo.htmlContent.substring(0, 500) + "..." : postResponseInfo.htmlContent)); // 打印部分响应查看 + + // 7. 检查响应判断是否成功 + // 对于标准表单提交,成功通常是重定向 (302) 或返回新的页面 + if (postResponseInfo.statusCode == HttpURLConnection.HTTP_MOVED_TEMP || postResponseInfo.statusCode == HttpURLConnection.HTTP_SEE_OTHER || postResponseInfo.statusCode == HttpURLConnection.HTTP_MOVED_PERM) { + String redirectUrl = postResponseInfo.redirectUrl; + System.out.println("POST resulted in redirect. Location: " + redirectUrl); + // TODO: 如果重定向到成功页面,可以继续爬取该页面 + // 如果重定向回原页面或错误页,说明提交失败 (验证码错误或其他原因) + if (redirectUrl != null && redirectUrl.equals(BASE_URL)) { // <-- 检查是否重定向回原页面,需根据实际情况判断 + System.err.println("Submission failed, redirected back to the form page."); + // TODO: 实现重试逻辑 (需要重新获取页面和验证码) + } + + } else if (postResponseInfo.statusCode == HttpURLConnection.HTTP_OK) { + System.out.println("POST returned OK (200). 
Analyzing response content..."); + // TODO: 解析 postResponseInfo.htmlContent 来判断是否成功(例如查找成功标志,或检查是否有验证码错误提示) + if (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.contains("成功标志字符串")) { // <-- *** 根据实际成功响应的特征修改 *** + System.out.println("Form submission appears successful based on content."); + // TODO: 从 postResponseInfo.htmlContent 中提取你想要的数据 + } else if (postResponseInfo.htmlContent != null && postResponseInfo.htmlContent.contains("验证码错误提示字符串")) { // <-- *** 根据实际验证码错误提示修改 *** + System.err.println("CAPTCHA appears incorrect. Need to retry."); + // TODO: 实现重试逻辑 (可能需要重新获取页面,因为验证码会刷新) + } else { + System.out.println("POST returned 200, but content not clearly indicating success or failure."); + // 需要更详细地检查响应内容 + } + } + else { + System.err.println("POST request failed with status code: " + postResponseInfo.statusCode); + } + + + } else { + System.err.println("CAPTCHA recognition failed. Cannot submit form."); + // TODO: 实现识别失败的重试逻辑 + } + + + } catch (IOException e) { + e.printStackTrace(); + System.err.println("An I/O error occurred: " + e.getMessage()); + } catch (TesseractException e) { + e.printStackTrace(); + System.err.println("A Tesseract OCR error occurred: " + e.getMessage()); + } catch (Exception e) { + e.printStackTrace(); + System.err.println("An unexpected error occurred: " + e.getMessage()); + } + } + + /** + * 发起 HTTP 请求 (GET 或 POST),获取页面内容和 Cookies。 + * + * @param urlString 请求 URL + * @param postData POST 请求体数据 (GET 请求时为 null) + * @param cookieHeader 请求头中的 Cookie 值 (第一次请求时为 null) + * @param isAjaxPost 是否是 AJAX POST 请求 (影响请求头设置) + * @return PageInfo 对象,包含响应信息和内容 + * @throws IOException + */ + private PageInfo fetchPage(String urlString, String postData, String cookieHeader, boolean isAjaxPost) throws IOException { + URL url = new URL(urlString); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + + if (postData != null) { + conn.setRequestMethod("POST"); + conn.setDoOutput(true); // 允许写入 POST 数据 + } else { + 
conn.setRequestMethod("GET"); + } + + conn.setInstanceFollowRedirects(false); + conn.setConnectTimeout(10000); + conn.setReadTimeout(20000); + + // 设置请求头 (不包括 Cookie,Cookie 在后面统一处理) + conn.setRequestProperty("User-Agent", USER_AGENT); + if (cookieHeader != null) { + conn.setRequestProperty("Cookie", cookieHeader); + } + conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); + if (postData != null) { + conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + if(isAjaxPost) { + conn.setRequestProperty("X-Requested-With", "XMLHttpRequest"); + conn.setRequestProperty("X-MicrosoftAjax", "Delta=true"); + } + try { + conn.setRequestProperty("Referer", new URL(urlString).getProtocol() + "://" + new URL(urlString).getHost() + new URL(urlString).getPath()); + conn.setRequestProperty("Origin", new URL(urlString).getProtocol() + "://" + new URL(urlString).getHost()); + } catch (Exception e) { } + } + + // --- 写入 POST 数据 (如果是 POST 请求) --- + // 这一块必须在读取响应之前 + if (postData != null) { + try (OutputStream os = conn.getOutputStream()) { // 获取输出流,会触发连接 + byte[] input = postData.getBytes(StandardCharsets.UTF_8); + os.write(input, 0, input.length); + } // os.close() 在 try-with-resources 结束时自动调用,数据在这里被发送 + } + // --- End POST Data --- + + + // --- 现在可以获取响应信息了 --- + // 调用 getResponseCode() 会发送完整的请求 (包括头和体) 并接收响应头 + int statusCode = conn.getResponseCode(); + String redirectUrl = null; + if (statusCode == HttpURLConnection.HTTP_MOVED_TEMP || statusCode == HttpURLConnection.HTTP_SEE_OTHER || statusCode == HttpURLConnection.HTTP_MOVED_PERM) { + redirectUrl = conn.getHeaderField("Location"); + } + + // --- 处理 Cookies (从响应头读取) --- + // 这一块现在在获取响应码之后执行 + Map> headerFields = conn.getHeaderFields(); + List cookiesHeader = headerFields.get("Set-Cookie"); + if (cookiesHeader != null) { + for (String cookie : cookiesHeader) { + String cookieValue = cookie.split(";")[0]; + 
this.cookies.add(cookieValue); + } + } + // --- End Cookies --- + + + StringBuilder content = new StringBuilder(); + // 只有当状态码表示成功 (2xx) 或客户端错误 (4xx) 且有响应体时才读取 + if (statusCode >= 200 && statusCode < 300 || statusCode >= 400 && statusCode < 500 && conn.getContentLength() > 0) { + try (InputStream is = (statusCode >= 200 && statusCode < 300) ? conn.getInputStream() : conn.getErrorStream(); + BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + content.append(line).append("\n"); + } + } catch (IOException e) { + System.err.println("Error reading response body for status " + statusCode + ": " + e.getMessage()); + } + } + + conn.disconnect(); + + PageInfo pageInfo = new PageInfo(); + pageInfo.statusCode = statusCode; + pageInfo.redirectUrl = redirectUrl; + pageInfo.htmlContent = content.toString(); + + return pageInfo; + } + + /** + * 从页面表单中提取所有字段,并设置验证码字段的值 + * @param doc Jsoup 解析后的 Document 对象 + * @param captchaInputName 验证码输入框的 name 属性值 + * @param captchaCode 识别出的验证码字符串 + * @return 包含所有表单字段名称和值的 Map + */ + private Map buildFormDataMap(Document doc, String captchaInputName, String captchaCode) { + Map formData = new HashMap<>(); + Element form = doc.selectFirst(TARGET_FORM_SELECTOR); // 找到目标表单 + + if (form == null) { + System.err.println("Target form not found using selector: " + TARGET_FORM_SELECTOR); + return formData; // 返回空 Map + } + + Elements formElements = form.select("input, select, textarea"); // 查找表单内的所有输入元素 + + for (Element element : formElements) { + String name = element.attr("name"); + String type = element.attr("type"); // 获取 input 的类型 + String value = element.attr("value"); // 获取默认 value + + if (name == null || name.isEmpty()) { + continue; // 忽略没有 name 属性的元素 + } + + // 处理不同类型的输入元素 + if ("text".equals(type) || "hidden".equals(type) || "password".equals(type)) { + if (name.equals(captchaInputName)) { + // 这是验证码输入框,填入识别结果 + formData.put(name, 
captchaCode); + } else { + // 其他文本/隐藏字段,使用默认值或留空,取决于需求 + formData.put(name, value != null ? value : ""); // 通常爬取时这些是空的 + } + } else if ("checkbox".equals(type)) { + // 复选框,如果被勾选则添加到 formData + if (element.hasAttr("checked")) { + formData.put(name, value != null ? value : "on"); // 复选框的值通常是 "on" 或 value 属性的值 + } + } else if ("radio".equals(type)) { + // 单选按钮,如果被选中则添加到 formData + if (element.hasAttr("checked")) { + formData.put(name, value != null ? value : "on"); // 单选按钮的值通常是 value 属性的值 + } + } else if ("select".equals(element.tagName().toLowerCase())) { + // 下拉列表,找到被选中的 option 的值 + Element selectedOption = element.selectFirst("option[selected]"); + if (selectedOption != null) { + formData.put(name, selectedOption.attr("value")); + } else { + // 如果没有选中的项,可能需要根据网站逻辑选择第一个或默认项 + // 或者如果网站要求必须有值,这里需要更复杂的处理 + Element firstOption = element.selectFirst("option"); + if (firstOption != null) { + formData.put(name, firstOption.attr("value")); + } else { + formData.put(name, ""); // 没有选项,留空 + } + } + } else if ("textarea".equals(element.tagName().toLowerCase())) { + // 文本域,获取其文本内容 + formData.put(name, element.text()); + } + // TODO: 根据需要处理其他类型的 input,如 file, submit, image, reset 等 + // 注意:submit, image 类型的 input 通常只有在它们被点击时才会被包含在表单提交数据中,并且它们的值是按钮的值 + } + + // TODO: 如果网站通过 JavaScript 动态添加或修改了表单字段,你需要找到这些字段并手动添加到 formData 中。 + // TODO: 有些表单提交按钮本身会作为 POST 数据的一部分被发送(例如 name="submitButton" value="提交") + // 你可能需要确定哪个按钮触发了提交,并将它的 name=value 对添加到 formData 中。 + + return formData; + } + + + /** + * 下载验证码图片 (Java 8 兼容版本) + * @param imageUrl 图片的完整 URL + * @return 图片的 BufferedImage 对象 + * @throws IOException 如果下载失败 + */ + public BufferedImage downloadImage(String imageUrl) throws IOException { + URL url = new URL(imageUrl); + HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + conn.setRequestMethod("GET"); + conn.setRequestProperty("User-Agent", USER_AGENT); + // 下载图片时通常也需要带上 cookies,确保会话一致性 + conn.setRequestProperty("Cookie", getCookieHeader()); + + + int responseCode = 
conn.getResponseCode(); + if (responseCode == HttpURLConnection.HTTP_OK) { + try (InputStream is = conn.getInputStream()) { + // --- 兼容 Java 8 及更早版本读取 InputStream --- + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[4096]; // 缓冲区大小 + int bytesRead; + while ((bytesRead = is.read(buffer)) != -1) { + baos.write(buffer, 0, bytesRead); + } + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + // --- End 兼容代码 --- + + BufferedImage image = ImageIO.read(bais); + if (image == null) { + throw new IOException("Failed to read image stream. Check image format or content for URL: " + imageUrl); + } + return image; + } + } else { + throw new IOException("Failed to download image. HTTP error code: " + responseCode + " for URL: " + imageUrl); + } + } + + /** + * 对验证码图片进行预处理 (基础示例:转灰度+二值化) + * 这是最关键的部分,需要根据验证码样式调整 + * @param originalImage 原始图片 + * @return 预处理后的图片 + */ + public BufferedImage preprocessImage(BufferedImage originalImage) { + // TODO: 这是图像预处理的重点,需要根据实际验证码样式进行调整和优化 + // 保存原始图片方便对比 + try { + File originalFile = new File("original_captcha.png"); + ImageIO.write(originalImage, "png", originalFile); + } catch (IOException e) { + e.printStackTrace(); + } + + // 基础处理:转灰度 -> 二值化 + int width = originalImage.getWidth(); + int height = originalImage.getHeight(); + BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY); + grayImage.getGraphics().drawImage(originalImage, 0, 0, null); + + BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY); + // 二值化阈值,需要调整 (0-255) + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + int gray = grayImage.getRaster().getSample(x, y, 0); + if (gray < BINARY_THRESHOLD) { + binaryImage.getRaster().setSample(x, y, 0, 0); // 黑色 + } else { + binaryImage.getRaster().setSample(x, y, 0, 1); // 白色 + } + } + } + + // TODO: 更高级的预处理包括:去噪点、去干扰线、字符分割、倾斜校正等 + // 如果验证码只有数字,可以尝试裁剪掉图片上下左右的空白或干扰区域 + + // 
为了调试,将预处理后的图片保存下来查看效果 + try { + File outputfile = new File("preprocessed_captcha.png"); + ImageIO.write(binaryImage, "png", outputfile); + System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath()); + } catch (IOException e) { + e.printStackTrace(); + } + + return binaryImage; // 返回预处理后的图片 + } + + /** + * 使用 Tess4J 识别图片中的文字 + * @param image 待识别的图片 (最好是预处理后的) + * @return 识别出的字符串 (如果失败返回 null 或空字符串) + */ + public String recognizeCaptcha(BufferedImage image) throws TesseractException { + Tesseract tesseract = new Tesseract(); + + // 设置 tessdata 路径 (如果 TESSDATA_PATH 已正确设置且 Tesseract 安装正确,这行可能不是必需的,Tess4J 会自动查找) + if (TESSDATA_PATH != null && !TESSDATA_PATH.isEmpty()) { + tesseract.setDatapath(TESSDATA_PATH); + } else { + System.err.println("WARNING: TESSDATA_PATH not set. Tess4J will try to find tessdata automatically."); + } + + tesseract.setLanguage("eng"); // 设置识别语言为英文 (通常包含数字) + // 如果验证码只有数字,可以尝试设置仅识别数字,这有助于提高准确率 + // tesseract.setTessVariable("tessedit_char_whitelist", "0123456789"); // 方法名请查阅 Tess4J 文档确认 + + String result = tesseract.doOCR(image); + // 清理识别结果,去除空格或换行符等 + result = result != null ? result.trim().replaceAll("[^0-9a-zA-Z]", "") : ""; // 根据验证码内容(数字、字母)调整清理规则 + + return result; + } + + /** + * 构建用于 POST 提交的表单数据字符串 + * @param formDataMap 包含所有表单字段名称和值的 Map + * @return URL 编码后的表单数据字符串 + * @throws IOException + */ + private String buildPostData(Map formDataMap) throws IOException { + StringBuilder postDataBuilder = new StringBuilder(); + boolean first = true; + // 遍历 Map 构建 POST 数据。如果需要特定顺序,使用 LinkedHashMap + for (Map.Entry entry : formDataMap.entrySet()) { + if (!first) { + postDataBuilder.append("&"); + } + postDataBuilder.append(URLEncoder.encode(entry.getKey(), StandardCharsets.UTF_8.name())) + .append("=") + .append(URLEncoder.encode(entry.getValue() != null ? 
/**
 * Demo that pulls {@code postTime}, {@code title}, {@code content} and
 * {@code fileList} out of a flat {@code key:value,key:value} record and prints
 * them.
 */
public class StringFieldExtractor {

    /** Marker that opens the bracketed file list inside the record. */
    private static final String FILE_LIST_PREFIX = "fileList:[";

    public static void main(String[] args) {
        // Input record
        String input = "postTime:05-06-2024 00:00:00,title:PT/013/2024,content:澳門大學-N21科研大樓六樓智慧城市物聯網國家重點實驗室(澳門大學)建造工程 OBRAS DE CONSTRUÇÃO DO LABORATÓRIO DE REFERÊNCIA DO ESTADO DE INTERNET DAS COISAS PARA A CIDADE INTELIGENTE (UNIVERSIDADE DE MACAU), LOCALIZADO NO 6.º ANDAR DO EDIFÍCIO DE INVESTIGAÇÃO CIENTÍFICA N21 DA UNIVERSIDADE DE MACAU,fileList:[https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-CHI.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/06/招標文件電子檔cover-ENG-1.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/07/開標結果.pdf###pdf, https://pct.admo.um.edu.mo/wp-content/uploads/2024/11/判給結果-N21-6G.pdf###pdf]";

        try {
            String postTime = null;
            String title = null;
            String content = null;
            List<String> fileList = new ArrayList<>();

            // Step 1: cut the bracketed file list off first, because its
            // commas and colons would confuse the key/value split below.
            int fileListStart = input.indexOf(FILE_LIST_PREFIX);
            int fileListEnd = input.lastIndexOf(']');
            if (fileListStart != -1 && fileListEnd > fileListStart) {
                String listContent =
                        input.substring(fileListStart + FILE_LIST_PREFIX.length(), fileListEnd).trim();
                fileList = splitFileList(listContent);
                input = dropTrailingComma(input.substring(0, fileListStart));
            }

            // Step 2: split the remaining fields on commas that are directly
            // followed by "key:". The limit of 3 keeps any such comma inside
            // the third field (content) intact.
            for (String field : input.split(",(?=\\w+:)", 3)) {
                String[] keyValue = field.split(":", 2);
                if (keyValue.length != 2) {
                    continue;
                }
                String value = keyValue[1].trim();
                switch (keyValue[0].trim()) {
                    case "postTime":
                        postTime = value;
                        break;
                    case "title":
                        title = value;
                        break;
                    case "content":
                        content = value;
                        break;
                }
            }

            // Print the result
            System.out.println("postTime: " + postTime);
            System.out.println("title: " + title);
            System.out.println("content: " + content);
            System.out.println("fileList: " + fileList);

        } catch (Exception e) {
            System.err.println("Parsing error: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Splits the inside of the brackets into entries; only a comma followed by a new URL separates entries. */
    private static List<String> splitFileList(String listContent) {
        List<String> entries = new ArrayList<>();
        if (listContent.isEmpty()) {
            return entries;
        }
        for (String entry : listContent.split(",\\s*(?=https?://)")) {
            entries.add(entry.trim());
        }
        return entries;
    }

    /** Trims the text and removes the single separator comma left in front of the file list, if any. */
    private static String dropTrailingComma(String text) {
        String trimmed = text.trim();
        return trimmed.endsWith(",") ? trimmed.substring(0, trimmed.length() - 1) : trimmed;
    }
}
+import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; + +import java.util.List; + +public class WipoPatentsSelenium { + + public static void main(String[] args) throws InterruptedException { + // 自动管理驱动 + WebDriverManager.chromedriver().setup(); + WebDriver driver = new ChromeDriver(); + + try { + driver.get("https://patentscope.wipo.int/search/en/result.jsf?query=FP:(AI)"); + + // 等待页面加载(粗略等待) + Thread.sleep(3000); + + int maxPages = 3; + int currentPage = 1; + + while (currentPage <= maxPages) { + System.out.println("📄 当前第 " + currentPage + " 页:"); + + // 找到所有结果项 + List results = driver.findElements(By.cssSelector(".resultitem")); + + for (WebElement result : results) { + String title = result.findElement(By.cssSelector(".resulttitle")).getText(); + String pubNum = result.findElement(By.cssSelector(".pubNumber")).getText(); + System.out.println("🔹 " + pubNum + " - " + title); + } + + // 查找“下一页”按钮,点击 + WebElement nextButton = null; + try { + nextButton = driver.findElement(By.cssSelector("a[title='Next']")); + } catch (Exception e) { + System.out.println("✅ 已到最后一页或按钮未找到"); + break; + } + + if (nextButton != null && nextButton.isDisplayed()) { + nextButton.click(); + currentPage++; + Thread.sleep(3000); // 等待下一页加载 + } else { + break; + } + } + } finally { + driver.quit(); + } + } +} diff --git a/src/main/java/com/example/cliniTopic.java b/src/main/java/com/example/cliniTopic.java new file mode 100644 index 0000000..3142e1f --- /dev/null +++ b/src/main/java/com/example/cliniTopic.java @@ -0,0 +1,594 @@ +package com.example; +import com.fasterxml.jackson.databind.ObjectMapper; +import okhttp3.*; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.StringSerializer; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import 
org.jsoup.select.Elements; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class cliniTopic { + private static final String TOPIC_NAME = "cliniTopic"; + private static final String BOOTSTRAP_SERVERS = "localhost:9092"; + private static KafkaProducer producer; + private static ObjectMapper objectMapper = new ObjectMapper(); + private static final Random random = new Random(); + private static List proxyList = new ArrayList<>(); // 代理池 + private static int currentProxyIndex = 0; // 当前使用的代理索引 + static { + Properties props = new Properties(); + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认 + props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数 + producer = new KafkaProducer<>(props); + try { + proxyList = Files.readAllLines(Paths.get("proxy.txt")); + if (proxyList.isEmpty()) { + System.out.println("警告: proxy.txt 为空,未加载任何代理"); + } else { + System.out.println("成功加载 " + proxyList.size() + " 个代理"); + } + } catch (IOException e) { + System.err.println("读取 proxy.txt 失败: " + e.getMessage()); + } + } + + public 
static void main(String[] args) throws IOException, InterruptedException { + List keywords = Files.readAllLines(Paths.get("keywords.txt")); + List cleanedKeywords = new ArrayList<>(); + for (String keyword : keywords) { + String cleaned = keyword.split(",")[0].trim(); // 取逗号前的部分并去除首尾空格 + cleanedKeywords.add(cleaned); + } + ExecutorService executor = Executors.newFixedThreadPool(4); // 4 个线程 + for (String keyword : cleanedKeywords) { + executor.submit(() -> { + try { + int sleepTime = random.nextInt(1001) + 30000; + for (Integer i=1;i<=7;i++){ + final Integer pageNum = i; + Map list = list(keyword,i); + List urls = (List) list.get("listUrl"); + if (urls.isEmpty()){ + System.out.println("没有关键词"+keyword+"检索结果"); + break; + } + Integer count = Integer.parseInt(String.valueOf(list.get("count"))); + Integer totalPage = Integer.parseInt(String.valueOf(list.get("totalPage"))); + for(String url:urls){ + Map result = content(url); + Thread.sleep(sleepTime); + String registNum = String.valueOf(result.get("registNum")); + String crawlUrl = String.valueOf(result.get("crawlUrl")); + + try { + String jsonValue = objectMapper.writeValueAsString(result); + ProducerRecord record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue); + + producer.send(record, (metadata, exception) -> { + if (exception == null) { + System.out.println("成功发送到Kafka - Partition: " + metadata.partition() + + ", Offset: " + metadata.offset() + ", "+crawlUrl + ", "+ keyword + " , " + pageNum ); + } else { + System.err.println("发送到Kafka失败: " + exception.getMessage()); + } + }); + } catch (Exception e) { + System.err.println("序列化或发送Kafka消息失败: " + e.getMessage()); + } + Thread.sleep(sleepTime); + + } + if(count<10||totalPage==i){ + System.out.println("关键词"+keyword+"已检索完毕"); + break; + } + + } + } catch (Exception e) { + System.err.println("处理 " + keyword + " 失败: " + e.getMessage()); + e.printStackTrace(); + } + }); + } + executor.shutdown(); + executor.awaitTermination(5, TimeUnit.HOURS); + producer.close(); 
+ } + + private static Map list(String keyword,Integer page) throws Exception{ + Map map = new HashMap<>(); + String baseUrl = "https://www.drks.de/search/de"; + String hostUrl = "https://www.drks.de"; + String cleanUrl = "https://www.drks.de/search/de/results"; + System.out.println("Pure URL: " + cleanUrl); + + System.out.println("Page Number: " + page); + + // 存储 cookies + Set cookieSet = new HashSet<>(); + String sessionId = null; + + // 第一步:初始 GET 请求,获取 cookies 和 ViewState + URL initialUrl = new URL(baseUrl); + HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); + initialConn.setRequestMethod("GET"); + initialConn.setInstanceFollowRedirects(false); + initialConn.setConnectTimeout(10000); + initialConn.setReadTimeout(10000); + + // 捕获 cookies + sessionId = updateCookies(initialConn, cookieSet); + System.out.println("Initial Cookies: " + cookieSet); + System.out.println("Initial Session ID: " + sessionId); + + // 读取响应内容以获取 ViewState + BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); + StringBuilder content = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + content.append(inputLine); + } + in.close(); + initialConn.disconnect(); + + // 提取初始 ViewState + String initialViewState = extractViewState(content.toString()); + System.out.println("Initial ViewState: " + initialViewState); + + // 第二步:发送搜索 POST 请求 + HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection(); + searchConn.setRequestMethod("POST"); + searchConn.setInstanceFollowRedirects(false); + searchConn.setDoOutput(true); + searchConn.setConnectTimeout(10000); + searchConn.setReadTimeout(10000); + + // 设置搜索请求的请求头 + searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + searchConn.setRequestProperty("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + searchConn.setRequestProperty("Origin", "https://www.drks.de"); + searchConn.setRequestProperty("Referer", baseUrl); + searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + // 构建搜索请求的 POST 数据 + String searchPostData = buildSearchPostData(initialViewState,keyword); + + // 发送搜索 POST 请求 + try (OutputStream os = searchConn.getOutputStream()) { + byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8); + os.write(input, 0, input.length); + } + + // 更新 cookies + String searchSessionId = updateCookies(searchConn, cookieSet); + System.out.println("Search Cookies: " + cookieSet); + System.out.println("Search Session ID: " + searchSessionId); + + // 处理搜索响应 + int searchResponseCode = searchConn.getResponseCode(); + System.out.println("Search Response Code: " + searchResponseCode); + String redirectUrl = searchConn.getHeaderField("Location"); + searchConn.disconnect(); + + if (searchResponseCode != 302 || redirectUrl == null) { + System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode); + return null; + } + System.out.println("Redirect URL (raw): " + redirectUrl); + + // 解析相对 URL + if (!redirectUrl.startsWith("http")) { + redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? 
redirectUrl : "/" + redirectUrl); + } + System.out.println("Resolved Redirect URL: " + redirectUrl); + + // 第三步:跟随重定向(使用 GET 请求) + URL resultsUrl = new URL(redirectUrl); + HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection(); + resultsConn.setRequestMethod("GET"); + resultsConn.setInstanceFollowRedirects(false); + resultsConn.setConnectTimeout(10000); + resultsConn.setReadTimeout(10000); + resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64ек; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + // 更新 cookies + String resultsSessionId = updateCookies(resultsConn, cookieSet); + System.out.println("Results Cookies: " + cookieSet); + System.out.println("Results Session ID: " + resultsSessionId); + + // 读取重定向后的结果页面内容 + BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream())); + StringBuilder resultsContent = new StringBuilder(); + while ((inputLine = resultsReader.readLine()) != null) { + resultsContent.append(inputLine); + } + resultsReader.close(); + resultsConn.disconnect(); + + // 提取页面中的 ViewState(状态信息,用于后续请求) + String viewState = extractViewState(resultsContent.toString()); + System.out.println("Results ViewState: " + viewState); + + // 检查 Session ID 是否一致,确保会话未被重置 + if (sessionId != null && !sessionId.equals(resultsSessionId)) { + System.out.println("Warning: Session ID changed. 
Initial: " + sessionId + ", Results: " + resultsSessionId); + } + + // Step 4: 第四步:发送分页请求(使用 POST) + HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection(); + postConn.setRequestMethod("POST"); + postConn.setInstanceFollowRedirects(false); + postConn.setDoOutput(true); + postConn.setConnectTimeout(10000); + postConn.setReadTimeout(10000); + + // 设置分页请求的请求头(非 AJAX,模拟浏览器常规请求) + postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + postConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + postConn.setRequestProperty("Origin", "https://www.drks.de"); + postConn.setRequestProperty("Referer", cleanUrl); + postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + postConn.setRequestProperty("Sec-Fetch-Dest", "document"); + postConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); + + // 构建分页请求的 POST 参数(包括页码和 ViewState 等) + String postData = buildPostData(viewState, page); + // 发送分页的 POST 请求 + try (OutputStream os = postConn.getOutputStream()) { + byte[] input = postData.getBytes(StandardCharsets.UTF_8); + os.write(input, 0, input.length); + } + + // 更新 cookies(分页响应可能返回新的 Set-Cookie) + String paginationSessionId = updateCookies(postConn, cookieSet); + System.out.println("Pagination Cookies: " + cookieSet); + System.out.println("Pagination Session ID: " + paginationSessionId); + + // 处理分页响应 + int responseCode = postConn.getResponseCode(); + System.out.println("Pagination Response Code: " + responseCode); + + // 读取分页响应的 HTML 内容 + StringBuilder postContent = new StringBuilder(); + try (BufferedReader postReader = new BufferedReader( + new InputStreamReader( + responseCode >= 400 ? 
postConn.getErrorStream() : postConn.getInputStream()))) { + while ((inputLine = postReader.readLine()) != null) { + postContent.append(inputLine); + } + } + Document parse = null; + if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP + || responseCode == HttpURLConnection.HTTP_MOVED_PERM + || responseCode == HttpURLConnection.HTTP_SEE_OTHER) { + String newUrl = postConn.getHeaderField("Location"); + System.out.println("Pagination Redirecting to: " + newUrl); + + // 解析重定向中的相对地址为完整 URL(如果是相对路径) + if (!newUrl.startsWith("http")) { + newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl); + } + + // 重定向 + URL redirectConn = new URL(newUrl); + HttpURLConnection followConn = (HttpURLConnection) redirectConn.openConnection(); + followConn.setRequestMethod("GET"); + followConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream())); + StringBuilder redirectContent = new StringBuilder(); + while ((inputLine = redirectReader.readLine()) != null) { + redirectContent.append(inputLine); + } + redirectReader.close(); + followConn.disconnect(); + parse = Jsoup.parse(String.valueOf(redirectContent)); + } else if (responseCode == 200) { + parse = Jsoup.parse(String.valueOf(postContent)); + } + + + + Elements links = parse.select("div[data-label='Titel der Studie'] a"); + List listUrl = new ArrayList(); + Integer count = 0; + for (Element link : links) { + String href = link.attr("href"); + String trueUrl = "https://www.drks.de/"+href; + listUrl.add(trueUrl); + count++; + } + String text = 
parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text(); + // 使用正则表达式提取 "第" 和 "/" 之间的数字 + String regex = "Seite\\s*(\\d+)\\s*/"; + Matcher matcher = Pattern.compile(regex).matcher(text); + if (matcher.find()) { + map.put("totalPage",matcher.group(1));// 返回第一个捕获组,即数字 "1" + } + map.put("listUrl",listUrl); + map.put("count",count); + map.put("keyword",keyword); + postConn.disconnect(); + return map; + } + // 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 + private static String updateCookies(HttpURLConnection conn, Set cookieSet) { + String sessionId = null; + Map> headerFields = conn.getHeaderFields(); + List cookiesHeader = headerFields.get("Set-Cookie"); + if (cookiesHeader != null) { + for (String cookie : cookiesHeader) { + String cookieValue = cookie.split(";")[0]; + cookieSet.add(cookieValue); + if (cookieValue.startsWith("JSESSIONID=") || cookieValue.startsWith("csfcfc=")) { + sessionId = cookieValue; + } + } + } + return sessionId; + } + // 提取 __VIEWSTATE 隐藏字段的值 + private static String extractViewState(String html) { + if (html == null || html.isEmpty()) { + System.err.println("HTML content is empty or null"); + return ""; + } + + // 兼容 jakarta.faces.ViewState 和 javax.faces.ViewState + String regex = "]*name=[\"'](?:jakarta|javax)\\.faces\\.ViewState[\"'][^>]*value=[\"']([^\"']+)[\"']"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(html); + + if (matcher.find()) { + return matcher.group(1); + } + + System.err.println("Failed to extract ViewState from HTML"); + return ""; + } + + private static Map content(String url)throws Exception{ + + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("application/json"); + Request request = new Request.Builder() + .url(url) + .get() + .addHeader("Content-Type", "application/json") + .build(); + Response response = client.newCall(request).execute(); + String html = response.body().string(); + Document parse = Jsoup.parse(html, "UTF-8"); + 
String title = parse.select(".title-bold").text(); + String registNum = parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(2)").text(); + String registTime = convertDate(parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(6)").text()); + Map sponsor = new HashMap<>(); + String header = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-header > h4").text(); + String site = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(2) > div").text(); + String telefon = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(4) > span").text(); + String disease = parse.select("body > main > div.card-body > div:nth-child(6) > div.card-body > div > div:nth-child(2) > dl > dd:nth-child(2) > span").text(); + String studyType = parse.select("body > main > div.card-body > div:nth-child(3) > div.card-body > dl").text(); + String inclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(3) > div > div.card-body > div > div.col-12.mt-3 > dl > dd > span").text(); + String exclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(4) > div > div.card-body > p > span").text(); + String country = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(1) > div > div.card-body > dl > dd:nth-child(2)").text(); + String intervention = parse.select("body > main > div.card-body > div:nth-child(4) > div.card-body > dl").text(); + String primaryOutcome = parse.select("body > main > div.card-body > div:nth-child(5) > div.card-body > div > div > dl").text(); + String enrollment = parse.select("body > main > div.card-body > div:nth-child(7) > 
div.card-body > div:nth-child(2) > div:nth-child(2) > div > div.card-body > div > div:nth-child(5) > dl > dd > span").text(); + sponsor.put("header",header); + sponsor.put("site",site); + sponsor.put("telefon",telefon); + Map resultData = new HashMap<>(); + resultData.put("title", title); + resultData.put("registNum",registNum); + resultData.put("registTime",registTime); + resultData.put("registStatus",""); + resultData.put("registTitle",""); + resultData.put("fullTitle",""); + resultData.put("sponsor",sponsor); + resultData.put("sponsorPart",""); + resultData.put("studyType",studyType); + resultData.put("phase",""); + resultData.put("disease",disease); + resultData.put("studyDesign",""); + resultData.put("studyObjective",""); + resultData.put("studyStartDate",""); + resultData.put("inclusionCriteria",inclusionCriteria); + resultData.put("exclusionCriteria",exclusionCriteria); + resultData.put("currentStatus",""); + resultData.put("enrollment",enrollment); + resultData.put("country",country); + resultData.put("tagTime",""); + resultData.put("intervention",intervention); + resultData.put("primaryOutcome",primaryOutcome); + resultData.put("crawlTime",getCurrentTime()); + resultData.put("crawlUrl",url); + resultData.put("postTime",registTime); + resultData.put("content","content"); + resultData.put("forwardcontent","forwardcontent"); + resultData.put("cid","Ndrks"); + return resultData; + } + // 生成搜索请求的 POST 数据 + private static String buildSearchPostData(String viewState,String keyword) { + try { + return "searchForm=searchForm" + + "&searchForm%3Aj_idt80=" + keyword + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" + + 
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" + + "&searchForm%3Aj_idt287=" + + "&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); + } catch 
(Exception e) { + System.err.println("Error encoding search ViewState: " + e.getMessage()); + return ""; + } + } + // 生成分页请求的 POST 数据 + private static String buildPostData(String viewState, int page) { + int adjustedPage = page - 1; + try { + return "resultForm=resultForm" + + "&resultForm%3Asorting%3ArowsPerPage=10" + + "&resultForm%3ApaginationTop%3Aj_idt156%3A"+ adjustedPage +"%3Aj_idt158=" + page + + "&resultForm%3Asorting%3AsortingBy=SCORE" + + "&resultForm%3Asorting%3Aj_idt141=true" + + "&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation" + + "&selectedType=JSON" + + "&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); + } catch (Exception e) { + System.err.println("Error encoding pagination ViewState: " + e.getMessage()); + return ""; + } + } + public static String convertDate(String inputDate) { + try { + // 输入格式:dd.MM.yyyy + SimpleDateFormat inputFormat = new SimpleDateFormat("dd.MM.yyyy"); + // 解析输入日期 + Date date = inputFormat.parse(inputDate); + // 输出格式:yyyy-MM-dd HH:mm:ss + SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + // 转换为目标格式 + return outputFormat.format(date); + } catch (ParseException e) { + // 处理解析异常 + return "Invalid date format"; + } + } + + public static String getCurrentTime() { + // 创建 DateTimeFormatter,指定输出格式 + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + // 获取当前时间 + LocalDateTime now = LocalDateTime.now(); + // 格式化 + return now.format(formatter); + } + private static Response executeWithRetry(OkHttpClient client, Request request, String keyword) throws IOException { + int maxRetries = proxyList.isEmpty() ? 
1 : proxyList.size(); // 如果没有代理,只尝试一次 + int attempt = 0; + + while (attempt < maxRetries) { + Response response = client.newCall(request).execute(); + if (response.code() == 403) { + System.out.println("收到 403 状态码,尝试切换代理重试..."); + response.close(); + switchProxy(); + client = createClientWithProxy(); // 使用新代理重建客户端 + attempt++; + if (attempt == maxRetries) { + throw new IOException("所有代理尝试失败,仍然收到 403"); + } + continue; + } + return response; // 成功或非 403 状态码,直接返回 + } + throw new IOException("无法执行请求,未获取响应"); + } + private static OkHttpClient createClientWithProxy() { + OkHttpClient.Builder builder = new OkHttpClient().newBuilder() + .connectTimeout(30, TimeUnit.SECONDS) + .readTimeout(30, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS); + + if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) { + String proxy = proxyList.get(currentProxyIndex); + String[] proxyParts = proxy.split(":"); + if (proxyParts.length == 2) { + String proxyHost = proxyParts[0]; + int proxyPort = Integer.parseInt(proxyParts[1]); + builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, + new java.net.InetSocketAddress(proxyHost, proxyPort))); + System.out.println("使用代理: " + proxy); + } + } + return builder.build(); + } + private static synchronized void switchProxy() { + if (proxyList.isEmpty()) return; + currentProxyIndex = (currentProxyIndex + 1) % proxyList.size(); + System.out.println("切换到新代理: " + proxyList.get(currentProxyIndex)); + } + public static String increaseOffsetBy30(String originalPayload) { + // 以 "|" 分割载荷为数组 + String[] parts = originalPayload.split("\\|"); + + // 检查数组长度,确保有足够元素 + if (parts.length < 4) { + throw new IllegalArgumentException("载荷格式无效,元素不足"); + } + + // 找到倒数第 4 个元素的位置 + int targetIndex = parts.length - 4; + + try { + // 将倒数第 4 个数字解析为整数 + int currentOffset = Integer.parseInt(parts[targetIndex]); + // 增加 30 + int newOffset = currentOffset + 30; + // 将新值放回数组 + parts[targetIndex] = String.valueOf(newOffset); + // 重新拼接载荷 + return String.join("|", 
parts); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("倒数第 4 个元素不是有效数字: " + parts[targetIndex]); + } + } +} diff --git a/src/main/java/com/example/drks.java b/src/main/java/com/example/drks.java new file mode 100644 index 0000000..379d7f2 --- /dev/null +++ b/src/main/java/com/example/drks.java @@ -0,0 +1,438 @@ +package com.example; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class drks { + public static void main(String[] args) throws Exception { + String targetUrl = "https://www.drks.de/search/de/results?page=4"; + String baseUrl = "https://www.drks.de/search/de"; + String hostUrl = "https://www.drks.de"; + String cleanUrl = targetUrl.split("\\?")[0]; + System.out.println("Pure URL: " + cleanUrl); + + + String pageNumber = targetUrl.contains("?page=") ? 
targetUrl.split("page=")[1] : "1"; + int page = Integer.parseInt(pageNumber); + System.out.println("Page Number: " + page); + + // 存储 cookies + Set cookieSet = new HashSet<>(); + String sessionId = null; + + // 第一步:初始 GET 请求,获取 cookies 和 ViewState + System.out.println("\n--- Step 1: Initial GET Request ---"); + URL initialUrl = new URL(baseUrl); + HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); + initialConn.setRequestMethod("GET"); + initialConn.setInstanceFollowRedirects(false); + initialConn.setConnectTimeout(10000); + initialConn.setReadTimeout(10000); + initialConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + + // 捕获 cookies + sessionId = updateCookies(initialConn, cookieSet); + System.out.println("Initial Cookies: " + cookieSet); + System.out.println("Initial Session ID: " + sessionId); + + // 读取响应内容以获取 ViewState + BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); + StringBuilder content = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + content.append(inputLine); + } + in.close(); + initialConn.disconnect(); + + // 提取初始 ViewState + String initialViewState = extractViewState(content.toString()); + System.out.println("Initial ViewState: " + initialViewState); + + // 第二步:发送搜索 POST 请求 + System.out.println("\n--- Step 2: Search POST Request ---"); + HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection(); + searchConn.setRequestMethod("POST"); + searchConn.setInstanceFollowRedirects(false); + searchConn.setDoOutput(true); + searchConn.setConnectTimeout(10000); + searchConn.setReadTimeout(10000); + + // 设置搜索请求的请求头 + searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + searchConn.setRequestProperty("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + searchConn.setRequestProperty("Origin", "https://www.drks.de"); + searchConn.setRequestProperty("Referer", baseUrl); + searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + // 构建搜索请求的 POST 数据 + String searchPostData = buildSearchPostData(initialViewState); + System.out.println("Search POST Data: " + searchPostData); + + // 发送搜索 POST 请求 + try (OutputStream os = searchConn.getOutputStream()) { + byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8); + os.write(input, 0, input.length); + } + + // 更新 cookies + String searchSessionId = updateCookies(searchConn, cookieSet); + System.out.println("Search Cookies: " + cookieSet); + System.out.println("Search Session ID: " + searchSessionId); // This is null in your output, which is a potential issue + + // 处理搜索响应 + int searchResponseCode = searchConn.getResponseCode(); + System.out.println("Search Response Code: " + searchResponseCode); + + if (searchResponseCode == 302) { + String redirectUrl = searchConn.getHeaderField("Location"); + searchConn.disconnect(); + + if (redirectUrl == null) { + System.err.println("Search request returned 302 but no Location header found."); + return; + } + System.out.println("Redirect URL (raw): " + redirectUrl); + + // 解析相对 URL + if (!redirectUrl.startsWith("http")) { + redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? 
redirectUrl : "/" + redirectUrl); + } + System.out.println("Resolved Redirect URL: " + redirectUrl); + + // 第三步:跟随重定向(使用 GET 请求) + System.out.println("\n--- Step 3: Follow Redirect (GET Request) ---"); + URL resultsUrl = new URL(redirectUrl); + HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection(); + resultsConn.setRequestMethod("GET"); + resultsConn.setInstanceFollowRedirects(false); + resultsConn.setConnectTimeout(10000); + resultsConn.setReadTimeout(10000); + resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + // 更新 cookies + String resultsSessionId = updateCookies(resultsConn, cookieSet); + System.out.println("Results Cookies: " + cookieSet); + System.out.println("Results Session ID: " + resultsSessionId); + + // 读取重定向后的结果页面内容 + BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream())); + StringBuilder resultsContent = new StringBuilder(); + while ((inputLine = resultsReader.readLine()) != null) { + resultsContent.append(inputLine); + } + resultsReader.close(); + resultsConn.disconnect(); + + // 提取页面中的 ViewState(状态信息,用于后续请求) + String viewState = extractViewState(resultsContent.toString()); + System.out.println("Results ViewState: " + viewState); + + // 检查 Session ID 是否一致,确保会话未被重置 + if (sessionId != null && !sessionId.equals(resultsSessionId)) { + System.out.println("Warning: Session ID changed. 
Initial: " + sessionId + ", Results: " + resultsSessionId); + } + + // Step 4: 第四步:发送分页请求(使用 POST) + System.out.println("\n--- Step 4: Pagination POST Request ---"); + HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection(); + postConn.setRequestMethod("POST"); + postConn.setInstanceFollowRedirects(false); + postConn.setDoOutput(true); + postConn.setConnectTimeout(10000); + postConn.setReadTimeout(10000); + + // 设置分页请求的请求头(非 AJAX,模拟浏览器常规请求) + postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + postConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + postConn.setRequestProperty("Origin", "https://www.drks.de"); + postConn.setRequestProperty("Referer", cleanUrl); + postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + postConn.setRequestProperty("Sec-Fetch-Dest", "document"); + postConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); + + // 构建分页请求的 POST 参数(包括页码和 ViewState 等) + String postData = buildPostData(viewState, page); + System.out.println("Pagination POST Data: " + postData); + + // 发送分页的 POST 请求 + try (OutputStream os = postConn.getOutputStream()) { + byte[] input = postData.getBytes(StandardCharsets.UTF_8); + os.write(input, 0, input.length); + } + + // 更新 cookies(分页响应可能返回新的 Set-Cookie) + String paginationSessionId = updateCookies(postConn, cookieSet); + System.out.println("Pagination Cookies: " + cookieSet); + System.out.println("Pagination Session ID: " + paginationSessionId); + + // 处理分页响应 + int responseCode = postConn.getResponseCode(); + System.out.println("Pagination Response Code: " + responseCode); + + // Read and process the pagination response + StringBuilder postContent = new 
StringBuilder(); + try (BufferedReader postReader = new BufferedReader( + new InputStreamReader( + responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream()))) { + while ((inputLine = postReader.readLine()) != null) { + postContent.append(inputLine); + } + } + + Document parse = null; + if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP + || responseCode == HttpURLConnection.HTTP_MOVED_PERM + || responseCode == HttpURLConnection.HTTP_SEE_OTHER) { + String newUrl = postConn.getHeaderField("Location"); + System.out.println("Pagination Redirecting to: " + newUrl); + + // 解析重定向中的相对地址为完整 URL(如果是相对路径) + if (!newUrl.startsWith("http")) { + newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl); + } + + // Follow the redirect + URL redirectConnUrl = new URL(newUrl); + HttpURLConnection followConn = (HttpURLConnection) redirectConnUrl.openConnection(); + followConn.setRequestMethod("GET"); + followConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream())); + StringBuilder redirectContent = new StringBuilder(); + while ((inputLine = redirectReader.readLine()) != null) { + redirectContent.append(inputLine); + } + redirectReader.close(); + followConn.disconnect(); + + System.out.println("Redirect Response: " + redirectContent); + parse = Jsoup.parse(String.valueOf(redirectContent)); + } else if (responseCode == 200) { + System.out.println("Pagination Response: " + postContent); + parse = Jsoup.parse(String.valueOf(postContent)); + } else { + System.err.println("Unexpected Pagination Response 
Code: " + responseCode); + // Optionally read and print error stream for non-200/3xx codes + try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(postConn.getErrorStream()))) { + String errorLine; + System.err.println("Error Stream:"); + while ((errorLine = errorReader.readLine()) != null) { + System.err.println(errorLine); + } + } catch (Exception e) { + System.err.println("Could not read error stream: " + e.getMessage()); + } + return; // Exit if pagination fails unexpectedly + } + + Elements links = parse.select("div[data-label='Titel der Studie'] a"); + + for (Element link : links) { + String href = link.attr("href"); + String text = link.text(); + + System.out.println("链接: " + href); + System.out.println("标题: " + text); + } + String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text(); + // 使用正则表达式提取 "第" 和 "/" 之间的数字 + String regex = "Seite\\s*(\\d+)\\s*/"; + Matcher matcher = Pattern.compile(regex).matcher(text); + if (matcher.find()) { + System.out.println("总共有"+matcher.group(1));// 返回第一个捕获组,即数字 "1" + } + postConn.disconnect(); + + } else if (searchResponseCode == 200) { + System.out.println("Search request returned 200 OK. Reading response body:"); + // Read and print the response body for debugging + try (BufferedReader searchReader = new BufferedReader(new InputStreamReader(searchConn.getInputStream()))) { + String line; + StringBuilder searchResponseBody = new StringBuilder(); + while ((line = searchReader.readLine()) != null) { + searchResponseBody.append(line).append("\n"); + } + System.out.println("Search Response Body:\n" + searchResponseBody.toString()); + } catch (Exception e) { + System.err.println("Could not read search response body: " + e.getMessage()); + } finally { + searchConn.disconnect(); + } + + System.err.println("Search request did not return expected 302 redirect. 
Response code: " + searchResponseCode); + System.err.println("The website's search mechanism may have changed."); + + } else { + // Handle other unexpected response codes for the search request + System.err.println("Unexpected Search Response Code: " + searchResponseCode); + try (BufferedReader errorReader = new BufferedReader(new InputStreamReader(searchConn.getErrorStream()))) { + String errorLine; + System.err.println("Error Stream:"); + while ((errorLine = errorReader.readLine()) != null) { + System.err.println(errorLine); + } + } catch (Exception e) { + System.err.println("Could not read error stream for search response: " + e.getMessage()); + } + searchConn.disconnect(); + } + } + + // 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 + private static String updateCookies(HttpURLConnection conn, Set cookieSet) { + String sessionId = null; + Map> headerFields = conn.getHeaderFields(); + List cookiesHeader = headerFields.get("Set-Cookie"); + if (cookiesHeader != null) { + for (String cookie : cookiesHeader) { + String cookieValue = cookie.split(";")[0]; + cookieSet.add(cookieValue); + // Prioritize JSESSIONID or csfcfc if present + if (cookieValue.startsWith("JSESSIONID=")) { + sessionId = cookieValue; + } else if (cookieValue.startsWith("csfcfc=") && sessionId == null) { + sessionId = cookieValue; + } + } + } + return sessionId; + } + + // 提取 __VIEWSTATE 隐藏字段的值 + private static String extractViewState(String html) { + // Try regex first for jakarta.faces.ViewState + String regexJakarta = "name=\"jakarta\\.faces\\.ViewState\"[^>]*value=\"([^\"]+)\""; + Pattern patternJakarta = Pattern.compile(regexJakarta); + Matcher matcherJakarta = patternJakarta.matcher(html); + + if (matcherJakarta.find()) { + return matcherJakarta.group(1); + } + + // Fallback to regex for javax.faces.ViewState (older versions or other parts of site) + String regexJavax = "name=\"javax\\.faces\\.ViewState\"[^>]*value=\"([^\"]+)\""; + Pattern patternJavax = Pattern.compile(regexJavax); + Matcher 
matcherJavax = patternJavax.matcher(html); + + if (matcherJavax.find()) { + return matcherJavax.group(1); + } + + // Fallback to string search if regex fails (less reliable) + String searchStringJakarta = "jakarta.faces.ViewState"; + int startIndexJakarta = html.indexOf(searchStringJakarta); + if (startIndexJakarta != -1) { + int valueStart = html.indexOf("value=\"", startIndexJakarta) + 7; + int valueEnd = html.indexOf("\"", valueStart); + if (valueStart != -1 && valueEnd != -1) { + return html.substring(valueStart, valueEnd); + } + } + + String searchStringJavax = "javax.faces.ViewState"; + int startIndexJavax = html.indexOf(searchStringJavax); + if (startIndexJavax != -1) { + int valueStart = html.indexOf("value=\"", startIndexJavax) + 7; + int valueEnd = html.indexOf("\"", valueStart); + if (valueStart != -1 && valueEnd != -1) { + return html.substring(valueStart, valueEnd); + } + } + + + System.err.println("Failed to extract ViewState from HTML"); + return ""; // Return empty string if not found + } + + // 生成搜索请求的 POST 数据 + private static String buildSearchPostData(String viewState) { + try { + // URL-encode the ViewState + String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); + + return "searchForm=searchForm" + + "&searchForm%3Aj_idt80=Midwifery" + // Assuming 'Midwifery' is the search term + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" + + 
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" + + "&searchForm%3Aj_idt287=" + // This parameter might be related to the search button click + "&jakarta.faces.ViewState=" + encodedViewState; // Changed to jakarta.faces.ViewState + } catch (Exception e) { + System.err.println("Error encoding search ViewState: " + e.getMessage()); + return ""; + } + } + + // 生成分页请求的 POST 数据 + private static String buildPostData(String viewState, int page) { + // The page 
parameter in the POST data might be 0-indexed or 1-indexed + // Let's assume it's 0-indexed for the parameter name and 1-indexed for the value based on your original code + int parameterPage = page - 1; + int valuePage = page; // The value sent in the form might be the actual page number + + try { + // URL-encode the ViewState + String encodedViewState = URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); + + return "resultForm=resultForm" + + "&resultForm%3Asorting%3ArowsPerPage=10" + + // The parameter name for pagination button might have changed + // Check browser network traffic for the exact parameter name for page buttons + "&resultForm%3ApaginationTop%3Aj_idt156%3A"+ parameterPage +"%3Aj_idt158=" + valuePage + + "&resultForm%3Asorting%3AsortingBy=SCORE" + + "&resultForm%3Asorting%3Aj_idt141=true" + // This might be for sorting direction + "&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation" + + "&selectedType=JSON" + // This might be for download format, potentially not needed for pagination + "&jakarta.faces.ViewState=" + encodedViewState; // Changed to jakarta.faces.ViewState + } catch (Exception e) { + System.err.println("Error encoding pagination ViewState: " + e.getMessage()); + return ""; + } + } +} diff --git a/src/main/java/com/example/getInKa.java b/src/main/java/com/example/getInKa.java new file mode 100644 index 0000000..c18d4e5 --- /dev/null +++ b/src/main/java/com/example/getInKa.java @@ -0,0 +1,165 @@ +package com.example; + +import org.apache.kafka.clients.producer.*; +import org.apache.kafka.common.serialization.StringSerializer; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.Response; + +import java.io.*; +import java.util.*; +import java.util.concurrent.Future; + +public class getInKa { + // 初始化 OkHttp 客户端,用于发送 HTTP 请求 + private static final 
OkHttpClient httpClient = new OkHttpClient(); + private static final String PROCESSED_URLS_FILE = "processed_urls.txt"; // 记录已处理的 URL 文件 + public static void main(String[] args) { + try { + // 获取目标 URL 列表 + System.out.println("Starting URL collection..."); + List urls = getUrls(); + System.out.println("Collected " + urls.size() + " URLs."); + + // 从 URL 中提取新闻数据并保存到 kafka + System.out.println("Starting news extraction..."); + getNews(urls); + System.out.println("News extraction completed."); + } catch (IOException | InterruptedException e) { + System.out.println("Error in main: " + e.getMessage()); + } + } + public static List getUrls() throws IOException, InterruptedException { + List urls = new ArrayList<>(); + Set processedUrls = loadProcessedUrls(); // 加载已处理的 URL + + for (int page = 1; page <= 28; page++) { + String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html"; + Request request = new Request.Builder() + .url(url) + .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0") + .build(); + + System.out.println("Fetching page " + page + ": " + url); + try (Response response = httpClient.newCall(request).execute()) { + if (response.isSuccessful() && response.body() != null) { + System.out.println("Successfully fetched page " + page); + String html = response.body().string(); + Document doc = Jsoup.parse(html); + Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a"); + List projectIDs = links.eachAttr("href"); + System.out.println("Found " + projectIDs.size() + " URLs on page " + page); + + for (String projectUrl : projectIDs) { + if (!processedUrls.contains(projectUrl)) { // 检查是否已处理 + urls.add(projectUrl); + processedUrls.add(projectUrl); // 添加到已处理集合 + } + } + } else { + System.out.println("Failed to fetch page " + page + ": Status code " + response.code()); + } + } + Thread.sleep(1000); + } + 
saveProcessedUrls(processedUrls); // 保存已处理的 URL + return urls; + } + public static void getNews(List urls) throws IOException { + for (int i = 0; i < urls.size(); i++) { + String url = urls.get(i); + Request request = new Request.Builder() + .url(url) + .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0") + .build(); + + System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url); + try (Response response = httpClient.newCall(request).execute()) { + if (response.isSuccessful() && response.body() != null) { + System.out.println("Successfully fetched news from " + url); + String html = response.body().string(); + Document doc = Jsoup.parse(html); + String title = doc.select("div.info-title.t-center > h1").text().trim(); + String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim(); + String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim(); + if (content.isEmpty()) { + content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim(); + } + + if (!title.isEmpty() && !date.isEmpty() && !content.isEmpty()) { + Map news = new HashMap<>(); + news.put("title", title); + news.put("date", date); + news.put("content", content); + news.put("url", url); + System.out.println("Extracted news: " + news.get("title")); + saveData(news); // 调用修改后的 saveData 方法 + } else { + System.out.println("Failed to extract complete data from " + url); + } + } else { + System.out.println("Failed to fetch news from " + url + ": Status code " + response.code()); + } + } catch (Exception e) { + System.out.println("An error occurred while fetching " + url + ": " + e.getMessage()); + } + try { + Thread.sleep(5000); // 休眠5秒 + } catch (InterruptedException e) { + System.out.println("Sleep interrupted: " + e.getMessage()); + } + } + } + public static void saveData(Map news) { + 
Properties properties = new Properties(); + properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); + properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + + try (Producer producer = new KafkaProducer<>(properties)) { + String topic = "news-topic"; + String key = news.get("title"); + String value = news.toString(); + ProducerRecord record = new ProducerRecord<>(topic, key, value); + + producer.send(record, (metadata, exception) -> { + if (exception == null) { + System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic() + + ", partition=" + metadata.partition() + ", offset=" + metadata.offset()); + } else { + System.err.println("Failed to send data to Kafka: " + exception.getMessage()); + } + }).get(); + } catch (Exception e) { + System.err.println("Error while sending data to Kafka: " + e.getMessage()); + } + } + // 加载已处理的 URL + private static Set loadProcessedUrls() throws IOException { + Set processedUrls = new HashSet<>(); + File file = new File(PROCESSED_URLS_FILE); + if (file.exists()) { + try (BufferedReader reader = new BufferedReader(new FileReader(file))) { + String line; + while ((line = reader.readLine()) != null) { + processedUrls.add(line.trim()); + } + } + } + return processedUrls; + } + + // 保存已处理的 URL + private static void saveProcessedUrls(Set processedUrls) throws IOException { + try (BufferedWriter writer = new BufferedWriter(new FileWriter(PROCESSED_URLS_FILE))) { + for (String url : processedUrls) { + writer.write(url); + writer.newLine(); + } + } + } +} diff --git a/src/main/java/com/example/jsonGetOk.java b/src/main/java/com/example/jsonGetOk.java new file mode 100644 index 0000000..ced112b --- /dev/null +++ b/src/main/java/com/example/jsonGetOk.java @@ -0,0 +1,47 @@ +package com.example; + +import okhttp3.*; +import org.json.JSONArray; +import org.json.JSONObject; 
+import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class jsonGetOk { + public static void main(String[] args) throws IOException { + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("text/plain"); + RequestBody body = RequestBody.create(mediaType, ""); + Request request = new Request.Builder() + .url("https://www.dsscu.gov.mo/api/common/page_detail?PostType=page&EntityId=6654829e-8163-b801-0096-c02e09d690d1") + .get() + .build(); + Response response = client.newCall(request).execute(); + String responseBody = response.body().string(); + + // 解析 JSON + JSONObject jsonObject = new JSONObject(responseBody); + JSONObject data = jsonObject.getJSONObject("data"); + String postTime = data.getString("onlineAt"); + JSONObject metas = data.getJSONObject("metas"); + String title = metas.getString("name"); + String summary = metas.getString("summary"); + Document parse = Jsoup.parse(summary); + String content = parse.text(); + String forwardcontent = responseBody; + String fileList = metas.getString("biddersFile"); + fileList = fileList+"###"+"pdf"; + Map map = new HashMap<>(); + map.put("postTime",postTime); + map.put("title",title); + map.put("content",content); + map.put("forwardcontent",forwardcontent); + map.put("fileList",fileList); + System.out.println(map); + } + +} diff --git a/src/main/java/com/example/ook.java b/src/main/java/com/example/ook.java new file mode 100644 index 0000000..2d67ed9 --- /dev/null +++ b/src/main/java/com/example/ook.java @@ -0,0 +1,256 @@ +package com.example; + +import okhttp3.*; +import org.json.JSONObject; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.Proxy; +import java.text.ParseException; +import 
java.text.SimpleDateFormat;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class ook {
+
+
+    /**
+     * Fetches the WRAIR press-release listing page through the local HTTP proxy
+     * at 127.0.0.1:7897 and prints the href of every element matching ".title a".
+     *
+     * @param args unused
+     * @throws Exception if the HTTP call fails or the response has no body
+     */
+    public static void main(String[] args) throws Exception {
+        // 1. Obtain the proxy address
+//        String proxyJson = getProxyFromLocalService();
+//        JSONObject proxyData = new JSONObject(proxyJson);
+//        String httpProxy = proxyData.getString("http"); // e.g. "http://proxy1:port"
+//
+//        // 2. Parse the proxy address
+//        String[] proxyParts = httpProxy.replace("http://", "").split(":");
+//        String proxyHost = proxyParts[0]; // proxy1
+//        int proxyPort = Integer.parseInt(proxyParts[1]); // port
+
+        OkHttpClient client = new OkHttpClient().newBuilder()
+                .connectTimeout(30, TimeUnit.SECONDS)
+                .readTimeout(30, TimeUnit.SECONDS)
+                .writeTimeout(30, TimeUnit.SECONDS)
+                .proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897))) // use port 7897 directly
+                .build();
+
+        // NOTE(review): the Cookie header value below is hard-coded; presumably it was
+        // captured from a browser session and will go stale -- confirm before relying on it.
+        Request request = new Request.Builder()
+                .url("https://wrair.health.mil/News-Media/Press-Releases/")
+                .get()
+                // Add the key request headers
+                .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
+                .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7")
+//                .addHeader("Accept-Encoding", "gzip, deflate, br, zstd")
+                .addHeader("Accept-Language", "zh-CN,zh;q=0.9,th;q=0.8")
+                .addHeader("Cache-Control", "no-cache")
+                .addHeader("Pragma", "no-cache")
+                .addHeader("Referer", "https://wrair.health.mil/News-Media/Press-Releases/")
+                .addHeader("Cookie", "_ga=GA1.1.516170455.1740971326; .ASPXANONYMOUS=xUBztj4Ek1vHfBPe-1QqFJhd83I4bkB1k0_d-2QrQ7drfd7R7Y6eNsyyHVjSeffyIKzy_qm5tOKOCtbvst-s9ZGWThxifCGMdJE117EQlr1OZARa0; dnn_IsMobile=False; language=en-US; ARRAffinity=c30f7cdebcf208f7c5a996cb410451c36532afc64703669607f68f04a75f4b39; _ga_CSLL4ZEK4L=GS1.1.1742349582.4.1.1742350035.0.0.0")
+                .addHeader("Upgrade-Insecure-Requests", "1")
+                .addHeader("Sec-Fetch-Dest", "document")
+                .addHeader("Sec-Fetch-Mode", "navigate")
+                .addHeader("Sec-Fetch-Site", "same-origin")
+                .addHeader("Sec-Fetch-User", "?1")
+                .addHeader("Sec-Ch-Ua", "\"Chromium\";v=\"134\", \"Not:A-Brand\";v=\"24\", \"Google Chrome\";v=\"134\"")
+                .addHeader("Sec-Ch-Ua-Mobile", "?0")
+                .addHeader("Sec-Ch-Ua-Platform", "\"Windows\"")
+                .addHeader("Priority", "u=0, i")
+                .build();
+
+        String html;
+        // try-with-resources closes the response (and frees its connection) even when reading fails.
+        try (Response response = client.newCall(request).execute()) {
+            if (response.body() == null) {
+                throw new IOException("Empty response body, HTTP " + response.code());
+            }
+            html = response.body().string();
+        }
+        Document parse = Jsoup.parse(html);
+//        String url = "https://www.uu.se/nyheter/alla?newsResearch=researchtopic11%3Bresearchtopic7%3Bresearchtopic22%3Bresearchtopic10%3Bresearchtopic2&start=20";
+//        // Define the regular expression
+//        String regex = "start=(\\d+)";
+//        Pattern pattern = Pattern.compile(regex);
+//        Matcher matcher = pattern.matcher(url);
+//        Integer start = 0;
+
+
+//        String postTime = convertToTimestamp(parse.select(".mr10").text());
+//        String title = parse.select(".hdg01").text();
+//        String content = parse.select(".container01 p").text();
+//        String forwardcontent = parse.select("#main").html();
+//        Map map = new HashMap<>();
+
+//        if (matcher.find()) {
+//            start = Integer.parseInt(matcher.group(1));
+//            System.out.println("Start: " + start); // start = 12
+//        }
+//
+//        Elements allLinks = new Elements();
+//        Elements links = parse.select(".search-result-hit-text-container a");
+//        allLinks.addAll(links);
+//
+//        int totalLinks = allLinks.size();
+//        int startIndex = Math.max(0, totalLinks - 10);
+//        for (int i = startIndex; i < totalLinks; i++) {
+//            Map task = new HashMap(16);
+//            task.put("link","https://www.uu.se"+allLinks.get(i).attr("href"));
+//            task.put("linktype", "newscontent"); // set the link type to "newscontent"
+//
+//            System.out.println(task);
+//        }
+        // Print the link target of every listing entry.
+        Elements elements = parse.select(".title a");
+        for (Element element : elements) {
+            String link = element.attr("href");
+            System.out.println(link);
+        }
+
+
+//        map.put("postTime",postTime);
+//        map.put("title",title);
+//        map.put("content",content);
+//        map.put("forwardcontent",forwardcontent);
+//        System.out.println(map);
+
+    }
+    /** No-op constructor; declared to throw IOException. */
+    public ook() throws IOException {
+    }
+//    public static String convertToTimestamp(String dateStr) {
+//        try {
+//            // Input format: dd MMMM , yyyy (e.g. "28 February , 2025")
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM dd, yyyy", Locale.ENGLISH);
+//            // Output format: yyyy-MM-dd HH:mm:ss
+//            DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+//            // Parse the input date
+//            LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//            // Convert to a date-time with the time set to 00:00:00
+//            return date.atStartOfDay().format(outputFormatter);
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//            return null; // or throw, depending on requirements
+//        }
+//    }
+//    public static String convertToTimestamp(String dateStr) {
+//        try {
+//            // Input format: yyyy (year) MM (month) dd (day)
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MM-dd-yyyy", Locale.CHINESE);
+//            // Output format: yyyy-MM-dd HH:mm:ss
+//            DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+//            // Parse the input date
+//            LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//            // Convert to a date-time with the time set to 00:00:00
+//            return date.atStartOfDay().format(outputFormatter);
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//            return null; // or throw, depending on requirements
+//        }
+//    }
+
+//    public static String convertToTimestamp(String dateStr) {
+//        try {
+//            // Input format
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy");
+//            // Output format
+//            DateTimeFormatter outputFormatter = 
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// +// // 解析输入字符串为 LocalDate +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// // 转换为 LocalDateTime,设置时间为 00:00:00 +// LocalDateTime dateTime = date.atStartOfDay(); +// // 格式化为目标字符串 +// return dateTime.format(outputFormatter); +// } catch (Exception e) { +// e.printStackTrace(); +// return null; // 或者抛出异常,根据需求调整 +// } +// } +// public static String convertToTimestamp(String dateStr) { +// try { +// // 定义输入格式:MMMM d, yyyy(例如 "June 3, 2015") +// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM, yyyy", Locale.ENGLISH); +// // 定义输出格式:yyyy-MM-dd HH:mm:ss +// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// +// // 解析输入日期 +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// // 转换为带时间的格式,时间设为 00:00:00 +// return date.atStartOfDay().format(outputFormatter); +// } catch (Exception e) { +// e.printStackTrace(); +// return null; // 或抛出异常,根据需求调整 +// } +// } +// public static String convertToTimestamp(String input) { +// try { +// // 正则匹配 "d MMMM yyyy" +// Pattern pattern = Pattern.compile("\\d{1,2} [A-Za-z]+ \\d{4}"); +// Matcher matcher = pattern.matcher(input); +// if (matcher.find()) { +// String dateStr = matcher.group(); +// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM yyyy", Locale.ENGLISH); +// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// return date.atStartOfDay().format(outputFormatter); +// } else { +// System.out.println("No date found in: " + input); +// return null; +// } +// } catch (Exception e) { +// e.printStackTrace(); +// return null; +// } +// } +// public static String convertToTimestamp(String dateStr) { +// try { +// // Parse the ISO 8601 date string (e.g., "2025-03-17T12:37:33.033Z") +// ZonedDateTime zdt = ZonedDateTime.parse(dateStr, DateTimeFormatter.ISO_DATE_TIME); +// 
+//        // Define the output format (yyyy-MM-dd hh:mm:ss)
+//        DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//
+//        // Format the date to the desired output
+//        return zdt.format(outputFormatter);
+//    } catch (Exception e) {
+//        e.printStackTrace();
+//        return null; // Or throw an exception, depending on your needs
+//    }
+//    }
+    /**
+     * Converts a date written like "Jan. 9, 2025" (abbreviated English month
+     * followed by a dot) into "yyyy-MM-dd HH:mm:ss" with the time set to midnight.
+     *
+     * @param dateStr the date text to convert
+     * @return the formatted timestamp, or null when the text cannot be parsed
+     *         (the failure's stack trace is printed)
+     */
+    public static String convertToTimestamp(String dateStr) {
+        try {
+            DateTimeFormatter sourcePattern = DateTimeFormatter.ofPattern("MMM. d, yyyy", Locale.ENGLISH);
+            DateTimeFormatter targetPattern = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+            // Midnight is used because the input carries no time of day.
+            return LocalDate.parse(dateStr, sourcePattern)
+                    .atStartOfDay()
+                    .format(targetPattern);
+        } catch (Exception parseFailure) {
+            parseFailure.printStackTrace();
+            return null;
+        }
+    }
+    /**
+     * Issues a GET to http://127.0.0.1:7897 and returns the response body.
+     *
+     * @return the body text (the original comment describes it as a JSON string)
+     * @throws Exception when the call fails or the status is not successful
+     */
+    private static String getProxyFromLocalService() throws Exception {
+        OkHttpClient localClient = new OkHttpClient();
+        Request proxyRequest = new Request.Builder()
+                .url("http://127.0.0.1:7897")
+                .get()
+                .build();
+
+        try (Response reply = localClient.newCall(proxyRequest).execute()) {
+            if (!reply.isSuccessful()) {
+                throw new Exception("获取代理失败,状态码: " + reply.code());
+            }
+            return reply.body().string();
+        }
+    }
+}
+
diff --git a/src/main/java/com/example/oook.java b/src/main/java/com/example/oook.java
new file mode 100644
index 0000000..d8c24d9
--- /dev/null
+++ b/src/main/java/com/example/oook.java
@@ -0,0 +1,524 @@
+package com.example;
+
+import okhttp3.*;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.Proxy;
+import java.text.ParseException;
+import 
java.text.SimpleDateFormat;
+import java.time.*;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class oook {
+
+
+    /**
+     * Downloads one iranintl.com article page, extracts its publish time, title,
+     * body text, page HTML and image URLs, and prints them as a map.
+     *
+     * @param args unused
+     * @throws Exception if the HTTP call fails or the response has no body
+     */
+    public static void main(String[] args) throws Exception {
+        // 1. Obtain the proxy address
+//        String proxyJson = getProxyFromLocalService();
+//        JSONObject proxyData = new JSONObject(proxyJson);
+//        String httpProxy = proxyData.getString("http"); // e.g. "http://proxy1:port"
+//
+//        // 2. Parse the proxy address
+//        String[] proxyParts = httpProxy.replace("http://", "").split(":");
+//        String proxyHost = proxyParts[0]; // proxy1
+//        int proxyPort = Integer.parseInt(proxyParts[1]); // port
+        OkHttpClient client = new OkHttpClient().newBuilder()
+                .connectTimeout(30, TimeUnit.SECONDS)
+                .readTimeout(30, TimeUnit.SECONDS)
+                .writeTimeout(30, TimeUnit.SECONDS)
+//                .cookieJar(new CookieJar() {
+//                    private final HashMap> cookieStore = new HashMap<>();
+//
+//                    @Override
+//                    public void saveFromResponse(HttpUrl url, List cookies) {
+//                        cookieStore.put(url.host(), cookies); // store the cookies
+//                    }
+//
+//                    @Override
+//                    public List loadForRequest(HttpUrl url) {
+//                        List cookies = cookieStore.get(url.host());
+//                        return cookies != null ? cookies : new ArrayList<>();
+//                    }
+//                })
+//                .followRedirects(true) // follow redirects automatically
+                .build();
+
+
+        // Send the target request, obtaining and using cookies automatically
+//        Request request = new Request.Builder()
+//                .url("https://thl.fi/aiheet/infektiotaudit-ja-rokotukset/ajankohtaista/infektio-ja-rokotusuutiset?p_p_id=com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_redirect=%2Faiheet%2Finfektiotaudit-ja-rokotukset%2Fajankohtaista%2Finfektio-ja-rokotusuutiset&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_delta=50&p_r_p_resetCur=false&_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_L2Jk5CCjrKPN_cur=1")
+//                .get()
+//                .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")
+//                .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
+//                .addHeader("Accept-Language", "en-US,en;q=0.5")
+//                .addHeader("Cookie", "__cf_bm=HXf4OleH9DiJmEagV_4Wori6vFzyN4wf.CBVL57AQUI-1743471952-1.0.1.1-h0KqPKUW2_wblBJ1HWbn50Xi1EPDIxjvFhRyrkdPrAoRHNjlXk..tK_KDWGUs6f4Z1VbQUbJD1Vw3KTi9IYO5bx5af4ZqE2nABBXT.YpLKQ; _cfuvid=jdweOOZm.a8GWXZGqRHb.fiSFMKZuAppyOlkDBbafw0-1743471952167-0.0.1.1-604800000") .build();
+//        OkHttpClient client = new OkHttpClient().newBuilder()
+//                .connectTimeout(30, TimeUnit.SECONDS)
+//                .readTimeout(30, TimeUnit.SECONDS)
+//                .writeTimeout(30, TimeUnit.SECONDS)
+////                .proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 7897))) // use port 7897 directly
+//                .build();
+        String url = "https://www.iranintl.com/en/202504116060";
+        Request request = new Request.Builder()
+                .url(url)
+                .get()
+                .build();
+
+        String html;
+        // try-with-resources closes the response (and frees its connection) even when reading fails.
+        try (Response response = client.newCall(request).execute()) {
+            if (response.body() == null) {
+                throw new IOException("Empty response body, HTTP " + response.code());
+            }
+            html = response.body().string();
+        }
+        // The page URL is passed as base URI so that absUrl("src") below can resolve relative links.
+        Document parse = Jsoup.parse(html, url);
+//        String htmlData = null;
+//        JSONArray jsonArray = new JSONArray(html);
+//        for (int i = 0; i < jsonArray.length(); i++) {
+//            JSONObject obj = jsonArray.getJSONObject(i);
+//            if ("insert".equals(obj.optString("command")) && obj.has("data")) {
+//                htmlData = obj.getString("data");
+//                break;
+//            }
+//        }
+//        Document doc = Jsoup.parse(htmlData);
+//        Elements rows = doc.select(".o-grid__item.col-1, .o-grid__item.col-2, .o-grid__item.col-3");
+//
+//        Set uniqueHrefs = new HashSet<>();
+//
+//        for (Element row : rows) {
+//            Elements links = row.select("a[href]"); // select all a tags
+//            if (!links.isEmpty()) {
+//                // take only the first href
+//                String href = links.first().attr("href");
+//                uniqueHrefs.add(href);
+//            }
+//        }
+////
+//        for (String href : uniqueHrefs) {
+//            System.out.println("Href: " + href);
+//        }
+//        String next = getNextPageUrl(url);
+//        System.out.println(next);
+
+//        JSONObject jsonObject = new JSONObject(html);
+//        JSONObject response1 = jsonObject.getJSONObject("response");
+//        JSONArray docs = response1.getJSONArray("docs");
+//
+//        // iterate over the docs array and extract each permalink
+//        for (int i = 0; i < docs.length(); i++) {
+//            JSONObject doc = docs.getJSONObject(i);
+//            String permalink = doc.getString("permalink");
+//            System.out.println("Permalink: " + permalink);
+//        }
+
+//        String url = "https://www.uu.se/nyheter/alla?newsResearch=researchtopic11%3Bresearchtopic7%3Bresearchtopic22%3Bresearchtopic10%3Bresearchtopic2&start=20";
+//        // Define the regular expression
+//        String regex = "start=(\\d+)";
+//        Pattern pattern = Pattern.compile(regex);
+//        Matcher matcher = pattern.matcher(url);
+//        Integer start = 0;
+//        String postTime = convertToTimestamp(
+//                parse.select(".c-news-info__date.o-meta span.c-date").text().trim() + " " +
+//                        parse.select(".c-news-info__date.o-meta span.c-year").text().trim()
+//        );
+//        String postTime = parse.select("tr:nth-child(3) td:nth-child(3)").text()+" 00:00:00";
+        String postTime = convertIsoToTimestamp(parse.select(".WrittenContentBlock-module__9pvVhW__timeAgo time").attr("datetime"));
+        String title = parse.select(".WrittenContentBlock-module__9pvVhW__headline").text();
+        String content = parse.select(".WrittenContentBlock-module__9pvVhW__body p").text();
+        String forwardcontent = parse.select(".page").html();
+        Elements imgs = parse.select(".page img");
+//        Elements pdfs = parse.select("tr:nth-child(3) td a");
+
+        List<String> imgList = new ArrayList<>();
+
+        for (Element img : imgs) {
+            // absUrl resolves relative, protocol-relative and absolute src values against
+            // the document base URI; it yields "" when there is no usable src.
+            String fullUrl = img.absUrl("src");
+            if (fullUrl.isEmpty()) {
+                continue;
+            }
+            // Entry format: "<url>###<type>"
+            imgList.add(fullUrl + "###" + "avif");
+        }
+
+//        String prefix = "";
+//
+//        List fileList = new ArrayList();
+//
+//        for (Element pdf : pdfs) {
+//            String pdfUrl = pdf.attr("href");
+//            if (pdfUrl != null && !pdfUrl.isEmpty()) {
+//                // check whether it starts with https
+//                String fullUrl;
+//                if (!pdfUrl.startsWith("https")) {
+//                    // if it does not start with https, prepend the prefix
+//                    if (pdfUrl.startsWith("/")) {
+//                        fullUrl = prefix + pdfUrl;
+//                    } else {
+//                        fullUrl = prefix + "/" + pdfUrl;
+//                    }
+//                } else {
+//                    fullUrl = pdfUrl;
+//                }
+//                // build the entry format
+//                String fileUrl = fullUrl + "###" + "pdf";
+//                fileList.add(fileUrl);
+//            }
+//        }
+//
+
+
+//        if (matcher.find()) {
+//            start = Integer.parseInt(matcher.group(1));
+//            System.out.println("Start: " + start); // start = 12
+//        }
+
+//        Elements allLinks = new Elements();
+//        Elements links = parse.select(".card-body a");
+//        allLinks.addAll(links);
+//
+//        int totalLinks = allLinks.size();
+//        int startIndex = Math.max(0, totalLinks - 10);
+//        for (int i = startIndex; i < totalLinks; i++) {
+//            Map task = new HashMap(16);
+//            task.put("link","https://www.uu.se"+allLinks.get(i).attr("href"));
+//            task.put("linktype", "newscontent"); // set the link type to "newscontent"
+//
+//            System.out.println(task);
+//        }
+
+//        Elements elements = parse.select(".topic__grid__item a");
+//        Integer count = elements.size();
+//        for (Element element : elements) {
+//            String link = element.attr("href"); // read the href attribute of the news link
+//            System.out.println(link);
+//        }
+
+//        if(count <10){
+//            String nextpageurl = getPreviousYearUrl(url);
+//            System.out.println(nextpageurl);
+//        }else {
+//            String nextpageurl = getNextPageUrl(url);
+//            System.out.println(nextpageurl);
+//        }
+        Map<String, Object> map = new HashMap<>();
+        map.put("postTime", postTime);
+        map.put("title", title);
+        map.put("content", content);
+        map.put("forwardcontent", forwardcontent);
+        map.put("imgList", imgList);
+//        map.put("fileList",fileList);
+        System.out.println(map);
+
+    }
+    /** No-op constructor; declared to throw IOException. */
+    public oook() throws IOException {
+    }
+//    public static String convertToTimestamp(String dateStr) {
+//        try {
+//            // Input format: dd MMMM , yyyy (e.g. "28 February , 2025")
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM dd, yyyy", Locale.ENGLISH);
+//            // Output format: yyyy-MM-dd HH:mm:ss
+//            DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//            dateStr = dateStr.replace("|", "").trim();
+//            // Parse the input date
+//            LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//            // Convert to a date-time with the time set to 00:00:00
+//            return date.atStartOfDay().format(outputFormatter);
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//            return null; // or throw, depending on requirements
+//        }
+//    }
+//    public static String convertToTimestamp(String dateStr) {
+//        try {
+//            // Strip the "Publié le" prefix and trim leftover characters
+//            dateStr = dateStr.replace("Publié le", "").trim();
+//
+//            // Input format: dd MMMM yyyy (e.g. "25 mars 2025")
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd MMMM yyyy", Locale.FRENCH);
+//
+//            // Output format: yyyy-MM-dd HH:mm:ss
+//            DateTimeFormatter outputFormatter = 
DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// +// // 解析输入日期 +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// +// // 转换为带时间的格式,时间设为 00:00:00 +// return date.atStartOfDay().format(outputFormatter); +// } catch (Exception e) { +// e.printStackTrace(); +// return null; // 或抛出异常,根据需求调整 +// } +// } +// public static String convertToTimestamp(String dateStr) { +// try { +// // 定义输入格式:yyyy 年 MM 月 dd 日 +// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MM-dd-yyyy", Locale.CHINESE); +// // 定义输出格式:yyyy-MM-dd HH:mm:ss +// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// +// // 解析输入日期 +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// // 转换为带时间的格式,时间设为 00:00:00 +// return date.atStartOfDay().format(outputFormatter); +// } catch (Exception e) { +// e.printStackTrace(); +// return null; // 或抛出异常,根据需求调整 +// } +// } + + // public static String convertToTimestamp(String dateStr) { +// try { +// // 定义输入格式 +// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy"); +// // 定义输出格式 +// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// +// // 解析输入字符串为 LocalDate +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// // 转换为 LocalDateTime,设置时间为 00:00:00 +// LocalDateTime dateTime = date.atStartOfDay(); +// // 格式化为目标字符串 +// return dateTime.format(outputFormatter); +// } catch (Exception e) { +// e.printStackTrace(); +// return null; // 或者抛出异常,根据需求调整 +// } +// } +// public static String convertToTimestamp(String dateStr) { +// try { +// // 定义输入格式:MMMM d, yyyy(例如 "June 3, 2015") +// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM, yyyy", Locale.ENGLISH); +// // 定义输出格式:yyyy-MM-dd HH:mm:ss +// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// +// // 解析输入日期 +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// // 转换为带时间的格式,时间设为 00:00:00 +// 
return date.atStartOfDay().format(outputFormatter); +// } catch (Exception e) { +// e.printStackTrace(); +// return null; // 或抛出异常,根据需求调整 +// } +// } +// public static String convertToTimestamp(String input) { +// try { +// // 正则匹配 "d MMMM yyyy" +// Pattern pattern = Pattern.compile("\\d{1,2} [A-Za-z]+ \\d{4}"); +// Matcher matcher = pattern.matcher(input); +// if (matcher.find()) { +// String dateStr = matcher.group(); +// DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d MMMM yyyy", Locale.ENGLISH); +// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// LocalDate date = LocalDate.parse(dateStr, inputFormatter); +// return date.atStartOfDay().format(outputFormatter); +// } else { +// System.out.println("No date found in: " + input); +// return null; +// } +// } catch (Exception e) { +// e.printStackTrace(); +// return null; +// } +// } +// public static String convertToTimestamp(String dateStr) { +// try { +// // Parse the ISO 8601 date string (e.g., "2025-03-17T12:37:33.033Z") +// ZonedDateTime zdt = ZonedDateTime.parse(dateStr, DateTimeFormatter.ISO_DATE_TIME); +// +// // Define the output format (yyyy-MM-dd hh:mm:ss) +// DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); +// +// // Format the date to the desired output +// return zdt.format(outputFormatter); +// } catch (Exception e) { +// e.printStackTrace(); +// return null; // Or throw an exception, depending on your needs +// } +// } +// public static String convertToTimestamp(String dateStr) { +// try { +// // Parse "Jan. 
9, 2025" (abbreviated month, dot, comma-separated)
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMM d, yyyy", Locale.ENGLISH);
+//            LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//
+//            // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00)
+//            DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//            return date.atStartOfDay().format(outputFormatter);
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//            return null;
+//        }
+//    }
+//    public static String convertToTimestamp(String dateStr) {
+//        try {
+//            // Extract the modification date from the text
+//            String modifiedDateStr = extractModifiedDate(dateStr);
+//            if (modifiedDateStr == null) {
+//                throw new IllegalArgumentException("无法找到修改日期");
+//            }
+//
+//            // Parse "20/12/2024" (day/month/year format, Italian style)
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("dd/MM/yyyy", Locale.ITALIAN);
+//            LocalDate date = LocalDate.parse(modifiedDateStr, inputFormatter);
+//
+//            // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00)
+//            DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//            return date.atStartOfDay().format(outputFormatter);
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//            return null;
+//        }
+//    }
+    /**
+     * Converts an ISO 8601 instant such as "2025-03-17T12:37:33.033Z" into
+     * "yyyy-MM-dd HH:mm:ss", expressed in UTC (no system-time-zone conversion).
+     * Inputs carrying a numeric offset (e.g. "+03:30") are accepted as well and
+     * are normalised to UTC.
+     *
+     * @param dateStr the ISO 8601 text; may be null or blank
+     * @return the UTC timestamp, or null when the input is null, blank or
+     *         unparseable (a stack trace is printed only for unparseable text)
+     */
+    public static String convertIsoToTimestamp(String dateStr) {
+        // A missing attribute arrives as "" -- that is "no date", not an error worth a stack trace.
+        if (dateStr == null || dateStr.trim().isEmpty()) {
+            return null;
+        }
+        try {
+            String text = dateStr.trim();
+            Instant instant;
+            try {
+                instant = Instant.parse(text);
+            } catch (DateTimeParseException notInstantForm) {
+                // Older JDKs reject offsets other than "Z" in Instant.parse; retry as an offset date-time.
+                instant = OffsetDateTime.parse(text).toInstant();
+            }
+            // Render the instant in UTC.
+            LocalDateTime utcDateTime = LocalDateTime.ofInstant(instant, ZoneOffset.UTC);
+            DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+            return utcDateTime.format(outputFormatter);
+        } catch (Exception e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+//    public static String convertToTimestamp(String dateStr) {
+//        try {
+//            // Create a Czech date formatter that parses "27. 
listopadu 2024"
+//            DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("d. MMMM yyyy", new Locale("cs", "CZ"));
+//            LocalDate date = LocalDate.parse(dateStr, inputFormatter);
+//
+//            // Convert to "yyyy-MM-dd HH:mm:ss", defaulting the time to 00:00:00
+//            DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+//            return date.atStartOfDay().format(outputFormatter);
+//        } catch (Exception e) {
+//            e.printStackTrace();
+//            return null;
+//        }
+//    }
+    /**
+     * Finds the modification date in a block of text: the first dd/MM/yyyy token
+     * on a line that contains "Modificato".
+     *
+     * @param text the text to scan; may be null
+     * @return the date token (e.g. "20/12/2024"), or null when none is found
+     */
+    private static String extractModifiedDate(String text) {
+        if (text == null) {
+            return null;
+        }
+        String[] lines = text.split("\n");
+        for (String line : lines) {
+            if (line.contains("Modificato")) {
+                // Look for the date among the whitespace-separated tokens of this line.
+                String[] parts = line.split("\\s+");
+                for (String part : parts) {
+                    if (part.matches("\\d{2}/\\d{2}/\\d{4}")) {
+                        return part; // e.g. "20/12/2024"
+                    }
+                }
+            }
+        }
+        return null; // no modification date found
+    }
+    /**
+     * Issues a GET to http://127.0.0.1:7897 and returns the response body.
+     *
+     * @return the body text (the original comment describes it as a JSON string)
+     * @throws Exception when the call fails or the status is not successful
+     */
+    private static String getProxyFromLocalService() throws Exception {
+        OkHttpClient client = new OkHttpClient();
+        Request request = new Request.Builder()
+                .url("http://127.0.0.1:7897")
+                .get()
+                .build();
+
+        try (Response response = client.newCall(request).execute()) {
+            if (response.isSuccessful()) {
+                return response.body().string(); // response body as text
+            } else {
+                throw new Exception("获取代理失败,状态码: " + response.code());
+            }
+        }
+    }
+    /**
+     * Builds the URL of the next listing page by incrementing the Liferay
+     * asset-publisher "_cur" page parameter.
+     *
+     * @param currentUrl the current listing URL; may be null
+     * @return the URL with the page number increased by one, or null when the
+     *         URL is null/blank, has no "_cur" parameter, or the page number
+     *         does not fit in an int
+     */
+    public static String getNextPageUrl(String currentUrl) {
+        if (currentUrl == null || currentUrl.trim().isEmpty()) {
+            return null;
+        }
+
+//        // Define the base URL
+//        String baseUrl = "https://www.pasteur.dz/fr/espace-presse";
+//
+//        // If this is the base URL it is page 1, so the next page is ?page=2
+//        if (currentUrl.equals(baseUrl)) {
+//            return baseUrl + "?start=5";
+//        }
+
+        // Match the "_cur=<digits>" page parameter.
+        String regex = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=(\\d+)";
+        Pattern pattern = Pattern.compile(regex);
+        Matcher matcher = pattern.matcher(currentUrl);
+
+        if (matcher.find()) {
+            // group(1) is the digit run captured by the parentheses.
+            String pageNumStr = matcher.group(1);
+            try {
+                int currentPage = Integer.parseInt(pageNumStr);
+                // Replace the old page number with the next one.
+                return matcher.replaceFirst("_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=" + (currentPage + 1));
+            } catch (NumberFormatException e) {
+                return null; // page number could not be parsed
+            }
+        } else {
+            return null;
+        }
+    }
+    /**
+     * Builds the listing URL for the previous year: decrements the "_year"
+     * parameter when present and resets the "_cur" page parameter to 1. When
+     * the URL has no page parameter, one is appended as a proper query
+     * parameter (using "?" or "&" as needed).
+     *
+     * @param url the current listing URL; may be null
+     * @return the rewritten URL, or null when the URL is null/blank
+     */
+    public static String getPreviousYearUrl(String url) {
+        if (url == null || url.trim().isEmpty()) {
+            return null;
+        }
+
+        String yearParam = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_year=";
+        String pageParam = "_com_liferay_asset_publisher_web_portlet_AssetPublisherPortlet_INSTANCE_gJ3hFqMQsykM_cur=";
+
+        // Step the year back by one when the URL carries a four-digit year.
+        Matcher yearMatcher = Pattern.compile(yearParam + "(\\d{4})").matcher(url);
+        if (yearMatcher.find()) {
+            int previousYear = Integer.parseInt(yearMatcher.group(1)) - 1;
+            url = yearMatcher.replaceFirst(yearParam + previousYear);
+        }
+
+        // Reset paging to the first page.
+        Matcher pageMatcher = Pattern.compile(pageParam + "(\\d+)").matcher(url);
+        if (pageMatcher.find()) {
+            return pageMatcher.replaceFirst(pageParam + "1");
+        }
+
+        // No page parameter yet: append one with the separator a query string needs
+        // (none if the URL already ends in "?" or "&").
+        String separator;
+        if (url.endsWith("?") || url.endsWith("&")) {
+            separator = "";
+        } else if (url.contains("?")) {
+            separator = "&";
+        } else {
+            separator = "?";
+        }
+        return url + separator + pageParam + "1";
+    }
+}
+
diff --git a/src/main/java/com/example/projTopic.java b/src/main/java/com/example/projTopic.java
new file mode 100644
index 0000000..f2377a7
--- /dev/null
+++ b/src/main/java/com/example/projTopic.java
@@ -0,0 +1,403 @@
+package com.example;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import okhttp3.*;
+import 
org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.StringSerializer; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.*; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class projTopic { + private static final String TOPIC_NAME = "projTopic"; + private static final String BOOTSTRAP_SERVERS = "node-01:19092"; + private static KafkaProducer producer; + private static ObjectMapper objectMapper = new ObjectMapper(); + private static final Random random = new Random(); + private static List proxyList = new ArrayList<>(); // 代理池 + private static int currentProxyIndex = 0; // 当前使用的代理索引 + static { + Properties props = new Properties(); + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); + props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认 + props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数 + producer = new KafkaProducer<>(props); + try { + proxyList = Files.readAllLines(Paths.get("proxy.txt")); + if (proxyList.isEmpty()) { + System.out.println("警告: proxy.txt 为空,未加载任何代理"); + } else { + System.out.println("成功加载 " + proxyList.size() + " 个代理"); + } + } catch (IOException e) { + System.err.println("读取 proxy.txt 失败: " + 
e.getMessage()); + } + } + + public static void main(String[] args) throws IOException, InterruptedException { + List keywords = Files.readAllLines(Paths.get("keywords.txt")); + List cleanedKeywords = new ArrayList<>(); + for (String keyword : keywords) { + String cleaned = keyword.split(",")[0].trim(); // 取逗号前的部分并去除首尾空格 + cleaned = cleaned.replaceAll("\\s+", "+"); // 替换所有空格为 + + cleanedKeywords.add(cleaned); + } + ExecutorService executor = Executors.newFixedThreadPool(4); // 4 个线程 + for (String keyword : cleanedKeywords) { + executor.submit(() -> { + try { + int sleepTime = random.nextInt(1001) + 30000; + String load = "5|0|20|https://www.nsf.gov/awardsearch/jsp/gwt/search/|57BE5CA45E781DC0159F727F8A8205EB|gov.nsf.research.awardsearch.gwt.client.SearchAwardService|getAwards|gov.nsf.research.awardsearch.gwt.bean.SearchRequestBean/3930579236|com.extjs.gxt.ui.client.data.PagingLoadConfig|java.util.HashMap/962170901|java.lang.String/2004016611|QueryText|" + keyword + "|ActiveAwards|true|com.extjs.gxt.ui.client.data.BasePagingLoadConfig/2011366567|com.extjs.gxt.ui.client.data.RpcMap/3441186752|sortField|sortDir|com.extjs.gxt.ui.client.Style$SortDir/640452531|offset|java.lang.Integer/3438268394|limit|1|2|3|4|2|5|6|5|7|2|8|9|8|10|8|11|8|12|13|0|1|14|4|15|0|16|17|0|18|19|0|20|19|30|"; + for(int i=0;;i++){ + OkHttpClient client = createClientWithProxy(); + MediaType mediaType = MediaType.parse("text/x-gwt-rpc; charset=UTF-8"); + RequestBody body = RequestBody.create(mediaType, load); + + Request request = new Request.Builder() + .url("https://www.nsf.gov/awardsearch/jsp/gwt/search/.searchaward") + .method("POST", body) + .addHeader("Content-Type", "text/x-gwt-rpc; charset=UTF-8") + .addHeader("X-GWT-Module-Base", "https://www.nsf.gov/awardsearch/jsp/gwt/search/") + .addHeader("X-GWT-Permutation", "368C3CF86AA4CD7DB2250B35B844C1C2") +// .addHeader("cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB") + .build(); + Response response = executeWithRetry(client, request, 
keyword); + String content = response.body().string(); + + Pattern pattern = Pattern.compile("\"awdNumber\",\"(\\d+)\""); + Matcher matcher = pattern.matcher(content); + + List numbers = new ArrayList<>(); // 用于存储匹配的数字 + // 查找并提取数字 + List additionalNumbers = new ArrayList<>(); + List urls = new ArrayList<>(); + // 查找匹配项 + while (matcher.find()) { + // 获取捕获到的数字,并将其添加到列表中 + numbers.add(matcher.group(1)); + } + + // 输出捕获到的数字 + if (numbers.isEmpty()) { + System.out.println("没找到awdNumber,继续下一种查找"); + + } else { + for (String number : numbers) { + additionalNumbers.add(number); + } + } + + Pattern additionalPattern = Pattern.compile("\"[^\"]+\",\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\"(?:,\"(?:\\d{2}/\\d{2}/\\d{4}|\\d+\\.\\d+)\")?,\"(\\d+)\""); + Matcher additionalMatcher = additionalPattern.matcher(content); + + + while (additionalMatcher.find()) { + additionalNumbers.add(additionalMatcher.group(1)); + } + if (additionalNumbers.isEmpty()) { + System.out.println("没找到下一页内容链接"); + Thread.sleep(sleepTime); + break; + } else { + for (String number : additionalNumbers) { + String url = "https://www.nsf.gov/awardsearch/showAward?AWD_ID=" + number + "&HistoricalAwards=false"; + urls.add(url); + } + } + if (!urls.isEmpty() && urls.get(0).equals("https://www.nsf.gov/awardsearch/showAward?AWD_ID=2446604&HistoricalAwards=false")) { + System.out.println("第一个 URL 是 AWD_ID=2446604,跳过关键词: " + keyword); + Thread.sleep(sleepTime); + return; // 跳出当前任务,处理下一个关键词 + } + for(String url:urls){ + OkHttpClient client2 = createClientWithProxy(); + MediaType mediaType2 = MediaType.parse("text/plain"); + RequestBody body2 = RequestBody.create(mediaType2, ""); + Request request2 = new Request.Builder() + .url(url) + .get() +// .addHeader("Cookie", "JSESSIONID=E9DCB88F6AD2241C9973AFEC03158ECB") + .build(); + Response response2 = executeWithRetry(client2, request2, keyword); + System.out.println(response2.code()); + String html = response2.body().string(); + Document parse = Jsoup.parse(html); + String 
title = parse.select(".pageheadline").text(); + String projectNum = parse.select(".clear tr:nth-child(5) .tabletext2:nth-child(2)").text(); + String projectLeader = parse.select(".clear tr:nth-child(13) .tabletext2:nth-child(2)").text(); + String projectStartTime = convertToTimestamp(parse.select(".clear tr:nth-child(8) .tabletext2:nth-child(2)").text()); + String projectEndTime = convertToTimestamp2(parse.select(".clear tr:nth-child(9) .tabletext2:nth-child(2)").text()); + String sponsorPart = parse.select(".clear tr:nth-child(2) .tabletext2:nth-child(2)").text(); + String country = "USA"; + String brief = parse.select(".clear.margintop25 span").text(); + String sponsor = parse.select(".clear tr:nth-child(1) .tabletext2:nth-child(2)").text(); + String projectFunding = parse.select(".clear tr:nth-child(12) .tabletext2:nth-child(2)").text(); + String relatedProject = parse.select(".clear tr:nth-child(20) .tabletext2:nth-child(2)").text(); + + + + String awardInstrument = parse.select(".clear tr:nth-child(6) .tabletext2:nth-child(2)").text(); + String programManager = parse.select(".clear tr:nth-child(7) .tabletext2:nth-child(2)").text(); + String totalIntendedAwardAmount = parse.select(".clear tr:nth-child(10) .tabletext2:nth-child(2)").text(); + String totalAwardedAmountToDate = parse.select(".clear tr:nth-child(11) .tabletext2:nth-child(2)").text(); + String recipientSponsoredResearchOffice = parse.select(".clear tr:nth-child(14) .tabletext2:nth-child(2)").text(); + String sponsorCongressionalDistrict = parse.select(".clear tr:nth-child(15) .tabletext2:nth-child(2)").text(); + String primaryPlaceOfPerformance = parse.select(".clear tr:nth-child(16) .tabletext2:nth-child(2)").text(); + String primaryPlaceOfPerformanceCongressionalDistrict = parse.select(".clear tr:nth-child(17) .tabletext2:nth-child(2)").text(); + String uniqueEntityIdentifier = parse.select(".clear tr:nth-child(18) .tabletext2:nth-child(2)").text(); + String parentUEI = parse.select(".clear 
tr:nth-child(19) .tabletext2:nth-child(2)").text(); + String primaryProgramSource = parse.select(".clear tr:nth-child(21) .tabletext2:nth-child(2)").text(); + String programReferenceCode = parse.select(".clear tr:nth-child(22) .tabletext2:nth-child(2)").text(); + String programElementCode = parse.select(".clear tr:nth-child(23) .tabletext2:nth-child(2)").text(); + String awardAgencyCode = parse.select(".clear tr:nth-child(24) .tabletext2:nth-child(2)").text(); + String fundAgencyCode = parse.select(".clear tr:nth-child(25) .tabletext2:nth-child(2)").text(); + String assistanceListingNumber = parse.select(".clear tr:nth-child(26) .tabletext2:nth-child(2)").text(); + String initialAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(3) .tabletext2:nth-child(2)").text()); + String latestAmendmentDate = convertToTimestamp(parse.select(".clear tr:nth-child(4) .tabletext2:nth-child(2)").text()); + + List> citations = extractAllCitationInfo(html); + Map data = new HashMap<>(); + data.put("title",title); + data.put("projectNum",projectNum); + data.put("projectLeader",projectLeader); + data.put("projectStartTime",projectStartTime); + data.put("projectEndTime",projectEndTime); + data.put("sponsorPart",sponsorPart); + data.put("country",country); + data.put("brief",brief); + data.put("sponsor",sponsor); + data.put("projectFunding",projectFunding); + data.put("relatedProject",relatedProject); + data.put("awardInstrument",awardInstrument); + data.put("programManager",programManager); + data.put("totalIntendedAwardAmount",totalIntendedAwardAmount); + data.put("totalAwardedAmountToDate",totalAwardedAmountToDate); + data.put("recipientSponsoredResearchOffice",recipientSponsoredResearchOffice); + data.put("sponsorCongressionalDistrict",sponsorCongressionalDistrict); + data.put("primaryPlaceOfPerformance",primaryPlaceOfPerformance); + data.put("primaryPlaceOfPerformanceCongressionalDistrict",primaryPlaceOfPerformanceCongressionalDistrict); + 
data.put("uniqueEntityIdentifier",uniqueEntityIdentifier); + data.put("parentUEI",parentUEI); + data.put("primaryProgramSource",primaryProgramSource); + data.put("programReferenceCode",programReferenceCode); + data.put("programElementCode",programElementCode); + data.put("awardAgencyCode",awardAgencyCode); + data.put("fundAgencyCode",fundAgencyCode); + data.put("assistanceListingNumber",assistanceListingNumber); + data.put("publications",citations); + data.put("initialAmendmentDate",initialAmendmentDate); + data.put("latestAmendmentDate",latestAmendmentDate); + data.put("crawlUrl",url); + data.put("crawlTime",localDateTime()); + Map result = new HashMap<>(); + result.put("keyword",keyword); + result.put("data",data); + try { + String jsonValue = objectMapper.writeValueAsString(result); + ProducerRecord record = new ProducerRecord<>(TOPIC_NAME, projectNum, jsonValue); + + producer.send(record, (metadata, exception) -> { + if (exception == null) { + System.out.println("成功发送到Kafka - Partition: " + metadata.partition() + + ", Offset: " + metadata.offset()); + } else { + System.err.println("发送到Kafka失败: " + exception.getMessage()); + } + }); + } catch (Exception e) { + System.err.println("序列化或发送Kafka消息失败: " + e.getMessage()); + + } + + Thread.sleep(sleepTime); + } + load = increaseOffsetBy30(load); + } + + } catch (Exception e) { + System.err.println("处理 " + keyword + " 失败: " + e.getMessage()); + e.printStackTrace(); + } + }); + } + executor.shutdown(); + executor.awaitTermination(5, TimeUnit.HOURS); + producer.close(); + } + + public static String convertToTimestamp(String dateStr) { + try { + // Parse "Jan. 
9, 2025" (abbreviated month, dot, comma-separated) + DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM d, yyyy", Locale.ENGLISH); + LocalDate date = LocalDate.parse(dateStr, inputFormatter); + + // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00) + DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + return date.atStartOfDay().format(outputFormatter); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + + } + public static String convertToTimestamp2(String dateStr) { + try { + // 移除 "(Estimated)" 部分 + String cleanDateStr = dateStr.replace(" (Estimated)", "").trim(); + + // Parse "June 30, 2025" (full month, day, comma-separated year) + DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("MMMM d, yyyy", Locale.ENGLISH); + LocalDate date = LocalDate.parse(cleanDateStr, inputFormatter); + + // Format to "yyyy-MM-dd HH:mm:ss" (defaulting time to 00:00:00) + DateTimeFormatter outputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + return date.atStartOfDay().format(outputFormatter); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + public static List> extractAllCitationInfo(String html) { + Document doc = Jsoup.parse(html); + List> citations = new ArrayList<>(); + + // 选择所有 margintop15 + Elements marginDivs = doc.select(".margintop15"); + Pattern urlPattern = Pattern.compile("javascript:popwin\\('(.*?)'\\)"); + + for (Element div : marginDivs) { + Map info = new HashMap<>(); + + // 提取 span 中的文本 + Elements spans = div.select("> span"); + if (spans.size() >= 3) { + info.put("authors", spans.get(0).text()); + info.put("title", spans.get(1).text()); + info.put("year", spans.get(2).text()); + } + + // 提取链接 + Elements links = div.select("a"); + String doiUrl = ""; + String citationUrl = ""; + for (Element link : links) { + String href = link.attr("href"); + Matcher matcher = urlPattern.matcher(href); + if (matcher.find()) { + String url = 
matcher.group(1); + if (link.text().contains("doi.org") && doiUrl.isEmpty()) { + doiUrl = url; + } else if (link.text().contains("引用详细信息") && citationUrl.isEmpty()) { + citationUrl = url; + } + } + } + info.put("doiUrl", doiUrl); + info.put("citationUrl", citationUrl); + + // 添加到结果列表 + citations.add(info); + } + + return citations; + } + public static String localDateTime(){ + LocalDateTime dateTime = LocalDateTime.now(); + + // 创建 DateTimeFormatter,定义日期时间的格式 + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + + // 使用 formatter 格式化 LocalDateTime + String formattedDateTime = dateTime.format(formatter); + + return formattedDateTime; // 输出类似: 2025-04-08 13:45:30 + } + public static String increaseOffsetBy30(String originalPayload) { + // 以 "|" 分割载荷为数组 + String[] parts = originalPayload.split("\\|"); + + // 检查数组长度,确保有足够元素 + if (parts.length < 4) { + throw new IllegalArgumentException("载荷格式无效,元素不足"); + } + + // 找到倒数第 4 个元素的位置 + int targetIndex = parts.length - 4; + + try { + // 将倒数第 4 个数字解析为整数 + int currentOffset = Integer.parseInt(parts[targetIndex]); + // 增加 30 + int newOffset = currentOffset + 30; + // 将新值放回数组 + parts[targetIndex] = String.valueOf(newOffset); + + // 重新拼接载荷 + return String.join("|", parts); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("倒数第 4 个元素不是有效数字: " + parts[targetIndex]); + } + } + private static Response executeWithRetry(OkHttpClient client, Request request, String keyword) throws IOException { + int maxRetries = proxyList.isEmpty() ? 
1 : proxyList.size(); // 如果没有代理,只尝试一次 + int attempt = 0; + + while (attempt < maxRetries) { + Response response = client.newCall(request).execute(); + if (response.code() == 403) { + System.out.println("收到 403 状态码,尝试切换代理重试..."); + response.close(); + switchProxy(); + client = createClientWithProxy(); // 使用新代理重建客户端 + attempt++; + if (attempt == maxRetries) { + throw new IOException("所有代理尝试失败,仍然收到 403"); + } + continue; + } + return response; // 成功或非 403 状态码,直接返回 + } + throw new IOException("无法执行请求,未获取响应"); + } + private static OkHttpClient createClientWithProxy() { + OkHttpClient.Builder builder = new OkHttpClient().newBuilder() + .connectTimeout(30, TimeUnit.SECONDS) + .readTimeout(30, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS); + + if (!proxyList.isEmpty() && currentProxyIndex < proxyList.size()) { + String proxy = proxyList.get(currentProxyIndex); + String[] proxyParts = proxy.split(":"); + if (proxyParts.length == 2) { + String proxyHost = proxyParts[0]; + int proxyPort = Integer.parseInt(proxyParts[1]); + builder.proxy(new java.net.Proxy(java.net.Proxy.Type.HTTP, + new java.net.InetSocketAddress(proxyHost, proxyPort))); + System.out.println("使用代理: " + proxy); + } + } + return builder.build(); + } + private static synchronized void switchProxy() { + if (proxyList.isEmpty()) return; + currentProxyIndex = (currentProxyIndex + 1) % proxyList.size(); + System.out.println("切换到新代理: " + proxyList.get(currentProxyIndex)); + } +} diff --git a/src/main/java/com/example/saveInES.java b/src/main/java/com/example/saveInES.java new file mode 100644 index 0000000..ce1b21d --- /dev/null +++ b/src/main/java/com/example/saveInES.java @@ -0,0 +1,122 @@ +package com.example; + +import co.elastic.clients.elasticsearch.ElasticsearchClient; +import co.elastic.clients.elasticsearch.core.IndexRequest; +import co.elastic.clients.elasticsearch.core.IndexResponse; +import co.elastic.clients.json.jackson.JacksonJsonpMapper; +import 
co.elastic.clients.transport.ElasticsearchTransport; +import co.elastic.clients.transport.rest_client.RestClientTransport; +import org.apache.http.HttpHost; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.elasticsearch.client.RestClient; + +import java.io.IOException; +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +public class saveInES { + public static void main(String[] args) { + ElasticsearchClient esClient = createElasticsearchClient(); + Properties properties = new Properties(); + properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); + properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); + properties.put(ConsumerConfig.GROUP_ID_CONFIG, "news-consumer-group"); + properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); // 关闭自动提交偏移量 + // 创建 Kafka 消费者 + KafkaConsumer consumer = new KafkaConsumer<>(properties); + + // 订阅主题 + String topic = "news-topic"; // Kafka 主题 + consumer.subscribe(Collections.singletonList(topic)); + + // 消费消息 + try { + while (true) { + // 拉取消息 + ConsumerRecords records = consumer.poll(Duration.ofMillis(1000)); + + // 处理消息 + for (ConsumerRecord record : records) { + System.out.println("Received message: key=" + record.key() + ", value=" + record.value()); + + // 将消息保存到 Elasticsearch + saveToElasticsearch(esClient, record.value()); + } + consumer.commitSync(); + } + } finally { + // 关闭消费者 + consumer.close(); + try { + esClient._transport().close(); + 
} catch (IOException e) { + System.err.println("Error closing Elasticsearch client: " + e.getMessage()); + } + } + } + + /** + * 初始化 Elasticsearch 客户端 + */ + private static ElasticsearchClient createElasticsearchClient() { + RestClient restClient = RestClient.builder(new HttpHost("localhost", 9200)).build(); + ElasticsearchTransport transport = new RestClientTransport(restClient, new JacksonJsonpMapper()); + return new ElasticsearchClient(transport); + } + + /** + * 将消息保存到 Elasticsearch + * + * @param esClient Elasticsearch 客户端 + * @param message 消息内容(JSON 格式) + */ + private static void saveToElasticsearch(ElasticsearchClient esClient, String message) { + try { + // 将消息解析为 Map(假设消息是 JSON 格式) + Map news = parseMessageToMap(message); + String docId = news.get("url"); + // 创建索引请求 + IndexRequest> request = IndexRequest.of(b -> b + .index("news") // 索引名称 + .id(docId) + .document(news) // 要保存的数据 + ); + + // 执行索引请求 + IndexResponse response = esClient.index(request); + System.out.println("Data saved to Elasticsearch: ID=" + response.id()); + } catch (Exception e) { + System.err.println("Failed to save data to Elasticsearch: " + e.getMessage()); + } + } + + /** + * 将消息解析为 Map + * + * @param message 消息内容(JSON 格式) + * @return 解析后的 Map + */ + private static Map parseMessageToMap(String message) { + // 这里假设消息是 JSON 格式,例如:{"title":"...", "date":"...", "content":"...", "url":"..."} + // 可以使用 JSON 库(如 Jackson)解析消息 + // 这里简单地将消息按逗号分割并转换为 Map + Map map = new HashMap<>(); + String[] pairs = message.replace("{", "").replace("}", "").split(","); + for (String pair : pairs) { + String[] keyValue = pair.split("="); + if (keyValue.length == 2) { + map.put(keyValue[0].trim(), keyValue[1].trim()); + } + } + return map; + } +} \ No newline at end of file diff --git a/src/main/java/com/example/test.java b/src/main/java/com/example/test.java new file mode 100644 index 0000000..e136915 --- /dev/null +++ b/src/main/java/com/example/test.java @@ -0,0 +1,101 @@ +package com.example;// 
注意:如果你使用手动设置路径,就不需要导入 WebDriverManager 了 +// import io.github.bonigarcia.wdm.WebDriverManager; + +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.support.ui.WebDriverWait; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.openqa.selenium.NoSuchElementException; +import org.openqa.selenium.TimeoutException; + +import java.time.Duration; +import java.util.List; + +public class test { // 更改类名以示区别 + + public static void main(String[] args) { + // 手动设置 ChromeDriver 的路径 (如果你选择手动方式的话) + // *** 将这里的路径替换为你实际的 chromedriver.exe 路径 *** + System.setProperty("webdriver.chrome.driver", "F:\\tool\\EasySpider_0.6.2_Windows_x64\\EasySpider_windows_x64\\EasySpider\\resources\\app\\chrome_win64\\chromedriver_win64.exe"); + + // 如果你选择使用 WebDriverManager,则使用以下代码替代上面的 System.setProperty: + // import io.github.bonigarcia.wdm.WebDriverManager; + // WebDriverManager.chromedriver().setup(); + + + WebDriver driver = null; + + try { + // 配置 Chrome 选项 (可选) + ChromeOptions options = new ChromeOptions(); + // options.addArguments("--headless"); // 启用无头模式 + // options.addArguments("--disable-gpu"); + + // 初始化 WebDriver + driver = new ChromeDriver(options); + + // 直接打开包含搜索条件的 URL + // 注意这里使用的 URL 已经包含了查询参数 + driver.get("https://patentscope.wipo.int/search/en/result.jsf?query=FP:(AI)"); + + // 设置一个显式等待 + WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(20)); + + // 由于直接打开了结果页,我们不再需要等待搜索框和点击按钮 + // 直接等待搜索结果列表加载 + // *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** + // "div.ps-result-list" 是一个可能的 CSS 选择器示例,你需要根据实际页面确认 + wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.ps-result-list"))); + + // --- 在这里添加提取搜索结果的代码 --- + // *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** + List resultItems = driver.findElements(By.cssSelector("div.ps-result-item")); // 定位每个结果项 + + 
System.out.println("Found " + resultItems.size() + " results:"); + + for (WebElement resultItem : resultItems) { + try { + // 提取标题 (示例选择器) + // *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** + WebElement titleElement = resultItem.findElement(By.cssSelector("span.ps-field-value.ps-field-title")); + String title = titleElement.getText().trim(); + + // 提取链接 (示例选择器) + // *** 请使用浏览器开发者工具确认这里的元素定位器是否正确 *** + WebElement linkElement = resultItem.findElement(By.tagName("a")); + String patentLink = linkElement.getAttribute("href"); + + + System.out.println("Title: " + title + ", Link: " + patentLink); + + } catch (NoSuchElementException e) { + System.out.println("Could not find title or link for a result item in this result item."); + continue; + } + } + + // --- 处理分页(如果需要)--- + // 这部分逻辑与之前相同,你需要找到下一页按钮的定位器并实现循环点击和等待 + // 尽管是直接打开结果页,如果结果有多页,你仍然需要处理分页来获取所有结果。 + // ... + + + } catch (TimeoutException e) { + System.err.println("等待元素超时,可能页面结构发生变化或加载缓慢: " + e.getMessage()); + } catch (NoSuchElementException e) { + System.err.println("未能找到指定的元素,请检查元素定位器是否正确: " + e.getMessage()); + } catch (Exception e) { + System.err.println("发生其他错误: " + e.getMessage()); + e.printStackTrace(); + } finally { + // 关闭浏览器 + if (driver != null) { + driver.quit(); + System.out.println("Browser closed."); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/com/example/testContent.java b/src/main/java/com/example/testContent.java new file mode 100644 index 0000000..821211a --- /dev/null +++ b/src/main/java/com/example/testContent.java @@ -0,0 +1,103 @@ +package com.example; + +import okhttp3.*; +import org.joda.time.DateTime; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.IOException; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; + +public class testContent { + public static void main(String[] args) 
throws IOException { + String url = "https://www.drks.de/search/de/trial/DRKS00036725/details"; + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("application/json"); + Request request = new Request.Builder() + .url(url) + .get() +// .addHeader("Cookie", "JSESSIONID=F6B6320CBBC2A27482AEFC0EC641EBF8; JSESSIONID=D9A5D49C09D091E9791733727D8AF2F1") + .addHeader("Content-Type", "application/json") + .build(); + Response response = client.newCall(request).execute(); + String html = response.body().string(); + Document parse = Jsoup.parse(html); + String title = parse.select(".title-bold").text(); + String registNum = parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(2)").text(); + String registTime = convertDate(parse.select(".card.trial-details-float.mb-4 .card-body dl dd:nth-child(6)").text()); + Map sponsor = new HashMap<>(); + String header = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-header > h4").text(); + String site = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(2) > div").text(); + String telefon = parse.select("body > main > div.card-body > div:nth-child(9) > div.card-body > div > div > div > div.card-body > dl > dd:nth-child(4) > span").text(); + String disease = parse.select("body > main > div.card-body > div:nth-child(6) > div.card-body > div > div:nth-child(2) > dl > dd:nth-child(2) > span").text(); + String studyType = parse.select("body > main > div.card-body > div:nth-child(3) > div.card-body > dl").text(); + String inclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(3) > div > div.card-body > div > div.col-12.mt-3 > dl > dd > span").text(); + String exclusionCriteria = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > 
div:nth-child(2) > div:nth-child(4) > div > div.card-body > p > span").text(); + String country = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(1) > div > div.card-body > dl > dd:nth-child(2)").text(); + String intervention = parse.select("body > main > div.card-body > div:nth-child(4) > div.card-body > dl").text(); + String primaryOutcome = parse.select("body > main > div.card-body > div:nth-child(5) > div.card-body > div > div > dl").text(); + String enrollment = parse.select("body > main > div.card-body > div:nth-child(7) > div.card-body > div:nth-child(2) > div:nth-child(2) > div > div.card-body > div > div:nth-child(5) > dl > dd > span").text(); + sponsor.put("header",header); + sponsor.put("site",site); + sponsor.put("telefon",telefon); + Map resultData = new HashMap<>(); + resultData.put("title", title); + resultData.put("registNum",registNum); + resultData.put("registTime",registTime); + resultData.put("registStatus","无"); + resultData.put("registTitle","无"); + resultData.put("fullTitle","无"); + resultData.put("sponsor",sponsor); + resultData.put("sponsorPart","无"); + resultData.put("studyType",studyType); + resultData.put("phase","无"); + resultData.put("disease",disease); + resultData.put("studyDesign","无"); + resultData.put("studyObjective","无"); + resultData.put("studyStartDate","无"); + resultData.put("inclusionCriteria",inclusionCriteria); + resultData.put("exclusionCriteria",exclusionCriteria); + resultData.put("currentStatus","无"); + resultData.put("enrollment",enrollment); + resultData.put("country",country); + resultData.put("tagTime","无"); + resultData.put("intervention",intervention); + resultData.put("primaryOutcome",primaryOutcome); + resultData.put("crawlTime",getCurrentTime()); + resultData.put("crawlUrl",url); + resultData.put("postTime",registTime); + resultData.put("content","content"); + resultData.put("forwardcontent","forwardcontent"); + System.out.println(resultData); + } 
+ public static String convertDate(String inputDate) { + try { + // 输入格式:dd.MM.yyyy + SimpleDateFormat inputFormat = new SimpleDateFormat("dd.MM.yyyy"); + // 解析输入日期 + Date date = inputFormat.parse(inputDate); + // 输出格式:yyyy-MM-dd HH:mm:ss + SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + // 转换为目标格式 + return outputFormat.format(date); + } catch (ParseException e) { + // 处理解析异常 + return "Invalid date format"; + } + } + public static String getCurrentTime() { + // 创建 DateTimeFormatter,指定输出格式 + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + // 获取当前时间 + LocalDateTime now = LocalDateTime.now(); + // 格式化 + return now.format(formatter); + } +} diff --git a/src/main/java/com/example/testList.java b/src/main/java/com/example/testList.java new file mode 100644 index 0000000..b43bade --- /dev/null +++ b/src/main/java/com/example/testList.java @@ -0,0 +1,340 @@ +package com.example; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class testList { + public static void main(String[] args) throws Exception { + String targetUrl = "https://www.drks.de/search/de/results?page=4"; + String baseUrl = "https://www.drks.de/search/de"; + String hostUrl = "https://www.drks.de"; + String cleanUrl = targetUrl.split("\\?")[0]; + System.out.println("Pure URL: " + cleanUrl); + + + String pageNumber = targetUrl.contains("?page=") ? 
targetUrl.split("page=")[1] : "1"; + int page = Integer.parseInt(pageNumber); + System.out.println("Page Number: " + page); + + // 存储 cookies + Set cookieSet = new HashSet<>(); + String sessionId = null; + + // 第一步:初始 GET 请求,获取 cookies 和 ViewState + URL initialUrl = new URL(baseUrl); + HttpURLConnection initialConn = (HttpURLConnection) initialUrl.openConnection(); + initialConn.setRequestMethod("GET"); + initialConn.setInstanceFollowRedirects(false); + initialConn.setConnectTimeout(10000); + initialConn.setReadTimeout(10000); + + // 捕获 cookies + sessionId = updateCookies(initialConn, cookieSet); + System.out.println("Initial Cookies: " + cookieSet); + System.out.println("Initial Session ID: " + sessionId); + + // 读取响应内容以获取 ViewState + BufferedReader in = new BufferedReader(new InputStreamReader(initialConn.getInputStream())); + StringBuilder content = new StringBuilder(); + String inputLine; + while ((inputLine = in.readLine()) != null) { + content.append(inputLine); + } + in.close(); + initialConn.disconnect(); + + // 提取初始 ViewState + String initialViewState = extractViewState(content.toString()); + System.out.println("Initial ViewState: " + initialViewState); + + // 第二步:发送搜索 POST 请求 + HttpURLConnection searchConn = (HttpURLConnection) new URL(baseUrl).openConnection(); + searchConn.setRequestMethod("POST"); + searchConn.setInstanceFollowRedirects(false); + searchConn.setDoOutput(true); + searchConn.setConnectTimeout(10000); + searchConn.setReadTimeout(10000); + + // 设置搜索请求的请求头 + searchConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + searchConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + searchConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + searchConn.setRequestProperty("Origin", "https://www.drks.de"); + searchConn.setRequestProperty("Referer", baseUrl); + 
searchConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + // 构建搜索请求的 POST 数据 + String searchPostData = buildSearchPostData(initialViewState); + System.out.println("Search POST Data: " + searchPostData); + + // 发送搜索 POST 请求 + try (OutputStream os = searchConn.getOutputStream()) { + byte[] input = searchPostData.getBytes(StandardCharsets.UTF_8); + os.write(input, 0, input.length); + } + + // 更新 cookies + String searchSessionId = updateCookies(searchConn, cookieSet); + System.out.println("Search Cookies: " + cookieSet); + System.out.println("Search Session ID: " + searchSessionId); + + // 处理搜索响应 + int searchResponseCode = searchConn.getResponseCode(); + System.out.println("Search Response Code: " + searchResponseCode); + String redirectUrl = searchConn.getHeaderField("Location"); + searchConn.disconnect(); + + if (searchResponseCode != 302 || redirectUrl == null) { + System.err.println("Search request did not return expected 302 redirect. Response code: " + searchResponseCode); + return; + } + System.out.println("Redirect URL (raw): " + redirectUrl); + + // 解析相对 URL + if (!redirectUrl.startsWith("http")) { + redirectUrl = hostUrl + (redirectUrl.startsWith("/") ? 
redirectUrl : "/" + redirectUrl); + } + System.out.println("Resolved Redirect URL: " + redirectUrl); + + // 第三步:跟随重定向(使用 GET 请求) + URL resultsUrl = new URL(redirectUrl); + HttpURLConnection resultsConn = (HttpURLConnection) resultsUrl.openConnection(); + resultsConn.setRequestMethod("GET"); + resultsConn.setInstanceFollowRedirects(false); + resultsConn.setConnectTimeout(10000); + resultsConn.setReadTimeout(10000); + resultsConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + resultsConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + resultsConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64ек; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + // 更新 cookies + String resultsSessionId = updateCookies(resultsConn, cookieSet); + System.out.println("Results Cookies: " + cookieSet); + System.out.println("Results Session ID: " + resultsSessionId); + + // 读取重定向后的结果页面内容 + BufferedReader resultsReader = new BufferedReader(new InputStreamReader(resultsConn.getInputStream())); + StringBuilder resultsContent = new StringBuilder(); + while ((inputLine = resultsReader.readLine()) != null) { + resultsContent.append(inputLine); + } + resultsReader.close(); + resultsConn.disconnect(); + + // 提取页面中的 ViewState(状态信息,用于后续请求) + String viewState = extractViewState(resultsContent.toString()); + System.out.println("Results ViewState: " + viewState); + + // 检查 Session ID 是否一致,确保会话未被重置 + if (sessionId != null && !sessionId.equals(resultsSessionId)) { + System.out.println("Warning: Session ID changed. 
Initial: " + sessionId + ", Results: " + resultsSessionId); + } + + // Step 4: 第四步:发送分页请求(使用 POST) + HttpURLConnection postConn = (HttpURLConnection) new URL(cleanUrl).openConnection(); + postConn.setRequestMethod("POST"); + postConn.setInstanceFollowRedirects(false); + postConn.setDoOutput(true); + postConn.setConnectTimeout(10000); + postConn.setReadTimeout(10000); + + // 设置分页请求的请求头(非 AJAX,模拟浏览器常规请求) + postConn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); + postConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + postConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + postConn.setRequestProperty("Origin", "https://www.drks.de"); + postConn.setRequestProperty("Referer", cleanUrl); + postConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + postConn.setRequestProperty("Sec-Fetch-Dest", "document"); + postConn.setRequestProperty("Sec-Fetch-Mode", "navigate"); + + // 构建分页请求的 POST 参数(包括页码和 ViewState 等) + String postData = buildPostData(viewState, page); + System.out.println("Pagination POST Data: " + postData); + + // 发送分页的 POST 请求 + try (OutputStream os = postConn.getOutputStream()) { + byte[] input = postData.getBytes(StandardCharsets.UTF_8); + os.write(input, 0, input.length); + } + + // 更新 cookies(分页响应可能返回新的 Set-Cookie) + String paginationSessionId = updateCookies(postConn, cookieSet); + System.out.println("Pagination Cookies: " + cookieSet); + System.out.println("Pagination Session ID: " + paginationSessionId); + + // 处理分页响应 + int responseCode = postConn.getResponseCode(); + System.out.println("Pagination Response Code: " + responseCode); + + // 读取分页响应的 HTML 内容 + StringBuilder postContent = new StringBuilder(); + try (BufferedReader postReader = new BufferedReader( + new 
InputStreamReader( + responseCode >= 400 ? postConn.getErrorStream() : postConn.getInputStream()))) { + while ((inputLine = postReader.readLine()) != null) { + postContent.append(inputLine); + } + } + Document parse = null; + if (responseCode == HttpURLConnection.HTTP_MOVED_TEMP + || responseCode == HttpURLConnection.HTTP_MOVED_PERM + || responseCode == HttpURLConnection.HTTP_SEE_OTHER) { + String newUrl = postConn.getHeaderField("Location"); + System.out.println("Pagination Redirecting to: " + newUrl); + + // 解析重定向中的相对地址为完整 URL(如果是相对路径) + if (!newUrl.startsWith("http")) { + newUrl = hostUrl + (newUrl.startsWith("/") ? newUrl : "/" + newUrl); + } + + // 重定向 + URL redirectConn = new URL(newUrl); + HttpURLConnection followConn = (HttpURLConnection) redirectConn.openConnection(); + followConn.setRequestMethod("GET"); + followConn.setRequestProperty("Cookie", String.join("; ", cookieSet)); + followConn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"); + followConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"); + + BufferedReader redirectReader = new BufferedReader(new InputStreamReader(followConn.getInputStream())); + StringBuilder redirectContent = new StringBuilder(); + while ((inputLine = redirectReader.readLine()) != null) { + redirectContent.append(inputLine); + } + redirectReader.close(); + followConn.disconnect(); + + System.out.println("Redirect Response: " + redirectContent); + parse = Jsoup.parse(String.valueOf(redirectContent)); + } else if (responseCode == 200) { + System.out.println("Pagination Response: " + postContent); + parse = Jsoup.parse(String.valueOf(postContent)); + } + + + + Elements links = parse.select("div[data-label='Titel der Studie'] a"); + + for (Element link : links) { + String href = link.attr("href"); + String text = 
link.text(); + + System.out.println("链接: " + href); + System.out.println("标题: " + text); + } + String text = parse.select("div.col-md-2.pt-3.ps-0.text-md-end").text(); + // 使用正则表达式提取 "第" 和 "/" 之间的数字 + String regex = "Seite\\s*(\\d+)\\s*/"; + Matcher matcher = Pattern.compile(regex).matcher(text); + if (matcher.find()) { + System.out.println("总共有"+matcher.group(1));// 返回第一个捕获组,即数字 "1" + } + postConn.disconnect(); + } + // 更新并返回当前连接中的 Cookie,包含 JSESSIONID 的提取 + private static String updateCookies(HttpURLConnection conn, Set cookieSet) { + String sessionId = null; + Map> headerFields = conn.getHeaderFields(); + List cookiesHeader = headerFields.get("Set-Cookie"); + if (cookiesHeader != null) { + for (String cookie : cookiesHeader) { + String cookieValue = cookie.split(";")[0]; + cookieSet.add(cookieValue); + if (cookieValue.startsWith("JSESSIONID=") || cookieValue.startsWith("csfcfc=")) { + sessionId = cookieValue; + } + } + } + return sessionId; + } + // 提取 __VIEWSTATE 隐藏字段的值 + private static String extractViewState(String html) { + if (html == null || html.isEmpty()) { + System.err.println("HTML content is empty or null"); + return ""; + } + + // 兼容 jakarta.faces.ViewState 和 javax.faces.ViewState + String regex = "]*name=[\"'](?:jakarta|javax)\\.faces\\.ViewState[\"'][^>]*value=[\"']([^\"']+)[\"']"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(html); + + if (matcher.find()) { + return matcher.group(1); + } + + System.err.println("Failed to extract ViewState from HTML"); + return ""; + } + + // 生成搜索请求的 POST 数据 + private static String buildSearchPostData(String viewState) { + try { + return "searchForm=searchForm" + + "&searchForm%3Aj_idt80=Midwifery" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AdrksId=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AsecondaryId=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AscientificSummary=" + + 
"&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aoutcome=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthOfCondition=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AhealthyVolunteers=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aaddresses=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt128=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3AipdSharingPlan=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt135%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt146%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Acharacteristics%3Aj_idt157%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Agender=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AageInYears=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AinclusionCriteria=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AexclusionCriteria=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3AtrialStatus=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3ArecrutingLocation=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Afrom=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3Arecruitment%3Aj_idt213%3Ato=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3Apurpose=" + + "&searchForm%3AextendedSearch%3AextendedSearchTabs%3AtrialDesign%3AstudyType=" + + "&searchForm%3Aj_idt287=" + + 
"&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); + } catch (Exception e) { + System.err.println("Error encoding search ViewState: " + e.getMessage()); + return ""; + } + } + // 生成分页请求的 POST 数据 + private static String buildPostData(String viewState, int page) { + int adjustedPage = page - 1; + try { + return "resultForm=resultForm" + + "&resultForm%3Asorting%3ArowsPerPage=10" + + "&resultForm%3ApaginationTop%3Aj_idt156%3A"+ adjustedPage +"%3Aj_idt158=" + page + + "&resultForm%3Asorting%3AsortingBy=SCORE" + + "&resultForm%3Asorting%3Aj_idt141=true" + + "&resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation=resultForm%3Aj_idt221%3Aj_idt223%3AdownloadConfirmation" + + "&selectedType=JSON" + + "&javax.faces.ViewState=" + URLEncoder.encode(viewState, StandardCharsets.UTF_8.name()); + } catch (Exception e) { + System.err.println("Error encoding pagination ViewState: " + e.getMessage()); + return ""; + } + } +} \ No newline at end of file diff --git a/src/main/java/com/example/umlistTest.java b/src/main/java/com/example/umlistTest.java new file mode 100644 index 0000000..3e9b959 --- /dev/null +++ b/src/main/java/com/example/umlistTest.java @@ -0,0 +1,22 @@ +package com.example; + +import okhttp3.*; + +import java.io.IOException; + +public class umlistTest { + public static void main(String[] args) throws IOException { + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("text/plain"); + RequestBody body = RequestBody.create(mediaType, ""); + Request request = new Request.Builder() + .url("http://who.int/westernpacific/publications/m/item/bi-weekly-covid-19-situation-update--11-april-2025") + .get() +// .addHeader("Cookie", "_cfuvid=Y2mczEYT8OCAEN719Uv9vPTpARSDmHju6OjSUfxYbb4-1745207891947-0.0.1.1-604800000") + .build(); + Response response = client.newCall(request).execute(); + String html = response.body().string(); + System.out.println(html); + } +} diff --git 
a/src/main/resources/logback.xml b/src/main/resources/logback.xml new file mode 100644 index 0000000..5f5a5d1 --- /dev/null +++ b/src/main/resources/logback.xml @@ -0,0 +1,12 @@ + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + \ No newline at end of file diff --git a/target/classes/com/example/AusContent.class b/target/classes/com/example/AusContent.class new file mode 100644 index 0000000..048b882 Binary files /dev/null and b/target/classes/com/example/AusContent.class differ diff --git a/target/classes/com/example/AusList.class b/target/classes/com/example/AusList.class new file mode 100644 index 0000000..9ec0bcd Binary files /dev/null and b/target/classes/com/example/AusList.class differ diff --git a/target/classes/com/example/CaptchaOCR.class b/target/classes/com/example/CaptchaOCR.class new file mode 100644 index 0000000..8cd5bc8 Binary files /dev/null and b/target/classes/com/example/CaptchaOCR.class differ diff --git a/target/classes/com/example/CsAirScraper.class b/target/classes/com/example/CsAirScraper.class new file mode 100644 index 0000000..b084982 Binary files /dev/null and b/target/classes/com/example/CsAirScraper.class differ diff --git a/target/classes/com/example/CtriScraper.class b/target/classes/com/example/CtriScraper.class new file mode 100644 index 0000000..27d35cb Binary files /dev/null and b/target/classes/com/example/CtriScraper.class differ diff --git a/target/classes/com/example/CtriScraperContent.class b/target/classes/com/example/CtriScraperContent.class new file mode 100644 index 0000000..e80414c Binary files /dev/null and b/target/classes/com/example/CtriScraperContent.class differ diff --git a/target/classes/com/example/Inka.class b/target/classes/com/example/Inka.class new file mode 100644 index 0000000..a10742c Binary files /dev/null and b/target/classes/com/example/Inka.class differ diff --git a/target/classes/com/example/NSFAwardCrawler.class b/target/classes/com/example/NSFAwardCrawler.class new 
file mode 100644 index 0000000..5f730e8 Binary files /dev/null and b/target/classes/com/example/NSFAwardCrawler.class differ diff --git a/target/classes/com/example/PatentscopeSeleniumCrawler.class b/target/classes/com/example/PatentscopeSeleniumCrawler.class new file mode 100644 index 0000000..d4d8231 Binary files /dev/null and b/target/classes/com/example/PatentscopeSeleniumCrawler.class differ diff --git a/target/classes/com/example/ProxyIPChecker.class b/target/classes/com/example/ProxyIPChecker.class new file mode 100644 index 0000000..d1ebe4a Binary files /dev/null and b/target/classes/com/example/ProxyIPChecker.class differ diff --git a/target/classes/com/example/ScraperWithCaptcha$1.class b/target/classes/com/example/ScraperWithCaptcha$1.class new file mode 100644 index 0000000..d5e54e3 Binary files /dev/null and b/target/classes/com/example/ScraperWithCaptcha$1.class differ diff --git a/target/classes/com/example/ScraperWithCaptcha$PageInfo.class b/target/classes/com/example/ScraperWithCaptcha$PageInfo.class new file mode 100644 index 0000000..066e29b Binary files /dev/null and b/target/classes/com/example/ScraperWithCaptcha$PageInfo.class differ diff --git a/target/classes/com/example/ScraperWithCaptcha.class b/target/classes/com/example/ScraperWithCaptcha.class new file mode 100644 index 0000000..6152d09 Binary files /dev/null and b/target/classes/com/example/ScraperWithCaptcha.class differ diff --git a/target/classes/com/example/StringFieldExtractor.class b/target/classes/com/example/StringFieldExtractor.class new file mode 100644 index 0000000..44d216e Binary files /dev/null and b/target/classes/com/example/StringFieldExtractor.class differ diff --git a/target/classes/com/example/WipoPatentsSelenium.class b/target/classes/com/example/WipoPatentsSelenium.class new file mode 100644 index 0000000..e8b6ee2 Binary files /dev/null and b/target/classes/com/example/WipoPatentsSelenium.class differ diff --git a/target/classes/com/example/cliniTopic.class 
b/target/classes/com/example/cliniTopic.class new file mode 100644 index 0000000..2160bde Binary files /dev/null and b/target/classes/com/example/cliniTopic.class differ diff --git a/target/classes/com/example/drks.class b/target/classes/com/example/drks.class new file mode 100644 index 0000000..8c2a53c Binary files /dev/null and b/target/classes/com/example/drks.class differ diff --git a/target/classes/com/example/getInKa.class b/target/classes/com/example/getInKa.class new file mode 100644 index 0000000..f7fa167 Binary files /dev/null and b/target/classes/com/example/getInKa.class differ diff --git a/target/classes/com/example/jsonGetOk.class b/target/classes/com/example/jsonGetOk.class new file mode 100644 index 0000000..6e97745 Binary files /dev/null and b/target/classes/com/example/jsonGetOk.class differ diff --git a/target/classes/com/example/ook.class b/target/classes/com/example/ook.class new file mode 100644 index 0000000..959dbb8 Binary files /dev/null and b/target/classes/com/example/ook.class differ diff --git a/target/classes/com/example/oook.class b/target/classes/com/example/oook.class new file mode 100644 index 0000000..9dc6ff1 Binary files /dev/null and b/target/classes/com/example/oook.class differ diff --git a/target/classes/com/example/projTopic.class b/target/classes/com/example/projTopic.class new file mode 100644 index 0000000..2f2adee Binary files /dev/null and b/target/classes/com/example/projTopic.class differ diff --git a/target/classes/com/example/saveInES.class b/target/classes/com/example/saveInES.class new file mode 100644 index 0000000..07702c6 Binary files /dev/null and b/target/classes/com/example/saveInES.class differ diff --git a/target/classes/com/example/test.class b/target/classes/com/example/test.class new file mode 100644 index 0000000..927763f Binary files /dev/null and b/target/classes/com/example/test.class differ diff --git a/target/classes/com/example/testContent.class b/target/classes/com/example/testContent.class new 
file mode 100644 index 0000000..b75d67b Binary files /dev/null and b/target/classes/com/example/testContent.class differ diff --git a/target/classes/com/example/testList.class b/target/classes/com/example/testList.class new file mode 100644 index 0000000..066638e Binary files /dev/null and b/target/classes/com/example/testList.class differ diff --git a/target/classes/com/example/umlistTest.class b/target/classes/com/example/umlistTest.class new file mode 100644 index 0000000..8c5f5a5 Binary files /dev/null and b/target/classes/com/example/umlistTest.class differ diff --git a/target/classes/logback.xml b/target/classes/logback.xml new file mode 100644 index 0000000..5f5a5d1 --- /dev/null +++ b/target/classes/logback.xml @@ -0,0 +1,12 @@ + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + + \ No newline at end of file diff --git a/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar b/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar new file mode 100644 index 0000000..9a161d3 Binary files /dev/null and b/target/es-crawler-1.0-SNAPSHOT-jar-with-dependencies.jar differ diff --git a/target/es-crawler-1.0-SNAPSHOT.jar b/target/es-crawler-1.0-SNAPSHOT.jar new file mode 100644 index 0000000..a26fac1 Binary files /dev/null and b/target/es-crawler-1.0-SNAPSHOT.jar differ diff --git a/target/maven-archiver/pom.properties b/target/maven-archiver/pom.properties new file mode 100644 index 0000000..17b9cce --- /dev/null +++ b/target/maven-archiver/pom.properties @@ -0,0 +1,5 @@ +#Generated by Maven +#Tue May 13 14:32:58 CST 2025 +version=1.0-SNAPSHOT +groupId=com.example +artifactId=es-crawler diff --git a/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst b/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst new file mode 100644 index 0000000..39fff43 --- /dev/null +++ b/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst @@ -0,0 +1 @@ 
+com\example\CtriScraper.class diff --git a/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst b/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst new file mode 100644 index 0000000..b8c08cb --- /dev/null +++ b/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst @@ -0,0 +1 @@ +F:\workTest\DaKaES\src\main\java\com\example\CtriScraper.java diff --git a/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst b/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst new file mode 100644 index 0000000..e69de29 diff --git a/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst b/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst new file mode 100644 index 0000000..e69de29