You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

403 lines
22 KiB

1 month ago
  1. package com.example;
  2. import com.fasterxml.jackson.databind.ObjectMapper;
  3. import okhttp3.*;
  4. import org.apache.hc.client5.http.cookie.BasicCookieStore;
  5. import org.apache.hc.client5.http.cookie.CookieStore;
  6. import org.apache.hc.client5.http.classic.methods.HttpGet;
  7. import org.apache.hc.client5.http.classic.methods.HttpPost;
  8. import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
  9. import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
  10. import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
  11. import org.apache.hc.client5.http.impl.classic.HttpClients;
  12. import org.apache.hc.client5.http.protocol.HttpClientContext;
  13. import org.apache.hc.core5.http.HttpEntity;
  14. import org.apache.hc.core5.http.NameValuePair;
  15. import org.apache.hc.core5.http.io.entity.EntityUtils;
  16. import org.apache.hc.core5.http.message.BasicNameValuePair;
  17. import org.apache.kafka.clients.producer.KafkaProducer;
  18. import org.apache.kafka.clients.producer.ProducerConfig;
  19. import org.apache.kafka.clients.producer.ProducerRecord;
  20. import org.jsoup.Jsoup;
  21. import org.jsoup.nodes.Document;
  22. import org.jsoup.nodes.Element;
  23. import org.jsoup.select.Elements;
  24. import java.io.IOException;
  25. import java.nio.file.Files;
  26. import java.nio.file.Paths;
  27. import java.text.ParseException;
  28. import java.text.SimpleDateFormat;
  29. import java.time.LocalDate;
  30. import java.time.LocalDateTime;
  31. import java.time.Month;
  32. import java.time.Year;
  33. import java.time.format.DateTimeFormatter;
  34. import java.util.*;
  35. import java.nio.charset.StandardCharsets;
  36. import java.util.regex.Matcher;
  37. import java.util.regex.Pattern;
  38. public class CtriScraper {
  /** Page that serves the advanced-search form; fetched first to obtain the hidden form tokens. */
  private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php";

  /** Endpoint the advanced-search form is posted to. */
  private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php";

  /** Captures the first single-quoted substring of a result link's href. */
  private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'");

  /** Kafka topic that scraped trial records are published to. */
  private static final String TOPIC_NAME = "cliniTopic";

  /** Kafka bootstrap server list. */
  private static final String BOOTSTRAP_SERVERS = "node-01:19092";

  /** Shared producer; assigned exactly once in the static initializer below. */
  private static final KafkaProducer<String, String> producer;

  /** Shared JSON serializer for outgoing records. */
  private static final ObjectMapper objectMapper = new ObjectMapper();

  /** Source of the random delay used between detail-page requests. */
  private static final Random random = new Random();

  static {
    String stringSerializer = "org.apache.kafka.common.serialization.StringSerializer";

    Properties props = new Properties();
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, stringSerializer);
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, stringSerializer);
    props.put(ProducerConfig.ACKS_CONFIG, "all"); // wait for all replicas to acknowledge
    props.put(ProducerConfig.RETRIES_CONFIG, 3); // number of retries
    producer = new KafkaProducer<>(props);
  }
  56. public static List<String> getlink(Integer year, Integer month) {
  57. List<String> linkList = new ArrayList<>(); // 用于存放提取到的链接
  58. // 用于存储和管理 Cookies
  59. CookieStore cookieStore = new BasicCookieStore();
  60. // 用于在请求之间维护状态,特别是关联 CookieStore
  61. HttpClientContext context = HttpClientContext.create();
  62. context.setCookieStore(cookieStore);
  63. // 使用 try-with-resources 确保 HttpClient 被正确关闭
  64. try (CloseableHttpClient httpClient = HttpClients.custom()
  65. .setDefaultCookieStore(cookieStore) // 将cookie store绑定到client
  66. .build()) {
  67. // --- Step 1 & 2: 发送 GET 请求获取表单页面并解析 ---
  68. // System.out.println("Fetching search form page..."); // 调试信息可以按需保留或删除
  69. HttpGet getRequest = new HttpGet(SEARCH_FORM_URL);
  70. // 添加一些伪装的 Headers 模拟浏览器访问
  71. getRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
  72. getRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
  73. getRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7");
  74. String formPageHtml = null;
  75. try (CloseableHttpResponse response = httpClient.execute(getRequest, context)) {
  76. int statusCode = response.getCode();
  77. // System.out.println("GET Response Status: " + statusCode); // 调试信息
  78. if (statusCode != 200) {
  79. System.err.println("Error: GET request to form page failed with status code: " + statusCode);
  80. EntityUtils.consume(response.getEntity()); // 确保消费掉实体,释放连接
  81. return null; // 获取表单页面失败,返回 null
  82. }
  83. HttpEntity entity = response.getEntity();
  84. if (entity != null) {
  85. formPageHtml = EntityUtils.toString(entity, StandardCharsets.UTF_8);
  86. EntityUtils.consume(entity); // 确保实体内容被完全消费
  87. } else {
  88. System.err.println("Error: Failed to get form page entity.");
  89. return null; // 获取页面内容失败,返回 null
  90. }
  91. }
  92. // System.out.println("Form page fetched successfully."); // 调试信息
  93. // --- Step 3 & 4: 解析 HTML 提取 csrf_token 和 __ncforminfo ---
  94. Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL); // 传入 base URI 有助于处理相对路径
  95. // 查找隐藏的输入字段
  96. Element csrfTokenInput = doc.selectFirst("input[name=csrf_token][type=hidden]");
  97. Element ncFormInfoInput = doc.selectFirst("input[name=__ncforminfo][type=hidden]");
  98. String csrfToken = null;
  99. String ncFormInfo = null;
  100. if (csrfTokenInput != null) {
  101. csrfToken = csrfTokenInput.val();
  102. // System.out.println("Extracted csrf_token: " + csrfToken); // 调试信息
  103. } else {
  104. System.err.println("Warning: Could not find csrf_token input field.");
  105. return null; // 缺少关键 token,返回 null
  106. }
  107. if (ncFormInfoInput != null) {
  108. ncFormInfo = ncFormInfoInput.val();
  109. // System.out.println("Extracted __ncforminfo: " + ncFormInfo); // 调试信息
  110. } else {
  111. System.err.println("Warning: Could not find __ncforminfo input field.");
  112. return null; // 缺少关键 token,返回 null
  113. }
  114. // 如果必要的 token 没有获取到,可能无法继续 (虽然上面的检查已经覆盖,这里作为双重保险)
  115. if (csrfToken == null || ncFormInfo == null) {
  116. System.err.println("Error: Missing required tokens. Cannot proceed with POST request.");
  117. return null;
  118. }
  119. // --- Step 5 & 6: 构建 POST 请求参数并发送 ---
  120. // System.out.println("\nPreparing POST request..."); // 调试信息
  121. HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL);
  122. // 添加 Headers 模拟浏览器提交表单
  123. postRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
  124. // 重要:设置 Referer Header
  125. postRequest.setHeader("Referer", SEARCH_FORM_URL);
  126. // 添加 Origin Header
  127. postRequest.setHeader("Origin", "https://ctri.nic.in");
  128. postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded");
  129. postRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
  130. postRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7");
  131. postRequest.setHeader("Pragma", "no-cache");
  132. List<NameValuePair> params = new ArrayList<>();
  133. // 添加你之前分析的载荷中的所有参数,使用获取到的动态值
  134. params.add(new BasicNameValuePair("stid", "1")); // 注意 stid 之前有两个,这里用 1
  135. params.add(new BasicNameValuePair("csrf_token", csrfToken)); // 使用获取到的动态 token
  136. params.add(new BasicNameValuePair("pros", "1"));
  137. params.add(new BasicNameValuePair("month", String.format("%02d", month))); // 格式化月份为两位数
  138. params.add(new BasicNameValuePair("year", String.valueOf(year)));
  139. params.add(new BasicNameValuePair("study", "0"));
  140. params.add(new BasicNameValuePair("sdid", "0"));
  141. params.add(new BasicNameValuePair("phaseid", "0"));
  142. params.add(new BasicNameValuePair("psponsor", "0"));
  143. params.add(new BasicNameValuePair("recid", "0"));
  144. params.add(new BasicNameValuePair("state", "0"));
  145. params.add(new BasicNameValuePair("district", "0"));
  146. params.add(new BasicNameValuePair("searchword", ""));
  147. params.add(new BasicNameValuePair("T4", "anyvalue")); // T4既然无效,随便填
  148. params.add(new BasicNameValuePair("btt", "Search"));
  149. params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo)); // 使用获取到的动态值
  150. // 将参数列表设置到请求体中
  151. postRequest.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));
  152. // System.out.println("Executing POST request to submit form..."); // 调试信息
  153. try (CloseableHttpResponse postResponse = httpClient.execute(postRequest, context)) {
  154. int postStatusCode = postResponse.getCode();
  155. // System.out.println("POST Response Status: " + postStatusCode); // 打印状态码
  156. if (postStatusCode != 200) {
  157. System.err.println("Error: POST request to submit form failed with status code: " + postStatusCode);
  158. EntityUtils.consume(postResponse.getEntity()); // 确保消费掉实体,释放连接
  159. return null; // 提交表单失败,返回 null
  160. }
  161. HttpEntity postEntity = postResponse.getEntity();
  162. if (postEntity != null) {
  163. String searchResultsHtml = EntityUtils.toString(postEntity, StandardCharsets.UTF_8);
  164. EntityUtils.consume(postEntity); // 确保实体内容被完全消费
  165. // --- Step 7: 处理搜索结果页面 ---
  166. // System.out.println("\nParsing search results..."); // 调试信息
  167. Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL);
  168. Elements links = resultsDoc.select("tr a");
  169. for (Element linkElement : links) {
  170. String rawLink = linkElement.attr("href");
  171. // System.out.println("Processing raw link: " + rawLink); // 调试信息
  172. // 使用预编译的正则表达式 Pattern
  173. Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink);
  174. // 查找匹配项
  175. if (matcher.find()) {
  176. String extractedContent = matcher.group(1); // 提取单引号内的内容
  177. // 构建完整的链接 URL
  178. String fullLink = "https://ctri.nic.in/Clinicaltrials/" + extractedContent;
  179. linkList.add(fullLink); // 将完整链接添加到列表中
  180. // System.out.println("Added link: " + fullLink); // 调试信息
  181. } else {
  182. // 如果链接不符合模式,打印警告并跳过
  183. System.err.println("Warning: Link does not match expected pattern: " + rawLink);
  184. }
  185. }
  186. // --- 返回提取到的链接列表 ---
  187. // 循环结束后,返回收集到的所有链接
  188. // System.out.println("Finished link extraction. Returning list."); // 调试信息
  189. return linkList;
  190. } else {
  191. System.err.println("Error: Failed to get search results entity.");
  192. return null; // 获取结果内容失败,返回 null
  193. }
  194. }
  195. } catch (IOException e) {
  196. // 处理网络请求相关的异常
  197. System.err.println("Network or IO error during scraping:");
  198. e.printStackTrace();
  199. return null; // 发生 IO 错误,返回 null
  200. } catch (Exception e) {
  201. // 处理其他可能的异常,例如解析错误或 NPE
  202. System.err.println("An unexpected error occurred during scraping:");
  203. e.printStackTrace();
  204. return null; // 发生其他错误,返回 null
  205. }
  206. }
  207. public static void main(String[] args) {
  208. for (Integer year = Year.now().getValue(); year >= 2024; year--) {
  209. int monthStart = (year == Year.now().getValue()) ? LocalDate.now().getMonthValue() : 12;
  210. for (Integer month = monthStart; month >= 1; month--) {
  211. try {
  212. List<String> links = getlink(year, month);
  213. if (links == null) {
  214. System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!");
  215. continue;
  216. }
  217. if (links.isEmpty()) {
  218. System.out.println("年份 " + year + " 月份 " + month + " 无数据!");
  219. continue;
  220. }
  221. int sleepTime = random.nextInt(1001) + 3000;
  222. int count = 0;
  223. for (String url : links) {
  224. try {
  225. Map<String, Object> result = reslutData(url);
  226. result.put("crawlUrl", url);
  227. String registNum = String.valueOf(result.get("registNum"));
  228. String jsonValue = objectMapper.writeValueAsString(result);
  229. ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
  230. producer.send(record, (metadata, exception) -> {
  231. if (exception == null) {
  232. System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url);
  233. } else {
  234. System.err.println("❌ Kafka 发送失败: " + exception.getMessage());
  235. }
  236. });
  237. Thread.sleep(sleepTime); // 控制节奏
  238. count++;
  239. } catch (Exception e) {
  240. System.err.println("抓取或发送失败: " + url);
  241. e.printStackTrace();
  242. }
  243. }
  244. System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。");
  245. } catch (Exception e) {
  246. System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage());
  247. e.printStackTrace();
  248. }
  249. }
  250. }
  251. // 关闭 producer
  252. producer.close();
  253. }
  254. public static Map<String,Object> reslutData(String url) throws IOException {
  255. Map<String,Object> resultData = new HashMap<>();
  256. OkHttpClient client = new OkHttpClient().newBuilder()
  257. .build();
  258. MediaType mediaType = MediaType.parse("text/plain");
  259. RequestBody body = RequestBody.create(mediaType, "");
  260. Request request = new Request.Builder()
  261. .url(url)
  262. .get()
  263. .build();
  264. Response response = client.newCall(request).execute();
  265. String html = response.body().string();
  266. Document parse = Jsoup.parse(html);
  267. String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text();
  268. String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text();
  269. String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text());
  270. Map<String,Object> sponsor = new HashMap<>();
  271. String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text();
  272. String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text();
  273. sponsor.put("Source of Monetary or Material Support",SMMS);
  274. sponsor.put("Primary Sponsor",primarySponsor);
  275. String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text();
  276. String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text();
  277. Map<String,Object> disease = new HashMap<>();
  278. String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text();
  279. String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text();
  280. disease.put("healthType",healthType);
  281. disease.put("condition",condition);
  282. String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text();
  283. String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text();
  284. String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text();
  285. String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text();
  286. String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text();
  287. String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text();
  288. Map<String,Object> primaryOutcome = new HashMap<>();
  289. String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text();
  290. String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text();
  291. primaryOutcome.put("firstOutcome",firstOutcome);
  292. primaryOutcome.put("secondOutcome",secondOutcome);
  293. resultData.put("disease",disease);
  294. resultData.put("primaryOutcome",primaryOutcome);
  295. resultData.put("intervention",intervention);
  296. resultData.put("country",country);
  297. resultData.put("enrollment",enrollment);
  298. resultData.put("exclusionCriteria",exclusionCriteria);
  299. resultData.put("inclusionCriteria",inclusionCriteria);
  300. resultData.put("studyDesign",studyDesign);
  301. resultData.put("sponsor",sponsor);
  302. resultData.put("title",title);
  303. resultData.put("registNum",registNum);
  304. resultData.put("registTime",registTime);
  305. resultData.put("studyType",studyType);
  306. resultData.put("phase",phase);
  307. resultData.put("registStatus","");
  308. resultData.put("registTitle","");
  309. resultData.put("fullTitle","");
  310. resultData.put("sponsorPart","");
  311. resultData.put("studyObjective","");
  312. resultData.put("studyStartDate","");
  313. resultData.put("currentStatus","");
  314. resultData.put("tagTime","");
  315. resultData.put("crawlTime",getCurrentTime());
  316. resultData.put("crawlUrl",url);
  317. resultData.put("postTime",registTime);
  318. resultData.put("content","content");
  319. resultData.put("forwardcontent","forwardcontent");
  320. resultData.put("cid","Nctrinicin");
  321. return resultData;
  322. }
  323. public static String getCurrentTime() {
  324. // 创建 DateTimeFormatter,指定输出格式
  325. DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
  326. // 获取当前时间
  327. LocalDateTime now = LocalDateTime.now();
  328. // 格式化
  329. return now.format(formatter);
  330. }
  331. public static String extractAndConvertDate(String input) {
  332. // 定义正则表达式提取 dd/MM/yyyy 格式的日期
  333. Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
  334. Matcher matcher = pattern.matcher(input);
  335. if (matcher.find()) {
  336. String dateStr = matcher.group(1); // 提取的日期字符串
  337. try {
  338. // 解析成 Date 对象
  339. SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy");
  340. Date date = inputFormat.parse(dateStr);
  341. // 格式化为 yyyy:MM:dd 00:00:00
  342. SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'");
  343. return outputFormat.format(date);
  344. } catch (ParseException e) {
  345. e.printStackTrace();
  346. }
  347. }
  348. return null; // 如果未匹配或转换失败
  349. }
  350. }