package com.example; import com.fasterxml.jackson.databind.ObjectMapper; import okhttp3.*; import org.apache.hc.client5.http.cookie.BasicCookieStore; import org.apache.hc.client5.http.cookie.CookieStore; import org.apache.hc.client5.http.classic.methods.HttpGet; import org.apache.hc.client5.http.classic.methods.HttpPost; import org.apache.hc.client5.http.entity.UrlEncodedFormEntity; import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; import org.apache.hc.client5.http.impl.classic.HttpClients; import org.apache.hc.client5.http.protocol.HttpClientContext; import org.apache.hc.core5.http.HttpEntity; import org.apache.hc.core5.http.NameValuePair; import org.apache.hc.core5.http.io.entity.EntityUtils; import org.apache.hc.core5.http.message.BasicNameValuePair; import org.apache.kafka.clients.producer.KafkaProducer; import org.apache.kafka.clients.producer.ProducerConfig; import org.apache.kafka.clients.producer.ProducerRecord; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.text.ParseException; import java.text.SimpleDateFormat; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.Month; import java.time.Year; import java.time.format.DateTimeFormatter; import java.util.*; import java.nio.charset.StandardCharsets; import java.util.regex.Matcher; import java.util.regex.Pattern; public class CtriScraper { private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php"; private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php"; private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'"); private static final String TOPIC_NAME = "cliniTopic"; private static final String BOOTSTRAP_SERVERS = "node-01:19092"; private static KafkaProducer producer; private static ObjectMapper objectMapper = new ObjectMapper(); private static final Random random = new Random(); static { Properties props = new Properties(); props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS); props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认 props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数 producer = new KafkaProducer<>(props); } public static List getlink(Integer year, Integer month) { List linkList = new ArrayList<>(); // 用于存放提取到的链接 // 用于存储和管理 Cookies CookieStore cookieStore = new BasicCookieStore(); // 用于在请求之间维护状态,特别是关联 CookieStore HttpClientContext context = HttpClientContext.create(); context.setCookieStore(cookieStore); // 使用 try-with-resources 确保 HttpClient 被正确关闭 try (CloseableHttpClient httpClient = HttpClients.custom() .setDefaultCookieStore(cookieStore) // 将cookie store绑定到client .build()) { // --- Step 1 & 2: 发送 GET 请求获取表单页面并解析 --- // System.out.println("Fetching search form page..."); // 调试信息可以按需保留或删除 HttpGet getRequest = new HttpGet(SEARCH_FORM_URL); // 添加一些伪装的 Headers 模拟浏览器访问 getRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); getRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); getRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"); String formPageHtml = null; try (CloseableHttpResponse response = httpClient.execute(getRequest, context)) { int statusCode = response.getCode(); // System.out.println("GET Response Status: " + statusCode); // 调试信息 if (statusCode != 200) { System.err.println("Error: GET request to form page failed with status code: " + statusCode); EntityUtils.consume(response.getEntity()); // 确保消费掉实体,释放连接 return null; // 获取表单页面失败,返回 null } HttpEntity entity = response.getEntity(); if (entity != null) { formPageHtml = EntityUtils.toString(entity, StandardCharsets.UTF_8); EntityUtils.consume(entity); // 确保实体内容被完全消费 } else { System.err.println("Error: Failed to get form page entity."); return null; // 获取页面内容失败,返回 null } } // System.out.println("Form page fetched successfully."); // 调试信息 // --- Step 3 & 4: 解析 HTML 提取 csrf_token 和 __ncforminfo --- Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL); // 传入 base URI 有助于处理相对路径 // 查找隐藏的输入字段 Element csrfTokenInput = doc.selectFirst("input[name=csrf_token][type=hidden]"); Element ncFormInfoInput = doc.selectFirst("input[name=__ncforminfo][type=hidden]"); String csrfToken = null; String ncFormInfo = null; if (csrfTokenInput != null) { csrfToken = csrfTokenInput.val(); // System.out.println("Extracted csrf_token: " + csrfToken); // 调试信息 } else { System.err.println("Warning: Could not find csrf_token input field."); return null; // 缺少关键 token,返回 null } if (ncFormInfoInput != null) { ncFormInfo = ncFormInfoInput.val(); // System.out.println("Extracted __ncforminfo: " + ncFormInfo); // 调试信息 } else { System.err.println("Warning: Could not find __ncforminfo input field."); return null; // 缺少关键 token,返回 null } // 如果必要的 token 没有获取到,可能无法继续 (虽然上面的检查已经覆盖,这里作为双重保险) if (csrfToken == null || ncFormInfo == null) { System.err.println("Error: Missing required tokens. Cannot proceed with POST request."); return null; } // --- Step 5 & 6: 构建 POST 请求参数并发送 --- // System.out.println("\nPreparing POST request..."); // 调试信息 HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL); // 添加 Headers 模拟浏览器提交表单 postRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); // 重要:设置 Referer Header postRequest.setHeader("Referer", SEARCH_FORM_URL); // 添加 Origin Header postRequest.setHeader("Origin", "https://ctri.nic.in"); postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded"); postRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); postRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"); postRequest.setHeader("Pragma", "no-cache"); List params = new ArrayList<>(); // 添加你之前分析的载荷中的所有参数,使用获取到的动态值 params.add(new BasicNameValuePair("stid", "1")); // 注意 stid 之前有两个,这里用 1 params.add(new BasicNameValuePair("csrf_token", csrfToken)); // 使用获取到的动态 token params.add(new BasicNameValuePair("pros", "1")); params.add(new BasicNameValuePair("month", String.format("%02d", month))); // 格式化月份为两位数 params.add(new BasicNameValuePair("year", String.valueOf(year))); params.add(new BasicNameValuePair("study", "0")); params.add(new BasicNameValuePair("sdid", "0")); params.add(new BasicNameValuePair("phaseid", "0")); params.add(new BasicNameValuePair("psponsor", "0")); params.add(new BasicNameValuePair("recid", "0")); params.add(new BasicNameValuePair("state", "0")); params.add(new BasicNameValuePair("district", "0")); params.add(new BasicNameValuePair("searchword", "")); params.add(new BasicNameValuePair("T4", "anyvalue")); // T4既然无效,随便填 params.add(new BasicNameValuePair("btt", "Search")); params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo)); // 使用获取到的动态值 // 将参数列表设置到请求体中 postRequest.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8)); // System.out.println("Executing POST request to submit form..."); // 调试信息 try (CloseableHttpResponse postResponse = httpClient.execute(postRequest, context)) { int postStatusCode = postResponse.getCode(); // System.out.println("POST Response Status: " + postStatusCode); // 打印状态码 if (postStatusCode != 200) { System.err.println("Error: POST request to submit form failed with status code: " + postStatusCode); EntityUtils.consume(postResponse.getEntity()); // 确保消费掉实体,释放连接 return null; // 提交表单失败,返回 null } HttpEntity postEntity = postResponse.getEntity(); if (postEntity != null) { String searchResultsHtml = EntityUtils.toString(postEntity, StandardCharsets.UTF_8); EntityUtils.consume(postEntity); // 确保实体内容被完全消费 // --- Step 7: 处理搜索结果页面 --- // System.out.println("\nParsing search results..."); // 调试信息 Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL); Elements links = resultsDoc.select("tr a"); for (Element linkElement : links) { String rawLink = linkElement.attr("href"); // System.out.println("Processing raw link: " + rawLink); // 调试信息 // 使用预编译的正则表达式 Pattern Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink); // 查找匹配项 if (matcher.find()) { String extractedContent = matcher.group(1); // 提取单引号内的内容 // 构建完整的链接 URL String fullLink = "https://ctri.nic.in/Clinicaltrials/" + extractedContent; linkList.add(fullLink); // 将完整链接添加到列表中 // System.out.println("Added link: " + fullLink); // 调试信息 } else { // 如果链接不符合模式,打印警告并跳过 System.err.println("Warning: Link does not match expected pattern: " + rawLink); } } // --- 返回提取到的链接列表 --- // 循环结束后,返回收集到的所有链接 // System.out.println("Finished link extraction. Returning list."); // 调试信息 return linkList; } else { System.err.println("Error: Failed to get search results entity."); return null; // 获取结果内容失败,返回 null } } } catch (IOException e) { // 处理网络请求相关的异常 System.err.println("Network or IO error during scraping:"); e.printStackTrace(); return null; // 发生 IO 错误,返回 null } catch (Exception e) { // 处理其他可能的异常,例如解析错误或 NPE System.err.println("An unexpected error occurred during scraping:"); e.printStackTrace(); return null; // 发生其他错误,返回 null } } public static void main(String[] args) { for (Integer year = Year.now().getValue(); year >= 2024; year--) { int monthStart = (year == Year.now().getValue()) ? LocalDate.now().getMonthValue() : 12; for (Integer month = monthStart; month >= 1; month--) { try { List links = getlink(year, month); if (links == null) { System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!"); continue; } if (links.isEmpty()) { System.out.println("年份 " + year + " 月份 " + month + " 无数据!"); continue; } int sleepTime = random.nextInt(1001) + 3000; int count = 0; for (String url : links) { try { Map result = reslutData(url); result.put("crawlUrl", url); String registNum = String.valueOf(result.get("registNum")); String jsonValue = objectMapper.writeValueAsString(result); ProducerRecord record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue); producer.send(record, (metadata, exception) -> { if (exception == null) { System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url); } else { System.err.println("❌ Kafka 发送失败: " + exception.getMessage()); } }); Thread.sleep(sleepTime); // 控制节奏 count++; } catch (Exception e) { System.err.println("抓取或发送失败: " + url); e.printStackTrace(); } } System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。"); } catch (Exception e) { System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage()); e.printStackTrace(); } } } // 关闭 producer producer.close(); } public static Map reslutData(String url) throws IOException { Map resultData = new HashMap<>(); OkHttpClient client = new OkHttpClient().newBuilder() .build(); MediaType mediaType = MediaType.parse("text/plain"); RequestBody body = RequestBody.create(mediaType, ""); Request request = new Request.Builder() .url(url) .get() .build(); Response response = client.newCall(request).execute(); String html = response.body().string(); Document parse = Jsoup.parse(html); String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text(); String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text(); String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text()); Map sponsor = new HashMap<>(); String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text(); String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text(); sponsor.put("Source of Monetary or Material Support",SMMS); sponsor.put("Primary Sponsor",primarySponsor); String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text(); String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text(); Map disease = new HashMap<>(); String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text(); String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text(); disease.put("healthType",healthType); disease.put("condition",condition); String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text(); String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text(); String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text(); String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text(); String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text(); String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text(); Map primaryOutcome = new HashMap<>(); String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text(); String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text(); primaryOutcome.put("firstOutcome",firstOutcome); primaryOutcome.put("secondOutcome",secondOutcome); resultData.put("disease",disease); resultData.put("primaryOutcome",primaryOutcome); resultData.put("intervention",intervention); resultData.put("country",country); resultData.put("enrollment",enrollment); resultData.put("exclusionCriteria",exclusionCriteria); resultData.put("inclusionCriteria",inclusionCriteria); resultData.put("studyDesign",studyDesign); resultData.put("sponsor",sponsor); resultData.put("title",title); resultData.put("registNum",registNum); resultData.put("registTime",registTime); resultData.put("studyType",studyType); resultData.put("phase",phase); resultData.put("registStatus",""); resultData.put("registTitle",""); resultData.put("fullTitle",""); resultData.put("sponsorPart",""); resultData.put("studyObjective",""); resultData.put("studyStartDate",""); resultData.put("currentStatus",""); resultData.put("tagTime",""); resultData.put("crawlTime",getCurrentTime()); resultData.put("crawlUrl",url); resultData.put("postTime",registTime); resultData.put("content","content"); resultData.put("forwardcontent","forwardcontent"); resultData.put("cid","Nctrinicin"); return resultData; } public static String getCurrentTime() { // 创建 DateTimeFormatter,指定输出格式 DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); // 获取当前时间 LocalDateTime now = LocalDateTime.now(); // 格式化 return now.format(formatter); } public static String extractAndConvertDate(String input) { // 定义正则表达式提取 dd/MM/yyyy 格式的日期 Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]"); Matcher matcher = pattern.matcher(input); if (matcher.find()) { String dateStr = matcher.group(1); // 提取的日期字符串 try { // 解析成 Date 对象 SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy"); Date date = inputFormat.parse(dateStr); // 格式化为 yyyy:MM:dd 00:00:00 SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'"); return outputFormat.format(date); } catch (ParseException e) { e.printStackTrace(); } } return null; // 如果未匹配或转换失败 } }