|
|
package com.example;
import com.fasterxml.jackson.databind.ObjectMapper; import okhttp3.*; import org.apache.hc.client5.http.cookie.BasicCookieStore; import org.apache.hc.client5.http.cookie.CookieStore; import org.apache.hc.client5.http.classic.methods.HttpGet; import org.apache.hc.client5.http.classic.methods.HttpPost; import org.apache.hc.client5.http.entity.UrlEncodedFormEntity; import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; import org.apache.hc.client5.http.impl.classic.HttpClients; import org.apache.hc.client5.http.protocol.HttpClientContext; import org.apache.hc.core5.http.HttpEntity; import org.apache.hc.core5.http.NameValuePair; import org.apache.hc.core5.http.io.entity.EntityUtils; import org.apache.hc.core5.http.message.BasicNameValuePair; import org.apache.kafka.clients.producer.KafkaProducer; import org.apache.kafka.clients.producer.ProducerConfig; import org.apache.kafka.clients.producer.ProducerRecord; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements;
import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.text.ParseException; import java.text.SimpleDateFormat; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.Month; import java.time.Year; import java.time.format.DateTimeFormatter; import java.util.*; import java.nio.charset.StandardCharsets; import java.util.regex.Matcher; import java.util.regex.Pattern;
public class CtriScraper {
private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php";
private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php";
private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'");
private static final String TOPIC_NAME = "cliniTopic"; private static final String BOOTSTRAP_SERVERS = "node-01:19092"; private static KafkaProducer<String, String> producer; private static ObjectMapper objectMapper = new ObjectMapper(); private static final Random random = new Random();
static { Properties props = new Properties(); props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS); props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认
props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数
producer = new KafkaProducer<>(props);
} public static List<String> getlink(Integer year, Integer month) { List<String> linkList = new ArrayList<>(); // 用于存放提取到的链接
// 用于存储和管理 Cookies
CookieStore cookieStore = new BasicCookieStore(); // 用于在请求之间维护状态,特别是关联 CookieStore
HttpClientContext context = HttpClientContext.create(); context.setCookieStore(cookieStore);
// 使用 try-with-resources 确保 HttpClient 被正确关闭
try (CloseableHttpClient httpClient = HttpClients.custom() .setDefaultCookieStore(cookieStore) // 将cookie store绑定到client
.build()) {
// --- Step 1 & 2: 发送 GET 请求获取表单页面并解析 ---
// System.out.println("Fetching search form page..."); // 调试信息可以按需保留或删除
HttpGet getRequest = new HttpGet(SEARCH_FORM_URL); // 添加一些伪装的 Headers 模拟浏览器访问
getRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); getRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); getRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7");
String formPageHtml = null; try (CloseableHttpResponse response = httpClient.execute(getRequest, context)) { int statusCode = response.getCode(); // System.out.println("GET Response Status: " + statusCode); // 调试信息
if (statusCode != 200) { System.err.println("Error: GET request to form page failed with status code: " + statusCode); EntityUtils.consume(response.getEntity()); // 确保消费掉实体,释放连接
return null; // 获取表单页面失败,返回 null
}
HttpEntity entity = response.getEntity(); if (entity != null) { formPageHtml = EntityUtils.toString(entity, StandardCharsets.UTF_8); EntityUtils.consume(entity); // 确保实体内容被完全消费
} else { System.err.println("Error: Failed to get form page entity."); return null; // 获取页面内容失败,返回 null
} } // System.out.println("Form page fetched successfully."); // 调试信息
// --- Step 3 & 4: 解析 HTML 提取 csrf_token 和 __ncforminfo ---
Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL); // 传入 base URI 有助于处理相对路径
// 查找隐藏的输入字段
Element csrfTokenInput = doc.selectFirst("input[name=csrf_token][type=hidden]"); Element ncFormInfoInput = doc.selectFirst("input[name=__ncforminfo][type=hidden]");
String csrfToken = null; String ncFormInfo = null;
if (csrfTokenInput != null) { csrfToken = csrfTokenInput.val(); // System.out.println("Extracted csrf_token: " + csrfToken); // 调试信息
} else { System.err.println("Warning: Could not find csrf_token input field."); return null; // 缺少关键 token,返回 null
}
if (ncFormInfoInput != null) { ncFormInfo = ncFormInfoInput.val(); // System.out.println("Extracted __ncforminfo: " + ncFormInfo); // 调试信息
} else { System.err.println("Warning: Could not find __ncforminfo input field."); return null; // 缺少关键 token,返回 null
}
// 如果必要的 token 没有获取到,可能无法继续 (虽然上面的检查已经覆盖,这里作为双重保险)
if (csrfToken == null || ncFormInfo == null) { System.err.println("Error: Missing required tokens. Cannot proceed with POST request."); return null; }
// --- Step 5 & 6: 构建 POST 请求参数并发送 ---
// System.out.println("\nPreparing POST request..."); // 调试信息
HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL); // 添加 Headers 模拟浏览器提交表单
postRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); // 重要:设置 Referer Header
postRequest.setHeader("Referer", SEARCH_FORM_URL); // 添加 Origin Header
postRequest.setHeader("Origin", "https://ctri.nic.in"); postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded"); postRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); postRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"); postRequest.setHeader("Pragma", "no-cache");
List<NameValuePair> params = new ArrayList<>(); // 添加你之前分析的载荷中的所有参数,使用获取到的动态值
params.add(new BasicNameValuePair("stid", "1")); // 注意 stid 之前有两个,这里用 1
params.add(new BasicNameValuePair("csrf_token", csrfToken)); // 使用获取到的动态 token
params.add(new BasicNameValuePair("pros", "1")); params.add(new BasicNameValuePair("month", String.format("%02d", month))); // 格式化月份为两位数
params.add(new BasicNameValuePair("year", String.valueOf(year))); params.add(new BasicNameValuePair("study", "0")); params.add(new BasicNameValuePair("sdid", "0")); params.add(new BasicNameValuePair("phaseid", "0")); params.add(new BasicNameValuePair("psponsor", "0")); params.add(new BasicNameValuePair("recid", "0")); params.add(new BasicNameValuePair("state", "0")); params.add(new BasicNameValuePair("district", "0")); params.add(new BasicNameValuePair("searchword", "")); params.add(new BasicNameValuePair("T4", "anyvalue")); // T4既然无效,随便填
params.add(new BasicNameValuePair("btt", "Search")); params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo)); // 使用获取到的动态值
// 将参数列表设置到请求体中
postRequest.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));
// System.out.println("Executing POST request to submit form..."); // 调试信息
try (CloseableHttpResponse postResponse = httpClient.execute(postRequest, context)) { int postStatusCode = postResponse.getCode(); // System.out.println("POST Response Status: " + postStatusCode); // 打印状态码
if (postStatusCode != 200) { System.err.println("Error: POST request to submit form failed with status code: " + postStatusCode); EntityUtils.consume(postResponse.getEntity()); // 确保消费掉实体,释放连接
return null; // 提交表单失败,返回 null
}
HttpEntity postEntity = postResponse.getEntity();
if (postEntity != null) { String searchResultsHtml = EntityUtils.toString(postEntity, StandardCharsets.UTF_8); EntityUtils.consume(postEntity); // 确保实体内容被完全消费
// --- Step 7: 处理搜索结果页面 ---
// System.out.println("\nParsing search results..."); // 调试信息
Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL);
Elements links = resultsDoc.select("tr a");
for (Element linkElement : links) { String rawLink = linkElement.attr("href"); // System.out.println("Processing raw link: " + rawLink); // 调试信息
// 使用预编译的正则表达式 Pattern
Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink);
// 查找匹配项
if (matcher.find()) { String extractedContent = matcher.group(1); // 提取单引号内的内容
// 构建完整的链接 URL
String fullLink = "https://ctri.nic.in/Clinicaltrials/" + extractedContent; linkList.add(fullLink); // 将完整链接添加到列表中
// System.out.println("Added link: " + fullLink); // 调试信息
} else { // 如果链接不符合模式,打印警告并跳过
System.err.println("Warning: Link does not match expected pattern: " + rawLink); } }
// --- 返回提取到的链接列表 ---
// 循环结束后,返回收集到的所有链接
// System.out.println("Finished link extraction. Returning list."); // 调试信息
return linkList;
} else { System.err.println("Error: Failed to get search results entity."); return null; // 获取结果内容失败,返回 null
} }
} catch (IOException e) { // 处理网络请求相关的异常
System.err.println("Network or IO error during scraping:"); e.printStackTrace(); return null; // 发生 IO 错误,返回 null
} catch (Exception e) { // 处理其他可能的异常,例如解析错误或 NPE
System.err.println("An unexpected error occurred during scraping:"); e.printStackTrace(); return null; // 发生其他错误,返回 null
} } public static void main(String[] args) { for (Integer year = Year.now().getValue(); year >= 2024; year--) { int monthStart = (year == Year.now().getValue()) ? LocalDate.now().getMonthValue() : 12;
for (Integer month = monthStart; month >= 1; month--) { try { List<String> links = getlink(year, month); if (links == null) { System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!"); continue; }
if (links.isEmpty()) { System.out.println("年份 " + year + " 月份 " + month + " 无数据!"); continue; }
int sleepTime = random.nextInt(1001) + 3000; int count = 0;
for (String url : links) { try { Map<String, Object> result = reslutData(url); result.put("crawlUrl", url);
String registNum = String.valueOf(result.get("registNum")); String jsonValue = objectMapper.writeValueAsString(result);
ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
producer.send(record, (metadata, exception) -> { if (exception == null) { System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url); } else { System.err.println("❌ Kafka 发送失败: " + exception.getMessage()); } });
Thread.sleep(sleepTime); // 控制节奏
count++; } catch (Exception e) { System.err.println("抓取或发送失败: " + url); e.printStackTrace(); } }
System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。");
} catch (Exception e) { System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage()); e.printStackTrace(); } } } // 关闭 producer
producer.close(); }
public static Map<String,Object> reslutData(String url) throws IOException { Map<String,Object> resultData = new HashMap<>(); OkHttpClient client = new OkHttpClient().newBuilder() .build(); MediaType mediaType = MediaType.parse("text/plain"); RequestBody body = RequestBody.create(mediaType, ""); Request request = new Request.Builder() .url(url) .get() .build(); Response response = client.newCall(request).execute(); String html = response.body().string(); Document parse = Jsoup.parse(html); String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text(); String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text(); String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text()); Map<String,Object> sponsor = new HashMap<>(); String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text(); String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text(); sponsor.put("Source of Monetary or Material Support",SMMS); sponsor.put("Primary Sponsor",primarySponsor); String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text(); String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text(); Map<String,Object> disease = new HashMap<>(); String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text(); String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text(); disease.put("healthType",healthType); disease.put("condition",condition); String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text(); String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text(); String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text(); String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text(); String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text(); String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text(); Map<String,Object> primaryOutcome = new HashMap<>(); String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text(); String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text(); primaryOutcome.put("firstOutcome",firstOutcome); primaryOutcome.put("secondOutcome",secondOutcome);
resultData.put("disease",disease); resultData.put("primaryOutcome",primaryOutcome); resultData.put("intervention",intervention); resultData.put("country",country); resultData.put("enrollment",enrollment); resultData.put("exclusionCriteria",exclusionCriteria); resultData.put("inclusionCriteria",inclusionCriteria); resultData.put("studyDesign",studyDesign); resultData.put("sponsor",sponsor); resultData.put("title",title); resultData.put("registNum",registNum); resultData.put("registTime",registTime); resultData.put("studyType",studyType); resultData.put("phase",phase); resultData.put("registStatus",""); resultData.put("registTitle",""); resultData.put("fullTitle",""); resultData.put("sponsorPart",""); resultData.put("studyObjective",""); resultData.put("studyStartDate",""); resultData.put("currentStatus",""); resultData.put("tagTime",""); resultData.put("crawlTime",getCurrentTime()); resultData.put("crawlUrl",url); resultData.put("postTime",registTime); resultData.put("content","content"); resultData.put("forwardcontent","forwardcontent"); resultData.put("cid","Nctrinicin"); return resultData; } public static String getCurrentTime() { // 创建 DateTimeFormatter,指定输出格式
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); // 获取当前时间
LocalDateTime now = LocalDateTime.now(); // 格式化
return now.format(formatter); } public static String extractAndConvertDate(String input) { // 定义正则表达式提取 dd/MM/yyyy 格式的日期
Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]"); Matcher matcher = pattern.matcher(input);
if (matcher.find()) { String dateStr = matcher.group(1); // 提取的日期字符串
try { // 解析成 Date 对象
SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy"); Date date = inputFormat.parse(dateStr);
// 格式化为 yyyy:MM:dd 00:00:00
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'"); return outputFormat.format(date);
} catch (ParseException e) { e.printStackTrace(); } }
return null; // 如果未匹配或转换失败
} }
|