You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

404 lines
22 KiB

package com.example;
import com.fasterxml.jackson.databind.ObjectMapper;
import okhttp3.*;
import org.apache.hc.client5.http.cookie.BasicCookieStore;
import org.apache.hc.client5.http.cookie.CookieStore;
import org.apache.hc.client5.http.classic.methods.HttpGet;
import org.apache.hc.client5.http.classic.methods.HttpPost;
import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
import org.apache.hc.client5.http.impl.classic.HttpClients;
import org.apache.hc.client5.http.protocol.HttpClientContext;
import org.apache.hc.core5.http.HttpEntity;
import org.apache.hc.core5.http.NameValuePair;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.message.BasicNameValuePair;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.Year;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CtriScraper {
private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php";
private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php";
private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'");
private static final String TOPIC_NAME = "cliniTopic";
private static final String BOOTSTRAP_SERVERS = "node-01:19092";
private static KafkaProducer<String, String> producer;
private static ObjectMapper objectMapper = new ObjectMapper();
private static final Random random = new Random();
static {
Properties props = new Properties();
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认
props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数
producer = new KafkaProducer<>(props);
}
/** Browser-like User-Agent sent with both search requests. */
private static final String BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";
/** Accept header sent with both search requests. */
private static final String BROWSER_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
/** Accept-Language header sent with both search requests. */
private static final String BROWSER_ACCEPT_LANGUAGE = "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7";
/** Prefix prepended to the relative path extracted from each result link. */
private static final String TRIAL_LINK_BASE = "https://ctri.nic.in/Clinicaltrials/";

/**
 * Runs the CTRI advanced search for one year/month and collects the detail-page links.
 *
 * <p>The form page is fetched first to pick up the hidden {@code csrf_token} and
 * {@code __ncforminfo} values; the search is then POSTed with the same cookie store, and every
 * {@code tr a} link in the response is converted to an absolute URL.
 *
 * @param year  year submitted in the {@code year} form field
 * @param month month submitted in the {@code month} form field, zero-padded to two digits
 * @return the extracted links (possibly empty), or {@code null} when an argument is null, a
 *         request fails, a token is missing, or any exception occurs
 */
public static List<String> getlink(Integer year, Integer month) {
    if (year == null || month == null) {
        // Without this guard the literal text "null" would be submitted as a form value.
        System.err.println("Error: year and month must not be null.");
        return null;
    }
    // The cookie store is shared by the client and the context so the POST reuses the GET's session.
    CookieStore cookieStore = new BasicCookieStore();
    HttpClientContext context = HttpClientContext.create();
    context.setCookieStore(cookieStore);
    try (CloseableHttpClient httpClient = HttpClients.custom()
            .setDefaultCookieStore(cookieStore)
            .build()) {
        // Steps 1-2: fetch the form page with browser-like headers.
        HttpGet getRequest = new HttpGet(SEARCH_FORM_URL);
        getRequest.setHeader("User-Agent", BROWSER_USER_AGENT);
        getRequest.setHeader("Accept", BROWSER_ACCEPT);
        getRequest.setHeader("Accept-Language", BROWSER_ACCEPT_LANGUAGE);
        String formPageHtml = fetchHtml(httpClient, getRequest, context,
                "Error: GET request to form page failed with status code: ",
                "Error: Failed to get form page entity.");
        if (formPageHtml == null) {
            return null;
        }

        // Steps 3-4: pull the two hidden tokens out of the form.
        Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL);
        String csrfToken = hiddenFieldValue(doc, "csrf_token");
        if (csrfToken == null) {
            return null;
        }
        String ncFormInfo = hiddenFieldValue(doc, "__ncforminfo");
        if (ncFormInfo == null) {
            return null;
        }

        // Steps 5-6: submit the search form.
        HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL);
        postRequest.setHeader("User-Agent", BROWSER_USER_AGENT);
        postRequest.setHeader("Referer", SEARCH_FORM_URL);
        postRequest.setHeader("Origin", "https://ctri.nic.in");
        postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded");
        postRequest.setHeader("Accept", BROWSER_ACCEPT);
        postRequest.setHeader("Accept-Language", BROWSER_ACCEPT_LANGUAGE);
        postRequest.setHeader("Pragma", "no-cache");
        postRequest.setEntity(new UrlEncodedFormEntity(
                buildSearchParams(year, month, csrfToken, ncFormInfo), StandardCharsets.UTF_8));
        String searchResultsHtml = fetchHtml(httpClient, postRequest, context,
                "Error: POST request to submit form failed with status code: ",
                "Error: Failed to get search results entity.");
        if (searchResultsHtml == null) {
            return null;
        }

        // Step 7: extract the detail links from the result table.
        return extractTrialLinks(searchResultsHtml);
    } catch (IOException e) {
        System.err.println("Network or IO error during scraping:");
        e.printStackTrace();
        return null;
    } catch (Exception e) {
        System.err.println("An unexpected error occurred during scraping:");
        e.printStackTrace();
        return null;
    }
}

/**
 * Executes a request and returns the response body decoded as UTF-8, or {@code null} (after
 * printing the given message) when the status is not 200 or the response has no entity.
 */
private static String fetchHtml(CloseableHttpClient httpClient,
        org.apache.hc.core5.http.ClassicHttpRequest request,
        HttpClientContext context,
        String statusErrorPrefix,
        String missingEntityError)
        throws IOException, org.apache.hc.core5.http.ParseException {
    try (CloseableHttpResponse response = httpClient.execute(request, context)) {
        int statusCode = response.getCode();
        if (statusCode != 200) {
            System.err.println(statusErrorPrefix + statusCode);
            EntityUtils.consume(response.getEntity()); // drain the body so the connection is released
            return null;
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            System.err.println(missingEntityError);
            return null;
        }
        return EntityUtils.toString(entity, StandardCharsets.UTF_8);
    }
}

/** Returns the value of the hidden input with the given name, or {@code null} (with a warning) if absent. */
private static String hiddenFieldValue(Document doc, String fieldName) {
    Element input = doc.selectFirst("input[name=" + fieldName + "][type=hidden]");
    if (input == null) {
        System.err.println("Warning: Could not find " + fieldName + " input field.");
        return null;
    }
    return input.val();
}

/** Builds the search form payload for the given period using the tokens read from the form page. */
private static List<NameValuePair> buildSearchParams(int year, int month, String csrfToken, String ncFormInfo) {
    List<NameValuePair> params = new ArrayList<>();
    params.add(new BasicNameValuePair("stid", "1"));
    params.add(new BasicNameValuePair("csrf_token", csrfToken));
    params.add(new BasicNameValuePair("pros", "1"));
    // Locale.ROOT keeps the digits ASCII regardless of the JVM's default locale.
    params.add(new BasicNameValuePair("month", String.format(Locale.ROOT, "%02d", month)));
    params.add(new BasicNameValuePair("year", String.valueOf(year)));
    params.add(new BasicNameValuePair("study", "0"));
    params.add(new BasicNameValuePair("sdid", "0"));
    params.add(new BasicNameValuePair("phaseid", "0"));
    params.add(new BasicNameValuePair("psponsor", "0"));
    params.add(new BasicNameValuePair("recid", "0"));
    params.add(new BasicNameValuePair("state", "0"));
    params.add(new BasicNameValuePair("district", "0"));
    params.add(new BasicNameValuePair("searchword", ""));
    params.add(new BasicNameValuePair("T4", "anyvalue")); // placeholder value, per the original author's note that T4 is ignored
    params.add(new BasicNameValuePair("btt", "Search"));
    params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo));
    return params;
}

/**
 * Collects one absolute URL per {@code tr a} link whose {@code href} contains a single-quoted
 * path; links without one are reported on stderr and skipped.
 */
private static List<String> extractTrialLinks(String searchResultsHtml) {
    List<String> linkList = new ArrayList<>();
    Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL);
    for (Element linkElement : resultsDoc.select("tr a")) {
        String rawLink = linkElement.attr("href");
        Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink);
        if (matcher.find()) {
            linkList.add(TRIAL_LINK_BASE + matcher.group(1));
        } else {
            System.err.println("Warning: Link does not match expected pattern: " + rawLink);
        }
    }
    return linkList;
}
/**
 * Entry point: walks backwards month by month from the current month to January 2024, fetches
 * the trial links for each month, scrapes every trial page and queues it for Kafka.
 *
 * <p>The producer is always closed on exit. If the thread is interrupted while pacing requests,
 * the crawl stops and the interrupt flag is restored after the producer has been closed.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final int earliestYear = 2024;
    // Read the clock once so the year and month bounds come from the same instant.
    LocalDate today = LocalDate.now();
    boolean interrupted = false;
    try {
        crawl:
        for (int year = today.getYear(); year >= earliestYear; year--) {
            // The current year stops at the current month; earlier years cover all twelve.
            int monthStart = (year == today.getYear()) ? today.getMonthValue() : 12;
            for (int month = monthStart; month >= 1; month--) {
                try {
                    List<String> links = getlink(year, month);
                    if (links == null) {
                        System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!");
                        continue;
                    }
                    if (links.isEmpty()) {
                        System.out.println("年份 " + year + " 月份 " + month + " 无数据!");
                        continue;
                    }
                    int count = publishTrials(links);
                    System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。");
                } catch (InterruptedException e) {
                    System.err.println("Interrupted while processing year " + year + " month " + month + "; stopping.");
                    interrupted = true;
                    break crawl;
                } catch (Exception e) {
                    System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage());
                    e.printStackTrace();
                }
            }
        }
    } finally {
        producer.close();
        // Restore the flag only after close(), so closing the producer is not itself cut short.
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}

/**
 * Scrapes each trial page and queues its JSON form for the Kafka topic, keyed by registration
 * number. A failure on one URL is logged and does not stop the others. A fresh 3000-4000 ms
 * pause follows every attempt.
 *
 * @param links trial detail-page URLs
 * @return number of records handed to the producer (delivery is reported asynchronously)
 * @throws InterruptedException if the thread is interrupted while pausing between requests
 */
private static int publishTrials(List<String> links) throws InterruptedException {
    int count = 0;
    for (String url : links) {
        try {
            Map<String, Object> result = reslutData(url);
            result.put("crawlUrl", url);
            String registNum = String.valueOf(result.get("registNum"));
            String jsonValue = objectMapper.writeValueAsString(result);
            ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
            producer.send(record, (metadata, exception) -> {
                if (exception == null) {
                    System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url);
                } else {
                    System.err.println("❌ Kafka 发送失败: " + exception.getMessage());
                }
            });
            count++;
        } catch (Exception e) {
            System.err.println("抓取或发送失败: " + url);
            e.printStackTrace();
        }
        // Pace the crawl; kept outside the try so an interrupt propagates instead of being logged and ignored.
        Thread.sleep(random.nextInt(1001) + 3000);
    }
    return count;
}
/** Shared client for detail-page requests so connections are reused across calls. */
private static final OkHttpClient DETAIL_PAGE_CLIENT = new OkHttpClient();
/** Selector text up to the row index of the detail table (the table that is the 11th child of the outer cell). */
private static final String DETAIL_ROW_SELECTOR_PREFIX = "body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(";
/** Selector text that follows the row index and picks the row's second cell. */
private static final String DETAIL_VALUE_CELL_SUFFIX = ") > td:nth-child(2)";

/**
 * Downloads one trial detail page and extracts its fields by fixed table position.
 *
 * <p>Fields are located purely by row index, so a page with a different layout yields empty
 * strings rather than an error. {@code registTime} and {@code postTime} are {@code null} when
 * the registration stamp cannot be parsed.
 *
 * @param url absolute URL of the trial detail page
 * @return a mutable map of the scraped fields plus fixed placeholder entries
 * @throws IOException if the request fails, the status is not successful, or the body is missing
 */
public static Map<String,Object> reslutData(String url) throws IOException {
    Request request = new Request.Builder()
            .url(url)
            .get()
            .build();
    String html;
    // try-with-resources closes the response so its connection is released.
    try (Response response = DETAIL_PAGE_CLIENT.newCall(request).execute()) {
        if (!response.isSuccessful()) {
            // Parsing an error page would produce a record with every field empty.
            throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
        }
        ResponseBody responseBody = response.body();
        if (responseBody == null) {
            throw new IOException("Empty response body for " + url);
        }
        html = responseBody.string();
    }
    Document page = Jsoup.parse(html);

    String title = valueCellText(page, 7, "");
    String registNum = valueCellText(page, 1, " > b");
    String registTime = extractAndConvertDate(valueCellText(page, 1, ""));
    String studyType = valueCellText(page, 4, "");
    String studyDesign = valueCellText(page, 6, "");
    String country = valueCellText(page, 17, "");
    String intervention = valueCellText(page, 22, " > table");
    String inclusionCriteria = valueCellText(page, 23, " > table > tbody");
    String exclusionCriteria = valueCellText(page, 24, " > table > tbody > tr > td:nth-child(2)");
    String enrollment = valueCellText(page, 30, "");
    String phase = valueCellText(page, 31, "");

    Map<String,Object> sponsor = new HashMap<>();
    sponsor.put("Source of Monetary or Material Support", valueCellText(page, 14, " > table > tbody > tr > td"));
    sponsor.put("Primary Sponsor", valueCellText(page, 15, " > table > tbody"));

    Map<String,Object> disease = new HashMap<>();
    disease.put("healthType", valueCellText(page, 21, " > table > tbody > tr:nth-child(2) > td:nth-child(1)"));
    disease.put("condition", valueCellText(page, 21, " > table > tbody > tr:nth-child(2) > td:nth-child(2)"));

    Map<String,Object> primaryOutcome = new HashMap<>();
    primaryOutcome.put("firstOutcome", valueCellText(page, 28, " > table > tbody"));
    primaryOutcome.put("secondOutcome", valueCellText(page, 29, " > table > tbody"));

    Map<String,Object> resultData = new HashMap<>();
    resultData.put("disease",disease);
    resultData.put("primaryOutcome",primaryOutcome);
    resultData.put("intervention",intervention);
    resultData.put("country",country);
    resultData.put("enrollment",enrollment);
    resultData.put("exclusionCriteria",exclusionCriteria);
    resultData.put("inclusionCriteria",inclusionCriteria);
    resultData.put("studyDesign",studyDesign);
    resultData.put("sponsor",sponsor);
    resultData.put("title",title);
    resultData.put("registNum",registNum);
    resultData.put("registTime",registTime);
    resultData.put("studyType",studyType);
    resultData.put("phase",phase);
    // Fields this scraper does not populate are emitted as empty strings.
    resultData.put("registStatus","");
    resultData.put("registTitle","");
    resultData.put("fullTitle","");
    resultData.put("sponsorPart","");
    resultData.put("studyObjective","");
    resultData.put("studyStartDate","");
    resultData.put("currentStatus","");
    resultData.put("tagTime","");
    resultData.put("crawlTime",getCurrentTime());
    resultData.put("crawlUrl",url);
    resultData.put("postTime",registTime);
    resultData.put("content","content");
    resultData.put("forwardcontent","forwardcontent");
    resultData.put("cid","Nctrinicin");
    return resultData;
}

/** Returns the text under the second cell of the given detail-table row, narrowed by an optional selector tail. */
private static String valueCellText(Document page, int row, String tail) {
    return page.select(DETAIL_ROW_SELECTOR_PREFIX + row + DETAIL_VALUE_CELL_SUFFIX + tail).text();
}
/**
 * Returns the current local date and time formatted as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @return the formatted timestamp, e.g. {@code 2024-03-15 09:30:00}
 */
public static String getCurrentTime() {
    return DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss").format(LocalDateTime.now());
}
/** Matches the bracketed registration stamp, e.g. {@code [Registered on: 15/03/2024]}, capturing the date. */
private static final Pattern REGISTERED_ON_PATTERN = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
/** Strict {@code dd/MM/yyyy} parser: impossible dates such as 31/02 are rejected instead of rolled over. */
private static final DateTimeFormatter REGISTERED_ON_FORMAT =
        DateTimeFormatter.ofPattern("dd/MM/uuuu").withResolverStyle(java.time.format.ResolverStyle.STRICT);

/**
 * Finds the first registration stamp in the text and converts its date from
 * {@code dd/MM/yyyy} to {@code yyyy-MM-dd 00:00:00}.
 *
 * @param input text that may contain a stamp such as {@code [Registered on: 15/03/2024]}; may be null
 * @return the converted date, or {@code null} if the input is null, no stamp is found, or the
 *         date in the stamp is not a real calendar date
 */
public static String extractAndConvertDate(String input) {
    if (input == null) {
        return null;
    }
    Matcher matcher = REGISTERED_ON_PATTERN.matcher(input);
    if (!matcher.find()) {
        return null;
    }
    String dateStr = matcher.group(1);
    try {
        LocalDate date = LocalDate.parse(dateStr, REGISTERED_ON_FORMAT);
        // The source carries no time of day, so midnight is appended as a fixed suffix.
        return date.format(DateTimeFormatter.ISO_LOCAL_DATE) + " 00:00:00";
    } catch (java.time.format.DateTimeParseException e) {
        System.err.println("Invalid registration date: " + dateStr);
        return null;
    }
}
}