You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
404 lines
22 KiB
404 lines
22 KiB
package com.example;
|
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
import okhttp3.*;
|
|
import org.apache.hc.client5.http.cookie.BasicCookieStore;
|
|
import org.apache.hc.client5.http.cookie.CookieStore;
|
|
import org.apache.hc.client5.http.classic.methods.HttpGet;
|
|
import org.apache.hc.client5.http.classic.methods.HttpPost;
|
|
import org.apache.hc.client5.http.entity.UrlEncodedFormEntity;
|
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpClient;
|
|
import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse;
|
|
import org.apache.hc.client5.http.impl.classic.HttpClients;
|
|
import org.apache.hc.client5.http.protocol.HttpClientContext;
|
|
import org.apache.hc.core5.http.HttpEntity;
|
|
import org.apache.hc.core5.http.NameValuePair;
|
|
import org.apache.hc.core5.http.io.entity.EntityUtils;
|
|
import org.apache.hc.core5.http.message.BasicNameValuePair;
|
|
import org.apache.kafka.clients.producer.KafkaProducer;
|
|
import org.apache.kafka.clients.producer.ProducerConfig;
|
|
import org.apache.kafka.clients.producer.ProducerRecord;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import java.io.IOException;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Paths;
|
|
import java.text.ParseException;
|
|
import java.text.SimpleDateFormat;
|
|
import java.time.LocalDate;
|
|
import java.time.LocalDateTime;
|
|
import java.time.Month;
|
|
import java.time.Year;
|
|
import java.time.format.DateTimeFormatter;
|
|
import java.util.*;
|
|
import java.nio.charset.StandardCharsets;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
public class CtriScraper {
|
|
|
|
private static final String SEARCH_FORM_URL = "https://ctri.nic.in/Clinicaltrials/advancesearchmain.php";
|
|
|
|
private static final String SEARCH_SUBMIT_URL = "https://ctri.nic.in/Clinicaltrials/advsearch.php";
|
|
|
|
private static final Pattern LINK_REGEX_PATTERN = Pattern.compile("'([^']*)'");
|
|
|
|
private static final String TOPIC_NAME = "cliniTopic";
|
|
private static final String BOOTSTRAP_SERVERS = "node-01:19092";
|
|
private static KafkaProducer<String, String> producer;
|
|
private static ObjectMapper objectMapper = new ObjectMapper();
|
|
private static final Random random = new Random();
|
|
|
|
static {
|
|
Properties props = new Properties();
|
|
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
|
|
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
|
|
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
|
|
props.put(ProducerConfig.ACKS_CONFIG, "all"); // 等待所有副本确认
|
|
props.put(ProducerConfig.RETRIES_CONFIG, 3); // 重试次数
|
|
producer = new KafkaProducer<>(props);
|
|
|
|
}
|
|
public static List<String> getlink(Integer year, Integer month) {
|
|
List<String> linkList = new ArrayList<>(); // 用于存放提取到的链接
|
|
// 用于存储和管理 Cookies
|
|
CookieStore cookieStore = new BasicCookieStore();
|
|
// 用于在请求之间维护状态,特别是关联 CookieStore
|
|
HttpClientContext context = HttpClientContext.create();
|
|
context.setCookieStore(cookieStore);
|
|
|
|
// 使用 try-with-resources 确保 HttpClient 被正确关闭
|
|
try (CloseableHttpClient httpClient = HttpClients.custom()
|
|
.setDefaultCookieStore(cookieStore) // 将cookie store绑定到client
|
|
.build()) {
|
|
|
|
// --- Step 1 & 2: 发送 GET 请求获取表单页面并解析 ---
|
|
// System.out.println("Fetching search form page..."); // 调试信息可以按需保留或删除
|
|
HttpGet getRequest = new HttpGet(SEARCH_FORM_URL);
|
|
// 添加一些伪装的 Headers 模拟浏览器访问
|
|
getRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
|
|
getRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
|
|
getRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7");
|
|
|
|
String formPageHtml = null;
|
|
try (CloseableHttpResponse response = httpClient.execute(getRequest, context)) {
|
|
int statusCode = response.getCode();
|
|
// System.out.println("GET Response Status: " + statusCode); // 调试信息
|
|
|
|
if (statusCode != 200) {
|
|
System.err.println("Error: GET request to form page failed with status code: " + statusCode);
|
|
EntityUtils.consume(response.getEntity()); // 确保消费掉实体,释放连接
|
|
return null; // 获取表单页面失败,返回 null
|
|
}
|
|
|
|
HttpEntity entity = response.getEntity();
|
|
if (entity != null) {
|
|
formPageHtml = EntityUtils.toString(entity, StandardCharsets.UTF_8);
|
|
EntityUtils.consume(entity); // 确保实体内容被完全消费
|
|
} else {
|
|
System.err.println("Error: Failed to get form page entity.");
|
|
return null; // 获取页面内容失败,返回 null
|
|
}
|
|
}
|
|
// System.out.println("Form page fetched successfully."); // 调试信息
|
|
|
|
// --- Step 3 & 4: 解析 HTML 提取 csrf_token 和 __ncforminfo ---
|
|
Document doc = Jsoup.parse(formPageHtml, SEARCH_FORM_URL); // 传入 base URI 有助于处理相对路径
|
|
|
|
// 查找隐藏的输入字段
|
|
Element csrfTokenInput = doc.selectFirst("input[name=csrf_token][type=hidden]");
|
|
Element ncFormInfoInput = doc.selectFirst("input[name=__ncforminfo][type=hidden]");
|
|
|
|
String csrfToken = null;
|
|
String ncFormInfo = null;
|
|
|
|
if (csrfTokenInput != null) {
|
|
csrfToken = csrfTokenInput.val();
|
|
// System.out.println("Extracted csrf_token: " + csrfToken); // 调试信息
|
|
} else {
|
|
System.err.println("Warning: Could not find csrf_token input field.");
|
|
return null; // 缺少关键 token,返回 null
|
|
}
|
|
|
|
if (ncFormInfoInput != null) {
|
|
ncFormInfo = ncFormInfoInput.val();
|
|
// System.out.println("Extracted __ncforminfo: " + ncFormInfo); // 调试信息
|
|
} else {
|
|
System.err.println("Warning: Could not find __ncforminfo input field.");
|
|
return null; // 缺少关键 token,返回 null
|
|
}
|
|
|
|
// 如果必要的 token 没有获取到,可能无法继续 (虽然上面的检查已经覆盖,这里作为双重保险)
|
|
if (csrfToken == null || ncFormInfo == null) {
|
|
System.err.println("Error: Missing required tokens. Cannot proceed with POST request.");
|
|
return null;
|
|
}
|
|
|
|
// --- Step 5 & 6: 构建 POST 请求参数并发送 ---
|
|
// System.out.println("\nPreparing POST request..."); // 调试信息
|
|
HttpPost postRequest = new HttpPost(SEARCH_SUBMIT_URL);
|
|
// 添加 Headers 模拟浏览器提交表单
|
|
postRequest.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
|
|
// 重要:设置 Referer Header
|
|
postRequest.setHeader("Referer", SEARCH_FORM_URL);
|
|
// 添加 Origin Header
|
|
postRequest.setHeader("Origin", "https://ctri.nic.in");
|
|
postRequest.setHeader("Content-Type", "application/x-www-form-urlencoded");
|
|
postRequest.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
|
|
postRequest.setHeader("Accept-Language", "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7");
|
|
postRequest.setHeader("Pragma", "no-cache");
|
|
|
|
List<NameValuePair> params = new ArrayList<>();
|
|
// 添加你之前分析的载荷中的所有参数,使用获取到的动态值
|
|
params.add(new BasicNameValuePair("stid", "1")); // 注意 stid 之前有两个,这里用 1
|
|
params.add(new BasicNameValuePair("csrf_token", csrfToken)); // 使用获取到的动态 token
|
|
params.add(new BasicNameValuePair("pros", "1"));
|
|
params.add(new BasicNameValuePair("month", String.format("%02d", month))); // 格式化月份为两位数
|
|
params.add(new BasicNameValuePair("year", String.valueOf(year)));
|
|
params.add(new BasicNameValuePair("study", "0"));
|
|
params.add(new BasicNameValuePair("sdid", "0"));
|
|
params.add(new BasicNameValuePair("phaseid", "0"));
|
|
params.add(new BasicNameValuePair("psponsor", "0"));
|
|
params.add(new BasicNameValuePair("recid", "0"));
|
|
params.add(new BasicNameValuePair("state", "0"));
|
|
params.add(new BasicNameValuePair("district", "0"));
|
|
params.add(new BasicNameValuePair("searchword", ""));
|
|
params.add(new BasicNameValuePair("T4", "anyvalue")); // T4既然无效,随便填
|
|
params.add(new BasicNameValuePair("btt", "Search"));
|
|
params.add(new BasicNameValuePair("__ncforminfo", ncFormInfo)); // 使用获取到的动态值
|
|
|
|
// 将参数列表设置到请求体中
|
|
postRequest.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));
|
|
|
|
// System.out.println("Executing POST request to submit form..."); // 调试信息
|
|
try (CloseableHttpResponse postResponse = httpClient.execute(postRequest, context)) {
|
|
int postStatusCode = postResponse.getCode();
|
|
// System.out.println("POST Response Status: " + postStatusCode); // 打印状态码
|
|
|
|
if (postStatusCode != 200) {
|
|
System.err.println("Error: POST request to submit form failed with status code: " + postStatusCode);
|
|
EntityUtils.consume(postResponse.getEntity()); // 确保消费掉实体,释放连接
|
|
return null; // 提交表单失败,返回 null
|
|
}
|
|
|
|
|
|
HttpEntity postEntity = postResponse.getEntity();
|
|
|
|
if (postEntity != null) {
|
|
String searchResultsHtml = EntityUtils.toString(postEntity, StandardCharsets.UTF_8);
|
|
EntityUtils.consume(postEntity); // 确保实体内容被完全消费
|
|
|
|
// --- Step 7: 处理搜索结果页面 ---
|
|
// System.out.println("\nParsing search results..."); // 调试信息
|
|
|
|
Document resultsDoc = Jsoup.parse(searchResultsHtml, SEARCH_SUBMIT_URL);
|
|
|
|
Elements links = resultsDoc.select("tr a");
|
|
|
|
for (Element linkElement : links) {
|
|
String rawLink = linkElement.attr("href");
|
|
// System.out.println("Processing raw link: " + rawLink); // 调试信息
|
|
|
|
// 使用预编译的正则表达式 Pattern
|
|
Matcher matcher = LINK_REGEX_PATTERN.matcher(rawLink);
|
|
|
|
// 查找匹配项
|
|
if (matcher.find()) {
|
|
String extractedContent = matcher.group(1); // 提取单引号内的内容
|
|
// 构建完整的链接 URL
|
|
String fullLink = "https://ctri.nic.in/Clinicaltrials/" + extractedContent;
|
|
linkList.add(fullLink); // 将完整链接添加到列表中
|
|
// System.out.println("Added link: " + fullLink); // 调试信息
|
|
} else {
|
|
// 如果链接不符合模式,打印警告并跳过
|
|
System.err.println("Warning: Link does not match expected pattern: " + rawLink);
|
|
}
|
|
}
|
|
|
|
// --- 返回提取到的链接列表 ---
|
|
// 循环结束后,返回收集到的所有链接
|
|
// System.out.println("Finished link extraction. Returning list."); // 调试信息
|
|
return linkList;
|
|
|
|
|
|
} else {
|
|
System.err.println("Error: Failed to get search results entity.");
|
|
return null; // 获取结果内容失败,返回 null
|
|
}
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
// 处理网络请求相关的异常
|
|
System.err.println("Network or IO error during scraping:");
|
|
e.printStackTrace();
|
|
return null; // 发生 IO 错误,返回 null
|
|
} catch (Exception e) {
|
|
// 处理其他可能的异常,例如解析错误或 NPE
|
|
System.err.println("An unexpected error occurred during scraping:");
|
|
e.printStackTrace();
|
|
return null; // 发生其他错误,返回 null
|
|
}
|
|
}
|
|
public static void main(String[] args) {
|
|
for (Integer year = Year.now().getValue(); year >= 2024; year--) {
|
|
int monthStart = (year == Year.now().getValue()) ? LocalDate.now().getMonthValue() : 12;
|
|
|
|
for (Integer month = monthStart; month >= 1; month--) {
|
|
try {
|
|
List<String> links = getlink(year, month);
|
|
if (links == null) {
|
|
System.out.println("年份 " + year + " 月份 " + month + " 抓取失败!");
|
|
continue;
|
|
}
|
|
|
|
if (links.isEmpty()) {
|
|
System.out.println("年份 " + year + " 月份 " + month + " 无数据!");
|
|
continue;
|
|
}
|
|
|
|
int sleepTime = random.nextInt(1001) + 3000;
|
|
int count = 0;
|
|
|
|
for (String url : links) {
|
|
try {
|
|
Map<String, Object> result = reslutData(url);
|
|
result.put("crawlUrl", url);
|
|
|
|
String registNum = String.valueOf(result.get("registNum"));
|
|
String jsonValue = objectMapper.writeValueAsString(result);
|
|
|
|
ProducerRecord<String, String> record = new ProducerRecord<>(TOPIC_NAME, registNum, jsonValue);
|
|
|
|
producer.send(record, (metadata, exception) -> {
|
|
if (exception == null) {
|
|
System.out.println("✅ 成功发送到 Kafka: " + registNum + " | Offset: " + metadata.offset() + " | " + url);
|
|
} else {
|
|
System.err.println("❌ Kafka 发送失败: " + exception.getMessage());
|
|
}
|
|
});
|
|
|
|
Thread.sleep(sleepTime); // 控制节奏
|
|
count++;
|
|
} catch (Exception e) {
|
|
System.err.println("抓取或发送失败: " + url);
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
System.out.println("📦 年份 " + year + " 月份 " + month + " 已完成,共上传 " + count + " 条数据。");
|
|
|
|
} catch (Exception e) {
|
|
System.err.println("处理年份 " + year + " 月份 " + month + " 失败: " + e.getMessage());
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
}
|
|
// 关闭 producer
|
|
producer.close();
|
|
}
|
|
|
|
public static Map<String,Object> reslutData(String url) throws IOException {
|
|
Map<String,Object> resultData = new HashMap<>();
|
|
OkHttpClient client = new OkHttpClient().newBuilder()
|
|
.build();
|
|
MediaType mediaType = MediaType.parse("text/plain");
|
|
RequestBody body = RequestBody.create(mediaType, "");
|
|
Request request = new Request.Builder()
|
|
.url(url)
|
|
.get()
|
|
.build();
|
|
Response response = client.newCall(request).execute();
|
|
String html = response.body().string();
|
|
Document parse = Jsoup.parse(html);
|
|
String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text();
|
|
String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text();
|
|
String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text());
|
|
Map<String,Object> sponsor = new HashMap<>();
|
|
String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text();
|
|
String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text();
|
|
sponsor.put("Source of Monetary or Material Support",SMMS);
|
|
sponsor.put("Primary Sponsor",primarySponsor);
|
|
String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text();
|
|
String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text();
|
|
Map<String,Object> disease = new HashMap<>();
|
|
String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text();
|
|
String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text();
|
|
disease.put("healthType",healthType);
|
|
disease.put("condition",condition);
|
|
String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text();
|
|
String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text();
|
|
String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text();
|
|
String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text();
|
|
String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text();
|
|
String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text();
|
|
Map<String,Object> primaryOutcome = new HashMap<>();
|
|
String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text();
|
|
String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text();
|
|
primaryOutcome.put("firstOutcome",firstOutcome);
|
|
primaryOutcome.put("secondOutcome",secondOutcome);
|
|
|
|
resultData.put("disease",disease);
|
|
resultData.put("primaryOutcome",primaryOutcome);
|
|
resultData.put("intervention",intervention);
|
|
resultData.put("country",country);
|
|
resultData.put("enrollment",enrollment);
|
|
resultData.put("exclusionCriteria",exclusionCriteria);
|
|
resultData.put("inclusionCriteria",inclusionCriteria);
|
|
resultData.put("studyDesign",studyDesign);
|
|
resultData.put("sponsor",sponsor);
|
|
resultData.put("title",title);
|
|
resultData.put("registNum",registNum);
|
|
resultData.put("registTime",registTime);
|
|
resultData.put("studyType",studyType);
|
|
resultData.put("phase",phase);
|
|
resultData.put("registStatus","");
|
|
resultData.put("registTitle","");
|
|
resultData.put("fullTitle","");
|
|
resultData.put("sponsorPart","");
|
|
resultData.put("studyObjective","");
|
|
resultData.put("studyStartDate","");
|
|
resultData.put("currentStatus","");
|
|
resultData.put("tagTime","");
|
|
resultData.put("crawlTime",getCurrentTime());
|
|
resultData.put("crawlUrl",url);
|
|
resultData.put("postTime",registTime);
|
|
resultData.put("content","content");
|
|
resultData.put("forwardcontent","forwardcontent");
|
|
resultData.put("cid","Nctrinicin");
|
|
return resultData;
|
|
}
|
|
public static String getCurrentTime() {
|
|
// 创建 DateTimeFormatter,指定输出格式
|
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
|
|
// 获取当前时间
|
|
LocalDateTime now = LocalDateTime.now();
|
|
// 格式化
|
|
return now.format(formatter);
|
|
}
|
|
public static String extractAndConvertDate(String input) {
|
|
// 定义正则表达式提取 dd/MM/yyyy 格式的日期
|
|
Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
|
|
Matcher matcher = pattern.matcher(input);
|
|
|
|
if (matcher.find()) {
|
|
String dateStr = matcher.group(1); // 提取的日期字符串
|
|
try {
|
|
// 解析成 Date 对象
|
|
SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy");
|
|
Date date = inputFormat.parse(dateStr);
|
|
|
|
// 格式化为 yyyy:MM:dd 00:00:00
|
|
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'");
|
|
return outputFormat.format(date);
|
|
|
|
} catch (ParseException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
return null; // 如果未匹配或转换失败
|
|
}
|
|
}
|