// [Repository web-viewer residue, kept as a comment so the file remains valid Java]
// You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//
// 121 lines
// 6.9 KiB

package com.example;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import okhttp3.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class CtriScraperContent {
/**
 * Selector prefix shared by every field lookup: the rows of the 11th child table inside the
 * page's outer layout cell. Each lookup appends its own row/cell selector to this prefix.
 */
private static final String DETAIL_ROWS =
        "body > table > tbody > tr > td > table:nth-child(11) > tbody > ";

/**
 * Downloads one CTRI trial page, extracts its fields with fixed CSS selectors and prints the
 * resulting map to standard output.
 *
 * @param args ignored
 * @throws IOException if the request fails, the server answers with a non-2xx status, or the
 *     response carries no body
 */
public static void main(String[] args) throws IOException {
    String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName=";
    Document page = fetchDocument(url);

    // The registration cell holds the number in <b> plus a "[Registered on: dd/MM/yyyy]" stamp.
    String registTime = extractAndConvertDate(detailText(page, "tr:nth-child(1) > td:nth-child(2)"));

    Map<String, Object> sponsor = new HashMap<>();
    sponsor.put("Source of Monetary or Material Support",
            detailText(page, "tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td"));
    sponsor.put("Primary Sponsor",
            detailText(page, "tr:nth-child(15) > td:nth-child(2) > table > tbody"));

    Map<String, Object> disease = new HashMap<>();
    disease.put("healthType",
            detailText(page, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)"));
    disease.put("condition",
            detailText(page, "tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)"));

    Map<String, Object> primaryOutcome = new HashMap<>();
    primaryOutcome.put("firstOutcome",
            detailText(page, "tr:nth-child(28) > td:nth-child(2) > table > tbody"));
    primaryOutcome.put("secondOutcome",
            detailText(page, "tr:nth-child(29) > td:nth-child(2) > table > tbody"));

    // Insertion order is kept identical to the historical one so the printed HashMap
    // (whose iteration order can depend on insertion order within a bucket) does not change.
    Map<String, Object> resultData = new HashMap<>();
    resultData.put("disease", disease);
    resultData.put("primaryOutcome", primaryOutcome);
    resultData.put("intervention", detailText(page, "tr:nth-child(22) > td:nth-child(2) > table"));
    resultData.put("country", detailText(page, "tr:nth-child(17) > td:nth-child(2)"));
    resultData.put("enrollment", detailText(page, "tr:nth-child(30) > td:nth-child(2)"));
    resultData.put("exclusionCriteria",
            detailText(page, "tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)"));
    resultData.put("inclusionCriteria",
            detailText(page, "tr:nth-child(23) > td:nth-child(2) > table > tbody"));
    resultData.put("studyDesign", detailText(page, "tr:nth-child(6) > td:nth-child(2)"));
    resultData.put("sponsor", sponsor);
    resultData.put("title", detailText(page, "tr:nth-child(7) > td:nth-child(2)"));
    resultData.put("registNum", detailText(page, "tr:nth-child(1) > td:nth-child(2) > b"));
    resultData.put("registTime", registTime);
    resultData.put("studyType", detailText(page, "tr:nth-child(4) > td:nth-child(2)"));
    resultData.put("phase", detailText(page, "tr:nth-child(31) > td:nth-child(2)"));
    // Fields that are not scraped from the page are emitted as empty placeholders.
    resultData.put("registStatus", "");
    resultData.put("registTitle", "");
    resultData.put("fullTitle", "");
    resultData.put("sponsorPart", "");
    resultData.put("studyObjective", "");
    resultData.put("studyStartDate", "");
    resultData.put("currentStatus", "");
    resultData.put("tagTime", "");
    resultData.put("crawlTime", getCurrentTime());
    resultData.put("crawlUrl", url);
    resultData.put("postTime", registTime);
    resultData.put("content", "content");
    resultData.put("forwardcontent", "forwardcontent");

    System.out.println(resultData);
}

/** Performs a GET on {@code url} and parses the body as HTML, always releasing the response. */
private static Document fetchDocument(String url) throws IOException {
    OkHttpClient client = new OkHttpClient();
    Request request = new Request.Builder()
            .url(url)
            .get()
            .build();
    // try-with-resources: an unclosed Response keeps its connection and body buffers alive.
    try (Response response = client.newCall(request).execute()) {
        if (!response.isSuccessful()) {
            // Fail loudly instead of scraping an error page into empty fields.
            throw new IOException("Unexpected HTTP status " + response.code() + " for " + url);
        }
        ResponseBody body = response.body();
        if (body == null) {
            throw new IOException("Empty response body for " + url);
        }
        return Jsoup.parse(body.string());
    }
}

/** Returns the combined text of all elements matched by {@link #DETAIL_ROWS} + {@code rowSelector}. */
private static String detailText(Document page, String rowSelector) {
    return page.select(DETAIL_ROWS + rowSelector).text();
}
/**
 * Returns the current local date-time rendered as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @return the timestamp obtained from {@link LocalDateTime#now()}, formatted as text
 */
public static String getCurrentTime() {
    return LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"));
}
/** Matches the bracketed registration stamp and captures its {@code dd/MM/yyyy} date. */
private static final Pattern REGISTERED_ON_STAMP =
        Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");

/** Strict input format: rejects impossible calendar dates instead of rolling them over. */
private static final DateTimeFormatter STAMP_DATE_FORMAT =
        DateTimeFormatter.ofPattern("dd/MM/uuuu").withResolverStyle(ResolverStyle.STRICT);

/** Output format: the date followed by a fixed midnight time component. */
private static final DateTimeFormatter MIDNIGHT_TIMESTAMP_FORMAT =
        DateTimeFormatter.ofPattern("uuuu-MM-dd '00:00:00'");

/**
 * Finds the first {@code [Registered on: dd/MM/yyyy]} (or {@code [注册于: dd/MM/yyyy]}) stamp in
 * {@code input} and converts its date to {@code yyyy-MM-dd 00:00:00}.
 *
 * <p>Example: {@code "CTRI/1 [Registered on: 05/03/2020]"} yields {@code "2020-03-05 00:00:00"}.
 *
 * @param input free text that may contain a registration stamp; may be {@code null}
 * @return the converted timestamp, or {@code null} when {@code input} is {@code null}, contains
 *     no stamp, or the stamp holds a date that does not exist (for example {@code 31/02/2020})
 */
public static String extractAndConvertDate(String input) {
    if (input == null) {
        return null;
    }
    Matcher matcher = REGISTERED_ON_STAMP.matcher(input);
    if (!matcher.find()) {
        return null;
    }
    try {
        LocalDate registeredOn = LocalDate.parse(matcher.group(1), STAMP_DATE_FORMAT);
        return registeredOn.format(MIDNIGHT_TIMESTAMP_FORMAT);
    } catch (DateTimeParseException ignored) {
        // The digits matched the pattern but do not form a real calendar date; the contract
        // of this method is to report any conversion failure as null.
        return null;
    }
}
}