|
|
package com.example;
import okhttp3.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CtriScraperContent { public static void main(String[] args) throws IOException { Map<String,Object> resultData = new HashMap<>(); String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName="; OkHttpClient client = new OkHttpClient().newBuilder() .build(); MediaType mediaType = MediaType.parse("text/plain"); RequestBody body = RequestBody.create(mediaType, ""); Request request = new Request.Builder() .url(url) .get() .build(); Response response = client.newCall(request).execute(); String html = response.body().string(); Document parse = Jsoup.parse(html); String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text(); String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text(); String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text()); Map<String,Object> sponsor = new HashMap<>(); String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text(); String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text(); sponsor.put("Source of Monetary or Material Support",SMMS); sponsor.put("Primary Sponsor",primarySponsor); String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text(); String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text(); Map<String,Object> disease = new HashMap<>(); String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody 
> tr:nth-child(2) > td:nth-child(1)").text(); String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text(); disease.put("healthType",healthType); disease.put("condition",condition); String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text(); String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text(); String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text(); String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text(); String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text(); String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text(); Map<String,Object> primaryOutcome = new HashMap<>(); String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text(); String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text(); primaryOutcome.put("firstOutcome",firstOutcome); primaryOutcome.put("secondOutcome",secondOutcome);
resultData.put("disease",disease); resultData.put("primaryOutcome",primaryOutcome); resultData.put("intervention",intervention); resultData.put("country",country); resultData.put("enrollment",enrollment); resultData.put("exclusionCriteria",exclusionCriteria); resultData.put("inclusionCriteria",inclusionCriteria); resultData.put("studyDesign",studyDesign); resultData.put("sponsor",sponsor); resultData.put("title",title); resultData.put("registNum",registNum); resultData.put("registTime",registTime); resultData.put("studyType",studyType); resultData.put("phase",phase); resultData.put("registStatus",""); resultData.put("registTitle",""); resultData.put("fullTitle",""); resultData.put("sponsorPart",""); resultData.put("studyObjective",""); resultData.put("studyStartDate",""); resultData.put("currentStatus",""); resultData.put("tagTime",""); resultData.put("crawlTime",getCurrentTime()); resultData.put("crawlUrl",url); resultData.put("postTime",registTime); resultData.put("content","content"); resultData.put("forwardcontent","forwardcontent");
System.out.println(resultData); } public static String getCurrentTime() { // 创建 DateTimeFormatter,指定输出格式
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); // 获取当前时间
LocalDateTime now = LocalDateTime.now(); // 格式化
return now.format(formatter); } public static String extractAndConvertDate(String input) { // 定义正则表达式提取 dd/MM/yyyy 格式的日期
Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]"); Matcher matcher = pattern.matcher(input);
if (matcher.find()) { String dateStr = matcher.group(1); // 提取的日期字符串
try { // 解析成 Date 对象
SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy"); Date date = inputFormat.parse(dateStr);
// 格式化为 yyyy:MM:dd 00:00:00
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'"); return outputFormat.format(date);
} catch (ParseException e) { e.printStackTrace(); } }
return null; // 如果未匹配或转换失败
} }
|