You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
121 lines
6.9 KiB
121 lines
6.9 KiB
package com.example;
|
|
|
|
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import okhttp3.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
|
|
|
|
public class CtriScraperContent {
|
|
public static void main(String[] args) throws IOException {
|
|
Map<String,Object> resultData = new HashMap<>();
|
|
String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName=";
|
|
OkHttpClient client = new OkHttpClient().newBuilder()
|
|
.build();
|
|
MediaType mediaType = MediaType.parse("text/plain");
|
|
RequestBody body = RequestBody.create(mediaType, "");
|
|
Request request = new Request.Builder()
|
|
.url(url)
|
|
.get()
|
|
.build();
|
|
Response response = client.newCall(request).execute();
|
|
String html = response.body().string();
|
|
Document parse = Jsoup.parse(html);
|
|
String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text();
|
|
String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text();
|
|
String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text());
|
|
Map<String,Object> sponsor = new HashMap<>();
|
|
String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text();
|
|
String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text();
|
|
sponsor.put("Source of Monetary or Material Support",SMMS);
|
|
sponsor.put("Primary Sponsor",primarySponsor);
|
|
String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text();
|
|
String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text();
|
|
Map<String,Object> disease = new HashMap<>();
|
|
String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text();
|
|
String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text();
|
|
disease.put("healthType",healthType);
|
|
disease.put("condition",condition);
|
|
String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text();
|
|
String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text();
|
|
String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text();
|
|
String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text();
|
|
String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text();
|
|
String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text();
|
|
Map<String,Object> primaryOutcome = new HashMap<>();
|
|
String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text();
|
|
String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text();
|
|
primaryOutcome.put("firstOutcome",firstOutcome);
|
|
primaryOutcome.put("secondOutcome",secondOutcome);
|
|
|
|
resultData.put("disease",disease);
|
|
resultData.put("primaryOutcome",primaryOutcome);
|
|
resultData.put("intervention",intervention);
|
|
resultData.put("country",country);
|
|
resultData.put("enrollment",enrollment);
|
|
resultData.put("exclusionCriteria",exclusionCriteria);
|
|
resultData.put("inclusionCriteria",inclusionCriteria);
|
|
resultData.put("studyDesign",studyDesign);
|
|
resultData.put("sponsor",sponsor);
|
|
resultData.put("title",title);
|
|
resultData.put("registNum",registNum);
|
|
resultData.put("registTime",registTime);
|
|
resultData.put("studyType",studyType);
|
|
resultData.put("phase",phase);
|
|
resultData.put("registStatus","");
|
|
resultData.put("registTitle","");
|
|
resultData.put("fullTitle","");
|
|
resultData.put("sponsorPart","");
|
|
resultData.put("studyObjective","");
|
|
resultData.put("studyStartDate","");
|
|
resultData.put("currentStatus","");
|
|
resultData.put("tagTime","");
|
|
resultData.put("crawlTime",getCurrentTime());
|
|
resultData.put("crawlUrl",url);
|
|
resultData.put("postTime",registTime);
|
|
resultData.put("content","content");
|
|
resultData.put("forwardcontent","forwardcontent");
|
|
|
|
System.out.println(resultData);
|
|
}
|
|
public static String getCurrentTime() {
|
|
// 创建 DateTimeFormatter,指定输出格式
|
|
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
|
|
// 获取当前时间
|
|
LocalDateTime now = LocalDateTime.now();
|
|
// 格式化
|
|
return now.format(formatter);
|
|
}
|
|
public static String extractAndConvertDate(String input) {
|
|
// 定义正则表达式提取 dd/MM/yyyy 格式的日期
|
|
Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
|
|
Matcher matcher = pattern.matcher(input);
|
|
|
|
if (matcher.find()) {
|
|
String dateStr = matcher.group(1); // 提取的日期字符串
|
|
try {
|
|
// 解析成 Date 对象
|
|
SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy");
|
|
Date date = inputFormat.parse(dateStr);
|
|
|
|
// 格式化为 yyyy:MM:dd 00:00:00
|
|
SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'");
|
|
return outputFormat.format(date);
|
|
|
|
} catch (ParseException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
return null; // 如果未匹配或转换失败
|
|
}
|
|
}
|