You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

121 lines
6.9 KiB

1 month ago
  1. package com.example;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import okhttp3.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
  15. public class CtriScraperContent {
  16. public static void main(String[] args) throws IOException {
  17. Map<String,Object> resultData = new HashMap<>();
  18. String url = "https://ctri.nic.in/Clinicaltrials/pmaindet2.php?EncHid=MjQ3MjM=&Enc=&userName=";
  19. OkHttpClient client = new OkHttpClient().newBuilder()
  20. .build();
  21. MediaType mediaType = MediaType.parse("text/plain");
  22. RequestBody body = RequestBody.create(mediaType, "");
  23. Request request = new Request.Builder()
  24. .url(url)
  25. .get()
  26. .build();
  27. Response response = client.newCall(request).execute();
  28. String html = response.body().string();
  29. Document parse = Jsoup.parse(html);
  30. String title = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(7) > td:nth-child(2)").text();
  31. String registNum = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2) > b").text();
  32. String registTime = extractAndConvertDate(parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(1) > td:nth-child(2)").text());
  33. Map<String,Object> sponsor = new HashMap<>();
  34. String SMMS = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(14) > td:nth-child(2) > table > tbody > tr > td").text();
  35. String primarySponsor = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(15) > td:nth-child(2) > table > tbody").text();
  36. sponsor.put("Source of Monetary or Material Support",SMMS);
  37. sponsor.put("Primary Sponsor",primarySponsor);
  38. String studyType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(4) > td:nth-child(2)").text();
  39. String phase = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(31) > td:nth-child(2)").text();
  40. Map<String,Object> disease = new HashMap<>();
  41. String healthType = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(1)").text();
  42. String condition = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(21) > td:nth-child(2) > table > tbody > tr:nth-child(2) > td:nth-child(2)").text();
  43. disease.put("healthType",healthType);
  44. disease.put("condition",condition);
  45. String studyDesign = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(6) > td:nth-child(2)").text();
  46. String inclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(23) > td:nth-child(2) > table > tbody").text();
  47. String exclusionCriteria = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(24) > td:nth-child(2) > table > tbody > tr > td:nth-child(2)").text();
  48. String enrollment = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(30) > td:nth-child(2)").text();
  49. String country = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(17) > td:nth-child(2)").text();
  50. String intervention = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(22) > td:nth-child(2) > table").text();
  51. Map<String,Object> primaryOutcome = new HashMap<>();
  52. String firstOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(28) > td:nth-child(2) > table > tbody").text();
  53. String secondOutcome = parse.select("body > table > tbody > tr > td > table:nth-child(11) > tbody > tr:nth-child(29) > td:nth-child(2) > table > tbody").text();
  54. primaryOutcome.put("firstOutcome",firstOutcome);
  55. primaryOutcome.put("secondOutcome",secondOutcome);
  56. resultData.put("disease",disease);
  57. resultData.put("primaryOutcome",primaryOutcome);
  58. resultData.put("intervention",intervention);
  59. resultData.put("country",country);
  60. resultData.put("enrollment",enrollment);
  61. resultData.put("exclusionCriteria",exclusionCriteria);
  62. resultData.put("inclusionCriteria",inclusionCriteria);
  63. resultData.put("studyDesign",studyDesign);
  64. resultData.put("sponsor",sponsor);
  65. resultData.put("title",title);
  66. resultData.put("registNum",registNum);
  67. resultData.put("registTime",registTime);
  68. resultData.put("studyType",studyType);
  69. resultData.put("phase",phase);
  70. resultData.put("registStatus","");
  71. resultData.put("registTitle","");
  72. resultData.put("fullTitle","");
  73. resultData.put("sponsorPart","");
  74. resultData.put("studyObjective","");
  75. resultData.put("studyStartDate","");
  76. resultData.put("currentStatus","");
  77. resultData.put("tagTime","");
  78. resultData.put("crawlTime",getCurrentTime());
  79. resultData.put("crawlUrl",url);
  80. resultData.put("postTime",registTime);
  81. resultData.put("content","content");
  82. resultData.put("forwardcontent","forwardcontent");
  83. System.out.println(resultData);
  84. }
  85. public static String getCurrentTime() {
  86. // 创建 DateTimeFormatter,指定输出格式
  87. DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
  88. // 获取当前时间
  89. LocalDateTime now = LocalDateTime.now();
  90. // 格式化
  91. return now.format(formatter);
  92. }
  93. public static String extractAndConvertDate(String input) {
  94. // 定义正则表达式提取 dd/MM/yyyy 格式的日期
  95. Pattern pattern = Pattern.compile("\\[(?:Registered on|注册于):\\s*(\\d{2}/\\d{2}/\\d{4})\\]");
  96. Matcher matcher = pattern.matcher(input);
  97. if (matcher.find()) {
  98. String dateStr = matcher.group(1); // 提取的日期字符串
  99. try {
  100. // 解析成 Date 对象
  101. SimpleDateFormat inputFormat = new SimpleDateFormat("dd/MM/yyyy");
  102. Date date = inputFormat.parse(dateStr);
  103. // 格式化为 yyyy:MM:dd 00:00:00
  104. SimpleDateFormat outputFormat = new SimpleDateFormat("yyyy-MM-dd '00:00:00'");
  105. return outputFormat.format(date);
  106. } catch (ParseException e) {
  107. e.printStackTrace();
  108. }
  109. }
  110. return null; // 如果未匹配或转换失败
  111. }
  112. }