专利、论文、临床、药物数据读取 卡夫卡写es
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

160 lines
7.1 KiB

  1. package com.zyzs.otherdatasave.service;
  2. import com.alibaba.fastjson.JSONObject;
  3. import com.bfd.crawler.elasti.ElastiProducerHigh;
  4. import com.bfd.crawler.utils.JsonUtils;
  5. import com.zyzs.otherdatasave.bean.Clini;
  6. import com.zyzs.otherdatasave.bean.Paper;
  7. import com.zyzs.otherdatasave.cache.Constants;
  8. import com.zyzs.otherdatasave.util.DataCheckUtil;
  9. import org.slf4j.Logger;
  10. import org.slf4j.LoggerFactory;
  11. import org.springframework.stereotype.Service;
  12. import javax.annotation.PostConstruct;
  13. import java.time.LocalDateTime;
  14. import java.time.format.DateTimeFormatter;
  15. import java.util.*;
  16. import static com.zyzs.otherdatasave.util.MfMD5Util.GetMD5Code;
  17. @Service
  18. public class QueryPaper {
  19. private static final Logger LOGGER = LoggerFactory.getLogger(QueryPaper.class);
  20. @PostConstruct
  21. public void init() {
  22. }
  23. public void query() {
  24. String inputMessage = Constants.getPaper().poll();// poll -->若队列为空,返回null
  25. if (Objects.isNull(inputMessage)) {
  26. return;
  27. }
  28. Map<String, Object> messageMap = new HashMap<>();
  29. if (inputMessage.length()>10) {
  30. try {
  31. try {
  32. messageMap = JsonUtils.parseObject(inputMessage);
  33. } catch (Exception e) {
  34. e.printStackTrace();
  35. }
  36. Paper paper=new Paper();
  37. paper.setTitle((String) messageMap.get("title"));
  38. paper.setCrawlUrl((String) messageMap.get("crawlUrl"));
  39. paper.setContent((String) messageMap.get("content"));
  40. //
  41. // List<String> authorList = new ArrayList<>();
  42. // String author=(String) messageMap.get("author");
  43. // if(author.contains(";")){
  44. // // 如果包含分号,则按分号分割
  45. // authorList = Arrays.asList(author.split(";"));
  46. // }else if (author.contains(",")){
  47. // // 如果包含分号,则按分号分割
  48. // authorList = Arrays.asList(author.split(","));
  49. // }
  50. // if(authorList.size()>0){
  51. // List authors=new ArrayList<>();
  52. // for (String name:authorList){
  53. // Map ma=new HashMap();
  54. // ma.put("id","");
  55. // ma.put("name",name);
  56. // // 组织信息
  57. // Map or =new HashMap();
  58. // or.put("id","");
  59. // or.put("name","");
  60. // List organs=new ArrayList<>();
  61. // organs.add(or);
  62. // ma.put("organs",organs);
  63. // authors.add(ma);
  64. // }
  65. // paper.setAuthors(authors);
  66. // }
  67. if (messageMap.containsKey("filePath")){
  68. List<String> file = (List) messageMap.get("filePath");
  69. List<String> cleanedList = new ArrayList<>();
  70. for (String url : file) {
  71. // 使用replaceAll方法移除匹配的<url>标签
  72. String cleanedUrl = url.replaceAll("http://192.168.0.41:8081", "");
  73. cleanedList.add(cleanedUrl);
  74. }
  75. paper.setFilePath(cleanedList);
  76. paper.setFilePathSize((List) messageMap.get("filePathSize"));
  77. }
  78. List<Map> authors= (List<Map>) messageMap.get("authors");
  79. if(authors.size()>0){
  80. List authorsall=new ArrayList<>();
  81. for (Map<String,Object>keyValueMap : authors){
  82. for (Map.Entry<String, Object> entry : keyValueMap.entrySet()) {
  83. String key = entry.getKey(); // 获取键
  84. Object value = entry.getValue(); // 获取值
  85. Map ma=new HashMap();
  86. ma.put("id","");
  87. ma.put("name",key);
  88. // 组织信息
  89. Map or =new HashMap();
  90. or.put("id","");
  91. or.put("name",value);
  92. List organs=new ArrayList<>();
  93. organs.add(or);
  94. ma.put("organs",organs);
  95. authorsall.add(ma);
  96. paper.setAuthors(authorsall);
  97. }
  98. }
  99. }
  100. //来源信息
  101. Map source=new HashMap();
  102. source.put("id", "");
  103. source.put("name", messageMap.get("crawlUrl"));
  104. source.put("type", "");
  105. List sourcelist=new ArrayList<>();
  106. sourcelist.add(source);
  107. paper.setSource(sourcelist);
  108. String inputDate = (String) messageMap.get("pubDate");
  109. String outputFormat = "yyyy-MM-dd";
  110. // 定义输入格式
  111. DateTimeFormatter inputFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
  112. // 解析输入字符串为 LocalDateTime 对象
  113. LocalDateTime dateTime = LocalDateTime.parse(inputDate, inputFormatter);
  114. // 格式化为所需的输出格式
  115. String formattedDate = dateTime.format(DateTimeFormatter.ofPattern(outputFormat));
  116. paper.setPubDate(formattedDate);
  117. paper.setDoi((String) messageMap.get("doi"));
  118. paper.setClassify((String) messageMap.get("classify"));
  119. paper.setKeywords((String) messageMap.get("keywords"));
  120. paper.setSummary((String) messageMap.get("summary"));
  121. paper.setTopics((String) messageMap.get("topics"));
  122. paper.setFieldsSubject((String) messageMap.get("fieldsSubject"));
  123. paper.setReferences((String) messageMap.get("references"));
  124. String docid = GetMD5Code((String) messageMap.get("doi"));
  125. paper.setDocId(docid);
  126. paper.setDataId(docid);
  127. paper.set_id_(docid);
  128. paper.setCountry((String) messageMap.get("country"));
  129. paper.setTranslatetitle("");
  130. paper.setTranslatekeywords ("");
  131. paper.setTranslatesummary("");
  132. paper.setIsshow("20250520");
  133. long dateTimenow = System.currentTimeMillis();
  134. paper.setCreateTime(dateTimenow);
  135. String createTimeStr=DataCheckUtil.getCurrentTime(dateTimenow);
  136. paper.setCreateTimeStr(createTimeStr);
  137. LOGGER.info("Parse Paper={}", JSONObject.toJSON(paper));
  138. ElastiProducerHigh elastiProducer = ElastiProducerHigh.getInstance(1, 3, "cl_special_1.0_paper_csci","_doc" );
  139. elastiProducer.sendMessageToEs(JsonUtils.toJSONString(paper));
  140. }
  141. catch (Exception e) {
  142. LOGGER.info("Parse PaperError={}", JSONObject.toJSON(inputMessage));
  143. e.printStackTrace();
  144. }
  145. }
  146. }
  147. }