专利、论文、临床、药物数据读取 卡夫卡写es
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

160 lines
7.1 KiB

package com.zyzs.otherdatasave.service;
import com.alibaba.fastjson.JSONObject;
import com.bfd.crawler.elasti.ElastiProducerHigh;
import com.bfd.crawler.utils.JsonUtils;
import com.zyzs.otherdatasave.bean.Clini;
import com.zyzs.otherdatasave.bean.Paper;
import com.zyzs.otherdatasave.cache.Constants;
import com.zyzs.otherdatasave.util.DataCheckUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import javax.annotation.PostConstruct;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.*;
import static com.zyzs.otherdatasave.util.MfMD5Util.GetMD5Code;
/**
 * Takes one raw paper message from the queue returned by {@code Constants.getPaper()},
 * maps its JSON fields onto a {@link Paper} and hands the serialized result to
 * {@link ElastiProducerHigh#sendMessageToEs}.
 */
@Service
public class QueryPaper {
    private static final Logger LOGGER = LoggerFactory.getLogger(QueryPaper.class);

    /** Messages whose length is not greater than this value are ignored. */
    private static final int MIN_MESSAGE_LENGTH = 10;

    /** Host prefix stripped from every {@code filePath} entry before it is stored. */
    private static final String FILE_HOST_PREFIX = "http://192.168.0.41:8081";

    /** Format of the incoming {@code pubDate} field. DateTimeFormatter is immutable, so it is shared. */
    private static final DateTimeFormatter PUB_DATE_INPUT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /** Format in which {@code pubDate} is stored on the {@link Paper}. */
    private static final DateTimeFormatter PUB_DATE_OUTPUT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd");

    @PostConstruct
    public void init() {
    }

    /**
     * Polls a single message and, if it is usable, converts and forwards it.
     *
     * <p>Returns silently when the queue is empty ({@code poll()} returned {@code null}) or the
     * message is too short. Any failure while parsing, mapping or sending is logged together
     * with the raw message and does not propagate to the caller.
     */
    public void query() {
        String inputMessage = Constants.getPaper().poll();
        if (Objects.isNull(inputMessage) || inputMessage.length() <= MIN_MESSAGE_LENGTH) {
            return;
        }

        Map<String, Object> messageMap;
        try {
            messageMap = JsonUtils.parseObject(inputMessage);
        } catch (Exception e) {
            // Stop here: continuing with an empty map could only fail later with an unrelated error.
            LOGGER.error("Parse PaperError={}", inputMessage, e);
            return;
        }
        if (messageMap == null) {
            LOGGER.error("Parse PaperError={}", inputMessage);
            return;
        }

        try {
            Paper paper = buildPaper(messageMap);
            LOGGER.info("Parse Paper={}", JSONObject.toJSON(paper));
            ElastiProducerHigh elastiProducer =
                    ElastiProducerHigh.getInstance(1, 3, "cl_special_1.0_paper_csci", "_doc");
            elastiProducer.sendMessageToEs(JsonUtils.toJSONString(paper));
        } catch (Exception e) {
            LOGGER.error("Parse PaperError={}", inputMessage, e);
        }
    }

    /**
     * Copies the fields of a parsed message onto a new {@link Paper}.
     *
     * @param messageMap parsed JSON message, never {@code null}
     * @return the populated paper
     * @throws NullPointerException if {@code pubDate} is absent
     * @throws java.time.format.DateTimeParseException if {@code pubDate} is not
     *         {@code yyyy-MM-dd HH:mm:ss}
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // Paper's setter parameter types are not visible here
    private Paper buildPaper(Map<String, Object> messageMap) {
        Paper paper = new Paper();
        paper.setTitle((String) messageMap.get("title"));
        paper.setCrawlUrl((String) messageMap.get("crawlUrl"));
        paper.setContent((String) messageMap.get("content"));

        Object rawFiles = messageMap.get("filePath");
        if (rawFiles != null) {
            paper.setFilePath(stripFileHost((List<String>) rawFiles));
            paper.setFilePathSize((List) messageMap.get("filePathSize"));
        }

        List authors = buildAuthors((List<Map<String, Object>>) messageMap.get("authors"));
        if (!authors.isEmpty()) {
            paper.setAuthors(authors);
        }

        // Source information: the crawl URL is used as the source name.
        Map<String, Object> source = new HashMap<>();
        source.put("id", "");
        source.put("name", messageMap.get("crawlUrl"));
        source.put("type", "");
        List sourceList = new ArrayList<>();
        sourceList.add(source);
        paper.setSource(sourceList);

        // pubDate stays mandatory, as before: a missing or malformed value aborts this record.
        String inputDate = Objects.requireNonNull(
                (String) messageMap.get("pubDate"), "pubDate is missing");
        LocalDateTime dateTime = LocalDateTime.parse(inputDate, PUB_DATE_INPUT);
        paper.setPubDate(dateTime.format(PUB_DATE_OUTPUT));

        paper.setDoi((String) messageMap.get("doi"));
        paper.setClassify((String) messageMap.get("classify"));
        paper.setKeywords((String) messageMap.get("keywords"));
        paper.setSummary((String) messageMap.get("summary"));
        paper.setTopics((String) messageMap.get("topics"));
        paper.setFieldsSubject((String) messageMap.get("fieldsSubject"));
        paper.setReferences((String) messageMap.get("references"));

        // The document identifiers are all the MD5 of the DOI.
        String docId = GetMD5Code((String) messageMap.get("doi"));
        paper.setDocId(docId);
        paper.setDataId(docId);
        paper.set_id_(docId);

        paper.setCountry((String) messageMap.get("country"));
        paper.setTranslatetitle("");
        paper.setTranslatekeywords("");
        paper.setTranslatesummary("");
        paper.setIsshow("20250520");

        long now = System.currentTimeMillis();
        paper.setCreateTime(now);
        paper.setCreateTimeStr(DataCheckUtil.getCurrentTime(now));
        return paper;
    }

    /**
     * Removes the file-server host prefix from every entry.
     *
     * <p>Uses literal replacement; the prefix contains dots that a regex would treat as wildcards.
     */
    private static List<String> stripFileHost(List<String> urls) {
        List<String> cleaned = new ArrayList<>(urls.size());
        for (String url : urls) {
            cleaned.add(url.replace(FILE_HOST_PREFIX, ""));
        }
        return cleaned;
    }

    /**
     * Converts the incoming {@code authors} list into the stored author structure.
     *
     * <p>Each entry of each incoming map becomes one author: the key is used as the author name
     * and the value as the name of its single organization. Ids are left empty.
     *
     * @param authors incoming list, may be {@code null}
     * @return list of author maps; empty when there is nothing to convert
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static List buildAuthors(List<Map<String, Object>> authors) {
        List result = new ArrayList<>();
        if (authors == null) {
            return result;
        }
        for (Map<String, Object> nameToOrgan : authors) {
            if (nameToOrgan == null) {
                continue;
            }
            for (Map.Entry<String, Object> entry : nameToOrgan.entrySet()) {
                // Organization information.
                Map<String, Object> organ = new HashMap<>();
                organ.put("id", "");
                organ.put("name", entry.getValue());
                List organs = new ArrayList<>();
                organs.add(organ);

                Map<String, Object> author = new HashMap<>();
                author.put("id", "");
                author.put("name", entry.getKey());
                author.put("organs", organs);
                result.add(author);
            }
        }
        return result;
    }
}