Browse Source

英文文献数据采集

master
maojian 7 months ago
commit
ffc71d82fa
  1. 3
      .gitignore
  2. 163
      pom.xml
  3. 28
      src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java
  4. 55
      src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java
  5. 25
      src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java
  6. 19
      src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java
  7. 33
      src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java
  8. 33
      src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java
  9. 31
      src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java
  10. 336
      src/main/java/com/bfd/youzhiapi/service/ScheduleService.java
  11. 88
      src/main/java/com/bfd/youzhiapi/util/HttpUtil.java
  12. 86
      src/main/java/com/bfd/youzhiapi/util/KfkUtil.java
  13. 270
      src/main/java/com/bfd/youzhiapi/util/Md5SignUtil.java
  14. 29
      src/main/java/com/bfd/youzhiapi/util/Utils.java
  15. 26
      src/main/resources/application.yml
  16. 38
      src/main/resources/logback-spring.xml
  17. 17
      src/main/resources/mapper/ScheduleMapper.xml
  18. 13
      src/test/java/com/bfd/youzhiapi/YouzhiApiApplicationTests.java

3
.gitignore

@ -0,0 +1,3 @@
/target/
/logs/
/.idea/

163
pom.xml

@ -0,0 +1,163 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.bfd</groupId>
<artifactId>youzhiApi</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>youzhiApi</name>
<description>youzhiApi</description>
<properties>
<java.version>1.8</java.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<spring-boot.version>2.6.13</spring-boot.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>2.2.2</version>
</dependency>
<dependency>
<groupId>com.mysql</groupId>
<artifactId>mysql-connector-j</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba.fastjson2</groupId>
<artifactId>fastjson2</artifactId>
<version>2.0.17</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.27</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>2.7.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-dependencies</artifactId>
<version>${spring-boot.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<!--不打入jar包的文件类型或者路径-->
<excludes>
<exclude>*.properties</exclude>
<exclude>*.yml</exclude>
<exclude>*.yaml</exclude>
</excludes>
<archive>
<manifest>
<!-- 执行的主程序路径 -->
<mainClass>com.bfd.youzhiapi.YouzhiApiApplication</mainClass>
<!--是否要把第三方jar放到manifest的classpath中-->
<addClasspath>true</addClasspath>
<!--生成的manifest中classpath的前缀,因为要把第三方jar放到lib目录下,所以classpath的前缀是lib/-->
<classpathPrefix>lib/</classpathPrefix>
<!-- 打包时 MANIFEST.MF 文件不记录的时间戳版本 -->
<useUniqueVersions>false</useUniqueVersions>
</manifest>
<manifestEntries>
<!-- 在 Class-Path 下添加配置文件的路径 -->
<Class-Path>config/</Class-Path>
</manifestEntries>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib/</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
    <artifactId>maven-resources-plugin</artifactId>
    <executions>
        <execution>
            <id>copy-resources</id>
            <phase>package</phase>
            <goals>
                <goal>copy-resources</goal>
            </goals>
            <configuration>
                <resources>
                    <!-- Copy the configuration files (which the jar plugin excludes from the jar) to target/config -->
                    <resource>
                        <directory>src/main/resources/</directory>
                        <includes>
                            <include>*.properties</include>
                            <include>*.yml</include>
                            <!-- was an <exclude> element nested inside <includes>; *.yaml is stripped from
                                 the jar as well, so it has to be copied next to it like the other formats -->
                            <include>*.yaml</include>
                        </includes>
                    </resource>
                </resources>
                <outputDirectory>${project.build.directory}/config</outputDirectory>
            </configuration>
        </execution>
    </executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

28
src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java

@ -0,0 +1,28 @@
package com.bfd.youzhiapi;
import com.bfd.youzhiapi.service.ScheduleService;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.scheduling.annotation.EnableScheduling;
import javax.annotation.Resource;
/**
 * Spring Boot entry point of the collector.
 *
 * <p>{@code @EnableScheduling} activates the {@code @Scheduled} methods of the application's
 * beans, so nothing has to be started by hand after the context is up.
 */
@SpringBootApplication
@EnableScheduling
public class YouzhiApiApplication {
    @Resource
    ScheduleService scheduleService;

    /**
     * Boots the Spring context.
     *
     * @param args command-line arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        // The returned context was previously used only to look up a bean that was never used.
        SpringApplication.run(YouzhiApiApplication.class, args);
    }
}

55
src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java

@ -0,0 +1,55 @@
package com.bfd.youzhiapi.config;
/**
 * Maps the document-type value stored with a collection task ({@code field}) to the type code
 * understood by the literature search API ({@code code}) and to a display label ({@code type}).
 *
 * @author guowei
 */
public enum DataTypeEnum {
    /** Journal article. */
    PERIODICAL(10, 1,"学术期刊"),
    /** Degree thesis / dissertation. */
    DISSERTATION(20, 2,"学位论文"),
    /** Conference paper. */
    CONFERENCE(30, 3,"会议");

    private final Integer code;
    private final Integer field;
    private final String type;

    DataTypeEnum(Integer code, Integer field,String type) {
        this.code = code;
        this.field = field;
        this.type = type;
    }

    /** @return the type code used by the search API */
    public Integer getCode() {
        return code;
    }

    /** @return the document-type value used on the task side */
    public Integer getField() {
        return field;
    }

    /** @return the display label of this document type */
    public String getType() {
        return type;
    }

    /**
     * Looks up the search-API code for a task-side document-type value.
     *
     * @param field task-side document-type value, may be {@code null}
     * @return the matching API code, or {@code null} when nothing matches
     */
    public static Integer getCodeByField(Integer field) {
        for (DataTypeEnum dataTypeEnum : DataTypeEnum.values()) {
            // equals(), not ==: boxed Integers are only reference-equal when they come from the
            // small-value cache, which callers cannot be relied on to use.
            if (dataTypeEnum.getField().equals(field)) {
                return dataTypeEnum.getCode();
            }
        }
        return null;
    }

    /**
     * Looks up the display label for a search-API type code.
     *
     * @param code search-API type code, may be {@code null}
     * @return the matching label, or {@code null} when nothing matches
     */
    public static String getTypeByCode(Integer code) {
        for (DataTypeEnum dataTypeEnum : DataTypeEnum.values()) {
            if (dataTypeEnum.getCode().equals(code)) {
                return dataTypeEnum.getType();
            }
        }
        return null;
    }
}

25
src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java

@ -0,0 +1,25 @@
package com.bfd.youzhiapi.config;
import org.springframework.stereotype.Component;
/**
 * Constants used when calling the external literature API.
 *
 * NOTE(review): the credentials below are committed in plain text; consider moving them to
 * externalized configuration and rotating them.
 *
 * @author guowei
 */
@Component
public class GlobalConfig {
/**
 * appId used for the external API
 */
public static final String APPID = "c4d532304c6b4497b1ad";
/**
 * appSecret used for the external API
 */
public static final String APPSECRET = "dc41973ee03e471887c77c4a532dbfc3";
/**
 * Organization id used for the external API
 */
public static final Integer ORGANID = 392;
}

19
src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java

@ -0,0 +1,19 @@
package com.bfd.youzhiapi.entity;
import lombok.Data;
/**
 * Row of the download cache that is looked up by DOI (see ScheduleMapper#queryCacheByDoi).
 * Accessors are generated by Lombok's {@code @Data}.
 *
 * NOTE(review): column semantics are not visible here; the field comments below are
 * read from the names only and should be confirmed against the table definition.
 *
 * @author guowei
 */
@Data
public class CacheEntity {
// presumably the primary key - TODO confirm
private int id;
// DOI used as the lookup key
private String doi;
// presumably the identifier of the download request on the remote side - TODO confirm
private String downloadId;
private String uploadTime;
private String downloadUrl;
}

33
src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java

@ -0,0 +1,33 @@
package com.bfd.youzhiapi.entity;
import lombok.Data;
import java.util.ArrayList;
import java.util.List;
/**
 * Message payload that is serialized to JSON and sent to Kafka for every collected record.
 * Accessors are generated by Lombok's {@code @Data}; the snake_case field names are part of
 * the serialized format.
 *
 * @author guowei
 */
@Data
public class KfkEntity {
// free-form attributes; the collector stores a map holding the task's attachTag here
private Object attr;
private String author;
// JSON string with the detailed bibliographic summary of the record
private String brief;
// defaults to "Nkyzd"
private String cid = "Nkyzd";
private String content;
private String dedupKey;
private String field;
private String forwardcontent;
private String iid;
// defaults to false
private Boolean isDownload = false;
private String news_id;
// publication time as a "yyyy-MM-dd HH:mm:ss" string
private String post_time;
private String searchKeyword;
private String source;
// defaults to an empty list
private List tasks = new ArrayList<>();
private String title;
// defaults to "newscontent"
private String type = "newscontent";
private String url;
// defaults to "1"
private String version = "1";
}

33
src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java

@ -0,0 +1,33 @@
package com.bfd.youzhiapi.entity;
import lombok.Data;
/**
 * Collection task as returned by ScheduleMapper. Accessors are generated by Lombok's
 * {@code @Data}.
 *
 * NOTE(review): only the fields commented below are used by the code visible in this
 * commit; the meaning of the remaining columns should be confirmed against the task table.
 *
 * @author guowei
 */
@Data
public class TaskEntity {
private Integer rid;
private Integer siteId;
private String cid;
private String channelName;
// search keyword; sent to the search API as the "title" parameter
private String keyword;
private Integer pageTypeID;
private Integer weight;
private String url;
private Integer pageIdx;
// maximum number of pages to collect; the scheduler replaces -1 with 20
private Integer nextPageTime;
// task status; the scheduler queries status 1 and writes 3 after collecting
private Integer status;
private Integer intv;
// JSON string; the scheduler reads its "field" and "documentType" entries
private String attachTag;
private String lastcrawltime;
private String nextcrawltime;
private String createTime;
private String modiTime;
private Integer crawl_mode;
private Integer crawl_account;
private String page_switchs;
private Integer task_hash_code;
// start of the collection range, epoch milliseconds (only the year is used)
private Long crawlStartTime;
// end of the collection range, epoch milliseconds (only the year is used)
private Long crawlEndTime;
}

31
src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java

@ -0,0 +1,31 @@
package com.bfd.youzhiapi.mapper;
import com.bfd.youzhiapi.entity.CacheEntity;
import com.bfd.youzhiapi.entity.TaskEntity;
import org.apache.ibatis.annotations.Mapper;
import org.springframework.stereotype.Repository;
import java.util.List;
/**
 * MyBatis mapper for collection tasks and the DOI download cache.
 * NOTE(review): the SQL lives in mapper/ScheduleMapper.xml, which is not visible here;
 * the descriptions below follow the method signatures only.
 *
 * @author guowei
 */
@Mapper
public interface ScheduleMapper {
/**
 * Queries tasks by status.
 * @param status task status to select
 * @return the matching tasks
 */
List<TaskEntity> queryTaskByStatus(int status);
/**
 * Updates the status of a task.
 * @param taskEntity task carrying the new status
 * @return presumably the number of affected rows - TODO confirm against the mapper XML
 */
int updateTaskStatus(TaskEntity taskEntity);
/**
 * Looks up a download-cache entry by DOI.
 * @param doi DOI to look up
 * @return the cache entry; presumably {@code null} when none exists - TODO confirm
 */
CacheEntity queryCacheByDoi(String doi);
}

336
src/main/java/com/bfd/youzhiapi/service/ScheduleService.java

@ -0,0 +1,336 @@
package com.bfd.youzhiapi.service;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.crypto.SecureUtil;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.bfd.youzhiapi.config.DataTypeEnum;
import com.bfd.youzhiapi.config.GlobalConfig;
import com.bfd.youzhiapi.entity.CacheEntity;
import com.bfd.youzhiapi.entity.KfkEntity;
import com.bfd.youzhiapi.entity.TaskEntity;
import com.bfd.youzhiapi.mapper.ScheduleMapper;
import com.bfd.youzhiapi.util.HttpUtil;
import com.bfd.youzhiapi.util.KfkUtil;
import com.bfd.youzhiapi.util.Md5SignUtil;
import com.bfd.youzhiapi.util.Utils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author guowei
*/
@Service
@Slf4j
public class ScheduleService{
@Resource
ScheduleMapper scheduleMapper;
@Scheduled(cron = "0 0/2 * * * ?")
public void run() {
List<TaskEntity> taskEntities = scheduleMapper.queryTaskByStatus(1);
log.info("查询到{}条未采集任务", taskEntities.size());
for (TaskEntity task : taskEntities) {
log.info("开始采集任务:{}", task.getKeyword());
//最大翻页限制20页
Integer maxPageNum = task.getNextPageTime();
if (maxPageNum == -1){ maxPageNum = 20;}
String attachTag = task.getAttachTag();
JSONObject attrJSON = JSONObject.parseObject(attachTag);
if (!attrJSON.containsKey("field")) {
log.error("没找到field,keyword:{},跳过采集", task.getKeyword());
continue;
}
Integer field = attrJSON.getInteger("field");
String documentType = attrJSON.getString("documentType");
String[] documentSplit = documentType.split(",");
List<String> apiType = new ArrayList<>();
//根据field 映射 检索接口的参数
for (String type : documentSplit) {
Integer codeByField = DataTypeEnum.getCodeByField(Integer.valueOf(type));
apiType.add(String.valueOf(codeByField));
}
if (apiType.size() == 0) {
log.error("没找到documentType,keyword:{},跳过采集", task.getKeyword());
continue;
} else {
log.info("采集类型:{}", String.join(",", apiType));
}
Long crawlStartTime = task.getCrawlStartTime();
Long crawlEndTime = task.getCrawlEndTime();
//获取采集范围 年份
int yearStart = Utils.getYearFromTimestamp(crawlStartTime);
int yearend = Utils.getYearFromTimestamp(crawlEndTime);
log.info("采集年份范围:{} ~ {}",yearStart,yearend);
//每个采集类型都采集一遍
for(String type:apiType) {
int currentPageNum = 1;
log.info("开始采集 {} 类型数据",type);
do {
try {
SortedMap<Object, Object> parameters = new TreeMap<Object, Object>();
parameters.put("randomStr", DateUtil.format(new Date(), "yyyyMMddHHmmss"));
parameters.put("appId", GlobalConfig.APPID);
parameters.put("title", task.getKeyword());
parameters.put("page", currentPageNum);
parameters.put("pageSize", 10);
parameters.put("type", type);
String sign = Md5SignUtil.sign(parameters);
parameters.put("sign", sign);
String data = HttpUtil.getData(JSON.toJSONString(parameters));
JSONObject jsonObject = JSONObject.parseObject(data);
JSONObject data1 = jsonObject.getJSONObject("data");
JSONArray records = data1.getJSONArray("records");
for (Object record : records) {
JSONObject item = (JSONObject) record;
KfkEntity kfk = new KfkEntity();
Map attr = new HashMap<>();
attr.put("attachTag", JSON.parse(task.getAttachTag()));
kfk.setAttr(attr);
kfk.setAuthor(item.getString("author"));
kfk.setContent(item.getString("abstractE"));
kfk.setForwardcontent(item.getString("abstractE"));
String organ = item.getString("organ");
Map brief = new HashMap<>();
kfk.setField(String.valueOf(field));
kfk.setTitle(Jsoup.parse(item.getString("title")).text());
kfk.setSearchKeyword(task.getKeyword());
String source = "";
switch (item.getInteger("type")) {
case 10:
source = item.getString("name");
break;
case 20:
source = item.getString("school");
break;
case 30:
source = item.getString("conferenceName");
break;
}
kfk.setSource(source);
// if (item.getString("abstractURL") != null) {
// kfk.setUrl(item.getString("abstractURL"));
// } else {
// kfk.setUrl(item.getString("pdfURL"));
// }
String uuid = IdUtil.simpleUUID();
kfk.setIid(uuid);
kfk.setNews_id(uuid);
kfk.setUrl(uuid);
String year = item.getString("year");
//判断年份 是不是4位数字 有错误数据的情况
boolean fourDigitNumber = Utils.isFourDigitNumber(year);
if (!fourDigitNumber) {
log.error("year不是4位数字,跳过,year:{}", year);
continue;
}
if (!Utils.isYearInRange(Integer.parseInt(year), yearStart, yearend)) {
log.error("year不在采集年份范围,跳过,year:{}", year);
continue;
}
kfk.setPost_time(item.getString("year") + "-01-01 00:00:00");
kfk.setField(String.valueOf(field));
//没有机构字段 并且是学位论文学校作为机构
if (organ == null || organ == "") {
organ = item.getString("school");
List agencys = new ArrayList<>();
if (organ == null || organ == "") {
organ = "";
} else {
Map<String, Object> agency = new HashMap<>();
agency.put("name", organ);
agency.put("url", IdUtil.simpleUUID()); // 添加第二个字段
agencys.add(agency);
}
brief.put("agency", organ);
brief.put("agencys", agencys);
brief.put("author", item.getString("author"));
Map<String, Object> agencyAuthor = new HashMap<>();
agencyAuthor.put("agency", organ);
agencyAuthor.put("author", item.getString("author")); // 添加第二个字段
List authorAndAgency = new ArrayList<>();
authorAndAgency.add(agencyAuthor);
brief.put("authorAndAgency", authorAndAgency);
} else {
List agencys = new ArrayList<>();
List agencyString = new ArrayList<>();
Pattern pattern = Pattern.compile("\\[([a-z\\d])\\]([^;]+)");
Matcher matcher = pattern.matcher(organ);
Map agencyMap = new HashMap<>();
while (matcher.find()) {
String key = matcher.group(1); // 获取编号
String value = matcher.group(2).trim(); // 获取机构名称并去除前后空格
agencyMap.put(key, value);
agencyString.add(value);
Map agency = new HashMap<>();
agency.put("name", value);
agency.put("url", IdUtil.simpleUUID());
agencys.add(agency);
}
brief.put("agency", String.join(",", agencyString));
brief.put("agencys", agencys);
// 正则表达式匹配模式匹配 "姓名[编号][编号]..."
// Pattern patternAuthor = Pattern.compile("([\\p{L} .]+)(\\[\\d+])+(?=;|$)");
Pattern patternAuthor = Pattern.compile("([\\p{L} .-]+)((\\[\\d+\\])|(\\[[a-zA-Z,]+\\]))+(?=;|$)");
Matcher matcherAuthor = patternAuthor.matcher(item.getString("author"));
Map<String, List<String>> authorAffiliations = new HashMap<>();
while (matcherAuthor.find()) {
String name = matcherAuthor.group(1).trim(); // 获取姓名并去除前后空格
String affiliationPart = matcherAuthor.group(0); // 获取整个匹配串
// 提取所有编号
List<String> affiliations = new ArrayList<>();
Matcher numberMatcher = Pattern.compile("\\[(\\d+)]").matcher(affiliationPart);
while (numberMatcher.find()) {
affiliations.add(numberMatcher.group(1));
}
Matcher letterMatcher = Pattern.compile("\\[(.*?)\\]").matcher(affiliationPart);
if (letterMatcher.find()) {
String values = letterMatcher.group(1);
String[] items = values.split(",");
for (String key : items) {
affiliations.add(key);
}
}
authorAffiliations.put(name, affiliations);
}
Set<String> strings = authorAffiliations.keySet();
List authorAndAgency = new ArrayList<>();
if (strings.size() > 0) {
brief.put("author", String.join(",", strings));
kfk.setAuthor(String.join(",", strings));
for (String name : strings) {
List<String> organNum = authorAffiliations.get(name);
for (String authorOrgan : organNum) {
Map<String, Object> agencyAuthor = new HashMap<>();
agencyAuthor.put("agency", agencyMap.get(authorOrgan));
agencyAuthor.put("author", name);
authorAndAgency.add(agencyAuthor);
}
}
brief.put("authorAndAgency", authorAndAgency);
} else {
brief.put("author", item.getString("author"));
brief.put("authorAndAgency", authorAndAgency);
}
}
brief.put("author_agency_urls", new ArrayList<>());
brief.put("author_urls", new ArrayList<>());
brief.put("data", DataTypeEnum.getTypeByCode(item.getInteger("type")));
brief.put("date", DateUtil.formatDate(DateUtil.parse(kfk.getPost_time())));
brief.put("detailUrl", kfk.getUrl());
brief.put("download", "");
brief.put("eisci", "");
brief.put("fileUrl", "");
brief.put("funding", "");
brief.put("id", item.getString("id"));
brief.put("initial_mark", "");
brief.put("keyword", task.getKeyword());
brief.put("keywords", item.getString("keyword"));
String language = item.getString("language");
if (StrUtil.isEmpty(language)){
language = "英语";
}
brief.put("language", language);
brief.put("paramter", new HashMap<>());
brief.put("publish_agency_urls", new ArrayList<>());
brief.put("quote", "");
brief.put("source", source);
brief.put("summary", kfk.getContent().replace("\"", ""));
brief.put("title", kfk.getTitle());
brief.put("wxtype", language);
kfk.setBrief(JSON.toJSONString(brief));
/**2024.12.27 新增下载文件需求,下载文件需要3个接口
* 1.新增单条需求根据doi
* 2.获取清单已上传时间查询
* 3.获取下载链接
* 根据doi 查询缓存表里是否已经下载过附件有的话 直接下载推送
* 没有的话 放入队列等待处理完成
*/
// String doi = item.getString("doi");
// CacheEntity cacheEntity = scheduleMapper.queryCacheByDoi(doi);
// if (cacheEntity == null){
// log.info("缓存库没有此数据,新增数据");
// //组装参数
// SortedMap<Object, Object> params = new TreeMap<Object, Object>();
// params.put("randomStr", DateUtil.format(new Date(), "yyyyMMddHHmmss"));
// params.put("appId", GlobalConfig.APPID);
// params.put("organId",GlobalConfig.ORGANID);
// params.put("doi",doi);
// String uploadSign = Md5SignUtil.sign(params);
// params.put("sign", uploadSign);
// String uploadData = HttpUtil.uploadDoi(JSON.toJSONString(params));
// JSONObject uploadJon = JSONObject.parseObject(uploadData);
//
// cacheEntity = new CacheEntity();
// cacheEntity.setDoi(doi);
//
// }
System.out.println(JSON.toJSONString(kfk));
KfkUtil.sendKafka(JSON.toJSONString(kfk));
}
Integer totalPage = data1.getInteger("pages");
if (maxPageNum > totalPage) {
maxPageNum = totalPage;
}
log.info("第{}页采集,最大限制页数{},共{}页", currentPageNum, maxPageNum, totalPage);
currentPageNum++;
Thread.sleep(1000 * 5);
} catch (Exception e) {
e.printStackTrace();
log.error("采集失败", e);
}
} while (currentPageNum <= maxPageNum);
log.info("采集完成 {} 类型数据",type);
}
log.info("关键词:{} 采集完成",task.getKeyword());
task.setStatus(3);
scheduleMapper.updateTaskStatus(task);
}
}
public static void main(String[] args) {
Map<String, List<String>> authorAffiliations = new HashMap<>();
String author = "Utku Kumbul[1]; Faruk Uysal[1]; Cicero S. Vaucher[1][2]; Alexander Yarovoy[1]";
Pattern pattern = Pattern.compile("([\\p{L} .]+)(\\[\\d+])+(?=;|$)");
Matcher matcher = pattern.matcher(author);
while (matcher.find()) {
String name = matcher.group(1).trim(); // 获取姓名并去除前后空格
String affiliationPart = matcher.group(0); // 获取整个匹配串
// 提取所有编号
List<String> affiliations = new ArrayList<>();
Matcher numberMatcher = Pattern.compile("\\[(\\d+)]").matcher(affiliationPart);
while (numberMatcher.find()) {
affiliations.add(numberMatcher.group(1));
}
authorAffiliations.put(name, affiliations);
}
// 打印结果
authorAffiliations.forEach((k, v) -> System.out.println(k + " -> " + v));
}
}

88
src/main/java/com/bfd/youzhiapi/util/HttpUtil.java

@ -0,0 +1,88 @@
package com.bfd.youzhiapi.util;
import com.alibaba.fastjson2.JSONObject;
import lombok.extern.slf4j.Slf4j;
import okhttp3.*;
import org.springframework.stereotype.Component;
import java.util.concurrent.TimeUnit;
/**
* @author guowei
*/
@Component
@Slf4j
public class HttpUtil {
/**
* 关键词检索 请求接口
* @param parameters
* @return
*/
public static String getData(String parameters){
String result = "";
try {
// 目标 URL
String url = "http://api.keyanzhidian.com/api/literature/search";
// 创建 JSON 请求体
MediaType JSON = MediaType.parse("application/json; charset=utf-8");
// 使用 fastjson 构建 JSON
// JSONObject jsonObject = new JSONObject();
// jsonObject.put("content", content);
// String json = jsonObject.toJSONString();
RequestBody body = RequestBody.create(JSON, parameters);
// 构建 POST 请求
Request request = new Request.Builder()
.url(url)
.post(body)
.build();
OkHttpClient client = new OkHttpClient.Builder()
.connectTimeout(60, TimeUnit.SECONDS) // 连接超时
.readTimeout(30, TimeUnit.SECONDS) // 读取超时
.writeTimeout(15, TimeUnit.SECONDS) // 写入超时
.build();
Response response = client.newCall(request).execute();
if (response.isSuccessful()) {
result = response.body().string();
log.warn("Response: " + result);
}
}catch (Exception e){
e.printStackTrace();
}
return result;
}
/**
* 新增单条需求
* @param parameters
* @return
*/
public static String uploadDoi(String parameters){
String result = "";
try {
// 目标 URL
String url = "http://api.keyanzhidian.com/api/demand/create";
// 创建 JSON 请求体
MediaType JSON = MediaType.parse("application/json; charset=utf-8");
RequestBody body = RequestBody.create(JSON, parameters);
// 构建 POST 请求
Request request = new Request.Builder()
.url(url)
.post(body)
.build();
OkHttpClient client = new OkHttpClient.Builder()
.connectTimeout(60, TimeUnit.SECONDS) // 连接超时
.readTimeout(30, TimeUnit.SECONDS) // 读取超时
.writeTimeout(15, TimeUnit.SECONDS) // 写入超时
.build();
Response response = client.newCall(request).execute();
if (response.isSuccessful()) {
result = response.body().string();
log.warn("Response: " + result);
}
}catch (Exception e){
e.printStackTrace();
}
return result;
}
}

86
src/main/java/com/bfd/youzhiapi/util/KfkUtil.java

@ -0,0 +1,86 @@
package com.bfd.youzhiapi.util;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.util.Properties;
/**
* @author guowei
* kfk工具类
*/
@Component
@Slf4j
public class KfkUtil {
private static String topic;
private static String brokerList;
@Value("${crawl.kafka.topic}")
public void setTopic(String topic) {
KfkUtil.topic = topic;
}
@Value("${crawl.kafka.brokers}")
public void setBrokerList(String brokerList) {
KfkUtil.brokerList = brokerList;
}
private static KafkaProducer<String, String> kafkaProducer;
public static int num = 0;
/**
* 获取KafkaProducer实例
*/
public static KafkaProducer<String, String> getProducer() {
// synchronized (kafkaProducer) {
if (kafkaProducer == null) {
Properties props = new Properties();
//xxx服务器ip
props.put("bootstrap.servers", brokerList);
// props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,brokerList);
//所有follower都响应了才认为消息提交成功"committed"
props.put("acks", "all");
//retries = MAX 无限重试直到你意识到出现了问题:)
props.put("retries", 3);
//producer将试图批处理消息记录以减少请求次数.默认的批量处理消息字节数
props.put("batch.size", 16384);
//batch.size当批量的数据大小达到设定值后就会立即发送不顾下面的linger.ms
//延迟1ms发送这项设置将通过增加小的延迟来完成--不是立即发送一条记录producer将会等待给定的延迟时间以允许其他消息记录发送这些消息记录可以批量处理
props.put("linger.ms", 1);
//producer可以用来缓存数据的内存大小
props.put("buffer.memory", 33554432);
props.put("key.serializer",
StringSerializer.class.getName());
props.put("value.serializer",
StringSerializer.class.getName());
kafkaProducer = new KafkaProducer<String, String>(props);
}
// }
return kafkaProducer;
}
/**
* 关闭KafkaProducer实例
*/
public static void closeProducer() {
if (kafkaProducer != null) {
log.info("----------close producer----------");
kafkaProducer.close();
kafkaProducer = null;
}
}
public static void sendKafka(String resultData) {
KafkaProducer<String, String> producer = getProducer();
ProducerRecord<String, String> se = new ProducerRecord<String, String>(topic, resultData);
producer.send(se);
log.info("发送kafka成功");
// num++;
}
}

270
src/main/java/com/bfd/youzhiapi/util/Md5SignUtil.java

@ -0,0 +1,270 @@
package com.bfd.youzhiapi.util;
import cn.hutool.core.date.DateUtil;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.bfd.youzhiapi.config.GlobalConfig;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.*;
/**
 * Request signing for the external API.
 *
 * <p>Signing rule: all non-empty parameters except {@code sign} and {@code key} are sorted by
 * name, joined as {@code key1=value1&key2=value2&...}, the application secret is appended as
 * {@code appSecret=...}, and the upper-case hexadecimal MD5 of that string is the signature.
 */
@Component
@Slf4j
public class Md5SignUtil {
    public static String Encoding_utf8 = "UTF-8";
    private static final String hexDigits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"};

    /**
     * Computes the signature of the given parameters.
     *
     * @param parameters request parameters; the sorted map supplies the required name order
     * @return the upper-case hexadecimal signature
     * @throws IllegalStateException when the digest cannot be computed
     */
    public static String sign(SortedMap<Object, Object> parameters) {
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<Object, Object> entry : parameters.entrySet()) {
            String k = String.valueOf(entry.getKey());
            Object v = entry.getValue();
            if (null != v && !"".equals(v) && !"sign".equals(k) && !"key".equals(k)) {
                sb.append(k).append('=').append(v).append('&');
            }
        }
        // Logged before the secret is appended so that the secret never reaches the log.
        log.debug("Md5SignUtil.sign加密串为:{}", sb);
        sb.append("appSecret=").append(GlobalConfig.APPSECRET);
        String sign = MD5Encode(sb.toString(), Encoding_utf8).toUpperCase();
        log.debug("Md5SignUtil.sign加密串后的签名为:{}", sign);
        return sign;
    }

    /**
     * Computes the MD5 digest of a string as upper-case hexadecimal text.
     *
     * @param str text to hash; {@code null} yields {@code null}
     * @param charsetName charset used to encode the text; the platform default is used when it
     *                    is {@code null} or empty
     * @return the upper-case hexadecimal digest, or {@code null} for {@code null} input
     * @throws IllegalStateException when MD5 is unavailable
     * @throws IllegalArgumentException when the charset name is illegal or unsupported
     */
    public static String MD5Encode(String str, String charsetName) {
        if (str == null) {
            return null;
        }
        Charset charset = (charsetName == null || "".equals(charsetName))
                ? Charset.defaultCharset()
                : Charset.forName(charsetName);
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            return byteArrayToHexString(md.digest(str.getBytes(charset)));
        } catch (java.security.NoSuchAlgorithmException e) {
            // Failures used to be swallowed and the unhashed input (which contains the app
            // secret) was returned as the "signature"; fail loudly instead.
            throw new IllegalStateException("MD5 digest is not available", e);
        }
    }

    /** Converts the bytes to upper-case hexadecimal text. */
    private static String byteArrayToHexString(byte b[]) {
        StringBuilder resultSb = new StringBuilder();
        for (int i = 0; i < b.length; i++) {
            resultSb.append(byteToHexString(b[i]));
        }
        return resultSb.toString().toUpperCase();
    }

    /** Converts one byte, read as unsigned, to two hexadecimal digits. */
    private static String byteToHexString(byte b) {
        int n = b;
        if (n < 0) {
            n += 256;
        }
        int d1 = n / 16;
        int d2 = n % 16;
        return hexDigits[d1] + hexDigits[d2];
    }

    /**
     * Manual check: prints signed sample requests for the search, demand-create, demand-list
     * and download calls.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String appId = GlobalConfig.APPID;
        String randomStr = DateUtil.format(new Date(), "yyyyMMddHHmmss");

        SortedMap<Object, Object> search = new TreeMap<Object, Object>();
        search.put("randomStr", randomStr);
        search.put("appId", appId);
        search.put("title", "radar");
        search.put("page", 1);
        search.put("pageSize", 10);
        search.put("type", "30");
        signAndPrint(search);

        SortedMap<Object, Object> demandCreate = new TreeMap<Object, Object>();
        demandCreate.put("appId", appId);
        demandCreate.put("randomStr", randomStr);
        demandCreate.put("doi", "10.1109/ICCC57789.2023.10164991");
        demandCreate.put("organId", 392);
        signAndPrint(demandCreate);

        SortedMap<Object, Object> demandList = new TreeMap<Object, Object>();
        demandList.put("appId", appId);
        demandList.put("randomStr", randomStr);
        demandList.put("page", 1);
        demandList.put("pageSize", 10);
        demandList.put("organId", 392);
        signAndPrint(demandList);

        SortedMap<Object, Object> download = new TreeMap<Object, Object>();
        download.put("appId", appId);
        download.put("randomStr", randomStr);
        download.put("id", 8417);
        signAndPrint(download);
    }

    /** Signs the parameters, adds the signature as "sign" and prints the resulting JSON. */
    private static void signAndPrint(SortedMap<Object, Object> parameters) {
        String signature = sign(parameters);
        log.info("生成的签名串是:" + signature);
        parameters.put("sign", signature);
        System.out.println(new JSONObject(parameters));
    }
}

29
src/main/java/com/bfd/youzhiapi/util/Utils.java

@ -0,0 +1,29 @@
package com.bfd.youzhiapi.util;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.regex.Pattern;
import org.springframework.stereotype.Component;
/**
* @author guowei
*/
@Component
public class Utils {
public static boolean isFourDigitNumber(String input) {
return input.matches("\\d{4}");
}
public static int getYearFromTimestamp(long timestamp) {
// 使用 UTC 时区转换为年份
ZonedDateTime dateTime = Instant.ofEpochMilli(timestamp).atZone(ZoneId.of("Asia/Shanghai"));
return dateTime.getYear();
}
public static boolean isYearInRange(int year, int startYear, int endYear) {
// 判断年份是否在区间内
return year >= startYear && year <= endYear;
}
}

26
src/main/resources/application.yml

@ -0,0 +1,26 @@
# Application configuration: datasource, crawl/Kafka settings, MyBatis, HTTP port and logging.
spring:
datasource:
driver-class-name: com.mysql.cj.jdbc.Driver
url: jdbc:mysql://172.18.1.134:3306/cnki_crawl
username: crawl666
# NOTE(review): database credentials are committed in plain text — consider supplying them
# through environment variables or an external secret store.
password: lx2a4jN1xFT96kj20LU=
crawl:
kafka:
topic: zhiWangTest2
brokers: 172.18.1.146:9092,172.18.1.147:9092,172.18.1.148:9092
mybatis:
mapper-locations: classpath:mapper/*.xml
# Registers this package for type aliases so resultType values can be written more briefly.
type-aliases-package: com.bfd.youzhiapi.entity
configuration:
log-impl: org.apache.ibatis.logging.stdout.StdOutImpl
server:
port: 7071
# Log level
logging:
level:
com:
bfd: INFO
# Log directory
log:
path: ./logs

38
src/main/resources/logback-spring.xml

@ -0,0 +1,38 @@
<configuration>
<!-- Properties: resolve the matching entries from the Spring configuration -->
<springProperty scope="context" name="logging.path" source="logging.log.path"/>
<springProperty scope="context" name="logging.level" source="logging.level.com.bfd"/>
<!-- Default console log output; production normally runs in the background, so it is of little use -->
<!-- <appender name="STDOUT"
class="ch.qos.logback.core.ConsoleAppender">
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<Pattern>%d{HH:mm:ss.SSS} %-5level %logger{80} - %msg%n</Pattern>
</encoder>
</appender> -->
<!-- Daily-rolling file appender; events below ${logging.level} are filtered out -->
<appender name="GLMAPPER-LOGGERONE"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<append>true</append>
<filter class="ch.qos.logback.classic.filter.ThresholdFilter">
<level>${logging.level}</level>
</filter>
<file>
${logging.path}/crawlSchedule.log
<!-- ${logging.path}/sendKafka.log -->
</file>
<!-- Rolls over once per day; MaxHistory is set to 7 -->
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<FileNamePattern>${logging.path}/crawlSchedule.log.%d{yyyy-MM-dd}</FileNamePattern>
<!-- <FileNamePattern>${logging.path}/sendKafka.log.%d{yyyy-MM-dd}</FileNamePattern> -->
<MaxHistory>7</MaxHistory>
</rollingPolicy>
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- NOTE(review): the root level is fixed at info; the logging.level property only drives the ThresholdFilter above -->
<root level="info">
<appender-ref ref="GLMAPPER-LOGGERONE"/>
<!-- <appender-ref ref="STDOUT"/> -->
</root>
</configuration>

17
src/main/resources/mapper/ScheduleMapper.xml

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements bound to com.bfd.youzhiapi.mapper.ScheduleMapper -->
<mapper namespace="com.bfd.youzhiapi.mapper.ScheduleMapper">
<!-- Sets the status column of the newslist_111 row whose rid matches -->
<update id="updateTaskStatus">
update `newslist_111` set status=#{status} where rid = #{rid}
</update>
<!-- Returns every newslist_111 row with the given status, mapped to TaskEntity -->
<select id="queryTaskByStatus" parameterType="int" resultType="com.bfd.youzhiapi.entity.TaskEntity">
SELECT * FROM `newslist_111` WHERE status = #{status}
</select>
<!-- Looks up downloadId and downloadUrl in kyzd_cache by doi, mapped to CacheEntity -->
<select id="queryCacheByDoi" resultType="com.bfd.youzhiapi.entity.CacheEntity">
SELECT downloadId,downloadUrl FROM kyzd_cache WHERE doi = #{doi}
</select>
</mapper>

13
src/test/java/com/bfd/youzhiapi/YouzhiApiApplicationTests.java

@ -0,0 +1,13 @@
package com.bfd.youzhiapi;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
/**
 * Spring Boot test class for the application; its only test has an empty body.
 */
@SpringBootTest
class YouzhiApiApplicationTests {
/** Intentionally empty: contains no assertions of its own. */
@Test
void contextLoads() {
}
}
Loading…
Cancel
Save