commit ffc71d82fa5f8be610fdb57ca1685228e5d8d4e1
Author: maojian <550076202@qq.com>
Date: Wed Jan 8 15:10:42 2025 +0800
英文文献数据采集
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e3b721c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/target/
+/logs/
+/.idea/
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..72bf40e
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,163 @@
+
+
+ 4.0.0
+ com.bfd
+ youzhiApi
+ 0.0.1-SNAPSHOT
+ youzhiApi
+ youzhiApi
+
+ 1.8
+ UTF-8
+ UTF-8
+ 2.6.13
+
+
+
+ org.springframework.boot
+ spring-boot-starter-web
+
+
+ org.mybatis.spring.boot
+ mybatis-spring-boot-starter
+ 2.2.2
+
+
+
+ com.mysql
+ mysql-connector-j
+ runtime
+
+
+ org.projectlombok
+ lombok
+ true
+
+
+ org.springframework.boot
+ spring-boot-starter-test
+ test
+
+
+ com.squareup.okhttp3
+ okhttp
+
+
+ com.alibaba.fastjson2
+ fastjson2
+ 2.0.17
+
+
+ cn.hutool
+ hutool-all
+ 5.8.27
+
+
+ org.apache.kafka
+ kafka-clients
+ 2.7.1
+
+
+ org.jsoup
+ jsoup
+ 1.7.3
+
+
+
+
+
+ org.springframework.boot
+ spring-boot-dependencies
+ ${spring-boot.version}
+ pom
+ import
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+
+ *.properties
+ *.yml
+ *.yaml
+
+
+
+
+ com.bfd.youzhiapi.YouzhiApiApplication
+
+ true
+
+ lib/
+
+ false
+
+
+
+ config/
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ copy
+ package
+
+ copy-dependencies
+
+
+ ${project.build.directory}/lib/
+
+
+
+
+
+
+ maven-resources-plugin
+
+
+ copy-resources
+ package
+
+ copy-resources
+
+
+
+
+
+ src/main/resources/
+
+ *.properties
+ *.yml
+ *.yaml
+
+
+
+ ${project.build.directory}/config
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+ 8
+ 8
+
+
+
+
+
+
diff --git a/src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java b/src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java
new file mode 100644
index 0000000..a30ca56
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java
@@ -0,0 +1,28 @@
+package com.bfd.youzhiapi;
+
+import com.bfd.youzhiapi.service.ScheduleService;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.context.ConfigurableApplicationContext;
+import org.springframework.scheduling.annotation.EnableScheduling;
+
+import javax.annotation.Resource;
+
+/**
+ * Spring Boot entry point for the literature-collection service.
+ * {@code @EnableScheduling} activates the {@code @Scheduled} job in {@link ScheduleService} (cron: every two minutes).
+ */
+@SpringBootApplication
+@EnableScheduling
+public class YouzhiApiApplication {
+
+    // Injected but not used by any live code in this class; kept so the existing bean wiring is unchanged.
+    @Resource
+    ScheduleService scheduleService;
+
+    // Boots the application context; the scheduler then invokes ScheduleService.run() on its cron trigger.
+    public static void main(String[] args) {
+        SpringApplication.run(YouzhiApiApplication.class, args);
+    }
+
+}
diff --git a/src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java b/src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java
new file mode 100644
index 0000000..dc9539e
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java
@@ -0,0 +1,55 @@
+package com.bfd.youzhiapi.config;
+
+/**
+ * Maps a crawl-store document-type field to the search API's type code and a display label; lookups return null on no match.
+ * @author guowei
+ */
+public enum DataTypeEnum {
+
+    // Constructor arguments are (code, field, type label).
+    // Journal article
+    PERIODICAL(10, 1, "学术期刊"),
+    // Dissertation
+    DISSERTATION(20, 2, "学位论文"),
+    // Conference paper
+    CONFERENCE(30, 3, "会议");
+
+    // Type code used as the search-API parameter.
+    private final Integer code;
+    // Value identifying this type on the crawl-store side (tasks list these values in "documentType").
+    private final Integer field;
+    // Display label of the document type.
+    private final String type;
+
+    DataTypeEnum(Integer code, Integer field, String type) {
+        this.code = code;
+        this.field = field;
+        this.type = type;
+    }
+
+    public Integer getCode() {
+        return code;
+    }
+
+    public Integer getField() { return field; }
+
+    public String getType() { return type; }
+    // Returns the API code for a field value, or null if unknown/null. equals() is required: == on Integer compares references.
+    public static Integer getCodeByField(Integer field) {
+        for (DataTypeEnum dataTypeEnum : DataTypeEnum.values()) {
+            if (dataTypeEnum.getField().equals(field)) {
+                return dataTypeEnum.getCode();
+            }
+        }
+        return null; // no match; alternatively throw or return a default value
+    }
+    // Returns the display label for an API code, or null if unknown/null. Compared with equals() for the same reason as above.
+    public static String getTypeByCode(Integer code) {
+        for (DataTypeEnum dataTypeEnum : DataTypeEnum.values()) {
+            if (dataTypeEnum.getCode().equals(code)) {
+                return dataTypeEnum.getType();
+            }
+        }
+        return null; // no match; alternatively throw or return a default value
+    }
+}
diff --git a/src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java b/src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java
new file mode 100644
index 0000000..f6bc8ac
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java
@@ -0,0 +1,25 @@
+package com.bfd.youzhiapi.config;
+
+import org.springframework.stereotype.Component;
+
+/**
+ * @author guowei
+ */
+@Component
+public class GlobalConfig {
+ // NOTE(review): the credentials below are hard-coded in source control; consider moving them to external configuration.
+ /**
+ * appId used when calling the external API.
+ */
+ public static final String APPID = "c4d532304c6b4497b1ad";
+
+ /**
+ * appSecret used when calling the external API.
+ */
+ public static final String APPSECRET = "dc41973ee03e471887c77c4a532dbfc3";
+
+ /**
+ * Organization id used when calling the external API.
+ */
+ public static final Integer ORGANID = 392;
+}
diff --git a/src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java b/src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java
new file mode 100644
index 0000000..e7a3e7a
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java
@@ -0,0 +1,19 @@
+package com.bfd.youzhiapi.entity;
+
+import lombok.Data;
+
+/** Cache record returned by ScheduleMapper.queryCacheByDoi; Lombok {@code @Data} generates accessors, equals/hashCode and toString.
+ * @author guowei
+ */
+@Data
+public class CacheEntity {
+ private int id; // NOTE(review): presumably the record's primary key — confirm against the mapper SQL
+ // DOI of the document; queryCacheByDoi takes a DOI as its lookup argument.
+ private String doi;
+ // NOTE(review): presumably identifies the download associated with this DOI — confirm.
+ private String downloadId;
+ // Held as a String; the timestamp format is not visible here.
+ private String uploadTime;
+ // NOTE(review): presumably the location the cached document can be fetched from — confirm.
+ private String downloadUrl;
+}
diff --git a/src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java b/src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java
new file mode 100644
index 0000000..6906366
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java
@@ -0,0 +1,33 @@
+package com.bfd.youzhiapi.entity;
+
+import lombok.Data;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/** Data bean with fixed defaults for cid, type and version. NOTE(review): presumably the message sent via KfkUtil — usage not visible here.
+ * @author guowei
+ */
+@Data
+public class KfkEntity {
+ private Object attr;
+ private String author;
+ private String brief;
+ private String cid = "Nkyzd"; // default value; its meaning is not visible here
+ private String content;
+ private String dedupKey;
+
+ private String field;
+ private String forwardcontent;
+ private String iid;
+ private Boolean isDownload = false; // defaults to false
+ private String news_id; // snake_case name: renaming would change the Lombok accessor names
+ private String post_time;
+ private String searchKeyword;
+ private String source;
+ private List tasks = new ArrayList<>(); // starts as an empty list; NOTE(review): raw List — element type not visible here
+ private String title;
+ private String type = "newscontent"; // default value
+ private String url;
+ private String version = "1"; // default value
+}
diff --git a/src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java b/src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java
new file mode 100644
index 0000000..21a4f75
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java
@@ -0,0 +1,33 @@
+package com.bfd.youzhiapi.entity;
+
+import lombok.Data;
+
+/** Crawl task as iterated by ScheduleService.run from ScheduleMapper.queryTaskByStatus; Lombok {@code @Data} generates the accessors.
+ * @author guowei
+ */
+@Data
+public class TaskEntity {
+ private Integer rid;
+ private Integer siteId;
+ private String cid;
+ private String channelName;
+ private String keyword; // logged by ScheduleService.run to identify the task being crawled
+ private Integer pageTypeID;
+ private Integer weight;
+ private String url;
+ private Integer pageIdx;
+ private Integer nextPageTime; // read by ScheduleService.run as the maximum page count; -1 is replaced by 20 there
+ private Integer status; // NOTE(review): presumably the value matched by queryTaskByStatus (run() queries status 1) — confirm
+ private Integer intv;
+ private String attachTag; // JSON string; ScheduleService.run parses it and reads the "field" and "documentType" keys
+ private String lastcrawltime;
+ private String nextcrawltime;
+ private String createTime;
+ private String modiTime;
+ private Integer crawl_mode;
+ private Integer crawl_account;
+ private String page_switchs;
+ private Integer task_hash_code;
+ private Long crawlStartTime; // timestamp; ScheduleService.run converts it to the first year of the crawl range
+ private Long crawlEndTime; // timestamp; ScheduleService.run converts it to the last year of the crawl range
+}
diff --git a/src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java b/src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java
new file mode 100644
index 0000000..3a4869c
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java
@@ -0,0 +1,31 @@
+package com.bfd.youzhiapi.mapper;
+
+import com.bfd.youzhiapi.entity.CacheEntity;
+import com.bfd.youzhiapi.entity.TaskEntity;
+import org.apache.ibatis.annotations.Mapper;
+import org.springframework.stereotype.Repository;
+
+import java.util.List;
+
+/** MyBatis mapper for crawl tasks and the DOI cache; no SQL annotations appear here, so the statements are presumably in mapper XML.
+ * @author guowei
+ */
+@Mapper
+public interface ScheduleMapper {
+
+    /**
+     * Queries tasks by status.
+     * @param status task status to match (ScheduleService.run passes 1 for tasks not yet crawled)
+     * @return the matching tasks, typed so callers can iterate them as {@link TaskEntity} without casts
+     */
+    List<TaskEntity> queryTaskByStatus(int status);
+
+    /**
+     * Updates the status of a task.
+     * @param taskEntity task carrying the values to persist
+     * @return presumably the number of affected rows — confirm against the mapped statement
+     */
+    int updateTaskStatus(TaskEntity taskEntity);
+    /** Looks up the cache record for a DOI; the result for an unknown DOI depends on the mapped statement (not visible here). */
+    CacheEntity queryCacheByDoi(String doi);
+}
diff --git a/src/main/java/com/bfd/youzhiapi/service/ScheduleService.java b/src/main/java/com/bfd/youzhiapi/service/ScheduleService.java
new file mode 100644
index 0000000..6ba6c79
--- /dev/null
+++ b/src/main/java/com/bfd/youzhiapi/service/ScheduleService.java
@@ -0,0 +1,336 @@
+package com.bfd.youzhiapi.service;
+
+import cn.hutool.core.date.DateUtil;
+import cn.hutool.core.util.IdUtil;
+import cn.hutool.core.util.StrUtil;
+import cn.hutool.crypto.SecureUtil;
+import com.alibaba.fastjson2.JSON;
+import com.alibaba.fastjson2.JSONArray;
+import com.alibaba.fastjson2.JSONObject;
+import com.bfd.youzhiapi.config.DataTypeEnum;
+import com.bfd.youzhiapi.config.GlobalConfig;
+import com.bfd.youzhiapi.entity.CacheEntity;
+import com.bfd.youzhiapi.entity.KfkEntity;
+import com.bfd.youzhiapi.entity.TaskEntity;
+import com.bfd.youzhiapi.mapper.ScheduleMapper;
+import com.bfd.youzhiapi.util.HttpUtil;
+import com.bfd.youzhiapi.util.KfkUtil;
+import com.bfd.youzhiapi.util.Md5SignUtil;
+import com.bfd.youzhiapi.util.Utils;
+import lombok.extern.slf4j.Slf4j;
+import org.jsoup.Jsoup;
+import org.springframework.scheduling.annotation.Scheduled;
+import org.springframework.stereotype.Service;
+
+import javax.annotation.Resource;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author guowei
+ */
+@Service
+@Slf4j
+public class ScheduleService{
+ @Resource
+ ScheduleMapper scheduleMapper;
+
+ @Scheduled(cron = "0 0/2 * * * ?")
+ public void run() {
+ List taskEntities = scheduleMapper.queryTaskByStatus(1);
+ log.info("查询到{}条未采集任务", taskEntities.size());
+ for (TaskEntity task : taskEntities) {
+
+ log.info("开始采集任务:{}", task.getKeyword());
+ //最大翻页限制20页
+ Integer maxPageNum = task.getNextPageTime();
+ if (maxPageNum == -1){ maxPageNum = 20;}
+
+ String attachTag = task.getAttachTag();
+ JSONObject attrJSON = JSONObject.parseObject(attachTag);
+ if (!attrJSON.containsKey("field")) {
+ log.error("没找到field,keyword:{},跳过采集", task.getKeyword());
+ continue;
+ }
+ Integer field = attrJSON.getInteger("field");
+ String documentType = attrJSON.getString("documentType");
+ String[] documentSplit = documentType.split(",");
+ List apiType = new ArrayList<>();
+ //根据field 映射 检索接口的参数
+ for (String type : documentSplit) {
+ Integer codeByField = DataTypeEnum.getCodeByField(Integer.valueOf(type));
+ apiType.add(String.valueOf(codeByField));
+ }
+ if (apiType.size() == 0) {
+ log.error("没找到documentType,keyword:{},跳过采集", task.getKeyword());
+ continue;
+ } else {
+ log.info("采集类型:{}", String.join(",", apiType));
+ }
+ Long crawlStartTime = task.getCrawlStartTime();
+ Long crawlEndTime = task.getCrawlEndTime();
+ //获取采集范围 年份
+ int yearStart = Utils.getYearFromTimestamp(crawlStartTime);
+ int yearend = Utils.getYearFromTimestamp(crawlEndTime);
+ log.info("采集年份范围:{} ~ {}",yearStart,yearend);
+ //每个采集类型都采集一遍
+ for(String type:apiType) {
+ int currentPageNum = 1;
+ log.info("开始采集 {} 类型数据",type);
+ do {
+ try {
+ SortedMap