From ffc71d82fa5f8be610fdb57ca1685228e5d8d4e1 Mon Sep 17 00:00:00 2001 From: maojian <550076202@qq.com> Date: Wed, 8 Jan 2025 15:10:42 +0800 Subject: [PATCH] =?UTF-8?q?=E8=8B=B1=E6=96=87=E6=96=87=E7=8C=AE=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E9=87=87=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + pom.xml | 163 ++++++++++ .../com/bfd/youzhiapi/YouzhiApiApplication.java | 28 ++ .../com/bfd/youzhiapi/config/DataTypeEnum.java | 55 ++++ .../com/bfd/youzhiapi/config/GlobalConfig.java | 25 ++ .../java/com/bfd/youzhiapi/entity/CacheEntity.java | 19 ++ .../java/com/bfd/youzhiapi/entity/KfkEntity.java | 33 ++ .../java/com/bfd/youzhiapi/entity/TaskEntity.java | 33 ++ .../com/bfd/youzhiapi/mapper/ScheduleMapper.java | 31 ++ .../com/bfd/youzhiapi/service/ScheduleService.java | 336 +++++++++++++++++++++ src/main/java/com/bfd/youzhiapi/util/HttpUtil.java | 88 ++++++ src/main/java/com/bfd/youzhiapi/util/KfkUtil.java | 86 ++++++ .../java/com/bfd/youzhiapi/util/Md5SignUtil.java | 270 +++++++++++++++++ src/main/java/com/bfd/youzhiapi/util/Utils.java | 29 ++ src/main/resources/application.yml | 26 ++ src/main/resources/logback-spring.xml | 38 +++ src/main/resources/mapper/ScheduleMapper.xml | 17 ++ .../bfd/youzhiapi/YouzhiApiApplicationTests.java | 13 + 18 files changed, 1293 insertions(+) create mode 100644 .gitignore create mode 100644 pom.xml create mode 100644 src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java create mode 100644 src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java create mode 100644 src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java create mode 100644 src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java create mode 100644 src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java create mode 100644 src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java create mode 100644 src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java create mode 100644 
src/main/java/com/bfd/youzhiapi/service/ScheduleService.java create mode 100644 src/main/java/com/bfd/youzhiapi/util/HttpUtil.java create mode 100644 src/main/java/com/bfd/youzhiapi/util/KfkUtil.java create mode 100644 src/main/java/com/bfd/youzhiapi/util/Md5SignUtil.java create mode 100644 src/main/java/com/bfd/youzhiapi/util/Utils.java create mode 100644 src/main/resources/application.yml create mode 100644 src/main/resources/logback-spring.xml create mode 100644 src/main/resources/mapper/ScheduleMapper.xml create mode 100644 src/test/java/com/bfd/youzhiapi/YouzhiApiApplicationTests.java diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e3b721c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target/ +/logs/ +/.idea/ \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..72bf40e --- /dev/null +++ b/pom.xml @@ -0,0 +1,163 @@ + + + 4.0.0 + com.bfd + youzhiApi + 0.0.1-SNAPSHOT + youzhiApi + youzhiApi + + 1.8 + UTF-8 + UTF-8 + 2.6.13 + + + + org.springframework.boot + spring-boot-starter-web + + + org.mybatis.spring.boot + mybatis-spring-boot-starter + 2.2.2 + + + + com.mysql + mysql-connector-j + runtime + + + org.projectlombok + lombok + true + + + org.springframework.boot + spring-boot-starter-test + test + + + com.squareup.okhttp3 + okhttp + + + com.alibaba.fastjson2 + fastjson2 + 2.0.17 + + + cn.hutool + hutool-all + 5.8.27 + + + org.apache.kafka + kafka-clients + 2.7.1 + + + org.jsoup + jsoup + 1.7.3 + + + + + + org.springframework.boot + spring-boot-dependencies + ${spring-boot.version} + pom + import + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + *.properties + *.yml + *.yaml + + + + + com.bfd.youzhiapi.YouzhiApiApplication + + true + + lib/ + + false + + + + config/ + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy + package + + copy-dependencies + + + ${project.build.directory}/lib/ + + + + + + + maven-resources-plugin + + + copy-resources + package + 
+ copy-resources + + + + + + src/main/resources/ + + *.properties + *.yml + *.yaml + + + + ${project.build.directory}/config + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + diff --git a/src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java b/src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java new file mode 100644 index 0000000..a30ca56 --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/YouzhiApiApplication.java @@ -0,0 +1,28 @@ +package com.bfd.youzhiapi; + +import com.bfd.youzhiapi.service.ScheduleService; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.context.ConfigurableApplicationContext; +import org.springframework.scheduling.annotation.EnableScheduling; + +import javax.annotation.Resource; + +@SpringBootApplication +@EnableScheduling +public class YouzhiApiApplication { + + @Resource + ScheduleService scheduleService; + public static void main(String[] args) { + ConfigurableApplicationContext run = SpringApplication.run(YouzhiApiApplication.class, args); + YouzhiApiApplication bean = run.getBean(YouzhiApiApplication.class); +// bean.start(); + } + +// public void start(){ +// Thread thread = new Thread(scheduleService); +// thread.start(); +// } + +} diff --git a/src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java b/src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java new file mode 100644 index 0000000..dc9539e --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/config/DataTypeEnum.java @@ -0,0 +1,55 @@ +package com.bfd.youzhiapi.config; + +/** + * @author guowei + * 采集库 字段 和检索接口 映射枚举类 + */ +public enum DataTypeEnum { + + + //期刊论文 + PERIODICAL(10, 1,"学术期刊"), + //学位论文 + DISSERTATION(20, 2,"学位论文"), + //会议论文 + CONFERENCE(30, 3,"会议"); + + + private final Integer code; + + private final Integer field; + + private final String type; + + DataTypeEnum(Integer code, Integer field,String type) { + this.code = 
code; + this.field = field; + this.type = type; + } + + public Integer getCode() { + return code; + } + + public Integer getField(){return field;} + + public String getType(){return type;} + + public static Integer getCodeByField(Integer field) { + for (DataTypeEnum dataTypeEnum : DataTypeEnum.values()) { + if (dataTypeEnum.getField()==(field)) { + return dataTypeEnum.getCode(); + } + } + return null; // 或者可以抛出异常,或返回一个默认值 + } + + public static String getTypeByCode(Integer code) { + for (DataTypeEnum dataTypeEnum : DataTypeEnum.values()) { + if (dataTypeEnum.getCode()==(code)) { + return dataTypeEnum.getType(); + } + } + return null; // 或者可以抛出异常,或返回一个默认值 + } +} diff --git a/src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java b/src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java new file mode 100644 index 0000000..f6bc8ac --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/config/GlobalConfig.java @@ -0,0 +1,25 @@ +package com.bfd.youzhiapi.config; + +import org.springframework.stereotype.Component; + +/** + * @author guowei + */ +@Component +public class GlobalConfig { + + /** + * 外部接口所用 appId + */ + public static final String APPID = "c4d532304c6b4497b1ad"; + + /** + * 外部接口所用 appSecret + */ + public static final String APPSECRET = "dc41973ee03e471887c77c4a532dbfc3"; + + /** + * 外部接口所用 机构Id + */ + public static final Integer ORGANID = 392; +} diff --git a/src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java b/src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java new file mode 100644 index 0000000..e7a3e7a --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/entity/CacheEntity.java @@ -0,0 +1,19 @@ +package com.bfd.youzhiapi.entity; + +import lombok.Data; + +/** + * @author guowei + */ +@Data +public class CacheEntity { + private int id; + + private String doi; + + private String downloadId; + + private String uploadTime; + + private String downloadUrl; +} diff --git a/src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java 
b/src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java new file mode 100644 index 0000000..6906366 --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/entity/KfkEntity.java @@ -0,0 +1,33 @@ +package com.bfd.youzhiapi.entity; + +import lombok.Data; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author guowei + */ +@Data +public class KfkEntity { + private Object attr; + private String author; + private String brief; + private String cid = "Nkyzd"; + private String content; + private String dedupKey; + + private String field; + private String forwardcontent; + private String iid; + private Boolean isDownload = false; + private String news_id; + private String post_time; + private String searchKeyword; + private String source; + private List tasks = new ArrayList<>(); + private String title; + private String type = "newscontent"; + private String url; + private String version = "1"; +} diff --git a/src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java b/src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java new file mode 100644 index 0000000..21a4f75 --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/entity/TaskEntity.java @@ -0,0 +1,33 @@ +package com.bfd.youzhiapi.entity; + +import lombok.Data; + +/** + * @author guowei + */ +@Data +public class TaskEntity { + private Integer rid; + private Integer siteId; + private String cid; + private String channelName; + private String keyword; + private Integer pageTypeID; + private Integer weight; + private String url; + private Integer pageIdx; + private Integer nextPageTime; + private Integer status; + private Integer intv; + private String attachTag; + private String lastcrawltime; + private String nextcrawltime; + private String createTime; + private String modiTime; + private Integer crawl_mode; + private Integer crawl_account; + private String page_switchs; + private Integer task_hash_code; + private Long crawlStartTime; + private Long crawlEndTime; +} diff --git 
a/src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java b/src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java new file mode 100644 index 0000000..3a4869c --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/mapper/ScheduleMapper.java @@ -0,0 +1,31 @@ +package com.bfd.youzhiapi.mapper; + +import com.bfd.youzhiapi.entity.CacheEntity; +import com.bfd.youzhiapi.entity.TaskEntity; +import org.apache.ibatis.annotations.Mapper; +import org.springframework.stereotype.Repository; + +import java.util.List; + +/** + * @author guowei + */ +@Mapper +public interface ScheduleMapper { + + /** + * 查询任务 + * @param status + * @return + */ + List queryTaskByStatus(int status); + + /** + * 更改任务状态 + * @param taskEntity + * @return + */ + int updateTaskStatus(TaskEntity taskEntity); + + CacheEntity queryCacheByDoi(String doi); +} diff --git a/src/main/java/com/bfd/youzhiapi/service/ScheduleService.java b/src/main/java/com/bfd/youzhiapi/service/ScheduleService.java new file mode 100644 index 0000000..6ba6c79 --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/service/ScheduleService.java @@ -0,0 +1,336 @@ +package com.bfd.youzhiapi.service; + +import cn.hutool.core.date.DateUtil; +import cn.hutool.core.util.IdUtil; +import cn.hutool.core.util.StrUtil; +import cn.hutool.crypto.SecureUtil; +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONArray; +import com.alibaba.fastjson2.JSONObject; +import com.bfd.youzhiapi.config.DataTypeEnum; +import com.bfd.youzhiapi.config.GlobalConfig; +import com.bfd.youzhiapi.entity.CacheEntity; +import com.bfd.youzhiapi.entity.KfkEntity; +import com.bfd.youzhiapi.entity.TaskEntity; +import com.bfd.youzhiapi.mapper.ScheduleMapper; +import com.bfd.youzhiapi.util.HttpUtil; +import com.bfd.youzhiapi.util.KfkUtil; +import com.bfd.youzhiapi.util.Md5SignUtil; +import com.bfd.youzhiapi.util.Utils; +import lombok.extern.slf4j.Slf4j; +import org.jsoup.Jsoup; +import org.springframework.scheduling.annotation.Scheduled; +import 
org.springframework.stereotype.Service; + +import javax.annotation.Resource; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author guowei + */ +@Service +@Slf4j +public class ScheduleService{ + @Resource + ScheduleMapper scheduleMapper; + + @Scheduled(cron = "0 0/2 * * * ?") + public void run() { + List taskEntities = scheduleMapper.queryTaskByStatus(1); + log.info("查询到{}条未采集任务", taskEntities.size()); + for (TaskEntity task : taskEntities) { + + log.info("开始采集任务:{}", task.getKeyword()); + //最大翻页限制20页 + Integer maxPageNum = task.getNextPageTime(); + if (maxPageNum == -1){ maxPageNum = 20;} + + String attachTag = task.getAttachTag(); + JSONObject attrJSON = JSONObject.parseObject(attachTag); + if (!attrJSON.containsKey("field")) { + log.error("没找到field,keyword:{},跳过采集", task.getKeyword()); + continue; + } + Integer field = attrJSON.getInteger("field"); + String documentType = attrJSON.getString("documentType"); + String[] documentSplit = documentType.split(","); + List apiType = new ArrayList<>(); + //根据field 映射 检索接口的参数 + for (String type : documentSplit) { + Integer codeByField = DataTypeEnum.getCodeByField(Integer.valueOf(type)); + apiType.add(String.valueOf(codeByField)); + } + if (apiType.size() == 0) { + log.error("没找到documentType,keyword:{},跳过采集", task.getKeyword()); + continue; + } else { + log.info("采集类型:{}", String.join(",", apiType)); + } + Long crawlStartTime = task.getCrawlStartTime(); + Long crawlEndTime = task.getCrawlEndTime(); + //获取采集范围 年份 + int yearStart = Utils.getYearFromTimestamp(crawlStartTime); + int yearend = Utils.getYearFromTimestamp(crawlEndTime); + log.info("采集年份范围:{} ~ {}",yearStart,yearend); + //每个采集类型都采集一遍 + for(String type:apiType) { + int currentPageNum = 1; + log.info("开始采集 {} 类型数据",type); + do { + try { + SortedMap parameters = new TreeMap(); + parameters.put("randomStr", DateUtil.format(new Date(), "yyyyMMddHHmmss")); + parameters.put("appId", GlobalConfig.APPID); + 
parameters.put("title", task.getKeyword()); + parameters.put("page", currentPageNum); + parameters.put("pageSize", 10); + parameters.put("type", type); + String sign = Md5SignUtil.sign(parameters); + parameters.put("sign", sign); + String data = HttpUtil.getData(JSON.toJSONString(parameters)); + JSONObject jsonObject = JSONObject.parseObject(data); + JSONObject data1 = jsonObject.getJSONObject("data"); + JSONArray records = data1.getJSONArray("records"); + for (Object record : records) { + JSONObject item = (JSONObject) record; + KfkEntity kfk = new KfkEntity(); + Map attr = new HashMap<>(); + attr.put("attachTag", JSON.parse(task.getAttachTag())); + kfk.setAttr(attr); + kfk.setAuthor(item.getString("author")); + kfk.setContent(item.getString("abstractE")); + kfk.setForwardcontent(item.getString("abstractE")); + String organ = item.getString("organ"); + Map brief = new HashMap<>(); + + kfk.setField(String.valueOf(field)); + kfk.setTitle(Jsoup.parse(item.getString("title")).text()); + kfk.setSearchKeyword(task.getKeyword()); + String source = ""; + switch (item.getInteger("type")) { + case 10: + source = item.getString("name"); + break; + case 20: + source = item.getString("school"); + break; + case 30: + source = item.getString("conferenceName"); + break; + } + kfk.setSource(source); +// if (item.getString("abstractURL") != null) { +// kfk.setUrl(item.getString("abstractURL")); +// } else { +// kfk.setUrl(item.getString("pdfURL")); +// } + String uuid = IdUtil.simpleUUID(); + kfk.setIid(uuid); + kfk.setNews_id(uuid); + kfk.setUrl(uuid); + String year = item.getString("year"); + //判断年份 是不是4位数字 (有错误数据的情况) + boolean fourDigitNumber = Utils.isFourDigitNumber(year); + if (!fourDigitNumber) { + log.error("year不是4位数字,跳过,year:{}", year); + continue; + } + if (!Utils.isYearInRange(Integer.parseInt(year), yearStart, yearend)) { + log.error("year不在采集年份范围,跳过,year:{}", year); + continue; + } + kfk.setPost_time(item.getString("year") + "-01-01 00:00:00"); + 
kfk.setField(String.valueOf(field)); + + //没有机构字段 并且是学位论文,学校作为机构 + if (organ == null || organ == "") { + organ = item.getString("school"); + List agencys = new ArrayList<>(); + if (organ == null || organ == "") { + organ = ""; + } else { + Map agency = new HashMap<>(); + agency.put("name", organ); + agency.put("url", IdUtil.simpleUUID()); // 添加第二个字段 + agencys.add(agency); + } + brief.put("agency", organ); + brief.put("agencys", agencys); + brief.put("author", item.getString("author")); + Map agencyAuthor = new HashMap<>(); + agencyAuthor.put("agency", organ); + agencyAuthor.put("author", item.getString("author")); // 添加第二个字段 + List authorAndAgency = new ArrayList<>(); + authorAndAgency.add(agencyAuthor); + brief.put("authorAndAgency", authorAndAgency); + } else { + List agencys = new ArrayList<>(); + List agencyString = new ArrayList<>(); + Pattern pattern = Pattern.compile("\\[([a-z\\d])\\]([^;]+)"); + Matcher matcher = pattern.matcher(organ); + Map agencyMap = new HashMap<>(); + while (matcher.find()) { + String key = matcher.group(1); // 获取编号 + String value = matcher.group(2).trim(); // 获取机构名称并去除前后空格 + agencyMap.put(key, value); + agencyString.add(value); + Map agency = new HashMap<>(); + agency.put("name", value); + agency.put("url", IdUtil.simpleUUID()); + agencys.add(agency); + } + brief.put("agency", String.join(",", agencyString)); + brief.put("agencys", agencys); + + // 正则表达式匹配模式,匹配 "姓名[编号][编号]..." 
+// Pattern patternAuthor = Pattern.compile("([\\p{L} .]+)(\\[\\d+])+(?=;|$)"); + Pattern patternAuthor = Pattern.compile("([\\p{L} .-]+)((\\[\\d+\\])|(\\[[a-zA-Z,]+\\]))+(?=;|$)"); + Matcher matcherAuthor = patternAuthor.matcher(item.getString("author")); + Map> authorAffiliations = new HashMap<>(); + while (matcherAuthor.find()) { + String name = matcherAuthor.group(1).trim(); // 获取姓名并去除前后空格 + String affiliationPart = matcherAuthor.group(0); // 获取整个匹配串 + + // 提取所有编号 + List affiliations = new ArrayList<>(); + Matcher numberMatcher = Pattern.compile("\\[(\\d+)]").matcher(affiliationPart); + while (numberMatcher.find()) { + affiliations.add(numberMatcher.group(1)); + } + Matcher letterMatcher = Pattern.compile("\\[(.*?)\\]").matcher(affiliationPart); + if (letterMatcher.find()) { + String values = letterMatcher.group(1); + String[] items = values.split(","); + for (String key : items) { + affiliations.add(key); + } + } + authorAffiliations.put(name, affiliations); + } + Set strings = authorAffiliations.keySet(); + List authorAndAgency = new ArrayList<>(); + if (strings.size() > 0) { + brief.put("author", String.join(",", strings)); + kfk.setAuthor(String.join(",", strings)); + for (String name : strings) { + List organNum = authorAffiliations.get(name); + for (String authorOrgan : organNum) { + Map agencyAuthor = new HashMap<>(); + agencyAuthor.put("agency", agencyMap.get(authorOrgan)); + agencyAuthor.put("author", name); + authorAndAgency.add(agencyAuthor); + } + } + brief.put("authorAndAgency", authorAndAgency); + } else { + brief.put("author", item.getString("author")); + brief.put("authorAndAgency", authorAndAgency); + } + + } + + brief.put("author_agency_urls", new ArrayList<>()); + brief.put("author_urls", new ArrayList<>()); + brief.put("data", DataTypeEnum.getTypeByCode(item.getInteger("type"))); + brief.put("date", DateUtil.formatDate(DateUtil.parse(kfk.getPost_time()))); + brief.put("detailUrl", kfk.getUrl()); + brief.put("download", ""); + 
brief.put("eisci", ""); + brief.put("fileUrl", ""); + brief.put("funding", ""); + brief.put("id", item.getString("id")); + brief.put("initial_mark", ""); + brief.put("keyword", task.getKeyword()); + brief.put("keywords", item.getString("keyword")); + String language = item.getString("language"); + if (StrUtil.isEmpty(language)){ + language = "英语"; + } + brief.put("language", language); + brief.put("paramter", new HashMap<>()); + brief.put("publish_agency_urls", new ArrayList<>()); + brief.put("quote", ""); + brief.put("source", source); + brief.put("summary", kfk.getContent().replace("\"", "")); + brief.put("title", kfk.getTitle()); + brief.put("wxtype", language); + kfk.setBrief(JSON.toJSONString(brief)); + + + /**2024.12.27 新增下载文件需求,下载文件需要3个接口 + * 1.新增单条需求(根据doi) + * 2.获取清单,已上传时间查询 + * 3.获取下载链接 + * 根据doi 查询缓存表里是否已经下载过附件,有的话 直接下载推送 + * 没有的话 放入队列等待处理完成 + */ +// String doi = item.getString("doi"); +// CacheEntity cacheEntity = scheduleMapper.queryCacheByDoi(doi); +// if (cacheEntity == null){ +// log.info("缓存库没有此数据,新增数据"); +// //组装参数 +// SortedMap params = new TreeMap(); +// params.put("randomStr", DateUtil.format(new Date(), "yyyyMMddHHmmss")); +// params.put("appId", GlobalConfig.APPID); +// params.put("organId",GlobalConfig.ORGANID); +// params.put("doi",doi); +// String uploadSign = Md5SignUtil.sign(params); +// params.put("sign", uploadSign); +// String uploadData = HttpUtil.uploadDoi(JSON.toJSONString(params)); +// JSONObject uploadJon = JSONObject.parseObject(uploadData); +// +// cacheEntity = new CacheEntity(); +// cacheEntity.setDoi(doi); +// +// } + System.out.println(JSON.toJSONString(kfk)); + KfkUtil.sendKafka(JSON.toJSONString(kfk)); + + } + + + Integer totalPage = data1.getInteger("pages"); + if (maxPageNum > totalPage) { + maxPageNum = totalPage; + } + log.info("第{}页采集,最大限制页数{},共{}页", currentPageNum, maxPageNum, totalPage); + currentPageNum++; + Thread.sleep(1000 * 5); + } catch (Exception e) { + e.printStackTrace(); + log.error("采集失败", e); + } + } 
while (currentPageNum <= maxPageNum); + log.info("采集完成 {} 类型数据",type); + } + log.info("关键词:{} 采集完成",task.getKeyword()); + task.setStatus(3); + scheduleMapper.updateTaskStatus(task); + + } + } + + public static void main(String[] args) { + Map> authorAffiliations = new HashMap<>(); + + String author = "Utku Kumbul[1]; Faruk Uysal[1]; Cicero S. Vaucher[1][2]; Alexander Yarovoy[1]"; + Pattern pattern = Pattern.compile("([\\p{L} .]+)(\\[\\d+])+(?=;|$)"); + Matcher matcher = pattern.matcher(author); + while (matcher.find()) { + String name = matcher.group(1).trim(); // 获取姓名并去除前后空格 + String affiliationPart = matcher.group(0); // 获取整个匹配串 + + // 提取所有编号 + List affiliations = new ArrayList<>(); + Matcher numberMatcher = Pattern.compile("\\[(\\d+)]").matcher(affiliationPart); + while (numberMatcher.find()) { + affiliations.add(numberMatcher.group(1)); + } + authorAffiliations.put(name, affiliations); + } + // 打印结果 + authorAffiliations.forEach((k, v) -> System.out.println(k + " -> " + v)); + } +} diff --git a/src/main/java/com/bfd/youzhiapi/util/HttpUtil.java b/src/main/java/com/bfd/youzhiapi/util/HttpUtil.java new file mode 100644 index 0000000..b2bd09a --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/util/HttpUtil.java @@ -0,0 +1,88 @@ +package com.bfd.youzhiapi.util; + +import com.alibaba.fastjson2.JSONObject; +import lombok.extern.slf4j.Slf4j; +import okhttp3.*; +import org.springframework.stereotype.Component; + +import java.util.concurrent.TimeUnit; + +/** + * @author guowei + */ +@Component +@Slf4j +public class HttpUtil { + + /** + * 关键词检索 请求接口 + * @param parameters + * @return + */ + public static String getData(String parameters){ + String result = ""; + try { + // 目标 URL + String url = "http://api.keyanzhidian.com/api/literature/search"; + // 创建 JSON 请求体 + MediaType JSON = MediaType.parse("application/json; charset=utf-8"); + // 使用 fastjson 构建 JSON +// JSONObject jsonObject = new JSONObject(); +// jsonObject.put("content", content); +// String json = 
jsonObject.toJSONString(); + RequestBody body = RequestBody.create(JSON, parameters); + // 构建 POST 请求 + Request request = new Request.Builder() + .url(url) + .post(body) + .build(); + OkHttpClient client = new OkHttpClient.Builder() + .connectTimeout(60, TimeUnit.SECONDS) // 连接超时 + .readTimeout(30, TimeUnit.SECONDS) // 读取超时 + .writeTimeout(15, TimeUnit.SECONDS) // 写入超时 + .build(); + Response response = client.newCall(request).execute(); + if (response.isSuccessful()) { + result = response.body().string(); + log.warn("Response: " + result); + } + }catch (Exception e){ + e.printStackTrace(); + } + return result; + } + + /** + * 新增单条需求 + * @param parameters + * @return + */ + public static String uploadDoi(String parameters){ + String result = ""; + try { + // 目标 URL + String url = "http://api.keyanzhidian.com/api/demand/create"; + // 创建 JSON 请求体 + MediaType JSON = MediaType.parse("application/json; charset=utf-8"); + RequestBody body = RequestBody.create(JSON, parameters); + // 构建 POST 请求 + Request request = new Request.Builder() + .url(url) + .post(body) + .build(); + OkHttpClient client = new OkHttpClient.Builder() + .connectTimeout(60, TimeUnit.SECONDS) // 连接超时 + .readTimeout(30, TimeUnit.SECONDS) // 读取超时 + .writeTimeout(15, TimeUnit.SECONDS) // 写入超时 + .build(); + Response response = client.newCall(request).execute(); + if (response.isSuccessful()) { + result = response.body().string(); + log.warn("Response: " + result); + } + }catch (Exception e){ + e.printStackTrace(); + } + return result; + } +} diff --git a/src/main/java/com/bfd/youzhiapi/util/KfkUtil.java b/src/main/java/com/bfd/youzhiapi/util/KfkUtil.java new file mode 100644 index 0000000..f58aa52 --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/util/KfkUtil.java @@ -0,0 +1,86 @@ +package com.bfd.youzhiapi.util; + +import lombok.extern.slf4j.Slf4j; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import 
org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.StringSerializer; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import java.util.Properties; + +/** + * @author guowei + * kfk工具类 + */ +@Component +@Slf4j +public class KfkUtil { + private static String topic; + + private static String brokerList; + + @Value("${crawl.kafka.topic}") + public void setTopic(String topic) { + KfkUtil.topic = topic; + } + + @Value("${crawl.kafka.brokers}") + public void setBrokerList(String brokerList) { + KfkUtil.brokerList = brokerList; + } + private static KafkaProducer kafkaProducer; + + public static int num = 0; + + /** + * 获取KafkaProducer实例 + */ + public static KafkaProducer getProducer() { +// synchronized (kafkaProducer) { + if (kafkaProducer == null) { + Properties props = new Properties(); + //xxx服务器ip + props.put("bootstrap.servers", brokerList); +// props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,brokerList); + //所有follower都响应了才认为消息提交成功,即"committed" + props.put("acks", "all"); + //retries = MAX 无限重试,直到你意识到出现了问题:) + props.put("retries", 3); + //producer将试图批处理消息记录,以减少请求次数.默认的批量处理消息字节数 + props.put("batch.size", 16384); + //batch.size当批量的数据大小达到设定值后,就会立即发送,不顾下面的linger.ms + //延迟1ms发送,这项设置将通过增加小的延迟来完成--即,不是立即发送一条记录,producer将会等待给定的延迟时间以允许其他消息记录发送,这些消息记录可以批量处理 + props.put("linger.ms", 1); + //producer可以用来缓存数据的内存大小。 + props.put("buffer.memory", 33554432); + props.put("key.serializer", + StringSerializer.class.getName()); + props.put("value.serializer", + StringSerializer.class.getName()); + kafkaProducer = new KafkaProducer(props); + } +// } + return kafkaProducer; + } + + /** + * 关闭KafkaProducer实例 + */ + public static void closeProducer() { + if (kafkaProducer != null) { + log.info("----------close producer----------"); + kafkaProducer.close(); + kafkaProducer = null; + } + } + + public static void sendKafka(String resultData) { + KafkaProducer producer = getProducer(); 
+ ProducerRecord se = new ProducerRecord(topic, resultData); + producer.send(se); + log.info("发送kafka成功"); +// num++; + } +} diff --git a/src/main/java/com/bfd/youzhiapi/util/Md5SignUtil.java b/src/main/java/com/bfd/youzhiapi/util/Md5SignUtil.java new file mode 100644 index 0000000..595272b --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/util/Md5SignUtil.java @@ -0,0 +1,270 @@ +package com.bfd.youzhiapi.util; + + +import cn.hutool.core.date.DateUtil; +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONObject; +import com.bfd.youzhiapi.config.GlobalConfig; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Component; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.util.*; + +/** + * Title: 接口签名工具类 + * + *

+ * Description: + *

+ * + * @author + */ +@Component +@Slf4j +public class Md5SignUtil { + public static String Encoding_utf8 = "UTF-8"; + private static final String hexDigits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"}; + + /** + * 签名sign + * @param parameters + * 数据拼接 + * 将所有发送或者接收到的数据为集合M,将集合M内非空参数值的参数按照参数名ASCII码从小到大排序(字典序),使用URL键值对的格式(即key1=value1&key2=value2…)拼接成字符串stringA。 + * 特别注意以下重要规则: ◆ 参数名ASCII码从小到大排序(字典序); ◆ 如果参数的值为空不参与签名; ◆ 参数名区分大小写; ◆ + * 验证调用返回或主动通知时,传送的sign参数不参与签名,将生成的签名与该sign值作校验。 数据签名 + * 在stringA最后拼接上key得到stringSignTemp字符串,并对stringSignTemp进行MD5运算,再将得到的字符串所有字符转换为大写,得到sign值signValue。 + * @return + */ + public static String sign(SortedMap parameters) { + StringBuffer sb = new StringBuffer(); + Set es = parameters.entrySet();// 所有参与传参的参数按照accsii排序(升序) + Iterator it = es.iterator(); + while (it.hasNext()) { + Map.Entry entry = (Map.Entry) it.next(); + String k = (String) entry.getKey(); + Object v = entry.getValue(); + if (null != v && !"".equals(v) && !"sign".equals(k) && !"key".equals(k)) { + sb.append(k + "=" + v + "&"); + } + } + sb.append("appSecret=" + GlobalConfig.APPSECRET); + log.debug("Md5SignUtil.sign加密串为:{}", sb.toString()); + String sign = MD5Encode(sb.toString(), Encoding_utf8).toUpperCase(); + log.debug("Md5SignUtil.sign加密串后的签名为:{}", sign); + return sign; + } + + /** + * 返回结果签名sign + * + * @param responseDRO + * @param key + * @return + */ +// public static String sign(ResponseDRO responseDRO, String key) { +// JSONObject infoJson = (JSONObject) JSONObject.toJSON(responseDRO); +// StringBuilder sb = new StringBuilder(); +// Object[] sortArra = infoJson.keySet().toArray(); +// // 按照ASCII排序 +// Arrays.sort(sortArra); +// for (Object k : sortArra) { +// Object v = infoJson.get(k); +// if (null != v && !"".equals(v)) { +// if (v instanceof JSONArray) { +// JSONArray tempJson = (JSONArray) v; +// List> allList = new ArrayList<>(); +// for (int i = 0; i < tempJson.size(); i++) { +// JSONObject obj = 
tempJson.getJSONObject(i); +// Map objMap = new HashMap<>(); +// for (Map.Entry entry : obj.entrySet()) { +// objMap.put(entry.getKey(), entry.getValue()); +// } +// +// SortedMap temps = new TreeMap<>();// 升序 +// temps.putAll(objMap); +// allList.add(temps); +// } +// v = JSON.toJSONString(allList); +// } +// sb.append(k).append("=").append(v).append("&"); +// } +// } +// sb.append("appSecret=" + key); +// logger.debug("Md5SignUtil.sign加密串为:{}", sb.toString()); +// String sign = MD5Encode(sb.toString(), Encoding_utf8).toUpperCase(); +// logger.debug("Md5SignUtil.sign加密串后的签名为:{}", sign); +// return sign; +// } + + /** + * 请求参数sign签名 + * + * @param baseQuery + * @param key + * @return + */ +// public static String sign(BaseQuery baseQuery, String key) { +// JSONObject infoJson = (JSONObject) JSONObject.toJSON(baseQuery); +// StringBuilder sb = new StringBuilder(); +// Object[] sortArra = infoJson.keySet().toArray(); +// // 按照ASCII排序 +// Arrays.sort(sortArra); +// for (Object k : sortArra) { +// Object v = infoJson.get(k); +// if (null != v && !"".equals(v) && !"sign".equals(k)) { +// if (v instanceof JSONArray) { +// JSONArray tempJson = (JSONArray) v; +// List> allList = new ArrayList<>(); +// for (int i = 0; i < tempJson.size(); i++) { +// JSONObject obj = tempJson.getJSONObject(i); +// Map objMap = new HashMap<>(); +// for (Map.Entry entry : obj.entrySet()) { +// objMap.put(entry.getKey(), entry.getValue()); +// } +// +// SortedMap temps = new TreeMap<>();// 升序 +// temps.putAll(objMap); +// allList.add(temps); +// } +// v = JSON.toJSONString(allList); +// } +// sb.append(k).append("=").append(v).append("&"); +// } +// } +// sb.append("appSecret=" + key); +// logger.info("Md5SignUtil.sign加密串为:{}", sb.toString()); +// String sign = MD5Encode(sb.toString(), Encoding_utf8).toUpperCase(); +// logger.info("Md5SignUtil.sign加密串后的签名为:{}", sign); +// return sign; +// } + + /** + * 字符串 md5加密 + * + * @param str + * @param charsetName + * @return + */ + public static String 
MD5Encode(String str, String charsetName) { + String resultString = null; + try { + resultString = new String(str); + MessageDigest md = MessageDigest.getInstance("MD5"); + if (charsetName == null || "".equals(charsetName)) { + resultString = byteArrayToHexString(md.digest(resultString.getBytes())); + } else { + resultString = byteArrayToHexString(md.digest(resultString.getBytes(charsetName))); + } + } catch (Exception e) { + } + return resultString; + } + + private static String byteArrayToHexString(byte b[]) { + StringBuffer resultSb = new StringBuffer(); + for (int i = 0; i < b.length; i++) + resultSb.append(byteToHexString(b[i])); + + return resultSb.toString().toUpperCase(); + } + + private static String byteToHexString(byte b) { + int n = b; + if (n < 0) { + n += 256; + } + int d1 = n / 16; + int d2 = n % 16; + return hexDigits[d1] + hexDigits[d2]; + } + + /** + * md5加密 默认加密串小写 + * + * @param + * @return + */ +// public static String md5(String plaintext) { +// return md5(plaintext, StandardCharsets.UTF_8); +// } + + + + public static void main(String[] args) { + + String key = "dc41973ee03e471887c77c4a532dbfc3"; + + String appId = "c4d532304c6b4497b1ad"; + String name = "radar"; + String randomStr = DateUtil.format(new Date(), "yyyyMMddHHmmss"); +// String randomStr = "20241225114130"; + List typeSet = new ArrayList<>(); + typeSet.add(10); + + + SortedMap parameters = new TreeMap(); + + parameters.put("randomStr", randomStr); + parameters.put("appId", appId); + parameters.put("title", name); + parameters.put("page",1); + parameters.put("pageSize",10); +// parameters.put("year","2024"); + + parameters.put("type", "30"); +// parameters.put("id","1010052234917"); + +// String apiSign = "D3CC78105AA4C5F594AE733E78DB6E02"; +// log.info("接口传入的签名串是:" + apiSign); + String mySign = sign(parameters); + log.info("生成的签名串是:" + mySign); + parameters.put("sign",mySign); + System.out.println(new JSONObject(parameters)); +// String data = 
HttpUtil.getData(JSON.toJSONString(parameters)); +// System.out.println(data); + + SortedMap parameterss = new TreeMap(); + parameterss.put("appId", appId); + parameterss.put("randomStr", randomStr); + parameterss.put("doi", "10.1109/ICCC57789.2023.10164991"); + parameterss.put("organId",392); + String mySigns = sign(parameterss); + log.info("生成的签名串是:" + mySigns); + parameterss.put("sign",mySigns); + System.out.println(new JSONObject(parameterss)); + + SortedMap parametersss = new TreeMap(); + parametersss.put("appId", appId); + parametersss.put("randomStr", randomStr); +// parametersss.put("status", 2); + parametersss.put("page",1); + parametersss.put("pageSize",10); + parametersss.put("organId",392); + String mySignss = sign(parametersss); + log.info("生成的签名串是:" + mySignss); + parametersss.put("sign",mySignss); + System.out.println(new JSONObject(parametersss)); + + SortedMap parameterssss = new TreeMap(); + parameterssss.put("appId", appId); + parameterssss.put("randomStr", randomStr); +// parametersss.put("status", 2); + parameterssss.put("id",8417); + String mySignsss = sign(parameterssss); + log.info("生成的签名串是:" + mySignsss); + parameterssss.put("sign",mySignsss); + System.out.println(new JSONObject(parameterssss)); + + // PreOrderResponseDto yardResultDto = new PreOrderResponseDto(); + // yardResultDto.setRetCode(retCode); + // yardResultDto.setRetMsg(retMsg); + // yardResultDto.setRandomStr(randomStr); + // yardResultDto.setSign(Md5SignUtil.sign(yardResultDto , key)); + // System.out.println(yardResultDto); + + } + +} \ No newline at end of file diff --git a/src/main/java/com/bfd/youzhiapi/util/Utils.java b/src/main/java/com/bfd/youzhiapi/util/Utils.java new file mode 100644 index 0000000..d8950d0 --- /dev/null +++ b/src/main/java/com/bfd/youzhiapi/util/Utils.java @@ -0,0 +1,29 @@ +package com.bfd.youzhiapi.util; + +import org.springframework.stereotype.Component; + +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZonedDateTime; + +/** + 
* @author guowei + */ +@Component +public class Utils { + + public static boolean isFourDigitNumber(String input) { + return input.matches("\\d{4}"); + } + + public static int getYearFromTimestamp(long timestamp) { + // 使用 UTC 时区转换为年份 + ZonedDateTime dateTime = Instant.ofEpochMilli(timestamp).atZone(ZoneId.of("Asia/Shanghai")); + return dateTime.getYear(); + } + + public static boolean isYearInRange(int year, int startYear, int endYear) { + // 判断年份是否在区间内 + return year >= startYear && year <= endYear; + } +} diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml new file mode 100644 index 0000000..f7b315d --- /dev/null +++ b/src/main/resources/application.yml @@ -0,0 +1,26 @@ +spring: + datasource: + driver-class-name: com.mysql.cj.jdbc.Driver + url: jdbc:mysql://172.18.1.134:3306/cnki_crawl + username: crawl666 + password: lx2a4jN1xFT96kj20LU= +crawl: + kafka: + topic: zhiWangTest2 + brokers: 172.18.1.146:9092,172.18.1.147:9092,172.18.1.148:9092 +mybatis: + mapper-locations: classpath:mapper/*.xml + #目的是为了省略resultType里的代码量 + type-aliases-package: com.bfd.youzhiapi.entity + configuration: + log-impl: org.apache.ibatis.logging.stdout.StdOutImpl +server: + port: 7071 +#日志级别 +logging: + level: + com: + bfd: INFO + #日志路径 + log: + path: ./logs \ No newline at end of file diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml new file mode 100644 index 0000000..0c59240 --- /dev/null +++ b/src/main/resources/logback-spring.xml @@ -0,0 +1,38 @@ + + + + + + + + + true + + ${logging.level} + + + ${logging.path}/crawlSchedule.log + + + + ${logging.path}/crawlSchedule.log.%d{yyyy-MM-dd} + + 7 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n + UTF-8 + + + + + + + + diff --git a/src/main/resources/mapper/ScheduleMapper.xml b/src/main/resources/mapper/ScheduleMapper.xml new file mode 100644 index 0000000..bdb54b7 --- /dev/null +++ b/src/main/resources/mapper/ScheduleMapper.xml @@ -0,0 
+1,17 @@ + + + + + update `newslist_111` set status=#{status} where rid = #{rid} + + + + + + \ No newline at end of file diff --git a/src/test/java/com/bfd/youzhiapi/YouzhiApiApplicationTests.java b/src/test/java/com/bfd/youzhiapi/YouzhiApiApplicationTests.java new file mode 100644 index 0000000..312f9d0 --- /dev/null +++ b/src/test/java/com/bfd/youzhiapi/YouzhiApiApplicationTests.java @@ -0,0 +1,13 @@ +package com.bfd.youzhiapi; + +import org.junit.jupiter.api.Test; +import org.springframework.boot.test.context.SpringBootTest; + /** Smoke test class: annotated with {@code @SpringBootTest}; its single test method has an empty body. */ +@SpringBootTest +class YouzhiApiApplicationTests { + /** Contains no assertions. NOTE(review): presumably passes whenever the Spring application context starts; confirm. */ + @Test + void contextLoads() { + } + +}