From e7325fdbb133d09d63f4012c68889959b1caf33c Mon Sep 17 00:00:00 2001
From: 55007 <55007@maojian>
Date: Tue, 7 Jan 2025 18:16:36 +0800
Subject: [PATCH] OCR application
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 33 +++++
README.md | 7 +
pom.xml | 123 ++++++++++++++++
.../crawl/ocrhandler/OcrHandlerApplication.java | 13 ++
.../com/bfd/crawl/ocrhandler/bean/ResponsePo.java | 59 ++++++++
.../config/AsyncThreadConfiguration.java | 48 ++++++
.../controller/DataFilterController.java | 46 ++++++
.../bfd/crawl/ocrhandler/enums/ResponseCode.java | 32 ++++
.../ocrhandler/service/FileProcessingService.java | 81 ++++++++++
.../crawl/ocrhandler/service/HandlerService.java | 164 +++++++++++++++++++++
.../bfd/crawl/ocrhandler/service/SendService.java | 50 +++++++
.../bfd/crawl/ocrhandler/service/StartServcie.java | 63 ++++++++
.../com/bfd/crawl/ocrhandler/util/DataUtil.java | 60 ++++++++
.../bfd/crawl/ocrhandler/util/FileDownloader.java | 117 +++++++++++++++
.../com/bfd/crawl/ocrhandler/util/FileUtil.java | 42 ++++++
.../com/bfd/crawl/ocrhandler/util/OcrUtil.java | 64 ++++++++
.../java/com/bfd/crawl/ocrhandler/util/OsUtil.java | 23 +++
.../com/bfd/crawl/ocrhandler/util/QueueUtil.java | 19 +++
.../com/bfd/crawl/ocrhandler/util/StringUtil.java | 94 ++++++++++++
src/main/resources/application.yml | 48 ++++++
.../ocrhandler/OcrHandlerApplicationTests.java | 17 +++
.../bfd/crawl/ocrhandler/PdfToImageConverter.java | 42 ++++++
22 files changed, 1245 insertions(+)
create mode 100644 .gitignore
create mode 100644 README.md
create mode 100644 pom.xml
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/OcrHandlerApplication.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/bean/ResponsePo.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/config/AsyncThreadConfiguration.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/controller/DataFilterController.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/enums/ResponseCode.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/service/FileProcessingService.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/service/HandlerService.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/service/SendService.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/service/StartServcie.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/util/DataUtil.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/util/FileDownloader.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/util/FileUtil.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/util/OcrUtil.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/util/OsUtil.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/util/QueueUtil.java
create mode 100644 src/main/java/com/bfd/crawl/ocrhandler/util/StringUtil.java
create mode 100644 src/main/resources/application.yml
create mode 100644 src/test/java/com/bfd/crawl/ocrhandler/OcrHandlerApplicationTests.java
create mode 100644 src/test/java/com/bfd/crawl/ocrhandler/PdfToImageConverter.java
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..549e00a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,33 @@
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### VS Code ###
+.vscode/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2c356ef
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+# OCR text recognition
+
+### Supported features:
+
+- Splitting PDFs into pages, parsing and recognizing their content
+- Content extraction from doc and docx files
+- Recognition of image files
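+
+### Usage
+
+A minimal request sketch, inferred from `HandlerService` (the `file` key, the JSONPath and the sample URL below are only illustrative): `input.fileUrl` is a `dataKey:jsonPath` expression resolved against `data`, `input.fileType` selects the processing branch (pdf / jpg / png / doc / docx), and `input.maxPage` caps how many PDF pages are OCRed. POST the JSON to `http://<host>:7080/handlerdata/ocr`; the endpoint returns immediately and results are pushed to the Kafka topic configured under `send.topic`.
+
+```json
+{
+  "id": 1,
+  "input": { "fileUrl": "file:$.url", "fileType": "pdf", "maxPage": 3 },
+  "data": { "file": "{\"url\": \"http://example.com/sample.pdf\"}" }
+}
+```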
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..70bc8a9
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.springframework.boot</groupId>
+        <artifactId>spring-boot-starter-parent</artifactId>
+        <version>2.2.4.RELEASE</version>
+        <relativePath/>
+    </parent>
+    <groupId>com.bfd.crawl</groupId>
+    <artifactId>ocrHandler</artifactId>
+    <version>0.0.1-SNAPSHOT</version>
+    <name>ocrHandler</name>
+    <description>ocrHandler</description>
+    <properties>
+        <log4j2.version>2.17.2</log4j2.version>
+        <java.version>1.8</java.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>de.codecentric</groupId>
+            <artifactId>spring-boot-admin-client</artifactId>
+            <version>2.2.4</version>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.kafka</groupId>
+            <artifactId>spring-kafka</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-web</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-devtools</artifactId>
+            <scope>runtime</scope>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>2.0.28</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-scratchpad</artifactId>
+            <version>5.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi</artifactId>
+            <version>5.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-ooxml</artifactId>
+            <version>5.2.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.alibaba</groupId>
+            <artifactId>fastjson</artifactId>
+            <version>2.0.17</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.squareup.okhttp3</groupId>
+            <artifactId>okhttp</artifactId>
+            <version>3.9.1</version>
+        </dependency>
+        <dependency>
+            <groupId>com.google.code.gson</groupId>
+            <artifactId>gson</artifactId>
+            <version>2.8.8</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.kafka</groupId>
+            <artifactId>kafka-clients</artifactId>
+            <version>2.3.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.kafka</groupId>
+            <artifactId>spring-kafka-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.springframework.boot</groupId>
+                <artifactId>spring-boot-maven-plugin</artifactId>
+                <configuration>
+                    <excludes>
+                        <exclude>
+                            <groupId>org.projectlombok</groupId>
+                            <artifactId>lombok</artifactId>
+                        </exclude>
+                    </excludes>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/OcrHandlerApplication.java b/src/main/java/com/bfd/crawl/ocrhandler/OcrHandlerApplication.java
new file mode 100644
index 0000000..93df53e
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/OcrHandlerApplication.java
@@ -0,0 +1,13 @@
+package com.bfd.crawl.ocrhandler;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+
+@SpringBootApplication
+public class OcrHandlerApplication {
+
+ public static void main(String[] args) {
+ SpringApplication.run(OcrHandlerApplication.class, args);
+ }
+
+}
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/bean/ResponsePo.java b/src/main/java/com/bfd/crawl/ocrhandler/bean/ResponsePo.java
new file mode 100644
index 0000000..185b47e
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/bean/ResponsePo.java
@@ -0,0 +1,59 @@
+package com.bfd.crawl.ocrhandler.bean;
+
+
+import com.bfd.crawl.ocrhandler.enums.ResponseCode;
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * @author:jinming
+ * @className:ResponsePo
+ * @version:1.0
+ * @description:
+ * @Date:2023/4/3 17:23
+ */
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+public class ResponsePo {
+ /**
+ * Response code
+ */
+ private int code;
+
+ /**
+ * Response payload, normally a JSON string
+ */
+ private Object data;
+
+ /**
+ * Message describing the result
+ */
+ private String message;
+
+ public static ResponsePo success() {
+ return setStatus(ResponseCode.SUCCESS.getCode(), ResponseCode.SUCCESS.getMessage());
+ }
+
+ public static ResponsePo error() {
+ return setStatus(ResponseCode.FAILURE.getCode(), ResponseCode.FAILURE.getMessage());
+ }
+
+ public static ResponsePo setStatus(int code, String message) {
+ ResponsePo resultBean = new ResponsePo();
+ resultBean.code = code;
+ resultBean.message = message;
+ return resultBean;
+ }
+ public ResponsePo(int code, String message) {
+ this.code = code;
+ this.message = message;
+ }
+
+ public ResponsePo(ResponseCode responseCode) {
+ this.code = responseCode.getCode();
+ this.message = responseCode.getMessage();
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/config/AsyncThreadConfiguration.java b/src/main/java/com/bfd/crawl/ocrhandler/config/AsyncThreadConfiguration.java
new file mode 100644
index 0000000..3734360
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/config/AsyncThreadConfiguration.java
@@ -0,0 +1,48 @@
+package com.bfd.crawl.ocrhandler.config;
+
+
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.scheduling.annotation.EnableAsync;
+import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
+
+import java.util.concurrent.Executor;
+
+/**
+ * @author jinming
+ * @version 1.0
+ * @className AsyncThreadConfiguration
+ * @Date 2022/2/17 18:37
+ */
+@Configuration
+@EnableAsync
+public class AsyncThreadConfiguration {
+ @Bean
+ public Executor asyncExecutor() {
+ ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
+ // Core pool size
+ executor.setCorePoolSize(500);
+ // Maximum pool size
+ executor.setMaxPoolSize(500);
+ // Task queue capacity
+ executor.setQueueCapacity(500);
+ executor.setThreadNamePrefix("handlerData-");
+ executor.initialize();
+ executor.setWaitForTasksToCompleteOnShutdown(true);
+ return executor;
+ }
+ @Bean
+ public Executor sendExecutor() {
+ ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
+ // Core pool size
+ executor.setCorePoolSize(500);
+ // Maximum pool size
+ executor.setMaxPoolSize(500);
+ // Task queue capacity
+ executor.setQueueCapacity(500);
+ executor.setThreadNamePrefix("sendData-");
+ executor.initialize();
+ executor.setWaitForTasksToCompleteOnShutdown(true);
+ return executor;
+ }
+}
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/controller/DataFilterController.java b/src/main/java/com/bfd/crawl/ocrhandler/controller/DataFilterController.java
new file mode 100644
index 0000000..9e301e3
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/controller/DataFilterController.java
@@ -0,0 +1,46 @@
+package com.bfd.crawl.ocrhandler.controller;
+
+
+import com.alibaba.fastjson.JSON;
+import com.bfd.crawl.ocrhandler.bean.ResponsePo;
+import com.bfd.crawl.ocrhandler.enums.ResponseCode;
+import com.bfd.crawl.ocrhandler.util.QueueUtil;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestBody;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+
+
+import java.util.Map;
+
+/**
+ * @author:jinming
+ * @className:DataFilterController
+ * @version:1.0
+ * @description:
+ * @Date:2023/7/26 11:21
+ */
+@RestController
+@RequestMapping("/handlerdata")
+@Slf4j
+public class DataFilterController {
+ @PostMapping("/ocr")
+ public ResponsePo documentFeedback(@RequestBody String dataJson) {
+
+ ResponsePo responsePo = ResponsePo.success();
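+ // Only check that the body is well-formed JSON here; the raw JSON is queued and fully parsed by HandlerService.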
+ try {
+ Map parse = (Map) JSON.parse(dataJson);
+ } catch (Exception e) {
+ e.printStackTrace();
+ log.error("请求格式发生异常" + e.getMessage());
+ responsePo.setCode(ResponseCode.FAILURE.getCode());
+ responsePo.setMessage(ResponseCode.FAILURE.getMessage());
+ return responsePo;
+ }
+ log.info("新增任务:"+dataJson);
+ QueueUtil.taskQueue.add(dataJson);
+
+ return responsePo;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/enums/ResponseCode.java b/src/main/java/com/bfd/crawl/ocrhandler/enums/ResponseCode.java
new file mode 100644
index 0000000..8fa5b19
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/enums/ResponseCode.java
@@ -0,0 +1,32 @@
+package com.bfd.crawl.ocrhandler.enums;
+
+/**
+ * @author:jinming
+ * @className:ResponseCodeEnum
+ * @version:1.0
+ * @description: response result code enum
+ * @Date:2023/2/28 11:40
+ */
+public enum ResponseCode {
+ // Response result codes
+ SUCCESS(200, "操作成功"),
+ FAILURE(400, "参数错误"),
+ INTERNAL_SERVER_ERROR(500, "服务器内部错误"),
+ TYPE_NOT_SUPPORT(601,"文件类型不支持");
+
+ private int code;
+ private String message;
+
+ ResponseCode(int code, String message) {
+ this.code = code;
+ this.message = message;
+ }
+
+ public int getCode() {
+ return code;
+ }
+
+ public String getMessage() {
+ return message;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/service/FileProcessingService.java b/src/main/java/com/bfd/crawl/ocrhandler/service/FileProcessingService.java
new file mode 100644
index 0000000..f7fe173
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/service/FileProcessingService.java
@@ -0,0 +1,81 @@
+package com.bfd.crawl.ocrhandler.service;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.imageio.ImageIO;
+
+import lombok.extern.slf4j.Slf4j;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Service;
+
+/**
+ * @author:jinming
+ * @className:PdfToImageConverterService
+ * @version:1.0
+ * @description:
+ * @Date:2023/7/28 17:20
+ */
+@Service
+@Slf4j
+public class FileProcessingService {
+
+ public void converterPdfToImg(String fileName, String outputFolder) {
+ try (PDDocument document = PDDocument.load(new File(fileName))) {
+ PDFRenderer pdfRenderer = new PDFRenderer(document);
+ for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
+ // Render the page at 300 DPI
+ BufferedImage bim = pdfRenderer.renderImageWithDPI(pageIndex, 300);
+ // Image file name, one file per page
+ String imageName = "page_" + (pageIndex + 1) + ".png";
+ // Full path of the image file
+ String imagePath = outputFolder + imageName;
+ File file = new File(imagePath);
+
+ if (!file.getParentFile().exists()) {
+ file.getParentFile().mkdirs();
+ }
+
+ ImageIO.write(bim, "png", file);
+ }
+ log.info("PDF 已成功拆分为图片!");
+ } catch (Exception e) {
+ log.error("拆分 PDF 为图片时出现错误:" + e.getMessage(), e);
+ }
+ }
+
+ public String readWordFile(String filePath) throws IOException {
+ String fileTypeDoc = "doc";
+ String fileTypeDocx = "docx";
+ // Close the stream in all branches, including the unsupported-format case
+ try (InputStream inputStream = new FileInputStream(filePath)) {
+ if (filePath.endsWith(fileTypeDoc)) {
+ try (HWPFDocument document = new HWPFDocument(inputStream)) {
+ WordExtractor extractor = new WordExtractor(document);
+ return extractor.getText();
+ }
+ } else if (filePath.endsWith(fileTypeDocx)) {
+ try (XWPFDocument document = new XWPFDocument(inputStream)) {
+ XWPFWordExtractor extractor = new XWPFWordExtractor(document);
+ return extractor.getText();
+ }
+ } else {
+ throw new IllegalArgumentException("Unsupported file format");
+ }
+ }
+ }
+
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/service/HandlerService.java b/src/main/java/com/bfd/crawl/ocrhandler/service/HandlerService.java
new file mode 100644
index 0000000..15d3f5c
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/service/HandlerService.java
@@ -0,0 +1,164 @@
+package com.bfd.crawl.ocrhandler.service;
+
+import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.JSONObject;
+import com.alibaba.fastjson.JSONPath;
+import com.bfd.crawl.ocrhandler.util.*;
+import com.google.gson.Gson;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.scheduling.annotation.Async;
+import org.springframework.stereotype.Service;
+import org.springframework.web.bind.annotation.RequestBody;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author:jinming
+ * @className:HandlerService
+ * @version:1.0
+ * @description:
+ * @Date:2023/8/1 16:05
+ */
+@Service
+@Slf4j
+public class HandlerService {
+ @Autowired
+ private FileProcessingService fileProcessingService;
+ @Value("${file.path}")
+ private String downloadFilePath;
+ @Value("${file.uploadUrl}")
+ private String uploadUrl;
+ @Value("${file.ocrApi}")
+ private String ocrApi;
+
+ @Async("asyncExecutor")
+ void run() {
+ Gson gson = new Gson();
+ while (true) {
+ try {
+ if (QueueUtil.taskQueue.size() > 0) {
+
+ String dataJson = null;
+ try {
+ dataJson = QueueUtil.taskQueue.take();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ continue;
+ }
+ Map parse = (Map) JSON.parse(dataJson);
+ Map dataMap = (Map) parse.get("data");
+ int id = (int) parse.get("id");
+ Map admin = (Map) parse.get("input");
+ int maxPage = (int) admin.get("maxPage");
+ String fileType = (String) admin.get("fileType");
+ log.info("任务:" + id + "的文件类型为" + fileType);
+ try {
+ String key = (String) admin.get("fileUrl");
+
+ String fileUrl = (String) DataUtil.getValue(key, dataMap);
+ String pdf = "pdf";
+ String png = "png";
+ String jpg = "jpg";
+ String doc = "doc";
+ boolean windows = OsUtil.isWindows();
+ Map requestDataMap = new HashMap(32);
+ requestDataMap.put("id", UUID.randomUUID().toString());
+ if (fileType.contains(pdf)) {
+ String fileName = StringUtil.getMd5(fileUrl);
+ // Local download path for the PDF
+ String pdfDir = downloadFilePath.concat(windows ? "\\pdf\\" : "/pdf/").concat(fileName).concat(".pdf");
+ FileDownloader.downloadFile(fileUrl, new File(pdfDir));
+ // Directory that will hold the page images produced from the PDF
+ String imgPath = downloadFilePath.concat(fileName).concat(windows ? "\\" : "/");
+ fileProcessingService.converterPdfToImg(pdfDir, imgPath);
+ List<String> imgFileList = FileUtil.traverseAndReturnFilePath(imgPath);
+ int i = 1;
+ for (String imgFilePath : imgFileList) {
+ Map imgUploadMap = FileDownloader.uploadFile(uploadUrl, imgFilePath);
+ String uploadFileUrl = (String) imgUploadMap.get("fileUrl");
+ String dataText = OcrUtil.doOcr(uploadFileUrl, ocrApi);
+ // When the maxPage limit or the last split page is reached, send the message with the total page count and stop
+ if (i == maxPage || i == imgFileList.size()) {
+ Map result = new HashMap(32);
+ requestDataMap.put("content", dataText);
+ requestDataMap.put("pageNum", i);
+ result.put("results", JSON.toJSONString(requestDataMap));
+ parse.put("result", result);
+ String message = gson.toJson(parse);
+ QueueUtil.sendQueue.put(message);
+ break;
+ }
+ Map result = new HashMap(32);
+ requestDataMap.put("content", dataText);
+ result.put("results", JSON.toJSONString(requestDataMap));
+ parse.put("result", result);
+ String message = gson.toJson(parse);
+ QueueUtil.sendQueue.put(message);
+ i++;
+ }
+ } else if (fileType.contains(jpg) || fileType.contains(png)) {
+ String dataText = OcrUtil.doOcr(fileUrl, ocrApi);
+ Map result = new HashMap(32);
+ requestDataMap.put("content", dataText);
+ requestDataMap.put("pageNum", 1);
+ result.put("results", JSON.toJSONString(requestDataMap));
+ parse.put("result", result);
+ String message = gson.toJson(parse);
+ QueueUtil.sendQueue.put(message);
+ } else if (fileType.contains(doc)) {
+ String fileAndDirName = StringUtil.getMd5(fileUrl);
+ String pdfDir = downloadFilePath.concat(windows ? "\\doc\\" : "/doc/").concat(fileAndDirName).concat(".").concat(fileType);
+ FileDownloader.downloadFile(fileUrl, new File(pdfDir));
+ String readData = fileProcessingService.readWordFile(pdfDir);
+ Map result = new HashMap(32);
+ requestDataMap.put("content", readData);
+ requestDataMap.put("pageNum", 1);
+ result.put("results", JSON.toJSONString(requestDataMap));
+ parse.put("result", result);
+ String message = JSON.toJSONString(parse);
+ QueueUtil.sendQueue.put(message);
+ } else {
+ Map result = new HashMap(32);
+ requestDataMap.put("content", "文件类型无法被识别");
+ requestDataMap.put("pageNum", 1);
+ result.put("results", JSON.toJSONString(requestDataMap));
+ parse.put("result", result);
+ String message = gson.toJson(parse);
+ QueueUtil.sendQueue.put(message);
+ }
+
+ } catch (Throwable e) {
+ log.error("处理程序发生异常:" + e.getMessage());
+ e.printStackTrace();
+ }
+ } else {
+ log.info("任务队列为空,休眠3秒");
+ try {
+ Thread.sleep(3000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ } catch (Throwable e) {
+ e.printStackTrace();
+ log.error("工作线程发生异常" + e.getMessage());
+ }
+
+
+ }
+ }
+
+ public static void main(String[] args) {
+ String a = "{\"metadata\":{\"output\":{\"output_type\":\"table\",\"label_col\":[\"总结果汇总\"]},\"input\":{\"input_type\":\"text\",\"label\":[\"6_结果汇总\"]},\"address\":\"http://172.24.12.126:9088/handlerdata/dataclean\",\"admin\":{\"summary\":{\"fieldMapping\":[{\"summaryType\":9007,\"valueType\":\"String\",\"value\":\"$.\",\"key\":\"6_结果汇总:\"}],\"filed\":\":$.businessKey\",\"pageNum\":\"1_文件上传:$.resultNumber \"},\"datasource\":\"6_结果汇总\",\"type\":8001,\"category\":\"总结\"},\"index\":0,\"user\":{\"tag\":\"\"}},\"data\":{\"1_文件上传\":\"{\\\"resultList\\\":[{\\\"fileUrl\\\":\\\"http://172.16.3.82:38080/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\",\\\"fileType\\\":\\\"pdf\\\",\\\"filePath\\\":\\\"/\\\",\\\"fileId\\\":1414,\\\"fileName\\\":\\\"EI-EORSA2016_Landsat Time Series Clustering under Modified Dynamic Time Warping.pdf\\\",\\\"ossPath\\\":\\\"/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\"}],\\\"resultNumber\\\":1}\",\"3_OCR识别内容\":\"{\\\"fileName\\\":\\\"EI-EORSA2016_Landsat Time Series Clustering under Modified Dynamic Time Warping.pdf\\\",\\\"filePath\\\":\\\"/\\\",\\\"fileUrl\\\":\\\"http://172.16.3.82:38080/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\",\\\"pageNum\\\":1,\\\"fileType\\\":\\\"pdf\\\",\\\"content\\\":\\\" 2016 Fourth International Workshop on Earth Observation and Remote Sensing Applications\\\\nChangping District Beijing, between N40°2118”-402313”generated. Based on the characteristics of land cover type in\\\\nand E115°5017”-116°2949”,is located in the northwest of\\\\nthe region. six classes were identified as the final class types\\\\n(3I) culy ZSEI Aleleuuxordde Jo Bare ue 3utaAos 3ultea\\\\nof the regional land cover classification experiment. which\\\\nincluded crop,water, forest. grass,impervious.and bare land.\\\\nChangping District Beijing belongs to a temperate zone with\\\\nSub categories contained in each of the major class types is\\\\na continental monsoon climate. The climate in the study area\\\\nshown in the table 1. These validating samples were easily\\\\nhas four distinct seasons with hot and humid summers and\\\\nidentified on the G-F1 image and Google Earth map by visual\\\\ncoldwindy,and dry winters. The average annual temperature\\\\nobservation\\\\nis approximately 11.8C and the average annual precipitation\\\\nTablel.Land cover classification system\\\\nis approximately 550.3 mm. Regional terrains from the\\\\nClass typesSub categories\\\\nnorthwest to the southeast gradually formed a gentle slope\\\\nThe paddy field,irrigated\\\\nand thin brown earth formed by weathering of rocks is the\\\\ncrop\\\\nmain soil type. The land cover types are abundant. including\\\\nland,dry land\\\\nRivers, lakes,\\\\nforest grass, cropland. construction land and water.\\\\nreservoirs.\\\\nwater\\\\nponds.\\\\nForest.orchard.\\\\n2.2.Landsat data and preprocess\\\\nshrub\\\\nforest\\\\nwoodland.other\\\\ngarden\\\\nFourteen landsate8 images covering the Changping District\\\\nland\\\\nof Beijing City were selected as the research data. 
shot on\\\\nNatural\\\\npasture,artificial\\\\ngrassland\\\\nJuly 31.2013.September 1.2013.October3.2013.October\\\\npasture, other grassland\\\\nUrban\\\\nland.\\\\nrural\\\\nresidentialland.\\\\nimpervious\\\\ntransportation\\\\nland\\\\nand\\\\nother construction sites\\\\n115°50°0E11600E116°200E110°100E118*30E\\\\nOther land.saline land.bare\\\\n2230\\\\n40°223\\\\nsand,swamp,beaches.and\\\\n人\\\\nbare land\\\\nother land types\\\\nnot\\\\nincluded in the above 5\\\\n40°120H\\\\n40°120\\\\nclass types\\\\n510203040\\\\nKilometers\\\\n·3.METHOD\\\\n02412\\\\n40°130T\\\\nomelers\\\\n116°00日116°100E16°200E116°30°0°1\\\\nFig.l. The geographical region of the study area and the background\\\\n3.1.The flowchart of clustering\\\\ninformation is the true color image CR: red,G: green, B: blue) of Landsat&\\\\ndata acquired on October 3,2013.\\\\nA flowchart of Landsat time series clustering based on CD-\\\\n19.2013.November4.2013.November20.2013.Decembei\\\\nDTW distance was presented in Fig 2. The selected fourteen\\\\n6,2013,February8,2014,April13,2014,April29,2014,\\\\nlandsate8 images covering the Changping District of Beijing\\\\n+100ZIaquraides +10t iaquaides+10Z 'SI AtW\\\\nCity were firstly preprocessed to have good quality. Then the\\\\nOctober6,2014 respectively\\\\npreprocessed images were used to derive time series NDVI.\\\\nThe multi temporal Landsat8 data processing mainly\\\\nSecondly. multivariate time series data. consisting of time\\\\nincluded relative radiometric calibration. geometric precision\\\\nseries green bands. time series red bands, time series near\\\\ncorrection. and image subset. NDVI of all fourteen temporal\\\\ninfrared bands and time series NDVIwas prepared for land\\\\ncover clustering. Thirdly, K-means clustering was conducted\\\\nimages are calculated by using the red and near infrared\\\\nspectral bands of the images after the above correction and\\\\nbased on CD-DTW method. In the process of clustering, 1)\\\\nsubset. In view of the fact that the green.red and near infrared\\\\nbased on multivariate time series data. Randomly select the\\\\nbands of remote sensing images contain rich information of\\\\nclustering number and the clustering center sequence; 2)\\\\nground features, and NDVI can be used to well distinguished\\\\nCalculate the CD-DTW distances between the other\\\\nvegetation from other various objects, multivariate time\\\\nsequences and the clustering center sequence: 3) Cluster\\\\nseries data for land cover classification consist of time series\\\\nbased on CD-DTW:4) Judge whether the clustering center is\\\\nconvergent, if the answer is Yes, then end the clustering\\\\ngreen bands, time series red bands. 
time series near infrared\\\\nbands and time series NDVI.\\\\notherwise Update the clustering center sequence by DBA\\\\n[9]method and returned back to 2) until the answer is Yes.\\\\nFinally, Classification images\\\\ns were obtained by post\\\\n2.3.Classification system and validating data\\\\nclassifying the clustering results\\\\nBy computing the distance between the center sequences\\\\nof the categories obtained by unsupervised clustering merge\\\\n3.2.CD-DTW\\\\nsimilar categories, and the final classification image is\\\\n978-1-5090-1479-8/16/$31.00@2016卫EE\\\",\\\"fileId\\\":1414,\\\"ossPath\\\":\\\"/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\"}\",\"4_翻译总结\":\"{\\\"fileName\\\":\\\"EI-EORSA2016_Landsat Time Series Clustering under Modified Dynamic Time Warping.pdf\\\",\\\"filePath\\\":\\\"/\\\",\\\"fileUrl\\\":\\\"http://172.16.3.82:38080/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\",\\\"pageNum\\\":1,\\\"fileType\\\":\\\"pdf\\\",\\\"content\\\":\\\"2016年第四届地球观测与遥感应用国际研讨会\\n北京市昌平区,位于北纬N40°2118”-402313”和东经E115°5017”-116°2949”之间。根据该地区土地覆盖类型的特征,确定了六个最终的类别类型(3I)culy ZSEI Aleleuuxordde Jo Bare ue 3utaAos 3ultea,包括农作物、水域、森林、草地、不透水面和裸地。北京市昌平区属于温带区,具有大陆性季风气候。研究区的气候表现为四季分明,夏季炎热潮湿,冬季寒冷多风,年平均气温约为11.8摄氏度,年平均降水量约为550.3毫米。从西北到东南的区域地形逐渐形成了一个缓坡和由岩石风化形成的薄褐色土壤是主要的土壤类型。土地覆盖类型丰富,包括河流、湖泊、水库、池塘、森林、果园、灌木林、林地、其他草地、城市土地、农村住宅用地、不透水面和其他建设用地。选择了覆盖北京市昌平区的14幅Landsat8影像作为研究数据,拍摄日期分别为2013年7月31日、2013年9月1日、2013年10月3日、2013年10月19日、2013年11月4日、2013年11月20日、2013年12月6日、2014年2月8日、2014年4月13日、2014年4月29日、2014年10月6日。首先对这些影像进行预处理,以获得良好的质量。然后使用预处理后的影像来计算时间序列的归一化植被指数(NDVI)。接下来,使用基于CD-DTW方法的K-means聚类对多时相的Landsat8数据进行处理。在聚类过程中,首先基于多变量时间序列数据,随机选择聚类数和聚类中心序列;其次,计算其他序列与聚类中心序列之间的CD-DTW距离;然后基于CD-DTW进行聚类;最后,通过计算未监督聚类合并得到的类别的中心序列之间的距离,得到最终的分类图像。\\\"}\",\"5_内容总结\":\"该内容总结如下:\\n该文档是关于2016年第四届地球观测与遥感应用国际研讨会的内容。研究区域位于北京市昌平区,根据土地覆盖类型的特征,确定了六个最终的类别类型,包括农作物、水域、森林、草地、不透水面和裸地。研究区的气候为温带区的大陆性季风气候,具有四季分明的特点。研究使用了14幅Landsat8影像作为研究数据,通过预处理和计算归一化植被指数(NDVI)来处理这些影像。然后使用基于CD-DTW方法的K-means聚类对多时相的Landsat8数据进行处理,最终得到了分类图像。\",\"businessKey\":\"a0386e9c8019e7c6\",\"2_任务提取\":\"[{\\\"fileId\\\":1414,\\\"fileName\\\":\\\"EI-EORSA2016_Landsat Time Series Clustering under Modified Dynamic Time Warping.pdf\\\",\\\"filePath\\\":\\\"/\\\",\\\"fileType\\\":\\\"pdf\\\",\\\"fileUrl\\\":\\\"http://172.16.3.82:38080/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\",\\\"ossPath\\\":\\\"/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\"}]\",\"6_结果汇总\":\"{\\\"file\\\":{\\\"fileId\\\":\\\"1414\\\",\\\"fileName\\\":\\\"EI-EORSA2016_Landsat Time Series Clustering under Modified Dynamic Time Warping.pdf\\\",\\\"filePath\\\":\\\"/\\\",\\\"fileType\\\":\\\"pdf\\\",\\\"fileUrl\\\":\\\"http://172.16.3.82:38080/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\",\\\"ossPath\\\":\\\"/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\"},\\\"form\\\":{\\\"内容总结\\\":\\\"该内容总结如下:\\\\n该文档是关于2016年第四届地球观测与遥感应用国际研讨会的内容。研究区域位于北京市昌平区,根据土地覆盖类型的特征,确定了六个最终的类别类型,包括农作物、水域、森林、草地、不透水面和裸地。研究区的气候为温带区的大陆性季风气候,具有四季分明的特点。研究使用了14幅Landsat8影像作为研究数据,通过预处理和计算归一化植被指数(NDVI)来处理这些影像。然后使用基于CD-DTW方法的K-means聚类对多时相的Landsat8数据进行处理,最终得到了分类图像。\\\",\\\"文件名\\\":\\\"EI-EORSA2016_Landsat Time Series Clustering under Modified Dynamic Time Warping.pdf\\\",\\\"翻译结果\\\":\\\"{\\\\\\\"fileName\\\\\\\":\\\\\\\"EI-EORSA2016_Landsat Time Series Clustering under Modified Dynamic Time 
Warping.pdf\\\\\\\",\\\\\\\"filePath\\\\\\\":\\\\\\\"/\\\\\\\",\\\\\\\"fileUrl\\\\\\\":\\\\\\\"http://172.16.3.82:38080/group1/default/20230823/10/47/0/1692758859642_p63j.pdf\\\\\\\",\\\\\\\"pageNum\\\\\\\":1,\\\\\\\"fileType\\\\\\\":\\\\\\\"pdf\\\\\\\",\\\\\\\"content\\\\\\\":\\\\\\\"2016年第四届地球观测与遥感应用国际研讨会\\\\n北京市昌平区,位于北纬N40°2118”-402313”和东经E115°5017”-116°2949”之间。根据该地区土地覆盖类型的特征,确定了六个最终的类别类型(3I)culy ZSEI Aleleuuxordde Jo Bare ue 3utaAos 3ultea,包括农作物、水域、森林、草地、不透水面和裸地。北京市昌平区属于温带区,具有大陆性季风气候。研究区的气候表现为四季分明,夏季炎热潮湿,冬季寒冷多风,年平均气温约为11.8摄氏度,年平均降水量约为550.3毫米。从西北到东南的区域地形逐渐形成了一个缓坡和由岩石风化形成的薄褐色土壤是主要的土壤类型。土地覆盖类型丰富,包括河流、湖泊、水库、池塘、森林、果园、灌木林、林地、其他草地、城市土地、农村住宅用地、不透水面和其他建设用地。选择了覆盖北京市昌平区的14幅Landsat8影像作为研究数据,拍摄日期分别为2013年7月31日、2013年9月1日、2013年10月3日、2013年10月19日、2013年11月4日、2013年11月20日、2013年12月6日、2014年2月8日、2014年4月13日、2014年4月29日、2014年10月6日。首先对这些影像进行预处理,以获得良好的质量。然后使用预处理后的影像来计算时间序列的归一化植被指数(NDVI)。接下来,使用基于CD-DTW方法的K-means聚类对多时相的Landsat8数据进行处理。在聚类过程中,首先基于多变量时间序列数据,随机选择聚类数和聚类中心序列;其次,计算其他序列与聚类中心序列之间的CD-DTW距离;然后基于CD-DTW进行聚类;最后,通过计算未监督聚类合并得到的类别的中心序列之间的距离,得到最终的分类图像。\\\\\\\"}\\\"}}\"},\"created\":1691004265000,\"module\":\"数据清洗\",\"start_tag\":false,\"multi_branch\":0,\"last_edit\":1693571680000,\"next_app_id\":[{\"start_id\":158,\"edge_id\":79,\"end_id\":159}],\"transfer_id\":7,\"version\":1,\"blueprint_id\":4,\"scenes_id\":5,\"scenario\":{\"dataloss\":1,\"autoCommitTriggerLast\":1,\"maxErrors\":3,\"autoCommit\":1,\"freshVariables\":1},\"wait_condition\":[],\"scheduling\":{\"interval\":-1,\"type\":\"single\"},\"name\":\"总结果汇总\",\"businessKey\":\"a0386e9c8019e7c6\",\"id\":158,\"position\":[100,200],\"describe\":\"总结果汇总\"}";
+ JSONObject jsonObject = JSON.parseObject(a);
+ System.out.println(JSONPath.eval(jsonObject, "$['metadata']"));
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/service/SendService.java b/src/main/java/com/bfd/crawl/ocrhandler/service/SendService.java
new file mode 100644
index 0000000..d37cc92
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/service/SendService.java
@@ -0,0 +1,50 @@
+package com.bfd.crawl.ocrhandler.service;
+
+
+import com.bfd.crawl.ocrhandler.util.QueueUtil;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.kafka.core.KafkaTemplate;
+import org.springframework.scheduling.annotation.Async;
+import org.springframework.stereotype.Service;
+
+import javax.annotation.Resource;
+
+/**
+ * @author:jinming
+ * @className:SendService
+ * @version:1.0
+ * @description:
+ * @Date:2023/7/31 17:53
+ */
+@Slf4j
+@Service
+public class SendService {
+ @Value("${send.topic}")
+ private String topic;
+
+ @Resource
+ private KafkaTemplate kafkaTemplate;
+
+ @Async("sendExecutor")
+ void sendToKafka() {
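+ // Long-running loop: drain sendQueue and forward each result message to the configured Kafka topic.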
+ while (true) {
+ if (QueueUtil.sendQueue.size() > 0) {
+ try {
+ String message = QueueUtil.sendQueue.take();
+ kafkaTemplate.send(topic,message);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }else {
+ log.info("任务队列为空,休眠3秒");
+ try {
+ Thread.sleep(3000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+}
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/service/StartServcie.java b/src/main/java/com/bfd/crawl/ocrhandler/service/StartServcie.java
new file mode 100644
index 0000000..2f5cb76
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/service/StartServcie.java
@@ -0,0 +1,63 @@
+package com.bfd.crawl.ocrhandler.service;
+
+
+import com.bfd.crawl.ocrhandler.util.QueueUtil;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.stereotype.Service;
+
+/**
+ * @author:jinming
+ * @className:StartServcie
+ * @version:1.0
+ * @description:
+ * @Date:2023/7/31 17:14
+ */
+@Service
+@Slf4j
+public class StartServcie implements ApplicationRunner {
+ @Value("${thread.handler}")
+ private int handlerNumber;
+ @Value("${thread.send}")
+ private int sendNumber;
+
+ @Autowired
+ private HandlerService handlerService;
+ @Autowired
+ private SendService sendService;
+
+ @Override
+ public void run(ApplicationArguments args) throws Exception {
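+ // handlerService.run() and sendService.sendToKafka() are @Async, so each call starts a long-running worker on its executor and returns immediately.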
+ for (int i = 0; i < handlerNumber; i++) {
+ log.info("处理服务线程" + i + "已启动 ");
+ handlerService.run();
+ }
+ for (int i = 0; i < sendNumber; i++) {
+ log.info("发送服务线程" + i + "已启动 ");
+ sendService.sendToKafka();
+ }
+ Runnable myRunnable = new Runnable() {
+ @Override
+ public void run() {
+ while (true) {
+ log.info("任务队列长度为" + QueueUtil.taskQueue.size());
+ log.info("发送队列长度为" + QueueUtil.sendQueue.size());
+ try {
+ Thread.sleep(10000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ };
+ // Create a monitor thread from the Runnable above
+ Thread myThread = new Thread(myRunnable);
+ // Start the monitor thread
+ myThread.start();
+
+
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/util/DataUtil.java b/src/main/java/com/bfd/crawl/ocrhandler/util/DataUtil.java
new file mode 100644
index 0000000..6d0a6c3
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/util/DataUtil.java
@@ -0,0 +1,60 @@
+package com.bfd.crawl.ocrhandler.util;
+
+import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.JSONObject;
+import com.alibaba.fastjson.JSONPath;
+import lombok.extern.slf4j.Slf4j;
+
+import java.util.Map;
+
+/**
+ * @author:jinming
+ * @className:DataUtil
+ * @version:1.0
+ * @description: resolve a value from the data map by key/JSONPath expression
+ * @Date:2023/11/1 9:54
+ */
+@Slf4j
+public class DataUtil {
+ /**
+ * @param key the key expression to resolve
+ * @param dataMap the data map holding the source JSON strings
+ * @return the resolved value, or null if resolution fails
+ */
+ public static Object getValue(String key, Map dataMap) {
+ try {
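+ // Key format: "<dataKey>:<jsonPath>", e.g. "1_文件上传:$.resultList[0].fileUrl" (example taken from the test data in HandlerService).
+ // When the key contains "#json#", the value found by the first expression is itself a JSON string,
+ // which is parsed again and queried with the JSONPath that follows "#json#".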
+ // Return immediately when the key expression is empty
+ if (!StringUtil.hasValue(key)) {
+ return "";
+ }
+ Object dataValue;
+ String isJson = "#json#";
+ if (key.contains(isJson)) {
+ // First split: take the part before "#json#"
+ String[] keySplit = key.split(isJson);
+ String firstDataKey = keySplit[0];
+ String[] firstDataKeySplit = firstDataKey.split(":");
+ // Fetch the JSON string for the first data key and parse it
+ String dataJson = (String) dataMap.get(firstDataKeySplit[0]);
+ JSONObject dataJsonObject = JSON.parseObject(dataJson);
+ // Evaluate the JSONPath after the colon against that object
+ String firstDataKeyJson = (String) JSONPath.eval(dataJsonObject, firstDataKeySplit[1]);
+ String secDataKey = keySplit[1];
+ JSONObject firstDataJsonObject = JSON.parseObject(firstDataKeyJson);
+ dataValue = JSONPath.eval(firstDataJsonObject, secDataKey);
+ return dataValue;
+ }
+ String[] keySplit = key.split(":");
+ String jsonPath = keySplit[1];
+ String dataJson = (String) dataMap.get(keySplit[0]);
+ JSONObject dataJsonObject = JSON.parseObject(dataJson);
+ dataValue = JSONPath.eval(dataJsonObject, jsonPath);
+ return dataValue;
+ } catch (Exception e) {
+ // TODO: handle exception
+ log.error("jsonpath公式取值异常,", e);
+ return null;
+ }
+
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/util/FileDownloader.java b/src/main/java/com/bfd/crawl/ocrhandler/util/FileDownloader.java
new file mode 100644
index 0000000..dd88e9e
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/util/FileDownloader.java
@@ -0,0 +1,117 @@
+package com.bfd.crawl.ocrhandler.util;
+
+import com.alibaba.fastjson.JSON;
+import okhttp3.*;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * @author:jinming
+ * @className:FileDownloader
+ * @version:1.0
+ * @description:
+ * @Date:2023/8/1 16:31
+ */
+public class FileDownloader {
+ private static OkHttpClient okHttpClient;
+
+ private static OkHttpClient getOkHttpClient() {
+ if (okHttpClient == null) {
+ okHttpClient = new OkHttpClient();
+ }
+ return okHttpClient;
+ }
+
+ public static void downloadFile(String url, File destination) throws IOException {
+ OkHttpClient client = getOkHttpClient();
+ Request request = new Request.Builder()
+ .url(url)
+ .build();
+
+ try (Response response = client.newCall(request).execute()) {
+ if (!response.isSuccessful()) {
+ throw new IOException("Failed to download file: " + response);
+ }
+
+ ResponseBody body = response.body();
+ if (body == null) {
+ throw new IOException("Response body is null");
+ }
+ if (!destination.getParentFile().exists()) {
+
+ destination.getParentFile().mkdirs();
+ }
+ try (InputStream inputStream = body.byteStream();
+ FileOutputStream outputStream = new FileOutputStream(destination)) {
+ byte[] buffer = new byte[8192];
+ int bytesRead;
+ while ((bytesRead = inputStream.read(buffer)) != -1) {
+ outputStream.write(buffer, 0, bytesRead);
+ }
+ outputStream.flush();
+ }
+ }
+ }
+
+ public static Map uploadFile(String url, String filePath) throws Exception {
+ File file = new File(filePath);
+
+ Map returnMap = new HashMap(32);
+ OkHttpClient client = getOkHttpClient();
+ // Media type for the file upload
+ MediaType mediaType = MediaType.parse("application/octet-stream");
+ // Request body wrapping the file
+ RequestBody requestBody = RequestBody.create(mediaType, file);
+
+ // Multipart body used to upload the file
+ MultipartBody multipartBody = new MultipartBody.Builder()
+ .setType(MultipartBody.FORM)
+ .addFormDataPart("file", file.getName(), requestBody)
+ .build();
+ // Build the upload request
+ Request request = new Request.Builder()
+ .url(url)
+ .post(multipartBody)
+ .build();
+
+ try (Response response = client.newCall(request).execute()) {
+ if (!response.isSuccessful()) {
+ throw new IOException("Failed to upload file: " + response);
+ }
+ String html = response.body().string();
+
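+ // The upload service is expected to respond with JSON like {"data":{"domain":...,"src":...}};
+ // if the body cannot be parsed that way, the raw body is used as the file URL instead.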
+ try {
+ Map parse = (Map) JSON.parse(html);
+ Map data = (Map) parse.get("data");
+ String domain = (String) data.get("domain");
+ String src = (String) data.get("src");
+ String fileUrl = domain.concat(src);
+ returnMap.put("fileUrl", fileUrl);
+ } catch (Exception e) {
+ returnMap.put("fileUrl", html);
+ }
+ // Upload succeeded
+ System.out.println("File uploaded successfully!");
+ }
+ return returnMap;
+ }
+
+ public static void deleteFile(String url, String md5) throws Exception {
+ OkHttpClient client = getOkHttpClient();
+ url = url.concat("delete?md5=").concat(md5);
+ Request request = new Request.Builder()
+ .url(url)
+ .get()
+ .build();
+ try (Response response = client.newCall(request).execute()) {
+ if (!response.isSuccessful()) {
+ throw new IOException("Failed to upload file: " + response);
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/util/FileUtil.java b/src/main/java/com/bfd/crawl/ocrhandler/util/FileUtil.java
new file mode 100644
index 0000000..1cef225
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/util/FileUtil.java
@@ -0,0 +1,42 @@
+package com.bfd.crawl.ocrhandler.util;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author:jinming
+ * @className:FileUtil
+ * @version:1.0
+ * @description:
+ * @Date:2023/8/2 10:57
+ */
+public class FileUtil {
+
+ public static void main(String[] args) {
+ System.out.println(traverseAndReturnFilePath("D:\\\\ocr\\\\305ce27d7a05770456fdc09d0b3044f7\\\\"));
+ }
+
+ public static List<String> traverseAndReturnFilePath(String folderPath) {
+ List<String> fileList = new ArrayList<>();
+ File folder = new File(folderPath);
+ // Check that the path exists and is a directory
+ if (folder.exists() && folder.isDirectory()) {
+ // List all files and subdirectories in the folder
+ File[] files = folder.listFiles();
+ if (files != null) {
+ for (File file : files) {
+ if (file.isFile()) {
+ // Regular file: record its absolute path
+ fileList.add(file.getAbsolutePath());
+ } else if (file.isDirectory()) {
+ // Subdirectories are not traversed
+ }
+ }
+ }
+ } else {
+ System.out.println("指定的路径不是一个文件夹或文件夹不存在。");
+ }
+ return fileList;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/util/OcrUtil.java b/src/main/java/com/bfd/crawl/ocrhandler/util/OcrUtil.java
new file mode 100644
index 0000000..786625c
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/util/OcrUtil.java
@@ -0,0 +1,64 @@
+package com.bfd.crawl.ocrhandler.util;
+
+import com.alibaba.fastjson.JSON;
+import okhttp3.*;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * @author:jinming
+ * @className:ocrUtil
+ * @version:1.0
+ * @description:
+ * @Date:2023/8/1 16:38
+ */
+public class OcrUtil {
+ private static OkHttpClient okHttpClient;
+
+ private static OkHttpClient getOkHttpClient() {
+ if (okHttpClient == null) {
+ okHttpClient = new OkHttpClient();
+ }
+ return okHttpClient;
+ }
+
+ public static String doOcr(String url,String ocrApi) {
+ String text = "";
+ int reTryTimes = 3;
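+ // Call the OCR API up to three times and stop as soon as a non-empty "text" field comes back with code 200.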
+ for (int i = 0; i < reTryTimes; i++) {
+ int okCode = 200;
+ OkHttpClient client = getOkHttpClient();
+ OkHttpClient.Builder builder = client.newBuilder().writeTimeout(600, TimeUnit.SECONDS).connectTimeout(600, TimeUnit.SECONDS).readTimeout(600, TimeUnit.SECONDS);
+ client = builder.build();
+ MediaType mediaType = MediaType.parse("application/json");
+ RequestBody body = RequestBody.create(mediaType, "{\"id\":\"\",\"url\":\"" + url + "\"}");
+ Request request = new Request.Builder()
+ .url(ocrApi)
+ .method("POST", body)
+ .addHeader("Content-Type", "application/json")
+ .build();
+ try (Response response = client.newCall(request).execute()) {
+ // Read and parse the OCR response; the body is released by try-with-resources
+ String html = response.body().string();
+ Map dataMap = (Map) JSON.parse(html);
+ int code = (int) dataMap.get("code");
+ if (code == okCode) {
+ text = (String) dataMap.get("text");
+ }
+ if (StringUtil.hasValue(text)) {
+ break;
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ return text;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/util/OsUtil.java b/src/main/java/com/bfd/crawl/ocrhandler/util/OsUtil.java
new file mode 100644
index 0000000..09a0cb1
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/util/OsUtil.java
@@ -0,0 +1,23 @@
+package com.bfd.crawl.ocrhandler.util;
+
+import lombok.extern.slf4j.Slf4j;
+
+import java.util.Locale;
+
+/**
+ * @author:jinming
+ * @className:OsUtil
+ * @version:1.0
+ * @description:
+ * @Date:2023/4/23 9:40
+ */
+@Slf4j
+public class OsUtil {
+ public static Boolean isWindows() {
+ String windows = "windows";
+ String osName = System.getProperty("os.name").toLowerCase(Locale.ROOT);
+ log.info("osName = " + osName);
+ return osName.contains(windows);
+ }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/util/QueueUtil.java b/src/main/java/com/bfd/crawl/ocrhandler/util/QueueUtil.java
new file mode 100644
index 0000000..b32b5d8
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/util/QueueUtil.java
@@ -0,0 +1,19 @@
+package com.bfd.crawl.ocrhandler.util;
+
+import java.util.concurrent.LinkedBlockingDeque;
+
+/**
+ * @author:jinming
+ * @className:QueueUtil
+ * @version:1.0
+ * @description:
+ * @Date:2023/7/13 15:00
+ */
+public class QueueUtil {
+
+
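+ /** Raw task JSON accepted by DataFilterController and consumed by HandlerService. */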
+ public static LinkedBlockingDeque<String> taskQueue = new LinkedBlockingDeque<>();
+
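+ /** Result JSON produced by HandlerService and forwarded to Kafka by SendService. */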
+ public static LinkedBlockingDeque<String> sendQueue = new LinkedBlockingDeque<>();
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/crawl/ocrhandler/util/StringUtil.java b/src/main/java/com/bfd/crawl/ocrhandler/util/StringUtil.java
new file mode 100644
index 0000000..c5cc1e7
--- /dev/null
+++ b/src/main/java/com/bfd/crawl/ocrhandler/util/StringUtil.java
@@ -0,0 +1,94 @@
+package com.bfd.crawl.ocrhandler.util;
+
+
+import lombok.extern.slf4j.Slf4j;
+
+import java.security.MessageDigest;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author jinming
+ * @version 1.0
+ * @className StringUtil
+ * @Date 2022/1/21 11:46
+ */
+@Slf4j
+public class StringUtil {
+ public static boolean hasValue(String str) {
+ return str != null && !"".equals(str.trim());
+ }
+
+ public static String getRegexGroup(String regex, String str, int id) {
+ String resultStr = "";
+ if (hasValue(str)) {
+ Pattern p = Pattern.compile(regex);
+ Matcher m = p.matcher(str);
+ if (m.find()) {
+ resultStr = m.group(id);
+ }
+ }
+
+ if ("".equals(resultStr)) {
+ }
+
+ return resultStr;
+ }
+
+ public static Set getEmailAddress(String message) {
+ Set emailList = new HashSet<>();
+ Pattern pattern = Pattern.compile("\\w+\\.?\\w+\\@\\w+\\.\\w+");
+ Matcher m = pattern.matcher(message);
+ while (m.find()) {
+ emailList.add(m.group(0));
+ }
+ return emailList;
+ }
+ public static String getMd5(String string) {
+ try {
+ MessageDigest md5 = MessageDigest.getInstance("MD5");
+ byte[] bs = md5.digest(string.getBytes("UTF-8"));
+ StringBuilder sb = new StringBuilder(40);
+ for (byte x : bs) {
+ if ((x & 0xff) >> 4 == 0) {
+ sb.append("0").append(Integer.toHexString(x & 0xff));
+ } else {
+ sb.append(Integer.toHexString(x & 0xff));
+ }
+ }
+ return sb.toString();
+ } catch (Exception e) {
+ //LOG.error("获取md5异常", e);
+ return "nceaform" + System.currentTimeMillis();
+ }
+ }
+
+ public static String removeAllHtmlTags(String str) {
+ return hasValue(str) ? str.replaceAll("<[^<>]+?>", "") : "";
+ }
+
+ public static String getRegexGroup(Pattern regex, String str, int id) {
+ String resultStr = "";
+ if (hasValue(str)) {
+ Matcher m = regex.matcher(str);
+ if (m.find()) {
+ resultStr = m.group(id);
+ }
+ }
+
+ if ("".equals(resultStr)) {
+ log.error(regex + " parser error!");
+ }
+
+ return resultStr;
+ }
+
+ public static String getStrByPattern(String str, String regex) {
+ Pattern pattern = Pattern.compile(regex);
+ Matcher m = pattern.matcher(str);
+ return m.find() ? m.group(0) : "";
+ }
+
+}
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
new file mode 100644
index 0000000..5caef31
--- /dev/null
+++ b/src/main/resources/application.yml
@@ -0,0 +1,48 @@
+server:
+ port: 7080
+spring:
+ application:
+ name: 文字识别
+ boot:
+ admin:
+ client:
+ health:
+ timeout: 10s
+ url: http://172.16.12.55:8001
+ instance:
+ service-base-url: http://172.16.12.55:7088
+ kafka:
+ bootstrap-servers: 172.26.28.30:9092
+ producer:
+ retries: 3
+ acks: all
+ batch-size: 4096
+ buffer-memory: 102476800
+ key-serializer: org.apache.kafka.common.serialization.StringSerializer
+ value-serializer: org.apache.kafka.common.serialization.StringSerializer
+
+
+logging:
+ file:
+ path: ./logs
+
+management:
+ endpoints:
+ web:
+ exposure:
+ include: "*"
+ endpoint:
+ health:
+ show-details: always
+
+send:
+ topic: analyze0912
+
+file:
+ path: D:\\ocr\\
+ uploadUrl: http://172.18.1.130:9985/group33/upload
+ ocrApi:
+thread:
+ handler: 1
+ send: 1
+
diff --git a/src/test/java/com/bfd/crawl/ocrhandler/OcrHandlerApplicationTests.java b/src/test/java/com/bfd/crawl/ocrhandler/OcrHandlerApplicationTests.java
new file mode 100644
index 0000000..b411ff8
--- /dev/null
+++ b/src/test/java/com/bfd/crawl/ocrhandler/OcrHandlerApplicationTests.java
@@ -0,0 +1,17 @@
+package com.bfd.crawl.ocrhandler;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.boot.test.context.SpringBootTest;
+
+import java.util.ArrayList;
+import java.util.List;
+
+class OcrHandlerApplicationTests {
+
+ @Test
+ void contextLoads() {
+ }
+
+
+
+}
diff --git a/src/test/java/com/bfd/crawl/ocrhandler/PdfToImageConverter.java b/src/test/java/com/bfd/crawl/ocrhandler/PdfToImageConverter.java
new file mode 100644
index 0000000..a47bf65
--- /dev/null
+++ b/src/test/java/com/bfd/crawl/ocrhandler/PdfToImageConverter.java
@@ -0,0 +1,42 @@
+package com.bfd.crawl.ocrhandler;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+
+import javax.imageio.ImageIO;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+/**
+ * @author:jinming
+ * @className:PdfToImageConverter
+ * @version:1.0
+ * @description:
+ * @Date:2023/7/28 16:20
+ */
+public class PdfToImageConverter {
+ public static void main(String[] args) {
+ String pdfFilePath = "D:\\迅雷下载\\73c3fd1d6a4eb54fe1c42d8bd963f03d.pdf"; // replace with the actual PDF file path
+ String outputFolder = "D:\\txt\\yilong\\"; // replace with the output folder for the generated images
+
+ try {
+ PDDocument document = PDDocument.load(new File(pdfFilePath));
+ PDFRenderer pdfRenderer = new PDFRenderer(document);
+
+ for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
+ BufferedImage bim = pdfRenderer.renderImageWithDPI(pageIndex, 300); // render at 300 DPI
+
+ String imageName = "page_" + (pageIndex + 1) + ".png"; // image file name
+ String imagePath = outputFolder + imageName; // full path of the image file
+
+ ImageIO.write(bim, "png", new File(imagePath));
+ }
+
+ document.close();
+ System.out.println("PDF 已成功拆分为图片!");
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.err.println("拆分 PDF 为图片时出现错误:" + e.getMessage());
+ }
+ }
+}
\ No newline at end of file