commit
e7325fdbb1
22 changed files with 1245 additions and 0 deletions
-
33.gitignore
-
7README.md
-
123pom.xml
-
13src/main/java/com/bfd/crawl/ocrhandler/OcrHandlerApplication.java
-
59src/main/java/com/bfd/crawl/ocrhandler/bean/ResponsePo.java
-
48src/main/java/com/bfd/crawl/ocrhandler/config/AsyncThreadConfiguration.java
-
46src/main/java/com/bfd/crawl/ocrhandler/controller/DataFilterController.java
-
32src/main/java/com/bfd/crawl/ocrhandler/enums/ResponseCode.java
-
81src/main/java/com/bfd/crawl/ocrhandler/service/FileProcessingService.java
-
164src/main/java/com/bfd/crawl/ocrhandler/service/HandlerService.java
-
50src/main/java/com/bfd/crawl/ocrhandler/service/SendService.java
-
63src/main/java/com/bfd/crawl/ocrhandler/service/StartServcie.java
-
60src/main/java/com/bfd/crawl/ocrhandler/util/DataUtil.java
-
117src/main/java/com/bfd/crawl/ocrhandler/util/FileDownloader.java
-
42src/main/java/com/bfd/crawl/ocrhandler/util/FileUtil.java
-
64src/main/java/com/bfd/crawl/ocrhandler/util/OcrUtil.java
-
23src/main/java/com/bfd/crawl/ocrhandler/util/OsUtil.java
-
19src/main/java/com/bfd/crawl/ocrhandler/util/QueueUtil.java
-
94src/main/java/com/bfd/crawl/ocrhandler/util/StringUtil.java
-
48src/main/resources/application.yml
-
17src/test/java/com/bfd/crawl/ocrhandler/OcrHandlerApplicationTests.java
-
42src/test/java/com/bfd/crawl/ocrhandler/PdfToImageConverter.java
@ -0,0 +1,33 @@ |
|||
HELP.md |
|||
target/ |
|||
!.mvn/wrapper/maven-wrapper.jar |
|||
!**/src/main/**/target/ |
|||
!**/src/test/**/target/ |
|||
|
|||
### STS ### |
|||
.apt_generated |
|||
.classpath |
|||
.factorypath |
|||
.project |
|||
.settings |
|||
.springBeans |
|||
.sts4-cache |
|||
|
|||
### IntelliJ IDEA ### |
|||
.idea |
|||
*.iws |
|||
*.iml |
|||
*.ipr |
|||
|
|||
### NetBeans ### |
|||
/nbproject/private/ |
|||
/nbbuild/ |
|||
/dist/ |
|||
/nbdist/ |
|||
/.nb-gradle/ |
|||
build/ |
|||
!**/src/main/**/build/ |
|||
!**/src/test/**/build/ |
|||
|
|||
### VS Code ### |
|||
.vscode/ |
@ -0,0 +1,7 @@ |
|||
# ocr文本识别 |
|||
|
|||
### 已支持如下功能: |
|||
|
|||
- PDF的拆分解析识别 |
|||
- doc以及docx的内容识别 |
|||
- 图片类的识别 |
@ -0,0 +1,123 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
|||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> |
|||
<modelVersion>4.0.0</modelVersion> |
|||
<parent> |
|||
<groupId>org.springframework.boot</groupId> |
|||
<artifactId>spring-boot-starter-parent</artifactId> |
|||
<version>2.2.4.RELEASE</version> |
|||
<relativePath/> <!-- lookup parent from repository --> |
|||
</parent> |
|||
<groupId>com.bfd.crawl</groupId> |
|||
<artifactId>ocrHandler</artifactId> |
|||
<version>0.0.1-SNAPSHOT</version> |
|||
<name>ocrHandler</name> |
|||
<description>ocrHandler</description> |
|||
<properties> |
|||
<log4j2.version>2.17.2</log4j2.version> |
|||
<java.version>1.8</java.version> |
|||
</properties> |
|||
<dependencies> |
|||
|
|||
<!-- https://mvnrepository.com/artifact/de.codecentric/spring-boot-admin-starter-client --> |
|||
<dependency> |
|||
<groupId>de.codecentric</groupId> |
|||
<artifactId>spring-boot-admin-client</artifactId> |
|||
<version>2.2.4</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.springframework.kafka</groupId> |
|||
<artifactId>spring-kafka</artifactId> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.springframework.boot</groupId> |
|||
<artifactId>spring-boot-starter</artifactId> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.springframework.boot</groupId> |
|||
<artifactId>spring-boot-starter-web</artifactId> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.springframework.boot</groupId> |
|||
<artifactId>spring-boot-devtools</artifactId> |
|||
<scope>runtime</scope> |
|||
<optional>true</optional> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.projectlombok</groupId> |
|||
<artifactId>lombok</artifactId> |
|||
<optional>true</optional> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.springframework.boot</groupId> |
|||
<artifactId>spring-boot-starter-test</artifactId> |
|||
<scope>test</scope> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.pdfbox</groupId> |
|||
<artifactId>pdfbox</artifactId> |
|||
<version>2.0.28</version> |
|||
</dependency> |
|||
<!-- Apache POI --> |
|||
<dependency> |
|||
<groupId>org.apache.poi</groupId> |
|||
<artifactId>poi-scratchpad</artifactId> |
|||
<version>5.2.0</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.poi</groupId> |
|||
<artifactId>poi</artifactId> |
|||
<version>5.2.0</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.poi</groupId> |
|||
<artifactId>poi-ooxml</artifactId> |
|||
<version>5.2.0</version> |
|||
</dependency> |
|||
<!--JSON--> |
|||
<dependency> |
|||
<groupId>com.alibaba</groupId> |
|||
<artifactId>fastjson</artifactId> |
|||
<version>2.0.17</version> |
|||
</dependency> |
|||
<!--OKHTTP--> |
|||
<dependency> |
|||
<groupId>com.squareup.okhttp3</groupId> |
|||
<artifactId>okhttp</artifactId> |
|||
<version>3.9.1</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>com.google.code.gson</groupId> |
|||
<artifactId>gson</artifactId> |
|||
<version>2.8.8</version> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.apache.kafka</groupId> |
|||
<artifactId>kafka-clients</artifactId> |
|||
<version>2.3.1</version> <!--根据您正在使用的Kafka版本选择合适的版本号--> |
|||
</dependency> |
|||
<dependency> |
|||
<groupId>org.springframework.kafka</groupId> |
|||
<artifactId>spring-kafka-test</artifactId> |
|||
<scope>test</scope> |
|||
</dependency> |
|||
</dependencies> |
|||
|
|||
<build> |
|||
<plugins> |
|||
<plugin> |
|||
<groupId>org.springframework.boot</groupId> |
|||
<artifactId>spring-boot-maven-plugin</artifactId> |
|||
<configuration> |
|||
<excludes> |
|||
<exclude> |
|||
<groupId>org.projectlombok</groupId> |
|||
<artifactId>lombok</artifactId> |
|||
</exclude> |
|||
</excludes> |
|||
</configuration> |
|||
</plugin> |
|||
</plugins> |
|||
</build> |
|||
|
|||
</project> |
@ -0,0 +1,13 @@ |
|||
package com.bfd.crawl.ocrhandler; |
|||
|
|||
import org.springframework.boot.SpringApplication; |
|||
import org.springframework.boot.autoconfigure.SpringBootApplication; |
|||
|
|||
@SpringBootApplication |
|||
public class OcrHandlerApplication { |
|||
|
|||
public static void main(String[] args) { |
|||
SpringApplication.run(OcrHandlerApplication.class, args); |
|||
} |
|||
|
|||
} |
@ -0,0 +1,59 @@ |
|||
package com.bfd.crawl.ocrhandler.bean; |
|||
|
|||
|
|||
import com.bfd.crawl.ocrhandler.enums.ResponseCode; |
|||
import lombok.AllArgsConstructor; |
|||
import lombok.Data; |
|||
import lombok.NoArgsConstructor; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:ResponsePo |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/4/3 17:23 |
|||
*/ |
|||
@Data |
|||
@NoArgsConstructor |
|||
@AllArgsConstructor |
|||
public class ResponsePo { |
|||
/** |
|||
* 响应码 |
|||
*/ |
|||
private int code; |
|||
|
|||
/** |
|||
* 正常放 返回数据 的JSON串 |
|||
*/ |
|||
private Object data; |
|||
|
|||
/** |
|||
* 提示消息 |
|||
*/ |
|||
private String message; |
|||
|
|||
public static ResponsePo success() { |
|||
return setStatus(ResponseCode.SUCCESS.getCode(), ResponseCode.SUCCESS.getMessage()); |
|||
} |
|||
|
|||
public static ResponsePo error() { |
|||
return setStatus(ResponseCode.FAILURE.getCode(), ResponseCode.FAILURE.getMessage()); |
|||
} |
|||
|
|||
public static ResponsePo setStatus(int code, String message) { |
|||
ResponsePo resultBean = new ResponsePo(); |
|||
resultBean.code = code; |
|||
resultBean.message = message; |
|||
return resultBean; |
|||
} |
|||
public ResponsePo(int code, String message) { |
|||
this.code = code; |
|||
this.message = message; |
|||
this.data = data; |
|||
} |
|||
public ResponsePo(ResponseCode responseCode){ |
|||
this.code = responseCode.getCode(); |
|||
this.message = responseCode.getMessage(); |
|||
this.data = data; |
|||
} |
|||
} |
@ -0,0 +1,48 @@ |
|||
package com.bfd.crawl.ocrhandler.config; |
|||
|
|||
|
|||
import org.springframework.context.annotation.Bean; |
|||
import org.springframework.context.annotation.Configuration; |
|||
import org.springframework.scheduling.annotation.EnableAsync; |
|||
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; |
|||
|
|||
import java.util.concurrent.Executor; |
|||
|
|||
/** |
|||
* @author jinming |
|||
* @version 1.0 |
|||
* @className AsyncThreadConfiguration |
|||
* @Date 2022/2/17 18:37 |
|||
*/ |
|||
@Configuration |
|||
@EnableAsync |
|||
public class AsyncThreadConfiguration { |
|||
@Bean |
|||
public Executor asyncExecutor() { |
|||
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); |
|||
// 核心线程数 |
|||
executor.setCorePoolSize(500); |
|||
// 并发线程的数量限制为2 |
|||
executor.setMaxPoolSize(500); |
|||
// 线程队列 |
|||
executor.setQueueCapacity(500); |
|||
executor.setThreadNamePrefix("handlerData-"); |
|||
executor.initialize(); |
|||
executor.setWaitForTasksToCompleteOnShutdown(true); |
|||
return executor; |
|||
} |
|||
@Bean |
|||
public Executor sendExecutor() { |
|||
ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); |
|||
// 核心线程数 |
|||
executor.setCorePoolSize(500); |
|||
// 并发线程的数量限制为2 |
|||
executor.setMaxPoolSize(500); |
|||
// 线程队列 |
|||
executor.setQueueCapacity(500); |
|||
executor.setThreadNamePrefix("sendData-"); |
|||
executor.initialize(); |
|||
executor.setWaitForTasksToCompleteOnShutdown(true); |
|||
return executor; |
|||
} |
|||
} |
@ -0,0 +1,46 @@ |
|||
package com.bfd.crawl.ocrhandler.controller; |
|||
|
|||
|
|||
import com.alibaba.fastjson.JSON; |
|||
import com.bfd.crawl.ocrhandler.bean.ResponsePo; |
|||
import com.bfd.crawl.ocrhandler.enums.ResponseCode; |
|||
import com.bfd.crawl.ocrhandler.util.QueueUtil; |
|||
import lombok.extern.slf4j.Slf4j; |
|||
import org.springframework.web.bind.annotation.PostMapping; |
|||
import org.springframework.web.bind.annotation.RequestBody; |
|||
import org.springframework.web.bind.annotation.RequestMapping; |
|||
import org.springframework.web.bind.annotation.RestController; |
|||
|
|||
|
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:DataFilterController |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/7/26 11:21 |
|||
*/ |
|||
@RestController |
|||
@RequestMapping("/handlerdata") |
|||
@Slf4j |
|||
public class DataFilterController { |
|||
@PostMapping("/ocr") |
|||
public ResponsePo documentFeedback(@RequestBody String dataJson) { |
|||
|
|||
ResponsePo responsePo = ResponsePo.success(); |
|||
try { |
|||
Map parse = (Map) JSON.parse(dataJson); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
log.error("请求格式发生异常" + e.getMessage()); |
|||
responsePo.setCode(ResponseCode.FAILURE.getCode()); |
|||
responsePo.setMessage(ResponseCode.FAILURE.getMessage()); |
|||
return responsePo; |
|||
} |
|||
log.info("新增任务:"+dataJson); |
|||
QueueUtil.taskQueue.add(dataJson); |
|||
|
|||
return responsePo; |
|||
} |
|||
} |
@ -0,0 +1,32 @@ |
|||
package com.bfd.crawl.ocrhandler.enums; |
|||
|
|||
/**
 * Result codes used in API responses, each paired with a default message.
 *
 * @author jinming
 * @version 1.0
 */
public enum ResponseCode {

    /** Operation succeeded. */
    SUCCESS(200, "操作成功"),

    /** Invalid request parameters. */
    FAILURE(400, "参数错误"),

    /** Internal server error. */
    INTERNAL_SERVER_ERROR(500, "服务器内部错误"),

    /** The file type is not supported. */
    TYPE_NOT_SUPPORT(601, "文件类型不支持");

    /** Numeric code of this result. */
    private final int code;

    /** Default message of this result. */
    private final String message;

    ResponseCode(int code, String message) {
        this.code = code;
        this.message = message;
    }

    /**
     * @return the numeric code
     */
    public int getCode() {
        return this.code;
    }

    /**
     * @return the default message
     */
    public String getMessage() {
        return this.message;
    }
}
@ -0,0 +1,81 @@ |
|||
package com.bfd.crawl.ocrhandler.service; |
|||
|
|||
import java.awt.image.BufferedImage; |
|||
import java.io.File; |
|||
import java.io.FileInputStream; |
|||
import java.io.IOException; |
|||
import java.io.InputStream; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
import javax.imageio.ImageIO; |
|||
|
|||
import lombok.extern.slf4j.Slf4j; |
|||
import org.apache.pdfbox.pdmodel.PDDocument; |
|||
import org.apache.pdfbox.rendering.PDFRenderer; |
|||
import org.apache.poi.hwpf.HWPFDocument; |
|||
import org.apache.poi.hwpf.extractor.WordExtractor; |
|||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; |
|||
import org.apache.poi.xwpf.usermodel.XWPFDocument; |
|||
import org.springframework.beans.factory.annotation.Value; |
|||
import org.springframework.stereotype.Service; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:PdfToImageConverterService |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/7/28 17:20 |
|||
*/ |
|||
@Service |
|||
@Slf4j |
|||
public class FileProcessingService { |
|||
|
|||
public void converterPdfToImg(String fileName, String outputFolder) { |
|||
try { |
|||
PDDocument document = PDDocument.load(new File(fileName)); |
|||
PDFRenderer pdfRenderer = new PDFRenderer(document); |
|||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { |
|||
// 设置 DPI(分辨率) |
|||
BufferedImage bim = pdfRenderer.renderImageWithDPI(pageIndex, 300); |
|||
// 图片文件名 |
|||
String imageName = "page_" + (pageIndex + 1) + ".png"; |
|||
// 完整的图片文件路径 |
|||
String imagePath = outputFolder + imageName; |
|||
File file = new File(imagePath); |
|||
|
|||
if (!file.getParentFile().exists()) { |
|||
file.getParentFile().mkdirs(); |
|||
} |
|||
|
|||
ImageIO.write(bim, "png", file); |
|||
} |
|||
document.close(); |
|||
System.out.println("PDF 已成功拆分为图片!"); |
|||
} catch (Exception e) { |
|||
// e.printStackTrace(); |
|||
System.err.println("拆分 PDF 为图片时出现错误:" + e.getMessage()); |
|||
} |
|||
} |
|||
|
|||
public String readWordFile(String filePath) throws IOException { |
|||
InputStream inputStream = new FileInputStream(filePath); |
|||
String fileTypeDoc = "doc"; |
|||
String fileTypeDocx = "docx"; |
|||
if (filePath.endsWith(fileTypeDoc)) { |
|||
try (HWPFDocument document = new HWPFDocument(inputStream)) { |
|||
WordExtractor extractor = new WordExtractor(document); |
|||
return extractor.getText(); |
|||
} |
|||
} else if (filePath.endsWith(fileTypeDocx)) { |
|||
try (XWPFDocument document = new XWPFDocument(inputStream)) { |
|||
XWPFWordExtractor extractor = new XWPFWordExtractor(document); |
|||
return extractor.getText(); |
|||
} |
|||
} else { |
|||
throw new IllegalArgumentException("Unsupported file format"); |
|||
} |
|||
} |
|||
|
|||
|
|||
} |
164
src/main/java/com/bfd/crawl/ocrhandler/service/HandlerService.java
File diff suppressed because it is too large
View File
File diff suppressed because it is too large
View File
@ -0,0 +1,50 @@ |
|||
package com.bfd.crawl.ocrhandler.service; |
|||
|
|||
|
|||
import com.bfd.crawl.ocrhandler.util.QueueUtil; |
|||
import lombok.extern.slf4j.Slf4j; |
|||
import org.springframework.beans.factory.annotation.Autowired; |
|||
import org.springframework.beans.factory.annotation.Value; |
|||
import org.springframework.kafka.core.KafkaTemplate; |
|||
import org.springframework.scheduling.annotation.Async; |
|||
import org.springframework.stereotype.Service; |
|||
|
|||
import javax.annotation.Resource; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:SendService |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/7/31 17:53 |
|||
*/ |
|||
@Slf4j |
|||
@Service |
|||
public class SendService { |
|||
@Value("${send.topic}") |
|||
private String topic; |
|||
|
|||
@Resource |
|||
private KafkaTemplate kafkaTemplate; |
|||
|
|||
@Async("sendExecutor") |
|||
void sendToKafka() { |
|||
while (true) { |
|||
if (QueueUtil.sendQueue.size() > 0) { |
|||
try { |
|||
String message = QueueUtil.sendQueue.take(); |
|||
kafkaTemplate.send(topic,message); |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
}else { |
|||
log.info("任务队列为空,休眠3秒"); |
|||
try { |
|||
Thread.sleep(3000); |
|||
} catch (InterruptedException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,63 @@ |
|||
package com.bfd.crawl.ocrhandler.service; |
|||
|
|||
|
|||
import com.bfd.crawl.ocrhandler.util.QueueUtil; |
|||
import lombok.extern.slf4j.Slf4j; |
|||
import org.springframework.beans.factory.annotation.Autowired; |
|||
import org.springframework.beans.factory.annotation.Value; |
|||
import org.springframework.boot.ApplicationArguments; |
|||
import org.springframework.boot.ApplicationRunner; |
|||
import org.springframework.stereotype.Service; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:StartServcie |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/7/31 17:14 |
|||
*/ |
|||
@Service |
|||
@Slf4j |
|||
public class StartServcie implements ApplicationRunner { |
|||
@Value("${thread.handler}") |
|||
private int handlerNumber; |
|||
@Value("${thread.send}") |
|||
private int sendNumber; |
|||
|
|||
@Autowired |
|||
private HandlerService handlerService; |
|||
@Autowired |
|||
private SendService sendService; |
|||
|
|||
@Override |
|||
public void run(ApplicationArguments args) throws Exception { |
|||
for (int i = 0; i < handlerNumber; i++) { |
|||
log.info("处理服务线程" + i + "已启动 "); |
|||
handlerService.run(); |
|||
} |
|||
for (int i = 0; i < sendNumber; i++) { |
|||
log.info("发送服务线程" + i + "已启动 "); |
|||
sendService.sendToKafka(); |
|||
} |
|||
Runnable myRunnable = new Runnable() { |
|||
@Override |
|||
public void run() { |
|||
while (true) { |
|||
log.info("任务队列长度为" + QueueUtil.taskQueue.size()); |
|||
log.info("发送队列长度为" + QueueUtil.sendQueue.size()); |
|||
try { |
|||
Thread.sleep(10000); |
|||
} catch (InterruptedException e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
} |
|||
}; |
|||
// 创建一个新的线程,并将Runnable对象传递给Thread构造函数 |
|||
Thread myThread = new Thread(myRunnable); |
|||
// 启动线程 |
|||
myThread.start(); |
|||
|
|||
|
|||
} |
|||
} |
@ -0,0 +1,60 @@ |
|||
package com.bfd.crawl.ocrhandler.util; |
|||
|
|||
import com.alibaba.fastjson.JSON; |
|||
import com.alibaba.fastjson.JSONObject; |
|||
import com.alibaba.fastjson.JSONPath; |
|||
import lombok.extern.slf4j.Slf4j; |
|||
|
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:DataUtil |
|||
* @version:1.0 |
|||
* @description: 获取dataValue的值 |
|||
* @Date:2023/11/1 9:54 |
|||
*/ |
|||
@Slf4j |
|||
public class DataUtil { |
|||
/** |
|||
* @param key 传入的key |
|||
* @param dataMap 数据map |
|||
* @return 根据传入的参数进行判断解析,返回正确的dataValue |
|||
*/ |
|||
public static Object getValue(String key, Map dataMap) { |
|||
try { |
|||
//公式为空直接就返回 |
|||
if (!StringUtil.hasValue(key)) { |
|||
return ""; |
|||
} |
|||
Object dataValue; |
|||
String isJson = "#json#"; |
|||
if (key.contains(isJson)) { |
|||
//进行第一次拆分,获取#json#前面的部分 |
|||
String[] keySplit = key.split(isJson); |
|||
String firstDataKey = keySplit[0]; |
|||
String[] firstDataKeySplit = firstDataKey.split(":"); |
|||
//取出前半部分对应的JSON数据并转换为JSONObject |
|||
String dataJson = (String) dataMap.get(firstDataKeySplit[0]); |
|||
JSONObject dataJsonObject = JSON.parseObject(dataJson); |
|||
//根据key的后半部分取出对应JSONObject中的值 |
|||
String firstDataKeyJson = (String) JSONPath.eval(dataJsonObject, firstDataKeySplit[1]); |
|||
String secDataKey = keySplit[1]; |
|||
JSONObject firstDataJsonObject = JSON.parseObject(firstDataKeyJson); |
|||
dataValue = JSONPath.eval(firstDataJsonObject, secDataKey); |
|||
return dataValue; |
|||
} |
|||
String[] keySplit = key.split(":"); |
|||
String jsonPath = keySplit[1]; |
|||
String dataJson = (String) dataMap.get(keySplit[0]); |
|||
JSONObject dataJsonObject = JSON.parseObject(dataJson); |
|||
dataValue = JSONPath.eval(dataJsonObject, jsonPath); |
|||
return dataValue; |
|||
} catch (Exception e) { |
|||
// TODO: handle exception |
|||
log.error("jsonpath公式取值异常,", e); |
|||
return null; |
|||
} |
|||
|
|||
} |
|||
} |
@ -0,0 +1,117 @@ |
|||
package com.bfd.crawl.ocrhandler.util; |
|||
|
|||
import com.alibaba.fastjson.JSON; |
|||
import okhttp3.*; |
|||
|
|||
import java.io.File; |
|||
import java.io.FileOutputStream; |
|||
import java.io.IOException; |
|||
import java.io.InputStream; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:FileDownloader |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/8/1 16:31 |
|||
*/ |
|||
public class FileDownloader { |
|||
private static OkHttpClient okHttpClient; |
|||
|
|||
private static OkHttpClient getOkHttpClient() { |
|||
if (okHttpClient == null) { |
|||
okHttpClient = new OkHttpClient(); |
|||
} |
|||
return okHttpClient; |
|||
} |
|||
|
|||
public static void downloadFile(String url, File destination) throws IOException { |
|||
OkHttpClient client = getOkHttpClient(); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.build(); |
|||
|
|||
try (Response response = client.newCall(request).execute()) { |
|||
if (!response.isSuccessful()) { |
|||
throw new IOException("Failed to download file: " + response); |
|||
} |
|||
|
|||
ResponseBody body = response.body(); |
|||
if (body == null) { |
|||
throw new IOException("Response body is null"); |
|||
} |
|||
if (!destination.getParentFile().exists()) { |
|||
|
|||
destination.getParentFile().mkdirs(); |
|||
} |
|||
try (InputStream inputStream = body.byteStream(); |
|||
FileOutputStream outputStream = new FileOutputStream(destination)) { |
|||
byte[] buffer = new byte[8192]; |
|||
int bytesRead; |
|||
while ((bytesRead = inputStream.read(buffer)) != -1) { |
|||
outputStream.write(buffer, 0, bytesRead); |
|||
} |
|||
outputStream.flush(); |
|||
} |
|||
} |
|||
} |
|||
|
|||
public static Map<String, String> uploadFile(String url, String filePath) throws Exception { |
|||
File file = new File(filePath); |
|||
|
|||
Map returnMap = new HashMap(32); |
|||
OkHttpClient client = getOkHttpClient(); |
|||
// 设置文件上传的媒体类型 |
|||
MediaType mediaType = MediaType.parse("application/octet-stream"); |
|||
// 创建请求体,将文件添加到请求体中 |
|||
RequestBody requestBody = RequestBody.create(mediaType, file); |
|||
|
|||
// 创建多部分请求体,用于上传文件 |
|||
MultipartBody multipartBody = new MultipartBody.Builder() |
|||
.setType(MultipartBody.FORM) |
|||
.addFormDataPart("file", file.getName(), requestBody) |
|||
.build(); |
|||
// 创建上传文件的请求 |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.post(multipartBody) |
|||
.build(); |
|||
|
|||
try (Response response = client.newCall(request).execute()) { |
|||
if (!response.isSuccessful()) { |
|||
throw new IOException("Failed to upload file: " + response); |
|||
} |
|||
String html = response.body().string(); |
|||
|
|||
try { |
|||
Map parse = (Map) JSON.parse(html); |
|||
Map data = (Map) parse.get("data"); |
|||
String domain = (String) data.get("domain"); |
|||
String src = (String) data.get("src"); |
|||
String fileUrl = domain.concat(src); |
|||
returnMap.put("fileUrl", fileUrl); |
|||
} catch (Exception e) { |
|||
returnMap.put("fileUrl", html); |
|||
} |
|||
// 处理上传成功的响应 |
|||
System.out.println("File uploaded successfully!"); |
|||
} |
|||
return returnMap; |
|||
} |
|||
|
|||
public static void deleteFile(String url, String md5) throws Exception { |
|||
OkHttpClient client = getOkHttpClient(); |
|||
url = url.concat("delete?md5=").concat(md5); |
|||
Request request = new Request.Builder() |
|||
.url(url) |
|||
.get() |
|||
.build(); |
|||
try (Response response = client.newCall(request).execute()) { |
|||
if (!response.isSuccessful()) { |
|||
throw new IOException("Failed to upload file: " + response); |
|||
} |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,42 @@ |
|||
package com.bfd.crawl.ocrhandler.util; |
|||
|
|||
import java.io.File; |
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:FileUtil |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/8/2 10:57 |
|||
*/ |
|||
public class FileUtil { |
|||
|
|||
public static void main(String[] args) { |
|||
System.out.println(traverseAndReturnFilePath("D:\\\\ocr\\\\305ce27d7a05770456fdc09d0b3044f7\\\\")); |
|||
} |
|||
|
|||
public static List<String> traverseAndReturnFilePath(String folderPath) { |
|||
List<String> fileList = new ArrayList<>(); |
|||
File folder = new File(folderPath); |
|||
// 检查文件夹是否存在并且是一个文件夹 |
|||
if (folder.exists() && folder.isDirectory()) { |
|||
// 获取文件夹中的所有文件和子文件夹 |
|||
File[] files = folder.listFiles(); |
|||
if (files != null) { |
|||
for (File file : files) { |
|||
if (file.isFile()) { |
|||
// 如果是文件,则输出全路径 |
|||
fileList.add(file.getAbsolutePath()); |
|||
} else if (file.isDirectory()) { |
|||
|
|||
} |
|||
} |
|||
} |
|||
} else { |
|||
System.out.println("指定的路径不是一个文件夹或文件夹不存在。"); |
|||
} |
|||
return fileList; |
|||
} |
|||
} |
@ -0,0 +1,64 @@ |
|||
package com.bfd.crawl.ocrhandler.util; |
|||
|
|||
import com.alibaba.fastjson.JSON; |
|||
import okhttp3.*; |
|||
|
|||
import java.io.File; |
|||
import java.io.FileOutputStream; |
|||
import java.io.IOException; |
|||
import java.io.InputStream; |
|||
import java.util.HashMap; |
|||
import java.util.Map; |
|||
import java.util.concurrent.TimeUnit; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:ocrUtil |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/8/1 16:38 |
|||
*/ |
|||
public class OcrUtil { |
|||
private static OkHttpClient okHttpClient; |
|||
|
|||
private static OkHttpClient getOkHttpClient() { |
|||
if (okHttpClient == null) { |
|||
okHttpClient = new OkHttpClient(); |
|||
} |
|||
return okHttpClient; |
|||
} |
|||
|
|||
public static String doOcr(String url,String ocrApi) { |
|||
String text = ""; |
|||
int reTryTimes = 3; |
|||
for (int i = 0; i < reTryTimes; i++) { |
|||
int okCode = 200; |
|||
OkHttpClient client = getOkHttpClient(); |
|||
OkHttpClient.Builder builder = client.newBuilder().writeTimeout(600, TimeUnit.SECONDS).connectTimeout(600, TimeUnit.SECONDS).readTimeout(600, TimeUnit.SECONDS); |
|||
client = builder.build(); |
|||
MediaType mediaType = MediaType.parse("application/json"); |
|||
RequestBody body = RequestBody.create(mediaType, "{\"id\":\"\",\"url\":\"" + url + "\"}"); |
|||
Request request = new Request.Builder() |
|||
.url(ocrApi) |
|||
.method("POST", body) |
|||
.addHeader("Content-Type", "application/json") |
|||
.build(); |
|||
try { |
|||
Response response = client.newCall(request).execute(); |
|||
String html = response.body().string(); |
|||
Map dataMap = (Map) JSON.parse(html); |
|||
int code = (int) dataMap.get("code"); |
|||
if (code == okCode) { |
|||
text = (String) dataMap.get("text"); |
|||
} |
|||
if (StringUtil.hasValue(text)) { |
|||
break; |
|||
} |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
|
|||
return text; |
|||
} |
|||
} |
@ -0,0 +1,23 @@ |
|||
package com.bfd.crawl.ocrhandler.util; |
|||
|
|||
import lombok.extern.slf4j.Slf4j; |
|||
|
|||
import java.util.Locale; |
|||
|
|||
/** |
|||
* @author:jinming |
|||
* @className:OsUtil |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/4/23 9:40 |
|||
*/ |
|||
@Slf4j |
|||
public class OsUtil { |
|||
public static Boolean isWindows() { |
|||
String windows = "windows"; |
|||
String osName = System.getProperty("os.name").toLowerCase(Locale.ROOT); |
|||
log.info("osName = " + osName); |
|||
return osName.contains(windows); |
|||
} |
|||
|
|||
} |
@ -0,0 +1,19 @@ |
|||
package com.bfd.crawl.ocrhandler.util; |
|||
|
|||
import java.util.concurrent.LinkedBlockingDeque; |
|||
|
|||
/**
 * Process-wide in-memory queues shared between the controller and the worker services.
 * Both queues are unbounded.
 *
 * @author jinming
 * @version 1.0
 */
public class QueueUtil {

    /** Incoming tasks (raw JSON strings) waiting to be processed. */
    public static LinkedBlockingDeque<String> taskQueue = new LinkedBlockingDeque<>();

    /** Messages waiting to be published. */
    public static LinkedBlockingDeque<String> sendQueue = new LinkedBlockingDeque<>();
}
@ -0,0 +1,94 @@ |
|||
package com.bfd.crawl.ocrhandler.util; |
|||
|
|||
|
|||
import lombok.extern.slf4j.Slf4j; |
|||
|
|||
import java.security.MessageDigest; |
|||
import java.util.HashSet; |
|||
import java.util.Set; |
|||
import java.util.regex.Matcher; |
|||
import java.util.regex.Pattern; |
|||
|
|||
/** |
|||
* @author jinming |
|||
* @version 1.0 |
|||
* @className StringUtile |
|||
* @Date 2022/1/21 11:46 |
|||
*/ |
|||
@Slf4j |
|||
public class StringUtil { |
|||
public static boolean hasValue(String str) { |
|||
return str != null && !"".equals(str.trim()); |
|||
} |
|||
|
|||
public static String getRegexGroup(String regex, String str, int id) { |
|||
String resultStr = ""; |
|||
if (hasValue(str)) { |
|||
Pattern p = Pattern.compile(regex); |
|||
Matcher m = p.matcher(str); |
|||
if (m.find()) { |
|||
resultStr = m.group(id); |
|||
} |
|||
} |
|||
|
|||
if ("".equals(resultStr)) { |
|||
} |
|||
|
|||
return resultStr; |
|||
} |
|||
|
|||
public static Set<String> getEmailAddress(String message) { |
|||
Set<String> emailList = new HashSet<>(); |
|||
Pattern pattern = Pattern.compile("\\w+\\.?\\w+\\@\\w+\\.\\w+"); |
|||
Matcher m = pattern.matcher(message); |
|||
while (m.find()) { |
|||
emailList.add(m.group(0)); |
|||
} |
|||
return emailList; |
|||
} |
|||
public static String getMd5(String string) { |
|||
try { |
|||
MessageDigest md5 = MessageDigest.getInstance("MD5"); |
|||
byte[] bs = md5.digest(string.getBytes("UTF-8")); |
|||
StringBuilder sb = new StringBuilder(40); |
|||
for (byte x : bs) { |
|||
if ((x & 0xff) >> 4 == 0) { |
|||
sb.append("0").append(Integer.toHexString(x & 0xff)); |
|||
} else { |
|||
sb.append(Integer.toHexString(x & 0xff)); |
|||
} |
|||
} |
|||
return sb.toString(); |
|||
} catch (Exception e) { |
|||
//LOG.error("获取md5异常", e); |
|||
return "nceaform" + System.currentTimeMillis(); |
|||
} |
|||
} |
|||
|
|||
public static String removeAllHtmlTags(String str) { |
|||
return hasValue(str) ? str.replaceAll("<[^<>]+?>", "") : ""; |
|||
} |
|||
|
|||
public static String getRegexGroup(Pattern regex, String str, int id) { |
|||
String resultStr = ""; |
|||
if (hasValue(str)) { |
|||
Matcher m = regex.matcher(str); |
|||
if (m.find()) { |
|||
resultStr = m.group(id); |
|||
} |
|||
} |
|||
|
|||
if ("".equals(resultStr)) { |
|||
log.error(regex + " parser error!"); |
|||
} |
|||
|
|||
return resultStr; |
|||
} |
|||
|
|||
public static String getStrByPattern(String str, String regex) { |
|||
Pattern pattern = Pattern.compile(regex); |
|||
Matcher m = pattern.matcher(str); |
|||
return m.find() ? m.group(0) : ""; |
|||
} |
|||
|
|||
} |
@ -0,0 +1,48 @@ |
|||
server: |
|||
port: 7080 |
|||
spring: |
|||
application: |
|||
name: 文字识别 |
|||
boot: |
|||
admin: |
|||
client: |
|||
health: |
|||
timeout: 10s |
|||
url: http://172.16.12.55:8001 |
|||
instance: |
|||
service-base-url: http://172.16.12.55:7088 |
|||
kafka: |
|||
bootstrap-servers: 172.26.28.30:9092 |
|||
producer: |
|||
retries: 3 |
|||
acks: all |
|||
batch-size: 4096 |
|||
buffer-memory: 102476800 |
|||
key-serializer: org.apache.kafka.common.serialization.StringSerializer |
|||
value-serializer: org.apache.kafka.common.serialization.StringSerializer |
|||
|
|||
|
|||
logging: |
|||
file: |
|||
path: ./logs |
|||
|
|||
management: |
|||
endpoints: |
|||
web: |
|||
exposure: |
|||
include: "*" |
|||
endpoint: |
|||
health: |
|||
show-details: always |
|||
|
|||
send: |
|||
topic: analyze0912 |
|||
|
|||
file: |
|||
path: D:\\ocr\\ |
|||
uploadUrl: http://172.18.1.130:9985/group33/upload |
|||
ocrApi: |
|||
thread: |
|||
handler: 1 |
|||
send: 1 |
|||
|
@ -0,0 +1,17 @@ |
|||
package com.bfd.crawl.ocrhandler; |
|||
|
|||
import org.junit.jupiter.api.Test; |
|||
import org.springframework.boot.test.context.SpringBootTest; |
|||
|
|||
import java.util.ArrayList; |
|||
import java.util.List; |
|||
|
|||
class OcrHandlerApplicationTests { |
|||
|
|||
@Test |
|||
void contextLoads() { |
|||
} |
|||
|
|||
|
|||
|
|||
} |
@ -0,0 +1,42 @@ |
|||
package com.bfd.crawl.ocrhandler; |
|||
import java.awt.image.BufferedImage; |
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
|
|||
import javax.imageio.ImageIO; |
|||
|
|||
import org.apache.pdfbox.pdmodel.PDDocument; |
|||
import org.apache.pdfbox.rendering.PDFRenderer; |
|||
/** |
|||
* @author:jinming |
|||
* @className:PdfToImageConverter |
|||
* @version:1.0 |
|||
* @description: |
|||
* @Date:2023/7/28 16:20 |
|||
*/ |
|||
public class PdfToImageConverter { |
|||
public static void main(String[] args) { |
|||
String pdfFilePath = "D:\\迅雷下载\\73c3fd1d6a4eb54fe1c42d8bd963f03d.pdf"; // 替换为实际的 PDF 文件路径 |
|||
String outputFolder = "D:\\txt\\yilong\\"; // 替换为输出图片的文件夹路径 |
|||
|
|||
try { |
|||
PDDocument document = PDDocument.load(new File(pdfFilePath)); |
|||
PDFRenderer pdfRenderer = new PDFRenderer(document); |
|||
|
|||
for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { |
|||
BufferedImage bim = pdfRenderer.renderImageWithDPI(pageIndex, 300); // 设置 DPI(分辨率) |
|||
|
|||
String imageName = "page_" + (pageIndex + 1) + ".png"; // 图片文件名 |
|||
String imagePath = outputFolder + imageName; // 完整的图片文件路径 |
|||
|
|||
ImageIO.write(bim, "png", new File(imagePath)); |
|||
} |
|||
|
|||
document.close(); |
|||
System.out.println("PDF 已成功拆分为图片!"); |
|||
} catch (IOException e) { |
|||
e.printStackTrace(); |
|||
System.err.println("拆分 PDF 为图片时出现错误:" + e.getMessage()); |
|||
} |
|||
} |
|||
} |
Write
Preview
Loading…
Cancel
Save
Reference in new issue