Browse Source

文档解析应用

master
55007 6 months ago
commit
02bbbf8602
  1. 40
      .classpath
  2. 3
      .gitignore
  3. 23
      .project
  4. 5
      .settings/org.eclipse.core.resources.prefs
  5. 9
      .settings/org.eclipse.jdt.core.prefs
  6. 4
      .settings/org.eclipse.m2e.core.prefs
  7. 1
      README.md
  8. 262
      pom.xml
  9. 26
      src/main/java/com/bfd/parse/Application.java
  10. 35
      src/main/java/com/bfd/parse/cache/ConfigCache.java
  11. 39
      src/main/java/com/bfd/parse/controller/FileExecController.java
  12. 176
      src/main/java/com/bfd/parse/entity/Constants.java
  13. 111
      src/main/java/com/bfd/parse/handler/MainHandler.java
  14. 185
      src/main/java/com/bfd/parse/process/FileTaskProcess.java
  15. 15
      src/main/java/com/bfd/parse/service/FileExecService.java
  16. 55
      src/main/java/com/bfd/parse/service/impl/FileExecServiceImpl.java
  17. 63
      src/main/java/com/bfd/parse/utils/DataUtil.java
  18. 177
      src/main/java/com/bfd/parse/utils/DateUtil.java
  19. 1017
      src/main/java/com/bfd/parse/utils/DownLoadUtil.java
  20. 27
      src/main/java/com/bfd/parse/utils/EncryptionUtil.java
  21. 184
      src/main/java/com/bfd/parse/utils/ExcelUtils.java
  22. 36
      src/main/java/com/bfd/parse/utils/FileUtil.java
  23. 32
      src/main/java/com/bfd/parse/utils/JsonUtil.java
  24. 65
      src/main/java/com/bfd/parse/utils/OcrUtil.java
  25. 33
      src/main/java/com/bfd/parse/utils/OtherUtils.java
  26. 93
      src/main/java/com/bfd/parse/utils/PptUtil.java
  27. 18
      src/main/java/com/bfd/parse/utils/QueueUtil.java
  28. 46
      src/main/java/com/bfd/parse/utils/SpringBootKafka.java
  29. 23
      src/main/java/com/bfd/parse/utils/ThrowMessageUtil.java
  30. 94
      src/main/resources/application.yml
  31. 36
      src/main/resources/logback-spring.xml

40
.classpath

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse JDT classpath for this Maven project. Every entry carries the
     maven.pomderived marker, i.e. it mirrors pom.xml rather than being hand-maintained. -->
<classpath>
  <!-- Main Java sources -->
  <classpathentry kind="src" output="target/classes" path="src/main/java">
    <attributes>
      <attribute name="optional" value="true"/>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>

  <!-- Test Java sources -->
  <classpathentry kind="src" output="target/test-classes" path="src/test/java">
    <attributes>
      <attribute name="optional" value="true"/>
      <attribute name="maven.pomderived" value="true"/>
      <attribute name="test" value="true"/>
    </attributes>
  </classpathentry>

  <!-- Main resources: excluded from JDT compilation via excluding="**" -->
  <classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
      <attribute name="optional" value="true"/>
    </attributes>
  </classpathentry>

  <!-- Java SE 1.8 runtime container -->
  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>

  <!-- Maven dependency container -->
  <classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>

  <!-- Test resources: excluded from JDT compilation via excluding="**" -->
  <classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
      <attribute name="test" value="true"/>
      <attribute name="optional" value="true"/>
    </attributes>
  </classpathentry>

  <!-- Default output folder -->
  <classpathentry kind="output" path="target/classes"/>
</classpath>

3
.gitignore

@ -0,0 +1,3 @@
/target/
/logs/
/.idea/

23
.project

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse project descriptor: a Java project built by the JDT builder and the m2e Maven builder. -->
<projectDescription>
  <name>test</name>
  <comment></comment>
  <projects>
  </projects>
  <buildSpec>
    <!-- Java compiler -->
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments>
      </arguments>
    </buildCommand>
    <!-- Maven (m2e) builder -->
    <buildCommand>
      <name>org.eclipse.m2e.core.maven2Builder</name>
      <arguments>
      </arguments>
    </buildCommand>
  </buildSpec>
  <natures>
    <nature>org.eclipse.jdt.core.javanature</nature>
    <nature>org.eclipse.m2e.core.maven2Nature</nature>
  </natures>
</projectDescription>

5
.settings/org.eclipse.core.resources.prefs

@ -0,0 +1,5 @@
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding/<project>=UTF-8

9
.settings/org.eclipse.jdt.core.prefs

@ -0,0 +1,9 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.methodParameters=generate
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
org.eclipse.jdt.core.compiler.release=disabled
org.eclipse.jdt.core.compiler.source=1.8

4
.settings/org.eclipse.m2e.core.prefs

@ -0,0 +1,4 @@
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1

1
README.md

@ -0,0 +1 @@
文档解析应用

262
pom.xml

@ -0,0 +1,262 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.2.4.RELEASE</version>
</parent>
<groupId>com.bfd</groupId>
<artifactId>document_parse</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>document_parse</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
  <!-- junit was declared twice (once as 4.11/test, once with no version or scope), which Maven
       reports as a non-unique dependency. This single declaration keeps what the build
       effectively resolved before: the parent-managed version in the default scope.
       NOTE(review): narrow to <scope>test</scope> once it is confirmed that no class under
       src/main imports org.junit. -->
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
  </dependency>
  <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
  </dependency>
  <!-- https://mvnrepository.com/artifact/de.codecentric/spring-boot-admin-starter-client -->
  <dependency>
    <groupId>de.codecentric</groupId>
    <artifactId>spring-boot-admin-starter-client</artifactId>
    <version>2.2.4</version>
  </dependency>
  <dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.8</version>
  </dependency>
  <!-- NOTE(review): declared without a scope, so it ships with the application; this is
       presumably meant to be test-scoped - confirm before changing. -->
  <dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-test</artifactId>
  </dependency>
  <!-- https://mvnrepository.com/artifact/org.springframework/spring-test -->
  <!-- Version intentionally omitted: it is managed by spring-boot-starter-parent so that it
       matches the Spring Framework version used by the rest of the application
       (the previous pin, 5.0.10.RELEASE, did not). -->
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-test</artifactId>
    <scope>test</scope>
  </dependency>
  <dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version> <!-- choose the version that fits your needs -->
  </dependency>
  <dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>2.0.17</version>
  </dependency>
  <!-- https://mvnrepository.com/artifact/com.mchange/c3p0 -->
  <dependency>
    <groupId>com.mchange</groupId>
    <artifactId>c3p0</artifactId>
    <version>0.9.5.5</version>
  </dependency>
  <!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
  <dependency>
    <groupId>com.squareup.okhttp3</groupId>
    <artifactId>okhttp</artifactId>
    <version>4.9.3</version>
  </dependency>
  <dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.3</version>
  </dependency>
  <dependency>
    <groupId>commons-lang</groupId>
    <artifactId>commons-lang</artifactId>
    <version>2.6</version>
  </dependency>
  <!-- https://mvnrepository.com/artifact/org.jetbrains.kotlin/kotlin-reflect -->
  <dependency>
    <groupId>org.jetbrains.kotlin</groupId>
    <artifactId>kotlin-reflect</artifactId>
    <version>1.6.21</version>
    <scope>runtime</scope>
  </dependency>
  <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
  <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.28</version>
  </dependency>
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.0.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.0.1</version>
  </dependency>
  <!-- Log4j 2 logging library.
       2.14.1 is affected by the Log4Shell family of vulnerabilities (CVE-2021-44228 and
       follow-ups); 2.17.1 is the fixed release that still runs on Java 8. -->
  <dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-api</artifactId>
    <version>2.17.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-core</artifactId>
    <version>2.17.1</version>
  </dependency>
  <dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
  </dependency>
  <!-- <dependency> -->
  <!-- <groupId>org.apache.kafka</groupId> -->
  <!-- <artifactId>kafka-clients</artifactId> -->
  <!-- <version>2.6.0</version> -->
  <!-- </dependency> -->
  <dependency>
    <groupId>org.springframework.kafka</groupId>
    <artifactId>spring-kafka</artifactId>
  </dependency>
  <dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.8.5</version>
  </dependency>
  <!-- https://mvnrepository.com/artifact/p6spy/p6spy -->
  <dependency>
    <groupId>p6spy</groupId>
    <artifactId>p6spy</artifactId>
    <version>3.9.0</version>
  </dependency>
  <dependency>
    <groupId>commons-collections</groupId>
    <artifactId>commons-collections</artifactId>
    <version>3.2.2</version>
  </dependency>
</dependencies>
<build>
  <!-- <pluginManagement> -->
  <!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
  <plugins>
    <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
    <plugin>
      <artifactId>maven-clean-plugin</artifactId>
      <version>3.1.0</version>
    </plugin>

    <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
    <plugin>
      <artifactId>maven-resources-plugin</artifactId>
      <version>3.0.2</version>
    </plugin>
    <plugin>
      <artifactId>maven-compiler-plugin</artifactId>
      <version>3.8.0</version>
    </plugin>
    <plugin>
      <artifactId>maven-surefire-plugin</artifactId>
      <version>2.22.1</version>
    </plugin>
    <plugin>
      <artifactId>maven-jar-plugin</artifactId>
      <version>3.0.2</version>
    </plugin>
    <plugin>
      <artifactId>maven-install-plugin</artifactId>
      <version>2.5.2</version>
    </plugin>
    <plugin>
      <artifactId>maven-deploy-plugin</artifactId>
      <version>2.8.2</version>
    </plugin>

    <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
    <plugin>
      <artifactId>maven-site-plugin</artifactId>
      <version>3.7.1</version>
    </plugin>
    <plugin>
      <artifactId>maven-project-info-reports-plugin</artifactId>
      <version>3.0.0</version>
    </plugin>

    <!-- spring-boot-maven-plugin repackages the build output as a Spring Boot application.
         Only this project's own artifact is included in the repackaged jar. -->
    <plugin>
      <groupId>org.springframework.boot</groupId>
      <artifactId>spring-boot-maven-plugin</artifactId>
      <configuration>
        <mainClass>com.bfd.parse.Application</mainClass>
        <layout>ZIP</layout>
        <includes>
          <include>
            <groupId>${project.groupId}</groupId>
            <artifactId>${project.artifactId}</artifactId>
          </include>
        </includes>
      </configuration>
      <executions>
        <execution>
          <goals>
            <goal>repackage</goal>
          </goals>
        </execution>
      </executions>
    </plugin>

    <!-- Copies the runtime-scope jar dependencies into target/libs during the package phase. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-dependency-plugin</artifactId>
      <version>3.1.1</version>
      <executions>
        <execution>
          <id>copy</id>
          <phase>package</phase>
          <goals>
            <goal>copy-dependencies</goal>
          </goals>
          <configuration>
            <type>jar</type>
            <includeTypes>jar</includeTypes>
            <includeScope>runtime</includeScope>
            <outputDirectory>${project.build.directory}/libs</outputDirectory>
          </configuration>
        </execution>
      </executions>
    </plugin>
  </plugins>
  <!-- </pluginManagement> -->
</build>
</project>

26
src/main/java/com/bfd/parse/Application.java

@ -0,0 +1,26 @@
package com.bfd.parse;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.kafka.annotation.EnableKafka;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Application entry point. Enables Spring Boot auto-configuration, scheduling
 * support and Kafka listener support.
 *
 * @author jian.mao
 * @date 2023-07-04
 */
@SpringBootApplication
@EnableScheduling
@EnableKafka
public class Application {

    /**
     * Boots the Spring context with this class as the primary configuration source.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(Application.class);
        application.run(args);
    }
}

35
src/main/java/com/bfd/parse/cache/ConfigCache.java

@ -0,0 +1,35 @@
package com.bfd.parse.cache;
import lombok.extern.slf4j.Slf4j;
import java.util.Map;
import java.util.concurrent.LinkedBlockingDeque;
/**
* @author jian.mao
* @date 2022年11月11日
* @description 静态变量类
*/
@Slf4j
public class ConfigCache {
/**启动条件**/
public static boolean isStart = true;
/*****任务队列*****/
public static LinkedBlockingDeque<Map<String, Object>> taskQueue = new LinkedBlockingDeque<Map<String,Object>>();
/**
* 队列录入任务
* @param queue
* @param task
*/
public static void putQueue(LinkedBlockingDeque<Map<String, Object>> queue,Map<String, Object> task){
//next app 写入队列准备调出
try {
queue.put(task);
} catch (InterruptedException e) {
log.error("队列写入data失败---");
}
}
}

39
src/main/java/com/bfd/parse/controller/FileExecController.java

@ -0,0 +1,39 @@
package com.bfd.parse.controller;
import javax.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import com.bfd.parse.service.FileExecService;
/**
 * HTTP endpoints under {@code /file}.
 */
@Controller
@RequestMapping("/file")
@Slf4j
public class FileExecController {

    @Resource
    private FileExecService fileExecService;

    /**
     * Hands the raw request body to {@link FileExecService#parse(String)} and
     * returns the service's answer unchanged.
     *
     * @param dataJson raw request body
     * @return the string produced by the service
     */
    @PostMapping("/parse")
    @ResponseBody
    public String parse(@RequestBody String dataJson) {
        return fileExecService.parse(dataJson);
    }

    /**
     * Fixed-response GET endpoint; both parameters are accepted but ignored.
     *
     * @param param unused
     * @param token unused
     * @return the constant string {@code "123"}
     */
    @RequestMapping(value = "/hello", method = RequestMethod.GET)
    @ResponseBody
    public String hello(String param, String token) {
        final String reply = "123";
        return reply;
    }
}

176
src/main/java/com/bfd/parse/entity/Constants.java

@ -0,0 +1,176 @@
package com.bfd.parse.entity;
/**
 * Shared string and numeric constants (mostly map keys and literal values).
 *
 * @author jian.mao
 * @date 2022-11-15
 */
public class Constants {

    // ------------------------------------------------------------------
    // Blueprint key names
    // ------------------------------------------------------------------
    public static final String SCHEDULING = "scheduling";
    public static final String TYPE = "type";
    public static final String INTERVAL = "interval";
    public static final String CREATED = "created";
    public static final String LAST_EDIT = "last_edit";
    public static final String BLUEPRINT_ID = "blueprint_id";
    public static final String BLUEPRINTID = "blueprintId";
    public static final String BLUEPRINT_NAME = "name";
    public static final String SCENARIO = "scenario";
    public static final String AUTOCOMMITTRIGGERLAST = "autoCommitTriggerLast";
    public static final String FRESHVARIABLES = "freshVariables";
    public static final String AUTOCOMMIT = "autoCommit";
    public static final String MAXERRORS = "maxErrors";
    public static final String DATALOSS = "dataloss";
    public static final String POSITION = "position";
    public static final String SCENES_ID = "scenes_id";
    public static final String SCENESID = "scenesId";
    public static final String MULTI_BRANCH = "multi_branch";
    public static final String SINGLE = "single";
    /** Number of retries already performed. */
    public static final String ERROR_TIME = "error_time";
    public static final String PREVIOUS_RESULT = "previous_result";
    /** Data id. */
    public static final String BUSINESSKEY = "businessKey";

    // ------------------------------------------------------------------
    // Metadata key names
    // ------------------------------------------------------------------
    public static final String LABEL_COL = "label_col";
    public static final String LABEL = "label";
    public static final String USER = "user";
    public static final String ADMIN = "admin";
    public static final String ADDRESS = "address";
    public static final String DATASOURCE = "datasource";
    public static final String INDEX = "index";

    // ------------------------------------------------------------------
    // App key names
    // ------------------------------------------------------------------
    public static final String APPS = "apps";
    public static final String TRANSFER_ID = "transfer_id";
    public static final String MODULE = "module";
    public static final String VERSION = "version";
    public static final String METADATA = "metadata";
    public static final String APP_NAME = "name";
    public static final String DESCRIBE = "describe";
    public static final String NEXT_APP_ID = "next_app_id";
    public static final String EDGE_ID = "edge_id";
    public static final String START_ID = "start_id";
    public static final String END_ID = "end_id";
    public static final String WAIT_CONDITION = "wait_condition";
    public static final String START_TAG = "start_tag";

    // ------------------------------------------------------------------
    // Module types
    // ------------------------------------------------------------------
    public static final String FILE = "file";
    public static final String OCR = "OCR";
    public static final String FILTER = "Filter";
    public static final String CHATGPT = "ChatGPT";
    public static final String MYSQL = "mysql";

    // ------------------------------------------------------------------
    // Miscellaneous
    // ------------------------------------------------------------------
    public static final String UNDERLINE = "_";
    public static final String RESULT_TOPIC = null;
    public static final String EMPTY = "";
    public static final String HTTP = "http";
    public static final String REQUEST_ERROR_MESSAGE = "Download failed error is";
    public static final String REQUEST_RESULT = "result";
    public static final String REQUEST_RESULT_RESULTS = "results";
    public static final String MAP_TYPE = "Map";
    public static final String LIST_TYPE = "List";
    public static final String STRING_TYPE = "String";
    public static final String DOCUMENT_TYPE = "doc";
    public static final String FILTER_ZH = "过滤器";
    public static final String JSON_SELE_SYMBOL = "$.";
    public static final String LEFT_BRACKETS = "[";
    public static final String RIGTH_BRACKETS = "]";
    public static final String TASKTYPE = "taskType";
    public static final Integer USER_TYPE = 1;
    public static final Integer KEYWORD_TYPE = 0;
    public static final Integer DETAIL_TYPE = 2;
    public static final String CID = "cid";
    public static final String SITETYPE = "siteType";
    public static final Integer DEFULT_SUBJECTID = 304864;
    public static final Integer DEFULT_CRAWLCYCLICITYTIME = 1440;
    public static final String CRAWLENDTIME = "crawlEndTime";
    public static final String CRAWLSTARTTIME = "crawlStartTime";
    public static final String CRAWLPAGETYPES = "crawlPageTypes";
    public static final String APPID = "113ic";
    public static final String APP_ID = "appId";
    public static final String ID = "id";
    public static final Integer DEFULT_CRAWLPERIODHOUR = 24;
    public static final String CREATEUSERID = "662015832180933762";
    public static final String CRAWL_ADD_URL = "https://caiji.percent.cn/api/crawl/remote/task/save";
    public static final String CRAWLKEYWORD = "crawlKeyword";
    public static final String ATTACHTAG = "attachTag";
    public static final String ATTACHTAG_VALUE = "analyze";
    public static final String KEYWORD = "keyword";
    public static final String SITEID = "siteId";
    public static final String RESULTS = "results";
    public static final String RESULT = "result";
    public static final String CRAWLDATAFLAG = "crawlDataFlag";
    public static final String CRAWLDATAFLAG_PREFIX = "\"crawlDataFlag\":\"keyword:";
    public static final String TID = "tid";
    public static final Long TIME_OUT = 1800000L;
    public static final String ATTR = "attr";
    public static final String HASVIDEO = "hasVideo";
    public static final String CRAWL_END_MARK = "crawl_end_mark";
    public static final String CRAWL_END_MESSAGE = "crawl_end_message";
    public static final String CRAWL_END_MESSAGE_VALUE = "数据采集完成";
    public static final String SUBJECTID = "subjectId";
    public static final String TASKID = "taskId";
    public static final int SUCCESS_CODE = 200;
    public static final String WEB_URL_SUFFIX = "/api/aogeo/api/cda/caiji/status";
    public static final String STATUS = "status";

    // ------------------------------------------------------------------
    // Redis
    // ------------------------------------------------------------------
    public static final String LOCK_KEY = "myLock";
    public static final long LOCK_EXPIRE_TIME = 300000;

    // ------------------------------------------------------------------
    // Application parameters
    // ------------------------------------------------------------------
    public static final String CODE = "code";
    public static final String MESSAGE = "message";
    public static final String INPUT = "input";
    public static final String OUTPUT = "output";
    public static final String FORM = "form";
    public static final String FIELD = "field";
    public static final String VALUE = "value";
    public static final String DATA = "data";
    public static final String COLON_EN = ":";
    public static final String DATABASE = "database";
    public static final String TABLE = "table";
    public static final String USERNAME = "username";
    public static final String PASSWORD = "password";
    public static final String PORT = "port";
    public static final String HOSTNAME = "hostname";
    public static final String DATATYPE = "dataType";
    public static final String RULES = "rules";
    public static final String GENID = "genId";
    public static final String KEY = "key";
    public static final String DATAID = "dataId";
    public static final String APP_CODE = "app_code";

    // ------------------------------------------------------------------
    // File-related parameter constants
    // ------------------------------------------------------------------
    public static final String READCONTENT = "readContent";
    public static final String CONTENT = "content";
    public static final String JPG = "jpg";
    public static final String PNG = "png";
    public static final String JPEG = "jpeg";
    public static final String XLS = "xls";
    public static final String XLSX = "xlsx";
    public static final String PDF = "pdf";
    public static final String DOC = "doc";
    public static final String DOCX = "docx";
    public static final String PPT = "ppt";
    public static final String PPTX = "pptx";
    public static final String MP4 = "mp4";
    public static final String EXT = "ext";
    public static final String FILES = "files";
    public static final String PATH = "path";
    public static final String TITLE = "title";
    public static final String DATAPROCESSID = "dataProcessId";
    public static final String ISLAST = "isLast";
    public static final String TRACE = "trace";
}

111
src/main/java/com/bfd/parse/handler/MainHandler.java

@ -0,0 +1,111 @@
package com.bfd.parse.handler;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.LinkedBlockingDeque;
import javax.annotation.Resource;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;
import com.alibaba.fastjson.JSONObject;
import com.bfd.parse.cache.ConfigCache;
import com.bfd.parse.process.FileTaskProcess;
import com.bfd.parse.utils.FileUtil;
/**
* 启动处理入口
* @author jian.mao
* @date 2023年11月3日
* @description
*/
@Component
@Order(value = 1)
@Slf4j
public class MainHandler implements ApplicationRunner {
@Value("${task.task-queue-path}")
private String taskPath;
@Resource
private FileTaskProcess fileTaskProcess;
@Override
public void run(ApplicationArguments args) throws Exception {
new Thread(fileTaskProcess).start();;
log.info("开启es执行线程-----");
//停止处理
waitDown();
//启动加载缓存任务
readTask(taskPath,ConfigCache.taskQueue);
}
@SuppressWarnings("unchecked")
public static void readTask(String path,LinkedBlockingDeque<Map<String, Object>> queue){
File file = new File(path);
if(file.exists()){
List<String> tasks = null;
try {
tasks = FileUtils.readLines(file,"UTF-8");
} catch (IOException e) {
e.printStackTrace();
}
for (String taskStr : tasks) {
Map<String, Object> task = JSONObject.parseObject(taskStr);
try {
queue.put(task);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
file.delete();
}
}
/**
* 结束触发钩子
*/
public void waitDown() {
Runtime.getRuntime().addShutdownHook(new Thread() {
@Override
public void run() {
// 停止线程
ConfigCache.isStart = false;
log.info("stop-------");
writeTsskToFile();
}
});
}
/**
* 任务持久化到硬盘
*/
public void writeTsskToFile(){
while(true){
if(ConfigCache.taskQueue.size() > 0 ){
try {
Map<String, Object> task = ConfigCache.taskQueue.take();
FileUtil.writeFile(taskPath, JSONObject.toJSONString(task));
} catch (InterruptedException e) {
e.printStackTrace();
}
}else{
log.info("taskQueue write is file end");
break;
}
}
}
}

185
src/main/java/com/bfd/parse/process/FileTaskProcess.java

@ -0,0 +1,185 @@
package com.bfd.parse.process;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.alibaba.fastjson.JSONObject;
import com.bfd.parse.cache.ConfigCache;
import com.bfd.parse.entity.Constants;
import com.bfd.parse.utils.DataUtil;
import com.bfd.parse.utils.DownLoadUtil;
import com.bfd.parse.utils.SpringBootKafka;
/**
* @author jian.mao
* @date 2024年2月4日
* @description
*/
@Component
@Slf4j
public class FileTaskProcess implements Runnable{
@Autowired
private SpringBootKafka springBootKafka;
@Value("${customize-kafka.producer.topic}")
private String topic;
@Value("${gofast.profix.host}")
private String host;
@Value("${file.download.dir}")
private String saveDir;
@Value("${file.ocrApi}")
private String ocrApi;
@Value("${file.uploadUrl}")
private String uploadUrl;
@Override
public void run() {
while (ConfigCache.isStart){
Map<String, Object> task = null;
String saveFilePath = null;
try {
task = ConfigCache.taskQueue.take();
log.info("任务:{}", JSONObject.toJSONString(task));
log.info("任务队列长度:{}", Integer.valueOf(ConfigCache.taskQueue.size()));
Map<String, Object> input = (Map<String, Object>) task.get(Constants.INPUT);
String gofastUrl = DataUtil.getValue((String) input.get(Constants.PATH), (Map<String, Object>) task.get(Constants.DATA)).toString();
saveFilePath = this.saveDir + gofastUrl.replaceAll(".*(?=\\.)", UUID.randomUUID().toString());
Object content = readFileToSend(gofastUrl, saveFilePath);
Map<String, Object> result = new HashMap<>(16);
Map<String, Object> results = new HashMap<>(16);
results.put(Constants.ID, UUID.randomUUID());
results.put(Constants.CONTENT, content);
results.put(Constants.ISLAST, 1);
result.put(Constants.RESULTS, JSONObject.toJSONString(results));
result.put(Constants.MESSAGE, "success");
result.put(Constants.STATUS, 1);
task.put(Constants.RESULT, result);
this.springBootKafka.send(this.topic, JSONObject.toJSONString(task));
} catch (Exception e) {
log.error("结果组装异常,", e);
Map<String, Object> result2 = new HashMap<>(16);
Map<String, Object> results = new HashMap<>(16);
results.put(Constants.ISLAST, 1);
results.put(Constants.CONTENT, e.getMessage());
result2.put(Constants.RESULTS, JSONObject.toJSONString(results));
result2.put(Constants.MESSAGE, "异常");
result2.put(Constants.STATUS, 2);
task.put(Constants.RESULT, result2);
this.springBootKafka.send(this.topic, JSONObject.toJSONString(task));
log.info("数据流转至下游-------");
}finally{
delFile(saveFilePath);
}
}
}
/**
* 根据文件类型读取内容
* @param fileType 文件类型
* @param path 文件路径
* @return
* @throws IOException
*/
private String readFileToSend(String path, String saveFilePath) throws IOException {
StringBuffer sb = new StringBuffer();
DownLoadUtil.downloadFile(path, saveFilePath);
if (saveFilePath.endsWith(Constants.DOC) || saveFilePath.endsWith(Constants.DOCX)) {
sb.append(readWordFile(saveFilePath));
}else if(saveFilePath.endsWith(Constants.PDF)){
//pdf 纯文字版本
sb.append(readPdfFile(saveFilePath));
} else {
for (String line : FileUtils.readLines(new File(saveFilePath))) {
sb.append(line);
}
}
return sb.toString();
}
/**
* 解析word文档
* @param filePath 文件路径
* @return
* @throws IOException
*/
private String readWordFile(String filePath) throws IOException {
InputStream inputStream = new FileInputStream(filePath);
String fileTypeDoc = "doc";
String fileTypeDocx = "docx";
if (filePath.endsWith(fileTypeDoc)) {
try (HWPFDocument document = new HWPFDocument(inputStream)) {
WordExtractor extractor = new WordExtractor(document);
return extractor.getText();
}
} else if (filePath.endsWith(fileTypeDocx)) {
try (XWPFDocument document = new XWPFDocument(inputStream)) {
XWPFWordExtractor extractor = new XWPFWordExtractor(document);
return extractor.getText();
}
} else {
log.error("Unsupported file format");
throw new IllegalArgumentException("Unsupported file format");
}
}
/**
* 读取纯文本pdf
* @param filePath 文件路径
* @return
* @throws IOException
*/
private String readPdfFile(String filePath)throws IOException {
// 加载PDF文件
File file = new File(filePath);
PDDocument document = PDDocument.load(file);
// 提取文本内容
PDFTextStripper pdfStripper = new PDFTextStripper();
String text = pdfStripper.getText(document);
// 关闭文档
document.close();
return text;
}
/**
* @param filePath
*/
private void delFile(String filePath) {
try {
// 创建 File 对象
File file = new File(filePath);
// 检查文件是否存在
if (file.exists()) {
// 尝试删除文件
if (file.delete()) {
log.info("文件删除成功: " + filePath);
} else {
log.error("无法删除文件: " + filePath);
}
} else {
log.warn("文件不存在: " + filePath);
}
} catch (Exception e) {
// TODO: handle exception
log.error("删除文件异常:{}",filePath);
}
}
}

15
src/main/java/com/bfd/parse/service/FileExecService.java

@ -0,0 +1,15 @@
package com.bfd.parse.service;
/**
 * Service that accepts file-parse requests.
 *
 * @author jian.mao
 * @date 2024-02-04
 */
public interface FileExecService {
/**
 * Accepts a parse request given as a JSON string.
 *
 * @param dataJson request payload as a JSON string
 * @return a response string; the implementation in this commit returns a JSON
 *         object holding a {@code code} and a {@code message}
 */
String parse(String dataJson);
}

55
src/main/java/com/bfd/parse/service/impl/FileExecServiceImpl.java

@ -0,0 +1,55 @@
package com.bfd.parse.service.impl;
import java.util.HashMap;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import com.alibaba.fastjson.JSONObject;
import com.bfd.parse.cache.ConfigCache;
import com.bfd.parse.entity.Constants;
import com.bfd.parse.service.FileExecService;
/**
* @author jian.mao
* @date 2024年2月4日
* @description
*/
@Service
@Slf4j
public class FileExecServiceImpl implements FileExecService {
@Override
public String parse(String dataJson) {
// TODO Auto-generated method stub
Map<String,Object> response = new HashMap<>(16);
int code = 200;
String message = "success";
Map<String,Object> task = null;
try {
task = JSONObject.parseObject(dataJson);
} catch (Exception e) {
log.error("参数结构不合法,",e);
code = 100010;
message = "参数不合法";
}
//写入队列
try {
if(task.containsKey(Constants.TRACE) && (boolean)task.get(Constants.TRACE)){
ConfigCache.taskQueue.putFirst(task);
}else{
ConfigCache.taskQueue.put(task);
}
} catch (InterruptedException e) {
log.error("任务写入等待队列异常,",e);
code = 100011;
message = "任务写入等待队列失败";
}
response.put(Constants.CODE,code);
response.put(Constants.MESSAGE,message);
return JSONObject.toJSONString(response);
}
}

63
src/main/java/com/bfd/parse/utils/DataUtil.java

@ -0,0 +1,63 @@
package com.bfd.parse.utils;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import com.bfd.parse.entity.Constants;
/**
* @author:jinming
* @className:DataUtil
* @version:1.0
* @description: 获取dataValue的值
* @Date:2023/11/1 9:54
*/
@Slf4j
public class DataUtil {
/**
*
* @param key 传入的key
* @param dataMap 数据map
* @return 根据传入的参数进行判断解析返回正确的dataValue
*/
public static Object getValue(String key, Map dataMap) {
try {
//公式为空直接就返回
if(key.equals(Constants.EMPTY)){
return Constants.EMPTY;
}
Object dataValue;
String isJson = "#json#";
if (key.contains(isJson)) {
//进行第一次拆分获取#json#前面的部分
String[] keySplit = key.split(isJson);
String firstDataKey = keySplit[0];
String[] firstDataKeySplit = firstDataKey.split(":");
//取出前半部分对应的JSON数据并转换为JSONObject
String dataJson = (String) dataMap.get(firstDataKeySplit[0]);
JSONObject dataJsonObject = JSON.parseObject(dataJson);
//根据key的后半部分取出对应JSONObject中的值
String firstDataKeyJson = (String) JSONPath.eval(dataJsonObject, firstDataKeySplit[1]);
String secDataKey = keySplit[1];
JSONObject firstDataJsonObject = JSON.parseObject(firstDataKeyJson);
dataValue = JSONPath.eval(firstDataJsonObject, secDataKey);
return dataValue;
}
String[] keySplit = key.split(":");
String jsonPath = keySplit[1];
String dataJson = (String) dataMap.get(keySplit[0]);
JSONObject dataJsonObject = JSON.parseObject(dataJson);
dataValue = JSONPath.eval(dataJsonObject, jsonPath);
return dataValue;
} catch (Exception e) {
// TODO: handle exception
log.error("jsonpath公式取值异常,",e);
return null;
}
}
}

177
src/main/java/com/bfd/parse/utils/DateUtil.java

@ -0,0 +1,177 @@
package com.bfd.parse.utils;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import lombok.extern.slf4j.Slf4j;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
/**
* 日期工具类
*
* @author jian.mao
* @date 2022年11月15日
* @description
*/
@Slf4j
public class DateUtil {
/**
* @return
*/
public static String getTimeStrForNow() {
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHH");
return sdf.format(new Date());
}
public static String getTimeStrForDay(long time) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
return sdf.format(new Date(time * 1000));
}
public static String getTimeStrForDay() {
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
return sdf.format(new Date());
}
public static String getDateTime() {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String time = sdf.format(new Date());
return time;
}
public static String getDateTime(Long timestap) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String time = sdf.format(new Date(timestap));
return time;
}
public static String getDate(Long timestap) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
String time = sdf.format(new Date(timestap));
return time;
}
public static String getDateTimeForMonth() {
SimpleDateFormat sdf = new SimpleDateFormat("yyyyMM");
String time = sdf.format(new Date());
return time;
}
/**
* 休眠
*
* @param millis 毫秒
*/
public static void sleep(long millis) {
try {
Thread.sleep(millis);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
/**
* 1. @Description:时间戳转时间
* 2. @Author: ying.zhao
* 3. @Date: 2023/3/28
*/
public static String timestampToDate(String time) {
int thirteen = 13;
int ten = 10;
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
// if (time.length() == thirteen) {
if (time.length() > ten) {
return sdf.format(new Date(Long.parseLong(time)));
} else {
return sdf.format(new Date(Integer.parseInt(time) * 1000L));
}
}
public static String parseCreated(String jsonTime){
String formattedDateTime = getDateTime();
try {
// 使用fastjson解析JSON数据
JSONObject jsonObject = JSON.parseObject(jsonTime);
// 获取日期和时间的值
JSONObject dateObject = jsonObject.getJSONObject("date");
int day = dateObject.getIntValue("day");
int month = dateObject.getIntValue("month");
int year = dateObject.getIntValue("year");
JSONObject timeObject = jsonObject.getJSONObject("time");
int hour = timeObject.getIntValue("hour");
int minute = timeObject.getIntValue("minute");
int second = timeObject.getIntValue("second");
// 创建LocalDateTime对象
LocalDateTime dateTime = LocalDateTime.of(year, month, day, hour, minute, second);
// 定义日期时间格式化器
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
// 格式化日期时间
formattedDateTime = dateTime.format(formatter);
} catch (Exception e) {
log.info("日期转换失败:{}",e);
}
return formattedDateTime;
}
/**
* 字符串转换日期
* @param format
* @param date
* @return
*/
public static Date strToDate(String format,String date){
SimpleDateFormat sdf = new SimpleDateFormat(format);
if (date == null || date.equals("")){
return new Date();
}else{
Date ru = null;
try {
ru = sdf.parse(date);
} catch (ParseException e) {
e.printStackTrace();
}
return ru;
}
}
/**
* 日期格式话
* @param format 日期格式
* @param dater 要转换的日期,默认当前时间
* @return
*/
public static String FormatDate(String format,Date date){
String fromatDate = null;
SimpleDateFormat sdf = new SimpleDateFormat(format);
if (date == null){
fromatDate = sdf.format(new Date());
}else{
fromatDate = sdf.format(date);
}
return fromatDate;
}
public static void main(String[] args) {
String time = timestampToDate("955814400000");
System.out.println(time);
}
}

1017
src/main/java/com/bfd/parse/utils/DownLoadUtil.java
File diff suppressed because it is too large
View File

27
src/main/java/com/bfd/parse/utils/EncryptionUtil.java

@ -0,0 +1,27 @@
package com.bfd.parse.utils;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/**
* @author jian.mao
* @date 2023年3月10日
* @description
*/
public class EncryptionUtil {
public static String md5(String text) {
try {
MessageDigest md = MessageDigest.getInstance("MD5");
md.update(text.getBytes());
byte[] bytes = md.digest();
StringBuilder sb = new StringBuilder();
for (byte b : bytes) {
sb.append(String.format("%02x", b & 0xff));
}
return sb.toString();
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
return null;
}
}
}

184
src/main/java/com/bfd/parse/utils/ExcelUtils.java

@ -0,0 +1,184 @@
package com.bfd.parse.utils;
import okhttp3.*;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* @author jian.mao
* @date 2023年4月7日
* @description excel解析工具类
*/
public class ExcelUtils {

    /** Cell text longer than this is truncated before it is written. */
    private static final int MAX_CELL_TEXT_LENGTH = 30000;
    /** Length kept when a cell text exceeds {@link #MAX_CELL_TEXT_LENGTH}. */
    private static final int TRUNCATED_CELL_TEXT_LENGTH = 25000;

    /**
     * Reads every sheet of an Excel file.
     * <p>
     * The first row of each sheet is taken as the header. Every following row becomes a map
     * from header title to the formatted cell text.
     *
     * @param filePath path of the Excel file
     * @return a map from sheet name to a {@code List<Map<String, String>>} of data rows;
     *         sheets read before an {@link IOException} are kept, later ones are missing
     */
    public static Map<String, Object> parse(String filePath) {
        Map<String, Object> excelMap = new HashMap<String, Object>(16);
        // try-with-resources closes the stream and the workbook on failure as well.
        try (FileInputStream file = new FileInputStream(new File(filePath));
             Workbook workbook = WorkbookFactory.create(file)) {
            DataFormatter dataFormatter = new DataFormatter();
            int numberOfSheets = workbook.getNumberOfSheets();
            for (int i = 0; i < numberOfSheets; i++) {
                Sheet sheet = workbook.getSheetAt(i);
                excelMap.put(sheet.getSheetName(), readSheet(sheet, dataFormatter));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return excelMap;
    }

    /** Converts one sheet into a list of title-to-text maps, one per data row. */
    private static List<Map<String, String>> readSheet(Sheet sheet, DataFormatter dataFormatter) {
        List<Map<String, String>> data = new ArrayList<Map<String, String>>();
        Map<Integer, String> titleHead = new HashMap<Integer, String>(16);
        int lastColumn = -1;
        boolean headerRead = false;
        for (Row row : sheet) {
            if (!headerRead) {
                // Key the titles by the real column index: iterating a row skips cells
                // that do not exist, so a running counter would shift every title
                // after a gap onto the wrong data column.
                for (Cell cell : row) {
                    int column = cell.getColumnIndex();
                    titleHead.put(column, dataFormatter.formatCellValue(cell));
                    lastColumn = Math.max(lastColumn, column);
                }
                headerRead = true;
                continue;
            }
            Map<String, String> rowMap = new HashMap<String, String>(16);
            for (int column = 0; column <= lastColumn; column++) {
                String title = titleHead.get(column);
                if (title == null) {
                    // No header cell for this column.
                    continue;
                }
                rowMap.put(title, dataFormatter.formatCellValue(row.getCell(column)));
            }
            data.add(rowMap);
        }
        return data;
    }

    /**
     * Writes rows to a new .xlsx file.
     * <p>
     * The header row is the union of all row keys in first-seen order, and every row is
     * written in that column order. Missing or {@code null} values produce blank cells.
     * Text longer than 30000 characters is cut to its first 25000 characters.
     * A {@code null} or empty {@code data} list produces a file with one empty sheet.
     *
     * @param data          rows to write; each map is one row, key = column title
     * @param excelFilePath target path including file name and extension
     * @param sheetName     name of the sheet to create
     * @throws IOException if the file cannot be written
     */
    public static void write(List<Map<String, String>> data, String excelFilePath, String sheetName) throws IOException {
        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet(sheetName);
            if (data != null && !data.isEmpty()) {
                fillSheet(sheet, data);
            }
            // Open the target only once the workbook is complete, so a failure while
            // building it does not leave a truncated file behind.
            try (FileOutputStream outputStream = new FileOutputStream(excelFilePath)) {
                workbook.write(outputStream);
            }
        }
    }

    /** Writes the header row followed by one row per map into the sheet. */
    private static void fillSheet(Sheet sheet, List<Map<String, String>> data) {
        java.util.Set<String> titles = new java.util.LinkedHashSet<String>();
        for (Map<String, String> rowMap : data) {
            titles.addAll(rowMap.keySet());
        }
        int rowNum = 0;
        Row headerRow = sheet.createRow(rowNum++);
        int colNum = 0;
        for (String title : titles) {
            headerRow.createCell(colNum++).setCellValue(title);
        }
        for (Map<String, String> rowMap : data) {
            Row row = sheet.createRow(rowNum++);
            colNum = 0;
            for (String title : titles) {
                Cell cell = row.createCell(colNum++);
                String value = rowMap.get(title);
                if (value == null) {
                    continue;
                }
                if (value.length() > MAX_CELL_TEXT_LENGTH) {
                    cell.setCellValue(value.substring(0, TRUNCATED_CELL_TEXT_LENGTH));
                } else {
                    cell.setCellValue(value);
                }
            }
        }
    }

    /**
     * Copies a file to the same path on drive {@code F:}.
     * <p>
     * The first two characters of the source path are replaced by {@code "F:"}, so the
     * path is expected to start with a drive prefix such as {@code "C:"}. I/O failures are
     * printed and otherwise ignored.
     *
     * @param sourceFilePath path of the file to copy
     */
    public static void copyFile(String sourceFilePath) {
        String targetDrive = "F:";
        try {
            Path sourcePath = Paths.get(sourceFilePath);
            Path targetPath = Paths.get(targetDrive + sourcePath.toString().substring(2));
            // Create the target directory when it is missing.
            Path targetDir = targetPath.getParent();
            if (targetDir != null && !targetDir.toFile().exists()) {
                targetDir.toFile().mkdirs();
            }
            Files.copy(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
            System.out.println("Copied file: " + sourceFilePath + " -> " + targetPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

36
src/main/java/com/bfd/parse/utils/FileUtil.java

@ -0,0 +1,36 @@
package com.bfd.parse.utils;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* 文件工具类
* @author jian.mao
* @date 2023年7月14日
* @description
*/
public class FileUtil {
/**
* 数据写入文件
* @param Path 文件路径
* @param result 数据
* @throws IOException
*/
public static void writeFile(String path,String result){
try {
FileWriter fw = new FileWriter(path,true);
fw.write(result+"\n");
fw.flush();
fw.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}

32
src/main/java/com/bfd/parse/utils/JsonUtil.java

@ -0,0 +1,32 @@
package com.bfd.parse.utils;
import com.alibaba.fastjson.JSONObject;
import com.bfd.parse.entity.Constants;
/**
* json工具
* @author jian.mao
* @date 2023年7月10日
* @description
*/
public class JsonUtil {

    /**
     * Classifies a string as a JSON object, a JSON array or plain text.
     * <p>
     * The parsers can return {@code null} instead of throwing (for example for
     * {@code null} or empty input); such a result is not treated as a successful parse.
     *
     * @param jsonString the text to inspect
     * @return {@code Constants.MAP_TYPE} when the text parses to a JSON object,
     *         {@code Constants.LIST_TYPE} when it parses to a JSON array,
     *         otherwise {@code Constants.STRING_TYPE}
     */
    public static String checkJsonType(String jsonString) {
        try {
            if (JSONObject.parseObject(jsonString) != null) {
                return Constants.MAP_TYPE;
            }
        } catch (Exception e) {
            // Not a JSON object; try the array form next.
        }
        try {
            if (JSONObject.parseArray(jsonString) != null) {
                return Constants.LIST_TYPE;
            }
        } catch (Exception e) {
            // Not a JSON array either.
        }
        return Constants.STRING_TYPE;
    }
}

65
src/main/java/com/bfd/parse/utils/OcrUtil.java

@ -0,0 +1,65 @@
package com.bfd.parse.utils;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import com.alibaba.fastjson.JSON;
import com.bfd.parse.entity.Constants;
/**
* @author:jinming
* @className:ocrUtil
* @version:1.0
* @description:
* @Date:2023/8/1 16:38
*/
public class OcrUtil {

    /** Lazily created base client; per-call timeouts are derived from it. */
    private static OkHttpClient okHttpClient;

    private static OkHttpClient getOkHttpClient() {
        if (okHttpClient == null) {
            okHttpClient = new OkHttpClient();
        }
        return okHttpClient;
    }

    /**
     * Sends a file URL to the OCR service and returns the recognised text.
     * <p>
     * The request is a JSON POST of the form {@code {"id":"","url":"<url>"}}. It is
     * attempted up to three times: an attempt counts as failed when it throws or when the
     * response {@code code} is not 200. The first response with code 200 ends the loop.
     *
     * @param url    address of the file to recognise
     * @param ocrApi address of the OCR endpoint
     * @return the {@code text} field of the successful response, or an empty string when
     *         no attempt succeeded or the response carried no text
     */
    public static String doOcr(String url, String ocrApi) {
        String text = "";
        int reTryTimes = 3;
        int okCode = 200;
        // Client and request do not change between attempts, so build them once.
        OkHttpClient client = getOkHttpClient().newBuilder()
                .writeTimeout(600, TimeUnit.SECONDS)
                .connectTimeout(600, TimeUnit.SECONDS)
                .readTimeout(600, TimeUnit.SECONDS)
                .build();
        MediaType mediaType = MediaType.parse("application/json");
        // Serialise the URL through the JSON library so quotes and backslashes are escaped.
        RequestBody body = RequestBody.create(mediaType, "{\"id\":\"\",\"url\":" + JSON.toJSONString(url) + "}");
        Request request = new Request.Builder()
                .url(ocrApi)
                .method("POST", body)
                .addHeader("Content-Type", "application/json")
                .build();
        for (int i = 0; i < reTryTimes; i++) {
            // Closing the response releases its connection back to the pool.
            try (Response response = client.newCall(request).execute()) {
                String html = response.body().string();
                Map dataMap = (Map) JSON.parse(html);
                int code = (int) dataMap.get("code");
                if (code == okCode) {
                    String recognized = (String) dataMap.get("text");
                    text = recognized == null ? "" : recognized;
                    // Success: stop retrying.
                    break;
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        return text;
    }
}

33
src/main/java/com/bfd/parse/utils/OtherUtils.java

@ -0,0 +1,33 @@
package com.bfd.parse.utils;
import java.security.MessageDigest;
/**
* 其他工具类
* @author jian.mao
* @date 2023年9月19日
* @description
*/
public class OtherUtils {

    /**
     * Hashes a string with MD5 over its UTF-8 bytes.
     *
     * @param string the text to hash
     * @return 32 lowercase hexadecimal characters; if hashing throws for any reason the
     *         fallback {@code "nceaform" + System.currentTimeMillis()} is returned instead
     */
    public static String getMd5(String string) {
        try {
            MessageDigest digester = MessageDigest.getInstance("MD5");
            byte[] digest = digester.digest(string.getBytes("UTF-8"));
            StringBuilder hex = new StringBuilder(40);
            for (byte raw : digest) {
                int unsigned = raw & 0xff;
                // Values below 0x10 print as a single hex digit, so pad them.
                if (unsigned < 0x10) {
                    hex.append("0");
                }
                hex.append(Integer.toHexString(unsigned));
            }
            return hex.toString();
        } catch (Exception e) {
            return "nceaform" + System.currentTimeMillis();
        }
    }
}

93
src/main/java/com/bfd/parse/utils/PptUtil.java

@ -0,0 +1,93 @@
package com.bfd.parse.utils;
import org.apache.poi.xslf.usermodel.*;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* @author:jinming
* @className:PptUtil
* @version:1.0
* @description:
* @Date:2024/3/25 16:14
*/
public class PptUtil {

    /**
     * Extracts the text of a .pptx presentation.
     * <p>
     * For every slide, in order, the text of each text shape and of each table cell is
     * appended to the result without any separator. Picture shapes are skipped. If an
     * exception occurs it is printed and the text collected so far is returned.
     *
     * @param filePath path of the presentation file
     * @return the concatenated text, possibly empty
     */
    public static String parse(String filePath) {
        StringBuilder dataStringsb = new StringBuilder();
        // try-with-resources closes both the file stream and the slide show,
        // including when reading fails part-way.
        try (FileInputStream input = new FileInputStream(filePath);
             XMLSlideShow ppt = new XMLSlideShow(input)) {
            List<XSLFSlide> slides = ppt.getSlides();
            for (int i = 0; i < slides.size(); i++) {
                System.out.println("第" + (i + 1) + "页");
                for (XSLFShape shape : slides.get(i).getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        // Text box: take its text as-is.
                        dataStringsb.append(((XSLFTextShape) shape).getText());
                    }
                    if (shape instanceof XSLFTable) {
                        // Table: walk rows, then cells.
                        for (XSLFTableRow tr : ((XSLFTable) shape).getRows()) {
                            for (XSLFTableCell tc : tr.getCells()) {
                                dataStringsb.append(tc.getText());
                            }
                        }
                    }
                    // Pictures contribute no text. Their bytes are deliberately not read:
                    // the result was unused and a picture without data would have
                    // aborted the whole extraction.
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return dataStringsb.toString();
    }
}

18
src/main/java/com/bfd/parse/utils/QueueUtil.java

@ -0,0 +1,18 @@
package com.bfd.parse.utils;
import java.util.Map;
import java.util.concurrent.LinkedBlockingDeque;
/**
* @author:jinming
* @className:QueueUtil
* @version:1.0
* @description:
* @Date:2023/7/13 15:00
*/
public class QueueUtil {

    /** Unbounded double-ended queue of task maps. */
    public static LinkedBlockingDeque<Map<String, Object>> taskQueue = new LinkedBlockingDeque<>();

    /** Unbounded double-ended queue of string messages. */
    public static LinkedBlockingDeque<String> sendQueue = new LinkedBlockingDeque<>();
}

46
src/main/java/com/bfd/parse/utils/SpringBootKafka.java

@ -0,0 +1,46 @@
package com.bfd.parse.utils;
import com.alibaba.fastjson.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.support.SendResult;
import org.springframework.stereotype.Component;
import org.springframework.util.concurrent.ListenableFuture;
import org.springframework.util.concurrent.ListenableFutureCallback;
/**
* @PROJECT_NAME: companybusinesscrawl
* @DESCRIPTION:SpringBootKafka 工具类
* @AUTHOR: ying.zhao
* @DATE: 2023/4/6 11:09
*/
@Slf4j
@Component
public class SpringBootKafka {
@Autowired
private KafkaTemplate<String, Object> kafkaTemplate;
/**
* 自定义topicKafkaTemplate
*/
/**
* public static final String TOPIC = "companyBussTest";
**/
public void send(String topic, String message) {
//发送消息
ListenableFuture<SendResult<String, Object>> future = kafkaTemplate.send(topic, message);
future.addCallback(new ListenableFutureCallback<SendResult<String, Object>>() {
@Override
public void onFailure(Throwable throwable) {
//发送失败的处理
log.info(topic + " - 生产者 发送消息失败:" + throwable.getMessage());
}
@Override
public void onSuccess(SendResult<String, Object> stringObjectSendResult) {
//成功的处理
log.info("{} - 生产者 发送消息成功:",topic);
}
});
}
}

23
src/main/java/com/bfd/parse/utils/ThrowMessageUtil.java

@ -0,0 +1,23 @@
package com.bfd.parse.utils;
import java.io.PrintWriter;
import java.io.StringWriter;
/**
* @author jian.mao
* @date 2023年3月22日
* @description
*/
public class ThrowMessageUtil {

    /**
     * Renders a throwable's stack trace as text.
     *
     * @param t the throwable to render
     * @return exactly what {@link Throwable#printStackTrace(PrintWriter)} writes for {@code t}
     */
    public static String getErrmessage(Throwable t) {
        StringWriter sink = new StringWriter();
        PrintWriter printer = new PrintWriter(sink, true);
        t.printStackTrace(printer);
        return sink.toString();
    }
}

94
src/main/resources/application.yml

@ -0,0 +1,94 @@
logging:
level:
root: info
path: ../logs
server:
port: 8017
servlet:
context-path: /document_parse
tomcat:
uri-encoding: utf-8
max-connections: 20000
max-http-form-post-size: 1
max-threads: 1000
spring:
application:
name: 文档解析
datasource:
url: jdbc:mysql://172.18.1.147:3306/youzhi_cda_db?serverTimezone=UTC&useUnicode=true&characterEncoding=utf-8&useSSL=true
username: root
password: baifendian123
driver-class-name: com.mysql.cj.jdbc.Driver
kafka:
bootstrap-servers: 172.18.1.146:9092,172.18.1.147:9092,172.18.1.148:9092
producer:
retries: 0
#当有多个消息需要被发送到同一个分区时,生产者会把它们放在同一个批次里。该参数指定了一个批次可以使用的内存大小,按照字节数计算。
batch-size: 16384
# 设置生产者内存缓冲区的大小。
buffer-memory: 33554432
# 键的序列化方式
key-serializer: org.apache.kafka.common.serialization.StringSerializer
# 值的序列化方式
value-serializer: org.apache.kafka.common.serialization.StringSerializer
# acks=0 : 生产者在成功写入消息之前不会等待任何来自服务器的响应。
# acks=1 : 只要集群的首领节点收到消息,生产者就会收到一个来自服务器成功响应。
# acks=all :只有当所有参与复制的节点全部收到消息时,生产者才会收到一个来自服务器的成功响应。
acks: 1
consumer:
# 自动提交的时间间隔 在spring boot 2.X 版本中这里采用的是值的类型为Duration 需要符合特定的格式,如1S,1M,2H,5D
auto-commit-interval: 1S
# 该属性指定了消费者在读取一个没有偏移量的分区或者偏移量无效的情况下该作何处理:
# latest(默认值)在偏移量无效的情况下,消费者将从最新的记录开始读取数据(在消费者启动之后生成的记录)
# earliest :在偏移量无效的情况下,消费者将从起始位置读取分区的记录
auto-offset-reset: earliest
# 是否自动提交偏移量,默认值是true,为了避免出现重复数据和数据丢失,可以把它设置为false,然后手动提交偏移量
enable-auto-commit: true
# 键的反序列化方式
key-deserializer: org.apache.kafka.common.serialization.StringDeserializer
# 值的反序列化方式
value-deserializer: org.apache.kafka.common.serialization.StringDeserializer
#消费组
group-id: test4
#消费者并发线程数
concurrency: 4
#超时时间
max-poll-interval-ms: 60000
#listener:
# 在侦听器容器中运行的线程数。
#concurrency: 5
#listener负责ack,每调用一次,就立即commit
#ack-mode: manual_immediate
#missing-topics-fatal: false
boot:
admin:
client:
url: http://172.18.1.147:8001
instance:
service-base-url: http://172.18.1.147:8017
management:
endpoints:
web:
exposure:
include: "*"
endpoint:
health:
show-details: always
health:
elasticsearch:
enabled: false
customize-kafka:
producer:
topic: produce_analyze
task:
task-queue-path: ../data/taskQueue.txt
gofast:
profix:
host: http://172.18.1.146:8080
file:
download:
dir: ../file/
ocrApi: http://10.0.32.238:10004/ocr/arm
uploadUrl: http://172.18.1.130:9985/group33/upload

36
src/main/resources/logback-spring.xml

@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Values resolved from the Spring Environment. The defaults apply when a
         property is not configured, so the placeholders below never expand to an
         "_IS_UNDEFINED" string. -->
    <springProperty scope="context" name="logging.path" source="logging.path" defaultValue="../logs"/>
    <springProperty scope="context" name="logging.level" source="logging.level.com.bfd" defaultValue="info"/>

    <!-- Console appender, disabled: production instances run in the background,
         so console output is of little use. -->
    <!-- <appender name="STDOUT"
              class="ch.qos.logback.core.ConsoleAppender">
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n</pattern>
        </encoder>
    </appender> -->

    <!-- Daily rolling file appender; seven days of history are kept. -->
    <appender name="GLMAPPER-LOGGERONE"
              class="ch.qos.logback.core.rolling.RollingFileAppender">
        <append>true</append>
        <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
            <level>${logging.level}</level>
        </filter>
        <file>${logging.path}/document_parseInfo.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>${logging.path}/document_parseInfo.log.%d{yyyy-MM-dd}</fileNamePattern>
            <maxHistory>7</maxHistory>
        </rollingPolicy>
        <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n</pattern>
            <charset>UTF-8</charset>
        </encoder>
    </appender>

    <root level="info">
        <appender-ref ref="GLMAPPER-LOGGERONE"/>
        <!-- <appender-ref ref="STDOUT"/> -->
    </root>
</configuration>
Loading…
Cancel
Save