Browse Source

excel解析应用

master
55007 6 months ago
commit
5bed73d153
  1. 33
      .gitignore
  2. 1
      README.md
  3. 37
      logs/formHandler.log
  4. 5627
      logs/ocrHandler.log
  5. 143
      pom.xml
  6. 13
      src/main/java/com/bfd/crawl/formhandler/FormHanlerApplication.java
  7. 60
      src/main/java/com/bfd/crawl/formhandler/bean/ResponsePo.java
  8. 48
      src/main/java/com/bfd/crawl/formhandler/config/AsyncThreadConfiguration.java
  9. 27
      src/main/java/com/bfd/crawl/formhandler/config/Constant.java
  10. 52
      src/main/java/com/bfd/crawl/formhandler/controller/ExcelHandlerController.java
  11. 32
      src/main/java/com/bfd/crawl/formhandler/enums/ResponseCode.java
  12. 243
      src/main/java/com/bfd/crawl/formhandler/service/HandlerService.java
  13. 54
      src/main/java/com/bfd/crawl/formhandler/service/SendService.java
  14. 75
      src/main/java/com/bfd/crawl/formhandler/service/StartServcie.java
  15. 59
      src/main/java/com/bfd/crawl/formhandler/util/CsvUtil.java
  16. 60
      src/main/java/com/bfd/crawl/formhandler/util/DataUtil.java
  17. 318
      src/main/java/com/bfd/crawl/formhandler/util/ExcelUtils.java
  18. 117
      src/main/java/com/bfd/crawl/formhandler/util/FileDownloader.java
  19. 42
      src/main/java/com/bfd/crawl/formhandler/util/FileUtil.java
  20. 59
      src/main/java/com/bfd/crawl/formhandler/util/OcrUtil.java
  21. 23
      src/main/java/com/bfd/crawl/formhandler/util/OsUtil.java
  22. 19
      src/main/java/com/bfd/crawl/formhandler/util/QueueUtil.java
  23. 94
      src/main/java/com/bfd/crawl/formhandler/util/StringUtil.java
  24. 60
      src/main/resources/application.yml
  25. 36
      src/main/resources/logback-spring.xml

33
.gitignore

@ -0,0 +1,33 @@
HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/

1
README.md

@ -0,0 +1 @@
excel解析应用

37
logs/formHandler.log

@ -0,0 +1,37 @@
2024-03-25 16:03:08.049 [restartedMain] 55 INFO com.bfd.crawl.formhandler.FormHanlerApplication - Starting FormHanlerApplication on JinMing with PID 18732 (D:\git\formHanler\target\classes started by ming.jin in D:\git\formHanler)
2024-03-25 16:03:08.051 [restartedMain] 651 INFO com.bfd.crawl.formhandler.FormHanlerApplication - No active profile set, falling back to default profiles: default
2024-03-25 16:03:08.100 [restartedMain] 225 INFO o.s.b.d.env.DevToolsPropertyDefaultsPostProcessor - Devtools property defaults active! Set 'spring.devtools.add-properties' to 'false' to disable
2024-03-25 16:03:08.100 [restartedMain] 225 INFO o.s.b.d.env.DevToolsPropertyDefaultsPostProcessor - For additional web related logging consider setting the 'logging.level.web' property to 'DEBUG'
2024-03-25 16:03:08.680 [restartedMain] 92 INFO o.s.boot.web.embedded.tomcat.TomcatWebServer - Tomcat initialized with port(s): 7081 (http)
2024-03-25 16:03:08.685 [restartedMain] 173 INFO org.apache.coyote.http11.Http11NioProtocol - Initializing ProtocolHandler ["http-nio-7081"]
2024-03-25 16:03:08.685 [restartedMain] 173 INFO org.apache.catalina.core.StandardService - Starting service [Tomcat]
2024-03-25 16:03:08.685 [restartedMain] 173 INFO org.apache.catalina.core.StandardEngine - Starting Servlet engine: [Apache Tomcat/9.0.30]
2024-03-25 16:03:08.729 [restartedMain] 173 INFO o.a.c.core.ContainerBase.[Tomcat].[localhost].[/] - Initializing Spring embedded WebApplicationContext
2024-03-25 16:03:08.729 [restartedMain] 284 INFO org.springframework.web.context.ContextLoader - Root WebApplicationContext: initialization completed in 629 ms
2024-03-25 16:03:08.873 [restartedMain] 171 INFO o.s.scheduling.concurrent.ThreadPoolTaskExecutor - Initializing ExecutorService
2024-03-25 16:03:08.874 [restartedMain] 171 INFO o.s.scheduling.concurrent.ThreadPoolTaskExecutor - Initializing ExecutorService 'asyncExecutor'
2024-03-25 16:03:08.875 [restartedMain] 171 INFO o.s.scheduling.concurrent.ThreadPoolTaskExecutor - Initializing ExecutorService
2024-03-25 16:03:08.875 [restartedMain] 171 INFO o.s.scheduling.concurrent.ThreadPoolTaskExecutor - Initializing ExecutorService 'sendExecutor'
2024-03-25 16:03:10.682 [restartedMain] 171 INFO o.s.scheduling.concurrent.ThreadPoolTaskScheduler - Initializing ExecutorService
2024-03-25 16:03:10.727 [restartedMain] 58 INFO o.s.b.d.autoconfigure.OptionalLiveReloadServer - LiveReload server is running on port 35729
2024-03-25 16:03:10.729 [restartedMain] 58 INFO o.s.b.actuate.endpoint.web.EndpointLinksResolver - Exposing 14 endpoint(s) beneath base path '/actuator'
2024-03-25 16:03:10.754 [restartedMain] 173 INFO org.apache.coyote.http11.Http11NioProtocol - Starting ProtocolHandler ["http-nio-7081"]
2024-03-25 16:03:10.768 [restartedMain] 204 INFO o.s.boot.web.embedded.tomcat.TomcatWebServer - Tomcat started on port(s): 7081 (http) with context path ''
2024-03-25 16:03:10.770 [restartedMain] 61 INFO com.bfd.crawl.formhandler.FormHanlerApplication - Started FormHanlerApplication in 2.951 seconds (JVM running for 5.365)
2024-03-25 16:03:10.772 [restartedMain] 35 INFO com.bfd.crawl.formhandler.service.StartServcie - 处理服务线程0已启动
2024-03-25 16:03:10.774 [restartedMain] 39 INFO com.bfd.crawl.formhandler.service.StartServcie - 发送服务线程0已启动
2024-03-25 16:03:10.775 [handlerData-1] 133 INFO com.bfd.crawl.formhandler.service.HandlerService - 任务队列为空,休眠3秒
2024-03-25 16:03:10.775 [sendData-1] 45 INFO com.bfd.crawl.formhandler.service.SendService - 任务队列为空,休眠3秒
2024-03-25 16:03:10.775 [Thread-12] 46 INFO com.bfd.crawl.formhandler.service.StartServcie - 任务队列长度为0
2024-03-25 16:03:10.775 [Thread-12] 47 INFO com.bfd.crawl.formhandler.service.StartServcie - 发送队列长度为0
2024-03-25 16:03:11.012 [registrationTask1] 84 INFO d.c.b.a.client.registration.ApplicationRegistrator - Application registered itself as b40b416ce444
2024-03-25 16:03:13.785 [sendData-1] 45 INFO com.bfd.crawl.formhandler.service.SendService - 任务队列为空,休眠3秒
2024-03-25 16:03:13.785 [handlerData-1] 133 INFO com.bfd.crawl.formhandler.service.HandlerService - 任务队列为空,休眠3秒
2024-03-25 16:03:13.897 [RMI TCP Connection(3)-10.10.144.49] 173 INFO o.a.c.core.ContainerBase.[Tomcat].[localhost].[/] - Initializing Spring DispatcherServlet 'dispatcherServlet'
2024-03-25 16:03:13.897 [RMI TCP Connection(3)-10.10.144.49] 525 INFO org.springframework.web.servlet.DispatcherServlet - Initializing Servlet 'dispatcherServlet'
2024-03-25 16:03:13.900 [RMI TCP Connection(3)-10.10.144.49] 547 INFO org.springframework.web.servlet.DispatcherServlet - Completed initialization in 3 ms
2024-03-25 16:03:15.244 [SpringContextShutdownHook] 208 INFO o.s.scheduling.concurrent.ThreadPoolTaskScheduler - Shutting down ExecutorService
2024-03-25 16:03:15.245 [SpringContextShutdownHook] 208 INFO o.s.scheduling.concurrent.ThreadPoolTaskExecutor - Shutting down ExecutorService 'sendExecutor'
2024-03-25 16:03:15.245 [SpringContextShutdownHook] 208 INFO o.s.scheduling.concurrent.ThreadPoolTaskExecutor - Shutting down ExecutorService 'asyncExecutor'
2024-03-25 16:03:16.788 [handlerData-1] 133 INFO com.bfd.crawl.formhandler.service.HandlerService - 任务队列为空,休眠3秒
2024-03-25 16:03:16.788 [sendData-1] 45 INFO com.bfd.crawl.formhandler.service.SendService - 任务队列为空,休眠3秒

5627
logs/ocrHandler.log
File diff suppressed because it is too large
View File

143
pom.xml

@ -0,0 +1,143 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.2.4.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.bfd.crawl</groupId>
<artifactId>formHanler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>formHanler</name>
<description>formHanler</description>
<properties>
<java.version>8</java.version>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/de.codecentric/spring-boot-admin-starter-client -->
<dependency>
<groupId>de.codecentric</groupId>
<artifactId>spring-boot-admin-client</artifactId>
<version>2.2.4</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.28</version>
</dependency>
<!-- Apache POI -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.5</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.5</version>
<exclusions>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
<version>2.17.1</version>
</dependency>
<!-- All Apache POI modules (poi, poi-scratchpad, poi-ooxml) must share one version;
     mixing jars from different POI releases is unsupported and can fail at runtime. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.5</version>
</dependency>
<!--JSON-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>2.0.17</version>
</dependency>
<!--OKHTTP-->
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.9.1</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.8</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>2.3.1</version> <!--根据您正在使用的Kafka版本选择合适的版本号-->
</dependency>
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.bfd.util</groupId>
<artifactId>pauseTool</artifactId>
<version>1.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
</project>

13
src/main/java/com/bfd/crawl/formhandler/FormHanlerApplication.java

@ -0,0 +1,13 @@
package com.bfd.crawl.formhandler;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
/**
 * Spring Boot bootstrap class for the form (Excel/CSV) handler service.
 */
@SpringBootApplication
public class FormHanlerApplication {

    /**
     * JVM entry point: hands control to Spring Boot with this class as the primary source.
     *
     * @param args command-line arguments, forwarded unchanged to Spring
     */
    public static void main(String[] args) {
        final Class<?> primarySource = FormHanlerApplication.class;
        SpringApplication.run(primarySource, args);
    }
}

60
src/main/java/com/bfd/crawl/formhandler/bean/ResponsePo.java

@ -0,0 +1,60 @@
package com.bfd.crawl.formhandler.bean;
import com.bfd.crawl.formhandler.enums.ResponseCode;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Response envelope returned by the HTTP endpoints: a numeric code, a message and an
 * optional payload.
 *
 * <p>Lombok generates the getters/setters, the no-args constructor and the all-args
 * constructor {@code (int code, Object data, String message)}; the field order below
 * therefore defines that constructor's parameter order and must not be changed.
 *
 * @author jinming
 * @version 1.0
 * @date 2023/4/3 17:23
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class ResponsePo {
    /**
     * Response code.
     */
    private int code;
    /**
     * Payload; on success this normally carries the returned data (as a JSON string).
     */
    private Object data;
    /**
     * Human-readable message.
     */
    private String message;

    /**
     * @return a response carrying {@link ResponseCode#SUCCESS} and no payload
     */
    public static ResponsePo success() {
        return setStatus(ResponseCode.SUCCESS.getCode(), ResponseCode.SUCCESS.getMessage());
    }

    /**
     * @return a response carrying {@link ResponseCode#FAILURE} and no payload
     */
    public static ResponsePo error() {
        return setStatus(ResponseCode.FAILURE.getCode(), ResponseCode.FAILURE.getMessage());
    }

    /**
     * Builds a payload-less response with an arbitrary code and message.
     *
     * @param code    response code
     * @param message response message
     * @return the new response
     */
    public static ResponsePo setStatus(int code, String message) {
        ResponsePo resultBean = new ResponsePo();
        resultBean.code = code;
        resultBean.message = message;
        return resultBean;
    }

    /**
     * Creates a payload-less response. ({@code data} stays {@code null}; the previous
     * {@code this.data = data} line was a self-assignment with no effect.)
     *
     * @param code    response code
     * @param message response message
     */
    public ResponsePo(int code, String message) {
        this.code = code;
        this.message = message;
    }

    /**
     * Creates a payload-less response from a predefined {@link ResponseCode}.
     *
     * @param responseCode source of the code and message
     */
    public ResponsePo(ResponseCode responseCode) {
        this(responseCode.getCode(), responseCode.getMessage());
    }
}

48
src/main/java/com/bfd/crawl/formhandler/config/AsyncThreadConfiguration.java

@ -0,0 +1,48 @@
package com.bfd.crawl.formhandler.config;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import java.util.concurrent.Executor;
/**
 * Declares the two thread pools used by the {@code @Async} worker loops:
 * {@code asyncExecutor} (threads named {@code handlerData-N}) and
 * {@code sendExecutor} (threads named {@code sendData-N}).
 *
 * @author jinming
 * @version 1.0
 * @date 2022/2/17 18:37
 */
@Configuration
@EnableAsync
public class AsyncThreadConfiguration {

    /** Core and maximum pool size (the pool is fixed-size because both are equal). */
    private static final int POOL_SIZE = 500;

    /** Capacity of the queue holding tasks that wait for a free thread. */
    private static final int QUEUE_CAPACITY = 500;

    /**
     * @return executor backing {@code @Async("asyncExecutor")} methods
     */
    @Bean
    public Executor asyncExecutor() {
        return buildExecutor("handlerData-");
    }

    /**
     * @return executor backing {@code @Async("sendExecutor")} methods
     */
    @Bean
    public Executor sendExecutor() {
        return buildExecutor("sendData-");
    }

    /**
     * Creates a configured, not-yet-initialized executor.
     *
     * <p>{@code initialize()} is intentionally NOT called here: the object is returned from a
     * {@code @Bean} method, so the container invokes {@code afterPropertiesSet()} (which calls
     * {@code initialize()}) itself. Calling it manually as well built the underlying pool twice,
     * which is visible in the startup log as two consecutive "Initializing ExecutorService" lines
     * per bean.
     *
     * @param threadNamePrefix prefix for the names of the pool's threads
     * @return the configured executor
     */
    private static ThreadPoolTaskExecutor buildExecutor(String threadNamePrefix) {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(POOL_SIZE);
        executor.setMaxPoolSize(POOL_SIZE);
        executor.setQueueCapacity(QUEUE_CAPACITY);
        executor.setThreadNamePrefix(threadNamePrefix);
        // On context shutdown, let submitted tasks finish instead of interrupting them.
        executor.setWaitForTasksToCompleteOnShutdown(true);
        return executor;
    }
}

27
src/main/java/com/bfd/crawl/formhandler/config/Constant.java

@ -0,0 +1,27 @@
package com.bfd.crawl.formhandler.config;
/**
 * String constants shared across the form handler.
 *
 * @author jinming
 * @version 1.0
 * @date 2023/8/16 15:26
 */
public class Constant {

    /** Marker searched for in a file extension to recognise Excel files. */
    public static final String IS_XLS = "xls";

    /** Marker searched for in a file extension to recognise CSV files. */
    public static final String IS_CSV = "csv";

    /** Wildcard literal. */
    public static final String ALL = "*";
}

52
src/main/java/com/bfd/crawl/formhandler/controller/ExcelHandlerController.java

@ -0,0 +1,52 @@
package com.bfd.crawl.formhandler.controller;
import com.alibaba.fastjson.JSON;
import com.bfd.crawl.formhandler.bean.ResponsePo;
import com.bfd.crawl.formhandler.enums.ResponseCode;
import com.bfd.crawl.formhandler.util.QueueUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.util.Map;
/**
* @author:jinming
* @className:DataFilterController
* @version:1.0
* @description:
* @Date:2023/7/26 11:21
*/
@RestController
@RequestMapping("/handlerdata")
@Slf4j
public class ExcelHandlerController {
@PostMapping("/formhandler")
public ResponsePo documentFeedback(@RequestBody String dataJson) {
String trace = "trace";
ResponsePo responsePo = ResponsePo.success();
try {
Map parse = (Map) JSON.parse(dataJson);
log.info("新增任务:" + dataJson);
if (parse.containsKey(trace) && (Boolean) parse.get(trace) == true) {
log.info("测试流程,插入队首");
QueueUtil.taskQueue.putFirst(dataJson);
}else {
QueueUtil.taskQueue.add(dataJson);
}
} catch (Exception e) {
e.printStackTrace();
log.error("请求格式发生异常" + e.getMessage());
responsePo.setCode(ResponseCode.FAILURE.getCode());
responsePo.setMessage(ResponseCode.FAILURE.getMessage());
return responsePo;
}
return responsePo;
}
}

32
src/main/java/com/bfd/crawl/formhandler/enums/ResponseCode.java

@ -0,0 +1,32 @@
package com.bfd.crawl.formhandler.enums;
/**
 * Result codes (and their default messages) used in API responses.
 *
 * @author jinming
 * @version 1.0
 * @date 2023/2/28 11:40
 */
public enum ResponseCode {
    /** Operation succeeded. */
    SUCCESS(200, "操作成功"),
    /** Invalid request parameters. */
    FAILURE(400, "参数错误"),
    /** Internal server error. */
    INTERNAL_SERVER_ERROR(500, "服务器内部错误"),
    /** File type is not supported. */
    TYPE_NOT_SUPPORT(601, "文件类型不支持");

    /** Numeric result code. */
    private final int code;
    /** Default message for the code. */
    private final String message;

    ResponseCode(int code, String message) {
        this.message = message;
        this.code = code;
    }

    /**
     * @return the numeric result code
     */
    public int getCode() {
        return this.code;
    }

    /**
     * @return the default message for this code
     */
    public String getMessage() {
        return this.message;
    }
}

243
src/main/java/com/bfd/crawl/formhandler/service/HandlerService.java

@ -0,0 +1,243 @@
package com.bfd.crawl.formhandler.service;
import com.alibaba.fastjson.JSON;
import com.bfd.crawl.formhandler.config.Constant;
import com.bfd.crawl.formhandler.util.*;
import com.bfd.util.PauseTool;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
 * Worker that takes task JSON from {@code QueueUtil.taskQueue}, downloads the referenced
 * Excel/CSV file, parses it and puts one result message per parsed row on
 * {@code QueueUtil.sendQueue}.
 *
 * @author:jinming
 * @className:HandlerService
 * @version:1.0
 * @description: task-queue consumer for Excel/CSV parsing
 * @Date:2023/8/1 16:05
 */
@Service
@Slf4j
public class HandlerService {
// Base directory for downloaded files, injected from the "file.path" property.
@Value("${file.path}")
private String downloadFilePath;
/**
 * Endless worker loop, executed on the "asyncExecutor" pool.
 *
 * Task JSON fields read below: "data", "id", "input", "scenes_id", "version", "output" and the
 * optional "trace". Fields read from "input": "fileUrl", "allSheet", "dataRow" (Excel branch),
 * "form" and "csvCharSet" (CSV branch).
 *
 * Any Throwable raised while handling one task is logged by the outer catch and the loop
 * continues with the next task; in that case the failed task produces no result message
 * (only the Excel download/parse failures below send an explicit status=2 message).
 */
@Async("asyncExecutor")
void run() {
while (true) {
try {
// size() and poll() are two separate calls, so the queue can be emptied in between;
// that is why the polled value is checked again right below.
if (QueueUtil.taskQueue.size() > 0) {
log.info("当前工作线程开始获取数据");
String dataJson = QueueUtil.taskQueue.poll();
// StringUtil.hasValue is defined elsewhere; presumably a null/empty check - TODO confirm.
if (!StringUtil.hasValue(dataJson)) {
log.info("任务队列为空,休眠3秒");
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
// NOTE(review): the interrupt is swallowed here, so this loop cannot be stopped by interruption.
e.printStackTrace();
}
continue;
}
Map parse = (Map) JSON.parse(dataJson);
Map dataMap = (Map) parse.get("data");
// The (int) casts require Integer values; a missing or differently typed field throws and
// is handled by the outer catch (Throwable).
int id = (int) parse.get("id");
Map admin = (Map) parse.get("input");
int scenesId = (int) parse.get("scenes_id");
int version = (int) parse.get("version");
// Tasks whose "<scenes_id>_<version>" key is absent from PauseTool.CACHE are dropped without a result message.
String pauseKey = scenesId + "_" + version;
if (!PauseTool.CACHE.containsKey(pauseKey)) {
log.info("流程:{}的版本:{}已失效,任务跳过", scenesId, version);
continue;
}
// input.fileUrl is not the URL itself but an expression that DataUtil.getValue resolves against "data".
String key = (String) admin.get("fileUrl");
String fileUrl = (String) DataUtil.getValue(key, dataMap);
boolean windows = OsUtil.isWindows();
Map output = (Map) parse.get("output");
// TODO (translated from the original note): if sheet selection is needed later, uncomment the current line.
// NOTE(review): allSheet is assigned here but never read anywhere in this method, and a
// missing/invalid "allSheet" value is silently ignored by the empty catch.
int allSheet = 0;
try {
allSheet = (int) admin.get("allSheet");
} catch (Exception e) {
}
// Trace (test) mode: only the first row of the first sheet / first CSV record is sent.
Boolean isTrace = false;
if (parse.containsKey("trace") && (Boolean)parse.get("trace")==true) {
log.info("测试流程,只返回第一个sheet页的第一条数据");
isTrace = true;
}
// fieldType = custom output fields: 0 = off, 1 = on. When on, the form is merged into the
// output; when off, the default output is used (translated from the original comment).
// NOTE(review): fieldType is never assigned anything but 0, so the "fieldType != 0" branch
// further down is never executed as the code stands.
int fieldType = 0;
// The regex targets a trailing ".<ext>" at the end of the URL; getStrByPattern is defined
// elsewhere and presumably returns the matched text - TODO confirm what it returns on no match.
String fileType = StringUtil.getStrByPattern(fileUrl, "\\.[^.\\\\/:*?\"<>|\\r\\n]+$");
log.info("任务:" + id + "的文件类型为" + fileType);
// contains("xls") is also true for ".xlsx".
if (fileType.contains(Constant.IS_XLS)) {
String fileName = StringUtil.getMd5(fileUrl);
// Local download path for the Excel file: <file.path>/xls/<md5 of url><extension>
String xlsDir = downloadFilePath.concat(windows ? "\\xls\\" : "/xls/").concat(fileName).concat(fileType);
int dataRow = (int) admin.get("dataRow");
// Maps a column position to the output field name. ExcelUtils.getAlphabetPosition is not
// visible here; presumably it converts a column letter such as "A" to an index - TODO confirm.
Map<Integer, String> fromMap = new HashMap<>(32);
Set<String> set = output.keySet();
for (String s : set) {
fromMap.put(ExcelUtils.getAlphabetPosition(s), output.get(s).toString());
}
try {
FileDownloader.downloadFile(fileUrl, new File(xlsDir));
} catch (IOException e) {
// Download failed: report status=2 for this task and move on to the next one.
Map result = new HashMap(32);
result.put("status", 2);
result.put("results", "");
result.put("message", "文件下载失败");
parse.put("result", result);
String message = JSON.toJSONString(parse);
try {
QueueUtil.sendQueue.put(message);
} catch (InterruptedException ex) {
ex.printStackTrace();
}
continue;
}
Map<String, Object> xlsParse = null;
try {
xlsParse = ExcelUtils.parse(new File(xlsDir), dataRow, fromMap);
} catch (Exception e) {
// Parsing threw: report status=2 for this task and move on to the next one.
e.printStackTrace();
Map result = new HashMap(32);
result.put("status", 2);
result.put("results", "");
result.put("message", "Excel解析失败");
parse.put("result", result);
String message = JSON.toJSONString(parse);
try {
QueueUtil.sendQueue.put(message);
} catch (InterruptedException ex) {
ex.printStackTrace();
}
continue;
}
// xlsParse: sheet name -> list of row maps. One message is sent per row.
Set<String> xlsParseKeySet = xlsParse.keySet();
// loopIndex counts sheets, looploopIndex counts rows within the current sheet (both 1-based).
int loopIndex = 1;
for (String xlsParseKey : xlsParseKeySet) {
int looploopIndex = 1;
List<Map> sheetListData = (List<Map>) xlsParse.get(xlsParseKey);
for (Map sheetListDatum : sheetListData) {
Map result = new HashMap(32);
Map resultsMap = new HashMap(32);
// The last row of EVERY sheet is flagged with isLast=1.
// NOTE(review): the else-if can never run, because its condition already implies the first
// one. If isLast was meant for the last row of the last sheet only, this is a bug - confirm intent.
if (looploopIndex == sheetListData.size()) {
resultsMap.put("isLast", 1);
sheetListDatum.put("isLast", 1);
} else if (loopIndex == xlsParseKeySet.size() && looploopIndex == sheetListData.size()) {
resultsMap.put("isLast", 1);
sheetListDatum.put("isLast", 1);
}
if (fieldType != 0) {
// Custom-output path (currently unreachable, see fieldType above): copy only the configured
// output keys that have a value and skip rows that end up empty.
resultsMap.remove("result");
Set outputKeySet = output.keySet();
for (Object outputKey : outputKeySet) {
String dataVlue = (String) sheetListDatum.get(outputKey);
if (StringUtil.hasValue(dataVlue)) {
resultsMap.put(outputKey, dataVlue);
}
}
if (resultsMap.isEmpty()) {
looploopIndex++;
continue;
}
resultsMap.put("sheetName", xlsParseKey);
String resultsMapJson = JSON.toJSONString(resultsMap);
result.put("results", resultsMapJson);
parse.put("result", result);
result.put("status", 1);
result.put("message", "成功");
} else {
// Default path: send the whole parsed row plus the sheet name.
sheetListDatum.put("sheetName", xlsParseKey);
result.put("results", JSON.toJSONString(sheetListDatum));
result.put("status", 1);
result.put("message", "成功");
parse.put("result", result);
}
// The task map itself is reused as the message envelope; "result" is overwritten for each row.
String message = JSON.toJSONString(parse);
QueueUtil.sendQueue.put(message);
looploopIndex++;
if (isTrace){
break;
}
}
loopIndex++;
if (isTrace){
break;
}
}
} else if (fileType.contains(Constant.IS_CSV)) {
List<Map> form = (List<Map>) admin.get("form");
// NOTE(review): outputLine is never used.
Set<Integer> outputLine = new HashSet<>();
// Maps a column position to the "field" value of each form entry.
Map<Integer, String> fromMap = new HashMap<>(32);
for (Map map : form) {
String field = map.get("field").toString();
int alphabetPosition = ExcelUtils.getAlphabetPosition(field);
fromMap.put(alphabetPosition, field);
}
int loopIndex = 0;
// Character set for reading the CSV; falls back to UTF-8 when "csvCharSet" has no value.
String csvCharSet = (String) admin.get("csvCharSet");
if (!StringUtil.hasValue(csvCharSet)) {
csvCharSet = "UTF-8";
}
String fileName = StringUtil.getMd5(fileUrl);
// Local download path for the CSV file: <file.path>/csv/<md5 of url>.csv
String csvDir = downloadFilePath.concat(windows ? "\\csv\\" : "/csv/").concat(fileName).concat(".csv");
// NOTE(review): unlike the Excel branch, a download failure here is not reported with a
// status=2 message; the exception goes to the outer catch and the task ends silently.
FileDownloader.downloadFile(fileUrl, new File(csvDir));
List<Map<String, String>> csvDataList = CsvUtil.parseCSV(csvDir, csvCharSet, fromMap);
for (Map<String, String> stringStringMap : csvDataList) {
Map result = new HashMap(32);
Map resultsMap = new HashMap(32);
// NOTE(review): loopIndex starts at 0 and is only incremented after a record is sent, so it
// is always smaller than csvDataList.size() here and this condition is never true.
if (loopIndex == csvDataList.size()) {
resultsMap.put("isLast", 1);
stringStringMap.put("isLast", String.valueOf(1));
}
resultsMap.remove("result");
Set outputKeySet = output.keySet();
for (Object outputKey : outputKeySet) {
resultsMap.put(outputKey, stringStringMap.get(outputKey));
}
// NOTE(review): isLast=1 is set unconditionally, i.e. on EVERY record. As a consequence
// resultsMapJson can never equal "{}" and the "continue" below is never taken - confirm intent.
resultsMap.put("isLast", 1);
String resultsMapJson = JSON.toJSONString(resultsMap);
result.put("results", resultsMapJson);
result.put("status", 1);
result.put("message", "成功");
parse.put("result", result);
String empty = "{}";
if (resultsMapJson.equals(empty)) {
continue;
}
String message = JSON.toJSONString(parse);
QueueUtil.sendQueue.put(message);
loopIndex++;
if (isTrace){
break;
}
}
}
// Any other file type is ignored: no message is sent for the task.
} else {
log.info("任务队列为空,休眠3秒");
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
} catch (Throwable e) {
// Catch-all so that one failing task never terminates the worker loop.
e.printStackTrace();
log.error("工作线程发生异常" + e.getMessage());
}
}
}
}

54
src/main/java/com/bfd/crawl/formhandler/service/SendService.java

@ -0,0 +1,54 @@
package com.bfd.crawl.formhandler.service;
import com.alibaba.fastjson.JSON;
import com.bfd.crawl.formhandler.util.QueueUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.Map;
/**
* @author:jinming
* @className:SendService
* @version:1.0
* @description:
* @Date:2023/7/31 17:53
*/
@Slf4j
@Service
public class SendService {
@Value("${send.topic}")
private String topic;
@Resource
private KafkaTemplate kafkaTemplate;
@Async("sendExecutor")
void sendToKafka() {
while (true) {
if (QueueUtil.sendQueue.size() > 0) {
try {
String message = QueueUtil.sendQueue.take();
Map parse = (Map) JSON.parse(message);
String id = parse.get("id").toString();
log.info("ID:" + id + "\t" + "数据已发出");
kafkaTemplate.send(topic, message);
} catch (Exception e) {
e.printStackTrace();
}
} else {
log.info("任务队列为空,休眠3秒");
try {
Thread.sleep(3000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
}

75
src/main/java/com/bfd/crawl/formhandler/service/StartServcie.java

@ -0,0 +1,75 @@
package com.bfd.crawl.formhandler.service;
import com.bfd.crawl.formhandler.util.QueueUtil;
import com.bfd.util.PauseTool;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
 * Startup hook: once the application is up it initialises the pause tool, launches the
 * configured number of handler and sender worker loops, and starts a thread that logs both
 * queue sizes every 10 seconds.
 *
 * @author jinming
 * @version 1.0
 * @date 2023/7/31 17:14
 */
@Service
@Slf4j
public class StartServcie implements ApplicationRunner {
    /** Number of handler worker loops to start ("thread.handler"). */
    @Value("${thread.handler}")
    private int handlerNumber;
    /** Number of sender worker loops to start ("thread.send"). */
    @Value("${thread.send}")
    private int sendNumber;
    @Autowired
    private HandlerService handlerService;
    @Autowired
    private SendService sendService;
    @Value("${zookeeper.connection-string}")
    private String connectionString;
    @Value("${zookeeper.publish-node}")
    private String nodePath;
    @Resource
    private StringRedisTemplate stringRedisTemplate;

    /**
     * Invoked by Spring Boot after the context has started.
     *
     * @param args application arguments (unused)
     * @throws Exception propagated from the pause tool set-up
     */
    @Override
    public void run(ApplicationArguments args) throws Exception {
        PauseTool tool = new PauseTool();
        tool.initializeRedisCache(stringRedisTemplate);
        tool.setupZookeeperListener(connectionString, nodePath);

        // Both worker methods are annotated with @Async in their services.
        for (int handlerIdx = 0; handlerIdx < handlerNumber; handlerIdx++) {
            log.info("处理服务线程" + handlerIdx + "已启动 ");
            handlerService.run();
        }
        for (int senderIdx = 0; senderIdx < sendNumber; senderIdx++) {
            log.info("发送服务线程" + senderIdx + "已启动 ");
            sendService.sendToKafka();
        }

        // Dedicated thread that periodically reports the queue sizes.
        Thread monitor = new Thread(() -> {
            for (; ; ) {
                log.info("任务队列长度为" + QueueUtil.taskQueue.size());
                log.info("发送队列长度为" + QueueUtil.sendQueue.size());
                try {
                    Thread.sleep(10000);
                } catch (InterruptedException interrupted) {
                    interrupted.printStackTrace();
                }
            }
        });
        monitor.start();
    }
}

59
src/main/java/com/bfd/crawl/formhandler/util/CsvUtil.java

@ -0,0 +1,59 @@
package com.bfd.crawl.formhandler.util;
import com.alibaba.fastjson.JSON;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * CSV parsing helper.
 *
 * @author jinming
 * @version 1.0
 * @date 2024/3/25 11:02
 */
public class CsvUtil {

    /**
     * Reads selected columns of a CSV file whose first record is a header line.
     *
     * <p>The first element of the returned list describes the header: it maps each field name
     * from {@code fromMap} to the header text of the corresponding column. Every following
     * element is one data record, mapping field name to cell value.
     *
     * <p>Best effort: any failure (missing file, unknown charset, column index out of range, ...)
     * is printed to stderr and whatever was collected up to that point is returned.
     *
     * @param filePath   path of the CSV file
     * @param csvCharSet name of the character set used to decode the file
     * @param fromMap    zero-based column index mapped to the field name used as result key
     * @return header map followed by one map per record; possibly empty or partial on failure
     */
    public static List<Map<String, String>> parseCSV(String filePath, String csvCharSet, Map<Integer, String> fromMap) {
        List<Map<String, String>> dataList = new ArrayList<>();
        try (Reader reader = new InputStreamReader(new FileInputStream(filePath), csvCharSet);
             CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT.withFirstRecordAsHeader())) {
            // Fetch the header list once instead of once per column.
            List<String> headerNames = csvParser.getHeaderNames();
            Map<String, String> headerDataMap = new HashMap<>(32);
            for (Map.Entry<Integer, String> column : fromMap.entrySet()) {
                headerDataMap.put(column.getValue(), headerNames.get(column.getKey()));
            }
            dataList.add(headerDataMap);
            // The per-cell System.out debug dumps were removed: they serialised the growing row
            // map to JSON for every single cell.
            for (CSVRecord csvRecord : csvParser) {
                Map<String, String> dataMap = new HashMap<>(32);
                for (Map.Entry<Integer, String> column : fromMap.entrySet()) {
                    dataMap.put(column.getValue(), csvRecord.get(column.getKey()));
                }
                dataList.add(dataMap);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return dataList;
    }

    /**
     * Manual smoke test against a developer-local file; not used by the application.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Map<Integer, String> fromMap = new HashMap<>(32);
        fromMap.put(0, "A");
        fromMap.put(2, "C");
        List<Map<String, String>> gbk = parseCSV("C:\\Users\\10318\\Desktop\\评论.csv", "gbk", fromMap);
        System.out.println(gbk);
    }
}

60
src/main/java/com/bfd/crawl/formhandler/util/DataUtil.java

@ -0,0 +1,60 @@
package com.bfd.crawl.formhandler.util;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.JSONPath;
import lombok.extern.slf4j.Slf4j;
import java.util.Map;
/**
 * Resolves a value out of the task's data map by means of a small key expression.
 *
 * @author jinming
 * @version 1.0
 * @date 2023/11/1 9:54
 */
@Slf4j
public class DataUtil {

    /**
     * Evaluates {@code key} against {@code dataMap}.
     *
     * <p>Two key forms are handled:
     * <ul>
     *   <li>{@code field:jsonPath} - the string stored under {@code field} is parsed as JSON and
     *       {@code jsonPath} is evaluated on it;</li>
     *   <li>{@code field:jsonPath#json#innerPath} - as above, but the (string) result of the
     *       first evaluation is parsed as JSON again and {@code innerPath} is evaluated on it.</li>
     * </ul>
     *
     * @param key     key expression
     * @param dataMap data map holding JSON strings
     * @return the resolved value; {@code ""} when {@code key} has no value; {@code null} when
     *         anything goes wrong during resolution (the error is logged)
     */
    public static Object getValue(String key, Map dataMap) {
        try {
            // Nothing to evaluate.
            if (!StringUtil.hasValue(key)) {
                return "";
            }
            final String nestedMarker = "#json#";
            if (!key.contains(nestedMarker)) {
                // Plain form: field:jsonPath
                String[] fieldAndPath = key.split(":");
                String path = fieldAndPath[1];
                String rawJson = (String) dataMap.get(fieldAndPath[0]);
                JSONObject root = JSON.parseObject(rawJson);
                return JSONPath.eval(root, path);
            }
            // Nested form: the part before the marker selects a JSON string ...
            String[] outerAndInner = key.split(nestedMarker);
            String[] outerFieldAndPath = outerAndInner[0].split(":");
            String outerJson = (String) dataMap.get(outerFieldAndPath[0]);
            JSONObject outerRoot = JSON.parseObject(outerJson);
            String innerJson = (String) JSONPath.eval(outerRoot, outerFieldAndPath[1]);
            // ... and the part after the marker is evaluated on that string parsed as JSON.
            String innerPath = outerAndInner[1];
            JSONObject innerRoot = JSON.parseObject(innerJson);
            return JSONPath.eval(innerRoot, innerPath);
        } catch (Exception e) {
            log.error("jsonpath公式取值异常,", e);
            return null;
        }
    }
}

318
src/main/java/com/bfd/crawl/formhandler/util/ExcelUtils.java

@ -0,0 +1,318 @@
package com.bfd.crawl.formhandler.util;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.util.CellRangeAddress;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.*;
/**
* @author jian.mao
* @date 2023年4月7日
* @description excel解析工具类
*/
public class ExcelUtils {
/**
 * Checks whether the given cell lies inside one of the sheet's merged regions.
 *
 * @param sheet  sheet to inspect
 * @param row    zero-based row index of the cell
 * @param column zero-based column index of the cell
 * @return a map that always contains the Boolean entry "isMergedRegion"; when it is
 *         {@code true} the map also holds "startRow" and "startColumn", the first row and
 *         first column of the first matching merged region
 */
public static Map isMergedRegion(Sheet sheet, int row, int column) {
    Map result = new HashMap(32);
    final int regionTotal = sheet.getNumMergedRegions();
    for (int idx = 0; idx < regionTotal; idx++) {
        CellRangeAddress region = sheet.getMergedRegion(idx);
        int firstCol = region.getFirstColumn();
        int lastCol = region.getLastColumn();
        int firstRow = region.getFirstRow();
        int lastRow = region.getLastRow();
        // Skip regions that do not contain the cell (bounds are inclusive).
        boolean rowOutside = row < firstRow || row > lastRow;
        boolean columnOutside = column < firstCol || column > lastCol;
        if (rowOutside || columnOutside) {
            continue;
        }
        result.put("startRow", firstRow);
        result.put("startColumn", firstCol);
        result.put("isMergedRegion", true);
        return result;
    }
    result.put("isMergedRegion", false);
    return result;
}
/**
 * Parses the selected columns of every sheet of an Excel workbook.
 *
 * <p>For each sheet, every physically present row whose sheet row index is at or after the
 * data start row yields one map (field name to formatted cell text). A cell that belongs to a
 * merged region takes the text of that region's first (top-left) cell.
 *
 * <p>Rows are addressed by their real sheet index ({@code Row.getRowNum()}). The sheet
 * iterator only returns rows that physically exist, so counting iterations (as this method
 * used to) drifts from the real index as soon as the sheet contains a gap, which made both the
 * start-row skip and the merged-region lookup hit the wrong rows.
 *
 * <p>An {@link IOException} while reading is printed to stderr and the sheets collected so far
 * are returned; other exceptions propagate to the caller. The stream and the workbook are
 * closed in every case.
 *
 * @param excel        workbook file (.xls or .xlsx)
 * @param dataStartRow 1-based number of the first row to read
 * @param form         zero-based column index mapped to the field name used as result key
 * @return sheet name mapped to a {@code List<Map<String, String>>} of its rows
 */
public static Map<String, Object> parse(File excel, int dataStartRow, Map<Integer, String> form) {
    // Raise POI's byte-array allocation limit (process-wide setting) to 200,000,000 bytes.
    IOUtils.setByteArrayMaxOverride(200000000);
    // Convert the 1-based row number to a zero-based sheet row index.
    final int firstDataRowIndex = dataStartRow - 1;
    Map<String, Object> excelMap = new HashMap<String, Object>(16);
    // try-with-resources: both resources are released even when parsing throws.
    try (FileInputStream file = new FileInputStream(excel);
         Workbook workbook = WorkbookFactory.create(file)) {
        DataFormatter dataFormatter = new DataFormatter();
        int numberOfSheets = workbook.getNumberOfSheets();
        for (int i = 0; i < numberOfSheets; i++) {
            Sheet sheet = workbook.getSheetAt(i);
            List<Map<String, String>> data = new ArrayList<Map<String, String>>();
            for (Row row : sheet) {
                int rowIndex = row.getRowNum();
                if (rowIndex < firstDataRowIndex) {
                    continue;
                }
                Map<String, String> rowMap = new HashMap<String, String>(16);
                for (Map.Entry<Integer, String> column : form.entrySet()) {
                    int columnIndex = column.getKey();
                    Map mergedRegion = isMergedRegion(sheet, rowIndex, columnIndex);
                    String cellValue;
                    if ((boolean) mergedRegion.get("isMergedRegion")) {
                        // Only the top-left cell of a merged region carries the value.
                        int startRow = (int) mergedRegion.get("startRow");
                        int startColumn = (int) mergedRegion.get("startColumn");
                        Row anchorRow = sheet.getRow(startRow);
                        // getRow returns null for a row that is not physically present;
                        // formatCellValue(null) yields an empty string.
                        cellValue = dataFormatter.formatCellValue(
                                anchorRow == null ? null : anchorRow.getCell(startColumn));
                    } else {
                        cellValue = dataFormatter.formatCellValue(row.getCell(columnIndex));
                    }
                    rowMap.put(column.getValue(), cellValue);
                }
                data.add(rowMap);
            }
            excelMap.put(sheet.getSheetName(), data);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return excelMap;
}
/**
 * Parses every sheet of a workbook, using the first physical row of each sheet as the header.
 *
 * <p>Each following row becomes a map from header title to formatted cell value. Cells that
 * belong to a merged region take the value of the region's top-left cell.
 *
 * @param excel workbook file to read
 * @return map from sheet name to a {@code List<Map<String, String>>} with one map per data row;
 *         if an {@link IOException} occurs the stack trace is printed and whatever was parsed so
 *         far (possibly nothing) is returned
 */
public static Map<String, Object> parse(File excel) {
    Map<String, Object> excelMap = new HashMap<String, Object>(16);
    // try-with-resources closes the workbook and the stream even when parsing fails.
    try (FileInputStream file = new FileInputStream(excel);
         Workbook workbook = WorkbookFactory.create(file)) {
        DataFormatter dataFormatter = new DataFormatter();
        int numberOfSheets = workbook.getNumberOfSheets();
        for (int i = 0; i < numberOfSheets; i++) {
            Sheet sheet = workbook.getSheetAt(i);
            List<Map<String, String>> data = new ArrayList<Map<String, String>>();
            // Column index -> header title.
            Map<Integer, String> titleHead = new HashMap<Integer, String>(16);
            boolean headerRead = false;
            for (Row row : sheet) {
                if (!headerRead) {
                    // Key titles by the real column index: the cell iterator skips cells that
                    // do not exist, so a running counter would shift titles to the left.
                    for (Cell cell : row) {
                        titleHead.put(cell.getColumnIndex(), dataFormatter.formatCellValue(cell));
                    }
                    headerRead = true;
                    continue;
                }
                // Real row index; the sheet iterator skips rows that do not physically exist.
                int rowIndex = row.getRowNum();
                Map<String, String> rowMap = new HashMap<String, String>(16);
                for (Map.Entry<Integer, String> column : titleHead.entrySet()) {
                    int columnIndex = column.getKey();
                    Cell cell = row.getCell(columnIndex);
                    Map mergedRegion = isMergedRegion(sheet, rowIndex, columnIndex);
                    if ((boolean) mergedRegion.get("isMergedRegion")) {
                        // Only the top-left cell of a merged region carries the value.
                        Row anchorRow = sheet.getRow((int) mergedRegion.get("startRow"));
                        cell = anchorRow == null
                                ? null
                                : anchorRow.getCell((int) mergedRegion.get("startColumn"));
                    }
                    // formatCellValue yields an empty string for a null cell.
                    rowMap.put(column.getValue(), dataFormatter.formatCellValue(cell));
                }
                data.add(rowMap);
            }
            excelMap.put(sheet.getSheetName(), data);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return excelMap;
}
/**
 * Writes a list of rows to a new .xlsx file with a single sheet.
 *
 * <p>The header row lists every key found in {@code data}, in order of first appearance, and
 * each data row is written in that same column order so values always line up with their
 * header. A {@code null} or empty {@code data} produces a sheet with an empty header row.
 *
 * @param data          rows to write; each map is one row (key = column title, value = cell text);
 *                      a missing or {@code null} value leaves the cell blank
 * @param excelFilePath path of the file to create, including name and extension
 * @param sheetName     name of the sheet
 * @throws IOException if the file cannot be written
 */
public static void write(List<Map<String, String>> data, String excelFilePath, String sheetName) throws IOException {
    // try-with-resources releases the workbook even if writing fails.
    try (Workbook workbook = new XSSFWorkbook()) {
        Sheet sheet = workbook.createSheet(sheetName);
        // Collect the column titles once so header and data rows share one ordering.
        Set<String> columns = new LinkedHashSet<String>();
        if (data != null) {
            for (Map<String, String> rowMap : data) {
                if (rowMap != null) {
                    columns.addAll(rowMap.keySet());
                }
            }
        }
        int rowNum = 0;
        Row headerRow = sheet.createRow(rowNum++);
        int colNum = 0;
        for (String column : columns) {
            headerRow.createCell(colNum++).setCellValue(column);
        }
        if (data != null) {
            for (Map<String, String> rowMap : data) {
                Row row = sheet.createRow(rowNum++);
                if (rowMap == null) {
                    continue;
                }
                colNum = 0;
                for (String column : columns) {
                    Cell cell = row.createCell(colNum++);
                    String value = rowMap.get(column);
                    if (value == null) {
                        // Nothing to write; the cell stays blank.
                        continue;
                    }
                    // Texts longer than 30000 characters are cut to their first 25000
                    // (Excel limits a cell to 32767 characters).
                    cell.setCellValue(value.length() > 30000 ? value.substring(0, 25000) : value);
                }
            }
        }
        try (FileOutputStream outputStream = new FileOutputStream(excelFilePath)) {
            workbook.write(outputStream);
        }
    }
}
// public static void copyFile(String sourceFloder, String targetFileName) {
// File sourceFile = new File(sourceFloder);
// byte[] buffer = new byte[(int) sourceFile.length()];
// try (InputStream inputStream = new FileInputStream(sourceFile)) {
// inputStream.read(buffer);
// } catch (IOException e) {
// e.printStackTrace();
// return;
// }
// // 写入目标文件
// File targetFile = new File(targetFileName);
// targetFile.mkdirs();
// try (OutputStream outputStream = new FileOutputStream(targetFile)) {
// outputStream.write(buffer);
// } catch (IOException e) {
// e.printStackTrace();
// return;
// }
// }
/**
 * Copies a file to the same location on drive {@code F:}, replacing any existing target.
 *
 * <p>The first two characters of the source path are treated as its drive prefix (for example
 * {@code D:}) and are swapped for {@code F:}. I/O failures are printed, not rethrown.
 *
 * @param sourceFilePath path of the file to copy, expected to start with a two-character
 *                       drive prefix; {@code null} or shorter paths are reported and ignored
 */
public static void copyFile(String sourceFilePath) {
    String targetDrive = "F:";
    if (sourceFilePath == null) {
        System.out.println("Cannot copy file, source path is null");
        return;
    }
    try {
        Path sourcePath = Paths.get(sourceFilePath);
        String source = sourcePath.toString();
        if (source.length() < 2) {
            // No room for a drive prefix; substring(2) would throw.
            System.out.println("Cannot copy file, source path has no drive prefix: " + sourceFilePath);
            return;
        }
        Path targetPath = Paths.get(targetDrive + source.substring(2));
        // getParent() is null when the target has no parent component.
        Path targetDir = targetPath.getParent();
        if (targetDir != null) {
            // No-op when the directory already exists; fails with IOException otherwise.
            Files.createDirectories(targetDir);
        }
        Files.copy(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING);
        System.out.println("Copied file: " + sourceFilePath + " -> " + targetPath);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Converts a spreadsheet column label such as {@code "A"}, {@code "AA"} or {@code "xfd"} into a
 * zero-based column index ({@code "A"} is 0, {@code "Z"} is 25, {@code "AA"} is 26).
 *
 * <p>The label is case-insensitive and characters outside A-Z are ignored.
 *
 * @param input column label
 * @return the zero-based index; 0 when {@code input} is {@code null} or empty, and -1 when it
 *         is non-empty but contains no letter A-Z
 */
public static int getAlphabetPosition(String input) {
    if (input == null || input.isEmpty()) {
        return 0;
    }
    // Locale.ROOT keeps the mapping stable: under a Turkish default locale "i" would
    // upper-case to a dotted capital I, fall outside A-Z and be silently dropped.
    String label = input.toUpperCase(Locale.ROOT);
    int position = 0;
    for (int i = 0; i < label.length(); i++) {
        char c = label.charAt(i);
        if (c >= 'A' && c <= 'Z') {
            // Base-26 accumulation with digits 1..26.
            position = position * 26 + (c - 'A' + 1);
        }
    }
    return position - 1;
}
/**
 * Manual smoke run: prints the column index of "xfd", then parses a hard-coded local workbook
 * reading columns 0 and 3 from row 2 onwards and prints the result.
 *
 * @param args unused
 */
public static void main(String[] args) {
    int columnIndex = getAlphabetPosition("xfd");
    System.out.println(columnIndex);

    Map<Integer, String> columnTitles = new HashMap<>(32);
    columnTitles.put(0, "人名");
    columnTitles.put(3, "站点名");

    File workbookFile = new File("D:\\fromHanler\\xls\\b51484b213ed8fea61f5b99cbdc1490e.xlsx");
    Map<String, Object> parsed = parse(workbookFile, 2, columnTitles);
    System.out.println(parsed);
}
}

117
src/main/java/com/bfd/crawl/formhandler/util/FileDownloader.java

@ -0,0 +1,117 @@
package com.bfd.crawl.formhandler.util;
import com.alibaba.fastjson.JSON;
import okhttp3.*;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
/**
* @author:jinming
* @className:FileDownloader
* @version:1.0
* @description:
* @Date:2023/8/1 16:31
*/
public class FileDownloader {
/** Client shared by all methods of this class; created on first use. */
private static OkHttpClient okHttpClient;

/**
 * Returns the shared client, creating it with default settings on the first call.
 *
 * @return the shared {@link OkHttpClient}
 */
private static OkHttpClient getOkHttpClient() {
    if (okHttpClient != null) {
        return okHttpClient;
    }
    okHttpClient = new OkHttpClient();
    return okHttpClient;
}
/**
 * Downloads the resource at {@code url} with an HTTP GET and stores it in {@code destination}.
 *
 * @param url         address of the resource to download
 * @param destination file to write; missing parent directories are created
 * @throws IOException if the response is not successful, has no body, the parent directory
 *                     cannot be created, or reading/writing fails
 */
public static void downloadFile(String url, File destination) throws IOException {
    Request request = new Request.Builder()
            .url(url)
            .build();
    try (Response response = getOkHttpClient().newCall(request).execute()) {
        if (!response.isSuccessful()) {
            throw new IOException("Failed to download file: " + response);
        }
        ResponseBody body = response.body();
        if (body == null) {
            throw new IOException("Response body is null");
        }
        // getParentFile() is null for a bare file name, which needs no directory.
        File parentDir = destination.getParentFile();
        if (parentDir != null && !parentDir.exists()
                && !parentDir.mkdirs() && !parentDir.isDirectory()) {
            // The isDirectory() re-check tolerates another thread creating it meanwhile.
            throw new IOException("Failed to create directory: " + parentDir);
        }
        try (InputStream inputStream = body.byteStream();
             FileOutputStream outputStream = new FileOutputStream(destination)) {
            byte[] buffer = new byte[8192];
            int bytesRead;
            while ((bytesRead = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
            outputStream.flush();
        }
    }
}
/**
 * Uploads a file as the {@code file} part of a multipart/form-data POST.
 *
 * <p>The response is expected to be JSON of the form {@code {"data": {"domain": ..., "src": ...}}};
 * the two values are concatenated into the file URL. If the response cannot be read that way,
 * the raw response text is returned in its place.
 *
 * @param url      upload endpoint
 * @param filePath path of the local file to upload
 * @return map with the single key {@code "fileUrl"}
 * @throws Exception if the request fails or the response status is not successful
 */
public static Map<String, String> uploadFile(String url, String filePath) throws Exception {
    File uploadTarget = new File(filePath);
    Map<String, String> result = new HashMap<>(32);
    OkHttpClient httpClient = getOkHttpClient();

    // Wrap the file as an opaque binary part named "file".
    RequestBody filePart = RequestBody.create(MediaType.parse("application/octet-stream"), uploadTarget);
    MultipartBody form = new MultipartBody.Builder()
            .setType(MultipartBody.FORM)
            .addFormDataPart("file", uploadTarget.getName(), filePart)
            .build();
    Request uploadRequest = new Request.Builder()
            .url(url)
            .post(form)
            .build();

    try (Response response = httpClient.newCall(uploadRequest).execute()) {
        if (!response.isSuccessful()) {
            throw new IOException("Failed to upload file: " + response);
        }
        String responseText = response.body().string();
        String fileUrl;
        try {
            Map payload = (Map) JSON.parse(responseText);
            Map data = (Map) payload.get("data");
            String domain = (String) data.get("domain");
            String src = (String) data.get("src");
            fileUrl = domain.concat(src);
        } catch (Exception e) {
            // Unexpected response shape: hand the raw text back to the caller.
            fileUrl = responseText;
        }
        result.put("fileUrl", fileUrl);
        System.out.println("File uploaded successfully!");
    }
    return result;
}
/**
 * Asks the file service to delete a file by issuing a GET to {@code url + "delete?md5=" + md5}.
 *
 * @param url base address of the file service; {@code "delete?md5="} is appended as-is, so it
 *            must already end with the path separator
 * @param md5 identifier of the file to delete, appended without URL encoding
 * @throws Exception if the request fails or the response status is not successful
 */
public static void deleteFile(String url, String md5) throws Exception {
    OkHttpClient client = getOkHttpClient();
    String deleteUrl = url.concat("delete?md5=").concat(md5);
    Request request = new Request.Builder()
            .url(deleteUrl)
            .get()
            .build();
    try (Response response = client.newCall(request).execute()) {
        if (!response.isSuccessful()) {
            // Message previously read "Failed to upload file", copied from uploadFile.
            throw new IOException("Failed to delete file: " + response);
        }
    }
}
}

42
src/main/java/com/bfd/crawl/formhandler/util/FileUtil.java

@ -0,0 +1,42 @@
package com.bfd.crawl.formhandler.util;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* @author:jinming
* @className:FileUtil
* @version:1.0
* @description:
* @Date:2023/8/2 10:57
*/
public class FileUtil {

    /**
     * Manual check: prints the files found directly inside a hard-coded local folder.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> paths = traverseAndReturnFilePath("D:\\\\ocr\\\\305ce27d7a05770456fdc09d0b3044f7\\\\");
        System.out.println(paths);
    }

    /**
     * Lists the absolute paths of the regular files located directly inside a folder.
     * Sub-folders are not descended into and are not part of the result.
     *
     * @param folderPath folder to inspect
     * @return absolute paths of the files in the folder; an empty list when the path does not
     *         exist, is not a folder, or its content cannot be listed
     */
    public static List<String> traverseAndReturnFilePath(String folderPath) {
        List<String> filePaths = new ArrayList<>();
        File directory = new File(folderPath);
        if (!directory.exists() || !directory.isDirectory()) {
            System.out.println("指定的路径不是一个文件夹或文件夹不存在。");
            return filePaths;
        }
        // listFiles() returns null when the folder cannot be read.
        File[] entries = directory.listFiles();
        if (entries == null) {
            return filePaths;
        }
        for (File entry : entries) {
            if (entry.isFile()) {
                filePaths.add(entry.getAbsolutePath());
            }
        }
        return filePaths;
    }
}

59
src/main/java/com/bfd/crawl/formhandler/util/OcrUtil.java

@ -0,0 +1,59 @@
package com.bfd.crawl.formhandler.util;
import com.alibaba.fastjson.JSON;
import okhttp3.*;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* @author:jinming
* @className:ocrUtil
* @version:1.0
* @description:
* @Date:2023/8/1 16:38
*/
public class OcrUtil {
/** Client shared by all OCR calls; created on first use. */
private static OkHttpClient okHttpClient;

/**
 * Returns the shared client, creating it with default settings on the first call.
 *
 * @return the shared {@link OkHttpClient}
 */
private static OkHttpClient getOkHttpClient() {
    if (okHttpClient != null) {
        return okHttpClient;
    }
    okHttpClient = new OkHttpClient();
    return okHttpClient;
}
/**
 * Sends a file URL to the OCR service and returns the recognised text.
 *
 * <p>The request is a JSON POST of the form {@code {"id":"","url":"<url>"}}. The call is made up
 * to three times and stops as soon as a response with {@code code == 200} carries non-blank text.
 * Failures of a single attempt are printed and the next attempt is made.
 *
 * @param url    address of the file to run OCR on
 * @param ocrApi address of the OCR service
 * @return the value of the response's {@code text} field from the last response with code 200
 *         (this may be {@code null} if that response had no such field), or an empty string
 *         when no attempt returned code 200
 */
public static String doOcr(String url, String ocrApi) {
    final int okCode = 200;
    final int reTryTimes = 3;
    // Derive a client with 600-second timeouts once instead of on every attempt.
    OkHttpClient client = getOkHttpClient().newBuilder()
            .writeTimeout(600, TimeUnit.SECONDS)
            .connectTimeout(600, TimeUnit.SECONDS)
            .readTimeout(600, TimeUnit.SECONDS)
            .build();
    MediaType mediaType = MediaType.parse("application/json");
    // Serialise the URL through fastjson so quotes and backslashes in it are escaped
    // instead of breaking the JSON document.
    String payload = "{\"id\":\"\",\"url\":" + JSON.toJSONString(String.valueOf(url)) + "}";
    RequestBody body = RequestBody.create(mediaType, payload);
    Request request = new Request.Builder()
            .url(ocrApi)
            .method("POST", body)
            .addHeader("Content-Type", "application/json")
            .build();
    String text = "";
    for (int attempt = 0; attempt < reTryTimes; attempt++) {
        // try-with-resources releases the connection on every path.
        try (Response response = client.newCall(request).execute()) {
            String html = response.body().string();
            Map dataMap = (Map) JSON.parse(html);
            int code = (int) dataMap.get("code");
            if (code == okCode) {
                text = (String) dataMap.get("text");
            }
            if (StringUtil.hasValue(text)) {
                break;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return text;
}
}

23
src/main/java/com/bfd/crawl/formhandler/util/OsUtil.java

@ -0,0 +1,23 @@
package com.bfd.crawl.formhandler.util;
import lombok.extern.slf4j.Slf4j;
import java.util.Locale;
/**
* @author:jinming
* @className:OsUtil
* @version:1.0
* @description:
* @Date:2023/4/23 9:40
*/
@Slf4j
public class OsUtil {

    /**
     * Tells whether the JVM runs on Windows, based on the {@code os.name} system property.
     * The lower-cased property value is logged at info level on every call.
     *
     * @return {@code true} when the lower-cased {@code os.name} contains {@code "windows"}
     */
    public static Boolean isWindows() {
        final String osName = System.getProperty("os.name").toLowerCase(Locale.ROOT);
        log.info("osName = " + osName);
        boolean windows = osName.contains("windows");
        return windows;
    }
}

19
src/main/java/com/bfd/crawl/formhandler/util/QueueUtil.java

@ -0,0 +1,19 @@
package com.bfd.crawl.formhandler.util;
import java.util.concurrent.LinkedBlockingDeque;
/**
* @author:jinming
* @className:QueueUtil
* @version:1.0
* @description:
* @Date:2023/7/13 15:00
*/
public class QueueUtil {

    /**
     * Unbounded in-memory queue of task messages.
     * NOTE(review): presumably filled by the controller and drained by the handler — confirm against callers.
     */
    public static LinkedBlockingDeque<String> taskQueue = new LinkedBlockingDeque<>();

    /**
     * Unbounded in-memory queue of messages waiting to be sent.
     * NOTE(review): presumably drained by the send service — confirm against callers.
     */
    public static LinkedBlockingDeque<String> sendQueue = new LinkedBlockingDeque<>();
}

94
src/main/java/com/bfd/crawl/formhandler/util/StringUtil.java

@ -0,0 +1,94 @@
package com.bfd.crawl.formhandler.util;
import lombok.extern.slf4j.Slf4j;
import java.security.MessageDigest;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author jinming
* @version 1.0
* @className StringUtile
* @Date 2022/1/21 11:46
*/
@Slf4j
public class StringUtil {
/**
 * Tells whether a string holds anything other than whitespace.
 *
 * @param str string to test, may be {@code null}
 * @return {@code false} for {@code null} and for strings that are empty after trimming
 */
public static boolean hasValue(String str) {
    if (str == null) {
        return false;
    }
    return !str.trim().isEmpty();
}
/**
 * Returns one capture group of the first match of {@code regex} in {@code str}.
 *
 * @param regex regular expression to compile; only compiled when {@code str} is not blank
 * @param str   text to search, may be {@code null}
 * @param id    index of the capture group to return
 * @return the group's text from the first match; an empty string when {@code str} is blank or
 *         nothing matches
 */
public static String getRegexGroup(String regex, String str, int id) {
    if (!hasValue(str)) {
        return "";
    }
    Matcher matcher = Pattern.compile(regex).matcher(str);
    return matcher.find() ? matcher.group(id) : "";
}
/** Pattern used by {@link #getEmailAddress(String)}; compiled once instead of on every call. */
private static final Pattern EMAIL_PATTERN = Pattern.compile("\\w+\\.?\\w+\\@\\w+\\.\\w+");

/**
 * Collects the distinct e-mail addresses found in a text.
 *
 * <p>The pattern is deliberately simple: the local part must hold at least two word characters
 * with at most one dot, and only the first two labels of the domain are captured.
 *
 * @param message text to scan, may be {@code null}
 * @return the distinct matches; an empty set when {@code message} is {@code null} or holds none
 */
public static Set<String> getEmailAddress(String message) {
    Set<String> emailList = new HashSet<>();
    if (message == null) {
        return emailList;
    }
    Matcher m = EMAIL_PATTERN.matcher(message);
    while (m.find()) {
        emailList.add(m.group(0));
    }
    return emailList;
}
/**
 * Computes the MD5 digest of a string's UTF-8 bytes as 32 lower-case hexadecimal characters.
 *
 * @param string text to hash
 * @return the hex digest; if hashing fails for any reason (including a {@code null} input) the
 *         fallback {@code "nceaform"} followed by the current time in milliseconds is returned
 */
public static String getMd5(String string) {
    try {
        byte[] digest = MessageDigest.getInstance("MD5").digest(string.getBytes("UTF-8"));
        StringBuilder hex = new StringBuilder(40);
        for (byte b : digest) {
            int unsigned = b & 0xff;
            // Emit both nibbles so every byte yields exactly two characters.
            hex.append(Character.forDigit(unsigned >>> 4, 16));
            hex.append(Character.forDigit(unsigned & 0x0f, 16));
        }
        return hex.toString();
    } catch (Exception e) {
        return "nceaform" + System.currentTimeMillis();
    }
}
/**
 * Strips everything that looks like a tag ({@code <...>} without nested angle brackets).
 *
 * @param str text to clean, may be {@code null}
 * @return the text without tags; an empty string when {@code str} is {@code null} or blank
 */
public static String removeAllHtmlTags(String str) {
    if (!hasValue(str)) {
        return "";
    }
    return str.replaceAll("<[^<>]+?>", "");
}
/**
 * Returns one capture group of the first match of a compiled pattern in {@code str}.
 * An error is logged whenever the result is the empty string.
 *
 * @param regex compiled pattern
 * @param str   text to search, may be {@code null}
 * @param id    index of the capture group to return
 * @return the group's text from the first match; an empty string when {@code str} is blank or
 *         nothing matches
 */
public static String getRegexGroup(Pattern regex, String str, int id) {
    Matcher matcher = hasValue(str) ? regex.matcher(str) : null;
    String captured = (matcher != null && matcher.find()) ? matcher.group(id) : "";
    if (captured != null && captured.isEmpty()) {
        log.error(regex + " parser error!");
    }
    return captured;
}
/**
 * Returns the first substring of {@code str} matched by {@code regex}.
 *
 * @param str   text to search
 * @param regex regular expression to compile
 * @return the whole first match, or an empty string when nothing matches
 */
public static String getStrByPattern(String str, String regex) {
    Matcher matcher = Pattern.compile(regex).matcher(str);
    if (matcher.find()) {
        return matcher.group();
    }
    return "";
}
}

60
src/main/resources/application.yml

@ -0,0 +1,60 @@
# Spring Boot settings for the table-handling service.
server:
port: 7081
spring:
application:
name: 表格处理
boot:
admin:
client:
health:
timeout: 10s
url: http://172.16.12.55:8001
instance:
# NOTE(review): this URL uses port 7080 while server.port above is 7081 — confirm which one is intended.
service-base-url: http://172.16.12.56:7080
kafka:
bootstrap-servers: 172.16.12.55:9092,172.16.12.56:9092,172.16.12.57:9092
producer:
retries: 3
acks: all
batch-size: 4096
buffer-memory: 102476800
key-serializer: org.apache.kafka.common.serialization.StringSerializer
value-serializer: org.apache.kafka.common.serialization.StringSerializer
redis:
host: 172.24.12.126
port: 6379
timeout: 10000
database: 5
jedis:
pool:
max-active: 8 # maximum number of connections in the pool (a negative value means no limit)
max-wait: 800 # maximum time to block waiting for a connection (a negative value means no limit)
max-idle: 8 # maximum number of idle connections in the pool
min-idle: 2 # minimum number of idle connections in the pool
zookeeper:
connection-string: 172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181
publish-node: /analyze
logging:
file:
path: ./logs
management:
endpoints:
web:
exposure:
# NOTE(review): "*" exposes every actuator endpoint over HTTP — confirm this is intended for the deployment network.
include: "*"
endpoint:
health:
show-details: always
send:
topic: analyze
file:
path: /opt/analyze/apps/formHandler/file/
thread:
handler: 1
send: 1

36
src/main/resources/logback-spring.xml

@ -0,0 +1,36 @@
<configuration>
<!-- Properties: resolved from the matching entries of the Spring configuration -->
<springProperty scope="context" name="logging.file.path" source="logging.file.path"/>
<!-- NOTE(review): the application.yml of this commit sets logging.file.path but no logging.level; confirm the property is supplied elsewhere, otherwise the ${logging.level} placeholder below stays unresolved -->
<springProperty scope="context" name="logging.level" source="logging.level"/>
<!-- Default console output; of limited use in production, where the service is usually started in the background -->
<appender name="STDOUT"
class="ch.qos.logback.core.ConsoleAppender">
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n</pattern>
</encoder>
</appender>
<!-- File output: formHandler.log, rolled over by date with a MaxHistory of 3 -->
<appender name="GLMAPPER-LOGGERONE"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<append>true</append>
<filter class="ch.qos.logback.classic.filter.ThresholdFilter">
<level>${logging.level}</level>
</filter>
<file>
${logging.file.path}/formHandler.log
</file>
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<FileNamePattern>${logging.file.path}/formHandler.log.%d{yyyy-MM-dd}</FileNamePattern>
<MaxHistory>3</MaxHistory>
</rollingPolicy>
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- Root logger at info level, writing to both the file and the console appender -->
<root level="info">
<appender-ref ref="GLMAPPER-LOGGERONE"/>
<appender-ref ref="STDOUT"/>
</root>
</configuration>
Loading…
Cancel
Save