commit cb8c11dfdd3019a4d44ba14ea5e05c62470a24c9 Author: 55007 <55007@maojian> Date: Tue Jan 7 18:17:57 2025 +0800 ppt解析应用 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..549e00a --- /dev/null +++ b/.gitignore @@ -0,0 +1,33 @@ +HELP.md +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/**/target/ +!**/src/test/**/target/ + +### STS ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### IntelliJ IDEA ### +.idea +*.iws +*.iml +*.ipr + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ +!**/src/main/**/build/ +!**/src/test/**/build/ + +### VS Code ### +.vscode/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..5779331 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +ppt 解析应用 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..84546d9 --- /dev/null +++ b/pom.xml @@ -0,0 +1,138 @@ + + + 4.0.0 + + org.springframework.boot + spring-boot-starter-parent + 2.2.4.RELEASE + + + com.bfd.crawl + pptHandler + 0.0.1-SNAPSHOT + pptHandler + pptHandler + + 8 + + + + + + + de.codecentric + spring-boot-admin-client + 2.2.4 + + + org.apache.commons + commons-csv + 1.10.0 + + + org.springframework.kafka + spring-kafka + + + org.springframework.boot + spring-boot-starter + + + org.springframework.boot + spring-boot-starter-web + + + org.springframework.boot + spring-boot-devtools + runtime + true + + + org.projectlombok + lombok + true + + + org.springframework.boot + spring-boot-starter-test + test + + + org.apache.pdfbox + pdfbox + 2.0.28 + + + + org.apache.poi + poi-scratchpad + 5.2.0 + + + org.apache.poi + poi + 5.2.0 + + + org.apache.logging.log4j + log4j-api + + + + + org.apache.logging.log4j + log4j-api + 2.17.1 + + + org.apache.poi + poi-ooxml + 5.2.0 + + + + com.alibaba + fastjson + 2.0.17 + + + + com.squareup.okhttp3 + okhttp + 3.9.1 + + + com.google.code.gson + gson + 2.8.8 + + + org.apache.kafka + kafka-clients + 2.3.1 + + + org.springframework.kafka + spring-kafka-test + test + + + + + + org.springframework.boot + spring-boot-maven-plugin + + + + org.projectlombok + lombok + + + + + + + + diff --git a/src/main/java/com/bfd/crawl/ppthandler/PptHanlerApplication.java b/src/main/java/com/bfd/crawl/ppthandler/PptHanlerApplication.java new file mode 100644 index 0000000..7780d61 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/PptHanlerApplication.java @@ -0,0 +1,13 @@ +package com.bfd.crawl.ppthandler; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; + +@SpringBootApplication +public class PptHanlerApplication { + + public static void main(String[] args) { + SpringApplication.run(PptHanlerApplication.class, args); + } + +} diff --git a/src/main/java/com/bfd/crawl/ppthandler/bean/ResponsePo.java b/src/main/java/com/bfd/crawl/ppthandler/bean/ResponsePo.java new file mode 100644 index 0000000..b404c98 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/bean/ResponsePo.java @@ -0,0 +1,60 @@ +package com.bfd.crawl.ppthandler.bean; + + + +import com.bfd.crawl.ppthandler.enums.ResponseCode; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** + * @author:jinming + * @className:ResponsePo + * @version:1.0 + * @description: + * @Date:2023/4/3 17:23 + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +public class ResponsePo { + /** + * 响应码 + */ + private int code; + + /** + * 正常放 返回数据 的JSON串 + */ + private Object data; + + /** + * 提示消息 + */ + private String message; + + public static ResponsePo success() { + return setStatus(ResponseCode.SUCCESS.getCode(), ResponseCode.SUCCESS.getMessage()); + } + + public static ResponsePo error() { + return setStatus(ResponseCode.FAILURE.getCode(), ResponseCode.FAILURE.getMessage()); + } + + public static ResponsePo setStatus(int code, String message) { + ResponsePo resultBean = new ResponsePo(); + resultBean.code = code; + resultBean.message = message; + return resultBean; + } + public ResponsePo(int code, String message) { + this.code = code; + this.message = message; + this.data = data; + } + public ResponsePo(ResponseCode responseCode){ + this.code = responseCode.getCode(); + this.message = responseCode.getMessage(); + this.data = data; + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/config/AsyncThreadConfiguration.java b/src/main/java/com/bfd/crawl/ppthandler/config/AsyncThreadConfiguration.java new file mode 100644 index 0000000..6c2b134 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/config/AsyncThreadConfiguration.java @@ -0,0 +1,48 @@ +package com.bfd.crawl.ppthandler.config; + + +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.scheduling.annotation.EnableAsync; +import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; + +import java.util.concurrent.Executor; + +/** + * @author jinming + * @version 1.0 + * @className AsyncThreadConfiguration + * @Date 2022/2/17 18:37 + */ +@Configuration +@EnableAsync +public class AsyncThreadConfiguration { + @Bean + public Executor asyncExecutor() { + ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); + // 核心线程数 + executor.setCorePoolSize(500); + // 并发线程的数量限制为2 + executor.setMaxPoolSize(500); + // 线程队列 + executor.setQueueCapacity(500); + executor.setThreadNamePrefix("handlerData-"); + executor.initialize(); + executor.setWaitForTasksToCompleteOnShutdown(true); + return executor; + } + @Bean + public Executor sendExecutor() { + ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); + // 核心线程数 + executor.setCorePoolSize(500); + // 并发线程的数量限制为2 + executor.setMaxPoolSize(500); + // 线程队列 + executor.setQueueCapacity(500); + executor.setThreadNamePrefix("sendData-"); + executor.initialize(); + executor.setWaitForTasksToCompleteOnShutdown(true); + return executor; + } +} diff --git a/src/main/java/com/bfd/crawl/ppthandler/config/Constant.java b/src/main/java/com/bfd/crawl/ppthandler/config/Constant.java new file mode 100644 index 0000000..8d6acfc --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/config/Constant.java @@ -0,0 +1,27 @@ +package com.bfd.crawl.ppthandler.config; + +/** + * @author:jinming + * @className:Constant + * @version:1.0 + * @description: + * @Date:2023/8/16 15:26 + */ +public class Constant { + /** + * + */ + public final static String IS_XLS = "xls"; + + /** + * + */ + public final static String IS_CSV = "csv"; + + /** + * + */ + public final static String ALL = "*"; + + +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/controller/PptHandlerController.java b/src/main/java/com/bfd/crawl/ppthandler/controller/PptHandlerController.java new file mode 100644 index 0000000..4efac6b --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/controller/PptHandlerController.java @@ -0,0 +1,46 @@ +package com.bfd.crawl.ppthandler.controller; + + +import com.alibaba.fastjson.JSON; + +import com.bfd.crawl.ppthandler.bean.ResponsePo; +import com.bfd.crawl.ppthandler.enums.ResponseCode; +import com.bfd.crawl.ppthandler.util.QueueUtil; +import lombok.extern.slf4j.Slf4j; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestBody; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +import java.util.Map; + +/** + * @author:jinming + * @className:DataFilterController + * @version:1.0 + * @description: + * @Date:2023/7/26 11:21 + */ +@RestController +@RequestMapping("/handlerdata") +@Slf4j +public class PptHandlerController { + @PostMapping("/ppthandler") + public ResponsePo documentFeedback(@RequestBody String dataJson) { + + ResponsePo responsePo = ResponsePo.success(); + try { + Map parse = (Map) JSON.parse(dataJson); + } catch (Exception e) { + e.printStackTrace(); + log.error("请求格式发生异常" + e.getMessage()); + responsePo.setCode(ResponseCode.FAILURE.getCode()); + responsePo.setMessage(ResponseCode.FAILURE.getMessage()); + return responsePo; + } + log.info("新增任务:" + dataJson); + QueueUtil.taskQueue.add(dataJson); + + return responsePo; + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/enums/ResponseCode.java b/src/main/java/com/bfd/crawl/ppthandler/enums/ResponseCode.java new file mode 100644 index 0000000..e8c5a44 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/enums/ResponseCode.java @@ -0,0 +1,32 @@ +package com.bfd.crawl.ppthandler.enums; + +/** + * @author:jinming + * @className:ResponseCodeEnum + * @version:1.0 + * @description:响应结果码枚举类 + * @Date:2023/2/28 11:40 + */ +public enum ResponseCode { + //返回结果码枚举类 + SUCCESS(200, "操作成功"), + FAILURE(400, "参数错误"), + INTERNAL_SERVER_ERROR(500, "服务器内部错误"), + TYPE_NOT_SUPPORT(601,"文件类型不支持"); + + private int code; + private String message; + + ResponseCode(int code, String message) { + this.code = code; + this.message = message; + } + + public int getCode() { + return code; + } + + public String getMessage() { + return message; + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/service/HandlerService.java b/src/main/java/com/bfd/crawl/ppthandler/service/HandlerService.java new file mode 100644 index 0000000..1715702 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/service/HandlerService.java @@ -0,0 +1,121 @@ +package com.bfd.crawl.ppthandler.service; + + +import com.alibaba.fastjson.JSON; +import com.bfd.crawl.ppthandler.config.Constant; +import com.bfd.crawl.ppthandler.util.*; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * @author:jinming + * @className:HandlerService + * @version:1.0 + * @description: + * @Date:2023/8/1 16:05 + */ +@Service +@Slf4j +public class HandlerService { + @Value("${file.path}") + private String downloadFilePath; + + @Async("asyncExecutor") + void run() { + while (true) { + try { + if (QueueUtil.taskQueue.size() > 0) { + String dataJson = null; + try { + dataJson = QueueUtil.taskQueue.take(); + } catch (InterruptedException e) { + e.printStackTrace(); + continue; + } + String errorMessage = ""; + Map parse = (Map) JSON.parse(dataJson); + Map dataMap = (Map) parse.get("data"); + int id = (int) parse.get("id"); + Map admin = (Map) parse.get("input"); + + String key = (String) admin.get("fileUrl"); + String fileUrl = (String) DataUtil.getValue(key, dataMap); + boolean windows = OsUtil.isWindows(); + Map output = (Map) parse.get("output"); +// List sheetName = (List) admin.get("sheetName"); + //fieldType:自定义输出字段: 0 关闭,1-开启,如果开启则拼接form到output里(如果关闭,则取默认的output拼接) + String fileType = StringUtil.getStrByPattern(fileUrl, "\\.[^.\\\\/:*?\"<>|\\r\\n]+$"); + log.info("任务:" + id + "的文件类型为" + fileType); + String fileName = StringUtil.getMd5(fileUrl); + //定义ppt的下载路径 + String pptDir = null; + try { + pptDir = downloadFilePath.concat(windows ? "\\ppt\\" : "/ppt/").concat(fileName).concat(fileType); + FileDownloader.downloadFile(fileUrl, new File(pptDir)); + } catch (IOException e) { + e.printStackTrace(); + errorMessage = "文件下载失败"; + } + String pptParse = null; + String pptx = "pptx"; + try { + if (fileType.contains(pptx)) { + pptParse = PptUtil.parsepptx(pptDir); + } else { + pptParse = PptUtil.parsePpt(pptDir); + } + + } catch (Exception e) { + e.printStackTrace(); + errorMessage = "文件解析失败"; + } + + if (!StringUtil.hasValue(errorMessage)) { + Map result = new HashMap(32); + Map resultsMap = new HashMap(32); + resultsMap.put("result", pptParse); + resultsMap.put("isLast", 1); + result.put("status", 1); + result.put("message", "成功"); + result.put("results", JSON.toJSONString(resultsMap)); + parse.put("result", result); + String message = JSON.toJSONString(parse); + QueueUtil.sendQueue.put(message); + continue; + } + Map result = new HashMap(32); + Map resultsMap = new HashMap(32); + resultsMap.put("content", errorMessage); + resultsMap.put("isLast", 1); + result.put("results", JSON.toJSONString(resultsMap)); + result.put("status", 2); + result.put("message", errorMessage); + parse.put("result", result); + String message = JSON.toJSONString(parse); + QueueUtil.sendQueue.put(message); + } else { + log.info("任务队列为空,休眠3秒"); + try { + Thread.sleep(3000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } catch (Throwable e) { + e.printStackTrace(); + log.error("工作线程发生异常" + e.getMessage()); + } + } + } + + +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/service/SendService.java b/src/main/java/com/bfd/crawl/ppthandler/service/SendService.java new file mode 100644 index 0000000..75d8d1b --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/service/SendService.java @@ -0,0 +1,54 @@ +package com.bfd.crawl.ppthandler.service; + + +import com.alibaba.fastjson.JSON; +import com.bfd.crawl.ppthandler.util.QueueUtil; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.kafka.core.KafkaTemplate; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import javax.annotation.Resource; +import java.util.Map; + +/** + * @author:jinming + * @className:SendService + * @version:1.0 + * @description: + * @Date:2023/7/31 17:53 + */ +@Slf4j +@Service +public class SendService { + @Value("${send.topic}") + private String topic; + + @Resource + private KafkaTemplate kafkaTemplate; + + @Async("sendExecutor") + void sendToKafka() { + while (true) { + if (QueueUtil.sendQueue.size() > 0) { + try { + String message = QueueUtil.sendQueue.take(); + Map parse = (Map) JSON.parse(message); + String id = parse.get("id").toString(); + log.info("ID:" + id + "\t" + "数据已发出"); + kafkaTemplate.send(topic, message); + } catch (Exception e) { + e.printStackTrace(); + } + } else { + log.info("任务队列为空,休眠3秒"); + try { + Thread.sleep(3000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + } +} diff --git a/src/main/java/com/bfd/crawl/ppthandler/service/StartServcie.java b/src/main/java/com/bfd/crawl/ppthandler/service/StartServcie.java new file mode 100644 index 0000000..84dea03 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/service/StartServcie.java @@ -0,0 +1,63 @@ +package com.bfd.crawl.ppthandler.service; + + +import com.bfd.crawl.ppthandler.util.QueueUtil; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.stereotype.Service; + +/** + * @author:jinming + * @className:StartServcie + * @version:1.0 + * @description: + * @Date:2023/7/31 17:14 + */ +@Service +@Slf4j +public class StartServcie implements ApplicationRunner { + @Value("${thread.handler}") + private int handlerNumber; + @Value("${thread.send}") + private int sendNumber; + + @Autowired + private HandlerService handlerService; + @Autowired + private SendService sendService; + + @Override + public void run(ApplicationArguments args) throws Exception { + for (int i = 0; i < handlerNumber; i++) { + log.info("处理服务线程" + i + "已启动 "); + handlerService.run(); + } + for (int i = 0; i < sendNumber; i++) { + log.info("发送服务线程" + i + "已启动 "); + sendService.sendToKafka(); + } + Runnable myRunnable = new Runnable() { + @Override + public void run() { + while (true) { + log.info("任务队列长度为" + QueueUtil.taskQueue.size()); + log.info("发送队列长度为" + QueueUtil.sendQueue.size()); + try { + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + }; + // 创建一个新的线程,并将Runnable对象传递给Thread构造函数 + Thread myThread = new Thread(myRunnable); + // 启动线程 + myThread.start(); + + + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/CsvUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/CsvUtil.java new file mode 100644 index 0000000..a979f34 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/CsvUtil.java @@ -0,0 +1,44 @@ +package com.bfd.crawl.ppthandler.util; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; + +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * @author:jinming + * @className:CsvUtil + * @version:1.0 + * @description: + * @Date:2024/3/25 11:02 + */ +public class CsvUtil { + public static List> parseCSV(String filePath, String csvCharSet) { + List> dataList = new ArrayList<>(); + try (Reader reader = new InputStreamReader(new FileInputStream(filePath), csvCharSet); + CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT.withFirstRecordAsHeader())) { + Map headerMap = csvParser.getHeaderMap(); + for (CSVRecord csvRecord : csvParser) { + Map dataMap = new HashMap<>(); + for (Map.Entry entry : headerMap.entrySet()) { + String key = entry.getKey(); + int index = entry.getValue(); + dataMap.put(key, csvRecord.get(index)); + } + dataList.add(dataMap); + } + } catch (Exception e) { + e.printStackTrace(); + } + + return dataList; + } + +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/DataUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/DataUtil.java new file mode 100644 index 0000000..91395de --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/DataUtil.java @@ -0,0 +1,60 @@ +package com.bfd.crawl.ppthandler.util; + +import com.alibaba.fastjson.JSON; +import com.alibaba.fastjson.JSONObject; +import com.alibaba.fastjson.JSONPath; +import lombok.extern.slf4j.Slf4j; + +import java.util.Map; + +/** + * @author:jinming + * @className:DataUtil + * @version:1.0 + * @description: 获取dataValue的值 + * @Date:2023/11/1 9:54 + */ +@Slf4j +public class DataUtil { + /** + * @param key 传入的key + * @param dataMap 数据map + * @return 根据传入的参数进行判断解析,返回正确的dataValue + */ + public static Object getValue(String key, Map dataMap) { + try { + //公式为空直接就返回 + if (!StringUtil.hasValue(key)) { + return ""; + } + Object dataValue; + String isJson = "#json#"; + if (key.contains(isJson)) { + //进行第一次拆分,获取#json#前面的部分 + String[] keySplit = key.split(isJson); + String firstDataKey = keySplit[0]; + String[] firstDataKeySplit = firstDataKey.split(":"); + //取出前半部分对应的JSON数据并转换为JSONObject + String dataJson = (String) dataMap.get(firstDataKeySplit[0]); + JSONObject dataJsonObject = JSON.parseObject(dataJson); + //根据key的后半部分取出对应JSONObject中的值 + String firstDataKeyJson = (String) JSONPath.eval(dataJsonObject, firstDataKeySplit[1]); + String secDataKey = keySplit[1]; + JSONObject firstDataJsonObject = JSON.parseObject(firstDataKeyJson); + dataValue = JSONPath.eval(firstDataJsonObject, secDataKey); + return dataValue; + } + String[] keySplit = key.split(":"); + String jsonPath = keySplit[1]; + String dataJson = (String) dataMap.get(keySplit[0]); + JSONObject dataJsonObject = JSON.parseObject(dataJson); + dataValue = JSONPath.eval(dataJsonObject, jsonPath); + return dataValue; + } catch (Exception e) { + // TODO: handle exception + log.error("jsonpath公式取值异常,", e); + return null; + } + + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/ExcelUtils.java b/src/main/java/com/bfd/crawl/ppthandler/util/ExcelUtils.java new file mode 100644 index 0000000..ae8dc8e --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/ExcelUtils.java @@ -0,0 +1,182 @@ +package com.bfd.crawl.ppthandler.util; + +import org.apache.poi.ss.usermodel.*; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +/** + * @author jian.mao + * @date 2023年4月7日 + * @description excel解析工具类 + */ +public class ExcelUtils { + /** + * excel解析 + * + * @param excel + * @return + */ + public static Map parse(File excel) { + Map excelMap = new HashMap(16); + try { + FileInputStream file = new FileInputStream(excel); + // 使用工厂模式创建工作簿对象 + Workbook workbook = WorkbookFactory.create(file); + // 获取工作簿中工作表的数量 + int numberOfSheets = workbook.getNumberOfSheets(); + DataFormatter dataFormatter = new DataFormatter(); + // 遍历所有工作表 + for (int i = 0; i < numberOfSheets; i++) { + Sheet sheet = workbook.getSheetAt(i); + String key = sheet.getSheetName(); + //行码 + int rowNum = 0; + List> data = new ArrayList>(); + Map titleHead = new HashMap(16); + // 遍历所有行 + for (Row row : sheet) { + //单元格码 + int cellNum = 0; + //行内容存储 + Map rowMap = new HashMap(16); + // 遍历所有单元格 + if (rowNum == 0) { + for (Cell cell : row) { + String cellValue = dataFormatter.formatCellValue(cell); + titleHead.put(cellNum, cellValue); + cellNum++; + } + } else { + for (int j = 0; j < titleHead.size(); j++) { + String cellValue = dataFormatter.formatCellValue(row.getCell(j)); + rowMap.put(titleHead.get(cellNum), cellValue); + cellNum++; + } + } + + if (rowNum > 0) { + data.add(rowMap); + } + rowNum++; + } + excelMap.put(key, data); + } + // 关闭文件输入流和工作簿对象 + file.close(); + workbook.close(); + } catch (IOException e) { + e.printStackTrace(); + } + return excelMap; + } + + /** + * 将List>写入Excel文件中 + * + * @param data 要写入Excel的数据,每个Map代表一行数据,Map的key为列名,value为单元格数据 + * @param excelFilePath Excel文件路径,包含文件名和扩展名 + * @param sheetName 工作表名称 + * @throws IOException 如果写入Excel文件时发生IO异常,则抛出该异常 + */ + public static void write(List> data, String excelFilePath, String sheetName) throws IOException { + // 创建一个新的工作簿对象 + Workbook workbook = new XSSFWorkbook(); + // 创建一个新的工作表 + Sheet sheet = workbook.createSheet(sheetName); + // 行码 + int rowNum = 0; + // 写入列头 + Row headerRow = sheet.createRow(rowNum++); + int colNum = 0; + for (String key : data.get(0).keySet()) { + Cell cell = headerRow.createCell(colNum++); + cell.setCellValue(key); + } + // 写入数据 + for (Map rowMap : data) { + Row row = sheet.createRow(rowNum++); + colNum = 0; + for (String key : rowMap.keySet()) { + Cell cell = row.createCell(colNum++); + try { + String s = rowMap.get(key); + if (s.length() > 30000) { + + cell.setCellValue(s.substring(0, 25000)); + } else { + cell.setCellValue(s); + } + } catch (Exception e) { + System.out.println(key); + e.printStackTrace(); + } + } + } + // 将数据写入文件 + FileOutputStream outputStream = new FileOutputStream(excelFilePath); + workbook.write(outputStream); + workbook.close(); + outputStream.close(); + } + +// public static void copyFile(String sourceFloder, String targetFileName) { +// File sourceFile = new File(sourceFloder); +// byte[] buffer = new byte[(int) sourceFile.length()]; +// try (InputStream inputStream = new FileInputStream(sourceFile)) { +// inputStream.read(buffer); +// } catch (IOException e) { +// e.printStackTrace(); +// return; +// } +// // 写入目标文件 +// File targetFile = new File(targetFileName); +// targetFile.mkdirs(); +// try (OutputStream outputStream = new FileOutputStream(targetFile)) { +// outputStream.write(buffer); +// } catch (IOException e) { +// e.printStackTrace(); +// return; +// } +// } + + public static void copyFile(String sourceFilePath) { + // 源文件和目标文件的路径 + String targetDrive = "F:"; + + try { + // 获取源文件和目标文件的路径信息 + Path sourcePath = Paths.get(sourceFilePath); + Path targetPath = Paths.get(targetDrive + sourcePath.toString().substring(2)); + + // 如果目标文件的父目录不存在,则创建该目录 + if (!targetPath.getParent().toFile().exists()) { + targetPath.getParent().toFile().mkdirs(); + } + + // 进行文件复制 + Files.copy(sourcePath, targetPath, StandardCopyOption.REPLACE_EXISTING); + + System.out.println("Copied file: " + sourceFilePath + " -> " + targetPath); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void main(String[] args) { + System.out.println(parse(new File("D:\\fromHanler\\xls\\b51484b213ed8fea61f5b99cbdc1490e.xlsx"))); + } + +} diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/FileDownloader.java b/src/main/java/com/bfd/crawl/ppthandler/util/FileDownloader.java new file mode 100644 index 0000000..329b26b --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/FileDownloader.java @@ -0,0 +1,117 @@ +package com.bfd.crawl.ppthandler.util; + +import com.alibaba.fastjson.JSON; +import okhttp3.*; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +/** + * @author:jinming + * @className:FileDownloader + * @version:1.0 + * @description: + * @Date:2023/8/1 16:31 + */ +public class FileDownloader { + private static OkHttpClient okHttpClient; + + private static OkHttpClient getOkHttpClient() { + if (okHttpClient == null) { + okHttpClient = new OkHttpClient(); + } + return okHttpClient; + } + + public static void downloadFile(String url, File destination) throws IOException { + OkHttpClient client = getOkHttpClient(); + Request request = new Request.Builder() + .url(url) + .build(); + + try (Response response = client.newCall(request).execute()) { + if (!response.isSuccessful()) { + throw new IOException("Failed to download file: " + response); + } + + ResponseBody body = response.body(); + if (body == null) { + throw new IOException("Response body is null"); + } + if (!destination.getParentFile().exists()) { + + destination.getParentFile().mkdirs(); + } + try (InputStream inputStream = body.byteStream(); + FileOutputStream outputStream = new FileOutputStream(destination)) { + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = inputStream.read(buffer)) != -1) { + outputStream.write(buffer, 0, bytesRead); + } + outputStream.flush(); + } + } + } + + public static Map uploadFile(String url, String filePath) throws Exception { + File file = new File(filePath); + + Map returnMap = new HashMap(32); + OkHttpClient client = getOkHttpClient(); + // 设置文件上传的媒体类型 + MediaType mediaType = MediaType.parse("application/octet-stream"); + // 创建请求体,将文件添加到请求体中 + RequestBody requestBody = RequestBody.create(mediaType, file); + + // 创建多部分请求体,用于上传文件 + MultipartBody multipartBody = new MultipartBody.Builder() + .setType(MultipartBody.FORM) + .addFormDataPart("file", file.getName(), requestBody) + .build(); + // 创建上传文件的请求 + Request request = new Request.Builder() + .url(url) + .post(multipartBody) + .build(); + + try (Response response = client.newCall(request).execute()) { + if (!response.isSuccessful()) { + throw new IOException("Failed to upload file: " + response); + } + String html = response.body().string(); + + try { + Map parse = (Map) JSON.parse(html); + Map data = (Map) parse.get("data"); + String domain = (String) data.get("domain"); + String src = (String) data.get("src"); + String fileUrl = domain.concat(src); + returnMap.put("fileUrl", fileUrl); + } catch (Exception e) { + returnMap.put("fileUrl", html); + } + // 处理上传成功的响应 + System.out.println("File uploaded successfully!"); + } + return returnMap; + } + + public static void deleteFile(String url, String md5) throws Exception { + OkHttpClient client = getOkHttpClient(); + url = url.concat("delete?md5=").concat(md5); + Request request = new Request.Builder() + .url(url) + .get() + .build(); + try (Response response = client.newCall(request).execute()) { + if (!response.isSuccessful()) { + throw new IOException("Failed to upload file: " + response); + } + } + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/FileUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/FileUtil.java new file mode 100644 index 0000000..4786e1f --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/FileUtil.java @@ -0,0 +1,42 @@ +package com.bfd.crawl.ppthandler.util; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * @author:jinming + * @className:FileUtil + * @version:1.0 + * @description: + * @Date:2023/8/2 10:57 + */ +public class FileUtil { + + public static void main(String[] args) { + System.out.println(traverseAndReturnFilePath("D:\\\\ocr\\\\305ce27d7a05770456fdc09d0b3044f7\\\\")); + } + + public static List traverseAndReturnFilePath(String folderPath) { + List fileList = new ArrayList<>(); + File folder = new File(folderPath); + // 检查文件夹是否存在并且是一个文件夹 + if (folder.exists() && folder.isDirectory()) { + // 获取文件夹中的所有文件和子文件夹 + File[] files = folder.listFiles(); + if (files != null) { + for (File file : files) { + if (file.isFile()) { + // 如果是文件,则输出全路径 + fileList.add(file.getAbsolutePath()); + } else if (file.isDirectory()) { + + } + } + } + } else { + System.out.println("指定的路径不是一个文件夹或文件夹不存在。"); + } + return fileList; + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/OcrUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/OcrUtil.java new file mode 100644 index 0000000..4d76686 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/OcrUtil.java @@ -0,0 +1,59 @@ +package com.bfd.crawl.ppthandler.util; + +import com.alibaba.fastjson.JSON; +import okhttp3.*; + +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * @author:jinming + * @className:ocrUtil + * @version:1.0 + * @description: + * @Date:2023/8/1 16:38 + */ +public class OcrUtil { + private static OkHttpClient okHttpClient; + + private static OkHttpClient getOkHttpClient() { + if (okHttpClient == null) { + okHttpClient = new OkHttpClient(); + } + return okHttpClient; + } + + public static String doOcr(String url,String ocrApi) { + String text = ""; + int reTryTimes = 3; + for (int i = 0; i < reTryTimes; i++) { + int okCode = 200; + OkHttpClient client = getOkHttpClient(); + OkHttpClient.Builder builder = client.newBuilder().writeTimeout(600, TimeUnit.SECONDS).connectTimeout(600, TimeUnit.SECONDS).readTimeout(600, TimeUnit.SECONDS); + client = builder.build(); + MediaType mediaType = MediaType.parse("application/json"); + RequestBody body = RequestBody.create(mediaType, "{\"id\":\"\",\"url\":\"" + url + "\"}"); + Request request = new Request.Builder() + .url(ocrApi) + .method("POST", body) + .addHeader("Content-Type", "application/json") + .build(); + try { + Response response = client.newCall(request).execute(); + String html = response.body().string(); + Map dataMap = (Map) JSON.parse(html); + int code = (int) dataMap.get("code"); + if (code == okCode) { + text = (String) dataMap.get("text"); + } + if (StringUtil.hasValue(text)) { + break; + } + } catch (Exception e) { + e.printStackTrace(); + } + } + + return text; + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/OsUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/OsUtil.java new file mode 100644 index 0000000..85bdac7 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/OsUtil.java @@ -0,0 +1,23 @@ +package com.bfd.crawl.ppthandler.util; + +import lombok.extern.slf4j.Slf4j; + +import java.util.Locale; + +/** + * @author:jinming + * @className:OsUtil + * @version:1.0 + * @description: + * @Date:2023/4/23 9:40 + */ +@Slf4j +public class OsUtil { + public static Boolean isWindows() { + String windows = "windows"; + String osName = System.getProperty("os.name").toLowerCase(Locale.ROOT); + log.info("osName = " + osName); + return osName.contains(windows); + } + +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/PptUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/PptUtil.java new file mode 100644 index 0000000..1d1e78e --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/PptUtil.java @@ -0,0 +1,124 @@ +package com.bfd.crawl.ppthandler.util; + +import org.apache.poi.hslf.usermodel.HSLFShape; +import org.apache.poi.hslf.usermodel.HSLFSlide; +import org.apache.poi.hslf.usermodel.HSLFSlideShow; +import org.apache.poi.hslf.usermodel.HSLFTextShape; +import org.apache.poi.xslf.usermodel.*; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * @author:jinming + * @className:PptUtil + * @version:1.0 + * @description: + * @Date:2024/3/25 16:14 + */ +public class PptUtil { + + + public static String parsepptx(String filePath) { + StringBuilder dataStringsb = new StringBuilder(); + XMLSlideShow ppt = null; + try { + // PPT类 + ppt = new XMLSlideShow(new FileInputStream(filePath)); + // 获取PPT中的所有幻灯片 +// List slides = ppt.getSlides(); + // 遍历幻灯片 + for (int i = 0; i < ppt.getSlides().size(); i++) { + + //拿到第i页的PPT + XSLFSlide slides = ppt.getSlides().get(i); + System.out.println("第" + (i + 1) + "页"); + //注释的for循环是获取所以PPT的内容 +// for (XSLFSlide slide : slides) { + // 获取幻灯片中的所有图形 + List shapes = slides.getShapes(); + // 遍历PPT的图形 + for (XSLFShape shape : shapes) { + // 判断该图形类是否是文本框类 + if (shape instanceof XSLFTextShape) { + // 将图像类强制装换成文本框类 + XSLFTextShape ts = (XSLFTextShape) shape; + // 获取文本框内的文字 + String str = ts.getText(); + dataStringsb.append(str); + } + // 判断该图形类是否是表格类 + if (shape instanceof XSLFTable) { + // 将图像类强制装换成表格类 + XSLFTable table = (XSLFTable) shape; + // 获取表格中的所有行 + List rows = table.getRows(); + for (XSLFTableRow tr : rows) { + // 获取行中的所有单元格 + List cells = tr.getCells(); + for (XSLFTableCell tc : cells) { + // 获取单元格内的文字 + String str = tc.getText(); + dataStringsb.append(str); + } + } + } + // 判断该图形类是否是图片框类chr + if (shape instanceof XSLFPictureShape) { + // 将图像类强制装换成图片框类 + XSLFPictureShape ps = (XSLFPictureShape) shape; + // 获取图片的字节码数据(可以利用输出流将该图片保存到硬盘里) + byte[] pictureData = ps.getPictureData().getData(); +// System.out.println("图片信息:" + pictureData); + } + } + + } +// } + } catch (Exception e) { + e.printStackTrace(); + } finally { + if (ppt != null) { + + try { + // 保存完之后要对PPT进行关闭操作 + ppt.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + } + return dataStringsb.toString(); + } + + public static String parsePpt(String filePath) { + StringBuilder dataStringsb = new StringBuilder(); + HSLFSlideShow ppt = null; + try { + ppt = new HSLFSlideShow(new FileInputStream(filePath)); + for (HSLFSlide slide : ppt.getSlides()) { + for (HSLFShape shape : slide.getShapes()) { + if (shape instanceof HSLFTextShape) { + HSLFTextShape textShape = (HSLFTextShape) shape; + dataStringsb.append(textShape.getText()).append("\n"); + } + } + } + } catch (Exception e) { + e.printStackTrace(); + } finally { + if (ppt != null) { + try { + ppt.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + return dataStringsb.toString(); + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/QueueUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/QueueUtil.java new file mode 100644 index 0000000..a311622 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/QueueUtil.java @@ -0,0 +1,19 @@ +package com.bfd.crawl.ppthandler.util; + +import java.util.concurrent.LinkedBlockingDeque; + +/** + * @author:jinming + * @className:QueueUtil + * @version:1.0 + * @description: + * @Date:2023/7/13 15:00 + */ +public class QueueUtil { + + + public static LinkedBlockingDeque taskQueue = new LinkedBlockingDeque(); + + public static LinkedBlockingDeque sendQueue = new LinkedBlockingDeque(); + +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl/ppthandler/util/StringUtil.java b/src/main/java/com/bfd/crawl/ppthandler/util/StringUtil.java new file mode 100644 index 0000000..3d3f9a7 --- /dev/null +++ b/src/main/java/com/bfd/crawl/ppthandler/util/StringUtil.java @@ -0,0 +1,94 @@ +package com.bfd.crawl.ppthandler.util; + + +import lombok.extern.slf4j.Slf4j; + +import java.security.MessageDigest; +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author jinming + * @version 1.0 + * @className StringUtile + * @Date 2022/1/21 11:46 + */ +@Slf4j +public class StringUtil { + public static boolean hasValue(String str) { + return str != null && !"".equals(str.trim()); + } + + public static String getRegexGroup(String regex, String str, int id) { + String resultStr = ""; + if (hasValue(str)) { + Pattern p = Pattern.compile(regex); + Matcher m = p.matcher(str); + if (m.find()) { + resultStr = m.group(id); + } + } + + if ("".equals(resultStr)) { + } + + return resultStr; + } + + public static Set getEmailAddress(String message) { + Set emailList = new HashSet<>(); + Pattern pattern = Pattern.compile("\\w+\\.?\\w+\\@\\w+\\.\\w+"); + Matcher m = pattern.matcher(message); + while (m.find()) { + emailList.add(m.group(0)); + } + return emailList; + } + public static String getMd5(String string) { + try { + MessageDigest md5 = MessageDigest.getInstance("MD5"); + byte[] bs = md5.digest(string.getBytes("UTF-8")); + StringBuilder sb = new StringBuilder(40); + for (byte x : bs) { + if ((x & 0xff) >> 4 == 0) { + sb.append("0").append(Integer.toHexString(x & 0xff)); + } else { + sb.append(Integer.toHexString(x & 0xff)); + } + } + return sb.toString(); + } catch (Exception e) { + //LOG.error("获取md5异常", e); + return "nceaform" + System.currentTimeMillis(); + } + } + + public static String removeAllHtmlTags(String str) { + return hasValue(str) ? str.replaceAll("<[^<>]+?>", "") : ""; + } + + public static String getRegexGroup(Pattern regex, String str, int id) { + String resultStr = ""; + if (hasValue(str)) { + Matcher m = regex.matcher(str); + if (m.find()) { + resultStr = m.group(id); + } + } + + if ("".equals(resultStr)) { + log.error(regex + " parser error!"); + } + + return resultStr; + } + + public static String getStrByPattern(String str, String regex) { + Pattern pattern = Pattern.compile(regex); + Matcher m = pattern.matcher(str); + return m.find() ? m.group(0) : ""; + } + +} diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml new file mode 100644 index 0000000..593ae67 --- /dev/null +++ b/src/main/resources/application.yml @@ -0,0 +1,47 @@ +server: + port: 7082 +spring: + application: + name: 幻灯片处理 + boot: + admin: + client: + health: + timeout: 10s + url: http://172.16.12.55:8001 + instance: + service-base-url: http://172.16.12.55:7088 + kafka: + bootstrap-servers: 172.26.28.30:9092 + producer: + retries: 3 + acks: all + batch-size: 4096 + buffer-memory: 102476800 + key-serializer: org.apache.kafka.common.serialization.StringSerializer + value-serializer: org.apache.kafka.common.serialization.StringSerializer + + +logging: + file: + path: ./logs + +management: + endpoints: + web: + exposure: + include: "*" + endpoint: + health: + show-details: always + +send: + topic: analyze0912 + +file: + path: D:\\fromHanler\\ + +thread: + handler: 1 + send: 1 + diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml new file mode 100644 index 0000000..021a114 --- /dev/null +++ b/src/main/resources/logback-spring.xml @@ -0,0 +1,36 @@ + + + + + + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n + + + + + true + + ${logging.level} + + + ${logging.file.path}/formHandler.log + + + ${logging.file.path}/formHandler.log.%d{yyyy-MM-dd} + 3 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n + UTF-8 + + + + + + + +