From 4d48a8ed53a55659a683574ba103548daaf8c6d7 Mon Sep 17 00:00:00 2001
From: 55007 <55007@maojian>
Date: Tue, 7 Jan 2025 17:41:35 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=87=E6=A1=A3=E6=8A=93=E6=8D=A2=E5=BA=94?=
=?UTF-8?q?=E7=94=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.classpath | 40 +++
.gitignore | 3 +
.project | 23 ++
.settings/org.eclipse.core.resources.prefs | 5 +
.settings/org.eclipse.jdt.core.prefs | 8 +
.settings/org.eclipse.m2e.core.prefs | 4 +
pom.xml | 218 ++++++++++++++
.../docconversion/DocConversionApplication.java | 67 +++++
.../docconversion/controller/ApiController.java | 40 +++
.../service/ConversionToPdfService.java | 13 +
.../bfd/docconversion/service/ProcessService.java | 46 +++
.../service/impl/ConversionToPdfServiceImpl.java | 97 ++++++
.../com/bfd/docconversion/util/AsyncConfig.java | 37 +++
.../java/com/bfd/docconversion/util/Config.java | 32 ++
.../java/com/bfd/docconversion/util/Constants.java | 19 ++
.../bfd/docconversion/util/FileExtensionEnum.java | 39 +++
.../java/com/bfd/docconversion/util/KfkUtil.java | 83 ++++++
.../com/bfd/docconversion/util/MainHandler.java | 104 +++++++
.../java/com/bfd/docconversion/util/Utils.java | 325 +++++++++++++++++++++
src/main/resources/application.yml | 40 +++
src/main/resources/logback-spring.xml | 38 +++
.../DocConversionApplicationTests.java | 13 +
22 files changed, 1294 insertions(+)
create mode 100644 .classpath
create mode 100644 .gitignore
create mode 100644 .project
create mode 100644 .settings/org.eclipse.core.resources.prefs
create mode 100644 .settings/org.eclipse.jdt.core.prefs
create mode 100644 .settings/org.eclipse.m2e.core.prefs
create mode 100644 pom.xml
create mode 100644 src/main/java/com/bfd/docconversion/DocConversionApplication.java
create mode 100644 src/main/java/com/bfd/docconversion/controller/ApiController.java
create mode 100644 src/main/java/com/bfd/docconversion/service/ConversionToPdfService.java
create mode 100644 src/main/java/com/bfd/docconversion/service/ProcessService.java
create mode 100644 src/main/java/com/bfd/docconversion/service/impl/ConversionToPdfServiceImpl.java
create mode 100644 src/main/java/com/bfd/docconversion/util/AsyncConfig.java
create mode 100644 src/main/java/com/bfd/docconversion/util/Config.java
create mode 100644 src/main/java/com/bfd/docconversion/util/Constants.java
create mode 100644 src/main/java/com/bfd/docconversion/util/FileExtensionEnum.java
create mode 100644 src/main/java/com/bfd/docconversion/util/KfkUtil.java
create mode 100644 src/main/java/com/bfd/docconversion/util/MainHandler.java
create mode 100644 src/main/java/com/bfd/docconversion/util/Utils.java
create mode 100644 src/main/resources/application.yml
create mode 100644 src/main/resources/logback-spring.xml
create mode 100644 src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java
diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..f7e4a1d
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,40 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..49ab93c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/target/
+/logs/
+/jarlib/
\ No newline at end of file
diff --git a/.project b/.project
new file mode 100644
index 0000000..73b9bbd
--- /dev/null
+++ b/.project
@@ -0,0 +1,23 @@
+
+
+ doc_conversion
+
+
+
+
+
+ org.eclipse.jdt.core.javabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.jdt.core.javanature
+ org.eclipse.m2e.core.maven2Nature
+
+
diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs
new file mode 100644
index 0000000..839d647
--- /dev/null
+++ b/.settings/org.eclipse.core.resources.prefs
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+encoding//src/main/java=UTF-8
+encoding//src/main/resources=UTF-8
+encoding//src/test/java=UTF-8
+encoding/=UTF-8
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..2f5cc74
--- /dev/null
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,8 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
+org.eclipse.jdt.core.compiler.compliance=1.8
+org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
+org.eclipse.jdt.core.compiler.release=disabled
+org.eclipse.jdt.core.compiler.source=1.8
diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs
new file mode 100644
index 0000000..f897a7f
--- /dev/null
+++ b/.settings/org.eclipse.m2e.core.prefs
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..c004987
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,218 @@
+
+
+ 4.0.0
+ com.bfd
+ doc_conversion
+ 0.0.1-SNAPSHOT
+ docconversion
+ docconversion
+
+ 1.8
+ UTF-8
+ UTF-8
+ 2.2.4.RELEASE
+
+
+
+ org.springframework.boot
+ spring-boot-starter-web
+
+
+
+ org.projectlombok
+ lombok
+ true
+
+
+ org.springframework.boot
+ spring-boot-starter-test
+ test
+
+
+ org.springframework.boot
+ spring-boot-starter-test
+ test
+
+
+ com.alibaba.fastjson2
+ fastjson2
+ 2.0.12
+
+
+ cn.hutool
+ hutool-all
+ 5.8.27
+
+
+ org.apache.kafka
+ kafka-clients
+ 2.7.1
+
+
+ com.squareup.okhttp3
+ okhttp
+ 3.11.0
+
+
+ de.codecentric
+ spring-boot-admin-client
+ 2.2.4
+
+
+
+ aspose-cells-20.12-crack
+ aspose-cells-20.12-crack
+ 20.12
+ system
+ D:\eclipseWork\doc_conversion/./jarlib/aspose-cells-20.12-crack.jar
+
+
+ aspose-slides-20.12-crack
+ aspose-slides-20.12-crack
+ 20.12
+ system
+ D:\eclipseWork\doc_conversion/../jarlib/aspose-slides-20.12-crack.jar
+
+
+ aspose-words-20.12-crack
+ aspose-words-20.12-crack
+ 20.12
+ system
+ D:\eclipseWork\doc_conversion/../jarlib/aspose-words-20.12-crack.jar
+
+
+
+ org.javassist
+ javassist
+ 3.20.0-GA
+
+
+
+ aspose-pdf-23.1
+ aspose-pdf-23.1
+ 23.1
+ system
+ D:\eclipseWork\doc_conversion/../jarlib/aspose-pdf-23.1.jar
+
+
+ org.apache.curator
+ curator-framework
+ 5.2.0
+
+
+ org.apache.curator
+ curator-recipes
+ 5.2.0
+
+
+
+ com.bfd.util
+ pauseTool
+ 1.0
+ system
+ D:\eclipseWork\doc_conversion/../jarlib/pauseTool-1.0.jar
+
+
+
+
+
+
+
+
+ org.springframework.boot
+ spring-boot-dependencies
+ ${spring-boot.version}
+ pom
+ import
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+
+ *.properties
+ *.yml
+ *.yaml
+
+
+
+
+ com.bfd.docconversion.DocConversionApplication
+
+ true
+
+ lib/
+
+ false
+
+
+
+ lib/pauseTool-1.0.jar lib/aspose-pdf-23.1-23.1.jar lib/aspose-cells-20.12-crack-20.12.jar lib/aspose-slides-20.12-crack-20.12.jar
+ lib/aspose-words-20.12-crack-20.12.jar config/
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ copy
+ package
+
+ copy-dependencies
+
+
+ ${project.build.directory}/lib/
+
+
+
+
+
+
+ maven-resources-plugin
+
+
+ copy-resources
+ package
+
+ copy-resources
+
+
+
+
+
+ src/main/resources/
+
+ *.properties
+ *.yml
+ *.yaml
+
+
+
+ ${project.build.directory}/config
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+ 8
+ 8
+
+
+
+
+
+
diff --git a/src/main/java/com/bfd/docconversion/DocConversionApplication.java b/src/main/java/com/bfd/docconversion/DocConversionApplication.java
new file mode 100644
index 0000000..d127219
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/DocConversionApplication.java
@@ -0,0 +1,67 @@
+package com.bfd.docconversion;
+
+import cn.hutool.core.thread.ThreadFactoryBuilder;
+import com.bfd.docconversion.service.ProcessService;
+import com.bfd.docconversion.util.Config;
+import com.bfd.docconversion.util.KfkUtil;
+import com.bfd.util.PauseTool;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.context.ConfigurableApplicationContext;
+import org.springframework.data.redis.core.StringRedisTemplate;
+import org.springframework.scheduling.annotation.EnableScheduling;
+import org.springframework.scheduling.annotation.Scheduled;
+
+import javax.annotation.Resource;
+import java.util.concurrent.*;
+
+/**
+ * @author guowei
+ */
+@SpringBootApplication
+@EnableScheduling
+@Slf4j
+public class DocConversionApplication {
+ @Autowired
+ private StringRedisTemplate stringRedisTemplate;
+
+ @Value("${zookeeper.connection-string}")
+ private String connectionString;
+ @Value("${zookeeper.publish-node}")
+ private String nodePath;
+ @Value("${crawl.threadNum}")
+ private int threadNum;
+
+ @Resource
+ ProcessService processService;
+ public static void main(String[] args) {
+ ConfigurableApplicationContext applicationContext = SpringApplication.run(DocConversionApplication.class, args);
+ DocConversionApplication bean = applicationContext.getBean(DocConversionApplication.class);
+ System.setProperty("java.io.tmpdir","/opt/analyze/apps/doc_conversion/tmp");
+ bean.start();
+ }
+
+ public void start(){
+
+ ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNamePrefix("crawl-pool-%d").build();
+ ExecutorService singleThreadPool = new ThreadPoolExecutor(10, 20, 100L, TimeUnit.SECONDS, new LinkedBlockingQueue(1024), namedThreadFactory, new ThreadPoolExecutor.AbortPolicy());
+ for (int i=0;i input:" + JSON.toJSONString(input));
+ System.out.println("queryData ---> output:" + JSON.toJSONString(output));
+ System.out.println("queryData ---> data:" + JSON.toJSONString(data));
+ Map resultMap = new HashMap<>(32);
+ Map results = new HashMap<>(32);
+ try {
+ //需修改
+// String gofastUrl = input.getString("filePath");
+ String gofastUrl = (String) Utils.jsonParse(input.getString("filePath"), data);
+ log.info("开始下载文件, path:"+ gofastUrl);
+ InputStream source = Utils.gofastDownLoadFile(gofastUrl);
+ if (source == null) {
+ throw new NullPointerException();
+ }
+ URL url = new URL(gofastUrl);
+ String newPath = url.getPath();
+ Path path = Paths.get(newPath);
+ String extension = Utils.getExtension(path);
+ ByteArrayOutputStream target = new ByteArrayOutputStream();
+ String filePath = "";
+ if (extension.equals(Config.PDF)) {
+ log.info("文档转换开始: " + extension + " --> DOC");
+ Utils.asposePdfTo(extension, source,target);
+ filePath = "./files/"+IdUtil.simpleUUID()+".docx";
+ }else {
+ log.info("文档转换开始: " + extension + " --> PDF");
+ Utils.asposeToPdf(extension, source,target);
+ filePath = "./files/"+IdUtil.simpleUUID()+".pdf";
+ }
+// InputStream source = Files.newInputStream(path);
+ Files.write(Paths.get(filePath), target.toByteArray());
+ log.info("文档转换完成");
+ log.info("文件开始上传 path:{}",filePath);
+ String upLoadFile = Utils.upLoadFile(filePath);
+ System.out.println(upLoadFile);
+ log.info("文件结束上传");
+ JSONObject resultUpload = JSONObject.parseObject(upLoadFile);
+ resultMap.put("id", IdUtil.randomUUID());
+ resultMap.put("conversionUrl", Config.resultGofast + resultUpload.getString("path"));
+ results.put("status", 1);
+ results.put("message", "成功");
+ }catch (Exception e){
+ e.printStackTrace();
+ log.error("文档转换异常",e);
+ resultMap.put("conversionUrl", "失败");
+ results.put("status", 2);
+ results.put("message", "失败");
+ }
+ resultMap.put("isLast",1);
+ results.put("results", JSON.toJSONString(resultMap));
+
+ jsonObject.put("result", results);
+ KfkUtil.sendKafka(JSON.toJSONString(jsonObject));
+ log.info("处理完成,result:" + JSON.toJSONString(results));
+
+ }
+}
diff --git a/src/main/java/com/bfd/docconversion/util/AsyncConfig.java b/src/main/java/com/bfd/docconversion/util/AsyncConfig.java
new file mode 100644
index 0000000..6358407
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/util/AsyncConfig.java
@@ -0,0 +1,37 @@
+package com.bfd.docconversion.util;
+
+import org.springframework.context.annotation.Configuration;
+import org.springframework.scheduling.annotation.AsyncConfigurer;
+import org.springframework.scheduling.annotation.EnableAsync;
+import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
+
+import java.util.concurrent.Executor;
+
+
+@Configuration
+@EnableAsync //Java配置文件标注它,那么Spring就会开启异步可用
+/**
+ * @author guowei
+ * 异步任务线程池
+ * 注解@EnableAsync代表开启Spring异步。这样就可以使用@Async驱动Spring使用异步,
+ * 但是异步需要提供可用线程池,所以这里的配置类还会实现AsyncConfigurer接口,然后覆盖getAsyncExecutor方法,这样就可以自定义一个线程池
+ */
+public class AsyncConfig implements AsyncConfigurer {
+
+ @Override
+ public Executor getAsyncExecutor() {
+ //定义线程池
+ ThreadPoolTaskExecutor threadPoolTaskExecutor = new ThreadPoolTaskExecutor();
+ //核心线程数
+ threadPoolTaskExecutor.setCorePoolSize(10);
+ //线程池最大线程数
+ threadPoolTaskExecutor.setMaxPoolSize(50);
+ //线程队列最大线程数
+ threadPoolTaskExecutor.setQueueCapacity(200);
+ //初始化
+ threadPoolTaskExecutor.initialize();
+
+ return threadPoolTaskExecutor;
+ }
+
+}
diff --git a/src/main/java/com/bfd/docconversion/util/Config.java b/src/main/java/com/bfd/docconversion/util/Config.java
new file mode 100644
index 0000000..91e5b8c
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/util/Config.java
@@ -0,0 +1,32 @@
+package com.bfd.docconversion.util;
+
+import com.alibaba.fastjson2.JSONObject;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.LinkedBlockingDeque;
+
/**
 * Runtime configuration and shared state for the document-conversion service.
 *
 * <p>Holds the file-server endpoints plus the in-memory task queue shared
 * between the worker threads and the shutdown persistence hook.
 *
 * @author guowei
 */
public class Config {

    /** Upload endpoint of the go-fast file server. */
    public static String gofastUrl = "http://172.18.1.180:9980/upload";

    /**
     * Public base URL prepended to uploaded-file paths in results.
     * (An earlier value, https://crawl-files.pontoaplus.com, is retired.)
     */
    public static String resultGofast = "https://caiji.pontoaplus.com";

    /**
     * Pending conversion tasks; producers put, worker threads take.
     * NOTE(review): the type parameter appears lost in transit — elsewhere this
     * queue holds JSONObject; confirm and restore the generic declaration.
     */
    public static LinkedBlockingDeque taskQueue = new LinkedBlockingDeque();

    /** Cache of stop flags keyed by task identity. NOTE(review): element types unverified. */
    public static Map stopCache = new HashMap<>();

    /** Extension routed to the PDF -&gt; DOCX conversion branch. */
    public static final String PDF = "pdf";

    /** Maximum download retry attempts. */
    public static final Integer NUM = 5;

    /** Trace-log marker key. */
    public static final String TRACE = "trace";
}
diff --git a/src/main/java/com/bfd/docconversion/util/Constants.java b/src/main/java/com/bfd/docconversion/util/Constants.java
new file mode 100644
index 0000000..937e0b9
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/util/Constants.java
@@ -0,0 +1,19 @@
+package com.bfd.docconversion.util;
+
+import org.springframework.stereotype.Component;
+
+/**
+ * @author guowei
+ */
+@Component
+public class Constants {
+
+ public final static String STOP = "stop";
+
+ public final static String SCENES_ID = "scenes_id";
+
+ public final static String VERSION = "version";
+
+ public final static String UNDERLINE = "_";
+
+}
diff --git a/src/main/java/com/bfd/docconversion/util/FileExtensionEnum.java b/src/main/java/com/bfd/docconversion/util/FileExtensionEnum.java
new file mode 100644
index 0000000..cc18f29
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/util/FileExtensionEnum.java
@@ -0,0 +1,39 @@
+package com.bfd.docconversion.util;
/**
 * File extensions the conversion service understands, one constant per format.
 *
 * @author guowei
 */
public enum FileExtensionEnum {
    /** Word 97-2003 document. */
    doc("doc"),
    /** Word OOXML document. */
    docx("docx"),
    /** Excel 97-2003 workbook. */
    xls("xls"),
    /** Excel OOXML workbook. */
    xlsx("xlsx"),
    /** PowerPoint 97-2003 presentation. */
    ppt("ppt"),
    /** PowerPoint OOXML presentation. */
    pptx("pptx"),
    /** Portable Document Format. */
    pdf("pdf");

    /** Canonical lower-case extension string for this constant. */
    private final String extension;

    FileExtensionEnum(String extension) {
        this.extension = extension;
    }

    /**
     * @return the canonical extension string for this constant
     */
    public String getExtension() {
        return extension;
    }

    /**
     * Resolves an extension string to its enum constant, ignoring case.
     *
     * @param extension extension text without the leading dot
     * @return the matching constant
     * @throws IllegalArgumentException when the extension is not supported
     */
    public static FileExtensionEnum getByExtension(String extension) {
        for (FileExtensionEnum candidate : values()) {
            if (candidate.extension.equalsIgnoreCase(extension)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("Unsupported file extension: " + extension);
    }
}
\ No newline at end of file
diff --git a/src/main/java/com/bfd/docconversion/util/KfkUtil.java b/src/main/java/com/bfd/docconversion/util/KfkUtil.java
new file mode 100644
index 0000000..3327089
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/util/KfkUtil.java
@@ -0,0 +1,83 @@
+package com.bfd.docconversion.util;
+
+import lombok.extern.slf4j.Slf4j;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+
+import java.util.Properties;
+
+/**
+ * @author guowei
+ * kfk工具类
+ */
+@Component
+@Slf4j
+public class KfkUtil {
+ private static String topic;
+
+ private static String brokerList;
+
+ @Value("${crawl.kafka.topic}")
+ public void setTopic(String topic) {
+ KfkUtil.topic = topic;
+ }
+
+ @Value("${crawl.kafka.brokers}")
+ public void setBrokerList(String brokerList) {
+ KfkUtil.brokerList = brokerList;
+ }
+ private static KafkaProducer kafkaProducer;
+
+ public static int num = 0;
+
+ /**
+ * 获取KafkaProducer实例
+ */
+ public static KafkaProducer getProducer() {
+// synchronized (kafkaProducer) {
+ if (kafkaProducer == null) {
+ Properties props = new Properties();
+ //xxx服务器ip
+ props.put("bootstrap.servers", brokerList);
+ //所有follower都响应了才认为消息提交成功,即"committed"
+ props.put("acks", "all");
+ //retries = MAX 无限重试,直到你意识到出现了问题:)
+ props.put("retries", 3);
+ //producer将试图批处理消息记录,以减少请求次数.默认的批量处理消息字节数
+ props.put("batch.size", 16384);
+ //batch.size当批量的数据大小达到设定值后,就会立即发送,不顾下面的linger.ms
+ //延迟1ms发送,这项设置将通过增加小的延迟来完成--即,不是立即发送一条记录,producer将会等待给定的延迟时间以允许其他消息记录发送,这些消息记录可以批量处理
+ props.put("linger.ms", 1);
+ //producer可以用来缓存数据的内存大小。
+ props.put("buffer.memory", 33554432);
+ props.put("key.serializer",
+ "org.apache.kafka.common.serialization.StringSerializer");
+ props.put("value.serializer",
+ "org.apache.kafka.common.serialization.StringSerializer");
+ kafkaProducer = new KafkaProducer(props);
+ }
+// }
+ return kafkaProducer;
+ }
+
+ /**
+ * 关闭KafkaProducer实例
+ */
+ public static void closeProducer() {
+ if (kafkaProducer != null) {
+ log.info("----------close producer----------");
+ kafkaProducer.close();
+ kafkaProducer = null;
+ }
+ }
+
+ public static void sendKafka(String resultData) {
+ KafkaProducer producer = getProducer();
+ ProducerRecord se = new ProducerRecord(topic, resultData);
+ producer.send(se);
+ log.info("发送kafka成功");
+// num++;
+ }
+}
diff --git a/src/main/java/com/bfd/docconversion/util/MainHandler.java b/src/main/java/com/bfd/docconversion/util/MainHandler.java
new file mode 100644
index 0000000..d6c2c9a
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/util/MainHandler.java
@@ -0,0 +1,104 @@
+package com.bfd.docconversion.util;
+
+import cn.hutool.core.io.FileUtil;
+import cn.hutool.core.io.file.FileWriter;
+import com.alibaba.fastjson2.JSON;
+import com.alibaba.fastjson2.JSONObject;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.stereotype.Service;
+
+import java.io.File;
+import java.util.List;
+import java.util.concurrent.LinkedBlockingDeque;
+import java.util.concurrent.LinkedBlockingQueue;
+
+
+/**
+ * @author guowei
+ */
+@Slf4j
+@Service
+public class MainHandler implements ApplicationRunner {
+
+ @Value("${crawl.task.taskData}")
+ private String taskPath;
+
+ @Override
+ public void run(ApplicationArguments args) throws Exception {
+ log.info("监测程序运行线程 start");
+ //停止处理
+ waitDown();
+ //启动加载缓存任务
+ readTask(taskPath, Config.taskQueue);
+ }
+
+
+ public static void readTask(String path, LinkedBlockingDeque queue) throws InterruptedException {
+ File file = new File(path);
+ if (file.exists()) {
+ List tasks = null;
+ tasks = FileUtil.readLines(file, "UTF-8");
+ log.info("缓存文件有 " + tasks.size() + " 条数据");
+ for (String taskStr : tasks) {
+ log.info("读到缓存数据:" + taskStr);
+ System.out.println("读到缓存数据:" + taskStr);
+ JSONObject parse = JSONObject.parseObject(taskStr);
+// JSONObject value = (JSONObject) parse.get("value");
+// if (value.containsKey("result")){
+// KfkUtil.sendKafka(JSON.toJSONString(value));
+// log.info("此数据已经组装好,直接推送kfk");
+// continue;
+// }
+ queue.put(parse);
+ }
+ file.delete();
+ } else {
+ log.info("未找到缓存任务文件");
+ }
+
+ }
+
+ /**
+ * 结束触发钩子
+ */
+ public void waitDown() {
+ Runtime.getRuntime().addShutdownHook(new Thread() {
+ @Override
+ public void run() {
+ // 停止线程
+// Config.isStart = false;
+ log.info("stop-------");
+ try {
+ writeTsskToFile();
+ } catch (InterruptedException e) {
+ log.error("写出缓存异常,{}", e);
+ }
+ }
+ });
+ }
+
+
+ /**
+ * 任务持久化到硬盘
+ */
+ public void writeTsskToFile() throws InterruptedException {
+
+ System.out.println(taskPath);
+ File file = new File(taskPath);
+ FileWriter fileWriter = new FileWriter(file);
+ if (!file.exists()) {
+ fileWriter = FileWriter.create(file);
+ }
+ while (Config.taskQueue.size() > 0) {
+ JSONObject take = Config.taskQueue.take();
+ String entryJson = JSON.toJSONString(take);
+ System.out.println("写入缓存数据:" + entryJson);
+ fileWriter.write(entryJson + "\r\n", true);
+ }
+ log.info("taskMap 缓存已输出");
+ }
+
+}
diff --git a/src/main/java/com/bfd/docconversion/util/Utils.java b/src/main/java/com/bfd/docconversion/util/Utils.java
new file mode 100644
index 0000000..820e247
--- /dev/null
+++ b/src/main/java/com/bfd/docconversion/util/Utils.java
@@ -0,0 +1,325 @@
+package com.bfd.docconversion.util;
+
+import cn.hutool.core.util.IdUtil;
+import com.alibaba.fastjson2.JSON;
+import com.alibaba.fastjson2.JSONObject;
+import com.alibaba.fastjson2.JSONPath;
+
+import com.aspose.cells.Workbook;
+import com.aspose.slides.Presentation;
+import lombok.extern.slf4j.Slf4j;
+import okhttp3.*;
+import org.springframework.stereotype.Component;
+import com.aspose.pdf.Document;
+import com.aspose.pdf.SaveFormat;
+
+import java.io.*;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * @author guowei
+ */
+@Component
+@Slf4j
+public class Utils {
+
+ /**
+ * 转换成pdf
+ *
+ * @param extension
+ * @param source
+ * @param target
+ * @throws Exception
+ */
+ public static void asposeToPdf(String extension, InputStream source, ByteArrayOutputStream target) throws Exception {
+ switch (FileExtensionEnum.getByExtension(extension)) {
+ case doc:
+ case docx:
+ com.aspose.words.Document doc = new com.aspose.words.Document(source);
+ doc.save(target, com.aspose.words.SaveFormat.PDF);
+
+ break;
+ case xls:
+ case xlsx:
+ com.aspose.cells.Workbook excel = new com.aspose.cells.Workbook(source);
+ com.aspose.cells.PdfSaveOptions pdfSaveOptions = new com.aspose.cells.PdfSaveOptions();
+ // 单页显示,防截断 防换行
+ pdfSaveOptions.setOnePagePerSheet(true);
+ excel.save(target, pdfSaveOptions);
+ excel.dispose();
+ break;
+ case ppt:
+ case pptx:
+ com.aspose.slides.Presentation ppt = new com.aspose.slides.Presentation(source);
+ ppt.save(target, com.aspose.slides.SaveFormat.Pdf);
+ ppt.dispose();
+ break;
+ default:
+ System.out.println("不支持的文件转换类型");
+// throw new BaseException("不支持的文件转换类型");
+ }
+ }
+
+ /**
+ * pdf 转换
+ * @param extension
+ * @param source
+ * @param target
+ * @throws Exception
+ */
+ public static void asposePdfTo(String extension, InputStream source, ByteArrayOutputStream target) throws Exception {
+ switch (FileExtensionEnum.getByExtension(extension)) {
+ case doc:
+ case docx:
+ case pdf:
+ // 设置字体替换
+// FontSettings fontSettings = new FontSettings();
+// FontSubstitutionSettings fontSubstitutionSettings = fontSettings.getSubstitutionSettings();
+// fontSubstitutionSettings.getDefaultFontSubstitution().setDefaultFontName("Arial");
+//
+// // 加载系统字体
+// FontSourceBase[] fontSources = fontSettings.getFontsSources();
+// SystemFontSource systemFontSource = new SystemFontSource();
+// FontSourceBase[] updatedFontSources = new FontSourceBase[fontSources.length + 1];
+// System.arraycopy(fontSources, 0, updatedFontSources, 0, fontSources.length);
+// updatedFontSources[fontSources.length] = systemFontSource;
+// fontSettings.setFontsSources(updatedFontSources);
+//
+// // 指定加载选项,以确保正确处理字体
+// LoadOptions loadOptions = new LoadOptions();
+// loadOptions.setFontSettings(fontSettings);
+
+ Document doc = new Document(source);
+ //全面支持DOC, DOCX, OOXML, RTF HTML, OpenDocument, PDF, EPUB, XPS, SWF 相互转换
+ doc.save(target, SaveFormat.DocX);
+ doc.close();
+ break;
+// case xls:
+// case xlsx:
+// // Load PDF document
+// Document excel = new Document(source);
+// excel.save(target, SaveFormat.Excel);
+// break;
+// case ppt:
+// case pptx:
+// Document ppt = new Document(source);
+// ppt.save(target, SaveFormat.Pptx);
+// break;
+ default:
+ System.out.println("不支持的文件转换类型");
+// throw new BaseException("不支持的文件转换类型");
+ }
+ }
+
+// public static void convertFile(String inputFilePath, String outputFilePath) throws Exception {
+// String inputExtension = getFileExtension(inputFilePath).toLowerCase();
+// String outputExtension = getFileExtension(outputFilePath).toLowerCase();
+//
+// switch (inputExtension) {
+// case "doc":
+// case "docx":
+// convertWord(inputFilePath, outputFilePath, outputExtension);
+// break;
+// case "xls":
+// case "xlsx":
+// convertExcel(inputFilePath, outputFilePath, outputExtension);
+// break;
+// case "ppt":
+// case "pptx":
+// convertPPT(inputFilePath, outputFilePath, outputExtension);
+// break;
+// case "pdf":
+// convertPDF(inputFilePath, outputFilePath, outputExtension);
+// break;
+// default:
+// throw new IllegalArgumentException("Unsupported file format: " + inputExtension);
+// }
+// }
+
+ private static void convertWord(String inputFilePath, String outputFilePath, String outputExtension) throws Exception {
+ com.aspose.words.Document doc = new com.aspose.words.Document(inputFilePath);
+ switch (outputExtension) {
+ case "pdf":
+ doc.save(outputFilePath, com.aspose.words.SaveFormat.PDF);
+ break;
+ default:
+ System.out.println("不支持的文件转换类型");
+ }
+ }
+
+ private static void convertExcel(String inputFilePath, String outputFilePath, String outputExtension) throws Exception {
+ Workbook workbook = new Workbook(inputFilePath);
+ switch (outputExtension) {
+ case "pdf":
+ workbook.save(outputFilePath, com.aspose.cells.SaveFormat.PDF);
+ break;
+ case "docx":
+ // Excel to Word conversion (Not directly supported)
+ ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
+ workbook.save(htmlStream, com.aspose.cells.SaveFormat.HTML);
+ ByteArrayInputStream htmlInputStream = new ByteArrayInputStream(htmlStream.toByteArray());
+ com.aspose.words.Document doc = new com.aspose.words.Document(htmlInputStream);
+ doc.save(outputFilePath, com.aspose.cells.SaveFormat.DOCX);
+ break;
+ case "xlsx":
+ workbook.save(outputFilePath, com.aspose.cells.SaveFormat.XLSX);
+ break;
+ case "pptx":
+ // Excel to PPTX conversion (Not directly supported)
+ ByteArrayOutputStream htmlStream2 = new ByteArrayOutputStream();
+ workbook.save(htmlStream2, com.aspose.cells.SaveFormat.HTML);
+ ByteArrayInputStream htmlInputStream2 = new ByteArrayInputStream(htmlStream2.toByteArray());
+ Presentation presentation = new Presentation(htmlInputStream2);
+ presentation.save(outputFilePath, com.aspose.slides.SaveFormat.Pptx);
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported conversion: Excel to " + outputExtension);
+ }
+ }
+
+ /**
+ * 获取文件扩展名
+ *
+ * @param path 文件路径
+ * @return 文件扩展名
+ */
+ public static String getExtension(Path path) {
+ String fileName = path.getFileName().toString();
+ int dotIndex = fileName.lastIndexOf('.');
+ if (dotIndex == -1) {
+ throw new IllegalArgumentException("File without extension: " + fileName);
+ }
+ return fileName.substring(dotIndex + 1).toLowerCase();
+ }
+
+ public static Object jsonParse(String key, Map data) {
+ String[] keySplit = key.split(":");
+ String jsonPath = keySplit[1];
+ if (!data.containsKey(keySplit[0])) {
+ return "";
+ }
+ String dataJson = (String) data.get(keySplit[0]);
+ JSONObject dataJsonObject = JSON.parseObject(dataJson);
+ Object dataValue = JSONPath.eval(dataJsonObject, jsonPath);
+ return dataValue;
+ }
+
+ /**
+ * gofast 文件下载
+ *
+ * @param url
+ * @return
+ * @throws IOException
+ */
+ public static InputStream gofastDownLoadFile(String url) {
+ OkHttpClient client = new OkHttpClient().newBuilder()
+ .readTimeout(60, TimeUnit.SECONDS)
+ .writeTimeout(60, TimeUnit.SECONDS)
+ .connectTimeout(60, TimeUnit.SECONDS)
+ .build();
+ MediaType mediaType = MediaType.parse("text/plain");
+ RequestBody body = RequestBody.create(mediaType, "");
+ Request request = new Request.Builder()
+ .url(url)
+ .method("GET", null)
+ .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36")
+ .build();
+ BufferedOutputStream out = null;
+ InputStream inputStream = null;
+ Response response = null;
+ try {
+ response = client.newCall(request).execute();
+ for (int i = 0; i < Config.NUM; i++) {
+ if (response.isSuccessful()) {
+ break;
+ } else {
+ response = client.newCall(request).execute();
+ System.out.println("gofast文件下载失败,file=" + url + ",第" + i + "次");
+ log.error("gofast文件下载失败,file=" + url + ",第" + i + "次");
+ Thread.sleep(3000);
+ i++;
+ }
+ }
+ inputStream = response.body().byteStream();
+ } catch (Exception e) {
+ e.printStackTrace();
+ log.error("gofast文件下载异常", e);
+ }
+ return inputStream;
+ }
+
+ public static String upLoadFile(String filePath) {
+
+ File file = new File(filePath);
+ String realFilename = filePath.substring(filePath.lastIndexOf(File.separator) + 1);
+ MultipartBody.Builder builder = new MultipartBody.Builder().setType(MultipartBody.FORM);
+ builder.addPart(Headers.of("Content-Disposition", "form-data; name=\"file\";filename=\"" + realFilename + "\""),
+ RequestBody.create(MediaType.parse("image/png"), file)
+
+ ).addFormDataPart("output", "json").build();
+ RequestBody body = builder.build();
+ Request request = new Request.Builder().url(Config.gofastUrl).post(body).header("Expect", "100-continue").build();
+ OkHttpClient.Builder okBuilder = new OkHttpClient.Builder();
+ // 获得一个客户对象
+ OkHttpClient client = okBuilder.build();
+ Call call = client.newCall(request);
+ String html = "";
+ Response response = null;
+ int retry = 0;
+ do {
+ try {
+ response = call.execute();
+ html = response.body().string();
+ break;
+ } catch (IOException e) {
+ log.error("文档上传异常,file:" + filePath + ",重试" + retry + "次");
+ } finally {
+ response.close();
+ }
+ } while (retry >= 5);
+ file.delete();
+
+ return html;
+ }
+
+ public static void main(String[] args) throws Exception {
+ String filePath = "C:\\Users\\86150\\Desktop\\embed_watermark (1).pdf";
+// Path path = Paths.get(filePath);
+//// String extension = getExtension(path);
+// String extension = "docx";
+// System.out.println("文档转换: "+ extension + " --> PDF" );
+// ByteArrayOutputStream target = new ByteArrayOutputStream();
+// InputStream source = Files.newInputStream(path);
+//// asposeToPdf(extension, source,target);
+// asposePdfTo(extension,source,target);
+//
+// Files.write(Paths.get("C:\\Users\\86150\\Desktop\\embed_watermark (2).docx"), target.toByteArray());
+// String s = upLoadFile(filePath);
+// System.out.println(s);
+ String gofastUrl = "http://172.18.1.180:9980/group17/default/20240812/16/40/3/971260fd6cce96624965c692f709660b.pdf";
+ InputStream inputStream = gofastDownLoadFile(gofastUrl);
+ URL url = new URL(gofastUrl);
+ String newPath = url.getPath();
+ Path path = Paths.get(newPath);
+ String extension = Utils.getExtension(path);
+ ByteArrayOutputStream target = new ByteArrayOutputStream();
+ Utils.asposePdfTo(extension, inputStream,target);
+ filePath = "./files/"+ IdUtil.simpleUUID()+".docx";
+ Files.write(Paths.get(filePath), target.toByteArray());
+ }
+
+// public static void main(String[] args) {
+// String pdfFilePath = "C:\\Users\\86150\\Desktop\\百分点\\考试\\百分点019期新员工特训营-文化篇(终版)20210512.pdf";
+// String wordFilePath = "C:\\Users\\86150\\Desktop\\百分点\\考试\\云学堂.docx";
+//
+// pdf2doc(pdfFilePath);
+// System.out.println("PDF successfully converted to Word document.");
+// }
+
+
+}
diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
new file mode 100644
index 0000000..1914432
--- /dev/null
+++ b/src/main/resources/application.yml
@@ -0,0 +1,40 @@
+server:
+ port: 9955
+crawl:
+ kafka:
+ topic: produce_analyze
+ brokers: 172.18.1.146:9092,172.18.1.147:9092,172.18.1.148:9092
+ task:
+ taskData: ./data/task.txt
+ threadNum: 3
+#日志级别
+logging:
+ level:
+ com:
+ bfd: INFO
+ #日志路径
+ log:
+ path: ./logs
+spring:
+ boot:
+ admin:
+ client:
+ url: http://172.18.1.147:8001
+ instance:
+ service-base-url: http://172.18.1.147:9999
+ application:
+ name: 文档转换
+management:
+ endpoints:
+ web:
+ exposure:
+ include: "*"
+ endpoint:
+ health:
+ show-details: always
+ health:
+ elasticsearch:
+ enabled: false
+zookeeper:
+ connection-string: 172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181
+ publish-node: /analyze
\ No newline at end of file
diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml
new file mode 100644
index 0000000..0c59240
--- /dev/null
+++ b/src/main/resources/logback-spring.xml
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+
+
+ true
+
+ ${logging.level}
+
+
+ ${logging.path}/crawlSchedule.log
+
+
+
+ ${logging.path}/crawlSchedule.log.%d{yyyy-MM-dd}
+
+ 7
+
+
+ %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n
+ UTF-8
+
+
+
+
+
+
+
+
diff --git a/src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java b/src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java
new file mode 100644
index 0000000..bec0f37
--- /dev/null
+++ b/src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java
@@ -0,0 +1,13 @@
+package com.bfd.doc_conversion;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.boot.test.context.SpringBootTest;
+
+@SpringBootTest
+class DocConversionApplicationTests {
+
+ @Test
+ void contextLoads() {
+ }
+
+}