From 4d48a8ed53a55659a683574ba103548daaf8c6d7 Mon Sep 17 00:00:00 2001 From: 55007 <55007@maojian> Date: Tue, 7 Jan 2025 17:41:35 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=87=E6=A1=A3=E6=8A=93=E6=8D=A2=E5=BA=94?= =?UTF-8?q?=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .classpath | 40 +++ .gitignore | 3 + .project | 23 ++ .settings/org.eclipse.core.resources.prefs | 5 + .settings/org.eclipse.jdt.core.prefs | 8 + .settings/org.eclipse.m2e.core.prefs | 4 + pom.xml | 218 ++++++++++++++ .../docconversion/DocConversionApplication.java | 67 +++++ .../docconversion/controller/ApiController.java | 40 +++ .../service/ConversionToPdfService.java | 13 + .../bfd/docconversion/service/ProcessService.java | 46 +++ .../service/impl/ConversionToPdfServiceImpl.java | 97 ++++++ .../com/bfd/docconversion/util/AsyncConfig.java | 37 +++ .../java/com/bfd/docconversion/util/Config.java | 32 ++ .../java/com/bfd/docconversion/util/Constants.java | 19 ++ .../bfd/docconversion/util/FileExtensionEnum.java | 39 +++ .../java/com/bfd/docconversion/util/KfkUtil.java | 83 ++++++ .../com/bfd/docconversion/util/MainHandler.java | 104 +++++++ .../java/com/bfd/docconversion/util/Utils.java | 325 +++++++++++++++++++++ src/main/resources/application.yml | 40 +++ src/main/resources/logback-spring.xml | 38 +++ .../DocConversionApplicationTests.java | 13 + 22 files changed, 1294 insertions(+) create mode 100644 .classpath create mode 100644 .gitignore create mode 100644 .project create mode 100644 .settings/org.eclipse.core.resources.prefs create mode 100644 .settings/org.eclipse.jdt.core.prefs create mode 100644 .settings/org.eclipse.m2e.core.prefs create mode 100644 pom.xml create mode 100644 src/main/java/com/bfd/docconversion/DocConversionApplication.java create mode 100644 src/main/java/com/bfd/docconversion/controller/ApiController.java create mode 100644 src/main/java/com/bfd/docconversion/service/ConversionToPdfService.java create mode 100644 
src/main/java/com/bfd/docconversion/service/ProcessService.java create mode 100644 src/main/java/com/bfd/docconversion/service/impl/ConversionToPdfServiceImpl.java create mode 100644 src/main/java/com/bfd/docconversion/util/AsyncConfig.java create mode 100644 src/main/java/com/bfd/docconversion/util/Config.java create mode 100644 src/main/java/com/bfd/docconversion/util/Constants.java create mode 100644 src/main/java/com/bfd/docconversion/util/FileExtensionEnum.java create mode 100644 src/main/java/com/bfd/docconversion/util/KfkUtil.java create mode 100644 src/main/java/com/bfd/docconversion/util/MainHandler.java create mode 100644 src/main/java/com/bfd/docconversion/util/Utils.java create mode 100644 src/main/resources/application.yml create mode 100644 src/main/resources/logback-spring.xml create mode 100644 src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..f7e4a1d --- /dev/null +++ b/.classpath @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..49ab93c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target/ +/logs/ +/jarlib/ \ No newline at end of file diff --git a/.project b/.project new file mode 100644 index 0000000..73b9bbd --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + doc_conversion + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..839d647 --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,5 @@ +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding//src/main/resources=UTF-8 +encoding//src/test/java=UTF-8 +encoding/=UTF-8 diff --git 
a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..2f5cc74 --- /dev/null +++ b/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,8 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 +org.eclipse.jdt.core.compiler.compliance=1.8 +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.8 diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000..f897a7f --- /dev/null +++ b/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..c004987 --- /dev/null +++ b/pom.xml @@ -0,0 +1,218 @@ + + + 4.0.0 + com.bfd + doc_conversion + 0.0.1-SNAPSHOT + docconversion + docconversion + + 1.8 + UTF-8 + UTF-8 + 2.2.4.RELEASE + + + + org.springframework.boot + spring-boot-starter-web + + + + org.projectlombok + lombok + true + + + org.springframework.boot + spring-boot-starter-test + test + + + org.springframework.boot + spring-boot-starter-test + test + + + com.alibaba.fastjson2 + fastjson2 + 2.0.12 + + + cn.hutool + hutool-all + 5.8.27 + + + org.apache.kafka + kafka-clients + 2.7.1 + + + com.squareup.okhttp3 + okhttp + 3.11.0 + + + de.codecentric + spring-boot-admin-client + 2.2.4 + + + + aspose-cells-20.12-crack + aspose-cells-20.12-crack + 20.12 + system + D:\eclipseWork\doc_conversion/./jarlib/aspose-cells-20.12-crack.jar + + + aspose-slides-20.12-crack + aspose-slides-20.12-crack + 20.12 + system + D:\eclipseWork\doc_conversion/../jarlib/aspose-slides-20.12-crack.jar + + + aspose-words-20.12-crack + aspose-words-20.12-crack + 20.12 + 
system + D:\eclipseWork\doc_conversion/../jarlib/aspose-words-20.12-crack.jar + + + + org.javassist + javassist + 3.20.0-GA + + + + aspose-pdf-23.1 + aspose-pdf-23.1 + 23.1 + system + D:\eclipseWork\doc_conversion/../jarlib/aspose-pdf-23.1.jar + + + org.apache.curator + curator-framework + 5.2.0 + + + org.apache.curator + curator-recipes + 5.2.0 + + + + com.bfd.util + pauseTool + 1.0 + system + D:\eclipseWork\doc_conversion/../jarlib/pauseTool-1.0.jar + + + + + + + + + org.springframework.boot + spring-boot-dependencies + ${spring-boot.version} + pom + import + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + *.properties + *.yml + *.yaml + + + + + com.bfd.docconversion.DocConversionApplication + + true + + lib/ + + false + + + + lib/pauseTool-1.0.jar lib/aspose-pdf-23.1-23.1.jar lib/aspose-cells-20.12-crack-20.12.jar lib/aspose-slides-20.12-crack-20.12.jar + lib/aspose-words-20.12-crack-20.12.jar config/ + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy + package + + copy-dependencies + + + ${project.build.directory}/lib/ + + + + + + + maven-resources-plugin + + + copy-resources + package + + copy-resources + + + + + + src/main/resources/ + + *.properties + *.yml + *.yaml + + + + ${project.build.directory}/config + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + diff --git a/src/main/java/com/bfd/docconversion/DocConversionApplication.java b/src/main/java/com/bfd/docconversion/DocConversionApplication.java new file mode 100644 index 0000000..d127219 --- /dev/null +++ b/src/main/java/com/bfd/docconversion/DocConversionApplication.java @@ -0,0 +1,67 @@ +package com.bfd.docconversion; + +import cn.hutool.core.thread.ThreadFactoryBuilder; +import com.bfd.docconversion.service.ProcessService; +import com.bfd.docconversion.util.Config; +import com.bfd.docconversion.util.KfkUtil; +import com.bfd.util.PauseTool; +import lombok.extern.slf4j.Slf4j; +import 
org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.context.ConfigurableApplicationContext; +import org.springframework.data.redis.core.StringRedisTemplate; +import org.springframework.scheduling.annotation.EnableScheduling; +import org.springframework.scheduling.annotation.Scheduled; + +import javax.annotation.Resource; +import java.util.concurrent.*; + +/** + * @author guowei + */ +@SpringBootApplication +@EnableScheduling +@Slf4j +public class DocConversionApplication { + @Autowired + private StringRedisTemplate stringRedisTemplate; + + @Value("${zookeeper.connection-string}") + private String connectionString; + @Value("${zookeeper.publish-node}") + private String nodePath; + @Value("${crawl.threadNum}") + private int threadNum; + + @Resource + ProcessService processService; + public static void main(String[] args) { + ConfigurableApplicationContext applicationContext = SpringApplication.run(DocConversionApplication.class, args); + DocConversionApplication bean = applicationContext.getBean(DocConversionApplication.class); + System.setProperty("java.io.tmpdir","/opt/analyze/apps/doc_conversion/tmp"); + bean.start(); + } + + public void start(){ + + ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNamePrefix("crawl-pool-%d").build(); + ExecutorService singleThreadPool = new ThreadPoolExecutor(10, 20, 100L, TimeUnit.SECONDS, new LinkedBlockingQueue(1024), namedThreadFactory, new ThreadPoolExecutor.AbortPolicy()); + for (int i=0;i input:" + JSON.toJSONString(input)); + System.out.println("queryData ---> output:" + JSON.toJSONString(output)); + System.out.println("queryData ---> data:" + JSON.toJSONString(data)); + Map resultMap = new HashMap<>(32); + Map results = new HashMap<>(32); + try { + //需修改 +// String gofastUrl = 
input.getString("filePath"); + String gofastUrl = (String) Utils.jsonParse(input.getString("filePath"), data); + log.info("开始下载文件, path:"+ gofastUrl); + InputStream source = Utils.gofastDownLoadFile(gofastUrl); + if (source == null) { + throw new NullPointerException(); + } + URL url = new URL(gofastUrl); + String newPath = url.getPath(); + Path path = Paths.get(newPath); + String extension = Utils.getExtension(path); + ByteArrayOutputStream target = new ByteArrayOutputStream(); + String filePath = ""; + if (extension.equals(Config.PDF)) { + log.info("文档转换开始: " + extension + " --> DOC"); + Utils.asposePdfTo(extension, source,target); + filePath = "./files/"+IdUtil.simpleUUID()+".docx"; + }else { + log.info("文档转换开始: " + extension + " --> PDF"); + Utils.asposeToPdf(extension, source,target); + filePath = "./files/"+IdUtil.simpleUUID()+".pdf"; + } +// InputStream source = Files.newInputStream(path); + Files.write(Paths.get(filePath), target.toByteArray()); + log.info("文档转换完成"); + log.info("文件开始上传 path:{}",filePath); + String upLoadFile = Utils.upLoadFile(filePath); + System.out.println(upLoadFile); + log.info("文件结束上传"); + JSONObject resultUpload = JSONObject.parseObject(upLoadFile); + resultMap.put("id", IdUtil.randomUUID()); + resultMap.put("conversionUrl", Config.resultGofast + resultUpload.getString("path")); + results.put("status", 1); + results.put("message", "成功"); + }catch (Exception e){ + e.printStackTrace(); + log.error("文档转换异常",e); + resultMap.put("conversionUrl", "失败"); + results.put("status", 2); + results.put("message", "失败"); + } + resultMap.put("isLast",1); + results.put("results", JSON.toJSONString(resultMap)); + + jsonObject.put("result", results); + KfkUtil.sendKafka(JSON.toJSONString(jsonObject)); + log.info("处理完成,result:" + JSON.toJSONString(results)); + + } +} diff --git a/src/main/java/com/bfd/docconversion/util/AsyncConfig.java b/src/main/java/com/bfd/docconversion/util/AsyncConfig.java new file mode 100644 index 0000000..6358407 --- /dev/null 
+++ b/src/main/java/com/bfd/docconversion/util/AsyncConfig.java @@ -0,0 +1,37 @@ +package com.bfd.docconversion.util; + +import org.springframework.context.annotation.Configuration; +import org.springframework.scheduling.annotation.AsyncConfigurer; +import org.springframework.scheduling.annotation.EnableAsync; +import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; + +import java.util.concurrent.Executor; + + +@Configuration +@EnableAsync //Java配置文件标注它,那么Spring就会开启异步可用 +/** + * @author guowei + * 异步任务线程池 + * 注解@EnableAsync代表开启Spring异步。这样就可以使用@Async驱动Spring使用异步, + * 但是异步需要提供可用线程池,所以这里的配置类还会实现AsyncConfigurer接口,然后覆盖getAsyncExecutor方法,这样就可以自定义一个线程池 + */ +public class AsyncConfig implements AsyncConfigurer { + + @Override + public Executor getAsyncExecutor() { + //定义线程池 + ThreadPoolTaskExecutor threadPoolTaskExecutor = new ThreadPoolTaskExecutor(); + //核心线程数 + threadPoolTaskExecutor.setCorePoolSize(10); + //线程池最大线程数 + threadPoolTaskExecutor.setMaxPoolSize(50); + //线程队列最大线程数 + threadPoolTaskExecutor.setQueueCapacity(200); + //初始化 + threadPoolTaskExecutor.initialize(); + + return threadPoolTaskExecutor; + } + +} diff --git a/src/main/java/com/bfd/docconversion/util/Config.java b/src/main/java/com/bfd/docconversion/util/Config.java new file mode 100644 index 0000000..91e5b8c --- /dev/null +++ b/src/main/java/com/bfd/docconversion/util/Config.java @@ -0,0 +1,32 @@ +package com.bfd.docconversion.util; + +import com.alibaba.fastjson2.JSONObject; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.LinkedBlockingDeque; + +/** + * @author guowei + */ +public class Config { + + public static String gofastUrl = "http://172.18.1.180:9980/upload"; + +// public static String resultGofast = "https://crawl-files.pontoaplus.com"; + + public static String resultGofast = "https://caiji.pontoaplus.com"; + + public static LinkedBlockingDeque taskQueue = new LinkedBlockingDeque (); + + public static Map stopCache = new HashMap<>(); + + public 
/**
 * File extensions known to the conversion utilities.
 *
 * @author guowei
 */
public enum FileExtensionEnum {
    /** doc */
    doc("doc"),
    /** docx */
    docx("docx"),
    /** xls */
    xls("xls"),
    /** xlsx */
    xlsx("xlsx"),
    /** ppt */
    ppt("ppt"),
    /** pptx */
    pptx("pptx"),
    /** pdf */
    pdf("pdf");

    /** Extension text associated with the constant (no leading dot). */
    private final String extension;

    FileExtensionEnum(String extension) {
        this.extension = extension;
    }

    /**
     * @return the extension text of this constant
     */
    public String getExtension() {
        return extension;
    }

    /**
     * Finds the constant whose extension equals the argument, ignoring case.
     *
     * @param extension extension text to look up, without a leading dot
     * @return the first matching constant in declaration order
     * @throws IllegalArgumentException if no constant matches (including a {@code null} argument)
     */
    public static FileExtensionEnum getByExtension(String extension) {
        return java.util.Arrays.stream(values())
                .filter(candidate -> candidate.getExtension().equalsIgnoreCase(extension))
                .findFirst()
                .orElseThrow(() -> new IllegalArgumentException("Unsupported file extension: " + extension));
    }
}
0000000..3327089 --- /dev/null +++ b/src/main/java/com/bfd/docconversion/util/KfkUtil.java @@ -0,0 +1,83 @@ +package com.bfd.docconversion.util; + +import lombok.extern.slf4j.Slf4j; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import java.util.Properties; + +/** + * @author guowei + * kfk工具类 + */ +@Component +@Slf4j +public class KfkUtil { + private static String topic; + + private static String brokerList; + + @Value("${crawl.kafka.topic}") + public void setTopic(String topic) { + KfkUtil.topic = topic; + } + + @Value("${crawl.kafka.brokers}") + public void setBrokerList(String brokerList) { + KfkUtil.brokerList = brokerList; + } + private static KafkaProducer kafkaProducer; + + public static int num = 0; + + /** + * 获取KafkaProducer实例 + */ + public static KafkaProducer getProducer() { +// synchronized (kafkaProducer) { + if (kafkaProducer == null) { + Properties props = new Properties(); + //xxx服务器ip + props.put("bootstrap.servers", brokerList); + //所有follower都响应了才认为消息提交成功,即"committed" + props.put("acks", "all"); + //retries = MAX 无限重试,直到你意识到出现了问题:) + props.put("retries", 3); + //producer将试图批处理消息记录,以减少请求次数.默认的批量处理消息字节数 + props.put("batch.size", 16384); + //batch.size当批量的数据大小达到设定值后,就会立即发送,不顾下面的linger.ms + //延迟1ms发送,这项设置将通过增加小的延迟来完成--即,不是立即发送一条记录,producer将会等待给定的延迟时间以允许其他消息记录发送,这些消息记录可以批量处理 + props.put("linger.ms", 1); + //producer可以用来缓存数据的内存大小。 + props.put("buffer.memory", 33554432); + props.put("key.serializer", + "org.apache.kafka.common.serialization.StringSerializer"); + props.put("value.serializer", + "org.apache.kafka.common.serialization.StringSerializer"); + kafkaProducer = new KafkaProducer(props); + } +// } + return kafkaProducer; + } + + /** + * 关闭KafkaProducer实例 + */ + public static void closeProducer() { + if (kafkaProducer != null) { + log.info("----------close 
producer----------"); + kafkaProducer.close(); + kafkaProducer = null; + } + } + + public static void sendKafka(String resultData) { + KafkaProducer producer = getProducer(); + ProducerRecord se = new ProducerRecord(topic, resultData); + producer.send(se); + log.info("发送kafka成功"); +// num++; + } +} diff --git a/src/main/java/com/bfd/docconversion/util/MainHandler.java b/src/main/java/com/bfd/docconversion/util/MainHandler.java new file mode 100644 index 0000000..d6c2c9a --- /dev/null +++ b/src/main/java/com/bfd/docconversion/util/MainHandler.java @@ -0,0 +1,104 @@ +package com.bfd.docconversion.util; + +import cn.hutool.core.io.FileUtil; +import cn.hutool.core.io.file.FileWriter; +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONObject; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.stereotype.Service; + +import java.io.File; +import java.util.List; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.LinkedBlockingQueue; + + +/** + * @author guowei + */ +@Slf4j +@Service +public class MainHandler implements ApplicationRunner { + + @Value("${crawl.task.taskData}") + private String taskPath; + + @Override + public void run(ApplicationArguments args) throws Exception { + log.info("监测程序运行线程 start"); + //停止处理 + waitDown(); + //启动加载缓存任务 + readTask(taskPath, Config.taskQueue); + } + + + public static void readTask(String path, LinkedBlockingDeque queue) throws InterruptedException { + File file = new File(path); + if (file.exists()) { + List tasks = null; + tasks = FileUtil.readLines(file, "UTF-8"); + log.info("缓存文件有 " + tasks.size() + " 条数据"); + for (String taskStr : tasks) { + log.info("读到缓存数据:" + taskStr); + System.out.println("读到缓存数据:" + taskStr); + JSONObject parse = JSONObject.parseObject(taskStr); +// JSONObject value = (JSONObject) 
parse.get("value"); +// if (value.containsKey("result")){ +// KfkUtil.sendKafka(JSON.toJSONString(value)); +// log.info("此数据已经组装好,直接推送kfk"); +// continue; +// } + queue.put(parse); + } + file.delete(); + } else { + log.info("未找到缓存任务文件"); + } + + } + + /** + * 结束触发钩子 + */ + public void waitDown() { + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { + // 停止线程 +// Config.isStart = false; + log.info("stop-------"); + try { + writeTsskToFile(); + } catch (InterruptedException e) { + log.error("写出缓存异常,{}", e); + } + } + }); + } + + + /** + * 任务持久化到硬盘 + */ + public void writeTsskToFile() throws InterruptedException { + + System.out.println(taskPath); + File file = new File(taskPath); + FileWriter fileWriter = new FileWriter(file); + if (!file.exists()) { + fileWriter = FileWriter.create(file); + } + while (Config.taskQueue.size() > 0) { + JSONObject take = Config.taskQueue.take(); + String entryJson = JSON.toJSONString(take); + System.out.println("写入缓存数据:" + entryJson); + fileWriter.write(entryJson + "\r\n", true); + } + log.info("taskMap 缓存已输出"); + } + +} diff --git a/src/main/java/com/bfd/docconversion/util/Utils.java b/src/main/java/com/bfd/docconversion/util/Utils.java new file mode 100644 index 0000000..820e247 --- /dev/null +++ b/src/main/java/com/bfd/docconversion/util/Utils.java @@ -0,0 +1,325 @@ +package com.bfd.docconversion.util; + +import cn.hutool.core.util.IdUtil; +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONObject; +import com.alibaba.fastjson2.JSONPath; + +import com.aspose.cells.Workbook; +import com.aspose.slides.Presentation; +import lombok.extern.slf4j.Slf4j; +import okhttp3.*; +import org.springframework.stereotype.Component; +import com.aspose.pdf.Document; +import com.aspose.pdf.SaveFormat; + +import java.io.*; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * 
@author guowei + */ +@Component +@Slf4j +public class Utils { + + /** + * 转换成pdf + * + * @param extension + * @param source + * @param target + * @throws Exception + */ + public static void asposeToPdf(String extension, InputStream source, ByteArrayOutputStream target) throws Exception { + switch (FileExtensionEnum.getByExtension(extension)) { + case doc: + case docx: + com.aspose.words.Document doc = new com.aspose.words.Document(source); + doc.save(target, com.aspose.words.SaveFormat.PDF); + + break; + case xls: + case xlsx: + com.aspose.cells.Workbook excel = new com.aspose.cells.Workbook(source); + com.aspose.cells.PdfSaveOptions pdfSaveOptions = new com.aspose.cells.PdfSaveOptions(); + // 单页显示,防截断 防换行 + pdfSaveOptions.setOnePagePerSheet(true); + excel.save(target, pdfSaveOptions); + excel.dispose(); + break; + case ppt: + case pptx: + com.aspose.slides.Presentation ppt = new com.aspose.slides.Presentation(source); + ppt.save(target, com.aspose.slides.SaveFormat.Pdf); + ppt.dispose(); + break; + default: + System.out.println("不支持的文件转换类型"); +// throw new BaseException("不支持的文件转换类型"); + } + } + + /** + * pdf 转换 + * @param extension + * @param source + * @param target + * @throws Exception + */ + public static void asposePdfTo(String extension, InputStream source, ByteArrayOutputStream target) throws Exception { + switch (FileExtensionEnum.getByExtension(extension)) { + case doc: + case docx: + case pdf: + // 设置字体替换 +// FontSettings fontSettings = new FontSettings(); +// FontSubstitutionSettings fontSubstitutionSettings = fontSettings.getSubstitutionSettings(); +// fontSubstitutionSettings.getDefaultFontSubstitution().setDefaultFontName("Arial"); +// +// // 加载系统字体 +// FontSourceBase[] fontSources = fontSettings.getFontsSources(); +// SystemFontSource systemFontSource = new SystemFontSource(); +// FontSourceBase[] updatedFontSources = new FontSourceBase[fontSources.length + 1]; +// System.arraycopy(fontSources, 0, updatedFontSources, 0, fontSources.length); +// 
updatedFontSources[fontSources.length] = systemFontSource; +// fontSettings.setFontsSources(updatedFontSources); +// +// // 指定加载选项,以确保正确处理字体 +// LoadOptions loadOptions = new LoadOptions(); +// loadOptions.setFontSettings(fontSettings); + + Document doc = new Document(source); + //全面支持DOC, DOCX, OOXML, RTF HTML, OpenDocument, PDF, EPUB, XPS, SWF 相互转换 + doc.save(target, SaveFormat.DocX); + doc.close(); + break; +// case xls: +// case xlsx: +// // Load PDF document +// Document excel = new Document(source); +// excel.save(target, SaveFormat.Excel); +// break; +// case ppt: +// case pptx: +// Document ppt = new Document(source); +// ppt.save(target, SaveFormat.Pptx); +// break; + default: + System.out.println("不支持的文件转换类型"); +// throw new BaseException("不支持的文件转换类型"); + } + } + +// public static void convertFile(String inputFilePath, String outputFilePath) throws Exception { +// String inputExtension = getFileExtension(inputFilePath).toLowerCase(); +// String outputExtension = getFileExtension(outputFilePath).toLowerCase(); +// +// switch (inputExtension) { +// case "doc": +// case "docx": +// convertWord(inputFilePath, outputFilePath, outputExtension); +// break; +// case "xls": +// case "xlsx": +// convertExcel(inputFilePath, outputFilePath, outputExtension); +// break; +// case "ppt": +// case "pptx": +// convertPPT(inputFilePath, outputFilePath, outputExtension); +// break; +// case "pdf": +// convertPDF(inputFilePath, outputFilePath, outputExtension); +// break; +// default: +// throw new IllegalArgumentException("Unsupported file format: " + inputExtension); +// } +// } + + private static void convertWord(String inputFilePath, String outputFilePath, String outputExtension) throws Exception { + com.aspose.words.Document doc = new com.aspose.words.Document(inputFilePath); + switch (outputExtension) { + case "pdf": + doc.save(outputFilePath, com.aspose.words.SaveFormat.PDF); + break; + default: + System.out.println("不支持的文件转换类型"); + } + } + + private static void 
convertExcel(String inputFilePath, String outputFilePath, String outputExtension) throws Exception { + Workbook workbook = new Workbook(inputFilePath); + switch (outputExtension) { + case "pdf": + workbook.save(outputFilePath, com.aspose.cells.SaveFormat.PDF); + break; + case "docx": + // Excel to Word conversion (Not directly supported) + ByteArrayOutputStream htmlStream = new ByteArrayOutputStream(); + workbook.save(htmlStream, com.aspose.cells.SaveFormat.HTML); + ByteArrayInputStream htmlInputStream = new ByteArrayInputStream(htmlStream.toByteArray()); + com.aspose.words.Document doc = new com.aspose.words.Document(htmlInputStream); + doc.save(outputFilePath, com.aspose.cells.SaveFormat.DOCX); + break; + case "xlsx": + workbook.save(outputFilePath, com.aspose.cells.SaveFormat.XLSX); + break; + case "pptx": + // Excel to PPTX conversion (Not directly supported) + ByteArrayOutputStream htmlStream2 = new ByteArrayOutputStream(); + workbook.save(htmlStream2, com.aspose.cells.SaveFormat.HTML); + ByteArrayInputStream htmlInputStream2 = new ByteArrayInputStream(htmlStream2.toByteArray()); + Presentation presentation = new Presentation(htmlInputStream2); + presentation.save(outputFilePath, com.aspose.slides.SaveFormat.Pptx); + break; + default: + throw new IllegalArgumentException("Unsupported conversion: Excel to " + outputExtension); + } + } + + /** + * 获取文件扩展名 + * + * @param path 文件路径 + * @return 文件扩展名 + */ + public static String getExtension(Path path) { + String fileName = path.getFileName().toString(); + int dotIndex = fileName.lastIndexOf('.'); + if (dotIndex == -1) { + throw new IllegalArgumentException("File without extension: " + fileName); + } + return fileName.substring(dotIndex + 1).toLowerCase(); + } + + public static Object jsonParse(String key, Map data) { + String[] keySplit = key.split(":"); + String jsonPath = keySplit[1]; + if (!data.containsKey(keySplit[0])) { + return ""; + } + String dataJson = (String) data.get(keySplit[0]); + JSONObject 
dataJsonObject = JSON.parseObject(dataJson); + Object dataValue = JSONPath.eval(dataJsonObject, jsonPath); + return dataValue; + } + + /** + * gofast 文件下载 + * + * @param url + * @return + * @throws IOException + */ + public static InputStream gofastDownLoadFile(String url) { + OkHttpClient client = new OkHttpClient().newBuilder() + .readTimeout(60, TimeUnit.SECONDS) + .writeTimeout(60, TimeUnit.SECONDS) + .connectTimeout(60, TimeUnit.SECONDS) + .build(); + MediaType mediaType = MediaType.parse("text/plain"); + RequestBody body = RequestBody.create(mediaType, ""); + Request request = new Request.Builder() + .url(url) + .method("GET", null) + .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36") + .build(); + BufferedOutputStream out = null; + InputStream inputStream = null; + Response response = null; + try { + response = client.newCall(request).execute(); + for (int i = 0; i < Config.NUM; i++) { + if (response.isSuccessful()) { + break; + } else { + response = client.newCall(request).execute(); + System.out.println("gofast文件下载失败,file=" + url + ",第" + i + "次"); + log.error("gofast文件下载失败,file=" + url + ",第" + i + "次"); + Thread.sleep(3000); + i++; + } + } + inputStream = response.body().byteStream(); + } catch (Exception e) { + e.printStackTrace(); + log.error("gofast文件下载异常", e); + } + return inputStream; + } + + public static String upLoadFile(String filePath) { + + File file = new File(filePath); + String realFilename = filePath.substring(filePath.lastIndexOf(File.separator) + 1); + MultipartBody.Builder builder = new MultipartBody.Builder().setType(MultipartBody.FORM); + builder.addPart(Headers.of("Content-Disposition", "form-data; name=\"file\";filename=\"" + realFilename + "\""), + RequestBody.create(MediaType.parse("image/png"), file) + + ).addFormDataPart("output", "json").build(); + RequestBody body = builder.build(); + Request request = new 
Request.Builder().url(Config.gofastUrl).post(body).header("Expect", "100-continue").build(); + OkHttpClient.Builder okBuilder = new OkHttpClient.Builder(); + // 获得一个客户对象 + OkHttpClient client = okBuilder.build(); + Call call = client.newCall(request); + String html = ""; + Response response = null; + int retry = 0; + do { + try { + response = call.execute(); + html = response.body().string(); + break; + } catch (IOException e) { + log.error("文档上传异常,file:" + filePath + ",重试" + retry + "次"); + } finally { + response.close(); + } + } while (retry >= 5); + file.delete(); + + return html; + } + + public static void main(String[] args) throws Exception { + String filePath = "C:\\Users\\86150\\Desktop\\embed_watermark (1).pdf"; +// Path path = Paths.get(filePath); +//// String extension = getExtension(path); +// String extension = "docx"; +// System.out.println("文档转换: "+ extension + " --> PDF" ); +// ByteArrayOutputStream target = new ByteArrayOutputStream(); +// InputStream source = Files.newInputStream(path); +//// asposeToPdf(extension, source,target); +// asposePdfTo(extension,source,target); +// +// Files.write(Paths.get("C:\\Users\\86150\\Desktop\\embed_watermark (2).docx"), target.toByteArray()); +// String s = upLoadFile(filePath); +// System.out.println(s); + String gofastUrl = "http://172.18.1.180:9980/group17/default/20240812/16/40/3/971260fd6cce96624965c692f709660b.pdf"; + InputStream inputStream = gofastDownLoadFile(gofastUrl); + URL url = new URL(gofastUrl); + String newPath = url.getPath(); + Path path = Paths.get(newPath); + String extension = Utils.getExtension(path); + ByteArrayOutputStream target = new ByteArrayOutputStream(); + Utils.asposePdfTo(extension, inputStream,target); + filePath = "./files/"+ IdUtil.simpleUUID()+".docx"; + Files.write(Paths.get(filePath), target.toByteArray()); + } + +// public static void main(String[] args) { +// String pdfFilePath = "C:\\Users\\86150\\Desktop\\百分点\\考试\\百分点019期新员工特训营-文化篇(终版)20210512.pdf"; +// String 
wordFilePath = "C:\\Users\\86150\\Desktop\\百分点\\考试\\云学堂.docx"; +// +// pdf2doc(pdfFilePath); +// System.out.println("PDF successfully converted to Word document."); +// } + + +} diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml new file mode 100644 index 0000000..1914432 --- /dev/null +++ b/src/main/resources/application.yml @@ -0,0 +1,40 @@ +server: + port: 9955 +crawl: + kafka: + topic: produce_analyze + brokers: 172.18.1.146:9092,172.18.1.147:9092,172.18.1.148:9092 + task: + taskData: ./data/task.txt + threadNum: 3 +#日志级别 +logging: + level: + com: + bfd: INFO + #日志路径 + log: + path: ./logs +spring: + boot: + admin: + client: + url: http://172.18.1.147:8001 + instance: + service-base-url: http://172.18.1.147:9999 + application: + name: 文档转换 +management: + endpoints: + web: + exposure: + include: "*" + endpoint: + health: + show-details: always + health: + elasticsearch: + enabled: false +zookeeper: + connection-string: 172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181 + publish-node: /analyze \ No newline at end of file diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml new file mode 100644 index 0000000..0c59240 --- /dev/null +++ b/src/main/resources/logback-spring.xml @@ -0,0 +1,38 @@ + + + + + + + + + true + + ${logging.level} + + + ${logging.path}/crawlSchedule.log + + + + ${logging.path}/crawlSchedule.log.%d{yyyy-MM-dd} + + 7 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n + UTF-8 + + + + + + + + diff --git a/src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java b/src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java new file mode 100644 index 0000000..bec0f37 --- /dev/null +++ b/src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java @@ -0,0 +1,13 @@ +package com.bfd.doc_conversion; + +import org.junit.jupiter.api.Test; +import org.springframework.boot.test.context.SpringBootTest; + 
+@SpringBootTest +class DocConversionApplicationTests { + + @Test + void contextLoads() { + } + +}