Browse Source

文档抓换应用

master
55007 6 months ago
commit
4d48a8ed53
  1. 40
      .classpath
  2. 3
      .gitignore
  3. 23
      .project
  4. 5
      .settings/org.eclipse.core.resources.prefs
  5. 8
      .settings/org.eclipse.jdt.core.prefs
  6. 4
      .settings/org.eclipse.m2e.core.prefs
  7. 218
      pom.xml
  8. 67
      src/main/java/com/bfd/docconversion/DocConversionApplication.java
  9. 40
      src/main/java/com/bfd/docconversion/controller/ApiController.java
  10. 13
      src/main/java/com/bfd/docconversion/service/ConversionToPdfService.java
  11. 46
      src/main/java/com/bfd/docconversion/service/ProcessService.java
  12. 97
      src/main/java/com/bfd/docconversion/service/impl/ConversionToPdfServiceImpl.java
  13. 37
      src/main/java/com/bfd/docconversion/util/AsyncConfig.java
  14. 32
      src/main/java/com/bfd/docconversion/util/Config.java
  15. 19
      src/main/java/com/bfd/docconversion/util/Constants.java
  16. 39
      src/main/java/com/bfd/docconversion/util/FileExtensionEnum.java
  17. 83
      src/main/java/com/bfd/docconversion/util/KfkUtil.java
  18. 104
      src/main/java/com/bfd/docconversion/util/MainHandler.java
  19. 325
      src/main/java/com/bfd/docconversion/util/Utils.java
  20. 40
      src/main/resources/application.yml
  21. 38
      src/main/resources/logback-spring.xml
  22. 13
      src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java

40
.classpath

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
<attribute name="optional" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
<attribute name="test" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
<attribute name="test" value="true"/>
<attribute name="optional" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/>
</classpath>

3
.gitignore

@ -0,0 +1,3 @@
/target/
/logs/
/jarlib/

23
.project

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>doc_conversion</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>

5
.settings/org.eclipse.core.resources.prefs

@ -0,0 +1,5 @@
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding/<project>=UTF-8

8
.settings/org.eclipse.jdt.core.prefs

@ -0,0 +1,8 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
org.eclipse.jdt.core.compiler.release=disabled
org.eclipse.jdt.core.compiler.source=1.8

4
.settings/org.eclipse.m2e.core.prefs

@ -0,0 +1,4 @@
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1

218
pom.xml

@ -0,0 +1,218 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.bfd</groupId>
<artifactId>doc_conversion</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>docconversion</name>
<description>docconversion</description>
<properties>
<java.version>1.8</java.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<spring-boot.version>2.2.4.RELEASE</spring-boot.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.alibaba.fastjson2</groupId>
<artifactId>fastjson2</artifactId>
<version>2.0.12</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.27</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>2.7.1</version>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.11.0</version>
</dependency>
<dependency>
<groupId>de.codecentric</groupId>
<artifactId>spring-boot-admin-client</artifactId>
<version>2.2.4</version>
</dependency>
<dependency>
<groupId>aspose-cells-20.12-crack</groupId>
<artifactId>aspose-cells-20.12-crack</artifactId>
<version>20.12</version>
<scope>system</scope>
<systemPath>D:\eclipseWork\doc_conversion/./jarlib/aspose-cells-20.12-crack.jar</systemPath>
</dependency>
<dependency>
<groupId>aspose-slides-20.12-crack</groupId>
<artifactId>aspose-slides-20.12-crack</artifactId>
<version>20.12</version>
<scope>system</scope>
<systemPath>D:\eclipseWork\doc_conversion/../jarlib/aspose-slides-20.12-crack.jar</systemPath>
</dependency>
<dependency>
<groupId>aspose-words-20.12-crack</groupId>
<artifactId>aspose-words-20.12-crack</artifactId>
<version>20.12</version>
<scope>system</scope>
<systemPath>D:\eclipseWork\doc_conversion/../jarlib/aspose-words-20.12-crack.jar</systemPath>
</dependency>
<dependency>
<groupId>org.javassist</groupId>
<artifactId>javassist</artifactId>
<version>3.20.0-GA</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.aspose/aspose-pdf -->
<dependency>
<groupId>aspose-pdf-23.1</groupId>
<artifactId>aspose-pdf-23.1</artifactId>
<version>23.1</version>
<scope>system</scope>
<systemPath>D:\eclipseWork\doc_conversion/../jarlib/aspose-pdf-23.1.jar</systemPath>
</dependency>
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-framework</artifactId>
<version>5.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
<version>5.2.0</version>
</dependency>
<dependency>
<groupId>com.bfd.util</groupId>
<artifactId>pauseTool</artifactId>
<version>1.0</version>
<scope>system</scope>
<systemPath>D:\eclipseWork\doc_conversion/../jarlib/pauseTool-1.0.jar</systemPath>
</dependency>
</dependencies>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-dependencies</artifactId>
<version>${spring-boot.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<!--不打入jar包的文件类型或者路径-->
<excludes>
<exclude>*.properties</exclude>
<exclude>*.yml</exclude>
<exclude>*.yaml</exclude>
</excludes>
<archive>
<manifest>
<!-- 执行的主程序路径 -->
<mainClass>com.bfd.docconversion.DocConversionApplication</mainClass>
<!--是否要把第三方jar放到manifest的classpath中-->
<addClasspath>true</addClasspath>
<!--生成的manifest中classpath的前缀,因为要把第三方jar放到lib目录下,所以classpath的前缀是lib/-->
<classpathPrefix>lib/</classpathPrefix>
<!-- 打包时 MANIFEST.MF 文件不记录的时间戳版本 -->
<useUniqueVersions>false</useUniqueVersions>
</manifest>
<manifestEntries>
<!-- 在 Class-Path 下添加配置文件的路径 -->
<Class-Path>lib/pauseTool-1.0.jar lib/aspose-pdf-23.1-23.1.jar lib/aspose-cells-20.12-crack-20.12.jar lib/aspose-slides-20.12-crack-20.12.jar
lib/aspose-words-20.12-crack-20.12.jar config/
</Class-Path>
</manifestEntries>
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib/</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<executions>
<execution>
<id>copy-resources</id>
<phase>package</phase>
<goals>
<goal>copy-resources</goal>
</goals>
<configuration>
<resources>
<!--把配置文件打包到指定路径-->
<resource>
<directory>src/main/resources/</directory>
<includes>
<include>*.properties</include>
<include>*.yml</include>
<exclude>*.yaml</exclude>
</includes>
</resource>
</resources>
<outputDirectory>${project.build.directory}/config</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

67
src/main/java/com/bfd/docconversion/DocConversionApplication.java

@ -0,0 +1,67 @@
package com.bfd.docconversion;
import cn.hutool.core.thread.ThreadFactoryBuilder;
import com.bfd.docconversion.service.ProcessService;
import com.bfd.docconversion.util.Config;
import com.bfd.docconversion.util.KfkUtil;
import com.bfd.util.PauseTool;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ConfigurableApplicationContext;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import javax.annotation.Resource;
import java.util.concurrent.*;
/**
* @author guowei
*/
@SpringBootApplication
@EnableScheduling
@Slf4j
public class DocConversionApplication {
@Autowired
private StringRedisTemplate stringRedisTemplate;
@Value("${zookeeper.connection-string}")
private String connectionString;
@Value("${zookeeper.publish-node}")
private String nodePath;
@Value("${crawl.threadNum}")
private int threadNum;
@Resource
ProcessService processService;
public static void main(String[] args) {
ConfigurableApplicationContext applicationContext = SpringApplication.run(DocConversionApplication.class, args);
DocConversionApplication bean = applicationContext.getBean(DocConversionApplication.class);
System.setProperty("java.io.tmpdir","/opt/analyze/apps/doc_conversion/tmp");
bean.start();
}
public void start(){
ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNamePrefix("crawl-pool-%d").build();
ExecutorService singleThreadPool = new ThreadPoolExecutor(10, 20, 100L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(1024), namedThreadFactory, new ThreadPoolExecutor.AbortPolicy());
for (int i=0;i<threadNum;i++){
singleThreadPool.execute(processService);
}
KfkUtil.getProducer();
PauseTool pauseTool = new PauseTool();
pauseTool.initializeRedisCache(stringRedisTemplate);
pauseTool.setupZookeeperListener(connectionString, nodePath);
}
@Scheduled(cron = "0 0/5 * * * ?")
public void timeSize(){
int size = Config.taskQueue.size();
log.info("当前有 {} 条文档没有转换",size);
}
}

40
src/main/java/com/bfd/docconversion/controller/ApiController.java

@ -0,0 +1,40 @@
package com.bfd.docconversion.controller;
import com.alibaba.fastjson2.JSONObject;
import com.bfd.docconversion.util.Config;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.*;
/**
* @author guowei
*/
@RestController
@Slf4j
@RequestMapping(value = "/document")
@CrossOrigin(origins = "*", maxAge = 3600)
public class ApiController {
// @Resource
// conversionToPdfService conversionToPdfService;
/**
* 文档转换 Api
* @param jsonObject
* @return
*/
@RequestMapping(value = "/conversion", method = RequestMethod.POST, produces = "application/json")
@ResponseBody
public String varAna(@RequestBody JSONObject jsonObject) {
log.info("文档转换参数:"+jsonObject);
// conversionToPdfService.conversion(jsonObject);
try {
if (jsonObject.containsKey(Config.TRACE) && jsonObject.getBoolean(Config.TRACE)==true){
log.info("测试流程,插入队首");
Config.taskQueue.putFirst(jsonObject);
}else {
Config.taskQueue.put(jsonObject);
}
} catch (InterruptedException e) {
e.printStackTrace();
}
return "success";
}
}

13
src/main/java/com/bfd/docconversion/service/ConversionToPdfService.java

@ -0,0 +1,13 @@
package com.bfd.docconversion.service;
import com.alibaba.fastjson2.JSONObject;
import org.springframework.stereotype.Service;
/**
* @author guowei
*/
@Service
public interface ConversionToPdfService {
void conversion(JSONObject jsonObject);
}

46
src/main/java/com/bfd/docconversion/service/ProcessService.java

@ -0,0 +1,46 @@
package com.bfd.docconversion.service;
import com.alibaba.fastjson2.JSONObject;
import com.aspose.pdf.MemoryCleaner;
import com.bfd.docconversion.util.Config;
import com.bfd.docconversion.util.Constants;
import com.bfd.util.PauseTool;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
/**
* @author guowei
*/
@Service
@Slf4j
public class ProcessService implements Runnable {
@Resource
ConversionToPdfService conversionToPdfService;
@Override
public void run() {
while (true) {
try {
if (Config.taskQueue.size() <= 0) {
Thread.sleep(1000 * 10);
//清除缓存
MemoryCleaner.clearAllTempFiles();
} else {
JSONObject take = Config.taskQueue.take();
Integer scense_id = (Integer) take.get(Constants.SCENES_ID);
Integer version = (Integer) take.get(Constants.VERSION);
if (PauseTool.CACHE.containsKey(scense_id + Constants.UNDERLINE + version)) {
conversionToPdfService.conversion(take);
} else {
log.info("暂停任务:{}", JSONObject.toJSONString(take));
}
}
} catch (Exception e) {
e.printStackTrace();
log.info("异常,{}", e);
}
}
}
}

97
src/main/java/com/bfd/docconversion/service/impl/ConversionToPdfServiceImpl.java

@ -0,0 +1,97 @@
package com.bfd.docconversion.service.impl;
import cn.hutool.core.util.IdUtil;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.bfd.docconversion.service.ConversionToPdfService;
import com.bfd.docconversion.util.Config;
import com.bfd.docconversion.util.KfkUtil;
import com.bfd.docconversion.util.Utils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
/**
* @author guowei
*/
@Service
@Slf4j
public class ConversionToPdfServiceImpl implements ConversionToPdfService {
/**
* 转换
* @param jsonObject
*/
@Override
public void conversion(JSONObject jsonObject) {
//输入
JSONObject input = jsonObject.getJSONObject("input");
//输出
JSONObject output = jsonObject.getJSONObject("output");
//data
JSONObject data = jsonObject.getJSONObject("data");
System.out.println("queryData ---> input:" + JSON.toJSONString(input));
System.out.println("queryData ---> output:" + JSON.toJSONString(output));
System.out.println("queryData ---> data:" + JSON.toJSONString(data));
Map resultMap = new HashMap<>(32);
Map results = new HashMap<>(32);
try {
//需修改
// String gofastUrl = input.getString("filePath");
String gofastUrl = (String) Utils.jsonParse(input.getString("filePath"), data);
log.info("开始下载文件, path:"+ gofastUrl);
InputStream source = Utils.gofastDownLoadFile(gofastUrl);
if (source == null) {
throw new NullPointerException();
}
URL url = new URL(gofastUrl);
String newPath = url.getPath();
Path path = Paths.get(newPath);
String extension = Utils.getExtension(path);
ByteArrayOutputStream target = new ByteArrayOutputStream();
String filePath = "";
if (extension.equals(Config.PDF)) {
log.info("文档转换开始: " + extension + " --> DOC");
Utils.asposePdfTo(extension, source,target);
filePath = "./files/"+IdUtil.simpleUUID()+".docx";
}else {
log.info("文档转换开始: " + extension + " --> PDF");
Utils.asposeToPdf(extension, source,target);
filePath = "./files/"+IdUtil.simpleUUID()+".pdf";
}
// InputStream source = Files.newInputStream(path);
Files.write(Paths.get(filePath), target.toByteArray());
log.info("文档转换完成");
log.info("文件开始上传 path:{}",filePath);
String upLoadFile = Utils.upLoadFile(filePath);
System.out.println(upLoadFile);
log.info("文件结束上传");
JSONObject resultUpload = JSONObject.parseObject(upLoadFile);
resultMap.put("id", IdUtil.randomUUID());
resultMap.put("conversionUrl", Config.resultGofast + resultUpload.getString("path"));
results.put("status", 1);
results.put("message", "成功");
}catch (Exception e){
e.printStackTrace();
log.error("文档转换异常",e);
resultMap.put("conversionUrl", "失败");
results.put("status", 2);
results.put("message", "失败");
}
resultMap.put("isLast",1);
results.put("results", JSON.toJSONString(resultMap));
jsonObject.put("result", results);
KfkUtil.sendKafka(JSON.toJSONString(jsonObject));
log.info("处理完成,result:" + JSON.toJSONString(results));
}
}

37
src/main/java/com/bfd/docconversion/util/AsyncConfig.java

@ -0,0 +1,37 @@
package com.bfd.docconversion.util;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.AsyncConfigurer;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import java.util.concurrent.Executor;
@Configuration
@EnableAsync //Java配置文件标注它那么Spring就会开启异步可用
/**
* @author guowei
* 异步任务线程池
* 注解@EnableAsync代表开启Spring异步这样就可以使用@Async驱动Spring使用异步
* 但是异步需要提供可用线程池所以这里的配置类还会实现AsyncConfigurer接口然后覆盖getAsyncExecutor方法这样就可以自定义一个线程池
*/
public class AsyncConfig implements AsyncConfigurer {
@Override
public Executor getAsyncExecutor() {
//定义线程池
ThreadPoolTaskExecutor threadPoolTaskExecutor = new ThreadPoolTaskExecutor();
//核心线程数
threadPoolTaskExecutor.setCorePoolSize(10);
//线程池最大线程数
threadPoolTaskExecutor.setMaxPoolSize(50);
//线程队列最大线程数
threadPoolTaskExecutor.setQueueCapacity(200);
//初始化
threadPoolTaskExecutor.initialize();
return threadPoolTaskExecutor;
}
}

32
src/main/java/com/bfd/docconversion/util/Config.java

@ -0,0 +1,32 @@
package com.bfd.docconversion.util;
import com.alibaba.fastjson2.JSONObject;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingDeque;
/**
* @author guowei
*/
public class Config {
public static String gofastUrl = "http://172.18.1.180:9980/upload";
// public static String resultGofast = "https://crawl-files.pontoaplus.com";
public static String resultGofast = "https://caiji.pontoaplus.com";
public static LinkedBlockingDeque<JSONObject> taskQueue = new LinkedBlockingDeque <JSONObject>();
public static Map stopCache = new HashMap<>();
public static final String PDF = "pdf";
public static final Integer NUM = 5;
public static final String TRACE = "trace";
}

19
src/main/java/com/bfd/docconversion/util/Constants.java

@ -0,0 +1,19 @@
package com.bfd.docconversion.util;
import org.springframework.stereotype.Component;
/**
* @author guowei
*/
@Component
public class Constants {
public final static String STOP = "stop";
public final static String SCENES_ID = "scenes_id";
public final static String VERSION = "version";
public final static String UNDERLINE = "_";
}

39
src/main/java/com/bfd/docconversion/util/FileExtensionEnum.java

@ -0,0 +1,39 @@
package com.bfd.docconversion.util;
/**
* @author guowei
*/
public enum FileExtensionEnum {
/**doc**/
doc("doc"),
/**docx**/
docx("docx"),
/**xls**/
xls("xls"),
/**xlsx**/
xlsx("xlsx"),
/**ppt**/
ppt("ppt"),
/**pptx"**/
pptx("pptx"),
/**pdf**/
pdf("pdf");
private final String extension;
FileExtensionEnum(String extension) {
this.extension = extension;
}
public String getExtension() {
return extension;
}
public static FileExtensionEnum getByExtension(String extension) {
for (FileExtensionEnum fileExtension : values()) {
if (fileExtension.getExtension().equalsIgnoreCase(extension)) {
return fileExtension;
}
}
throw new IllegalArgumentException("Unsupported file extension: " + extension);
}
}

83
src/main/java/com/bfd/docconversion/util/KfkUtil.java

@ -0,0 +1,83 @@
package com.bfd.docconversion.util;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.util.Properties;
/**
* @author guowei
* kfk工具类
*/
@Component
@Slf4j
public class KfkUtil {
private static String topic;
private static String brokerList;
@Value("${crawl.kafka.topic}")
public void setTopic(String topic) {
KfkUtil.topic = topic;
}
@Value("${crawl.kafka.brokers}")
public void setBrokerList(String brokerList) {
KfkUtil.brokerList = brokerList;
}
private static KafkaProducer<String, String> kafkaProducer;
public static int num = 0;
/**
* 获取KafkaProducer实例
*/
public static KafkaProducer<String, String> getProducer() {
// synchronized (kafkaProducer) {
if (kafkaProducer == null) {
Properties props = new Properties();
//xxx服务器ip
props.put("bootstrap.servers", brokerList);
//所有follower都响应了才认为消息提交成功"committed"
props.put("acks", "all");
//retries = MAX 无限重试直到你意识到出现了问题:)
props.put("retries", 3);
//producer将试图批处理消息记录以减少请求次数.默认的批量处理消息字节数
props.put("batch.size", 16384);
//batch.size当批量的数据大小达到设定值后就会立即发送不顾下面的linger.ms
//延迟1ms发送这项设置将通过增加小的延迟来完成--不是立即发送一条记录producer将会等待给定的延迟时间以允许其他消息记录发送这些消息记录可以批量处理
props.put("linger.ms", 1);
//producer可以用来缓存数据的内存大小
props.put("buffer.memory", 33554432);
props.put("key.serializer",
"org.apache.kafka.common.serialization.StringSerializer");
props.put("value.serializer",
"org.apache.kafka.common.serialization.StringSerializer");
kafkaProducer = new KafkaProducer<String, String>(props);
}
// }
return kafkaProducer;
}
/**
* 关闭KafkaProducer实例
*/
public static void closeProducer() {
if (kafkaProducer != null) {
log.info("----------close producer----------");
kafkaProducer.close();
kafkaProducer = null;
}
}
public static void sendKafka(String resultData) {
KafkaProducer<String, String> producer = getProducer();
ProducerRecord<String, String> se = new ProducerRecord<String, String>(topic, resultData);
producer.send(se);
log.info("发送kafka成功");
// num++;
}
}

104
src/main/java/com/bfd/docconversion/util/MainHandler.java

@ -0,0 +1,104 @@
package com.bfd.docconversion.util;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileWriter;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.stereotype.Service;
import java.io.File;
import java.util.List;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedBlockingQueue;
/**
* @author guowei
*/
@Slf4j
@Service
public class MainHandler implements ApplicationRunner {
@Value("${crawl.task.taskData}")
private String taskPath;
@Override
public void run(ApplicationArguments args) throws Exception {
log.info("监测程序运行线程 start");
//停止处理
waitDown();
//启动加载缓存任务
readTask(taskPath, Config.taskQueue);
}
public static void readTask(String path, LinkedBlockingDeque queue) throws InterruptedException {
File file = new File(path);
if (file.exists()) {
List<String> tasks = null;
tasks = FileUtil.readLines(file, "UTF-8");
log.info("缓存文件有 " + tasks.size() + " 条数据");
for (String taskStr : tasks) {
log.info("读到缓存数据:" + taskStr);
System.out.println("读到缓存数据:" + taskStr);
JSONObject parse = JSONObject.parseObject(taskStr);
// JSONObject value = (JSONObject) parse.get("value");
// if (value.containsKey("result")){
// KfkUtil.sendKafka(JSON.toJSONString(value));
// log.info("此数据已经组装好,直接推送kfk");
// continue;
// }
queue.put(parse);
}
file.delete();
} else {
log.info("未找到缓存任务文件");
}
}
/**
* 结束触发钩子
*/
public void waitDown() {
Runtime.getRuntime().addShutdownHook(new Thread() {
@Override
public void run() {
// 停止线程
// Config.isStart = false;
log.info("stop-------");
try {
writeTsskToFile();
} catch (InterruptedException e) {
log.error("写出缓存异常,{}", e);
}
}
});
}
/**
* 任务持久化到硬盘
*/
public void writeTsskToFile() throws InterruptedException {
System.out.println(taskPath);
File file = new File(taskPath);
FileWriter fileWriter = new FileWriter(file);
if (!file.exists()) {
fileWriter = FileWriter.create(file);
}
while (Config.taskQueue.size() > 0) {
JSONObject take = Config.taskQueue.take();
String entryJson = JSON.toJSONString(take);
System.out.println("写入缓存数据:" + entryJson);
fileWriter.write(entryJson + "\r\n", true);
}
log.info("taskMap 缓存已输出");
}
}

325
src/main/java/com/bfd/docconversion/util/Utils.java

@ -0,0 +1,325 @@
package com.bfd.docconversion.util;
import cn.hutool.core.util.IdUtil;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.alibaba.fastjson2.JSONPath;
import com.aspose.cells.Workbook;
import com.aspose.slides.Presentation;
import lombok.extern.slf4j.Slf4j;
import okhttp3.*;
import org.springframework.stereotype.Component;
import com.aspose.pdf.Document;
import com.aspose.pdf.SaveFormat;
import java.io.*;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;
import java.util.concurrent.TimeUnit;
/**
* @author guowei
*/
@Component
@Slf4j
public class Utils {
/**
* 转换成pdf
*
* @param extension
* @param source
* @param target
* @throws Exception
*/
public static void asposeToPdf(String extension, InputStream source, ByteArrayOutputStream target) throws Exception {
switch (FileExtensionEnum.getByExtension(extension)) {
case doc:
case docx:
com.aspose.words.Document doc = new com.aspose.words.Document(source);
doc.save(target, com.aspose.words.SaveFormat.PDF);
break;
case xls:
case xlsx:
com.aspose.cells.Workbook excel = new com.aspose.cells.Workbook(source);
com.aspose.cells.PdfSaveOptions pdfSaveOptions = new com.aspose.cells.PdfSaveOptions();
// 单页显示防截断 防换行
pdfSaveOptions.setOnePagePerSheet(true);
excel.save(target, pdfSaveOptions);
excel.dispose();
break;
case ppt:
case pptx:
com.aspose.slides.Presentation ppt = new com.aspose.slides.Presentation(source);
ppt.save(target, com.aspose.slides.SaveFormat.Pdf);
ppt.dispose();
break;
default:
System.out.println("不支持的文件转换类型");
// throw new BaseException("不支持的文件转换类型");
}
}
/**
* pdf 转换
* @param extension
* @param source
* @param target
* @throws Exception
*/
public static void asposePdfTo(String extension, InputStream source, ByteArrayOutputStream target) throws Exception {
switch (FileExtensionEnum.getByExtension(extension)) {
case doc:
case docx:
case pdf:
// 设置字体替换
// FontSettings fontSettings = new FontSettings();
// FontSubstitutionSettings fontSubstitutionSettings = fontSettings.getSubstitutionSettings();
// fontSubstitutionSettings.getDefaultFontSubstitution().setDefaultFontName("Arial");
//
// // 加载系统字体
// FontSourceBase[] fontSources = fontSettings.getFontsSources();
// SystemFontSource systemFontSource = new SystemFontSource();
// FontSourceBase[] updatedFontSources = new FontSourceBase[fontSources.length + 1];
// System.arraycopy(fontSources, 0, updatedFontSources, 0, fontSources.length);
// updatedFontSources[fontSources.length] = systemFontSource;
// fontSettings.setFontsSources(updatedFontSources);
//
// // 指定加载选项以确保正确处理字体
// LoadOptions loadOptions = new LoadOptions();
// loadOptions.setFontSettings(fontSettings);
Document doc = new Document(source);
//全面支持DOC, DOCX, OOXML, RTF HTML, OpenDocument, PDF, EPUB, XPS, SWF 相互转换
doc.save(target, SaveFormat.DocX);
doc.close();
break;
// case xls:
// case xlsx:
// // Load PDF document
// Document excel = new Document(source);
// excel.save(target, SaveFormat.Excel);
// break;
// case ppt:
// case pptx:
// Document ppt = new Document(source);
// ppt.save(target, SaveFormat.Pptx);
// break;
default:
System.out.println("不支持的文件转换类型");
// throw new BaseException("不支持的文件转换类型");
}
}
// public static void convertFile(String inputFilePath, String outputFilePath) throws Exception {
// String inputExtension = getFileExtension(inputFilePath).toLowerCase();
// String outputExtension = getFileExtension(outputFilePath).toLowerCase();
//
// switch (inputExtension) {
// case "doc":
// case "docx":
// convertWord(inputFilePath, outputFilePath, outputExtension);
// break;
// case "xls":
// case "xlsx":
// convertExcel(inputFilePath, outputFilePath, outputExtension);
// break;
// case "ppt":
// case "pptx":
// convertPPT(inputFilePath, outputFilePath, outputExtension);
// break;
// case "pdf":
// convertPDF(inputFilePath, outputFilePath, outputExtension);
// break;
// default:
// throw new IllegalArgumentException("Unsupported file format: " + inputExtension);
// }
// }
private static void convertWord(String inputFilePath, String outputFilePath, String outputExtension) throws Exception {
com.aspose.words.Document doc = new com.aspose.words.Document(inputFilePath);
switch (outputExtension) {
case "pdf":
doc.save(outputFilePath, com.aspose.words.SaveFormat.PDF);
break;
default:
System.out.println("不支持的文件转换类型");
}
}
private static void convertExcel(String inputFilePath, String outputFilePath, String outputExtension) throws Exception {
Workbook workbook = new Workbook(inputFilePath);
switch (outputExtension) {
case "pdf":
workbook.save(outputFilePath, com.aspose.cells.SaveFormat.PDF);
break;
case "docx":
// Excel to Word conversion (Not directly supported)
ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
workbook.save(htmlStream, com.aspose.cells.SaveFormat.HTML);
ByteArrayInputStream htmlInputStream = new ByteArrayInputStream(htmlStream.toByteArray());
com.aspose.words.Document doc = new com.aspose.words.Document(htmlInputStream);
doc.save(outputFilePath, com.aspose.cells.SaveFormat.DOCX);
break;
case "xlsx":
workbook.save(outputFilePath, com.aspose.cells.SaveFormat.XLSX);
break;
case "pptx":
// Excel to PPTX conversion (Not directly supported)
ByteArrayOutputStream htmlStream2 = new ByteArrayOutputStream();
workbook.save(htmlStream2, com.aspose.cells.SaveFormat.HTML);
ByteArrayInputStream htmlInputStream2 = new ByteArrayInputStream(htmlStream2.toByteArray());
Presentation presentation = new Presentation(htmlInputStream2);
presentation.save(outputFilePath, com.aspose.slides.SaveFormat.Pptx);
break;
default:
throw new IllegalArgumentException("Unsupported conversion: Excel to " + outputExtension);
}
}
/**
* 获取文件扩展名
*
* @param path 文件路径
* @return 文件扩展名
*/
public static String getExtension(Path path) {
String fileName = path.getFileName().toString();
int dotIndex = fileName.lastIndexOf('.');
if (dotIndex == -1) {
throw new IllegalArgumentException("File without extension: " + fileName);
}
return fileName.substring(dotIndex + 1).toLowerCase();
}
public static Object jsonParse(String key, Map data) {
String[] keySplit = key.split(":");
String jsonPath = keySplit[1];
if (!data.containsKey(keySplit[0])) {
return "";
}
String dataJson = (String) data.get(keySplit[0]);
JSONObject dataJsonObject = JSON.parseObject(dataJson);
Object dataValue = JSONPath.eval(dataJsonObject, jsonPath);
return dataValue;
}
/**
* gofast 文件下载
*
* @param url
* @return
* @throws IOException
*/
public static InputStream gofastDownLoadFile(String url) {
OkHttpClient client = new OkHttpClient().newBuilder()
.readTimeout(60, TimeUnit.SECONDS)
.writeTimeout(60, TimeUnit.SECONDS)
.connectTimeout(60, TimeUnit.SECONDS)
.build();
MediaType mediaType = MediaType.parse("text/plain");
RequestBody body = RequestBody.create(mediaType, "");
Request request = new Request.Builder()
.url(url)
.method("GET", null)
.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36")
.build();
BufferedOutputStream out = null;
InputStream inputStream = null;
Response response = null;
try {
response = client.newCall(request).execute();
for (int i = 0; i < Config.NUM; i++) {
if (response.isSuccessful()) {
break;
} else {
response = client.newCall(request).execute();
System.out.println("gofast文件下载失败,file=" + url + ",第" + i + "次");
log.error("gofast文件下载失败,file=" + url + ",第" + i + "次");
Thread.sleep(3000);
i++;
}
}
inputStream = response.body().byteStream();
} catch (Exception e) {
e.printStackTrace();
log.error("gofast文件下载异常", e);
}
return inputStream;
}
public static String upLoadFile(String filePath) {
File file = new File(filePath);
String realFilename = filePath.substring(filePath.lastIndexOf(File.separator) + 1);
MultipartBody.Builder builder = new MultipartBody.Builder().setType(MultipartBody.FORM);
builder.addPart(Headers.of("Content-Disposition", "form-data; name=\"file\";filename=\"" + realFilename + "\""),
RequestBody.create(MediaType.parse("image/png"), file)
).addFormDataPart("output", "json").build();
RequestBody body = builder.build();
Request request = new Request.Builder().url(Config.gofastUrl).post(body).header("Expect", "100-continue").build();
OkHttpClient.Builder okBuilder = new OkHttpClient.Builder();
// 获得一个客户对象
OkHttpClient client = okBuilder.build();
Call call = client.newCall(request);
String html = "";
Response response = null;
int retry = 0;
do {
try {
response = call.execute();
html = response.body().string();
break;
} catch (IOException e) {
log.error("文档上传异常,file:" + filePath + ",重试" + retry + "次");
} finally {
response.close();
}
} while (retry >= 5);
file.delete();
return html;
}
public static void main(String[] args) throws Exception {
String filePath = "C:\\Users\\86150\\Desktop\\embed_watermark (1).pdf";
// Path path = Paths.get(filePath);
//// String extension = getExtension(path);
// String extension = "docx";
// System.out.println("文档转换: "+ extension + " --> PDF" );
// ByteArrayOutputStream target = new ByteArrayOutputStream();
// InputStream source = Files.newInputStream(path);
//// asposeToPdf(extension, source,target);
// asposePdfTo(extension,source,target);
//
// Files.write(Paths.get("C:\\Users\\86150\\Desktop\\embed_watermark (2).docx"), target.toByteArray());
// String s = upLoadFile(filePath);
// System.out.println(s);
String gofastUrl = "http://172.18.1.180:9980/group17/default/20240812/16/40/3/971260fd6cce96624965c692f709660b.pdf";
InputStream inputStream = gofastDownLoadFile(gofastUrl);
URL url = new URL(gofastUrl);
String newPath = url.getPath();
Path path = Paths.get(newPath);
String extension = Utils.getExtension(path);
ByteArrayOutputStream target = new ByteArrayOutputStream();
Utils.asposePdfTo(extension, inputStream,target);
filePath = "./files/"+ IdUtil.simpleUUID()+".docx";
Files.write(Paths.get(filePath), target.toByteArray());
}
// public static void main(String[] args) {
// String pdfFilePath = "C:\\Users\\86150\\Desktop\\百分点\\考试\\百分点019期新员工特训营-文化篇(终版)20210512.pdf";
// String wordFilePath = "C:\\Users\\86150\\Desktop\\百分点\\考试\\云学堂.docx";
//
// pdf2doc(pdfFilePath);
// System.out.println("PDF successfully converted to Word document.");
// }
}

40
src/main/resources/application.yml

@ -0,0 +1,40 @@
server:
port: 9955
crawl:
kafka:
topic: produce_analyze
brokers: 172.18.1.146:9092,172.18.1.147:9092,172.18.1.148:9092
task:
taskData: ./data/task.txt
threadNum: 3
#日志级别
logging:
level:
com:
bfd: INFO
#日志路径
log:
path: ./logs
spring:
boot:
admin:
client:
url: http://172.18.1.147:8001
instance:
service-base-url: http://172.18.1.147:9999
application:
name: 文档转换
management:
endpoints:
web:
exposure:
include: "*"
endpoint:
health:
show-details: always
health:
elasticsearch:
enabled: false
zookeeper:
connection-string: 172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181
publish-node: /analyze

38
src/main/resources/logback-spring.xml

@ -0,0 +1,38 @@
<configuration>
<!-- 属性文件:在properties文件中找到对应的配置项 -->
<springProperty scope="context" name="logging.path" source="logging.log.path"/>
<springProperty scope="context" name="logging.level" source="logging.level.com.bfd"/>
<!-- 默认的控制台日志输出,一般生产环境都是后台启动,这个没太大作用 -->
<!-- <appender name="STDOUT"
class="ch.qos.logback.core.ConsoleAppender">
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<Pattern>%d{HH:mm:ss.SSS} %-5level %logger{80} - %msg%n</Pattern>
</encoder>
</appender> -->
<appender name="GLMAPPER-LOGGERONE"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<append>true</append>
<filter class="ch.qos.logback.classic.filter.ThresholdFilter">
<level>${logging.level}</level>
</filter>
<file>
${logging.path}/crawlSchedule.log
<!-- ${logging.path}/sendKafka.log -->
</file>
<rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
<FileNamePattern>${logging.path}/crawlSchedule.log.%d{yyyy-MM-dd}</FileNamePattern>
<!-- <FileNamePattern>${logging.path}/sendKafka.log.%d{yyyy-MM-dd}</FileNamePattern> -->
<MaxHistory>7</MaxHistory>
</rollingPolicy>
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<root level="info">
<appender-ref ref="GLMAPPER-LOGGERONE"/>
<!-- <appender-ref ref="STDOUT"/> -->
</root>
</configuration>

13
src/test/java/com/bfd/doc_conversion/DocConversionApplicationTests.java

@ -0,0 +1,13 @@
package com.bfd.doc_conversion;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
@SpringBootTest
class DocConversionApplicationTests {
@Test
void contextLoads() {
}
}
Loading…
Cancel
Save