diff --git a/src/main/java/com/bfd/parse/entity/Constants.java b/src/main/java/com/bfd/parse/entity/Constants.java index e890b51..cd2652b 100644 --- a/src/main/java/com/bfd/parse/entity/Constants.java +++ b/src/main/java/com/bfd/parse/entity/Constants.java @@ -161,6 +161,7 @@ public class Constants { public static final String XLS = "xls"; public static final String XLSX = "xlsx"; public static final String PDF = "pdf"; + public static final String TXT = "txt"; public static final String DOC = "doc"; public static final String DOCX = "docx"; public static final String PPT = "ppt"; diff --git a/src/main/java/com/bfd/parse/process/FileTaskProcess.java b/src/main/java/com/bfd/parse/process/FileTaskProcess.java index eca46c3..8b21d5c 100644 --- a/src/main/java/com/bfd/parse/process/FileTaskProcess.java +++ b/src/main/java/com/bfd/parse/process/FileTaskProcess.java @@ -30,6 +30,8 @@ import com.bfd.parse.utils.DataUtil; import com.bfd.parse.utils.DownLoadUtil; import com.bfd.parse.utils.SpringBootKafka; +import kotlin.jvm.Throws; + /** * @author jian.mao * @date 2024年2月4日 @@ -99,7 +101,7 @@ public class FileTaskProcess implements Runnable{ * @return * @throws IOException */ - private String readFileToSend(String path, String saveFilePath) throws IOException { + private String readFileToSend(String path, String saveFilePath) throws Exception { StringBuffer sb = new StringBuffer(); DownLoadUtil.downloadFile(path, saveFilePath); if (saveFilePath.endsWith(Constants.DOC) || saveFilePath.endsWith(Constants.DOCX)) { @@ -107,10 +109,13 @@ public class FileTaskProcess implements Runnable{ }else if(saveFilePath.endsWith(Constants.PDF)){ //pdf 纯文字版本 sb.append(readPdfFile(saveFilePath)); - } else { - for (String line : FileUtils.readLines(new File(saveFilePath))) { + } else if(saveFilePath.endsWith(Constants.TXT)){ + for (String line : FileUtils.readLines(new File(saveFilePath))) { sb.append(line); } + } else { + throw new Exception("未知文件类型。"); + } return sb.toString(); } @@ -154,6 +159,9 @@ public class FileTaskProcess implements Runnable{ // 提取文本内容 PDFTextStripper pdfStripper = new PDFTextStripper(); String text = pdfStripper.getText(document); + if(text.replace("\r\n", Constants.EMPTY).replace("\n", Constants.EMPTY).replace("\r", Constants.EMPTY).equals(Constants.EMPTY)) { + text = "文档内容为空"; + } // 关闭文档 document.close(); return text; @@ -182,4 +190,17 @@ public class FileTaskProcess implements Runnable{ log.error("删除文件异常:{}",filePath); } } + + public static void main(String[] args) throws IOException { + // 加载PDF文件 + File file = new File("C:\\Users\\55007\\Downloads\\2025001_04.pdf"); + PDDocument document = PDDocument.load(file); + // 提取文本内容 + PDFTextStripper pdfStripper = new PDFTextStripper(); + String text = pdfStripper.getText(document).replace("\r\n", ""); + + System.out.println(text.equals("")); + // 关闭文档 + document.close(); + } }