|
|
@ -30,6 +30,8 @@ import com.bfd.parse.utils.DataUtil; |
|
|
import com.bfd.parse.utils.DownLoadUtil; |
|
|
import com.bfd.parse.utils.DownLoadUtil; |
|
|
import com.bfd.parse.utils.SpringBootKafka; |
|
|
import com.bfd.parse.utils.SpringBootKafka; |
|
|
|
|
|
|
|
|
|
|
|
import kotlin.jvm.Throws; |
|
|
|
|
|
|
|
|
/** |
|
|
/** |
|
|
* @author jian.mao |
|
|
* @author jian.mao |
|
|
* @date 2024年2月4日 |
|
|
* @date 2024年2月4日 |
|
|
@ -99,7 +101,7 @@ public class FileTaskProcess implements Runnable{ |
|
|
* @return |
|
|
* @return |
|
|
* @throws IOException |
|
|
* @throws IOException |
|
|
*/ |
|
|
*/ |
|
|
private String readFileToSend(String path, String saveFilePath) throws IOException { |
|
|
|
|
|
|
|
|
private String readFileToSend(String path, String saveFilePath) throws Exception { |
|
|
StringBuffer sb = new StringBuffer(); |
|
|
StringBuffer sb = new StringBuffer(); |
|
|
DownLoadUtil.downloadFile(path, saveFilePath); |
|
|
DownLoadUtil.downloadFile(path, saveFilePath); |
|
|
if (saveFilePath.endsWith(Constants.DOC) || saveFilePath.endsWith(Constants.DOCX)) { |
|
|
if (saveFilePath.endsWith(Constants.DOC) || saveFilePath.endsWith(Constants.DOCX)) { |
|
|
@ -107,10 +109,13 @@ public class FileTaskProcess implements Runnable{ |
|
|
}else if(saveFilePath.endsWith(Constants.PDF)){ |
|
|
}else if(saveFilePath.endsWith(Constants.PDF)){ |
|
|
//pdf 纯文字版本 |
|
|
//pdf 纯文字版本 |
|
|
sb.append(readPdfFile(saveFilePath)); |
|
|
sb.append(readPdfFile(saveFilePath)); |
|
|
} else { |
|
|
|
|
|
|
|
|
} else if(saveFilePath.endsWith(Constants.TXT)){ |
|
|
for (String line : FileUtils.readLines(new File(saveFilePath))) { |
|
|
for (String line : FileUtils.readLines(new File(saveFilePath))) { |
|
|
sb.append(line); |
|
|
sb.append(line); |
|
|
} |
|
|
} |
|
|
|
|
|
} else { |
|
|
|
|
|
throw new Exception("未知文件类型。"); |
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
return sb.toString(); |
|
|
return sb.toString(); |
|
|
} |
|
|
} |
|
|
@ -154,6 +159,9 @@ public class FileTaskProcess implements Runnable{ |
|
|
// 提取文本内容 |
|
|
// 提取文本内容 |
|
|
PDFTextStripper pdfStripper = new PDFTextStripper(); |
|
|
PDFTextStripper pdfStripper = new PDFTextStripper(); |
|
|
String text = pdfStripper.getText(document); |
|
|
String text = pdfStripper.getText(document); |
|
|
|
|
|
if(text.replace("\r\n", Constants.EMPTY).replace("\n", Constants.EMPTY).replace("\r", Constants.EMPTY).equals(Constants.EMPTY)) { |
|
|
|
|
|
text = "文档内容为空"; |
|
|
|
|
|
} |
|
|
// 关闭文档 |
|
|
// 关闭文档 |
|
|
document.close(); |
|
|
document.close(); |
|
|
return text; |
|
|
return text; |
|
|
@ -182,4 +190,17 @@ public class FileTaskProcess implements Runnable{ |
|
|
log.error("删除文件异常:{}",filePath); |
|
|
log.error("删除文件异常:{}",filePath); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) throws IOException { |
|
|
|
|
|
// 加载PDF文件 |
|
|
|
|
|
File file = new File("C:\\Users\\55007\\Downloads\\2025001_04.pdf"); |
|
|
|
|
|
PDDocument document = PDDocument.load(file); |
|
|
|
|
|
// 提取文本内容 |
|
|
|
|
|
PDFTextStripper pdfStripper = new PDFTextStripper(); |
|
|
|
|
|
String text = pdfStripper.getText(document).replace("\r\n", ""); |
|
|
|
|
|
|
|
|
|
|
|
System.out.println(text.equals("")); |
|
|
|
|
|
// 关闭文档 |
|
|
|
|
|
document.close(); |
|
|
|
|
|
} |
|
|
} |
|
|
} |