From 41e92250200d430ac93259e08440d46bb33a1149 Mon Sep 17 00:00:00 2001
From: 55007 <55007@maojian>
Date: Tue, 7 Jan 2025 16:14:56 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=87=E5=AD=97=E7=BF=BB=E8=AF=91=E5=BA=94?=
=?UTF-8?q?=E7=94=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.classpath | 40 +++
.project | 23 ++
.settings/org.eclipse.core.resources.prefs | 4 +
.settings/org.eclipse.jdt.core.prefs | 9 +
.settings/org.eclipse.m2e.core.prefs | 4 +
README.md | 1 +
pom.xml | 198 ++++++++++++++
.../crawl_translate/CrawlTranslateApplication.java | 69 +++++
.../crawl_translate/controller/ApiController.java | 32 +++
.../bfd/crawl_translate/service/MainHandler.java | 105 ++++++++
.../service/TranslateChatGptService.java | 296 +++++++++++++++++++++
.../java/com/bfd/crawl_translate/utils/Config.java | 28 ++
.../com/bfd/crawl_translate/utils/Constants.java | 19 ++
.../crawl_translate/utils/ContentException.java | 13 +
.../bfd/crawl_translate/utils/ESClientFactory.java | 57 ++++
.../com/bfd/crawl_translate/utils/HttpUtil.java | 274 +++++++++++++++++++
.../com/bfd/crawl_translate/utils/KfkUtil.java | 81 ++++++
.../com/bfd/crawl_translate/utils/PauseTool.java | 92 +++++++
.../utils/PercentTransalteUtil.java | 166 ++++++++++++
.../bfd/crawl_translate/utils/TranslateUtil.java | 165 ++++++++++++
src/main/resources/application.yml | 59 ++++
src/main/resources/logback-spring.xml | 38 +++
22 files changed, 1773 insertions(+)
create mode 100644 .classpath
create mode 100644 .project
create mode 100644 .settings/org.eclipse.core.resources.prefs
create mode 100644 .settings/org.eclipse.jdt.core.prefs
create mode 100644 .settings/org.eclipse.m2e.core.prefs
create mode 100644 README.md
create mode 100644 pom.xml
create mode 100644 src/main/java/com/bfd/crawl_translate/CrawlTranslateApplication.java
create mode 100644 src/main/java/com/bfd/crawl_translate/controller/ApiController.java
create mode 100644 src/main/java/com/bfd/crawl_translate/service/MainHandler.java
create mode 100644 src/main/java/com/bfd/crawl_translate/service/TranslateChatGptService.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/Config.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/Constants.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/ContentException.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/ESClientFactory.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/HttpUtil.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/KfkUtil.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/PauseTool.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/PercentTransalteUtil.java
create mode 100644 src/main/java/com/bfd/crawl_translate/utils/TranslateUtil.java
create mode 100644 src/main/resources/application.yml
create mode 100644 src/main/resources/logback-spring.xml
diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..8d95b91
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,40 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.project b/.project
new file mode 100644
index 0000000..9190480
--- /dev/null
+++ b/.project
@@ -0,0 +1,23 @@
+
+
+ analyst_translate
+
+
+
+
+
+ org.eclipse.jdt.core.javabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.jdt.core.javanature
+ org.eclipse.m2e.core.maven2Nature
+
+
diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs
new file mode 100644
index 0000000..cf6931b
--- /dev/null
+++ b/.settings/org.eclipse.core.resources.prefs
@@ -0,0 +1,4 @@
+eclipse.preferences.version=1
+encoding//src/main/java=UTF-8
+encoding//src/main/resources=UTF-8
+encoding/=UTF-8
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..0ada971
--- /dev/null
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,9 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.methodParameters=generate
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
+org.eclipse.jdt.core.compiler.compliance=1.8
+org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
+org.eclipse.jdt.core.compiler.release=disabled
+org.eclipse.jdt.core.compiler.source=1.8
diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs
new file mode 100644
index 0000000..14b697b
--- /dev/null
+++ b/.settings/org.eclipse.m2e.core.prefs
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1380d0d
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+bfd翻译应用
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..4ffd93f
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,198 @@
+
+
+ 4.0.0
+
+ org.springframework.boot
+ spring-boot-starter-parent
+ 2.2.4.RELEASE
+
+
+ com.bfd
+ crawl_translate
+ 0.0.1-SNAPSHOT
+ crawl_translate
+ crawl_translate
+
+ 1.8
+
+
+
+ org.springframework.boot
+ spring-boot-starter-web
+
+
+
+ mysql
+ mysql-connector-java
+ runtime
+
+
+ org.projectlombok
+ lombok
+ true
+
+
+ org.springframework.boot
+ spring-boot-starter-test
+ test
+
+
+ org.elasticsearch
+ elasticsearch
+ 6.0.0
+
+
+ org.elasticsearch.client
+ elasticsearch-rest-client
+ 6.0.0
+
+
+ org.elasticsearch.client
+ elasticsearch-rest-high-level-client
+ 6.0.0
+
+
+ org.elasticsearch.client
+ elasticsearch-rest-client
+
+
+
+
+ com.squareup.okhttp3
+ okhttp
+ 3.11.0
+
+
+ com.alibaba
+ fastjson
+ 2.0.12
+
+
+ cn.hutool
+ hutool-all
+ 5.8.9
+
+
+ org.apache.kafka
+ kafka-clients
+ 2.7.1
+
+
+ org.apache.poi
+ poi-ooxml
+ 5.2.2
+
+
+
+ de.codecentric
+ spring-boot-admin-client
+ 2.2.4
+
+
+
+ org.redisson
+ redisson-spring-boot-starter
+ 3.13.6
+
+
+ org.springframework.boot
+ spring-boot-starter-data-redis
+
+
+ org.apache.curator
+ curator-framework
+ 5.2.0
+
+
+ org.apache.curator
+ curator-recipes
+ 5.2.0
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+
+ *.properties
+ *.yml
+ *.yaml
+
+
+
+
+ com.bfd.crawl_translate.CrawlTranslateApplication
+
+ true
+
+ lib/
+
+ false
+
+
+
+ lib/pauseTool-1.0.jar config/
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ copy
+ package
+
+ copy-dependencies
+
+
+ ${project.build.directory}/lib/
+
+
+
+
+
+
+ maven-resources-plugin
+
+
+ copy-resources
+ package
+
+ copy-resources
+
+
+
+
+
+ src/main/resources/
+
+ *.properties
+ *.yml
+ *.yaml
+
+
+
+ ${project.build.directory}/config
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+ 8
+ 8
+
+
+
+
+
+
diff --git a/src/main/java/com/bfd/crawl_translate/CrawlTranslateApplication.java b/src/main/java/com/bfd/crawl_translate/CrawlTranslateApplication.java
new file mode 100644
index 0000000..b781f4c
--- /dev/null
+++ b/src/main/java/com/bfd/crawl_translate/CrawlTranslateApplication.java
@@ -0,0 +1,69 @@
+package com.bfd.crawl_translate;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.context.ConfigurableApplicationContext;
+import org.springframework.data.redis.core.StringRedisTemplate;
+import org.springframework.scheduling.annotation.EnableScheduling;
+import org.springframework.scheduling.annotation.Scheduled;
+
+import com.bfd.crawl_translate.service.TranslateChatGptService;
+import com.bfd.crawl_translate.utils.Config;
+import com.bfd.crawl_translate.utils.HttpUtil;
+import com.bfd.crawl_translate.utils.PauseTool;
+
+import cn.hutool.core.thread.ThreadFactoryBuilder;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Spring Boot entry point of the translation service: boots the context, prepares the
+ * pause/resume tooling and starts the translation worker.
+ */
+@SpringBootApplication
+@Slf4j
+@EnableScheduling
+public class CrawlTranslateApplication {
+    @Autowired
+    TranslateChatGptService translateChatGptService;
+    @Autowired
+    private StringRedisTemplate stringRedisTemplate;
+    @Value("${zookeeper.connection-string}")
+    private String connectionString;
+    @Value("${zookeeper.publish-node}")
+    private String nodePath;
+
+    public static void main(String[] args) {
+        ConfigurableApplicationContext applicationContext = SpringApplication.run(CrawlTranslateApplication.class, args);
+        applicationContext.getBean(CrawlTranslateApplication.class).start();
+    }
+
+    /** Fetches the API token, prepares the pause-state tooling and launches the translation worker. */
+    public void start() {
+        log.info("----------CrawlTranslateApplication start success----------");
+        HttpUtil.getToken();
+        // Prepare pause state before the worker runs: the worker consults PauseTool.CACHE for every task
+        // (assumes initializeRedisCache populates that cache - TODO confirm against PauseTool).
+        PauseTool pauseTool = new PauseTool();
+        pauseTool.initializeRedisCache(stringRedisTemplate);
+        pauseTool.setupZookeeperListener(connectionString, nodePath);
+        // Hutool's setNamePrefix appends the thread counter itself; a "%d" placeholder would be emitted literally.
+        ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNamePrefix("crawl-pool-").build();
+        ExecutorService singleThreadPool = new ThreadPoolExecutor(10, 20, 100L, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(1024), namedThreadFactory, new ThreadPoolExecutor.AbortPolicy());
+        singleThreadPool.execute(translateChatGptService);
+        // NOTE: despite its wording, this message marks the end of start-up, not a shutdown.
+        log.info("----------CrawlTranslateApplication stop success----------");
+    }
+
+    /** Logs the number of queued translation tasks on the schedule given by crawl.cron.size_cron. */
+    @Scheduled(cron = "${crawl.cron.size_cron}")
+    public void getQueueSize() {
+        log.info("--------->taskQueue length == "+ Config.chatGptTranslateQueue.size());
+    }
+}
diff --git a/src/main/java/com/bfd/crawl_translate/controller/ApiController.java b/src/main/java/com/bfd/crawl_translate/controller/ApiController.java
new file mode 100644
index 0000000..31aec3e
--- /dev/null
+++ b/src/main/java/com/bfd/crawl_translate/controller/ApiController.java
@@ -0,0 +1,32 @@
+package com.bfd.crawl_translate.controller;
+
+import com.alibaba.fastjson.JSON;
+import com.bfd.crawl_translate.utils.Config;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.web.bind.annotation.*;
+
+import java.util.Map;
+
+/**
+ * REST endpoint that accepts translation requests and queues them for the worker.
+ * @author guowei
+ */
+@Slf4j
+@RestController
+@RequestMapping(value = "/chatGpt")
+@CrossOrigin(origins = "*", maxAge = 3600)
+public class ApiController {
+    /** Puts the raw request body on the translation queue and returns a short status text. */
+    @RequestMapping(value = "/translate", method = RequestMethod.POST, produces = "application/json")
+    public String getchannelitems(@RequestBody String RequestStr) {
+        log.info("收到gpt翻译请求:{}", RequestStr);
+        try {
+            Config.chatGptTranslateQueue.put(RequestStr);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt(); // keep the interrupt visible to the calling thread
+            log.error("推送队列失败", e);
+            return "TranslationAPi Failed"; // the task was not queued, so do not claim success
+        }
+        return "TranslationAPi Successfully";
+    }
+}
diff --git a/src/main/java/com/bfd/crawl_translate/service/MainHandler.java b/src/main/java/com/bfd/crawl_translate/service/MainHandler.java
new file mode 100644
index 0000000..c41559f
--- /dev/null
+++ b/src/main/java/com/bfd/crawl_translate/service/MainHandler.java
@@ -0,0 +1,105 @@
+package com.bfd.crawl_translate.service;
+
+import cn.hutool.core.io.FileUtil;
+import cn.hutool.core.io.file.FileWriter;
+import com.bfd.crawl_translate.utils.Config;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.io.FileUtils;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.ApplicationArguments;
+import org.springframework.boot.ApplicationRunner;
+import org.springframework.stereotype.Service;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.LinkedBlockingQueue;
+
+
+/**
+ * Persists queued translation tasks across restarts: pending tasks are written to a file
+ * on JVM shutdown and loaded back into the queue on the next start-up.
+ *
+ * @author jian.mao
+ * @date 2023-11-03
+ */
+@Slf4j
+@Service
+public class MainHandler implements ApplicationRunner {
+
+    /** Path of the file used to persist unfinished tasks. */
+    @Value("${crawl.task.taskData}")
+    private String taskPath;
+
+    /**
+     * Registers the shutdown hook and then restores tasks saved by a previous run.
+     *
+     * @param args application arguments (unused)
+     */
+    @Override
+    public void run(ApplicationArguments args) throws Exception {
+        waitDown();
+        readTask(taskPath, Config.chatGptTranslateQueue);
+    }
+
+    /**
+     * Loads every line of the file at {@code path} into {@code queue} and deletes the file.
+     * Does nothing when the file is missing; keeps the file when it cannot be read or when
+     * loading is interrupted, so that no persisted task is lost.
+     *
+     * @param path  location of the persisted task file
+     * @param queue destination queue for the restored tasks
+     */
+    public static void readTask(String path, LinkedBlockingQueue<String> queue) {
+        File file = new File(path);
+        if (!file.exists()) {
+            return;
+        }
+        List<String> tasks;
+        try {
+            tasks = FileUtils.readLines(file, "UTF-8");
+        } catch (IOException e) {
+            // Previously this fell through and hit a NullPointerException on the null list.
+            log.error("read task file failed, path:{}", path, e);
+            return;
+        }
+        for (String taskStr : tasks) {
+            try {
+                log.info("读到缓存数据:{}", taskStr);
+                queue.put(taskStr);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                log.error("restore task interrupted, task file kept, path:{}", path, e);
+                return;
+            }
+        }
+        // Everything is queued; the file is no longer needed.
+        file.delete();
+    }
+
+    /** Registers a JVM shutdown hook that stops the worker loop and persists pending tasks. */
+    public void waitDown() {
+        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
+            // Clear the flag the worker loop tests before each iteration.
+            Config.isStart = false;
+            log.info("stop-------");
+            writeTsskToFile();
+        }));
+    }
+
+    /**
+     * Drains the task queue into the task file, one task per line (appending).
+     * Uses the non-blocking {@code poll()} so that a consumer emptying the queue concurrently
+     * cannot leave the shutdown hook blocked in {@code take()}.
+     */
+    public void writeTsskToFile() {
+        log.info("task file:{}", taskPath);
+        FileWriter fileWriter = new FileWriter(new File(taskPath));
+        String task;
+        while ((task = Config.chatGptTranslateQueue.poll()) != null) {
+            log.info("写入缓存数据:{}", task);
+            fileWriter.write(task + "\r\n", true);
+        }
+        log.info("taskQueue write is file end");
+    }
+}
diff --git a/src/main/java/com/bfd/crawl_translate/service/TranslateChatGptService.java b/src/main/java/com/bfd/crawl_translate/service/TranslateChatGptService.java
new file mode 100644
index 0000000..48e5670
--- /dev/null
+++ b/src/main/java/com/bfd/crawl_translate/service/TranslateChatGptService.java
@@ -0,0 +1,296 @@
+package com.bfd.crawl_translate.service;
+
+import cn.hutool.core.util.IdUtil;
+import com.alibaba.fastjson2.JSON;
+import com.alibaba.fastjson2.JSONObject;
+import com.alibaba.fastjson2.JSONPath;
+import com.bfd.crawl_translate.utils.*;
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.stereotype.Service;
+
+import java.text.BreakIterator;
+import java.util.*;
+
+/**
+ * @author guowei
+ */
+@Service
+@Slf4j
+public class TranslateChatGptService implements Runnable {
+ /**
+  * Worker loop. While {@code Config.isStart} is true it takes queued JSON requests, extracts the
+  * text selected by {@code input.datasource}, translates it and passes the request, extended
+  * with a {@code result} entry, to {@code KfkUtil.sendKafka}.
+  */
+ @SneakyThrows
+ @Override
+ public void run() {
+ while (Config.isStart) {
+ // Per-task accumulators: resultMap is serialized to JSON and stored under results["results"].
+ Map resultMap = new HashMap<>(32);
+ resultMap.put("isLast",1);
+ Map results = new HashMap<>();
+ String content = "";
+ resultMap.put("source",content);
+ JSONObject takeJson = new JSONObject();
+ // Nothing queued: log and sleep 10 seconds before checking again.
+ if (Config.chatGptTranslateQueue.size() == 0) {
+ log.info("no task chatgpt translate task");
+ Thread.sleep(1000 * 10);
+ } else {
+ try {
+ String take = Config.chatGptTranslateQueue.take();
+ takeJson = JSONObject.parseObject(take);
+
+ // A task is processed only while the key scenes id + Constants.UNDERLINE + version is present in PauseTool.CACHE.
+ Integer scense_id = (Integer) takeJson.get(Constants.SCENES_ID);
+ Integer version = (Integer) takeJson.get(Constants.VERSION);
+ if (!PauseTool.CACHE.containsKey(scense_id + Constants.UNDERLINE + version)) {
+ log.info("暂停任务:{}", JSONObject.toJSONString(takeJson));
+ // NOTE(review): this continue still runs the finally block below, so a paused task is sent to Kafka with an empty result - confirm this is intended.
+ continue;
+ }
+ JSONObject input = takeJson.getJSONObject("input");
+ String fromLanguage = input.getString("fromLanguage");
+ String toLanguage = input.getString("toLanguage");
+ JSONObject output = takeJson.getJSONObject("output");
+ // When the output mapping declares an "id" field, a random UUID is generated for it.
+ if (output.containsKey("id")) {
+ resultMap.put("id", IdUtil.randomUUID());
+ }
+ if (!output.containsKey("content")) {
+ throw new ContentException("output缺少content");
+ }
+ // datasource example: 3_OCR识别内容:$['content']#json#$['attrbuite']
+ // The part before ':' names the field of "data" to read; the optional part after it is a JSONPath applied to that field.
+ String datasource = input.getString("datasource");
+ System.out.println("datasourcec:" + datasource);
+ String[] split = datasource.split(":");
+ if (split.length == 0) {
+ log.error("datasource为空");
+ throw new NullPointerException();
+ }
+ String key = split[0];
+ JSONObject data = takeJson.getJSONObject("data");
+ String value = data.getString(key);
+ // A null value makes isEmpty() throw NullPointerException, which ends in the same handler as the explicit throw below.
+ if (value.isEmpty()) {
+ log.error("内容为空");
+ throw new NullPointerException();
+ }
+ System.out.println("value:" + value);
+ if (split.length > 1 && !split[1].isEmpty()) {
+ if (split[1].contains("#json#")) {
+ // Two paths separated by "#json#": the first must yield a JSON object, the second is evaluated on that object.
+ String[] splitContent = split[1].split("#json#");
+ System.out.println(splitContent[0] + ":" + splitContent[1]);
+ JSONObject jsonObject = JSON.parseObject(value);
+ JSONObject jsonObject1 = (JSONObject) JSONPath.eval(jsonObject, splitContent[0]);
+ content = (String) JSONPath.eval(jsonObject1, splitContent[1]);
+ } else {
+ // Single path evaluated on the JSON-parsed field value.
+ JSONObject jsonObject = JSON.parseObject(value);
+ content = (String) JSONPath.eval(jsonObject, split[1]);
+ }
+
+ } else {
+ // No path given: the field value itself is the text to translate.
+ content = value;
+ }
+ resultMap.put("source",content);
+
+ String translateContent = "";
+ // Translate
+ translateContent = translate(fromLanguage, toLanguage, content);
+ // translate() reports failure through this literal; turn it into an exception so the generic handler builds the failure result.
+ if (translateContent.equals("翻译失败")){
+ throw new Exception();
+ }
+
+ // Success: status 1 together with the translated content.
+ resultMap.put("content", translateContent);
+ results.put("results", JSON.toJSONString(resultMap));
+ results.put("status",1);
+ results.put("message","成功");
+ takeJson.put("result", results);
+ System.out.println("处理后:" + JSON.toJSONString(takeJson));
+ } catch (NullPointerException nullPointerException) {
+ // NullPointerException doubles as the "required field missing" signal (thrown explicitly above as well as implicitly); status 2.
+ log.error("关键字段为空", nullPointerException);
+ resultMap.put("content", "关键字段为空");
+ results.put("results", JSON.toJSONString(resultMap));
+ results.put("status",2);
+ results.put("message","关键字段为空");
+ takeJson.put("result", results);
+ System.out.println("处理后:" + JSON.toJSONString(takeJson));
+ } catch (ContentException contentException) {
+ // Thrown above when the output mapping has no "content" key; status 2.
+ System.out.println("没有content!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
+ resultMap.put("content", "关键字段为空");
+ results.put("results", JSON.toJSONString(resultMap));
+ results.put("status",2);
+ results.put("message","内容为空");
+ takeJson.put("result", results);
+ System.out.println("处理后:" + JSON.toJSONString(takeJson));
+ } catch (Throwable e) {
+ // Any other failure, including the translation-failed exception raised above; status 2.
+ log.error("异常", e);
+ resultMap.put("content", "翻译失败");
+ results.put("results", JSON.toJSONString(resultMap));
+ results.put("status",2);
+ results.put("message","失败");
+ } finally {
+ // Runs on every path out of the try block: attach the result map and hand the message to Kafka.
+ takeJson.put("result", results);
+ System.out.println("处理后:" + JSON.toJSONString(takeJson));
+ KfkUtil.sendKafka(JSON.toJSONString(takeJson));
+ }
+ log.info("处理完成");
+ }
+ }
+ }
+
+ /**
+  * Translates {@code content} from {@code fromLanguage} into {@code toLanguage}.
+  *
+  * <p>Non-empty content is cut into sentence-level parts by {@code splitText}, regrouped into
+  * larger chunks by {@code joinSentences} (both called with a limit of 4800), and every chunk is
+  * translated by {@code parse}; the translated chunks are concatenated in order. The first
+  * chunk whose translation comes back empty aborts the whole call. Empty content is handed to
+  * {@code parse} unchanged.
+  *
+  * @param fromLanguage source language code, passed through to {@code parse}
+  * @param toLanguage   target language code, passed through to {@code parse}
+  * @param content      text to translate; must not be {@code null}
+  * @return the translated text, or the literal {@code "翻译失败"} when a chunk could not be
+  *         translated or no chunk was produced
+  * @throws Throwable kept in the signature for existing callers
+  */
+ public String translate(String fromLanguage, String toLanguage, String content) throws Throwable {
+     if (content.isEmpty()) {
+         String emptyResult = parse(fromLanguage, toLanguage, content);
+         if (!emptyResult.isEmpty()) {
+             return emptyResult;
+         }
+         log.error("content翻译失败,sentence:" + content);
+         return "翻译失败";
+     }
+
+     List<String> chunks = joinSentences(splitText(content, 4800), 4800);
+     if (chunks.size() == 0) {
+         log.info("句子为空");
+         return "翻译失败";
+     }
+
+     StringBuilder translated = new StringBuilder();
+     for (String chunk : chunks) {
+         String piece = parse(fromLanguage, toLanguage, chunk);
+         if (piece.isEmpty()) {
+             log.error("content翻译失败,sentence:" + chunk);
+             return "翻译失败";
+         }
+         translated.append(piece);
+     }
+     return translated.toString();
+ }
+
+ /**
+  * Translates a single piece of text through {@code HttpUtil.getText}.
+  *
+  * <p>When the source language contains "auto", the target is "zh" and
+  * {@code HttpUtil.getLanguage2} reports "zh" for the text, the source language is switched to
+  * "zh-tw" before translating.
+  *
+  * @param fromLanguage source language code
+  * @param toLanguage   target language code
+  * @param content      text to translate
+  * @return the translated text, or an empty string when {@code content} is empty, the call
+  *         did not report success, or no result text was returned
+  */
+ public static String parse(String fromLanguage,String toLanguage,String content) {
+     if (content.isEmpty()) {
+         return "";
+     }
+     // Constant-first equals: a null target language or detection result must not raise NullPointerException.
+     if (fromLanguage.contains("auto") && "zh".equals(toLanguage)
+             && "zh".equals(HttpUtil.getLanguage2(content))) {
+         log.info("检测到是自动语言-->中文,并且原文检测为中文,使用繁体中文模型");
+         fromLanguage = "zh-tw";
+     }
+     Map<?, ?> translateResult = HttpUtil.getText(fromLanguage, toLanguage, content);
+     // Boolean.TRUE.equals avoids the unboxing NullPointerException of "(boolean) get(...)" when the key is absent.
+     if (!Boolean.TRUE.equals(translateResult.get("isSuccess"))) {
+         log.error("翻译失败,text:{},json:{}", content,JSON.toJSONString(translateResult));
+         return "";
+     }
+     String translated = (String) translateResult.get("result");
+     // Callers test the return value with isEmpty(); never hand them null.
+     return translated == null ? "" : translated;
+ }
+
+ /**
+  * Splits {@code text} into sentences (default-locale {@link BreakIterator}); a sentence longer
+  * than {@code maxLength} is further cut into consecutive pieces by {@code splitLongSentence}.
+  *
+  * @param text      text to split; an empty string yields an empty list
+  * @param maxLength maximum length, in chars, of each returned part
+  * @return the parts in their original order; concatenating them reproduces {@code text}
+  * @throws IllegalArgumentException if a sentence has to be cut and {@code maxLength} is not positive
+  */
+ public static List<String> splitText(String text, int maxLength) {
+     List<String> parts = new ArrayList<>();
+     BreakIterator boundary = BreakIterator.getSentenceInstance();
+     boundary.setText(text);
+     int start = boundary.first();
+     for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
+         String sentence = text.substring(start, end);
+         if (sentence.length() > maxLength) {
+             parts.addAll(splitLongSentence(sentence, maxLength));
+         } else {
+             parts.add(sentence);
+         }
+     }
+     return parts;
+ }
+
+ /**
+  * Cuts an over-long sentence into consecutive pieces of at most {@code maxLength} chars without
+  * separating a surrogate pair (a pair is kept whole even when {@code maxLength} is 1).
+  *
+  * @throws IllegalArgumentException if {@code maxLength} is not positive (the loop could not advance)
+  */
+ private static List<String> splitLongSentence(String sentence, int maxLength) {
+     if (maxLength <= 0) {
+         throw new IllegalArgumentException("maxLength must be positive: " + maxLength);
+     }
+     List<String> subParts = new ArrayList<>();
+     int start = 0;
+     while (start < sentence.length()) {
+         int end = Math.min(sentence.length(), start + maxLength);
+         if (end < sentence.length()
+                 && Character.isHighSurrogate(sentence.charAt(end - 1))
+                 && Character.isLowSurrogate(sentence.charAt(end))) {
+             // Move the cut in front of the pair; if that would produce an empty piece, take the whole pair.
+             end = (end - 1 > start) ? end - 1 : end + 1;
+         }
+         subParts.add(sentence.substring(start, end));
+         start = end;
+     }
+     return subParts;
+ }
+
+ /**
+  * Greedily packs consecutive sentences into paragraphs of at most {@code maxLength} characters,
+  * joining the sentences inside a paragraph with a single space.
+  *
+  * <p>The separating space counts against the limit. A sentence that is itself longer than
+  * {@code maxLength} becomes a paragraph of its own, and no empty paragraph is emitted.
+  *
+  * @param sentences sentences in reading order
+  * @param maxLength maximum paragraph length in characters
+  * @return the paragraphs in order; empty when {@code sentences} is empty
+  */
+ public static List<String> joinSentences(List<String> sentences, int maxLength) {
+     List<String> paragraphs = new ArrayList<>();
+     StringBuilder current = new StringBuilder();
+
+     for (String sentence : sentences) {
+         // One extra character is needed for the space unless the paragraph is still empty.
+         int separator = current.length() > 0 ? 1 : 0;
+         if (current.length() + separator + sentence.length() <= maxLength) {
+             if (separator == 1) {
+                 current.append(' ');
+             }
+             current.append(sentence);
+             continue;
+         }
+         // Limit reached: flush what has been collected (if anything) and start over with this sentence.
+         if (current.length() > 0) {
+             paragraphs.add(current.toString());
+         }
+         current = new StringBuilder(sentence);
+     }
+     if (current.length() > 0) {
+         paragraphs.add(current.toString());
+     }
+     return paragraphs;
+ }
+
+ public static void main(String[] args) {
+ String content = "Administration Show submenu for “Administration”” President Joe Biden Vice President Kamala Harris First Lady Dr. Jill Biden Second Gentleman Douglas Emhoff The Cabinet Executive Offices Show submenu for “Executive Offices”” Council of Economic Advisers Council on Environmental Quality Domestic Policy Council Gender Policy Council National Economic Council National Security Council National Space Council Office of Intergovernmental Affairs Office of Management and Budget Office of the National Cyber Director Office of National Drug Control Policy Office of Public Engagement Office of Science and Technology Policy Office of the United States Trade Representative Climate Policy Office Presidential Personnel Office Priorities Briefing Room The White House Show submenu for “The White House”” Presidents First Families The Grounds Our Government Get Involved Show submenu for “Get Involved”” Write or Call The White House Join Us White House Fellows White House Internship Program The Record Disclosures Espa?ol Contact Us Privacy Policy Copyright Policy Accessibility Statement InstagramOpens in a new window FacebookOpens in a new window XOpens in a new window YouTubeOpens in a new window The White House 1600 Pennsylvania Ave NW Washington, DC 20500 April 25, 2024 FACT SHEET: President?Biden Announces New Workforce Hubs to Train and Connect American Workers to Good Jobs Created by the President’s Investing in America?Agenda Home Briefing Room Statements and Releases Today, President Biden will announce four new Workforce Hubs to ensure all Americans can access the good jobs created by the President’s Investing in America agenda, which includes the American Rescue Plan, the Bipartisan Infrastructure Law, the CHIPS and Science Act, and the Inflation Reduction Act. 
President Biden will make the announcement during his visit to Syracuse, New York, to highlight a CHIPS and Science Act preliminary agreement with Micron to dramatically expand semiconductor manufacturing in the United States. The Upstate New York region will be one of the four new Workforce Hubs, in addition to Philadelphia, Pennsylvania, Milwaukee, Wisconsin, and the state of Michigan. Since the beginning of the Biden-Harris Administration, private companies have announced over $825 billion in manufacturing and clean energy investments, on top of $478 billion already announced by the Administration for clean energy and infrastructure projects funded by the Bipartisan Infrastructure Law and Inflation Reduction Act. These investments are projected to create hundreds of thousands of good jobs—many of which do not require a college degree. The Biden-Harris Administration is committed to ensuring that all workers—including women, people of color, veterans, and those that have been historically left behind–have equitable access to those job opportunities and the training and skills needed to fill them. ? Today’s announcement also builds on the inaugural five Investing in America Workforce Hubs in Columbus, Baltimore, Pittsburgh, Augusta, and Phoenix that First Lady Jill Biden announced last May. Over the last year, the inaugural Hubs have generated dozens of significant commitments to create pipelines to good jobs, including an initiative to train 10,000 skilled construction workers in Columbus, Ohio, the first-ever registered apprenticeship program in semiconductor manufacturing at TSMC in Phoenix, and project labor agreements on $9 billion worth of infrastructure projects across Maryland. 
In each of the four new Hubs, the Administration will expand the successful models developed in the first round of Workforce Hubs and will continue to collaborate with state and local elected officials and community leaders to drive effective place-based workforce development efforts that are essential to the President’s vision of building an economy from the bottom up and the middle out. ? The next four Investing in America Workforce Hubs are:?? Upstate New York:?Upstate New York has emerged as a growing hub for semiconductor manufacturing, with record-breaking investments throughout the region. To date, companies have announced hundreds of billions of dollars in private-sector investments to regain American leadership in chips manufacturing since President Biden signed his CHIPS and Science Act. And today, President Biden is announcing a $6.1 billion preliminary agreement of terms with Micron to invest in semiconductor manufacturing in New York and Idaho, which will create over 70,000 jobs. The Department of Commerce, with support from the Departments of Education and Labor, will stand up a Workforce Hub to help meet the training needs of this nascent industry and related investments in the region by fostering collaborations with partners such as labor unions, employers, and education and training providers. Michigan: The state of Michigan has long been the engine of the American auto industry — and the good-paying union jobs that built the American middle class. As the country accelerates into an electric vehicle (EV) future, President Biden is committed to ensuring that the workers, unions, and businesses that have historically powered the auto industry lead the next generation of clean vehicles. President Biden strongly believes that auto companies transitioning to new technology should retool, reboot, and rehire in the same factories and in the same communities with comparable wages. 
Building on significant efforts underway – including President Biden’s $15.5 billion investment in the retooling of existing auto plants and rehiring of existing workers for the EV transition –? the Department of Energy and Department of Labor will partner with the State of Michigan to launch an Electric Vehicle Workforce Hub. Milwaukee: Last December, the City of Milwaukee announced that—thanks to funding from President Biden’s Bipartisan Infrastructure Law and in response to proposed rulemaking from the Environmental Protection Agency (EPA)— the City would reduce its timeline for replacing 100% of its lead pipes from 60 years to the 10 years outlined in the proposed rule. This announcement aligns with President Biden’s broader goal to remove all lead pipes across the nation within a decade. The EPA, with support from the Department of Transportation (DOT), will stand up a Workforce Hub to ensure the city has the skilled workers needed to accomplish this ambitious lead pipes replacement project and invest in clean water infrastructure in Milwaukee. Philadelphia:?The City of Philadelphia has received billions of dollars in funding for public infrastructure—including clean water infrastructure and improved roadway safety. DOT and EPA will co-lead this Hub to ensure the city has strong workforce pipelines for all residents to access good jobs replacing lead pipes and investing in construction and infrastructure. These new Workforce Hubs will align with the?Roadmap?to Support Good Jobs, the Biden-Harris Administration’s comprehensive approach to ensure that every American—whether they go to college or not—has equitable access to high-quality training, education, and services that provide a path to a good career without leaving their community. A new analysis released today from the Council of Economic Advisors outlines the economics behind the Administration’s workforce strategy and underscores how it has led to record-breaking job growth. 
Progress to Date The Investing in America Workforce Hubs build on the Biden-Harris Administration’s existing whole-of-government effort to advance high-quality workforce development, including: Building new pipelines to connect Americans to good jobs The Administration has invested more than $440 million since the President took office? to expand Registered Apprenticeships and pre-apprenticeships, supporting the education and training needs of more than 1 million apprentices. President Biden signed a Registered Apprenticeship Executive Order to bolster apprenticeships in the federal workforce. The Department of Education launched the first-ever Career-Connected High School grants program, supporting 19 districts and states reimagining the high school experience to better connect to career pathways. The Department of Labor has provided $200 million in Strengthening Community College grants since 2021, supporting quality workforce programs around the country. The Department of Labor released the High Road Training Program Map to spotlight high-quality training programs and show where they are located relative to projects mobilized by the Investing in America agenda. In January, the White House?announced?new commitments to its Advanced Manufacturing Sprint, including 150 new advanced manufacturing-related Registered Apprenticeship programs and occupations have been created or are newly under development, and more than 4,700 new apprentices hired in advanced manufacturing occupations. Making place-based workforce investments so every community can meet its foundational labor needs In addition to the nine Investing in America Workforce Hubs that are training residents for growing industries like clean energy and manufacturing, the Biden-Harris Administration has: Announced the designation of 31 communities across the country as Regional Innovation and Technology Hubs (Tech Hubs). Announced the 22 finalists of the Distressed Area Recompete Pilot Program. 
Recompete will invest $200 million in economic and workforce development projects that connect workers to good jobs in geographically diverse and persistently distressed communities across the country.? Stood up the National Semiconductor Training Center, which will deploy $5 billion in semiconductor-related research, development, and workforce needs to deliver on the CHIPS and Science Act. Invested tens of billions of dollars from the American Rescue Plan in workforce development strategies. Through the State and Local Fiscal Recovery Fund, which provided funding to every single local government across the country, more than 2,000 state and local governments have invested over $13 billion in workforce development and worker supports projects. Funded 32 coalitions across the country through the American Rescue Plan’s $500 million Good Jobs Challenge. As of December 2023, over 11,000 participants have entered training programs as a direct result of the program and thousands of workers have secured good, quality jobs in high-demand industries like construction, manufacturing, clean tech, forestry, and healthcare. Boosting job quality to support recruitment and retention For the first time in nearly 40 years, the Department of Labor updated?its Davis-Bacon regulations to modernize and strengthen prevailing wage rates for workers on federally funded construction projects, which will raise wages for 1 million construction workers over time. The National Labor Relations Board issued a?decision?announcing a new framework for union representation proceedings—where if an employer commits any unfair labor practices during a representation election, the Board will order the employer to recognize and bargain with the union, rather than re-running the election. The Department of Energy is requiring grant applicants to submit Community Benefits Plans to access Investing in America funding. 
Nearly all of the significant construction programs contained in President Biden’s Bipartisan Infrastructure Law, CHIPS and Science Act, and Inflation Reduction Act require or strongly incentivize the use of Davis-Bacon prevailing wages. The Inflation Reduction Act offers incentives that increase the value of clean energy tax credits by five times if employers pay prevailing wages and employ registered apprentices. The Department of Commerce required major CHIPS and Science Act awardees provide high-quality child care to their employees. The American Rescue Plan provided $24 billion to help child care providers keep their doors open – including over $2 billion for higher pay, hiring or retention bonuses, or other expanded benefits for care workers. Recent analysis shows that this funding led to an increase in the labor force participation rate of mothers with young children of about 3 percentage points relative to similar groups. ?### Next Post: Joint Statement from the Leaders of the United States, Argentina, Austria, Brazil, Bulgaria, Canada, Colombia, Denmark, France, Germany, Hungary, Poland, Portugal, Romania, Serbia, Spain, Thailand, and the United Kingdom Calling for the Release of the Hostages Held in Gaza Joint Statement from the Leaders of the United?States, Argentina, Austria, Brazil, Bulgaria, Canada, Colombia, Denmark, France, Germany, Hungary, Poland, Portugal, Romania, Serbia, Spain, Thailand, and the United Kingdom Calling for the Release of the Hostages Held in?Gaza April 25, 2024 ? Statements and Releases Next Post Stay Connected Sign Up Email Address* Required ZIP Code Please leave blank. We'll be in touch with the latest information on how President Biden and his administration are working for the American people, as well as ways you can get involved and help our country build back better. Opt in to send and receive text messages from President Biden. 
Home The Administration Executive Offices Priorities The Record Briefing Room The White House Disclosures Get Involved Espa?ol Contact Us Privacy Policy Copyright Policy Accessibility Statement InstagramOpens in a new window FacebookOpens in a new window XOpens in a new window YouTubeOpens in a new window The White House 1600 Pennsylvania Ave NW Washington, DC 20500 WH.gov";
+ List parts = splitText(content, 4800);
+ List sentences = joinSentences(parts, 4800);
+ for (String sentence : sentences) {
+ System.out.println(sentence);
+ }
+ }
+}
diff --git a/src/main/java/com/bfd/crawl_translate/utils/Config.java b/src/main/java/com/bfd/crawl_translate/utils/Config.java
new file mode 100644
index 0000000..788bd55
--- /dev/null
+++ b/src/main/java/com/bfd/crawl_translate/utils/Config.java
@@ -0,0 +1,28 @@
+package com.bfd.crawl_translate.utils;
+
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.LinkedBlockingQueue;
+
+/**
+ * @author guowei
+ */
+@Component
+public class Config {
+ public static String access_token;
+
+ public static LinkedBlockingQueue