commit 41e92250200d430ac93259e08440d46bb33a1149 Author: 55007 <55007@maojian> Date: Tue Jan 7 16:14:56 2025 +0800 文字翻译应用 diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..8d95b91 --- /dev/null +++ b/.classpath @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.project b/.project new file mode 100644 index 0000000..9190480 --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + analyst_translate + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..cf6931b --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,4 @@ +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding//src/main/resources=UTF-8 +encoding/=UTF-8 diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..0ada971 --- /dev/null +++ b/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,9 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.methodParameters=generate +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 +org.eclipse.jdt.core.compiler.compliance=1.8 +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.8 diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000..14b697b --- /dev/null +++ b/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..1380d0d --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +bfd翻译应用 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..4ffd93f --- /dev/null +++ b/pom.xml @@ -0,0 +1,198 @@ + + + 4.0.0 + + org.springframework.boot + spring-boot-starter-parent + 2.2.4.RELEASE + + + com.bfd + crawl_translate + 0.0.1-SNAPSHOT + crawl_translate + crawl_translate + + 1.8 + + + + org.springframework.boot + spring-boot-starter-web + + + + mysql + mysql-connector-java + runtime + + + org.projectlombok + lombok + true + + + org.springframework.boot + spring-boot-starter-test + test + + + org.elasticsearch + elasticsearch + 6.0.0 + + + org.elasticsearch.client + elasticsearch-rest-client + 6.0.0 + + + org.elasticsearch.client + elasticsearch-rest-high-level-client + 6.0.0 + + + org.elasticsearch.client + elasticsearch-rest-client + + + + + com.squareup.okhttp3 + okhttp + 3.11.0 + + + com.alibaba + fastjson + 2.0.12 + + + cn.hutool + hutool-all + 5.8.9 + + + org.apache.kafka + kafka-clients + 2.7.1 + + + org.apache.poi + poi-ooxml + 5.2.2 + + + + de.codecentric + spring-boot-admin-client + 2.2.4 + + + + org.redisson + redisson-spring-boot-starter + 3.13.6 + + + org.springframework.boot + spring-boot-starter-data-redis + + + org.apache.curator + curator-framework + 5.2.0 + + + org.apache.curator + curator-recipes + 5.2.0 + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + + *.properties + *.yml + *.yaml + + + + + com.bfd.crawl_translate.CrawlTranslateApplication + + true + + lib/ + + false + + + + lib/pauseTool-1.0.jar config/ + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy + package + + copy-dependencies + + + ${project.build.directory}/lib/ + + + + + + + maven-resources-plugin + + + copy-resources + package + + copy-resources + + + + + + src/main/resources/ + + *.properties + *.yml + *.yaml + + + + ${project.build.directory}/config + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + diff --git a/src/main/java/com/bfd/crawl_translate/CrawlTranslateApplication.java b/src/main/java/com/bfd/crawl_translate/CrawlTranslateApplication.java new file mode 100644 index 0000000..b781f4c --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/CrawlTranslateApplication.java @@ -0,0 +1,69 @@ +package com.bfd.crawl_translate; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.context.ConfigurableApplicationContext; +import org.springframework.data.redis.core.StringRedisTemplate; +import org.springframework.scheduling.annotation.EnableScheduling; +import org.springframework.scheduling.annotation.Scheduled; + +import com.bfd.crawl_translate.service.TranslateChatGptService; +import com.bfd.crawl_translate.utils.Config; +import com.bfd.crawl_translate.utils.HttpUtil; +import com.bfd.crawl_translate.utils.PauseTool; + +import cn.hutool.core.thread.ThreadFactoryBuilder; +import lombok.extern.slf4j.Slf4j; + +@SpringBootApplication +@Slf4j +@EnableScheduling +public class CrawlTranslateApplication { + + @Autowired + TranslateChatGptService translateChatGptService; + @Autowired + private StringRedisTemplate stringRedisTemplate; + + @Value("${zookeeper.connection-string}") + private String connectionString; + @Value("${zookeeper.publish-node}") + private String nodePath; + + + public static void main(String[] args) { + + ConfigurableApplicationContext applicationContext = SpringApplication.run(CrawlTranslateApplication.class, args); + applicationContext.getBean(CrawlTranslateApplication.class).start(); + } + public void start(){ + log.info("----------CrawlTranslateApplication start success----------"); + HttpUtil.getToken(); + //定义线程池 + ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNamePrefix("crawl-pool-%d").build(); + ExecutorService singleThreadPool = new ThreadPoolExecutor(10, 20, 100L, TimeUnit.SECONDS, new LinkedBlockingQueue(1024), namedThreadFactory, new ThreadPoolExecutor.AbortPolicy()); + + + singleThreadPool.execute(translateChatGptService); + + PauseTool pauseTool = new PauseTool(); + pauseTool.initializeRedisCache(stringRedisTemplate); + pauseTool.setupZookeeperListener(connectionString, nodePath); + + log.info("----------CrawlTranslateApplication stop success----------"); + } + + @Scheduled(cron = "${crawl.cron.size_cron}") + public void getQueueSize(){ + log.info("--------->taskQueue length == "+ Config.chatGptTranslateQueue.size()); + } + +} diff --git a/src/main/java/com/bfd/crawl_translate/controller/ApiController.java b/src/main/java/com/bfd/crawl_translate/controller/ApiController.java new file mode 100644 index 0000000..31aec3e --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/controller/ApiController.java @@ -0,0 +1,32 @@ +package com.bfd.crawl_translate.controller; + +import com.alibaba.fastjson.JSON; +import com.bfd.crawl_translate.utils.Config; +import lombok.extern.slf4j.Slf4j; +import org.springframework.web.bind.annotation.*; + +import java.util.Map; + +/** + * @author guowei + */ +@Slf4j +@RestController +@RequestMapping(value = "/chatGpt") +@CrossOrigin(origins = "*", maxAge = 3600) +public class ApiController { + @RequestMapping(value = "/translate", method = RequestMethod.POST, produces = "application/json") + @ResponseBody + + public String getchannelitems(@RequestBody String RequestStr) { + System.out.println("收到gpt翻译请求:"+RequestStr); + log.info("收到gpt翻译请求"); +// Map parse = (Map) JSON.parse(RequestStr); + try { + Config.chatGptTranslateQueue.put(RequestStr); + } catch (InterruptedException e) { + log.error("推送队列失败",e); + } + return "TranslationAPi Successfully"; + } +} diff --git a/src/main/java/com/bfd/crawl_translate/service/MainHandler.java b/src/main/java/com/bfd/crawl_translate/service/MainHandler.java new file mode 100644 index 0000000..c41559f --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/service/MainHandler.java @@ -0,0 +1,105 @@ +package com.bfd.crawl_translate.service; + +import cn.hutool.core.io.FileUtil; +import cn.hutool.core.io.file.FileWriter; +import com.bfd.crawl_translate.utils.Config; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FileUtils; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.stereotype.Service; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.concurrent.LinkedBlockingQueue; + + +/** + * @author jian.mao + * @date 2023年11月3日 + * @description + */ +@Slf4j +@Service +public class MainHandler implements ApplicationRunner { + + @Value("${crawl.task.taskData}") + private String taskPath; + + @Override + public void run(ApplicationArguments args) throws Exception { + //停止处理 + waitDown(); + //启动加载缓存任务 + readTask(taskPath, Config.chatGptTranslateQueue); + } + + + + + + public static void readTask(String path, LinkedBlockingQueue queue){ + File file = new File(path); + if(file.exists()){ + List tasks = null; + try { + tasks = FileUtils.readLines(file,"UTF-8"); + } catch (IOException e) { + e.printStackTrace(); + } + for (String taskStr : tasks) { +// Map task = JSONObject.parseObject(taskStr); + try { + System.out.println("读到缓存数据:"+taskStr); + queue.put(taskStr); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + file.delete(); + } + } + /** + * 结束触发钩子 + */ + public void waitDown() { + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { + // 停止线程 + Config.isStart = false; + log.info("stop-------"); + writeTsskToFile(); + } + }); + } + + + /** + * 任务持久化到硬盘 + */ + public void writeTsskToFile(){ + System.out.println(taskPath); + File file = new File(taskPath); + FileWriter fileWriter = new FileWriter(file); + if (!file.exists()){ + fileWriter = FileWriter.create(file); + } + while(true){ + if(Config.chatGptTranslateQueue.size() > 0 ){ + try { + String task = Config.chatGptTranslateQueue.take(); + System.out.println("写入缓存数据:"+task); + fileWriter.write(task+"\r\n",true); + } catch (InterruptedException e) { + e.printStackTrace(); + } + }else{ + log.info("taskQueue write is file end"); + break; + } + } + } +} diff --git a/src/main/java/com/bfd/crawl_translate/service/TranslateChatGptService.java b/src/main/java/com/bfd/crawl_translate/service/TranslateChatGptService.java new file mode 100644 index 0000000..48e5670 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/service/TranslateChatGptService.java @@ -0,0 +1,296 @@ +package com.bfd.crawl_translate.service; + +import cn.hutool.core.util.IdUtil; +import com.alibaba.fastjson2.JSON; +import com.alibaba.fastjson2.JSONObject; +import com.alibaba.fastjson2.JSONPath; +import com.bfd.crawl_translate.utils.*; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +import java.text.BreakIterator; +import java.util.*; + +/** + * @author guowei + */ +@Service +@Slf4j +public class TranslateChatGptService implements Runnable { + @SneakyThrows + @Override + public void run() { + while (Config.isStart) { + Map resultMap = new HashMap<>(32); + resultMap.put("isLast",1); + Map results = new HashMap<>(); + String content = ""; + resultMap.put("source",content); + JSONObject takeJson = new JSONObject(); + if (Config.chatGptTranslateQueue.size() == 0) { + log.info("no task chatgpt translate task"); + Thread.sleep(1000 * 10); + } else { + try { + String take = Config.chatGptTranslateQueue.take(); + takeJson = JSONObject.parseObject(take); + + Integer scense_id = (Integer) takeJson.get(Constants.SCENES_ID); + Integer version = (Integer) takeJson.get(Constants.VERSION); + if (!PauseTool.CACHE.containsKey(scense_id + Constants.UNDERLINE + version)) { + log.info("暂停任务:{}", JSONObject.toJSONString(takeJson)); + continue; + } + JSONObject input = takeJson.getJSONObject("input"); + String fromLanguage = input.getString("fromLanguage"); + String toLanguage = input.getString("toLanguage"); + JSONObject output = takeJson.getJSONObject("output"); + if (output.containsKey("id")) { + resultMap.put("id", IdUtil.randomUUID()); + } + if (!output.containsKey("content")) { + throw new ContentException("output缺少content"); + } + //datasource样例:3_OCR识别内容:$['content']#json#$['attrbuite'] + String datasource = input.getString("datasource"); + System.out.println("datasourcec:" + datasource); + String[] split = datasource.split(":"); + if (split.length == 0) { + log.error("datasource为空"); + throw new NullPointerException(); + } + String key = split[0]; + JSONObject data = takeJson.getJSONObject("data"); + String value = data.getString(key); + if (value.isEmpty()) { + log.error("内容为空"); + throw new NullPointerException(); + } + System.out.println("value:" + value); + if (split.length > 1 && !split[1].isEmpty()) { + if (split[1].contains("#json#")) { + String[] splitContent = split[1].split("#json#"); + System.out.println(splitContent[0] + ":" + splitContent[1]); + JSONObject jsonObject = JSON.parseObject(value); + JSONObject jsonObject1 = (JSONObject) JSONPath.eval(jsonObject, splitContent[0]); + content = (String) JSONPath.eval(jsonObject1, splitContent[1]); + } else { + JSONObject jsonObject = JSON.parseObject(value); + content = (String) JSONPath.eval(jsonObject, split[1]); + } + + } else { + content = value; + } + resultMap.put("source",content); + +// log.info("content before translate:" + content); + String translateContent = ""; + //翻译 + translateContent = translate(fromLanguage, toLanguage, content); + if (translateContent.equals("翻译失败")){ + throw new Exception(); + } + + resultMap.put("content", translateContent); + results.put("results", JSON.toJSONString(resultMap)); + results.put("status",1); + results.put("message","成功"); + takeJson.put("result", results); + System.out.println("处理后:" + JSON.toJSONString(takeJson)); + } catch (NullPointerException nullPointerException) { + log.error("关键字段为空", nullPointerException); + resultMap.put("content", "关键字段为空"); + results.put("results", JSON.toJSONString(resultMap)); + results.put("status",2); + results.put("message","关键字段为空"); + takeJson.put("result", results); + System.out.println("处理后:" + JSON.toJSONString(takeJson)); + } catch (ContentException contentException) { + System.out.println("没有content!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); + resultMap.put("content", "关键字段为空"); + results.put("results", JSON.toJSONString(resultMap)); + results.put("status",2); + results.put("message","内容为空"); + takeJson.put("result", results); + System.out.println("处理后:" + JSON.toJSONString(takeJson)); + } catch (Throwable e) { + log.error("异常", e); + resultMap.put("content", "翻译失败"); + results.put("results", JSON.toJSONString(resultMap)); + results.put("status",2); + results.put("message","失败"); + } finally { + takeJson.put("result", results); + System.out.println("处理后:" + JSON.toJSONString(takeJson)); +// System.out.println(JSON.toJSONString(takeJson)); + KfkUtil.sendKafka(JSON.toJSONString(takeJson)); + } + log.info("处理完成"); + } + } + } + + public String translate(String fromLanguage, String toLanguage, String content) throws Throwable { + String translateContent = ""; +// if (content.length() > 0) { +// if (content.length() > 5000) { +// log.info("content长度大于5000,根据句号分割"); +// String[] split_content = content.split("\\. "); +// for (String text : split_content) { +// if (text.isEmpty()) { +// continue; +// } +// if (text.length() > 5000) { +// log.info("拆分content后单句长度大于5000,根据句号分割"); +// String[] split_text = text.split("\\."); +// for (String text2 : split_text) { +// if (text2.isEmpty()) { +// continue; +// } +// Map result_content = HttpUtil.getText(fromLanguage, toLanguage, text2); +// String tran_text = ""; +// if ((boolean) result_content.get("isSuccess") == true) { +// tran_text = (String) result_content.get("result"); +// translateContent += tran_text + "。"; +// } else { +// log.error("content翻译失败,content:" + text2); +// translateContent = "翻译失败"; +// break; +// } +// } +// } else { +// Map result_content = HttpUtil.getText(fromLanguage, toLanguage, text); +// String tran_text = ""; +// if ((boolean) result_content.get("isSuccess") == true) { +// tran_text = (String) result_content.get("result"); +// translateContent += tran_text + "。"; +// } else { +// log.error("content翻译失败,content:" + text); +// translateContent = "翻译失败"; +// break; +// } +// } +// } +// } + if (!content.isEmpty()) { + List parts = splitText(content, 4800); + List sentences = joinSentences(parts, 4800); + for (String sentence : sentences) { + String result = parse(fromLanguage, toLanguage, sentence); + if (result.isEmpty()) { + log.error("content翻译失败,sentence:" + sentence); + translateContent = "翻译失败"; + break; + } else { + translateContent += result; + } + } + if(sentences.size()==0){ + log.info("句子为空"); + translateContent = "翻译失败"; + } + } else { + String result = parse(fromLanguage, toLanguage, content); + if (result.isEmpty()) { + log.error("content翻译失败,sentence:" + content); + translateContent = "翻译失败"; + } else { + translateContent = result; + } + + } + return translateContent; + } + + public static String parse(String fromLanguage,String toLanguage,String content) { + String toText = ""; + if (!content.isEmpty()) { + if (fromLanguage.contains("auto") && toLanguage.equals("zh")) { + String language = HttpUtil.getLanguage2(content); + if (language.equals("zh")){ + log.info("检测到是自动语言-->中文,并且原文检测为中文,使用繁体中文模型"); + fromLanguage = "zh-tw"; + } + } + Map translateResult = HttpUtil.getText(fromLanguage, toLanguage, content); + if ((boolean) translateResult.get("isSuccess") == true) { + toText = (String) translateResult.get("result"); + } else { + log.error("翻译失败,text:{},json:{}", content,JSON.toJSONString(translateResult)); + } + } + return toText; + } + + // 将文本分割成小于指定长度的片段(确保不会在句子中间分割) + public static List splitText(String text, int maxLength) { + List parts = new ArrayList<>(); + BreakIterator boundary = BreakIterator.getSentenceInstance(); + boundary.setText(text); + int start = boundary.first(); + for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { + String sentence = text.substring(start, end); + if (sentence.length() > maxLength) { + List subParts = splitLongSentence(sentence, maxLength); + parts.addAll(subParts); + } else { + parts.add(sentence); + } + } + return parts; + } + + // 将超过最大长度的句子分割成小片段 + private static List splitLongSentence(String sentence, int maxLength) { + List subParts = new ArrayList<>(); + for (int i = 0; i < sentence.length(); i += maxLength) { + subParts.add(sentence.substring(i, Math.min(sentence.length(), i + maxLength))); + } + return subParts; + } + + /** + * 将多个短句拼接成不超过maxLength的段落 + * + * @param sentences + * @param maxLength + * @return + */ + public static List joinSentences(List sentences, int maxLength) { + List paragraphs = new ArrayList<>(); + StringBuilder currentParagraph = new StringBuilder(); + + for (String sentence : sentences) { + // 计算当前段落加上当前句子的长度 + int newLength = currentParagraph.length() + sentence.length(); + + if (newLength <= maxLength) { + // 如果加上当前句子后长度仍在限制范围内,则添加到当前段落 + if (currentParagraph.length() > 0) { + currentParagraph.append(" "); // 添加句子间的空格 + } + currentParagraph.append(sentence); + } else { + // 如果超出限制,则将当前段落加入列表,重置当前段落为当前句子 + paragraphs.add(currentParagraph.toString()); + currentParagraph = new StringBuilder(sentence); + } + } + // 添加最后一个段落 + if (currentParagraph.length() > 0) { + paragraphs.add(currentParagraph.toString()); + } + return paragraphs; + } + + public static void main(String[] args) { + String content = "Administration Show submenu for “Administration”” President Joe Biden Vice President Kamala Harris First Lady Dr. Jill Biden Second Gentleman Douglas Emhoff The Cabinet Executive Offices Show submenu for “Executive Offices”” Council of Economic Advisers Council on Environmental Quality Domestic Policy Council Gender Policy Council National Economic Council National Security Council National Space Council Office of Intergovernmental Affairs Office of Management and Budget Office of the National Cyber Director Office of National Drug Control Policy Office of Public Engagement Office of Science and Technology Policy Office of the United States Trade Representative Climate Policy Office Presidential Personnel Office Priorities Briefing Room The White House Show submenu for “The White House”” Presidents First Families The Grounds Our Government Get Involved Show submenu for “Get Involved”” Write or Call The White House Join Us White House Fellows White House Internship Program The Record Disclosures Espa?ol Contact Us Privacy Policy Copyright Policy Accessibility Statement InstagramOpens in a new window FacebookOpens in a new window XOpens in a new window YouTubeOpens in a new window The White House 1600 Pennsylvania Ave NW Washington, DC 20500 April 25, 2024 FACT SHEET: President?Biden Announces New Workforce Hubs to Train and Connect American Workers to Good Jobs Created by the President’s Investing in America?Agenda Home Briefing Room Statements and Releases Today, President Biden will announce four new Workforce Hubs to ensure all Americans can access the good jobs created by the President’s Investing in America agenda, which includes the American Rescue Plan, the Bipartisan Infrastructure Law, the CHIPS and Science Act, and the Inflation Reduction Act. President Biden will make the announcement during his visit to Syracuse, New York, to highlight a CHIPS and Science Act preliminary agreement with Micron to dramatically expand semiconductor manufacturing in the United States. The Upstate New York region will be one of the four new Workforce Hubs, in addition to Philadelphia, Pennsylvania, Milwaukee, Wisconsin, and the state of Michigan. Since the beginning of the Biden-Harris Administration, private companies have announced over $825 billion in manufacturing and clean energy investments, on top of $478 billion already announced by the Administration for clean energy and infrastructure projects funded by the Bipartisan Infrastructure Law and Inflation Reduction Act. These investments are projected to create hundreds of thousands of good jobs—many of which do not require a college degree. The Biden-Harris Administration is committed to ensuring that all workers—including women, people of color, veterans, and those that have been historically left behind–have equitable access to those job opportunities and the training and skills needed to fill them. ? Today’s announcement also builds on the inaugural five Investing in America Workforce Hubs in Columbus, Baltimore, Pittsburgh, Augusta, and Phoenix that First Lady Jill Biden announced last May. Over the last year, the inaugural Hubs have generated dozens of significant commitments to create pipelines to good jobs, including an initiative to train 10,000 skilled construction workers in Columbus, Ohio, the first-ever registered apprenticeship program in semiconductor manufacturing at TSMC in Phoenix, and project labor agreements on $9 billion worth of infrastructure projects across Maryland. In each of the four new Hubs, the Administration will expand the successful models developed in the first round of Workforce Hubs and will continue to collaborate with state and local elected officials and community leaders to drive effective place-based workforce development efforts that are essential to the President’s vision of building an economy from the bottom up and the middle out. ? The next four Investing in America Workforce Hubs are:?? Upstate New York:?Upstate New York has emerged as a growing hub for semiconductor manufacturing, with record-breaking investments throughout the region. To date, companies have announced hundreds of billions of dollars in private-sector investments to regain American leadership in chips manufacturing since President Biden signed his CHIPS and Science Act. And today, President Biden is announcing a $6.1 billion preliminary agreement of terms with Micron to invest in semiconductor manufacturing in New York and Idaho, which will create over 70,000 jobs. The Department of Commerce, with support from the Departments of Education and Labor, will stand up a Workforce Hub to help meet the training needs of this nascent industry and related investments in the region by fostering collaborations with partners such as labor unions, employers, and education and training providers. Michigan: The state of Michigan has long been the engine of the American auto industry — and the good-paying union jobs that built the American middle class. As the country accelerates into an electric vehicle (EV) future, President Biden is committed to ensuring that the workers, unions, and businesses that have historically powered the auto industry lead the next generation of clean vehicles. President Biden strongly believes that auto companies transitioning to new technology should retool, reboot, and rehire in the same factories and in the same communities with comparable wages. Building on significant efforts underway – including President Biden’s $15.5 billion investment in the retooling of existing auto plants and rehiring of existing workers for the EV transition –? the Department of Energy and Department of Labor will partner with the State of Michigan to launch an Electric Vehicle Workforce Hub. Milwaukee: Last December, the City of Milwaukee announced that—thanks to funding from President Biden’s Bipartisan Infrastructure Law and in response to proposed rulemaking from the Environmental Protection Agency (EPA)— the City would reduce its timeline for replacing 100% of its lead pipes from 60 years to the 10 years outlined in the proposed rule. This announcement aligns with President Biden’s broader goal to remove all lead pipes across the nation within a decade. The EPA, with support from the Department of Transportation (DOT), will stand up a Workforce Hub to ensure the city has the skilled workers needed to accomplish this ambitious lead pipes replacement project and invest in clean water infrastructure in Milwaukee. Philadelphia:?The City of Philadelphia has received billions of dollars in funding for public infrastructure—including clean water infrastructure and improved roadway safety. DOT and EPA will co-lead this Hub to ensure the city has strong workforce pipelines for all residents to access good jobs replacing lead pipes and investing in construction and infrastructure. These new Workforce Hubs will align with the?Roadmap?to Support Good Jobs, the Biden-Harris Administration’s comprehensive approach to ensure that every American—whether they go to college or not—has equitable access to high-quality training, education, and services that provide a path to a good career without leaving their community. A new analysis released today from the Council of Economic Advisors outlines the economics behind the Administration’s workforce strategy and underscores how it has led to record-breaking job growth. Progress to Date The Investing in America Workforce Hubs build on the Biden-Harris Administration’s existing whole-of-government effort to advance high-quality workforce development, including: Building new pipelines to connect Americans to good jobs The Administration has invested more than $440 million since the President took office? to expand Registered Apprenticeships and pre-apprenticeships, supporting the education and training needs of more than 1 million apprentices. President Biden signed a Registered Apprenticeship Executive Order to bolster apprenticeships in the federal workforce. The Department of Education launched the first-ever Career-Connected High School grants program, supporting 19 districts and states reimagining the high school experience to better connect to career pathways. The Department of Labor has provided $200 million in Strengthening Community College grants since 2021, supporting quality workforce programs around the country. The Department of Labor released the High Road Training Program Map to spotlight high-quality training programs and show where they are located relative to projects mobilized by the Investing in America agenda. In January, the White House?announced?new commitments to its Advanced Manufacturing Sprint, including 150 new advanced manufacturing-related Registered Apprenticeship programs and occupations have been created or are newly under development, and more than 4,700 new apprentices hired in advanced manufacturing occupations. Making place-based workforce investments so every community can meet its foundational labor needs In addition to the nine Investing in America Workforce Hubs that are training residents for growing industries like clean energy and manufacturing, the Biden-Harris Administration has: Announced the designation of 31 communities across the country as Regional Innovation and Technology Hubs (Tech Hubs). Announced the 22 finalists of the Distressed Area Recompete Pilot Program. Recompete will invest $200 million in economic and workforce development projects that connect workers to good jobs in geographically diverse and persistently distressed communities across the country.? Stood up the National Semiconductor Training Center, which will deploy $5 billion in semiconductor-related research, development, and workforce needs to deliver on the CHIPS and Science Act. Invested tens of billions of dollars from the American Rescue Plan in workforce development strategies. Through the State and Local Fiscal Recovery Fund, which provided funding to every single local government across the country, more than 2,000 state and local governments have invested over $13 billion in workforce development and worker supports projects. Funded 32 coalitions across the country through the American Rescue Plan’s $500 million Good Jobs Challenge. As of December 2023, over 11,000 participants have entered training programs as a direct result of the program and thousands of workers have secured good, quality jobs in high-demand industries like construction, manufacturing, clean tech, forestry, and healthcare. Boosting job quality to support recruitment and retention For the first time in nearly 40 years, the Department of Labor updated?its Davis-Bacon regulations to modernize and strengthen prevailing wage rates for workers on federally funded construction projects, which will raise wages for 1 million construction workers over time. The National Labor Relations Board issued a?decision?announcing a new framework for union representation proceedings—where if an employer commits any unfair labor practices during a representation election, the Board will order the employer to recognize and bargain with the union, rather than re-running the election. The Department of Energy is requiring grant applicants to submit Community Benefits Plans to access Investing in America funding. Nearly all of the significant construction programs contained in President Biden’s Bipartisan Infrastructure Law, CHIPS and Science Act, and Inflation Reduction Act require or strongly incentivize the use of Davis-Bacon prevailing wages. The Inflation Reduction Act offers incentives that increase the value of clean energy tax credits by five times if employers pay prevailing wages and employ registered apprentices. The Department of Commerce required major CHIPS and Science Act awardees provide high-quality child care to their employees. The American Rescue Plan provided $24 billion to help child care providers keep their doors open – including over $2 billion for higher pay, hiring or retention bonuses, or other expanded benefits for care workers. Recent analysis shows that this funding led to an increase in the labor force participation rate of mothers with young children of about 3 percentage points relative to similar groups. ?### Next Post: Joint Statement from the Leaders of the United States, Argentina, Austria, Brazil, Bulgaria, Canada, Colombia, Denmark, France, Germany, Hungary, Poland, Portugal, Romania, Serbia, Spain, Thailand, and the United Kingdom Calling for the Release of the Hostages Held in Gaza Joint Statement from the Leaders of the United?States, Argentina, Austria, Brazil, Bulgaria, Canada, Colombia, Denmark, France, Germany, Hungary, Poland, Portugal, Romania, Serbia, Spain, Thailand, and the United Kingdom Calling for the Release of the Hostages Held in?Gaza April 25, 2024 ? Statements and Releases Next Post Stay Connected Sign Up Email Address* Required ZIP Code Please leave blank. We'll be in touch with the latest information on how President Biden and his administration are working for the American people, as well as ways you can get involved and help our country build back better. Opt in to send and receive text messages from President Biden. Home The Administration Executive Offices Priorities The Record Briefing Room The White House Disclosures Get Involved Espa?ol Contact Us Privacy Policy Copyright Policy Accessibility Statement InstagramOpens in a new window FacebookOpens in a new window XOpens in a new window YouTubeOpens in a new window The White House 1600 Pennsylvania Ave NW Washington, DC 20500 WH.gov"; + List parts = splitText(content, 4800); + List sentences = joinSentences(parts, 4800); + for (String sentence : sentences) { + System.out.println(sentence); + } + } +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/Config.java b/src/main/java/com/bfd/crawl_translate/utils/Config.java new file mode 100644 index 0000000..788bd55 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/Config.java @@ -0,0 +1,28 @@ +package com.bfd.crawl_translate.utils; + +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.LinkedBlockingQueue; + +/** + * @author guowei + */ +@Component +public class Config { + public static String access_token; + + public static LinkedBlockingQueue taskQueue = new LinkedBlockingQueue(); + + + /** + * chatGpt 翻译 + */ + public static LinkedBlockingQueue chatGptTranslateQueue = new LinkedBlockingQueue(); + + public static Boolean isStart = true; + + public static Map stopCache = new HashMap<>(); +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/Constants.java b/src/main/java/com/bfd/crawl_translate/utils/Constants.java new file mode 100644 index 0000000..bd719e5 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/Constants.java @@ -0,0 +1,19 @@ +package com.bfd.crawl_translate.utils; + +import org.springframework.stereotype.Component; + +/** + * @author guowei + */ +@Component +public class Constants { + + public final static String STOP = "stop"; + + public final static String SCENES_ID = "scenes_id"; + + public final static String VERSION = "version"; + + public final static String UNDERLINE = "_"; + +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/ContentException.java b/src/main/java/com/bfd/crawl_translate/utils/ContentException.java new file mode 100644 index 0000000..e447fe9 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/ContentException.java @@ -0,0 +1,13 @@ +package com.bfd.crawl_translate.utils; + +/** + * @author guowei + */ +public class ContentException extends Exception{ + public ContentException(){ + + } + public ContentException(String message){ + super(message); + } +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/ESClientFactory.java b/src/main/java/com/bfd/crawl_translate/utils/ESClientFactory.java new file mode 100644 index 0000000..bb519e6 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/ESClientFactory.java @@ -0,0 +1,57 @@ +package com.bfd.crawl_translate.utils; + +import org.apache.http.HttpHost; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.impl.nio.client.HttpAsyncClientBuilder; +import org.elasticsearch.client.RestClient; +import org.elasticsearch.client.RestClientBuilder; +import org.elasticsearch.client.RestClientBuilder.HttpClientConfigCallback; +import org.elasticsearch.client.RestHighLevelClient; + +public class ESClientFactory { + private static final int CONNECT_TIME_OUT = 1000; + private static final int SOCKET_TIME_OUT = 30000; + private static final int CONNECTION_REQUEST_TIME_OUT = 500; + private static final int MAX_CONNECT_NUM = 100; + private static final int MAX_CONNECT_PER_ROUTE = 100; + private static boolean uniqueConnectTimeConfig = false; + private static boolean uniqueConnectNumConfig = true; + + public static RestHighLevelClient init(){ + RestClientBuilder builder = RestClient.builder(new HttpHost("172.18.1.81",9201,"http")); + if(uniqueConnectTimeConfig){ + setConnectTimeOutConfig(builder); + } + if(uniqueConnectNumConfig){ + setMutiConnectConfig(builder); + } + RestHighLevelClient restHighLevelClient = new RestHighLevelClient(builder); + return restHighLevelClient; + } + + // 主要关于异步httpclient的连接延时配置 + + public static void setConnectTimeOutConfig(RestClientBuilder builder){ + builder.setRequestConfigCallback(new RestClientBuilder.RequestConfigCallback() { + @Override + public RequestConfig.Builder customizeRequestConfig(RequestConfig.Builder requestConfigBuilder) { + requestConfigBuilder.setConnectTimeout(CONNECT_TIME_OUT); + requestConfigBuilder.setSocketTimeout(SOCKET_TIME_OUT); + requestConfigBuilder.setConnectionRequestTimeout(CONNECTION_REQUEST_TIME_OUT); + return requestConfigBuilder; + } + }); + } + + // 主要关于异步httpclient的连接数配置 + public static void setMutiConnectConfig(RestClientBuilder builder){ + builder.setHttpClientConfigCallback(new HttpClientConfigCallback() { + @Override + public HttpAsyncClientBuilder customizeHttpClient(HttpAsyncClientBuilder httpClientBuilder) { + httpClientBuilder.setMaxConnTotal(MAX_CONNECT_NUM); + httpClientBuilder.setMaxConnPerRoute(MAX_CONNECT_PER_ROUTE); + return httpClientBuilder; + } + }); + } +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/HttpUtil.java b/src/main/java/com/bfd/crawl_translate/utils/HttpUtil.java new file mode 100644 index 0000000..a945912 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/HttpUtil.java @@ -0,0 +1,274 @@ +package com.bfd.crawl_translate.utils; + +import com.alibaba.fastjson2.JSON; +import lombok.extern.slf4j.Slf4j; +import okhttp3.*; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.StringSerializer; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Component; + +import java.io.IOException; +import java.net.ConnectException; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.TimeUnit; + +/** + * @author guowei + */ +@Slf4j +@Component +public class HttpUtil { + + @Scheduled(cron = "${crawl.cron.token_cron}") + public static String getToken() { + OkHttpClient client = new OkHttpClient().newBuilder() + .readTimeout(6000,TimeUnit.SECONDS) + .connectTimeout(6000,TimeUnit.SECONDS) + .writeTimeout(6000,TimeUnit.SECONDS) + .readTimeout(6000,TimeUnit.SECONDS) + .build(); + MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded"); + RequestBody body = RequestBody.create(mediaType, "username=collectionSystem&password=1ade58c775ddc6203f444c60f9af680e&grant_type=password"); + Request request = new Request.Builder() + .url("https://fanyi.percent.cn/api/SystemManager/oauth/token") + .method("POST", body) + .addHeader("Content-Type", "application/x-www-form-urlencoded") + .build(); + try { + Response response = client.newCall(request).execute(); + Map json = (Map) JSON.parse(response.body().string()); + if ("200".equals(json.get("code"))) { + log.info("获取token成功,当前时间:{},token:{}",System.currentTimeMillis(),json.get("access_token")); + Config.access_token = (String) json.get("access_token"); + } else { + log.error("获取token失败,json:" + json); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + return ""; + } + + public static Map getText(String fromLanguage,String toLanguage,String text) { + StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); + String methodName = stackTrace[2].getMethodName(); + String className = stackTrace[2].getClassName(); +// System.out.println("谁调用了我:"+className+","+methodName); + String result = ""; + Map resultMap = new HashMap<>(); + try { + String encode = URLEncoder.encode(text, "UTF-8"); + OkHttpClient client = new OkHttpClient().newBuilder() + .connectTimeout(30000, TimeUnit.SECONDS) + .readTimeout(30000, TimeUnit.SECONDS) + .writeTimeout(30000, TimeUnit.SECONDS) + .build(); + MediaType mediaType = MediaType.parse("application/json"); + RequestBody body = RequestBody.create(mediaType, "{\"fromLanguage\":\""+fromLanguage+"\",\"toLanguage\":\""+toLanguage+"\",\"text\":\"" + encode + "\"}"); + Request request = new Request.Builder() + .url("https://fanyi.percent.cn/api/dt/tran/text") + .method("POST", body) + .addHeader("Authorization", "Bearer " + Config.access_token) + .addHeader("Content-Type", "application/json") +// .addHeader("Cookie", "dt_saas_token="+Config.access_token+"; JSESSIONID=951aa17c-9d1d-4991-a327-d5d2dc2de860") + .build(); + Response response = null; + for (int i = 0; i < 10; i++) { +// System.out.println(i); + try { + response = client.newCall(request).execute(); + if (response.isSuccessful()) { +// break; + Map json = (Map) JSON.parse(response.body().string()); + int code = (int) json.get("code"); + String message = (String) json.get("message"); + if (code == 200 && message == null) { + Map data = (Map) json.get("data"); + result = URLDecoder.decode((String) data.get("value"), "UTF-8"); + resultMap.put("isSuccess", true); + resultMap.put("result", result); + break; + } else if (code == 500 && message.contains("翻译超时")){ + continue; + } else { + log.error("文本翻译失败,json:" + json); + resultMap.put("isSuccess", false); + resultMap.put("result", result); + break; + } + } + }catch (ConnectException connectException){ + connectException.printStackTrace(); + log.error("内容翻译失败"+i+"次",connectException); + } + } + + } catch (Throwable e) { +// e. + log.error("实时文本接口翻译失败,text:"+text+",e:"+e); + } + return resultMap; + } + + public static Boolean getLanguage(String text) { + String result = ""; + Boolean isZh = false; + try { +// String encode = URLEncoder.encode(text, "UTF-8"); + OkHttpClient client = new OkHttpClient().newBuilder() + .connectTimeout(300, TimeUnit.SECONDS) + .readTimeout(60, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS) + .build(); + MediaType mediaType = MediaType.parse("application/json"); + RequestBody body = RequestBody.create(mediaType, "{\"text\":\"" + text + "\"}"); + Request request = new Request.Builder() + .url("https://fanyi.percent.cn/api/dt/api/language/distinguish") + .method("POST", body) + .addHeader("Authorization", "Bearer " + Config.access_token) + .addHeader("Content-Type", "application/json") +// .addHeader("Cookie", "dt_saas_token=d3c60514c65285836dc316b6d48d468e; JSESSIONID=2726ac68-ad92-470f-b1d5-c1c0255f1cb9") + .build(); + Response response = null; + for (int i = 0; i < 10; i++) { +// System.out.println(i); + try { + response = client.newCall(request).execute(); + if (response.isSuccessful()) { + break; + } + }catch (ConnectException connectException){ + connectException.printStackTrace(); +// continue; + } + } + Map json = (Map) JSON.parse(response.body().string()); + if ((int) json.get("code") == 200) { + Map data = (Map) json.get("data"); +// result = URLDecoder.decode((String) data.get("value"), "UTF-8"); + String language = (String) data.get("language"); + if (language.equals("zh")){ + isZh = true; + }else { + isZh = false; + } + } else { + log.error("语种检测失败,json:" + json); + + } +// System.out.println(response.code()); +// System.out.println(response.body().string()); + } catch (Throwable e) { +// e. + log.error("语种检测失败,text:"+text+",e:"+e); + } + return isZh; + } + + public static String getLanguage2(String text) { +// String result = ""; + String language = ""; + try { +// String encode = URLEncoder.encode(text, "UTF-8"); + OkHttpClient client = new OkHttpClient().newBuilder() + .connectTimeout(300, TimeUnit.SECONDS) + .readTimeout(60, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS) + .build(); + MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded"); + RequestBody body = RequestBody.create(mediaType, "content="+text); + Request request = new Request.Builder() + .url("http://distinguish.pontoaplus.com/translate/lang_detect") +// .url("http://172.18.1.155:19999/translate/lang_detect") + .method("POST", body) + .build(); + Response response = null; + for (int i = 0; i < 10; i++) { +// System.out.println(i); + try { + response = client.newCall(request).execute(); + if (response.isSuccessful()) { + break; + } + } catch (ConnectException connectException) { + connectException.printStackTrace(); +// continue; + } + } + Map json = (Map) JSON.parse(response.body().string()); + if ((int) json.get("code") == 200) { + Map data = (Map) json.get("data"); +// result = URLDecoder.decode((String) data.get("value"), "UTF-8"); + language = (String) data.get("lang"); + + } else { + log.error("语种检测失败,json:" + json); + + } +// System.out.println(response.code()); +// System.out.println(response.body().string()); + } catch (Throwable e) { +// e. + log.error("语种检测失败,text:" + text + ",e:" + e); + } + return language; + } + + private static KafkaProducer producer; + + static { + producer = getKafkaProdect("172.18.1.101:9092,172.18.1.102:9092,172.18.1.104:9092"); + } + + public static void sendKafka(String topic, String resultData) { + ProducerRecord se = new ProducerRecord(topic, resultData); +// KafkaProducer producer = getKafkaProdect(config.brokers); + producer.send(se); + } + + public static KafkaProducer getKafkaProdect(String brokerList) { + Properties props = new Properties(); + props.put("bootstrap.servers", brokerList);//xxx服务器ip +// props.put("bootstrap.servers", "172.18.1.114:9992");//xxx服务器ip + props.put("acks", "all");//所有follower都响应了才认为消息提交成功,即"committed" + props.put("retries", 3);//retries = MAX 无限重试,直到你意识到出现了问题:) + props.put("batch.size", 16384);//producer将试图批处理消息记录,以减少请求次数.默认的批量处理消息字节数 + //batch.size当批量的数据大小达到设定值后,就会立即发送,不顾下面的linger.ms + props.put("linger.ms", 1);//延迟1ms发送,这项设置将通过增加小的延迟来完成--即,不是立即发送一条记录,producer将会等待给定的延迟时间以允许其他消息记录发送,这些消息记录可以批量处理 + props.put("buffer.memory", 33554432);//producer可以用来缓存数据的内存大小。 + props.put("key.serializer", + StringSerializer.class.getName()); + props.put("value.serializer", + StringSerializer.class.getName()); + KafkaProducer producer = new KafkaProducer(props); + return producer; + } + + public static void main(String[] args) throws IOException { +// getToken(); +// String access_token = Config.access_token; + + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("application/json"); + RequestBody body = RequestBody.create(mediaType, "{\"fromLanguage\":\"auto\",\"toLanguage\":\"zh\",\"text\":\"hello\"}"); + Request request = new Request.Builder() + .url("https://fanyi.percent.cn/api/dt/tran/text") + .method("POST", body) + .addHeader("Authorization", "Beare fc5e8aa070b0e6c1eba8140c3462afb0") + .addHeader("Content-Type", "application/json") + .addHeader("Cookie", "dt_saas_token=fc5e8aa070b0e6c1eba8140c3462afb0; JSESSIONID=951aa17c-9d1d-4991-a327-d5d2dc2de860") + .build(); + Response response = client.newCall(request).execute(); + String string = response.body().string(); + System.out.println(string); +// System.out.println(access_token); +// getText("hello"); + } +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/KfkUtil.java b/src/main/java/com/bfd/crawl_translate/utils/KfkUtil.java new file mode 100644 index 0000000..3d762eb --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/KfkUtil.java @@ -0,0 +1,81 @@ +package com.bfd.crawl_translate.utils; + +import lombok.extern.slf4j.Slf4j; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import java.util.Properties; + +/** + * @author guowei + * kfk工具类 + */ +@Component +@Slf4j +public class KfkUtil { + private static String topic; + + private static String brokerList; + + @Value("${crawl.kafka.topic}") + public void setTopic(String topic) { + KfkUtil.topic = topic; + } + + @Value("${crawl.kafka.brokers}") + public void setBrokerList(String brokerList) { + KfkUtil.brokerList = brokerList; + } + private static KafkaProducer kafkaProducer; + + public static int num = 0; + + /** + * 获取KafkaProducer实例 + */ + public static KafkaProducer getProducer() { + if (kafkaProducer == null) { + Properties props = new Properties(); + //xxx服务器ip + props.put("bootstrap.servers", brokerList); + //所有follower都响应了才认为消息提交成功,即"committed" + props.put("acks", "all"); + //retries = MAX 无限重试,直到你意识到出现了问题:) + props.put("retries", 3); + //producer将试图批处理消息记录,以减少请求次数.默认的批量处理消息字节数 + props.put("batch.size", 16384); + //batch.size当批量的数据大小达到设定值后,就会立即发送,不顾下面的linger.ms + //延迟1ms发送,这项设置将通过增加小的延迟来完成--即,不是立即发送一条记录,producer将会等待给定的延迟时间以允许其他消息记录发送,这些消息记录可以批量处理 + props.put("linger.ms", 1); + //producer可以用来缓存数据的内存大小。 + props.put("buffer.memory", 33554432); + props.put("key.serializer", + "org.apache.kafka.common.serialization.StringSerializer"); + props.put("value.serializer", + "org.apache.kafka.common.serialization.StringSerializer"); + kafkaProducer = new KafkaProducer(props); + } + return kafkaProducer; + } + + /** + * 关闭KafkaProducer实例 + */ + public static void closeProducer() { + if (kafkaProducer != null) { + log.info("----------close producer----------"); + kafkaProducer.close(); + kafkaProducer = null; + } + } + + public static void sendKafka(String resultData) { + KafkaProducer producer = getProducer(); + ProducerRecord se = new ProducerRecord(topic, resultData); + producer.send(se); + log.info("发送kafka成功"); +// num++; + } +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/PauseTool.java b/src/main/java/com/bfd/crawl_translate/utils/PauseTool.java new file mode 100644 index 0000000..e6d7e26 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/PauseTool.java @@ -0,0 +1,92 @@ +package com.bfd.crawl_translate.utils; + +import com.alibaba.fastjson.JSON; +import com.alibaba.fastjson.JSONObject; +import lombok.extern.slf4j.Slf4j; +import org.apache.curator.framework.CuratorFramework; +import org.apache.curator.framework.CuratorFrameworkFactory; +import org.apache.curator.framework.recipes.cache.NodeCache; +import org.apache.curator.framework.recipes.cache.NodeCacheListener; +import org.apache.curator.retry.ExponentialBackoffRetry; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.data.redis.core.StringRedisTemplate; +import org.springframework.stereotype.Component; + +import javax.annotation.PostConstruct; +import javax.annotation.Resource; +import java.util.HashMap; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * @author:jinming + * @className:ZookeeperNodeMonitor + * @version:1.0 + * @description: Zookeeper节点监听和Redis初始化工具类 + * @Date:2024/7/2 14:20 + */ +@Component +@Slf4j +public class PauseTool { + + // 本地缓存 + public static final HashMap CACHE = new HashMap<>(); + + /** + * 初始化Redis中的version_*键并加载到本地缓存 + */ + public void initializeRedisCache(StringRedisTemplate stringRedisTemplate) { + try { + Set keys = stringRedisTemplate.keys("version_*"); + if (keys != null) { + for (String key : keys) { + String value = stringRedisTemplate.opsForValue().get(key); + if (value != null) { + String sincesId = key.split("_")[1]; + CACHE.put(sincesId.concat("_").concat(value), value); + } + } + } + log.info("当前缓存version信息:{}", JSON.toJSON(CACHE)); + } catch (Exception e) { + log.error("Error initializing Redis cache", e); + } + } + + public void setupZookeeperListener(String connectionString, String nodePath) { + CuratorFramework curatorFramework = CuratorFrameworkFactory.newClient(connectionString, new ExponentialBackoffRetry(1000, 3)); + curatorFramework.start(); + try { + // 创建节点监听器 + NodeCache nodeCache = new NodeCache(curatorFramework, nodePath); + nodeCache.start(); + log.info("数据监听已启动"); + // 监听节点变化 + nodeCache.getListenable().addListener(new NodeCacheListener() { + @Override + public void nodeChanged() throws Exception { + byte[] data = nodeCache.getCurrentData().getData(); + try { + String nodeData = new String(data); + log.info("Node data changed: " + nodeData); + // 解析JSON数据 + JSONObject jsonObject = JSON.parseObject(nodeData); + int scenesId = jsonObject.getIntValue("scenes_id"); + int version = jsonObject.getIntValue("version"); + String newKey = scenesId + "_" + version; + // 移除CACHE中所有以scenesId开头的key + CACHE.keySet().removeIf(key -> key.startsWith(scenesId + "_")); + // 将新的key放入CACHE + CACHE.put(newKey, String.valueOf(version)); + log.info("当前缓存version信息:{}", JSON.toJSON(CACHE)); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + } catch (Exception e) { + log.error("Error setting up Zookeeper listener", e); + } + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/crawl_translate/utils/PercentTransalteUtil.java b/src/main/java/com/bfd/crawl_translate/utils/PercentTransalteUtil.java new file mode 100644 index 0000000..7f04ff6 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/PercentTransalteUtil.java @@ -0,0 +1,166 @@ +package com.bfd.crawl_translate.utils; + +import com.alibaba.fastjson2.JSON; +import lombok.extern.slf4j.Slf4j; +import okhttp3.*; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Component; + +import java.io.IOException; +import java.net.ConnectException; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * @author guowei + */ +@Component +@Slf4j +public class PercentTransalteUtil { + public static String token; + +// @Scheduled(cron = "${crawl.cron.token_cron}") + public static String getToken() { + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded"); + //isRememberMe为1表示获取长效token(7天内有效),不传或非1获取的token为12小时有效 + RequestBody body = RequestBody.create(mediaType, "username=collectionSystem&password=67385d64fab46ab2a25f2ff2898e590d&grant_type=password"); + Request request = new Request.Builder() + .url("https://fanyi.percent.cn/api/SystemManager/oauth/token") + .method("POST", body) + .addHeader("Content-Type", "application/x-www-form-urlencoded") + .build(); + try { + Response response = client.newCall(request).execute(); + Map json = (Map) JSON.parse(response.body().string()); + if ("200".equals(json.get("code"))) { + log.info("获取token成功,当前时间:"+System.currentTimeMillis()); + token = (String) json.get("access_token"); + } else { + log.error("获取token失败,json:" + json); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + return ""; + } + + public static Map getText(String text) { + String result = ""; + Map resultMap = new HashMap<>(); + try { + String encode = URLEncoder.encode(text, "UTF-8"); + OkHttpClient client = new OkHttpClient().newBuilder() + .connectTimeout(300, TimeUnit.SECONDS) + .readTimeout(60, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS) + .build(); + MediaType mediaType = MediaType.parse("application/json"); + RequestBody body = RequestBody.create(mediaType, "{\"fromLanguage\":\"auto\",\"toLanguage\":\"en\",\"text\":\"" + encode + "\"}"); + Request request = new Request.Builder() + .url("https://fanyi.percent.cn/api/dt/tran/text") + .method("POST", body) + .addHeader("Authorization", "Bearer " + token) + .addHeader("Content-Type", "application/json") + .build(); + Response response = null; + for (int i = 0; i < 10; i++) { + try { + response = client.newCall(request).execute(); + if (response.isSuccessful()) { + break; + } + }catch (ConnectException connectException){ + connectException.printStackTrace(); + } + } + Map json = (Map) JSON.parse(response.body().string()); + if ((int) json.get("code") == 200) { + Map data = (Map) json.get("data"); + result = URLDecoder.decode((String) data.get("value"), "UTF-8"); + resultMap.put("isSuccess", true); + resultMap.put("result", result); + } else { + log.error("文本翻译失败,json:" + json); + resultMap.put("isSuccess", false); + resultMap.put("result", result); + } + + } catch (Throwable e) { +// e. + log.error("实时文本接口翻译失败,text:"+text+",e:"+e); + } + return resultMap; + } + + /** + * 获取语种 + * @param text + * @return + */ + public static Boolean getLanguage(String text) { + String result = ""; + Boolean isZh = false; + try { +// String encode = URLEncoder.encode(text, "UTF-8"); + OkHttpClient client = new OkHttpClient().newBuilder() + .connectTimeout(300, TimeUnit.SECONDS) + .readTimeout(60, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS) + .build(); + MediaType mediaType = MediaType.parse("application/json"); + RequestBody body = RequestBody.create(mediaType, "{\"text\":\"" + text + "\"}"); + Request request = new Request.Builder() + .url("https://fanyi.percent.cn/api/dt/api/language/distinguish") + .method("POST", body) + .addHeader("Authorization", "Bearer " + Config.access_token) + .addHeader("Content-Type", "application/json") + //.addHeader("Cookie", "dt_saas_token=d3c60514c65285836dc316b6d48d468e; JSESSIONID=2726ac68-ad92-470f-b1d5-c1c0255f1cb9") + .build(); + Response response = null; + for (int i = 0; i < 10; i++) { +// System.out.println(i); + try { + response = client.newCall(request).execute(); + if (response.isSuccessful()) { + break; + } + }catch (ConnectException connectException){ + connectException.printStackTrace(); +// continue; + } + } + Map json = (Map) JSON.parse(response.body().string()); + if ((int) json.get("code") == 200) { + Map data = (Map) json.get("data"); +// result = URLDecoder.decode((String) data.get("value"), "UTF-8"); + String language = (String) data.get("language"); + if (language.equals("zh")){ + isZh = true; + }else { + isZh = false; + } + } else { + log.error("语种检测失败,json:" + json); + + } +// System.out.println(response.code()); +// System.out.println(response.body().string()); + } catch (Throwable e) { +// e. + log.error("语种检测失败,text:"+text+",e:"+e); + } + return isZh; + } + + public static void main(String[] args) { + getToken(); + Map data = getText("你好"); + System.out.println(data.get("result")); + + } +} diff --git a/src/main/java/com/bfd/crawl_translate/utils/TranslateUtil.java b/src/main/java/com/bfd/crawl_translate/utils/TranslateUtil.java new file mode 100644 index 0000000..27f5152 --- /dev/null +++ b/src/main/java/com/bfd/crawl_translate/utils/TranslateUtil.java @@ -0,0 +1,165 @@ +package com.bfd.crawl_translate.utils; + +import com.alibaba.fastjson.JSONObject; +import lombok.extern.slf4j.Slf4j; +import okhttp3.*; +import org.springframework.stereotype.Component; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.Proxy; +import java.net.URLEncoder; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +/** + * @author guowei + */ +@Slf4j +@Component +public class TranslateUtil { + /** + * 模拟请求翻译 + * @param sourceContent 原始内容 + * @return 翻译后内容 + */ + public static String doDown(String sourceContent) { +// System.out.println("sourceContent --"+sourceContent); + MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded;charset=UTF-8"); + String freq = "[[[\"MkEWBc\",\"[[\\\"#sourceContent#\\\",\\\"auto\\\",\\\"zh\\\",true],[null]]\",null,\"generic\"]]]&"; + sourceContent = sourceContent.replace("\"", "\\\\\\\"").replace("\n", "%5C%5Cn"); + freq = freq.replace("#sourceContent#", sourceContent); + String resData = null; + OkHttpClient client = new OkHttpClient().newBuilder().build(); + try { + freq = URLEncoder.encode(freq, "UTF-8"); + freq = freq.replace("%255C", "%5C").replace("+", "%20"); + RequestBody body = RequestBody.create(mediaType, "f.req=" + freq + "&"); + Request request = new Request.Builder() + .url("https://translate.google.com/_/TranslateWebserverUi/data/batchexecute") + .method("POST", body) + .addHeader("authority", "translate.google.cn") + .addHeader("accept", "*/*") + .addHeader("accept-language", "zh-CN,zh;q=0.9,en;q=0.8") + .addHeader("cache-control", "no-cache") + .addHeader("content-type", "application/x-www-form-urlencoded;charset=UTF-8") + .addHeader("cookie", "NID=511=voX-g_h3pvWIPMGq5T4ZaWq5jd6vRlZxBa6wqNTkEkdarBpallKXRxEOvfJu5TLDfbUxJXopAExNiqHJEW1wZU0MuvTIRmkVAAwBknQKHO_gu_xjtuXA00a56i8JL7RWSharKyQ5Ihoq0B-x21AANraC1Fhs9Q6q9eaSKZ3SwRw; _ga=GA1.3.1552263839.1660025840; _gid=GA1.3.1708261691.1660025840; OTZ=6628698_24_24__24_") + .addHeader("origin", "https://www..google.com") + .addHeader("pragma", "no-cache") + .addHeader("referer", "https://www.google.com/") + .addHeader("sec-ch-ua", "\"Chromium\";v=\"104\", \" Not A;Brand\";v=\"99\", \"Google Chrome\";v=\"104\"") + .addHeader("sec-ch-ua-arch", "\"x86\"") + .addHeader("sec-ch-ua-bitness", "\"64\"") + .addHeader("sec-ch-ua-full-version", "\"105.0.5195.127\"") + .addHeader("sec-ch-ua-full-version-list", "\".Not/A)Brand\";v=\"99.0.0.0\", \"Google Chrome\";v=\"105.0.5195.127\", \"Chromium\";v=\"105.0.5195.127\"") + .addHeader("sec-ch-ua-mobile", "?0") + .addHeader("sec-ch-ua-model", "") + .addHeader("sec-ch-ua-platform", "\"Windows\"") + .addHeader("sec-ch-ua-platform-version", "\"14.0.0\"") + .addHeader("sec-fetch-dest", "empty") + .addHeader("sec-fetch-mode", "cors") + .addHeader("sec-fetch-site", "same-origin") + .addHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36") + .addHeader("x-goog-batchexecute-bgr", "[\";QF64XhPQAAYEzRk0JM1fOJOg-gftA3smACkAIwj8RvJeTIev0D-SyoRKJ6uoX00-HGvJj52RYTiUM4uTOmtSjeDe8R8AAABETwAAAAN1AQcXADDPHkARfFVQcldCsJmHArK64X7Kwazlq5rjQUagjtthTLDL8Kzwcpyq2ZSnfkW-TgeEAkCdc1ZvDogdYwYPMOz1Wlm8kLxPrcyeDMlDZPwN-mxBRJ37LOpgblKfQbr4Uol0HOHebHB1MFSkLsj1GXbaAleVvEXINfpVoeY3ffPsALBLISUQo2-U9qxDn37JXqCrusVYhqzob2M7PEIb92UC8FT9NnDamVfEqaEjSFsmWH3-nfWCh4w65mczyn7qIhrktTvjIpNa7f19cW5hiPwrolcvqU7jjhELE2u-pJTtdmyuWW505lNZkFB3JDWIWbnF5fwlXW4h956fijzKow8rQ6VghrCdPmk5HKsDBvKKczaMM-j--yXw9UbLDtjwU3M_W_MMa3pxImT1161eWEmny3itLXPrN7F0ooYXFn8NMIcZ9124sRpN0JGQwOToAzJw_A5NxHNHHthCqsuZcmS1QWoDc52J7iVptnljhJj8wvin5DYCopcIraaZZdzKsyPL_cApUctaEC1mr5B6Bi5AVSNvkfyoZCx9tPdqAiDLjK0a0sx_vZYK5yARycrTsycOzN1aA9wCYvTdvtruv1f8JQwG6N9SsOAi4Ko0yHb8Uo9KhY6bO6sqcYqNOkWG8Zhl2-nvC6u2vLHofilKy8CAdzJQq5VlnrwhPPKhZupo1I_3XBYOxHYgy3Ly7fDaVhHUWam2H6xU6m4gaUbaTyx5MtD9mJqAOQRjrzeO-kUBshbAlCAEqpBd7IJ089Ph-RrST0NsRoUL7wuC5cxIs3jNcbBf3yietmn3vnDCWzRYI1gh7CtYSQ2Xk_kNLR41BjsoOK0\",null,null,380,7,null,null,0,\"2\"]") + .addHeader("x-same-domain", "1") + .build(); + int i = 0; + while (i < 3) { + i++; + Response response = null; + try { + //requestPost 使用代理方式 + resData = requestPost(request); + //使用本机ip +// response = client.newCall(request).execute(); +// if (response.isSuccessful() && response.body() != null) { +// resData = response.body().string(); +// } +// resData = response.body().string(); +// System.out.println(resData); + } catch (Exception e) { + log.warn("Download Fail retry:{}. errorMsg:", i, e); + } finally { +// response.close(); + } + } + } catch (Exception e) { + log.error("Download Fail sourceContent:"+sourceContent+",e:"+e); + } + String parse = doParse(resData); + return parse; + } + + + + /** + * 解析请求内容 + * @param resData + * @return + */ + public static String doParse(String resData){ + String resTranslate = ""; + try { + resData = resData.replaceAll("\\)\\]}'",""); + resData = "{\"a\":"+resData+"}"; + JSONObject jsonObject = JSONObject.parseObject(resData); + String a1 = jsonObject.getJSONArray("a").getJSONArray(0).get(2).toString(); + String a2 = "{\"a\":"+a1+"}"; + JSONObject objects = JSONObject.parseObject(a2); + int size = objects.getJSONArray("a").getJSONArray(1).getJSONArray(0).getJSONArray(0).getJSONArray(5).size(); + for (int i=0;i<=size-1;i++){ + String string = objects.getJSONArray("a").getJSONArray(1).getJSONArray(0).getJSONArray(0).getJSONArray(5).getJSONArray(i).get(0).toString(); + if (!string.isEmpty() && !string.equals("null") ) { + resTranslate += objects.getJSONArray("a").getJSONArray(1).getJSONArray(0).getJSONArray(0).getJSONArray(5).getJSONArray(i).get(0).toString()+" "; + } + } + } catch (Exception e) { + e.printStackTrace(); + log.error("Parse fail,e:"+e); + } + return resTranslate; + } + + public static String requestPost(Request request) throws Exception { + + OkHttpClient httpClient = Proxy(); + String resData = null; + Random random = new Random(); + int m = (int) (20 * (random.nextFloat() + 1)); + Thread.sleep(m); + Response response = httpClient.newCall(request).execute(); + if (response.isSuccessful() && response.body() != null) { + resData = response.body().string(); + } + response.close(); + return resData; + } + public static OkHttpClient Proxy() { + Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("oversea_vpn.baifendian.com", 3128)); + Authenticator proxyAuthenticator = new Authenticator() { + + @Override + public Request authenticate(Route route, Response response) throws IOException { + //设置代理服务器账号密码 + String credential = Credentials.basic("", ""); + return response.request().newBuilder() + .header("Proxy-Authorization", credential) + .build(); + } + }; + OkHttpClient httpClientProxy = new OkHttpClient.Builder() +// //1000是1秒 + .connectTimeout(200, TimeUnit.SECONDS) + //设置读取超时时间 + .readTimeout(30, TimeUnit.SECONDS) + .proxy(proxy) + .proxyAuthenticator(proxyAuthenticator) + .build(); + return httpClientProxy; + } + + public static void main(String[] args) { + String doDown = doDown("British authorities withhold Chelsea proceeds from Roman Abramovich"); + System.out.println(doDown); + } +} diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml new file mode 100644 index 0000000..02ae8ea --- /dev/null +++ b/src/main/resources/application.yml @@ -0,0 +1,59 @@ +crawl: + es: + subjectId: cl_special_1.0_305041 + cid: all + beginTime: 2000-01-01 + endTime: 2025-01-01 + cron: + token_cron: 0 0 0/10 * * ? + query_cron: 0 26 10 * * ? + size_cron: 0 0/30 * * * * + kafka: + topic: analyze + brokers: 172.26.28.30:9092 + task: + taskData: ./data/taskData.txt +server: + port: 9999 +#日志级别 +logging: + level: + com: + bfd: INFO + #日志路径 + log: + path: ./logs +spring: + redis: + host: 172.24.12.126 + port: 6379 + timeout: 10000 + database: 5 + jedis: + pool: + max-active: 8 # 连接池最大连接数(使用负值表示没有限制) + max-wait: 800 # 连接池最大阻塞等待时间(使用负值表示没有限制) + max-idle: 8 # 连接池中的最大空闲连接 + min-idle: 2 # 连接池中的最小空闲连接 + boot: + admin: + client: + url: http://172.18.1.147:8001 + instance: + service-base-url: http://172.18.1.147:9999 + application: + name: translate +management: + endpoints: + web: + exposure: + include: "*" + endpoint: + health: + show-details: always + health: + elasticsearch: + enabled: false +zookeeper: + connection-string: 172.18.1.146:2181,172.18.1.147:2181,172.18.1.148:2181 + publish-node: /analyze diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml new file mode 100644 index 0000000..ecd456c --- /dev/null +++ b/src/main/resources/logback-spring.xml @@ -0,0 +1,38 @@ + + + + + + + + + true + + ${logging.level} + + + ${logging.path}/crawlSchedule.log + + + + ${logging.path}/crawlSchedule.log.%d{yyyy-MM-dd} + + 7 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n + UTF-8 + + + + + + + +