package com.example;

import org.apache.kafka.clients.producer.*;
import org.apache.kafka.common.serialization.StringSerializer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

import java.io.*;
import java.util.*;
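/**
 * Crawler that collects article URLs from the zyctd.com news listing pages,
 * extracts each article's title, publication date and body text with jsoup,
 * and publishes the result to the Kafka topic "news-topic".
 * Processed URLs are persisted to processed_urls.txt so reruns skip known articles.
 */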
public class getInKa {

    // Shared OkHttp client used for all HTTP requests
    private static final OkHttpClient httpClient = new OkHttpClient();
    // File recording the URLs that have already been processed
    private static final String PROCESSED_URLS_FILE = "processed_urls.txt";

    public static void main(String[] args) {
        try {
            // Collect the list of target article URLs
            System.out.println("Starting URL collection...");
            List<String> urls = getUrls();
            System.out.println("Collected " + urls.size() + " URLs.");

            // Extract the news data from each URL and publish it to Kafka
            System.out.println("Starting news extraction...");
            getNews(urls);
            System.out.println("News extraction completed.");
        } catch (IOException | InterruptedException e) {
            System.out.println("Error in main: " + e.getMessage());
        }
    }
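    /**
     * Walks listing pages 1-28 of the zyctd.com news section, collects the article
     * links on each page and returns only the URLs that were not processed in a
     * previous run. Sleeps one second between listing pages.
     */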
    public static List<String> getUrls() throws IOException, InterruptedException {
        List<String> urls = new ArrayList<>();
        Set<String> processedUrls = loadProcessedUrls(); // load already-processed URLs

        for (int page = 1; page <= 28; page++) {
            String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html";
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
                    .build();
            System.out.println("Fetching page " + page + ": " + url);

            try (Response response = httpClient.newCall(request).execute()) {
                if (response.isSuccessful() && response.body() != null) {
                    System.out.println("Successfully fetched page " + page);
                    String html = response.body().string();
                    Document doc = Jsoup.parse(html);
                    Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a");
                    List<String> projectIDs = links.eachAttr("href");
                    System.out.println("Found " + projectIDs.size() + " URLs on page " + page);
                    for (String projectUrl : projectIDs) {
                        if (!processedUrls.contains(projectUrl)) { // skip URLs that were already processed
                            urls.add(projectUrl);
                            processedUrls.add(projectUrl); // mark as processed
                        }
                    }
                } else {
                    System.out.println("Failed to fetch page " + page + ": Status code " + response.code());
                }
            }
            Thread.sleep(1000);
        }
        saveProcessedUrls(processedUrls); // persist the processed URL set
        return urls;
    }
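    /**
     * Fetches every article URL, parses the title, date and body text with jsoup,
     * and hands each complete record to saveData() for publishing to Kafka.
     * Sleeps five seconds between article requests.
     */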
    public static void getNews(List<String> urls) throws IOException {
        for (int i = 0; i < urls.size(); i++) {
            String url = urls.get(i);
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
                    .build();
            System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url);

            try (Response response = httpClient.newCall(request).execute()) {
                if (response.isSuccessful() && response.body() != null) {
                    System.out.println("Successfully fetched news from " + url);
                    String html = response.body().string();
                    Document doc = Jsoup.parse(html);
                    String title = doc.select("div.info-title.t-center > h1").text().trim();
                    String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim();
                    String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim();
                    if (content.isEmpty()) {
                        // fall back to an alternative article body layout
                        content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim();
                    }
                    if (!title.isEmpty() && !date.isEmpty() && !content.isEmpty()) {
                        Map<String, String> news = new HashMap<>();
                        news.put("title", title);
                        news.put("date", date);
                        news.put("content", content);
                        news.put("url", url);
                        System.out.println("Extracted news: " + news.get("title"));
                        saveData(news); // publish the extracted record to Kafka
                    } else {
                        System.out.println("Failed to extract complete data from " + url);
                    }
                } else {
                    System.out.println("Failed to fetch news from " + url + ": Status code " + response.code());
                }
            } catch (Exception e) {
                System.out.println("An error occurred while fetching " + url + ": " + e.getMessage());
            }
            try {
                Thread.sleep(5000); // sleep 5 seconds between article requests
            } catch (InterruptedException e) {
                System.out.println("Sleep interrupted: " + e.getMessage());
            }
        }
    }
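    /**
     * Publishes one news record to the Kafka topic "news-topic", keyed by title.
     * Blocking on the Future returned by send() makes the call effectively
     * synchronous, so delivery failures surface before the next article is processed.
     */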
    public static void saveData(Map<String, String> news) {
        Properties properties = new Properties();
        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());

        try (Producer<String, String> producer = new KafkaProducer<>(properties)) {
            String topic = "news-topic";
            String key = news.get("title");
            String value = news.toString(); // note: Map.toString() output, not JSON
            ProducerRecord<String, String> record = new ProducerRecord<>(topic, key, value);
            producer.send(record, (metadata, exception) -> {
                if (exception == null) {
                    System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic() +
                            ", partition=" + metadata.partition() + ", offset=" + metadata.offset());
                } else {
                    System.err.println("Failed to send data to Kafka: " + exception.getMessage());
                }
            }).get();
        } catch (Exception e) {
            System.err.println("Error while sending data to Kafka: " + e.getMessage());
        }
    }
    // Load previously processed URLs from processed_urls.txt, if it exists
    private static Set<String> loadProcessedUrls() throws IOException {
        Set<String> processedUrls = new HashSet<>();
        File file = new File(PROCESSED_URLS_FILE);
        if (file.exists()) {
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    processedUrls.add(line.trim());
                }
            }
        }
        return processedUrls;
    }

    // Persist the processed URLs, one per line
    private static void saveProcessedUrls(Set<String> processedUrls) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(PROCESSED_URLS_FILE))) {
            for (String url : processedUrls) {
                writer.write(url);
                writer.newLine();
            }
        }
    }
}