package com.example;
import org.apache.kafka.clients.producer.*;
import org.apache.kafka.common.serialization.StringSerializer;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

import java.io.*;
import java.util.*;
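/**
 * Scrapes news articles from www.zyctd.com: collects article URLs from the
 * paginated listing pages, extracts the title, date and body of each article,
 * and publishes each record to a Kafka topic. Already-processed URLs are kept
 * in a local file so repeated runs skip duplicates.
 */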
public class getInKa {

    // OkHttp client used to send all HTTP requests
    private static final OkHttpClient httpClient = new OkHttpClient();
    // File that records URLs which have already been processed
    private static final String PROCESSED_URLS_FILE = "processed_urls.txt";

    public static void main(String[] args) {
        try {
            // Collect the list of target article URLs
            System.out.println("Starting URL collection...");
            List<String> urls = getUrls();
            System.out.println("Collected " + urls.size() + " URLs.");

            // Extract the news data from each URL and send it to Kafka
            System.out.println("Starting news extraction...");
            getNews(urls);
            System.out.println("News extraction completed.");
        } catch (IOException | InterruptedException e) {
            System.out.println("Error in main: " + e.getMessage());
        }
    }

    public static List<String> getUrls() throws IOException, InterruptedException {
        List<String> urls = new ArrayList<>();
        Set<String> processedUrls = loadProcessedUrls(); // URLs handled in previous runs
        for (int page = 1; page <= 28; page++) {
            String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html";
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
                    .build();

            System.out.println("Fetching page " + page + ": " + url);
            try (Response response = httpClient.newCall(request).execute()) {
                if (response.isSuccessful() && response.body() != null) {
                    System.out.println("Successfully fetched page " + page);
                    String html = response.body().string();
                    Document doc = Jsoup.parse(html);
                    Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a");
                    List<String> projectIDs = links.eachAttr("href");
                    System.out.println("Found " + projectIDs.size() + " URLs on page " + page);

                    for (String projectUrl : projectIDs) {
                        if (!processedUrls.contains(projectUrl)) { // skip URLs already handled
                            urls.add(projectUrl);
                            processedUrls.add(projectUrl); // remember it for future runs
                        }
                    }
                } else {
                    System.out.println("Failed to fetch page " + page + ": Status code " + response.code());
                }
            }
            Thread.sleep(1000); // pause between listing pages
        }
        saveProcessedUrls(processedUrls); // persist the processed-URL set
        return urls;
    }

    public static void getNews(List<String> urls) throws IOException {
        for (int i = 0; i < urls.size(); i++) {
            String url = urls.get(i);
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
                    .build();

            System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url);
            try (Response response = httpClient.newCall(request).execute()) {
                if (response.isSuccessful() && response.body() != null) {
                    System.out.println("Successfully fetched news from " + url);
                    String html = response.body().string();
                    Document doc = Jsoup.parse(html);
                    String title = doc.select("div.info-title.t-center > h1").text().trim();
                    String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim();
                    String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim();
                    if (content.isEmpty()) { // fall back to an alternative article layout
                        content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim();
                    }

                    if (!title.isEmpty() && !date.isEmpty() && !content.isEmpty()) {
                        Map<String, String> news = new HashMap<>();
                        news.put("title", title);
                        news.put("date", date);
                        news.put("content", content);
                        news.put("url", url);
                        System.out.println("Extracted news: " + news.get("title"));
                        saveData(news); // publish the extracted record to Kafka
                    } else {
                        System.out.println("Failed to extract complete data from " + url);
                    }
                } else {
                    System.out.println("Failed to fetch news from " + url + ": Status code " + response.code());
                }
            } catch (Exception e) {
                System.out.println("An error occurred while fetching " + url + ": " + e.getMessage());
            }
            try {
                Thread.sleep(5000); // sleep 5 seconds between articles
            } catch (InterruptedException e) {
                System.out.println("Sleep interrupted: " + e.getMessage());
            }
        }
    }

    public static void saveData(Map<String, String> news) {
        Properties properties = new Properties();
        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());

        // Note: a new producer is created and closed for every record; reusing a single
        // producer instance would be more efficient if throughput matters.
        try (Producer<String, String> producer = new KafkaProducer<>(properties)) {
            String topic = "news-topic";
            String key = news.get("title");
            String value = news.toString();
            ProducerRecord<String, String> record = new ProducerRecord<>(topic, key, value);

            // Send asynchronously with a callback, then block on get() so the producer
            // is not closed before the record has been acknowledged.
            producer.send(record, (metadata, exception) -> {
                if (exception == null) {
                    System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic()
                            + ", partition=" + metadata.partition() + ", offset=" + metadata.offset());
                } else {
                    System.err.println("Failed to send data to Kafka: " + exception.getMessage());
                }
            }).get();
        } catch (Exception e) {
            System.err.println("Error while sending data to Kafka: " + e.getMessage());
        }
    }
    // Load the set of already-processed URLs from disk
    private static Set<String> loadProcessedUrls() throws IOException {
        Set<String> processedUrls = new HashSet<>();
        File file = new File(PROCESSED_URLS_FILE);
        if (file.exists()) {
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    processedUrls.add(line.trim());
                }
            }
        }
        return processedUrls;
    }

    // Persist the set of already-processed URLs to disk
    private static void saveProcessedUrls(Set<String> processedUrls) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(PROCESSED_URLS_FILE))) {
            for (String url : processedUrls) {
                writer.write(url);
                writer.newLine();
            }
        }
    }
}