package com.example;
import org.apache.kafka.clients.producer.*;
import org.apache.kafka.common.serialization.StringSerializer;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

import java.io.*;
import java.util.*;
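/**
 * Scrapes news articles from www.zyctd.com: collects article URLs from the
 * paginated listing pages, extracts the title, date and body of each article,
 * and publishes each record to a Kafka topic. Already-processed URLs are kept
 * in a local file so repeated runs skip duplicates.
 */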
public class getInKa {

    // OkHttp client used to send all HTTP requests
    private static final OkHttpClient httpClient = new OkHttpClient();
    // File that records URLs which have already been processed
    private static final String PROCESSED_URLS_FILE = "processed_urls.txt";

    public static void main(String[] args) {
        try {
            // Collect the list of target article URLs
            System.out.println("Starting URL collection...");
            List<String> urls = getUrls();
            System.out.println("Collected " + urls.size() + " URLs.");

            // Extract the news data from each URL and send it to Kafka
            System.out.println("Starting news extraction...");
            getNews(urls);
            System.out.println("News extraction completed.");
        } catch (IOException | InterruptedException e) {
            System.out.println("Error in main: " + e.getMessage());
        }
    }

    public static List<String> getUrls() throws IOException, InterruptedException {
        List<String> urls = new ArrayList<>();
        Set<String> processedUrls = loadProcessedUrls(); // URLs handled in previous runs
        for (int page = 1; page <= 28; page++) {
            String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html";
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
                    .build();

            System.out.println("Fetching page " + page + ": " + url);
            try (Response response = httpClient.newCall(request).execute()) {
                if (response.isSuccessful() && response.body() != null) {
                    System.out.println("Successfully fetched page " + page);
                    String html = response.body().string();
                    Document doc = Jsoup.parse(html);
                    Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a");
                    List<String> projectIDs = links.eachAttr("href");
                    System.out.println("Found " + projectIDs.size() + " URLs on page " + page);

                    for (String projectUrl : projectIDs) {
                        if (!processedUrls.contains(projectUrl)) { // skip URLs already handled
                            urls.add(projectUrl);
                            processedUrls.add(projectUrl); // remember it for future runs
                        }
                    }
                } else {
                    System.out.println("Failed to fetch page " + page + ": Status code " + response.code());
                }
            }
            Thread.sleep(1000); // pause between listing pages
        }
        saveProcessedUrls(processedUrls); // persist the processed-URL set
        return urls;
    }

    public static void getNews(List<String> urls) throws IOException {
        for (int i = 0; i < urls.size(); i++) {
            String url = urls.get(i);
            Request request = new Request.Builder()
                    .url(url)
                    .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
                    .build();

            System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url);
            try (Response response = httpClient.newCall(request).execute()) {
                if (response.isSuccessful() && response.body() != null) {
                    System.out.println("Successfully fetched news from " + url);
                    String html = response.body().string();
                    Document doc = Jsoup.parse(html);
                    String title = doc.select("div.info-title.t-center > h1").text().trim();
                    String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim();
                    String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim();
                    if (content.isEmpty()) { // fall back to an alternative article layout
                        content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim();
                    }

                    if (!title.isEmpty() && !date.isEmpty() && !content.isEmpty()) {
                        Map<String, String> news = new HashMap<>();
                        news.put("title", title);
                        news.put("date", date);
                        news.put("content", content);
                        news.put("url", url);
                        System.out.println("Extracted news: " + news.get("title"));
                        saveData(news); // publish the extracted record to Kafka
                    } else {
                        System.out.println("Failed to extract complete data from " + url);
                    }
                } else {
                    System.out.println("Failed to fetch news from " + url + ": Status code " + response.code());
                }
            } catch (Exception e) {
                System.out.println("An error occurred while fetching " + url + ": " + e.getMessage());
            }
            try {
                Thread.sleep(5000); // sleep 5 seconds between articles
            } catch (InterruptedException e) {
                System.out.println("Sleep interrupted: " + e.getMessage());
            }
        }
    }

    public static void saveData(Map<String, String> news) {
        Properties properties = new Properties();
        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());

        // Note: a new producer is created and closed for every record; reusing a single
        // producer instance would be more efficient if throughput matters.
        try (Producer<String, String> producer = new KafkaProducer<>(properties)) {
            String topic = "news-topic";
            String key = news.get("title");
            String value = news.toString();
            ProducerRecord<String, String> record = new ProducerRecord<>(topic, key, value);

            // Send asynchronously with a callback, then block on get() so the producer
            // is not closed before the record has been acknowledged.
            producer.send(record, (metadata, exception) -> {
                if (exception == null) {
                    System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic()
                            + ", partition=" + metadata.partition() + ", offset=" + metadata.offset());
                } else {
                    System.err.println("Failed to send data to Kafka: " + exception.getMessage());
                }
            }).get();
        } catch (Exception e) {
            System.err.println("Error while sending data to Kafka: " + e.getMessage());
        }
    }
    // Load the set of already-processed URLs from disk
    private static Set<String> loadProcessedUrls() throws IOException {
        Set<String> processedUrls = new HashSet<>();
        File file = new File(PROCESSED_URLS_FILE);
        if (file.exists()) {
            try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    processedUrls.add(line.trim());
                }
            }
        }
        return processedUrls;
    }

    // Persist the set of already-processed URLs to disk
    private static void saveProcessedUrls(Set<String> processedUrls) throws IOException {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(PROCESSED_URLS_FILE))) {
            for (String url : processedUrls) {
                writer.write(url);
                writer.newLine();
            }
        }
    }
}