package com.example;
import org.apache.kafka.clients.producer.*;
import org.apache.kafka.common.serialization.StringSerializer;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import java.io.*;
import java.util.*;
import java.util.concurrent.Future;
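/**
 * Crawls news articles from www.zyctd.com and publishes the extracted
 * records to a local Kafka topic.
 */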
public class getInKa {
// Shared OkHttp client used for all HTTP requests
private static final OkHttpClient httpClient = new OkHttpClient();
private static final String PROCESSED_URLS_FILE = "processed_urls.txt"; // file that records already-processed URLs
public static void main(String[] args) {
try {
// Collect the list of target article URLs
System.out.println("Starting URL collection...");
List<String> urls = getUrls();
System.out.println("Collected " + urls.size() + " URLs.");
// Extract the news data from each URL and send it to Kafka
System.out.println("Starting news extraction...");
getNews(urls);
System.out.println("News extraction completed.");
} catch (IOException | InterruptedException e) {
System.out.println("Error in main: " + e.getMessage());
}
}
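/**
 * Walks the paginated listing pages (pz102-1 ... pz102-28), collects the
 * article links found on each page, and returns only those URLs that have
 * not been processed in a previous run.
 */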
public static List<String> getUrls() throws IOException, InterruptedException {
List<String> urls = new ArrayList<>();
Set<String> processedUrls = loadProcessedUrls(); // load URLs handled in previous runs
for (int page = 1; page <= 28; page++) {
String url = "https://www.zyctd.com/zixun/201/pz102-" + page + ".html";
Request request = new Request.Builder()
.url(url)
.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
.build();
System.out.println("Fetching page " + page + ": " + url);
try (Response response = httpClient.newCall(request).execute()) {
if (response.isSuccessful() && response.body() != null) {
System.out.println("Successfully fetched page " + page);
String html = response.body().string();
Document doc = Jsoup.parse(html);
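// Each listing entry contains an <a> tag pointing to an article; collect the href values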
Elements links = doc.select("div.zixun-list > div.zixun-item-box > div.zixun-item-title > p > a");
List<String> projectIDs = links.eachAttr("href");
System.out.println("Found " + projectIDs.size() + " URLs on page " + page);
for (String projectUrl : projectIDs) {
if (!processedUrls.contains(projectUrl)) { // skip URLs that were already processed
urls.add(projectUrl);
processedUrls.add(projectUrl); // mark as processed
}
}
} else {
System.out.println("Failed to fetch page " + page + ": Status code " + response.code());
}
}
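// Wait 1 second between listing pages to avoid hammering the server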
Thread.sleep(1000);
}
saveProcessedUrls(processedUrls); // persist the updated set of processed URLs
return urls;
}
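/**
 * Fetches each article page, extracts the title, publish date and body text
 * with Jsoup CSS selectors, and hands each complete record to saveData().
 */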
public static void getNews(List<String> urls) throws IOException {
for (int i = 0; i < urls.size(); i++) {
String url = urls.get(i);
Request request = new Request.Builder()
.url(url)
.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0")
.build();
System.out.println("Processing URL " + (i + 1) + "/" + urls.size() + ": " + url);
try (Response response = httpClient.newCall(request).execute()) {
if (response.isSuccessful() && response.body() != null) {
System.out.println("Successfully fetched news from " + url);
String html = response.body().string();
Document doc = Jsoup.parse(html);
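// Pull title, date and body paragraphs out of the article page; a fallback selector
// handles pages whose body paragraphs sit directly under div.info-content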
String title = doc.select("div.info-title.t-center > h1").text().trim();
String date = doc.select("div.author.color-grey.art-info > span:nth-child(1)").text().trim();
String content = String.join("\n", doc.select("div.info-content > div > p").eachText()).trim();
if (content.isEmpty()) {
content = String.join("\n", doc.select("div.info-content > p:nth-child(2)").eachText()).trim();
}
if (!title.isEmpty() && !date.isEmpty() && !content.isEmpty()) {
Map<String, String> news = new HashMap<>();
news.put("title", title);
news.put("date", date);
news.put("content", content);
news.put("url", url);
System.out.println("Extracted news: " + news.get("title"));
saveData(news); // send the extracted record to Kafka
} else {
System.out.println("Failed to extract complete data from " + url);
}
} else {
System.out.println("Failed to fetch news from " + url + ": Status code " + response.code());
}
} catch (Exception e) {
System.out.println("An error occurred while fetching " + url + ": " + e.getMessage());
}
try {
Thread.sleep(5000); // pause 5 seconds between article requests
} catch (InterruptedException e) {
System.out.println("Sleep interrupted: " + e.getMessage());
}
}
}
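/**
 * Publishes a single news record to the "news-topic" Kafka topic, keyed by
 * the article title. Note that a new KafkaProducer is created for every
 * record; for a large crawl, reusing one producer instance would be cheaper.
 */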
public static void saveData(Map<String, String> news) {
Properties properties = new Properties();
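// Minimal producer configuration; the broker address localhost:9092 is assumed to match the local Kafka setup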
properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
try (Producer<String, String> producer = new KafkaProducer<>(properties)) {
String topic = "news-topic";
String key = news.get("title");
String value = news.toString();
ProducerRecord<String, String> record = new ProducerRecord<>(topic, key, value);
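// send() is asynchronous; the callback logs the result and get() blocks until the broker acknowledges the record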
producer.send(record, (metadata, exception) -> {
if (exception == null) {
System.out.println("Data sent successfully to Kafka: topic=" + metadata.topic() +
", partition=" + metadata.partition() + ", offset=" + metadata.offset());
} else {
System.err.println("Failed to send data to Kafka: " + exception.getMessage());
}
}).get();
} catch (Exception e) {
System.err.println("Error while sending data to Kafka: " + e.getMessage());
}
}
// Load the set of already-processed URLs from disk
private static Set<String> loadProcessedUrls() throws IOException {
Set<String> processedUrls = new HashSet<>();
File file = new File(PROCESSED_URLS_FILE);
if (file.exists()) {
try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
String line;
while ((line = reader.readLine()) != null) {
processedUrls.add(line.trim());
}
}
}
return processedUrls;
}
// Save the set of processed URLs back to disk
private static void saveProcessedUrls(Set<String> processedUrls) throws IOException {
try (BufferedWriter writer = new BufferedWriter(new FileWriter(PROCESSED_URLS_FILE))) {
for (String url : processedUrls) {
writer.write(url);
writer.newLine();
}
}
}
}