// Server_local_code_deployment/src/main/java/com/example/CaptchaOCR.java


								package com.example;


								import java.awt.Graphics2D;
								import java.awt.image.BufferedImage;
								import java.awt.image.WritableRaster;
								import java.io.*;
								import java.net.HttpURLConnection;
								import java.net.URL;
								import java.util.regex.Pattern;
								import javax.imageio.ImageIO;
								import net.sourceforge.tess4j.Tesseract;
								import net.sourceforge.tess4j.TesseractException;


								// ... 其他必要的导入 ...


								/**
								 * Downloads a CAPTCHA image over HTTP(S), applies a basic grayscale + threshold
								 * preprocessing step, and runs Tess4J OCR on the result.
								 *
								 * <p>All methods are static; the class holds no mutable state.
								 */
								public class CaptchaOCR {

								    /**
								     * Default location of the Tesseract {@code tessdata} directory.
								     * Windows example: {@code "C:\\Program Files\\Tesseract-OCR\\tessdata"}.
								     * Adjust to match the local installation, or override at launch time with
								     * the {@value #TESSDATA_PATH_PROPERTY} system property.
								     */
								    private static final String TESSDATA_PATH = "F:\\tool\\Tesseract-OCR\\tessdata";

								    /**
								     * Name of the system property that overrides {@link #TESSDATA_PATH}.
								     * Setting it to an empty string skips {@code setDatapath} entirely so that
								     * Tess4J falls back to its own lookup.
								     */
								    private static final String TESSDATA_PATH_PROPERTY = "captcha.tessdata.path";

								    /** Maximum time to wait for the TCP connection to be established. */
								    private static final int CONNECT_TIMEOUT_MS = 10_000;

								    /** Maximum time to wait for data once connected. */
								    private static final int READ_TIMEOUT_MS = 15_000;

								    /** Gray level (0-255) below which a pixel becomes black; tune per CAPTCHA style. */
								    private static final int BINARIZE_THRESHOLD = 128;

								    /** Matches every character that is not an ASCII letter or digit; compiled once. */
								    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^0-9a-zA-Z]");

								    /** Browser-like User-Agent sent with the image request. */
								    private static final String USER_AGENT =
								            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36";

								    /**
								     * Downloads the CAPTCHA image.
								     *
								     * @param imageUrl full HTTP or HTTPS URL of the image
								     * @return the decoded image, never {@code null}
								     * @throws IOException if the URL is malformed or not HTTP(S), the server does
								     *         not answer with 200 OK, the transfer fails or times out, or the
								     *         payload cannot be decoded as an image
								     */
								    public static BufferedImage downloadImage(String imageUrl) throws IOException {
								        URL url = new URL(imageUrl);

								        // Reject non-HTTP schemes up front; otherwise the cast below would fail
								        // with an unchecked ClassCastException instead of the documented IOException.
								        String protocol = url.getProtocol();
								        if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
								            throw new IOException("Unsupported URL protocol: " + protocol);
								        }

								        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
								        try {
								            conn.setRequestMethod("GET");
								            // Without timeouts a stalled server would block the caller forever.
								            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
								            conn.setReadTimeout(READ_TIMEOUT_MS);
								            conn.setRequestProperty("User-Agent", USER_AGENT);

								            int responseCode = conn.getResponseCode();
								            if (responseCode != HttpURLConnection.HTTP_OK) {
								                throw new IOException("Failed to download image. HTTP error code: " + responseCode);
								            }

								            // Buffer the whole payload first, then decode from memory.
								            byte[] imageBytes;
								            try (InputStream is = conn.getInputStream()) {
								                imageBytes = readFully(is);
								            }

								            // ImageIO.read returns null when no registered reader understands the data.
								            BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageBytes));
								            if (image == null) {
								                throw new IOException("Failed to read image from stream. Check image format.");
								            }
								            return image;
								        } finally {
								            conn.disconnect();
								        }
								    }

								    /** Reads the stream to exhaustion and returns its bytes; does not close the stream. */
								    private static byte[] readFully(InputStream in) throws IOException {
								        ByteArrayOutputStream out = new ByteArrayOutputStream();
								        byte[] buffer = new byte[4096];
								        int bytesRead;
								        while ((bytesRead = in.read(buffer)) != -1) {
								            out.write(buffer, 0, bytesRead);
								        }
								        return out.toByteArray();
								    }

								    /**
								     * Preprocesses the CAPTCHA image (basic example: grayscale, then binarize).
								     * This step usually decides OCR quality and should be tuned to the CAPTCHA
								     * style. Possible refinements: removing interference lines and noise,
								     * splitting touching characters, deskewing, brightness/contrast adjustment.
								     *
								     * <p>As a debugging aid the result is also written, best-effort, to
								     * {@code preprocessed_captcha.png} in the working directory.
								     *
								     * @param originalImage the source image; must not be {@code null}
								     * @return a new {@code TYPE_BYTE_BINARY} image of the same size in which pixels
								     *         darker than the threshold are black and all others are white
								     * @throws NullPointerException if {@code originalImage} is {@code null}
								     */
								    public static BufferedImage preprocessImage(BufferedImage originalImage) {
								        if (originalImage == null) {
								            throw new NullPointerException("originalImage must not be null");
								        }
								        int width = originalImage.getWidth();
								        int height = originalImage.getHeight();

								        // Drawing onto a TYPE_BYTE_GRAY image performs the color-to-gray conversion.
								        BufferedImage grayImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY);
								        Graphics2D graphics = grayImage.createGraphics();
								        try {
								            graphics.drawImage(originalImage, 0, 0, null);
								        } finally {
								            // Graphics contexts hold native resources and must be released explicitly.
								            graphics.dispose();
								        }

								        BufferedImage binaryImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_BINARY);
								        // Fetch the rasters once instead of once per pixel.
								        WritableRaster grayRaster = grayImage.getRaster();
								        WritableRaster binaryRaster = binaryImage.getRaster();
								        for (int y = 0; y < height; y++) {
								            for (int x = 0; x < width; x++) {
								                int gray = grayRaster.getSample(x, y, 0);
								                // Sample 0 is black and 1 is white in the default 1-bit palette.
								                binaryRaster.setSample(x, y, 0, gray < BINARIZE_THRESHOLD ? 0 : 1);
								            }
								        }

								        saveDebugImage(binaryImage);
								        return binaryImage;
								    }

								    /** Best-effort dump of the preprocessed image for visual inspection; failures are only reported. */
								    private static void saveDebugImage(BufferedImage image) {
								        try {
								            File outputfile = new File("preprocessed_captcha.png");
								            ImageIO.write(image, "png", outputfile);
								            System.out.println("Preprocessed image saved to " + outputfile.getAbsolutePath());
								        } catch (IOException e) {
								            // Debug output only: preprocessing itself succeeded, so do not propagate.
								            System.err.println("Could not save preprocessed debug image: " + e.getMessage());
								        }
								    }

								    /**
								     * Recognizes the text in an image with Tess4J.
								     *
								     * @param image the image to recognize (ideally already preprocessed)
								     * @return the recognized text with every character other than ASCII letters
								     *         and digits removed (may be empty), or {@code null} if {@code image}
								     *         is {@code null} or OCR failed
								     */
								    public static String recognizeCaptcha(BufferedImage image) {
								        if (image == null) {
								            System.err.println("Error during OCR: image is null");
								            return null;
								        }

								        Tesseract tesseract = new Tesseract();

								        // Use the system property when present, otherwise the compiled-in default.
								        String dataPath = System.getProperty(TESSDATA_PATH_PROPERTY, TESSDATA_PATH);
								        if (dataPath != null && !dataPath.isEmpty()) {
								            tesseract.setDatapath(dataPath);
								        } else {
								            System.out.println("TESSDATA_PATH not set. Tess4J will try to find tessdata automatically.");
								        }

								        // English language data; for digit-only CAPTCHAs consider restricting
								        // recognition with the "tessedit_char_whitelist" Tesseract variable.
								        tesseract.setLanguage("eng");

								        try {
								            String raw = tesseract.doOCR(image);
								            if (raw == null) {
								                return null;
								            }
								            // Strip whitespace, line breaks and any other non-alphanumeric noise;
								            // adjust the pattern if the CAPTCHA alphabet differs.
								            String result = NON_ALPHANUMERIC.matcher(raw).replaceAll("");
								            System.out.println("OCR Result: " + result);
								            return result;
								        } catch (TesseractException e) {
								            System.err.println("Error during OCR: " + e.getMessage());
								            return null; // recognition failed
								        }
								    }

								    /**
								     * Example of wiring the three steps together inside a crawler flow.
								     *
								     * @param args optional; {@code args[0]} is the CAPTCHA image URL. When absent,
								     *        a placeholder URL is used, which fails with a reported error.
								     */
								    public static void main(String[] args) {
								        // URL of the CAPTCHA image, normally parsed out of the page being crawled.
								        String captchaImageUrl = (args != null && args.length > 0)
								                ? args[0]
								                : "YOUR_CAPTCHA_IMAGE_URL";

								        try {
								            // 1. Download the image.
								            BufferedImage originalCaptchaImage = downloadImage(captchaImageUrl);
								            System.out.println("Image downloaded.");

								            // 2. Preprocess the image.
								            BufferedImage preprocessedImage = preprocessImage(originalCaptchaImage);
								            System.out.println("Image preprocessed.");

								            // 3. Recognize the CAPTCHA.
								            String captchaCode = recognizeCaptcha(preprocessedImage);

								            if (captchaCode != null && !captchaCode.isEmpty()) {
								                System.out.println("Recognized CAPTCHA: " + captchaCode);
								                // 4. Put captchaCode into the form's POST data (URL-encoded, under the
								                //    CAPTCHA input field's name) and submit the request.
								            } else {
								                System.out.println("Failed to recognize CAPTCHA.");
								                // 5. Handle the failure, e.g. retry with a fresh image or log it.
								            }
								        } catch (IOException e) {
								            System.err.println("Error downloading or processing image: " + e.getMessage());
								        }
								    }
								}