暗网采集与部署

4 days ago · 9459ff0139
4 changed files with 1211 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,29 @@
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+.venv/
+venv/
+env/
+
+# 日志与运行产物
+logs/
+*.log
+html*.html
+
+# 本地测试/临时文件（体积大或含采集结果）
+test.txt
+test2.txt
+tem.txt
+
+# 含敏感信息的旧笔记（请用 .md 版）
+暗网服务部署笔记.txt
+
+# IDE
+.idea/
+.vscode/
+*.swp
+
+# 系统文件
+.DS_Store
+Thumbs.db
--- a/darkweb_api_3.py
+++ b/darkweb_api_3.py
@ -0,0 +1,606 @@
+# -*- coding: utf-8 -*-
+# 1. 【核心】必须在最开头打猴子补丁，支持异步并发    更新：增加了动态请求，将页面拉到最底部的操作
+from gevent import monkey
+monkey.patch_all()
+import os
+import logging
+import time
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlparse
+from logging.handlers import RotatingFileHandler
+from flask import Flask, request, jsonify
+import requests
+from playwright.async_api import async_playwright
+
+app = Flask(__name__)
+
+# === Configuration ===
+# 1. Proxy for static requests (Privoxy, HTTP)
+STATIC_PROXY_IP = "127.0.0.1"
+STATIC_PROXY_PORT = "19095"
+# Proxy mapping handed to requests: http and https traffic both use the same HTTP proxy.
+STATIC_PROXIES = {
+    "http": f"http://{STATIC_PROXY_IP}:{STATIC_PROXY_PORT}",
+    "https": f"http://{STATIC_PROXY_IP}:{STATIC_PROXY_PORT}"
+}
+
+# 2. Proxy for dynamic requests (Tor, SOCKS5)
+# Playwright connects to the Tor port directly (noted by the original author as more efficient).
+DYNAMIC_PROXY_SERVER = "socks5://127.0.0.1:9050"
+# Dedicated thread pool for dynamic crawls; the async Playwright code runs inside these workers.
+DYNAMIC_MAX_WORKERS = 6
+DYNAMIC_EXECUTOR = ThreadPoolExecutor(max_workers=DYNAMIC_MAX_WORKERS, thread_name_prefix="pw-dyn")
+
+# Fallback User-Agent for requests whose merged headers do not carry one.
+DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
+
+# Per-site crawl strategies, keyed by hostname (looked up with the lower-cased
+# hostname of the target URL).
+#   force_dynamic - when truthy, /crawl switches the request to dynamic mode
+#   headers       - default request headers; caller-supplied headers override them
+#   static        - options read by fetch_static_content
+#   dynamic       - options read by _fetch_dynamic_content_async
+SITE_PROFILES = {
+    "pitchprash4aqilfr7sbmuwve3pnkpylqwxjbj2q5o4szcfeea6d27yd.onion": {
+        "force_dynamic": True,
+        "headers": {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": DEFAULT_USER_AGENT,
+        },
+        "static": {
+            "allow_redirects": True,
+            "timeout_s": 90,
+            "retry_times": 3,
+            "retry_interval_ms": 1200
+        },
+        "dynamic": {
+            "initial_wait_ms": 8000,
+            "post_scroll_wait_ms": 3000,
+            "wait_selector": "main, article, .post, .search-results, #content, body",
+            "wait_selector_timeout_ms": 30000,
+            "goto_wait_until": "domcontentloaded",
+            "retry_times": 3,
+            "retry_interval_ms": 1500,
+            "min_content_len": 200
+        }
+    }
+}
+
+# Rule-based profiles: cover a whole class of pages at once, e.g. the search
+# page of any onion site.
+# Each rule has a "match" section (optional hostname_suffix / hostname_prefix,
+# plus path_contains: a list of substrings, any of which must occur in the URL
+# path) and a "profile" section. Rules are evaluated in list order and every
+# matching rule's profile is merged on top of the site profile, so later
+# matches win on conflicting keys.
+RULE_PROFILES = [
+    {
+        "name": "xmh57_prefix_omega",
+        "match": {
+            "hostname_prefix": "xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd",
+            "path_contains": ["/cgi-bin/omega/omega"]
+        },
+        "profile": {
+            "headers": {
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "Upgrade-Insecure-Requests": "1",
+                "User-Agent": DEFAULT_USER_AGENT
+            },
+            "static": {
+                "allow_redirects": True,
+                "timeout_s": 120,
+                "retry_times": 4,
+                "retry_interval_ms": 1500,
+                "retry_status_codes": [502, 503, 504],
+                "min_content_len": 120
+            },
+            "dynamic": {
+                "initial_wait_ms": 8000,
+                "post_scroll_wait_ms": 2500,
+                "wait_selector": "body, main, #content, .results, .record",
+                "wait_selector_timeout_ms": 25000,
+                "goto_wait_until": "domcontentloaded",
+                "retry_times": 3,
+                "retry_interval_ms": 1500,
+                "retry_status_codes": [502, 503, 504],
+                "min_content_len": 120
+            }
+        }
+    },
+    {
+        "name": "generic_onion_search",
+        "match": {
+            "hostname_suffix": ".onion",
+            "path_contains": ["/search"]
+        },
+        "profile": {
+            "headers": {
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "Accept-Language": "en-US,en;q=0.5",
+                "Upgrade-Insecure-Requests": "1",
+                "User-Agent": DEFAULT_USER_AGENT
+            },
+            "static": {
+                "allow_redirects": True,
+                "timeout_s": 90,
+                "retry_times": 2,
+                "retry_interval_ms": 1000
+            },
+            "dynamic": {
+                "initial_wait_ms": 8000,
+                "post_scroll_wait_ms": 3000,
+                "wait_selector": "main, article, .post, .search-results, #content, body",
+                "wait_selector_timeout_ms": 30000,
+                "goto_wait_until": "domcontentloaded",
+                "retry_times": 2,
+                "retry_interval_ms": 1200,
+                "min_content_len": 120
+            }
+        }
+    }
+]
+
+
+# === Logging setup ===
+def setup_logging():
+    """
+    Create (or return) the 'darkweb_spider' logger with rotating file handlers.
+
+    INFO-and-above records go to logs/info.log and ERROR-and-above records go
+    to logs/error.log (10 MiB per file, 10 backups, UTF-8). Paths are relative
+    to the current working directory.
+
+    Returns:
+        logging.Logger: the configured logger.
+    """
+    # exist_ok avoids the check-then-create race when several processes start at once.
+    os.makedirs('logs', exist_ok=True)
+
+    logger = logging.getLogger('darkweb_spider')
+    logger.setLevel(logging.INFO)
+
+    # Already configured (function called twice): attaching the handlers again
+    # would write every record twice.
+    if logger.handlers:
+        return logger
+
+    formatter = logging.Formatter(
+        '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
+    )
+
+    info_handler = RotatingFileHandler('logs/info.log', maxBytes=10 * 1024 * 1024, backupCount=10, encoding='utf-8')
+    info_handler.setLevel(logging.INFO)
+    info_handler.setFormatter(formatter)
+
+    error_handler = RotatingFileHandler('logs/error.log', maxBytes=10 * 1024 * 1024, backupCount=10, encoding='utf-8')
+    error_handler.setLevel(logging.ERROR)
+    error_handler.setFormatter(formatter)
+
+    logger.addHandler(info_handler)
+    logger.addHandler(error_handler)
+
+    return logger
+
+
+# 初始化日志对象
+logger = setup_logging()
+
+
+def _get_profile_by_url(target_url):
+    """
+    Resolve the crawl profile for a URL.
+
+    Starts from the SITE_PROFILES entry for the URL's hostname (empty when
+    there is none) and merges in the profile of every RULE_PROFILES rule whose
+    match section accepts the hostname and path.
+    """
+    hostname, path = "", "/"
+    try:
+        parts = urlparse(target_url)
+        hostname = (parts.hostname or "").lower()
+        path = parts.path or "/"
+    except Exception:
+        # Unparseable URL: fall back to an empty host and the root path.
+        hostname, path = "", "/"
+
+    profile = SITE_PROFILES.get(hostname, {}).copy()
+
+    # Matching rules are layered on top of the current profile, in list order.
+    for rule in RULE_PROFILES:
+        matcher = rule.get("match", {})
+        suffix = (matcher.get("hostname_suffix") or "").lower()
+        prefix = (matcher.get("hostname_prefix") or "").lower()
+        tokens = matcher.get("path_contains", [])
+
+        if suffix and not hostname.endswith(suffix):
+            continue
+        if prefix and not hostname.startswith(prefix):
+            continue
+        if tokens and not any(tok in path for tok in tokens):
+            continue
+
+        profile = _merge_profile(profile, rule.get("profile", {}))
+
+    return profile
+
+
+def _merge_profile(base_profile, extra_profile):
+    """
+    Recursively merge two profile dicts into a new dict; extra_profile wins.
+
+    Nested dicts present on both sides are merged key by key; any other value
+    from extra_profile replaces the one in base_profile. None is treated as
+    an empty profile.
+    """
+    result = dict(base_profile or {})
+    for name, incoming in (extra_profile or {}).items():
+        current = result.get(name)
+        both_dicts = isinstance(incoming, dict) and isinstance(current, dict)
+        result[name] = _merge_profile(current, incoming) if both_dicts else incoming
+    return result
+
+
+def _merge_headers(profile_headers, custom_headers):
+    """
+    Merge the site's default headers with the caller's headers; the caller wins.
+
+    Header names are compared case-insensitively, so a caller-supplied
+    "user-agent" replaces the profile's "User-Agent" instead of being sent
+    alongside it. DEFAULT_USER_AGENT is added when no User-Agent is present
+    under any spelling.
+
+    Args:
+        profile_headers: dict of profile headers, or None.
+        custom_headers: dict of caller headers, or None.
+
+    Returns:
+        dict: a new merged header dict.
+    """
+    merged = {}
+    spelling = {}  # lower-cased header name -> key currently stored in merged
+    for source in (profile_headers, custom_headers):
+        for name, value in (source or {}).items():
+            folded = str(name).lower()
+            previous = spelling.get(folded)
+            if previous is not None and previous != name:
+                # Same header under a different spelling: drop the older one.
+                del merged[previous]
+            merged[name] = value
+            spelling[folded] = name
+    if "user-agent" not in spelling:
+        merged["User-Agent"] = DEFAULT_USER_AGENT
+    return merged
+
+
+def _normalize_cookies(cookie_input):
+    """
+    Convert cookies given as a dict or as a "k=v; k2=v2" string into a dict.
+
+    Dict keys and values are stringified. In a cookie string, segments without
+    "=" are skipped and only the first "=" splits name from value. Any other
+    input type, or an empty input, yields {}.
+    """
+    if not cookie_input:
+        return {}
+
+    if isinstance(cookie_input, dict):
+        return dict((str(name), str(value)) for name, value in cookie_input.items())
+
+    if not isinstance(cookie_input, str):
+        return {}
+
+    jar = {}
+    for chunk in cookie_input.split(";"):
+        chunk = chunk.strip()
+        if not chunk or "=" not in chunk:
+            continue
+        name, _, value = chunk.partition("=")
+        jar[name.strip()] = value.strip()
+    return jar
+
+
+def _merge_cookies(base_cookies, more_cookies):
+    """
+    Combine two cookie dicts into a new dict; entries in more_cookies win.
+    """
+    combined = dict(base_cookies) if base_cookies else {}
+    combined.update(more_cookies or {})
+    return combined
+
+
+def _extract_cookie_from_headers(headers):
+    """
+    Split the Cookie header out of a header dict.
+
+    The header name is matched case-insensitively ("Cookie", "cookie",
+    "COOKIE", ...), so no spelling is left behind in the returned headers.
+    If several spellings are present their cookies are combined, later ones
+    winning on a name clash.
+
+    Args:
+        headers: dict of request headers, or None.
+
+    Returns:
+        tuple: (clean_headers, cookie_map), where clean_headers is a copy
+        without any Cookie header and cookie_map is the parsed cookie dict.
+    """
+    if not headers:
+        return {}, {}
+
+    clean_headers = {}
+    cookie_map = {}
+    for name, value in dict(headers).items():
+        if str(name).lower() == "cookie":
+            cookie_map.update(_normalize_cookies(value))
+        else:
+            clean_headers[name] = value
+    return clean_headers, cookie_map
+
+
+def _sanitize_dynamic_headers(headers):
+    """
+    Return a copy of headers without the connection-level ones.
+
+    Host, Content-Length, Connection and Cookie (matched case-insensitively)
+    are removed so they are not passed through to Playwright, where they could
+    conflict with what the browser sets itself.
+    """
+    dropped = ("host", "content-length", "connection", "cookie")
+    passthrough = {}
+    for name, value in (headers or {}).items():
+        if name.lower() in dropped:
+            continue
+        passthrough[name] = value
+    return passthrough
+
+
+def _build_dynamic_target_urls(target_url):
+    """
+    Build the list of candidate URLs for a dynamic crawl.
+
+    The original URL always comes first. For an https URL on a .onion host an
+    http fallback is appended. The scheme is matched case-insensitively, so
+    "HTTPS://..." also gets a real http fallback rather than a duplicate of
+    the original URL.
+
+    Args:
+        target_url: the URL requested by the caller.
+
+    Returns:
+        list: candidate URLs in the order they should be tried.
+    """
+    candidates = [target_url]
+    try:
+        parsed = urlparse(target_url)
+        hostname = (parsed.hostname or "").lower()
+        if parsed.scheme == "https" and hostname.endswith(".onion"):
+            # urlparse lower-cases the scheme, so locate it in the raw string
+            # without assuming its original casing.
+            marker = "https://"
+            start = target_url.lower().find(marker)
+            if start != -1:
+                fallback = target_url[:start] + "http://" + target_url[start + len(marker):]
+                if fallback not in candidates:
+                    candidates.append(fallback)
+    except Exception:
+        # Best effort: an unparseable URL is simply tried as given.
+        pass
+    return candidates
+
+
+def _should_retry_status(status_code, retry_status_codes):
+    """
+    Tell whether an HTTP status code is one of the retryable codes.
+
+    status_code is coerced with int(); a value that cannot be converted is
+    never retryable. retry_status_codes may be None (nothing is retryable).
+    """
+    try:
+        numeric = int(status_code)
+    except Exception:
+        return False
+    retryable = set(retry_status_codes or [])
+    return numeric in retryable
+
+
+# === Core method 1: dynamic crawl (Playwright) ===
+async def _fetch_dynamic_content_async(target_url, client_ip, runtime_cfg):
+    """
+    Fetch a rendered page with Playwright + headless Firefox through the Tor
+    SOCKS5 proxy, including auto-scrolling.
+
+    Args:
+        target_url: URL to open; for https .onion URLs an http fallback is
+            also tried on each attempt.
+        client_ip: caller address, used for logging only.
+        runtime_cfg: dict with "headers", "cookies" and "dynamic" entries.
+            The "dynamic" options read here are initial_wait_ms,
+            post_scroll_wait_ms, wait_selector, wait_selector_timeout_ms,
+            goto_timeout_ms, goto_wait_until, retry_times (total number of
+            attempts, at least 1), retry_interval_ms, min_content_len and
+            retry_status_codes.
+
+    Returns:
+        dict: {"status_code": <int>, "content": <rendered HTML>}.
+
+    Raises:
+        Exception: the error of the last failed attempt once every attempt
+            and candidate URL has failed.
+    """
+    logger.info(f"[动态] 启动浏览器内核 | 来源IP: {client_ip} | 目标: {target_url}")
+
+    start_time = time.time()
+    req_headers = runtime_cfg.get("headers", {})
+    req_cookies = runtime_cfg.get("cookies", {})
+    dynamic_cfg = runtime_cfg.get("dynamic", {})
+    initial_wait_ms = int(dynamic_cfg.get("initial_wait_ms", 5000))
+    post_scroll_wait_ms = int(dynamic_cfg.get("post_scroll_wait_ms", 2000))
+    wait_selector = dynamic_cfg.get("wait_selector")
+    wait_selector_timeout_ms = int(dynamic_cfg.get("wait_selector_timeout_ms", 20000))
+    goto_timeout_ms = int(dynamic_cfg.get("goto_timeout_ms", 90000))
+    goto_wait_until = dynamic_cfg.get("goto_wait_until", "load")
+    retry_times = int(dynamic_cfg.get("retry_times", 1))
+    retry_interval_ms = int(dynamic_cfg.get("retry_interval_ms", 1000))
+    min_content_len = int(dynamic_cfg.get("min_content_len", 1))
+    retry_status_codes = dynamic_cfg.get("retry_status_codes", [502, 503, 504])
+    dynamic_headers = _sanitize_dynamic_headers(req_headers)
+
+    async with async_playwright() as p:
+        browser = None
+        try:
+            browser = await p.firefox.launch(
+                headless=True,
+                proxy={"server": DYNAMIC_PROXY_SERVER}
+            )
+
+            context = await browser.new_context(
+                user_agent=req_headers.get("User-Agent", DEFAULT_USER_AGENT),
+                extra_http_headers=dynamic_headers,
+                ignore_https_errors=True
+            )
+            if req_cookies:
+                # Cookies are scoped to the target host and the root path.
+                await context.add_cookies([
+                    {"name": k, "value": v, "domain": urlparse(target_url).hostname, "path": "/"}
+                    for k, v in req_cookies.items()
+                ])
+            page = await context.new_page()
+
+            last_error = None
+            candidate_urls = _build_dynamic_target_urls(target_url)
+            total_attempts = max(1, retry_times)
+
+            for attempt in range(1, total_attempts + 1):
+                for candidate_url in candidate_urls:
+                    logger.info(
+                        f"[动态] 第{attempt}/{total_attempts}次访问 | Timeout {goto_timeout_ms / 1000:.0f}s | URL: {candidate_url}"
+                    )
+                    try:
+                        response = await page.goto(candidate_url, timeout=goto_timeout_ms, wait_until=goto_wait_until)
+                        if not response:
+                            raise Exception("Response is None")
+
+                        # Reject retryable statuses right away instead of first
+                        # spending the wait and scroll time on an error page.
+                        status_code = response.status
+                        if _should_retry_status(status_code, retry_status_codes):
+                            raise Exception(f"命中可重试状态码 | status_code={status_code}")
+
+                        if wait_selector:
+                            try:
+                                await page.wait_for_selector(wait_selector, timeout=wait_selector_timeout_ms)
+                            except Exception:
+                                # A missing selector is not fatal; carry on with whatever rendered.
+                                logger.info(f"[动态] 等待选择器超时，继续执行 | selector={wait_selector}")
+
+                        await page.wait_for_timeout(initial_wait_ms)
+                        await _auto_scroll(page, logger, candidate_url)
+                        await page.wait_for_timeout(post_scroll_wait_ms)
+
+                        html_content = await page.content() or ""
+                        if len(html_content) < min_content_len:
+                            raise Exception(f"内容过短，疑似未获取到正文 | length={len(html_content)}")
+
+                        cost_time = time.time() - start_time
+                        logger.info(
+                            f"[动态] 采集成功 | 耗时: {cost_time:.2f}s | 状态码: {status_code} | 长度: {len(html_content)}"
+                        )
+                        return {
+                            "status_code": status_code,
+                            "content": html_content,
+                        }
+                    except Exception as nav_err:
+                        # Remember the failure and move on to the next candidate URL / attempt.
+                        last_error = nav_err
+                        logger.warning(f"[动态] 当前尝试失败 | URL: {candidate_url} | Error: {str(nav_err)}")
+
+                if attempt < total_attempts:
+                    await page.wait_for_timeout(retry_interval_ms)
+
+            if last_error:
+                raise last_error
+            raise Exception("动态采集失败，未获得有效响应")
+
+        finally:
+            # Always release the browser, whether the crawl succeeded or failed.
+            if browser:
+                await browser.close()
+
+
+def fetch_dynamic_content(target_url, client_ip, runtime_cfg):
+    """
+    Run the async Playwright crawl on the dedicated executor and wait for it.
+
+    Each call submits one job to DYNAMIC_EXECUTOR; the job starts its own
+    event loop with asyncio.run. The caller blocks until the job finishes and
+    receives its result, or the exception it raised.
+
+    NOTE(review): the module calls gevent's monkey.patch_all() at import time,
+    which normally patches threading as well. Confirm that concurrent
+    asyncio.run calls from these workers behave as intended under gevent.
+    """
+    def _run_in_worker():
+        return asyncio.run(_fetch_dynamic_content_async(target_url, client_ip, runtime_cfg))
+
+    return DYNAMIC_EXECUTOR.submit(_run_in_worker).result()
+
+
+# === Core method 2: static crawl (Requests) ===
+def fetch_static_content(target_url, client_ip, runtime_cfg):
+    """
+    Fetch a page with requests through the Privoxy HTTP proxy.
+
+    Reads "headers", "cookies" and the "static" options (timeout_s,
+    allow_redirects, retry_times, retry_interval_ms, min_content_len,
+    retry_status_codes) from runtime_cfg. A retryable status code, a body
+    shorter than min_content_len, or any exception counts as a failed attempt.
+
+    Returns {"status_code": ..., "content": ...} on success and re-raises the
+    last error once all attempts are used up.
+    """
+    logger.info(f"[静态] 发起请求 | 来源IP: {client_ip} | 目标: {target_url}")
+
+    headers = runtime_cfg.get("headers", {})
+    cookies = runtime_cfg.get("cookies", {})
+    opts = runtime_cfg.get("static", {})
+    timeout = int(opts.get("timeout_s", 60))
+    allow_redirects = bool(opts.get("allow_redirects", True))
+    retry_times = int(opts.get("retry_times", 1))
+    pause_s = int(opts.get("retry_interval_ms", 1000)) / 1000.0
+    min_content_len = int(opts.get("min_content_len", 1))
+    retry_status_codes = opts.get("retry_status_codes", [502, 503, 504])
+
+    total_attempts = max(1, retry_times)
+    last_error = None
+    attempt = 0
+    while attempt < total_attempts:
+        attempt += 1
+        try:
+            # Routed through the Privoxy proxy.
+            resp = requests.get(
+                target_url,
+                proxies=STATIC_PROXIES,
+                headers=headers,
+                cookies=cookies,
+                timeout=timeout,
+                allow_redirects=allow_redirects
+            )
+            content = resp.text or ""
+            if _should_retry_status(resp.status_code, retry_status_codes):
+                raise Exception(f"命中可重试状态码 | status_code={resp.status_code}")
+            if len(content) < min_content_len:
+                raise Exception(f"内容过短，疑似未获取到正文 | length={len(content)}")
+
+            logger.info(f"[静态] 采集成功 | 状态码: {resp.status_code} | URL: {target_url} | 长度: {len(content)}")
+            return {"status_code": resp.status_code, "content": content}
+        except Exception as req_err:
+            last_error = req_err
+            logger.warning(f"[静态] 第{attempt}/{total_attempts}次请求失败 | URL: {target_url} | Error: {str(req_err)}")
+            if attempt < total_attempts:
+                time.sleep(pause_s)
+
+    if last_error:
+        raise last_error
+    raise Exception("静态采集失败，未获得有效响应")
+
+
+# === Helper: generic auto-scroll ===
+async def _auto_scroll(page, logger, target_url):
+    """
+    Scroll the page to the bottom repeatedly to trigger lazy loading.
+
+    After each scroll the function waits, then compares the document height
+    with the previous one. It stops as soon as the height no longer changes,
+    or after a fixed number of rounds so an endlessly growing page cannot
+    stall the crawl.
+    """
+    logger.info(f"[动态] 开始自动滚动页面... | URL: {target_url}")
+
+    max_rounds = 10    # upper bound on scroll rounds
+    settle_ms = 3000   # wait after each scroll so new content can load
+    read_height = "document.body.scrollHeight"
+
+    last_height = await page.evaluate(read_height)
+
+    for round_no in range(max_rounds):
+        # Jump to the current bottom, give the page time to load, then re-measure.
+        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+        await page.wait_for_timeout(settle_ms)
+        current_height = await page.evaluate(read_height)
+
+        if current_height == last_height:
+            logger.info(f"[动态] 滚动结束: 高度不再变化 (次数: {round_no})")
+            return
+
+        logger.info(f"[动态] 滚动触发加载: 高度从 {last_height} 变为 {current_height}")
+        last_height = current_height
+
+    logger.info(f"[动态] 滚动结束: 达到最大次数限制 ({max_rounds})")
+
+
+def _override_or_default(cfg, key, caller_value, default):
+    """Store the caller's value under key; without one, keep the profile value or fall back to default."""
+    if caller_value is not None:
+        cfg[key] = caller_value
+    else:
+        cfg.setdefault(key, default)
+
+
+# === API route entry point ===
+@app.route('/crawl', methods=['POST'])
+def crawl_onion():
+    """
+    Handle POST /crawl: fetch one URL in static or dynamic mode.
+
+    JSON body fields: url (required), is_dynamic, headers, cookies, referer,
+    static_timeout_s, dynamic_timeout_ms, wait_selector,
+    wait_selector_timeout_ms. Timeouts given by the caller take precedence;
+    otherwise the matching site/rule profile value is used, and only then the
+    built-in default (60 s / 90000 ms / 30000 ms).
+
+    Responses (JSON):
+        200 - {"code": 200, "msg": "success", "mode": ..., "data": {...}}
+        400 - body is not a JSON object, or url is missing
+        502 - the static proxy connection failed
+        504 - the static or dynamic request timed out
+        500 - any other error
+    """
+    # Function-scope import keeps this handler self-contained; playwright is
+    # already a dependency of this module.
+    from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+
+    # 1. Parse parameters. silent=True returns None for a missing or invalid
+    # JSON body, so the JSON 400 below is what the client actually receives.
+    request_data = request.get_json(silent=True)
+    client_ip = request.remote_addr
+
+    if not request_data or not isinstance(request_data, dict):
+        logger.warning(f"请求体错误 | 来源IP: {client_ip}")
+        return jsonify({"code": 400, "msg": "请发送 JSON 格式"}), 400
+
+    target_url = request_data.get('url')
+    # Dynamic-mode flag; static is the default.
+    is_dynamic = request_data.get('is_dynamic', False)
+    custom_headers = request_data.get('headers', {})
+    custom_cookies = request_data.get('cookies', {})
+    # None means "not supplied", so the profile value is not overwritten.
+    static_timeout = request_data.get('static_timeout_s')
+    goto_timeout_ms = request_data.get('dynamic_timeout_ms')
+    wait_selector = request_data.get('wait_selector')
+    wait_selector_timeout_ms = request_data.get('wait_selector_timeout_ms')
+    referer = request_data.get('referer')
+
+    if not target_url:
+        logger.warning(f"参数缺失 | 来源IP: {client_ip}")
+        return jsonify({"code": 400, "msg": "url 不能为空"}), 400
+
+    profile = _get_profile_by_url(target_url)
+    if profile.get("force_dynamic"):
+        is_dynamic = True
+        logger.info(f"[路由] 命中站点强制策略，自动切换动态模式 | URL: {target_url}")
+
+    # Mode label used in logs and in the response.
+    mode_tag = "动态" if is_dynamic else "静态"
+    merged_headers = _merge_headers(profile.get("headers", {}), custom_headers)
+    # A Cookie header is moved into the cookie map; explicit cookies win over it.
+    merged_headers, header_cookie_map = _extract_cookie_from_headers(merged_headers)
+    if referer:
+        merged_headers["Referer"] = referer
+    merged_cookies = _merge_cookies(header_cookie_map, _normalize_cookies(custom_cookies))
+
+    static_profile_cfg = profile.get("static", {}).copy()
+    _override_or_default(static_profile_cfg, "timeout_s", static_timeout, 60)
+    dynamic_profile_cfg = profile.get("dynamic", {}).copy()
+    if wait_selector:
+        dynamic_profile_cfg["wait_selector"] = wait_selector
+    _override_or_default(dynamic_profile_cfg, "wait_selector_timeout_ms", wait_selector_timeout_ms, 30000)
+    _override_or_default(dynamic_profile_cfg, "goto_timeout_ms", goto_timeout_ms, 90000)
+
+    runtime_cfg = {
+        "headers": merged_headers,
+        "cookies": merged_cookies,
+        "static": static_profile_cfg,
+        "dynamic": dynamic_profile_cfg
+    }
+
+    try:
+        # === Dispatch ===
+        if is_dynamic:
+            # Playwright path
+            result_data = fetch_dynamic_content(target_url, client_ip, runtime_cfg)
+        else:
+            # Requests path
+            result_data = fetch_static_content(target_url, client_ip, runtime_cfg)
+
+        # Unified response format
+        return jsonify({
+            "code": 200,
+            "msg": "success",
+            "mode": mode_tag,  # tells the caller which mode was actually used
+            "data": {
+                "status_code": result_data.get('status_code'),
+                "url": target_url,
+                "content": result_data.get('content')
+            }
+        })
+
+    except (requests.exceptions.Timeout, PlaywrightTimeoutError):
+        # Static and dynamic timeouts are both reported as 504.
+        logger.error(f"[{mode_tag}] 请求超时 | URL: {target_url}")
+        return jsonify({"code": 504, "msg": f"{mode_tag}请求超时"}), 504
+
+    except requests.exceptions.ConnectionError:
+        logger.error(f"[{mode_tag}] 代理连接失败 | URL: {target_url}")
+        return jsonify({"code": 502, "msg": "代理连接失败，请检查服务状态"}), 502
+
+    except Exception as e:
+        # Remaining Playwright errors and anything unexpected.
+        logger.error(f"[{mode_tag}] 系统异常 | URL: {target_url} | Error: {str(e)}", exc_info=True)
+        return jsonify({"code": 500, "msg": f"系统异常: {str(e)}"}), 500
+
+
+if __name__ == '__main__':
+    # Flask's built-in server, listening on all interfaces, port 8000.
+    # Use Gunicorn to start the service in production.
+    app.run(host='0.0.0.0', port=8000)
--- a/暗网服务部署笔记.md
+++ b/暗网服务部署笔记.md
@ -0,0 +1,521 @@
+# 暗网采集服务部署笔记
+
+> CentOS + Tor + Privoxy + Flask（python3 直接启动）
+
+## 环境信息
+
+| 项目 | 值 |
+|------|-----|
+| 云平台 | 华为云 |
+| 部署主机 | `ocai-node-05` |
+| 公网 IP | `124.243.188.109` |  
+| 内网 IP | `192.168.0.131`（同 VPC 内访问） |
+| 代码目录 | `/opt/crawl/darkweb_api` |
+| 入口文件 | `darkweb_api_3.py` |
+
+| 服务 | 端口 | 说明 |
+|------|------|------|
+| Tor (SOCKS5) | `9050` | 暗网隧道；动态采集直连此端口 |
+| Privoxy (HTTP) | `19095` | 静态采集走此代理 |
+| Flask API | `8000` | 采集接口 `/crawl` |
+
+**端口对应关系（三处必须一致）：**
+
+| 组件 | 配置文件 | 关键项 | 值 |
+|------|----------|--------|-----|
+| Tor | `/etc/tor/torrc` | `SOCKSPort` | `127.0.0.1:9050` |
+| Privoxy | `/etc/privoxy/config` | `listen-address` | `127.0.0.1:19095` |
+| Privoxy | `/etc/privoxy/config` | `forward-socks5t` | `/ 127.0.0.1:9050 .` |
+| API 代码 | `darkweb_api_3.py` | 静态代理 / 动态代理 | `19095` / `9050` |
+
+## API 接口说明
+
+**作用**：接收一个目标 URL（明网或 `.onion` 暗网），由服务器经 Tor 代理抓取页面内容，以 JSON 返回 HTML 源码。调用方无需在本机配置 Tor / 代理。注意：接口本身不做鉴权，对外开放 `8000` 端口时请通过安全组限制来源 IP。
+
+| 项目 | 说明 |
+|------|------|
+| 地址 | `POST http://<IP>:8000/crawl` |
+| Content-Type | `application/json` |
+| 采集模式 | **静态**（默认）：`requests` + Privoxy；**动态**：Playwright + Tor，适合搜索页 / JS 渲染页 |
+
+**输入（JSON Body）：**
+
+| 参数 | 必填 | 默认值 | 说明 |
+|------|------|--------|------|
+| `url` | 是 | — | 要采集的目标地址 |
+| `is_dynamic` | 否 | `false` | `true` 启用动态采集（Playwright） |
+| `headers` | 否 | `{}` | 自定义请求头 |
+| `cookies` | 否 | `{}` | 自定义 Cookie |
+| `referer` | 否 | — | Referer 头 |
+| `static_timeout_s` | 否 | `60` | 静态采集超时（秒） |
+| `dynamic_timeout_ms` | 否 | `90000` | 动态页面加载超时（毫秒） |
+| `wait_selector` | 否 | — | 动态模式下等待的 CSS 选择器 |
+| `wait_selector_timeout_ms` | 否 | `30000` | 选择器等待超时（毫秒） |
+
+**输出（JSON）：**
+
+成功（HTTP 200）：
+
+```json
+{
+  "code": 200,
+  "msg": "success",
+  "mode": "静态",
+  "data": {
+    "status_code": 200,
+    "url": "http://xxx.onion/",
+    "content": "<html>...页面源码...</html>"
+  }
+}
+```
+
+失败：
+
+| HTTP 状态码 | `code` | `msg` 示例 |
+|-------------|--------|------------|
+| 400 | 400 | `请发送 JSON 格式` / `url 不能为空` |
+| 502 | 502 | `代理连接失败，请检查服务状态` |
+| 504 | 504 | `静态请求超时` / `动态请求超时` |
+| 500 | 500 | `系统异常: ...` |
+
+**Python 调用示例：**
+
+```python
+import requests
+
+resp = requests.post(
+    "http://124.243.188.109:8000/crawl",
+    json={"url": "http://xxx.onion/", "is_dynamic": True},
+    timeout=300,
+)
+data = resp.json()
+html = data["data"]["content"]   # msg == "success" 时取用
+```
+
+---
+
+# 快速上手（最常用）
+
+## 关键配置（首次部署必做）
+
+> Privoxy 默认监听 `8118`，必须改成 `19095` 并转发到 Tor `9050`；改完后还要处理文件权限。
+
+### 1. 配置 Tor（`/etc/tor/torrc`）
+
+```bash
+sudo vim /etc/tor/torrc
+```
+
+确认存在以下行（没有就加上，有重复端口配置以这条为准）：
+
+```
+SOCKSPort 127.0.0.1:9050
+```
+
+```bash
+sudo systemctl restart tor
+sudo netstat -tlnp | grep 9050    # 应看到 tor 监听 9050
+```
+
+### 2. 配置 Privoxy（`/etc/privoxy/config`）
+
+```bash
+sudo vim /etc/privoxy/config
+```
+
+**① 注释掉默认监听端口**（搜索 `listen-address`，把 `8118` 那行注释掉）：
+
+```
+# listen-address  127.0.0.1:8118
+# listen-address  [::1]:8118
+```
+
+**② 新增监听端口**（HTTP 代理，供 API 静态采集使用）：
+
+```
+listen-address  127.0.0.1:19095
+```
+
+**③ 新增转发规则**（把 HTTP 代理流量转给 Tor SOCKS5，末尾 `.` 不能漏）：
+
+```
+forward-socks5t / 127.0.0.1:9050 .
+```
+
+**④ 修复权限**（用 `sudo vim` 改完后，文件属主可能变成 root，导致 Privoxy 启动失败）：
+
+```bash
+sudo chown -R privoxy:privoxy /etc/privoxy/
+```
+
+**⑤ 若 SELinux 拦截 19095 端口**（启动失败或端口未监听时）：
+
+```bash
+sudo setenforce 0
+```
+
+```bash
+sudo systemctl restart privoxy
+sudo netstat -tlnp | grep 19095    # 应看到 privoxy 监听 19095
+```
+
+### 3. 确认 API 代码端口一致
+
+`darkweb_api_3.py` 中应与上面配置对应（一般默认已正确，改过 Privoxy 端口时需同步修改）：
+
+```python
+STATIC_PROXY_PORT = "19095"              # 对应 Privoxy listen-address
+DYNAMIC_PROXY_SERVER = "socks5://127.0.0.1:9050"   # 对应 Tor SOCKSPort
+```
+
+---
+
+## 启动服务
+
+```bash
+# 1. 启动基础代理（一般已装好，重启即可）
+sudo systemctl start tor
+sudo systemctl start privoxy
+
+# 2. 启动 API
+cd /opt/crawl/darkweb_api
+nohup python3 darkweb_api_3.py > ./logs/api_stdout.log 2>&1 &
+```
+
+## 验证命令
+
+```bash
+# 服务与端口
+sudo systemctl status tor privoxy
+ps -ef | grep darkweb_api_3
+sudo netstat -tlnp | grep -E '9050|19095|8000'
+
+# Tor 直连
+curl --socks5-hostname 127.0.0.1:9050 --max-time 60 http://httpbin.org/ip
+
+# Privoxy 转发
+curl -x http://127.0.0.1:19095 --max-time 60 http://httpbin.org/ip
+
+# 暗网连通
+curl -x http://127.0.0.1:19095 -L --max-time 120 \
+  "http://darkzqtmbdeauwq5mzcmgeeuhet42fhfjj4p5wbak3ofx2yqgecoeqyd.onion/"
+
+# API 静态采集
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"url": "http://darkzqtmbdeauwq5mzcmgeeuhet42fhfjj4p5wbak3ofx2yqgecoeqyd.onion/"}' \
+  http://127.0.0.1:8000/crawl
+
+# API 动态采集
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"url": "http://darkzqtmbdeauwq5mzcmgeeuhet42fhfjj4p5wbak3ofx2yqgecoeqyd.onion/search?q=a", "is_dynamic": true}' \
+  http://127.0.0.1:8000/crawl
+```
+
+## 一键体检
+
+```bash
+echo "=== Tor ===" && curl -s --socks5-hostname 127.0.0.1:9050 --max-time 30 http://httpbin.org/ip && \
+echo -e "\n=== Privoxy ===" && curl -s -x http://127.0.0.1:19095 --max-time 30 http://httpbin.org/ip && \
+echo -e "\n=== API ===" && curl -s -X POST -H "Content-Type: application/json" \
+  -d '{"url":"http://httpbin.org/ip"}' http://127.0.0.1:8000/crawl | head -c 200
+```
+
+## 启停与排障
+
+```bash
+# 采集超时？先重启 Tor（最常见修复方式）
+sudo systemctl restart tor
+sudo systemctl restart privoxy
+
+# 查看 Tor 是否正常建电路
+journalctl -u tor -n 20 --no-pager | grep -iE 'bootstrap|circuit'
+
+# API 进程管理
+ps -ef | grep darkweb_api_3
+kill <PID>
+cd /opt/crawl/darkweb_api && nohup python3 darkweb_api_3.py > ./logs/api_stdout.log 2>&1 &
+```
+
+## 外部调用
+
+```bash
+# 其他机器通过公网 IP 调 API（推荐）
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"url": "http://<onion-url>/"}' \
+  http://124.243.188.109:8000/crawl
+
+# 同 VPC 内网
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"url": "http://<onion-url>/"}' \
+  http://192.168.0.131:8000/crawl
+```
+
+---
+
+# 详细部署说明
+
+## 一、核心架构
+
+```
+客户端 (Python / Curl / 其他服务器)
+   ↓  HTTP 请求
+Flask API (:8000)  ──静态采集──→  Privoxy (:19095)
+   │                                      ↓ SOCKS5
+   └──动态采集 (Playwright) ──────────→  Tor (:9050)
+                                              ↓ 加密隧道
+                                         暗网 (.onion)
+```
+
+- **Tor**：建立暗网加密隧道，提供 SOCKS5 接口（`9050`）。
+- **Privoxy**：将 HTTP 代理请求翻译为 SOCKS5 后转发给 Tor。
+- **Flask API**：对外提供 `/crawl` 接口；静态走 Privoxy，动态由 Playwright 直连 Tor。当前以 `python3` 直接启动。
+
+---
+
+## 二、安装与配置 Tor
+
+Tor 不在 CentOS 默认源中，需先添加 EPEL：
+
+```bash
+sudo yum install epel-release -y
+sudo yum install tor -y
+```
+
+编辑 `/etc/tor/torrc`，确认 SOCKS 监听端口：
+
+```
+SOCKSPort 127.0.0.1:9050
+```
+
+```bash
+sudo systemctl start tor
+sudo systemctl enable tor
+sudo systemctl restart tor
+sudo netstat -tlnp | grep 9050    # 确认 9050 已监听
+```
+
+---
+
+## 三、安装与配置 Privoxy
+
+```bash
+sudo yum install privoxy -y
+```
+
+编辑 `/etc/privoxy/config`，**三项必改**：
+
+| 步骤 | 操作 | 说明 |
+|------|------|------|
+| ① | 注释默认 `listen-address ... 8118` | 避免仍监听旧端口 |
+| ② | 新增 `listen-address 127.0.0.1:19095` | HTTP 代理监听端口 |
+| ③ | 新增 `forward-socks5t / 127.0.0.1:9050 .` | 转发到 Tor，末尾 `.` 不能漏 |
+
+完整示例：
+
+```
+# listen-address  127.0.0.1:8118
+# listen-address  [::1]:8118
+listen-address  127.0.0.1:19095
+forward-socks5t / 127.0.0.1:9050 .
+```
+
+改完权限并启动：
+
+```bash
+sudo chown -R privoxy:privoxy /etc/privoxy/
+sudo systemctl start privoxy
+sudo systemctl enable privoxy
+sudo systemctl restart privoxy
+sudo systemctl status privoxy
+sudo netstat -tlnp | grep 19095
+```
+
+---
+
+## 四、常见问题
+
+### 权限错误（Permission Denied）
+
+`sudo vim` 改配置后文件属主变为 `root`，Privoxy 以 `privoxy` 用户运行时会读不到配置：
+
+```bash
+sudo chown -R privoxy:privoxy /etc/privoxy/
+ls -l /etc/privoxy/config    # 确认属主为 privoxy
+```
+
+### SELinux 拦截自定义端口
+
+CentOS 默认只允许 Privoxy 监听 `8118`，改用 `19095` 可能被拦截：
+
+```bash
+sudo setenforce 0    # 临时关闭，验证后可按需配置 SELinux 策略
+```
+
+### Tor 进程在跑但采集超时
+
+日志出现 `0 circuits open`，或经 Privoxy（`127.0.0.1:19095`）发出的请求超时：
+
+```bash
+sudo systemctl restart tor
+journalctl -u tor -f    # 观察是否出现 Bootstrapped 100%
+```
+
+若重启后仍卡在 Bootstrap 5%，检查系统时间（`timedatectl`）和华为云安全组出站规则；仍不通再考虑配置 Tor Bridge。
+
+---
+
+## 五、API 服务部署
+
+### 接口作用
+
+`/crawl` 是暗网采集服务的唯一入口。客户端传入目标 URL，服务端代为完成：
+
+1. **静态模式**（默认）：通过 Privoxy → Tor 发 HTTP 请求，返回页面 HTML。
+2. **动态模式**（`is_dynamic: true`）：启动无头浏览器经 Tor 访问，支持 JS 渲染、自动滚动，返回渲染后的 HTML。
+
+适用于：暗网页面采集、搜索结果页抓取，以及需要携带登录 Cookie / 自定义请求头的场景。部分站点在代码中内置了采集策略（`SITE_PROFILES` 中的 `force_dynamic`），命中时会自动切换为动态模式。
+
+### 请求格式
+
+```
+POST /crawl
+Host: 124.243.188.109:8000
+Content-Type: application/json
+```
+
+```json
+{
+  "url": "http://darkzqtmbdeauwq5mzcmgeeuhet42fhfjj4p5wbak3ofx2yqgecoeqyd.onion/search?q=a",
+  "is_dynamic": true,
+  "headers": {"User-Agent": "Mozilla/5.0 ..."},
+  "cookies": {"session": "xxx"},
+  "referer": "http://xxx.onion/",
+  "static_timeout_s": 120,
+  "dynamic_timeout_ms": 90000,
+  "wait_selector": "body",
+  "wait_selector_timeout_ms": 30000
+}
+```
+
+### 响应格式
+
+**成功：**
+
+```json
+{
+  "code": 200,
+  "msg": "success",
+  "mode": "动态",
+  "data": {
+    "status_code": 200,
+    "url": "http://xxx.onion/search?q=a",
+    "content": "<!DOCTYPE html>..."
+  }
+}
+```
+
+| 字段 | 说明 |
+|------|------|
+| `code` | 业务状态码，200 表示成功 |
+| `msg` | `success` 或错误描述 |
+| `mode` | 实际使用的采集模式：`静态` / `动态` |
+| `data.status_code` | 目标站点 HTTP 状态码 |
+| `data.url` | 请求的原始 URL |
+| `data.content` | 页面 HTML 源码（主要取用字段） |
+
+**失败：**
+
+```json
+{"code": 504, "msg": "静态请求超时"}
+```
+
+### 代码目录
+
+```bash
+cd /opt/crawl/darkweb_api
+ls darkweb_api_3.py
+```
+
+代码内代理配置（一般无需修改）：
+
+- 静态代理：`127.0.0.1:19095`（Privoxy）
+- 动态代理：`socks5://127.0.0.1:9050`（Tor）
+
+### 启动方式
+
+前台运行（调试用）：
+
+```bash
+cd /opt/crawl/darkweb_api
+python3 darkweb_api_3.py
+```
+
+后台运行（生产常用）：
+
+```bash
+cd /opt/crawl/darkweb_api
+nohup python3 darkweb_api_3.py > ./logs/api_stdout.log 2>&1 &
+```
+
+### 调用示例
+
+**curl：**
+
+```bash
+# 静态采集
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"url": "http://<onion-url>/"}' \
+  http://127.0.0.1:8000/crawl
+
+# 动态采集
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"url": "http://<onion-url>/", "is_dynamic": true}' \
+  http://127.0.0.1:8000/crawl
+```
+
+**Python：**
+
+```python
+import requests
+
+api_url = "http://124.243.188.109:8000/crawl"
+payload = {
+    "url": "http://<onion-url>/",
+    "is_dynamic": True,
+}
+
+resp = requests.post(api_url, json=payload, timeout=300)
+result = resp.json()
+
+if result.get("msg") == "success":
+    html = result["data"]["content"]
+else:
+    print("失败:", result.get("msg"))
+```
+
+**其他服务器（公网）：**
+
+```bash
+curl -X POST -H "Content-Type: application/json" \
+  -d '{"url": "http://<onion-url>/"}' \
+  http://124.243.188.109:8000/crawl
+```
+
+---
+
+## 六、运维速查
+
+```bash
+# 服务状态
+sudo systemctl status tor privoxy
+ps -ef | grep darkweb_api_3
+
+# 端口占用
+sudo netstat -tlnp | grep -E '9050|19095|8000'
+
+# 日志
+tail -f /opt/crawl/darkweb_api/logs/api_stdout.log
+tail -f /opt/crawl/darkweb_api/logs/*.log
+sudo journalctl -u tor -f
+sudo journalctl -u privoxy -f
+```
--- a/请求ok.py
+++ b/请求ok.py
@ -0,0 +1,55 @@
+import requests
+
+# Public address of the crawl API.
+api_url = "http://124.243.188.109:8000/crawl"
+
+# Same service over the VPC-internal address.
+# api_url = "http://192.168.0.131:8000/crawl"
+
+url_list = [
+    'http://darkzqtmbdeauwq5mzcmgeeuhet42fhfjj4p5wbak3ofx2yqgecoeqyd.onion',
+    'http://rznvg5sjacavz5kpshrq4urm75xzruha6iiyuggidnioo5ztvwdfroyd.onion/blogs/where-to-buy-counterfeit-banknotes-how-t/hitman-internet-killers-hit-man-service.html',
+    'http://dwltorbltw3tdjskxn23j2mwz2f4q25j4ninl5bdvttiy4xb6cqzikid.onion/',
+    'https://onionsearchengine.com/search.php?q=search+engine',
+    'http://xao2lxsmia2edq2n5zxg6uahx6xox2t7bfjw6b5vdzsxi7ezmqob6qid.onion/',
+    'http://dwltorbltw3tdjskxn23j2mwz2f4q25j4ninl5bdvttiy4xb6cqzikid.onion/blog/dark-web-onion-links',
+    'http://darkzqtmbdeauwq5mzcmgeeuhet42fhfjj4p5wbak3ofx2yqgecoeqyd.onion/search?q=dark&p=2',
+]
+
+# Only used to name the output file (html<index>.html).
+index = 6
+# URL crawled by the single-request run below.
+# target_url = url_list[index]
+target_url = "http://darkzqtmbdeauwq5mzcmgeeuhet42fhfjj4p5wbak3ofx2yqgecoeqyd.onion/search?q=a"
+
+# A crawl through Tor can take minutes, so allow a generous client timeout
+# rather than waiting forever.
+REQUEST_TIMEOUT_S = 300
+
+
+def crawl(url, is_dynamic=False):
+    """
+    POST one URL to the crawl API and return the page HTML.
+
+    The payload carries only the fields this request needs ("url" and
+    "is_dynamic"); the former "method" field is dropped because the /crawl
+    handler does not read it.
+
+    Returns:
+        str | None: the HTML on success, None when the API reports a failure.
+
+    Raises:
+        requests.RequestException: on network errors or a client timeout.
+        ValueError: when the response body is not JSON.
+    """
+    payload = {"url": url, "is_dynamic": is_dynamic}
+    resp = requests.post(api_url, json=payload, timeout=REQUEST_TIMEOUT_S)
+    data_json = resp.json()
+    print(data_json)
+    if data_json.get('msg') != "success":
+        return None
+    # Failure responses carry no "data" object, so guard the lookup.
+    return (data_json.get('data') or {}).get('content')
+
+
+def crawl_all(urls):
+    """Crawl every URL in dynamic mode, print the ones that fail, and return the success count."""
+    succeeded = 0
+    for item in urls:
+        try:
+            page_html = crawl(item, is_dynamic=True)
+        except (requests.RequestException, ValueError):
+            page_html = None
+        if page_html is None:
+            print(item)
+        else:
+            succeeded += 1
+    return succeeded
+
+
+if __name__ == "__main__":
+    html = crawl(target_url)
+    if html is None:
+        raise SystemExit("crawl failed, nothing written")
+    with open(f'./html{index}.html', 'w', encoding='utf-8') as f:
+        f.write(html)
+    # Batch check over the whole list:
+    # print(crawl_all(url_list))