commit
35d83546d1
19 changed files with 964 additions and 0 deletions
README.md                                    +96
dark_net_crawler/__init__.py                 +0
dark_net_crawler/items.py                    +40
dark_net_crawler/middlewares.py              +164
dark_net_crawler/pipelines.py                +38
dark_net_crawler/settings.py                 +125
dark_net_crawler/spiders/__init__.py         +4
dark_net_crawler/spiders/news_denkbares.py   +122
dark_net_crawler/spiders/shop_pot.py         +139
dark_net_crawler/utils/file_util.py          +28
dark_net_crawler/utils/gofast_util.py        +24
dark_net_crawler/utils/kafka_config.py       +7
dark_net_crawler/utils/kafka_util.py         +28
dark_net_crawler/utils/logger_tool.py        +51
entrypoint.py                                +13
img_1.png                                    BIN
requirements.txt                             +49
scheduled_run.py                             +25
scrapy.cfg                                   +11
@@ -0,0 +1,96 @@ README.md

# dark_net_crawler

Dark-web crawler (new version)

The data fields pushed to the collection platform are the same as the news fields.

1. News site:

http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/

2. Shop (marketplace) site:

http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product

Deployment notes:

1. The following is deployed on the production machine 47.252.23.168 (which already has unrestricted outbound access):

(1) Project path:

/opt/crawl/dark_net/dark_net_crawler

Sandbox (conda) environment:

conda activate pdf_crawler_py3.8

The Python version is 3.8.

(2) Tor service used as the dark-web proxy. Proxy address: socks5h://localhost:9050

Setting up Tor + Privoxy on CentOS (Tor only works if the machine has unrestricted outbound access):

sudo yum install epel-release   (otherwise the tor package source is missing)
sudo yum install tor
service tor start    (start the service)
service tor status   (check the service status)

(3) Privoxy: converts the SOCKS5 proxy into an HTTP proxy: http://172.18.1.103:19050 (the proxy address actually used by this crawler is configured in settings.py)

sudo yum install privoxy   (install)

Edit the Privoxy config to chain Privoxy and Tor together (the two resulting lines are shown below):
vim /etc/privoxy/config
Change listen-address to 0.0.0.0:19050
Search for forward-socks5t and remove the leading comment
Save and exit (ESC, :wq)
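
For reference, after those edits the two relevant lines in /etc/privoxy/config should end up roughly like this (a sketch based on the stock Privoxy config, in which the forward-socks5t example line ships commented out; adjust the ports if your deployment differs):

    listen-address  0.0.0.0:19050
    forward-socks5t   /   127.0.0.1:9050 .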

Start Privoxy:

service privoxy start

Once it has started, check it with service privoxy status; the proxy service setup is then complete.

With the proxy configured, verify that it works:

curl -x http://172.18.1.103:19050 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/'
curl -x http://172.18.1.103:19050 'http://httpbin.org/ip'
curl -x socks5h://localhost:9050 'http://httpbin.org/ip'
curl 'http://httpbin.org/ip'

![img.png](img_1.png)

2. One-off run (local or on the server): python entrypoint.py

Install dependencies: pip install -r requirements.txt --python-version 3.8

Periodic scheduled collection on the server:

conda activate pdf-crawler   (enter the sandbox environment)

python scheduled_run.py   (collects once every Friday)

3. (1) Kafka configuration for the data connection:

dark_net_crawler/utils/kafka_config.py

(2) Data format pushed to the collection platform (the fields match the news format):

see dark_net_crawler/items.py (a sketch of one message is shown below)
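
Each item is pushed to the Kafka topic as a single JSON object; a rough sketch of the dict handed to kafkaProduce() (placeholder values only, field names taken from items.py and the spider assignments; the full field list is in items.py):

```python
{
    "source": "news_denkbares",
    "cid": "news-denkbares",
    "dns": "pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion",
    "url": "http://pr3y...onion/<article>/",
    "news_id": "<md5 of url>",
    "title": "...",
    "author": "...",
    "content": "...",
    "post_time": "2024-07-18 10:00:00",
    "crawlTime": 1721271600,
    "imgList": ["http://pr3y...onion/assets/img/....jpg"],
    "imagePath": ["group17/..."],
    "contentimgs": {"img1": {"img": "...", "uploadImg": "..."}},
    "attr": {"appId": "ic", "attachTag": "", "crawlDataFlag": "url:...", "project_name": "ic"},
    "isDownload": True,
    "pagetype": "newscontent",
    "type": "newscontent",
    "imagePathSize": [],
}
```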

To have newly collected data overwrite what is already on the platform, just increment the version field, e.g. items['version'] = 2.
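
In the spider's parse method that is a one-line change (shop_pot.py already sets this field):

```python
items['version'] = 2  # bump to 3, 4, ... each time the platform data should be overwritten
```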

4. Logging: one file per day, only the last 7 days are kept:

dark-download-logs/2024-07-18.log (created under the working directory; see dark_net_crawler/utils/logger_tool.py)

5. Parsing logic for the two sites:

dark_net_crawler/spiders/news_denkbares.py

dark_net_crawler/spiders/shop_pot.py
@@ -0,0 +1,40 @@ dark_net_crawler/items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class DarkNetCrawlerItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    source = Field()
    cid = Field()
    dns = Field()
    crawlTime = Field()
    creation_time = Field()

    url = Field()
    news_id = Field()
    author = Field()
    purl = Field()
    title = Field()
    content = Field()
    post_time = Field()
    forwardcontent = Field()

    imagePath = Field()
    imgList = Field()
    contentimgs = Field()

    isDownload = Field()
    pagetype = Field()
    type = Field()
    imagePathSize = Field()

    attr = Field()

    version = Field()
@@ -0,0 +1,164 @@ dark_net_crawler/middlewares.py

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

import dark_net_crawler.settings


class PdfCrawlerSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class PdfCrawlerDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


import random


class Proxy_Middleware:

    def __init__(self, crawler):
        # Rotate user agents and HTTP proxies from the lists defined in settings.py.
        self.proxy_list = dark_net_crawler.settings.PROXY_LIST
        self.ua_list = dark_net_crawler.settings.USER_AGENT_LIST

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        try:
            ua = random.choice(self.ua_list)
            request.headers.setdefault('User-Agent', ua)

            proxy_ip_port = random.choice(self.proxy_list)
            request.meta['proxy'] = 'http://' + proxy_ip_port

        except Exception:
            spider.logger.error('Failed to assign a proxy/user agent to the request!')


from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message


class My_RetryMiddleware(RetryMiddleware):
    def __init__(self, crawler):
        # initialise the base RetryMiddleware state (max_retry_times, priority_adjust)
        # so that self._retry() works
        super().__init__(crawler.settings)
        self.proxy_list = dark_net_crawler.settings.PROXY_LIST
        self.ua_list = dark_net_crawler.settings.USER_AGENT_LIST
        self.retry_http_codes = [404]  # status codes are integers

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response

        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            try:
                ua = random.choice(self.ua_list)
                request.headers.setdefault('User-Agent', ua)

                proxy_ip_port = random.choice(self.proxy_list)
                request.meta['proxy'] = 'http://' + proxy_ip_port

            except Exception:
                spider.logger.error('Failed to pick a dark-web proxy IP!')

            return self._retry(request, reason, spider) or response
        return response
@@ -0,0 +1,38 @@ dark_net_crawler/pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import json
from kafka import KafkaProducer

from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.logger_tool import Loggings


class DarkNetCrawlerPipeline:
    def process_item(self, item, spider):
        return item


class DarkNetCrawlerKfkPipeline:
    def open_spider(self, spider):
        self.producer = KafkaProducer(
            bootstrap_servers=KAFKA_ADDRESS,  # comma-separated broker list (see kafka_config.py)
            value_serializer=lambda v: json.dumps(v).encode('utf-8')
        )

    def close_spider(self, spider):
        self.producer.close()

    def process_item(self, item, spider):
        topic = KAFKA_TOPIC["post"]  # replace with your Kafka topic
        data = dict(item)
        self.producer.send(topic, data)
        Loggings.info(f'Send to kfk OK! topic:{topic}, data:{data}')
        return item
@@ -0,0 +1,125 @@ dark_net_crawler/settings.py

# Scrapy settings for dark_net_crawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dark_net_crawler'

SPIDER_MODULES = ['dark_net_crawler.spiders']
NEWSPIDER_MODULE = 'dark_net_crawler.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dark_net_crawler (+http://www.yourdomain.com)'

# User-Agent pool, picked at random by Proxy_Middleware in middlewares.py
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0"
]

# HTTP proxies (Privoxy in front of Tor), picked at random by Proxy_Middleware
PROXY_LIST = [
    # 'localhost:9050',
    # 'localhost:19050',
    '47.252.23.168:19050'
]


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dark_net_crawler.middlewares.DarkNetCrawlerSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'dark_net_crawler.middlewares.DarkNetCrawlerDownloaderMiddleware': 543,
    'dark_net_crawler.middlewares.Proxy_Middleware': 800,
    # 'dark_net_crawler.middlewares.My_RetryMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'dark_net_crawler.pipelines.DarkNetCrawlerPipeline': 300,
    # 'dark_net_crawler.pipelines.DarkNetCrawlerKfkPipeline': 800,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# NOTE: Scrapy only picks up UPPERCASE names from this module; the standard
# retry-count setting is RETRY_TIMES, and 'max_retry_times' is only honoured
# as a Request.meta key.
max_retry_times = 3
@@ -0,0 +1,4 @@ dark_net_crawler/spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,122 @@ dark_net_crawler/spiders/news_denkbares.py

# -*- coding: utf-8 -*-
"""
Spider for the Denkbares news site.
"""
import hashlib
import logging
import time

import scrapy
from lxml import etree

from dark_net_crawler.items import DarkNetCrawlerItem
from dark_net_crawler.utils.file_util import download_file
from dark_net_crawler.utils.gofast_util import upload_to_gofast
from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.kafka_util import kafkaProduce
from dark_net_crawler.utils.logger_tool import Loggings


start_url = 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/'


class NewsDenkbaresSpider(scrapy.Spider):
    name = 'news_denkbares'
    # allowed_domains = ['pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion']

    # per-spider settings (only applied when defined as a class attribute)
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60,
    }

    def start_requests(self):
        urls = [
            start_url,
            # "http://httpbin.org/ip",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_list_page)

    def parse_list_page(self, response):
        # Loggings.info(f'list_page_html:{response.text}')
        li_elements = response.css('ul.post-list > li')
        Loggings.info(f'news_count:{len(li_elements)}')
        for li_element in li_elements:
            detail_page_url = li_element.css('li > h3 > a::attr(href)').get()
            title = li_element.css('li > h3 > a::text').get().replace('\n', '').strip()
            post_time = li_element.css('li > span::text').get()
            Loggings.info(f'title:{title}, \tpost_time:{post_time}, \tdetail_page_url:{detail_page_url}')

            # if 'widersprueche-der-querfront-nachdenkseiten' in detail_page_url:
            yield response.follow(response.urljoin(detail_page_url), callback=self.parse_detail_page, meta={'title': title, 'post_time': post_time})
            # break

    def parse_detail_page(self, response):
        title = response.meta['title'].replace('\\u', ' ')
        post_time = response.css('time.dt-published::attr(datetime)').get().split('+')[0].replace('T', ' ')
        content = ''.join(response.xpath('//div[@class="post-content e-content"]//./text()').extract())
        html = response.css('div.post-content').get()
        author = response.css('span.p-author::text').get()

        # images
        img_part_urls = response.xpath('//div[@class="post-content e-content"]/figure/p/img/@src').extract()
        img_urls = []
        imagePath = []

        img_map = {}

        for index, img_part_url in enumerate(img_part_urls):
            img_url = response.urljoin(img_part_url)
            Loggings.info(f'{img_url}')
            img_urls.append(img_url)

            # download the image through the proxy, then push it to the gofast file server
            file_name = '/opt/pics/news_denkbares/%s' % (img_part_url.split('/')[-1])
            download_file(img_url, file_name)
            gofast_path = upload_to_gofast(file_name)
            imagePath.append(gofast_path)

            img_map[f"img{index + 1}"] = {
                "img": img_url,
                "uploadImg": gofast_path
            }

        # Loggings.info(f'{img_urls}')
        # Loggings.info(f'{img_map}')

        # Loggings.info(f'{title}\t{post_time}\t{author}\n{img_urls}\n{content}')

        items = DarkNetCrawlerItem()

        items['url'] = response.url
        items['news_id'] = hashlib.md5(items['url'].encode(encoding='UTF-8')).hexdigest()
        items['purl'] = start_url

        items['title'] = title
        items['post_time'] = post_time
        items['content'] = content
        items['author'] = author
        items['forwardcontent'] = html
        items['imgList'] = img_urls
        items['imagePath'] = imagePath
        items['contentimgs'] = img_map

        items['source'] = 'news_denkbares'
        items['cid'] = 'news-denkbares'
        items['dns'] = 'pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion'
        items['crawlTime'] = int(time.time())
        items['creation_time'] = items['crawlTime']

        items['attr'] = {
            "appId": "ic",
            "attachTag": "",
            "crawlDataFlag": f'url:{start_url}',
            "project_name": "ic"
        }

        items['isDownload'] = True
        items['pagetype'] = 'newscontent'
        items['type'] = 'newscontent'
        items['imagePathSize'] = []

        kafkaProduce(KAFKA_TOPIC["post"], dict(items))
        Loggings.info(f'Send to kfk OK!')

        yield items  # hand the item over to the pipelines
@@ -0,0 +1,139 @@ dark_net_crawler/spiders/shop_pot.py

# -*- coding: utf-8 -*-
"""
Spider for the pot shop (marketplace) site.
"""
import hashlib
import logging
import time

import scrapy

from dark_net_crawler.items import DarkNetCrawlerItem
from dark_net_crawler.utils.file_util import download_file
from dark_net_crawler.utils.gofast_util import upload_to_gofast
from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.kafka_util import kafkaProduce
from dark_net_crawler.utils.logger_tool import Loggings


start_url = 'http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product'
shop_pot_list_url = 'http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product&paged=%s'


class ShopPotSpider(scrapy.Spider):
    name = 'shop_pot'
    # allowed_domains = ['pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion']

    # per-spider settings (only applied when defined as a class attribute)
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60,
    }

    def start_requests(self):
        total_page = 1
        for page_no in range(1, total_page + 1):
            turn_page_url = shop_pot_list_url % ('%d' % page_no)
            # Loggings.info(turn_page_url)
            yield scrapy.Request(turn_page_url, callback=self.parse_list_page, dont_filter=True)
            # break

    def parse_list_page(self, response):
        # Loggings.info(f'list_page_html:{response.text}')
        li_elements = response.css('ul.products > li')
        Loggings.info(f'shop_pot_count:{len(li_elements)}')
        for li_element in li_elements:
            detail_page_url = li_element.css('li > a::attr(href)').get()
            Loggings.info(detail_page_url)
            # if 'widersprueche-der-querfront-nachdenkseiten' in detail_page_url:
            # if 'product=afghan-skunk' in detail_page_url:
            # if 'product=atomic-5g-new' in detail_page_url:
            yield response.follow(detail_page_url, callback=self.parse_detail_page, meta={'purl': response.url})
            # break

    def parse_detail_page(self, response):
        current_price = response.xpath('//ins/span[@class="woocommerce-Price-amount amount"]/bdi/text()').get()

        if current_price is None:
            current_price = response.xpath('//span[@class="woocommerce-Price-amount amount"]/bdi/text()').get()

        title = response.css('h1.product_title::text').get() \
            + ' $%s' % current_price
        content = str(response.css('div.woocommerce-product-details__short-description > p::text').get()).replace('None', '') \
            + '\n' \
            + str(response.css('span.sku_wrapper::text').get()).replace('None', '') + str(response.css('span.sku::text').get()).replace('None', '') \
            + '\t' \
            + str(response.css('span.posted_in::text').get()).replace('None', '') + str(response.css('span.posted_in > a::text').get()).replace('None', '') \
            + '\n' \
            + ''.join(response.css('#tab-description > h2::text').get()) \
            + '\n' \
            + '\n'.join(response.css('#tab-description > p::text').extract())

        logging.info(title)
        logging.info(content)

        html = ''.join(response.css('#main > div > div').getall())
        # logging.info(html)

        # images
        img_urls = response.xpath('//ol[@class="flex-control-nav flex-control-thumbs"]/li/img/@src').extract()
        if len(img_urls) == 0:
            img_urls.append(response.xpath('//img[@class="wp-post-image"]/@src').get())

        imagePath = []

        img_map = {}

        for index, img_url in enumerate(img_urls):
            Loggings.info(f'{img_url}')

            # download the image through the proxy, then push it to the gofast file server
            file_name = '/opt/pics/news_denkbares/%s' % (hashlib.md5(img_url.encode(encoding='UTF-8')).hexdigest())
            download_file(img_url, file_name)
            gofast_path = upload_to_gofast(file_name)
            imagePath.append(gofast_path)

            img_map[f"img{index + 1}"] = {
                "img": img_url,
                "uploadImg": gofast_path
            }

        Loggings.info(f'{img_urls}')
        Loggings.info(f'{img_map}')

        # Loggings.info(f'{title}\t{post_time}\t{author}\n{img_urls}\n{content}')

        items = DarkNetCrawlerItem()

        items['url'] = response.url
        items['news_id'] = hashlib.md5(items['url'].encode(encoding='UTF-8')).hexdigest()
        items['purl'] = response.meta['purl']

        items['title'] = title
        items['post_time'] = int(time.time())
        items['content'] = content
        items['forwardcontent'] = html
        items['imgList'] = img_urls
        items['imagePath'] = imagePath
        items['contentimgs'] = img_map

        items['source'] = 'shop_pot'
        items['cid'] = 'shop-pot'
        items['dns'] = 'potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion'
        items['crawlTime'] = int(time.time())
        items['creation_time'] = items['crawlTime']

        items['attr'] = {
            "appId": "ic",
            "attachTag": "",
            "crawlDataFlag": f'url:{start_url}',
            "project_name": "ic"
        }

        items['isDownload'] = True
        items['pagetype'] = 'newscontent'
        items['type'] = 'newscontent'
        items['imagePathSize'] = []
        items['version'] = 2

        kafkaProduce(KAFKA_TOPIC["post"], dict(items))
        Loggings.info(f'Send to kfk OK!')

        yield items  # hand the item over to the pipelines
@@ -0,0 +1,28 @@ dark_net_crawler/utils/file_util.py

import logging

import requests

# route downloads through the local Privoxy HTTP proxy (Tor behind it)
proxy = {
    "http": "127.0.0.1:19050"
}


def download_file(url, file_name):
    r = requests.get(url, stream=True, proxies=proxy)
    chunk_size = 1000000
    with open(file_name, 'wb') as fd:
        for chunk in r.iter_content(chunk_size):
            fd.write(chunk)


def save_html(html, file_name):
    with open(file_name, 'wb') as fd:
        fd.write(html)


if __name__ == '__main__':
    img_part_url = '/assets/img/widerspruch-nds/wellbrock-warweg-ungar.jpg'
    img_url = 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion' + img_part_url
    file_name = '/opt/pics/news_denkbares/%s' % (img_part_url.split('/')[-1])
    download_file(img_url, file_name)
    logging.info(f'ok!')
@@ -0,0 +1,24 @@ dark_net_crawler/utils/gofast_util.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import json

import requests

from dark_net_crawler.utils.logger_tool import Loggings

url = 'http://172.18.1.130:8896/group17/upload'


def upload_to_gofast(file_name):
    # options mirror the ones sent by the browser upload form
    options = {'output': 'json', 'path': '', 'scene': ''}
    with open(file_name, 'rb') as f:
        response = requests.post(url, data=options, files={'file': f})
    # Loggings.info(response.text)
    gofast_path = json.loads(response.text)['url']
    Loggings.info(gofast_path)
    return gofast_path


if __name__ == '__main__':
    upload_to_gofast('/opt/pics/news_denkbares/wellbrock-warweg-ungar.jpg')
@@ -0,0 +1,7 @@ dark_net_crawler/utils/kafka_config.py

# topic
KAFKA_TOPIC = {
    "post": "newsTopicdata",
    # "post": "test-yj",
}
KAFKA_ADDRESS = "172.18.1.101:9092,172.18.1.102:9092,172.18.1.104:9092,172.18.1.180:9092,172.18.1.182:9092"
# KAFKA_ADDRESS = "127.0.0.1:9092"
@@ -0,0 +1,28 @@ dark_net_crawler/utils/kafka_util.py

# coding=utf-8
import json

from kafka import KafkaProducer

from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.logger_tool import Loggings

"""
Write data to Kafka.
"""


def kafkaProduce(topic, data):
    try:
        producer = KafkaProducer(bootstrap_servers='{}'.format(KAFKA_ADDRESS),
                                 value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'))
        producer.send(topic, data)
        Loggings.info(f'Send to kfk OK! topic:{topic}, data:{data}')
        producer.flush()
    except Exception as e:
        Loggings.warning(f"Kafka connection failed: {e}")


if __name__ == '__main__':
    resultData = ["111111111111111111111", "222222222"]
    kafkaProduce(KAFKA_TOPIC["post"], resultData)
    Loggings.info(f'Send to kfk OK!')
@@ -0,0 +1,51 @@ dark_net_crawler/utils/logger_tool.py

import os
import time
from loguru import logger
from pathlib import Path


class Loggings:
    __instance = None
    # project_path = Path.cwd().parent
    project_path = Path.cwd()
    log_path = Path(os.path.join(project_path), "dark-download-logs")
    if not os.path.exists(log_path):
        os.mkdir(log_path)

    # one log file per day, keep only the last 7 days
    logger.add(str(log_path) + "/{time:YYYY-MM-DD}.log", format="{time:YYYY-MM-DD A HH:mm:ss.SSSS} | {level} | {name} | {message}", level="DEBUG",
               rotation="00:00", encoding="utf-8", enqueue=True, retention="7 days", backtrace=True, diagnose=True)

    def __new__(cls, *args, **kwargs):
        if not cls.__instance:
            cls.__instance = super(Loggings, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    @classmethod
    def info(cls, msg):
        return logger.info(msg)

    @classmethod
    def debug(cls, msg):
        return logger.debug(msg)

    @classmethod
    def warning(cls, msg):
        return logger.warning(msg)

    @classmethod
    def error(cls, msg):
        return logger.error(msg)


loggings = Loggings()
if __name__ == '__main__':
    loggings.info("info")
    # loggings.debug("debug")
    # loggings.warning("warning")
    # loggings.error("error")

# Usage:
# from logger_tool import Loggings
#
# Loggings.error(f"{post_url} detail request failed: {e}")
@@ -0,0 +1,13 @@ entrypoint.py

# -*- coding: utf-8 -*-
"""
Entry point for a one-off run.
"""

from scrapy.cmdline import execute

if __name__ == '__main__':
    # start the news_denkbares news-site spider
    execute(['scrapy', 'crawl', 'news_denkbares'])

    # start the shop_pot drug-shop spider
    # execute(['scrapy', 'crawl', 'shop_pot'])
img_1.png  (new binary file: image, 1449 × 661, 660 KiB)
@@ -0,0 +1,49 @@ requirements.txt

aiocontextvars==0.2.2
attrs==24.2.0
Automat==20.2.0
certifi==2020.12.5
cffi==1.17.1
chardet==4.0.0
constantly==15.1.0
contextvars==2.4
cryptography==43.0.1
cssselect==1.2.0
defusedxml==0.7.1
filelock==3.16.1
h2==3.2.0
hpack==3.0.0
hyperframe==5.2.0
hyperlink==21.0.0
idna==2.10
immutables==0.19
incremental==24.7.0
itemadapter==0.9.0
itemloaders==1.3.2
jmespath==1.0.1
kafka-python==2.0.2
loguru==0.7.2
lxml==5.3.0
packaging==24.1
parsel==1.9.1
priority==1.3.0
Protego==0.1.16
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycparser==2.22
PyDispatcher==2.0.5
PyHamcrest==2.0.2
pyOpenSSL==24.2.1
queuelib==1.5.0
requests==2.25.1
requests-file==2.1.0
schedule==1.1.0
Scrapy==2.11.2
service-identity==18.1.0
six==1.16.0
tldextract==5.1.2
tomli==2.0.2
Twisted==24.7.0
typing_extensions==4.2.0
urllib3==1.26.4
w3lib==2.2.1
zope.interface==7.1.0
@@ -0,0 +1,25 @@ scheduled_run.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import schedule
import time
from scrapy.cmdline import execute


def crawl_1_news():
    # start the news_denkbares news-site spider
    execute(['scrapy', 'crawl', 'news_denkbares'])


def crawl_2_pot():
    # start the shop_pot drug-shop spider
    execute(['scrapy', 'crawl', 'shop_pot'])


# Run every Friday morning: the news site at 05:00, the shop site at 07:00.
# Note: scrapy.cmdline.execute() calls sys.exit() when its crawl finishes, so
# only the first job that fires will run per process start.
schedule.every().friday.at('5:00').do(crawl_1_news)
schedule.every().friday.at('7:00').do(crawl_2_pot)

while True:
    schedule.run_pending()
    time.sleep(1)
@@ -0,0 +1,11 @@ scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = dark_net_crawler.settings

[deploy]
#url = http://localhost:6800/
project = dark_net_crawler