commit 35d83546d1d936d32e609c5ff8e78648dc5477e7
Author: steve.gao
Date:   Thu Jan 2 15:36:36 2025 +0800

    initial

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cc9e7b2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,96 @@
+# dark_net_crawler
+
+Dark-web crawler (new version)
+-- the fields pushed to the collection platform are identical to the news fields.
+
+1. News site:
+
+http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/
+
+
+2. Marketplace site:
+
+http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product
+
+
+Deployment notes:
+1. On the production machine 47.252.23.168 (already has unrestricted outbound access), the following are deployed:
+
+   (1) Project path:
+
+       /opt/crawl/dark_net/dark_net_crawler
+
+       Sandbox environment:
+
+       conda activate pdf_crawler_py3.8
+
+       Python version: 3.8
+
+   (2) Tor service used as the dark-web crawling proxy -- proxy address: socks5h://localhost:9050
+
+       CentOS + Tor + Privoxy setup (Tor only works if the machine can reach the open internet):
+
+       sudo yum install epel-release   # otherwise the tor package source is missing
+       sudo yum install tor
+       service tor start               # start the service
+       service tor status              # check the service status
+
+   (3) Privoxy: converts the SOCKS5 proxy into an HTTP proxy: http://172.18.1.103:19050 (the proxy address actually used by the crawler; see settings.py)
+
+       sudo yum install privoxy        # install
+
+       Edit the Privoxy config file to chain Privoxy to Tor:
+       vim /etc/privoxy/config
+       set listen-address to 0.0.0.0:19050
+       find the forward-socks5t line and remove the comment marker
+       press ESC, then :wq to save and exit
+
+       Start Privoxy:
+       service privoxy start
+       then check it with service privoxy status; the proxy chain is now in place
+
+       With the proxy configured, verify that it works:
+       curl -x http://172.18.1.103:19050 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/'
+       curl -x http://172.18.1.103:19050 'http://httpbin.org/ip'
+       curl -x socks5h://localhost:9050 'http://httpbin.org/ip'
+       curl 'http://httpbin.org/ip'
+       ![img_1.png](img_1.png)
+
+2. One-off run (local or production): python entrypoint.py
+
+
+   Install dependencies (Python 3.8): pip install -r requirements.txt
+
+   Scheduled periodic crawling on production:
+
+   conda activate pdf-crawler   (enter the sandbox environment)
+
+   python scheduled_run.py      (crawls once every Friday)
+
+
+3. (1) Kafka configuration for the data connection:
+
+       dark_net_crawler/utils/kafka_config.py
+
+   (2) Output format pushed to the collection platform (same fields as news):
+
+       see dark_net_crawler/items.py
+
+       To make newly crawled data overwrite earlier records, increment the version field, e.g. items['version'] = 2.
+
+
+4. Logging -- one file per day, only the last 7 days are kept:
+
+   dark_net_crawler/utils/fb-download-logs/2024-07-18.log
+
+
+5. Parsing logic for the two sites:
+
+   dark_net_crawler/spiders/news_denkbares.py
+
+   dark_net_crawler/spiders/shop_pot.py
+
+
+
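The curl checks above can also be scripted in Python. A minimal sketch, assuming `requests` is available (and `requests[socks]`/PySocks for the socks5h entry, which requirements.txt does not pin); it prints the exit IP seen through each path:

```python
# Quick connectivity check for the Tor/Privoxy proxies described above.
# Mirrors the curl commands; proxy addresses are taken from the README.
import requests

CHECKS = {
    "privoxy (http)": {"http": "http://172.18.1.103:19050"},
    "tor (socks5h)": {"http": "socks5h://localhost:9050"},  # needs requests[socks]
    "direct": None,
}

for name, proxies in CHECKS.items():
    try:
        resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=60)
        print(f"{name}: {resp.json()['origin']}")
    except requests.RequestException as exc:
        print(f"{name}: failed ({exc})")
```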
diff --git a/dark_net_crawler/__init__.py b/dark_net_crawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dark_net_crawler/items.py b/dark_net_crawler/items.py
new file mode 100644
index 0000000..adefd6b
--- /dev/null
+++ b/dark_net_crawler/items.py
@@ -0,0 +1,40 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+from scrapy.item import Item, Field
+
+
+class DarkNetCrawlerItem(Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+
+    source = Field()
+    cid = Field()
+    dns = Field()
+    crawlTime = Field()
+    creation_time = Field()
+
+    url = Field()
+    news_id = Field()
+    author = Field()
+    purl = Field()
+    title = Field()
+    content = Field()
+    post_time = Field()
+    forwardcontent = Field()
+
+    imagePath = Field()
+    imgList = Field()
+    contentimgs = Field()
+
+    isDownload = Field()
+    pagetype = Field()
+    type = Field()
+    imagePathSize = Field()
+
+    attr = Field()
+
+    version = Field()
\ No newline at end of file
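items.py defines the same fields as the news schema. The README's note about overwriting previously pushed records by bumping `version` looks like this in spider code; a sketch with placeholder values only:

```python
# Illustrative only: how a spider fills DarkNetCrawlerItem before pushing it
# to Kafka. The URL and text below are placeholders, not real data.
import hashlib
import time

from dark_net_crawler.items import DarkNetCrawlerItem

item = DarkNetCrawlerItem()
item['url'] = 'http://example.onion/post/1'   # placeholder URL
item['news_id'] = hashlib.md5(item['url'].encode('utf-8')).hexdigest()
item['title'] = 'Example title'
item['content'] = 'Example body text'
item['crawlTime'] = int(time.time())
item['creation_time'] = item['crawlTime']
item['version'] = 2   # bump to overwrite a previously pushed record
```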
diff --git a/dark_net_crawler/middlewares.py b/dark_net_crawler/middlewares.py
new file mode 100644
index 0000000..a800b52
--- /dev/null
+++ b/dark_net_crawler/middlewares.py
@@ -0,0 +1,164 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+import random
+
+from scrapy import signals
+from scrapy.downloadermiddlewares.retry import RetryMiddleware
+from scrapy.utils.response import response_status_message
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class DarkNetCrawlerSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class DarkNetCrawlerDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class Proxy_Middleware:
+    # Rotates the User-Agent and routes every request through one of the
+    # Privoxy HTTP proxies listed in the PROXY_LIST setting.
+
+    def __init__(self, crawler):
+        self.proxy_list = crawler.settings.getlist('PROXY_LIST')
+        self.ua_list = crawler.settings.getlist('USER_AGENT_LIST')
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def process_request(self, request, spider):
+        try:
+            ua = random.choice(self.ua_list)
+            request.headers.setdefault('User-Agent', ua)
+
+            proxy_ip_port = random.choice(self.proxy_list)
+            request.meta['proxy'] = 'http://' + proxy_ip_port
+        except IndexError:
+            # PROXY_LIST or USER_AGENT_LIST is empty.
+            spider.logger.error('Failed to assign a dark-web proxy or User-Agent!')
+
+
+class My_RetryMiddleware(RetryMiddleware):
+    # Retries 404 responses with a freshly rotated User-Agent and proxy.
+
+    def __init__(self, crawler):
+        # Initialise the stock RetryMiddleware (max_retry_times, priority
+        # adjustment, ...) before narrowing the retried status codes.
+        super().__init__(crawler.settings)
+        self.proxy_list = crawler.settings.getlist('PROXY_LIST')
+        self.ua_list = crawler.settings.getlist('USER_AGENT_LIST')
+        self.retry_http_codes = {404}
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def process_response(self, request, response, spider):
+        if request.meta.get('dont_retry', False):
+            return response
+
+        if response.status in self.retry_http_codes:
+            reason = response_status_message(response.status)
+            try:
+                ua = random.choice(self.ua_list)
+                request.headers.setdefault('User-Agent', ua)
+
+                proxy_ip_port = random.choice(self.proxy_list)
+                request.meta['proxy'] = 'http://' + proxy_ip_port
+            except IndexError:
+                spider.logger.error('Failed to obtain a dark-web proxy for the retry!')
+
+            return self._retry(request, reason, spider) or response
+        return response
diff --git a/dark_net_crawler/pipelines.py b/dark_net_crawler/pipelines.py
new file mode 100644
index 0000000..4681e48
--- /dev/null
+++ b/dark_net_crawler/pipelines.py
@@ -0,0 +1,38 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+import json
+from kafka import KafkaProducer
+
+from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
+from dark_net_crawler.utils.logger_tool import Loggings
+
+
+class DarkNetCrawlerPipeline:
+    def process_item(self, item, spider):
+        return
item + + +class DarkNetCrawlerKfkPipeline: + def open_spider(self, spider): + self.producer = KafkaProducer( + bootstrap_servers=[KAFKA_ADDRESS], # 替换为你的Kafka服务器地址 + value_serializer=lambda v: json.dumps(v).encode('utf-8') + ) + + def close_spider(self, spider): + self.producer.close() + + def process_item(self, item, spider): + topic = KAFKA_TOPIC["post"] + data = dict(item) + self.producer.send(topic, data) # 替换为你的Kafka主题 + Loggings.info(f'Send to kfk OK! topic:{topic}, data:{data}') + return item + diff --git a/dark_net_crawler/settings.py b/dark_net_crawler/settings.py new file mode 100644 index 0000000..bac265a --- /dev/null +++ b/dark_net_crawler/settings.py @@ -0,0 +1,125 @@ +# Scrapy settings for dark_net_crawler project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'dark_net_crawler' + +SPIDER_MODULES = ['dark_net_crawler.spiders'] +NEWSPIDER_MODULE = 'dark_net_crawler.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'dark_net_crawler (+http://www.yourdomain.com)' + +# setting中设置 +USER_AGENT_LIST = [ \ + "Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", \ + "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ + "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ + "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ + "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/43.0.2357.132 Safari/537.36", \ + "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0" +] + +PROXY_LIST = [ + # 'localhost:9050', + # 'localhost:19050', + '47.252.23.168:19050' +] + + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'dark_net_crawler.middlewares.DarkNetCrawlerSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +DOWNLOADER_MIDDLEWARES = { + # 'dark_net_crawler.middlewares.DarkNetCrawlerDownloaderMiddleware': 543, + 'dark_net_crawler.middlewares.Proxy_Middleware': 800, + # 'dark_net_crawler.middlewares.My_RetryMiddleware': 543, +} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + # 'dark_net_crawler.pipelines.DarkNetCrawlerPipeline': 300, + # 'dark_net_crawler.pipelines.DarkNetCrawlerKfkPipeline': 800, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +max_retry_times = 3 \ No newline at end of file diff --git a/dark_net_crawler/spiders/__init__.py b/dark_net_crawler/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/dark_net_crawler/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
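Both spiders that follow declare `custom_settings` as a module-level dict; Scrapy only applies `custom_settings` when it is a class attribute of the spider. A minimal sketch of the intended placement (the spider name and URL are placeholders, not part of this project):

```python
import scrapy


class ExampleOnionSpider(scrapy.Spider):
    # Placeholder spider: custom_settings must be a class attribute,
    # otherwise Scrapy never picks it up.
    name = 'example_onion'
    custom_settings = {
        'DOWNLOAD_TIMEOUT': 60,  # .onion sites answer slowly through Tor
    }

    def start_requests(self):
        yield scrapy.Request('http://example.onion/', callback=self.parse)

    def parse(self, response):
        self.logger.info('fetched %s', response.url)
```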
diff --git a/dark_net_crawler/spiders/news_denkbares.py b/dark_net_crawler/spiders/news_denkbares.py new file mode 100644 index 0000000..e2551b2 --- /dev/null +++ b/dark_net_crawler/spiders/news_denkbares.py @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +""" + Denkbares 新闻网站采集 +""" +import hashlib +import logging +import time + +import scrapy +from lxml import etree + +from dark_net_crawler.items import DarkNetCrawlerItem +from dark_net_crawler.utils.file_util import download_file +from dark_net_crawler.utils.gofast_util import upload_to_gofast +from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS +from dark_net_crawler.utils.kafka_util import kafkaProduce +from dark_net_crawler.utils.logger_tool import Loggings + +custom_settings = { + 'DOWNLOAD_TIMEOUT': 60, +} + +start_url = 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/' + + +class NewsDenkbaresSpider(scrapy.Spider): + name = 'news_denkbares' + # allowed_domains = ['pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion'] + + def start_requests(self): + urls = [ + start_url, + # "http://httpbin.org/ip", + ] + for url in urls: + yield scrapy.Request(url=url, callback=self.parse_list_page) + + def parse_list_page(self, response): + # Loggings.info(f'list_page_html:{response.text}') + li_elements = response.css('ul.post-list > li') + Loggings.info(f'news_count:{len(li_elements)}') + for li_element in li_elements: + detail_page_url = li_element.css('li > h3 > a::attr(href)').get() + title = li_element.css('li > h3 > a::text').get().replace('\n', '').strip() + post_time = li_element.css('li > span::text').get() + Loggings.info(f'title:{title}, \tpost_time:{post_time}, \tdetail_page_url:{detail_page_url}') + + # if 'widersprueche-der-querfront-nachdenkseiten' in detail_page_url: + yield response.follow(response.urljoin(detail_page_url), callback=self.parse_detail_page, meta={'title': title, 'post_time': post_time}) + # break + + def parse_detail_page(self, response): + title = response.meta['title'].replace('\\u', ' ') + post_time = response.css('time.dt-published::attr(datetime)').get().split('+')[0].replace('T', ' ') + content = ''.join(response.xpath('//div[@class=\"post-content e-content\"]//./text()').extract()) + html = response.css('div.post-content').get() + author = response.css('span.p-author::text').get() + # 图片 + img_part_urls = response.xpath('//div[@class=\"post-content e-content\"]/figure/p/img/@src').extract() + img_urls = [] + imagePath = [] + + img_map = {} + + for index, img_part_url in enumerate(img_part_urls): + img_url = response.urljoin(img_part_url) + Loggings.info(f'{img_url}') + img_urls.append(img_url) + + file_name = '/opt/pics/news_denkbares/%s' % (img_part_url.split('/')[-1]) + download_file(img_url, file_name) + gofast_path = upload_to_gofast(file_name) + imagePath.append(gofast_path) + + img_map[f"img{index + 1}"] = { + "img": img_url, + "uploadImg": gofast_path + } + + # Loggings.info(f'{img_urls}') + # Loggings.info(f'{img_map}') + + + # Loggings.info(f'{title}\t{post_time}\t{author}\n{img_urls}\n{content}') + + items = DarkNetCrawlerItem() + + items['url'] = response.url + items['news_id'] = hashlib.md5(items['url'].encode(encoding='UTF-8')).hexdigest() + items['purl'] = start_url + + items['title'] = title + items['post_time'] = post_time + items['content'] = content + items['author'] = author + items['forwardcontent'] = html + items['imgList'] = img_urls + items['imagePath'] = imagePath + items['contentimgs'] = img_map + + items['source'] = 
'news_denkbares' + items['cid'] = 'news-denkbares' + items['dns'] = 'pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion' + items['crawlTime'] = int(time.time()) + items['creation_time'] = items['crawlTime'] + + items['attr'] = { + "appId": "ic", + "attachTag": "", + "crawlDataFlag": f'url:{start_url}', + "project_name": "ic" + } + + items['isDownload'] = True + items['pagetype'] = 'newscontent' + items['type'] = 'newscontent' + items['imagePathSize'] = [] + + kafkaProduce(KAFKA_TOPIC["post"], dict(items)) + Loggings.info(f'Send to kfk OK!') + + yield items # 将items提交到管道中 diff --git a/dark_net_crawler/spiders/shop_pot.py b/dark_net_crawler/spiders/shop_pot.py new file mode 100644 index 0000000..5f22ee9 --- /dev/null +++ b/dark_net_crawler/spiders/shop_pot.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +""" + Denkbares 新闻网站采集 +""" +import hashlib +import logging +import time + +import scrapy + +from dark_net_crawler.items import DarkNetCrawlerItem +from dark_net_crawler.utils.file_util import download_file +from dark_net_crawler.utils.gofast_util import upload_to_gofast +from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS +from dark_net_crawler.utils.kafka_util import kafkaProduce +from dark_net_crawler.utils.logger_tool import Loggings + +custom_settings = { + 'DOWNLOAD_TIMEOUT': 60, +} + +start_url = 'http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product' +shop_pot_list_url = 'http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product&paged=%s' + + +class NewsDenkbaresSpider(scrapy.Spider): + name = 'shop_pot' + # allowed_domains = ['pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion'] + + def start_requests(self): + total_page = 1 + for page_no in range(1, total_page + 1): + turn_page_url = shop_pot_list_url % ('%d' % page_no) + # Loggings.info(turn_page_url) + yield scrapy.Request(turn_page_url, callback=self.parse_list_page, dont_filter=True) + # break + + def parse_list_page(self, response): + # Loggings.info(f'list_page_html:{response.text}') + li_elements = response.css('ul.products > li') + Loggings.info(f'shop_pot_count:{len(li_elements)}') + for li_element in li_elements: + detail_page_url = li_element.css('li > a::attr(href)').get() + Loggings.info(detail_page_url) + # if 'widersprueche-der-querfront-nachdenkseiten' in detail_page_url: + # if 'product=afghan-skunk' in detail_page_url: + # if 'product=atomic-5g-new' in detail_page_url: + yield response.follow(detail_page_url, callback=self.parse_detail_page, meta={'purl': response.url}) + # break + + def parse_detail_page(self, response): + current_price = response.xpath('//ins/span[@class=\"woocommerce-Price-amount amount\"]/bdi/text()').get() + + if current_price is None: + current_price = response.xpath('//span[@class=\"woocommerce-Price-amount amount\"]/bdi/text()').get() + + title = response.css('h1.product_title::text').get() \ + + ' $%s' % current_price + content = str(response.css('div.woocommerce-product-details__short-description > p::text').get()).replace('None', '') \ + + '\n' \ + + str(response.css('span.sku_wrapper::text').get()).replace('None', '') + str(response.css('span.sku::text').get()).replace('None', '') \ + + '\t' \ + + str(response.css('span.posted_in::text').get()).replace('None', '') + str(response.css('span.posted_in > a::text').get()).replace('None', '') \ + + '\n' \ + + ''.join(response.css('#tab-description > h2::text').get()) \ + + '\n' \ + + '\n'.join(response.css('#tab-description > 
p::text').extract()) + + logging.info(title) + logging.info(content) + + html = ''.join(response.css('#main > div > div').getall()) + # logging.info(html) + + # 图片 + img_urls = response.xpath('//ol[@class=\"flex-control-nav flex-control-thumbs\"]/li/img/@src').extract() + if len(img_urls) == 0: + img_urls.append(response.xpath('//img[@class=\"wp-post-image\"]/@src').get()) + + imagePath = [] + + img_map = {} + + for index, img_url in enumerate(img_urls): + Loggings.info(f'{img_url}') + + file_name = '/opt/pics/news_denkbares/%s' % (hashlib.md5(img_url.encode(encoding='UTF-8')).hexdigest()) + download_file(img_url, file_name) + gofast_path = upload_to_gofast(file_name) + imagePath.append(gofast_path) + + img_map[f"img{index + 1}"] = { + "img": img_url, + "uploadImg": gofast_path + } + + Loggings.info(f'{img_urls}') + Loggings.info(f'{img_map}') + + + # Loggings.info(f'{title}\t{post_time}\t{author}\n{img_urls}\n{content}') + + items = DarkNetCrawlerItem() + + items['url'] = response.url + items['news_id'] = hashlib.md5(items['url'].encode(encoding='UTF-8')).hexdigest() + items['purl'] = response.meta['purl'] + + items['title'] = title + items['post_time'] = int(time.time()) + items['content'] = content + items['forwardcontent'] = html + items['imgList'] = img_urls + items['imagePath'] = imagePath + items['contentimgs'] = img_map + + items['source'] = 'shop_pot' + items['cid'] = 'shop-pot' + items['dns'] = 'potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion' + items['crawlTime'] = int(time.time()) + items['creation_time'] = items['crawlTime'] + + items['attr'] = { + "appId": "ic", + "attachTag": "", + "crawlDataFlag": f'url:{start_url}', + "project_name": "ic" + } + + items['isDownload'] = True + items['pagetype'] = 'newscontent' + items['type'] = 'newscontent' + items['imagePathSize'] = [] + items['version'] = 2 + + kafkaProduce(KAFKA_TOPIC["post"], dict(items)) + Loggings.info(f'Send to kfk OK!') + + yield items # 将items提交到管道中 diff --git a/dark_net_crawler/utils/file_util.py b/dark_net_crawler/utils/file_util.py new file mode 100644 index 0000000..3bd3806 --- /dev/null +++ b/dark_net_crawler/utils/file_util.py @@ -0,0 +1,28 @@ +import logging + +import requests + +proxy = { + "http": "127.0.0.1:19050" +} + + +def download_file(url, file_name): + r = requests.get(url, stream=True, proxies=proxy) + chunk_size = 1000000 + with open(file_name, 'wb') as fd: + for chunk in r.iter_content(chunk_size): + fd.write(chunk) + + +def save_html(html, file_name): + with open(file_name, 'wb') as fd: + fd.write(html) + + +if __name__ == '__main__': + img_part_url = '/assets/img/widerspruch-nds/wellbrock-warweg-ungar.jpg' + img_url = 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion' + img_part_url + file_name = '/opt/pics/news_denkbares/%s' % (img_part_url.split('/')[-1]) + download_file(img_url, file_name) + logging.info(f'ok!') diff --git a/dark_net_crawler/utils/gofast_util.py b/dark_net_crawler/utils/gofast_util.py new file mode 100644 index 0000000..3ee8d5d --- /dev/null +++ b/dark_net_crawler/utils/gofast_util.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import json + +import requests + +from dark_net_crawler.utils.logger_tool import Loggings + +url = 'http://172.18.1.130:8896/group17/upload' + + +def upload_to_gofast(file_name): + files = {'file': open(file_name, 'rb')} + options = {'output':'json', 'path':'', 'scene':''} #参阅浏览器上传的选项 + response = requests.post(url, data=options, files=files) + # Loggings.info(response.text) + gofast_path = 
json.loads(response.text)['url'] + Loggings.info(gofast_path) + return gofast_path + + +if __name__ == '__main__': + upload_to_gofast('/opt/pics/news_denkbares/wellbrock-warweg-ungar.jpg') \ No newline at end of file diff --git a/dark_net_crawler/utils/kafka_config.py b/dark_net_crawler/utils/kafka_config.py new file mode 100644 index 0000000..1c34b50 --- /dev/null +++ b/dark_net_crawler/utils/kafka_config.py @@ -0,0 +1,7 @@ +# topic +KAFKA_TOPIC = { + "post": "newsTopicdata", + # "post":"test-yj", +} +KAFKA_ADDRESS = "172.18.1.101:9092,172.18.1.102:9092,172.18.1.104:9092,172.18.1.180:9092,172.18.1.182:9092" +# KAFKA_ADDRESS = "127.0.0.1:9092" diff --git a/dark_net_crawler/utils/kafka_util.py b/dark_net_crawler/utils/kafka_util.py new file mode 100644 index 0000000..286170b --- /dev/null +++ b/dark_net_crawler/utils/kafka_util.py @@ -0,0 +1,28 @@ +# coding=utf-8 +import json + +from kafka import KafkaProducer + +from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS +from dark_net_crawler.utils.logger_tool import Loggings + +""" +写到kafka +""" + + +def kafkaProduce(topic, data): + try: + producer = KafkaProducer(bootstrap_servers = '{}'.format(KAFKA_ADDRESS), + value_serializer = lambda m: json.dumps(m, ensure_ascii = False).encode('utf-8')) + producer.send(topic, data) + Loggings.info(f'Send to kfk OK! topic:{topic}, data:{data}') + producer.flush() + except Exception as e: + Loggings.warning(f"kafka断开连接{e}") + + +if __name__ == '__main__': + resultData = ["111111111111111111111", "222222222"] + kafkaProduce(KAFKA_TOPIC["post"], resultData) + Loggings.info(f'Send to kfk OK!') diff --git a/dark_net_crawler/utils/logger_tool.py b/dark_net_crawler/utils/logger_tool.py new file mode 100644 index 0000000..4200ecb --- /dev/null +++ b/dark_net_crawler/utils/logger_tool.py @@ -0,0 +1,51 @@ +import os +import time +from loguru import logger +from pathlib import Path + + +class Loggings: + __instance = None + # project_path = Path.cwd().parent + project_path = Path.cwd() + log_path = Path(os.path.join(project_path), "dark-download-logs") + if not os.path.exists(log_path): + os.mkdir(log_path) + + logger.add(str(log_path) + "/{time:YYYY-MM-DD}.log", format="{time:YYYY-MM-DD A HH:mm:ss.SSSS} | {level} | {name} | {message}", level="DEBUG", + rotation="00:00", encoding="utf-8", enqueue=True, retention="7 days", backtrace=True, diagnose=True) + + def __new__(cls, *args, **kwargs): + if not cls.__instance: + cls.__instance = super(Loggings, cls).__new__(cls, *args, **kwargs) + + return cls.__instance + + @classmethod + def info(self, msg): + return logger.info(msg) + + @classmethod + def debug(self, msg): + return logger.debug(msg) + + @classmethod + def warning(self, msg): + return logger.warning(msg) + + @classmethod + def error(self, msg): + return logger.error(msg) + + +logging = Loggings() +if __name__ == '__main__': + logging.info("info") +# loggings.debug("debug") +# loggings.warning("warning") +# loggings.error("error") + +# 使用说明 +# from logger_tool import Loggings +# +# Loggings.error(f"{post_url}详情请求失败:{e}") \ No newline at end of file diff --git a/entrypoint.py b/entrypoint.py new file mode 100644 index 0000000..f7d8406 --- /dev/null +++ b/entrypoint.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +""" +启动入口类 +""" + +from scrapy.cmdline import execute + +if __name__ == '__main__': + # news_denkbares新闻网站采集启动 + execute(['scrapy', 'crawl', 'news_denkbares']) + + # shop_pot 毒品网站采集启动 + # execute(['scrapy', 'crawl', 'shop_pot']) diff --git a/img_1.png b/img_1.png new file mode 100644 
index 0000000..f462a4b Binary files /dev/null and b/img_1.png differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..73a0260 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,49 @@ +aiocontextvars==0.2.2 +attrs==24.2.0 +Automat==20.2.0 +certifi==2020.12.5 +cffi==1.17.1 +chardet==4.0.0 +constantly==15.1.0 +contextvars==2.4 +cryptography==43.0.1 +cssselect==1.2.0 +defusedxml==0.7.1 +filelock==3.16.1 +h2==3.2.0 +hpack==3.0.0 +hyperframe==5.2.0 +hyperlink==21.0.0 +idna==2.10 +immutables==0.19 +incremental==24.7.0 +itemadapter==0.9.0 +itemloaders==1.3.2 +jmespath==1.0.1 +kafka-python==2.0.2 +loguru==0.7.2 +lxml==5.3.0 +packaging==24.1 +parsel==1.9.1 +priority==1.3.0 +Protego==0.1.16 +pyasn1==0.4.8 +pyasn1-modules==0.2.7 +pycparser==2.22 +PyDispatcher==2.0.5 +PyHamcrest==2.0.2 +pyOpenSSL==24.2.1 +queuelib==1.5.0 +requests==2.25.1 +requests-file==2.1.0 +schedule==1.1.0 +Scrapy==2.11.2 +service-identity==18.1.0 +six==1.16.0 +tldextract==5.1.2 +tomli==2.0.2 +Twisted==24.7.0 +typing_extensions==4.2.0 +urllib3==1.26.4 +w3lib==2.2.1 +zope.interface==7.1.0 diff --git a/scheduled_run.py b/scheduled_run.py new file mode 100644 index 0000000..f1c6fdb --- /dev/null +++ b/scheduled_run.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import schedule +import time +from scrapy.cmdline import execute + + +def crawl_1_news(): + # news_denkbares新闻网站采集启动 + execute(['scrapy', 'crawl', 'news_denkbares']) + + +def crawl_2_pot(): + # shop_pot 毒品网站采集启动 + execute(['scrapy', 'crawl', 'shop_pot']) + + +# 每周五凌晨五点运行程序 +schedule.every().friday.at('5:00').do(crawl_1_news) +schedule.every().friday.at('7:00').do(crawl_2_pot) + +while True: + schedule.run_pending() + time.sleep(1) \ No newline at end of file diff --git a/scrapy.cfg b/scrapy.cfg new file mode 100644 index 0000000..68dd97d --- /dev/null +++ b/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = dark_net_crawler.settings + +[deploy] +#url = http://localhost:6800/ +project = dark_net_crawler
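scheduled_run.py drives both spiders from one long-running process, but scrapy.cmdline.execute() exits the process once its crawl finishes, so the loop would not reach the second job. A sketch of the same Friday schedule that launches each crawl in its own subprocess instead (spider names and times taken from scheduled_run.py, written as zero-padded HH:MM):

```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Keeps the scheduler process alive by running each crawl in a subprocess;
# every crawl gets a fresh process, so the Twisted reactor starts cleanly.
import subprocess
import time

import schedule


def run_spider(name):
    subprocess.run(['scrapy', 'crawl', name], check=False)


# Every Friday: news site at 05:00, marketplace site at 07:00.
schedule.every().friday.at('05:00').do(run_spider, 'news_denkbares')
schedule.every().friday.at('07:00').do(run_spider, 'shop_pot')

while True:
    schedule.run_pending()
    time.sleep(1)
```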