
Commit 35d83546d1 ("initial") on branch master, by steve.gao, 6 months ago
  1. README.md (+96)
  2. dark_net_crawler/__init__.py (+0)
  3. dark_net_crawler/items.py (+40)
  4. dark_net_crawler/middlewares.py (+164)
  5. dark_net_crawler/pipelines.py (+38)
  6. dark_net_crawler/settings.py (+125)
  7. dark_net_crawler/spiders/__init__.py (+4)
  8. dark_net_crawler/spiders/news_denkbares.py (+122)
  9. dark_net_crawler/spiders/shop_pot.py (+139)
  10. dark_net_crawler/utils/file_util.py (+28)
  11. dark_net_crawler/utils/gofast_util.py (+24)
  12. dark_net_crawler/utils/kafka_config.py (+7)
  13. dark_net_crawler/utils/kafka_util.py (+28)
  14. dark_net_crawler/utils/logger_tool.py (+51)
  15. entrypoint.py (+13)
  16. img_1.png (BIN)
  17. requirements.txt (+49)
  18. scheduled_run.py (+25)
  19. scrapy.cfg (+11)

96
README.md

@@ -0,0 +1,96 @@
# dark_net_crawler
Dark-web collection (new version)
—— the data fields pushed to the collection platform are the same as for news
1. News site:
http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/
2. Shop site:
http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product
Deployment notes:
1. On the production machine 47.252.23.168 (outbound access already set up), the following are deployed:
(1) Project path:
/opt/crawl/dark_net/dark_net_crawler
Conda environment:
conda activate pdf_crawler_py3.8
Python version: 3.8
(2) The Tor service used as the dark-web crawling proxy ------- proxy address: socks5h://localhost:9050
CentOS + Tor + Privoxy setup (Tor only works with outbound access):
sudo yum install epel-release    # otherwise the tor package has no source
sudo yum install tor
service tor start     # start the service
service tor status    # check the service status
(3) Privoxy: converts the SOCKS5 proxy into an HTTP proxy: http://172.18.1.103:19050 (the proxy address actually used by the crawler is set in settings.py)
sudo yum install privoxy    # install
Edit the Privoxy config file to chain Tor and Privoxy:
vim /etc/privoxy/config
Change listen-address to 0.0.0.0:19050
Search for forward-socks5t and remove the comment marker
Save and quit (ESC, :wq)
Start Privoxy:
service privoxy start
Check the status with service privoxy status; the service setup is then complete.
Once the proxy is configured, verify that it works:
curl -x http://172.18.1.103:19050 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/'
curl -x http://172.18.1.103:19050 'http://httpbin.org/ip'
curl -x socks5h://localhost:9050 'http://httpbin.org/ip'
curl 'http://httpbin.org/ip'
![img_1.png](img_1.png)
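The same check can be scripted. A minimal sketch, assuming requests (with the PySocks extra for the socks5h entry) is installed on the machine where Tor and Privoxy are running:

```python
# Proxy sanity check (a sketch; assumes requests + PySocks are installed and
# that Tor (localhost:9050) and Privoxy (172.18.1.103:19050) are running).
import requests

CHECKS = {
    "privoxy (http)": {"http": "http://172.18.1.103:19050"},
    "tor (socks5h)": {"http": "socks5h://localhost:9050",
                      "https": "socks5h://localhost:9050"},
    "direct": None,
}

for name, proxies in CHECKS.items():
    try:
        resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=60)
        print(f"{name}: exit IP {resp.json()['origin']}")
    except requests.RequestException as exc:
        print(f"{name}: failed ({exc})")
```

The direct request and the proxied requests should report different exit IPs when the chain is working.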
2. One-off run (local or production): python entrypoint.py
Install dependencies: pip install -r requirements.txt (Python 3.8)
Scheduled collection on production:
conda activate pdf-crawler    (enter the conda environment)
python scheduled_run.py       (collects once every Friday)
3. (1) Kafka configuration for the data connection:
dark_net_crawler/utils/kafka_config.py
(2) Data format pushed to the collection platform (same fields as news):
see dark_net_crawler/items.py
To make newly collected data overwrite the old records, increment the version field, e.g. items['version'] = 2 (see the sketch below).
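A minimal sketch of how such a re-send could look, reusing the project's kafkaProduce helper; the resend_with_new_version function is hypothetical and not part of the repository:

```python
# Hypothetical helper (not in the repository): re-send an already collected
# item with a higher 'version' so the platform overwrites the stored record.
from dark_net_crawler.items import DarkNetCrawlerItem
from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC
from dark_net_crawler.utils.kafka_util import kafkaProduce


def resend_with_new_version(item: DarkNetCrawlerItem, new_version: int = 2) -> None:
    # Same news_id + higher version => the platform replaces the old copy.
    item['version'] = new_version
    kafkaProduce(KAFKA_TOPIC["post"], dict(item))
```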
4. Logging -- one log file per day, keeping only the last 7 days (see dark_net_crawler/utils/logger_tool.py), e.g.:
dark-download-logs/2024-07-18.log
5. Collection and parsing logic for the two sites:
dark_net_crawler/spiders/news_denkbares.py
dark_net_crawler/spiders/shop_pot.py

0
dark_net_crawler/__init__.py

40
dark_net_crawler/items.py

@@ -0,0 +1,40 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field


class DarkNetCrawlerItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    source = Field()
    cid = Field()
    dns = Field()
    crawlTime = Field()
    creation_time = Field()
    url = Field()
    news_id = Field()
    author = Field()
    purl = Field()
    title = Field()
    content = Field()
    post_time = Field()
    forwardcontent = Field()
    imagePath = Field()
    imgList = Field()
    contentimgs = Field()
    isDownload = Field()
    pagetype = Field()
    type = Field()
    imagePathSize = Field()
    attr = Field()
    version = Field()

164
dark_net_crawler/middlewares.py

@@ -0,0 +1,164 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random

from scrapy import signals
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter

import dark_net_crawler


class PdfCrawlerSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class PdfCrawlerDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Proxy_Middleware:
    # Attaches a random User-Agent and a random dark-net proxy to every request.

    def __init__(self, crawler):
        self.proxy_list = dark_net_crawler.settings.PROXY_LIST
        self.ua_list = dark_net_crawler.settings.USER_AGENT_LIST

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        try:
            ua = random.choice(self.ua_list)
            request.headers.setdefault('User-Agent', ua)
            proxy_ip_port = random.choice(self.proxy_list)
            request.meta['proxy'] = 'http://' + proxy_ip_port
        except Exception:
            spider.logger.error('failed to set User-Agent/proxy on the request!')


class My_RetryMiddleware(RetryMiddleware):
    # Retries 404 responses with a freshly picked User-Agent and proxy.

    def __init__(self, crawler):
        self.proxy_list = dark_net_crawler.settings.PROXY_LIST
        self.ua_list = dark_net_crawler.settings.USER_AGENT_LIST
        self.retry_http_codes = [404]

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            try:
                ua = random.choice(self.ua_list)
                request.headers.setdefault('User-Agent', ua)
                proxy_ip_port = random.choice(self.proxy_list)
                request.meta['proxy'] = 'http://' + proxy_ip_port
            except Exception:
                spider.logger.error('failed to get a dark-net proxy IP!')
            return self._retry(request, reason, spider) or response
        return response

38
dark_net_crawler/pipelines.py

@@ -0,0 +1,38 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import json

from kafka import KafkaProducer

from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.logger_tool import Loggings


class DarkNetCrawlerPipeline:
    def process_item(self, item, spider):
        return item


class DarkNetCrawlerKfkPipeline:
    def open_spider(self, spider):
        self.producer = KafkaProducer(
            bootstrap_servers=[KAFKA_ADDRESS],  # replace with your Kafka server address
            value_serializer=lambda v: json.dumps(v).encode('utf-8')
        )

    def close_spider(self, spider):
        self.producer.close()

    def process_item(self, item, spider):
        topic = KAFKA_TOPIC["post"]
        data = dict(item)
        self.producer.send(topic, data)  # replace with your Kafka topic
        Loggings.info(f'Send to kfk OK! topic:{topic}, data:{data}')
        return item

125
dark_net_crawler/settings.py

@@ -0,0 +1,125 @@
# Scrapy settings for dark_net_crawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'dark_net_crawler'
SPIDER_MODULES = ['dark_net_crawler.spiders']
NEWSPIDER_MODULE = 'dark_net_crawler.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dark_net_crawler (+http://www.yourdomain.com)'
# User-Agent pool, configured here in settings
USER_AGENT_LIST = [ \
"Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", \
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36", \
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0"
]
PROXY_LIST = [
# 'localhost:9050',
# 'localhost:19050',
'47.252.23.168:19050'
]
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'dark_net_crawler.middlewares.DarkNetCrawlerSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'dark_net_crawler.middlewares.DarkNetCrawlerDownloaderMiddleware': 543,
'dark_net_crawler.middlewares.Proxy_Middleware': 800,
# 'dark_net_crawler.middlewares.My_RetryMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'dark_net_crawler.pipelines.DarkNetCrawlerPipeline': 300,
# 'dark_net_crawler.pipelines.DarkNetCrawlerKfkPipeline': 800,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
max_retry_times = 3

4
dark_net_crawler/spiders/__init__.py

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

122
dark_net_crawler/spiders/news_denkbares.py

@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
"""
Denkbares
"""
import hashlib
import logging
import time

import scrapy
from lxml import etree

from dark_net_crawler.items import DarkNetCrawlerItem
from dark_net_crawler.utils.file_util import download_file
from dark_net_crawler.utils.gofast_util import upload_to_gofast
from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.kafka_util import kafkaProduce
from dark_net_crawler.utils.logger_tool import Loggings

custom_settings = {
    'DOWNLOAD_TIMEOUT': 60,
}

start_url = 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion/'


class NewsDenkbaresSpider(scrapy.Spider):
    name = 'news_denkbares'
    # allowed_domains = ['pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion']

    def start_requests(self):
        urls = [
            start_url,
            # "http://httpbin.org/ip",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_list_page)

    def parse_list_page(self, response):
        # Loggings.info(f'list_page_html:{response.text}')
        li_elements = response.css('ul.post-list > li')
        Loggings.info(f'news_count:{len(li_elements)}')
        for li_element in li_elements:
            detail_page_url = li_element.css('li > h3 > a::attr(href)').get()
            title = li_element.css('li > h3 > a::text').get().replace('\n', '').strip()
            post_time = li_element.css('li > span::text').get()
            Loggings.info(f'title:{title}, \tpost_time:{post_time}, \tdetail_page_url:{detail_page_url}')
            # if 'widersprueche-der-querfront-nachdenkseiten' in detail_page_url:
            yield response.follow(response.urljoin(detail_page_url), callback=self.parse_detail_page,
                                  meta={'title': title, 'post_time': post_time})
            # break

    def parse_detail_page(self, response):
        title = response.meta['title'].replace('\\u', ' ')
        post_time = response.css('time.dt-published::attr(datetime)').get().split('+')[0].replace('T', ' ')
        content = ''.join(response.xpath('//div[@class=\"post-content e-content\"]//./text()').extract())
        html = response.css('div.post-content').get()
        author = response.css('span.p-author::text').get()
        # Images
        img_part_urls = response.xpath('//div[@class=\"post-content e-content\"]/figure/p/img/@src').extract()
        img_urls = []
        imagePath = []
        img_map = {}
        for index, img_part_url in enumerate(img_part_urls):
            img_url = response.urljoin(img_part_url)
            Loggings.info(f'{img_url}')
            img_urls.append(img_url)
            file_name = '/opt/pics/news_denkbares/%s' % (img_part_url.split('/')[-1])
            download_file(img_url, file_name)
            gofast_path = upload_to_gofast(file_name)
            imagePath.append(gofast_path)
            img_map[f"img{index + 1}"] = {
                "img": img_url,
                "uploadImg": gofast_path
            }
        # Loggings.info(f'{img_urls}')
        # Loggings.info(f'{img_map}')
        # Loggings.info(f'{title}\t{post_time}\t{author}\n{img_urls}\n{content}')
        items = DarkNetCrawlerItem()
        items['url'] = response.url
        items['news_id'] = hashlib.md5(items['url'].encode(encoding='UTF-8')).hexdigest()
        items['purl'] = start_url
        items['title'] = title
        items['post_time'] = post_time
        items['content'] = content
        items['author'] = author
        items['forwardcontent'] = html
        items['imgList'] = img_urls
        items['imagePath'] = imagePath
        items['contentimgs'] = img_map
        items['source'] = 'news_denkbares'
        items['cid'] = 'news-denkbares'
        items['dns'] = 'pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion'
        items['crawlTime'] = int(time.time())
        items['creation_time'] = items['crawlTime']
        items['attr'] = {
            "appId": "ic",
            "attachTag": "",
            "crawlDataFlag": f'url:{start_url}',
            "project_name": "ic"
        }
        items['isDownload'] = True
        items['pagetype'] = 'newscontent'
        items['type'] = 'newscontent'
        items['imagePathSize'] = []
        kafkaProduce(KAFKA_TOPIC["post"], dict(items))
        Loggings.info(f'Send to kfk OK!')
        yield items  # hand the item off to the pipelines

139
dark_net_crawler/spiders/shop_pot.py

@@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
"""
Pot shop
"""
import hashlib
import logging
import time

import scrapy

from dark_net_crawler.items import DarkNetCrawlerItem
from dark_net_crawler.utils.file_util import download_file
from dark_net_crawler.utils.gofast_util import upload_to_gofast
from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.kafka_util import kafkaProduce
from dark_net_crawler.utils.logger_tool import Loggings

custom_settings = {
    'DOWNLOAD_TIMEOUT': 60,
}

start_url = 'http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product'
shop_pot_list_url = 'http://potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion/?post_type=product&paged=%s'


class ShopPotSpider(scrapy.Spider):
    name = 'shop_pot'
    # allowed_domains = ['pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion']

    def start_requests(self):
        total_page = 1
        for page_no in range(1, total_page + 1):
            turn_page_url = shop_pot_list_url % ('%d' % page_no)
            # Loggings.info(turn_page_url)
            yield scrapy.Request(turn_page_url, callback=self.parse_list_page, dont_filter=True)
            # break

    def parse_list_page(self, response):
        # Loggings.info(f'list_page_html:{response.text}')
        li_elements = response.css('ul.products > li')
        Loggings.info(f'shop_pot_count:{len(li_elements)}')
        for li_element in li_elements:
            detail_page_url = li_element.css('li > a::attr(href)').get()
            Loggings.info(detail_page_url)
            # if 'widersprueche-der-querfront-nachdenkseiten' in detail_page_url:
            # if 'product=afghan-skunk' in detail_page_url:
            # if 'product=atomic-5g-new' in detail_page_url:
            yield response.follow(detail_page_url, callback=self.parse_detail_page, meta={'purl': response.url})
            # break

    def parse_detail_page(self, response):
        current_price = response.xpath('//ins/span[@class=\"woocommerce-Price-amount amount\"]/bdi/text()').get()
        if current_price is None:
            current_price = response.xpath('//span[@class=\"woocommerce-Price-amount amount\"]/bdi/text()').get()
        title = response.css('h1.product_title::text').get() \
            + ' $%s' % current_price
        content = str(response.css('div.woocommerce-product-details__short-description > p::text').get()).replace('None', '') \
            + '\n' \
            + str(response.css('span.sku_wrapper::text').get()).replace('None', '') + str(response.css('span.sku::text').get()).replace('None', '') \
            + '\t' \
            + str(response.css('span.posted_in::text').get()).replace('None', '') + str(response.css('span.posted_in > a::text').get()).replace('None', '') \
            + '\n' \
            + ''.join(response.css('#tab-description > h2::text').get()) \
            + '\n' \
            + '\n'.join(response.css('#tab-description > p::text').extract())
        logging.info(title)
        logging.info(content)
        html = ''.join(response.css('#main > div > div').getall())
        # logging.info(html)
        # Images
        img_urls = response.xpath('//ol[@class=\"flex-control-nav flex-control-thumbs\"]/li/img/@src').extract()
        if len(img_urls) == 0:
            img_urls.append(response.xpath('//img[@class=\"wp-post-image\"]/@src').get())
        imagePath = []
        img_map = {}
        for index, img_url in enumerate(img_urls):
            Loggings.info(f'{img_url}')
            file_name = '/opt/pics/news_denkbares/%s' % (hashlib.md5(img_url.encode(encoding='UTF-8')).hexdigest())
            download_file(img_url, file_name)
            gofast_path = upload_to_gofast(file_name)
            imagePath.append(gofast_path)
            img_map[f"img{index + 1}"] = {
                "img": img_url,
                "uploadImg": gofast_path
            }
        Loggings.info(f'{img_urls}')
        Loggings.info(f'{img_map}')
        # Loggings.info(f'{title}\t{post_time}\t{author}\n{img_urls}\n{content}')
        items = DarkNetCrawlerItem()
        items['url'] = response.url
        items['news_id'] = hashlib.md5(items['url'].encode(encoding='UTF-8')).hexdigest()
        items['purl'] = response.meta['purl']
        items['title'] = title
        items['post_time'] = int(time.time())
        items['content'] = content
        items['forwardcontent'] = html
        items['imgList'] = img_urls
        items['imagePath'] = imagePath
        items['contentimgs'] = img_map
        items['source'] = 'shop_pot'
        items['cid'] = 'shop-pot'
        items['dns'] = 'potshopk4eov76aciyranqyq2r3mszuvfisvneytodfxo56ubha7doqd.onion'
        items['crawlTime'] = int(time.time())
        items['creation_time'] = items['crawlTime']
        items['attr'] = {
            "appId": "ic",
            "attachTag": "",
            "crawlDataFlag": f'url:{start_url}',
            "project_name": "ic"
        }
        items['isDownload'] = True
        items['pagetype'] = 'newscontent'
        items['type'] = 'newscontent'
        items['imagePathSize'] = []
        items['version'] = 2
        kafkaProduce(KAFKA_TOPIC["post"], dict(items))
        Loggings.info(f'Send to kfk OK!')
        yield items  # hand the item off to the pipelines

28
dark_net_crawler/utils/file_util.py

@@ -0,0 +1,28 @@
import logging

import requests

proxy = {
    "http": "127.0.0.1:19050"
}


def download_file(url, file_name):
    r = requests.get(url, stream=True, proxies=proxy)
    chunk_size = 1000000
    with open(file_name, 'wb') as fd:
        for chunk in r.iter_content(chunk_size):
            fd.write(chunk)


def save_html(html, file_name):
    with open(file_name, 'wb') as fd:
        fd.write(html)


if __name__ == '__main__':
    img_part_url = '/assets/img/widerspruch-nds/wellbrock-warweg-ungar.jpg'
    img_url = 'http://pr3ygifxd23xu43be2fegjjsk5jlb22q2va2h5apz76ejbvammeclkid.onion' + img_part_url
    file_name = '/opt/pics/news_denkbares/%s' % (img_part_url.split('/')[-1])
    download_file(img_url, file_name)
    logging.info(f'ok!')

24
dark_net_crawler/utils/gofast_util.py

@@ -0,0 +1,24 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json

import requests

from dark_net_crawler.utils.logger_tool import Loggings

url = 'http://172.18.1.130:8896/group17/upload'


def upload_to_gofast(file_name):
    with open(file_name, 'rb') as f:
        files = {'file': f}
        options = {'output': 'json', 'path': '', 'scene': ''}  # see the options used by the browser upload form
        response = requests.post(url, data=options, files=files)
    # Loggings.info(response.text)
    gofast_path = json.loads(response.text)['url']
    Loggings.info(gofast_path)
    return gofast_path


if __name__ == '__main__':
    upload_to_gofast('/opt/pics/news_denkbares/wellbrock-warweg-ungar.jpg')

7
dark_net_crawler/utils/kafka_config.py

@@ -0,0 +1,7 @@
# topic
KAFKA_TOPIC = {
    "post": "newsTopicdata",
    # "post": "test-yj",
}
KAFKA_ADDRESS = "172.18.1.101:9092,172.18.1.102:9092,172.18.1.104:9092,172.18.1.180:9092,172.18.1.182:9092"
# KAFKA_ADDRESS = "127.0.0.1:9092"

28
dark_net_crawler/utils/kafka_util.py

@@ -0,0 +1,28 @@
# coding=utf-8
import json

from kafka import KafkaProducer

from dark_net_crawler.utils.kafka_config import KAFKA_TOPIC, KAFKA_ADDRESS
from dark_net_crawler.utils.logger_tool import Loggings

"""
kafka
"""


def kafkaProduce(topic, data):
    try:
        producer = KafkaProducer(bootstrap_servers='{}'.format(KAFKA_ADDRESS),
                                 value_serializer=lambda m: json.dumps(m, ensure_ascii=False).encode('utf-8'))
        producer.send(topic, data)
        Loggings.info(f'Send to kfk OK! topic:{topic}, data:{data}')
        producer.flush()
    except Exception as e:
        Loggings.warning(f"Kafka connection failed: {e}")


if __name__ == '__main__':
    resultData = ["111111111111111111111", "222222222"]
    kafkaProduce(KAFKA_TOPIC["post"], resultData)
    Loggings.info(f'Send to kfk OK!')

51
dark_net_crawler/utils/logger_tool.py

@@ -0,0 +1,51 @@
import os
import time

from loguru import logger
from pathlib import Path


class Loggings:
    __instance = None
    # project_path = Path.cwd().parent
    project_path = Path.cwd()
    log_path = Path(os.path.join(project_path), "dark-download-logs")
    if not os.path.exists(log_path):
        os.mkdir(log_path)
    logger.add(str(log_path) + "/{time:YYYY-MM-DD}.log",
               format="{time:YYYY-MM-DD A HH:mm:ss.SSSS} | {level} | {name} | {message}", level="DEBUG",
               rotation="00:00", encoding="utf-8", enqueue=True, retention="7 days", backtrace=True, diagnose=True)

    def __new__(cls, *args, **kwargs):
        if not cls.__instance:
            cls.__instance = super(Loggings, cls).__new__(cls, *args, **kwargs)
        return cls.__instance

    @classmethod
    def info(cls, msg):
        return logger.info(msg)

    @classmethod
    def debug(cls, msg):
        return logger.debug(msg)

    @classmethod
    def warning(cls, msg):
        return logger.warning(msg)

    @classmethod
    def error(cls, msg):
        return logger.error(msg)


logging = Loggings()

if __name__ == '__main__':
    logging.info("info")
    # loggings.debug("debug")
    # loggings.warning("warning")
    # loggings.error("error")

# Usage:
# from logger_tool import Loggings
#
# Loggings.error(f"{post_url} detail request failed: {e}")

13
entrypoint.py

@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
"""
"""
from scrapy.cmdline import execute

if __name__ == '__main__':
    # start the news_denkbares news-site crawl
    execute(['scrapy', 'crawl', 'news_denkbares'])
    # start the shop_pot drug-shop crawl
    # execute(['scrapy', 'crawl', 'shop_pot'])

BIN
img_1.png


Width: 1449  |  Height: 661  |  Size: 660 KiB

49
requirements.txt

@@ -0,0 +1,49 @@
aiocontextvars==0.2.2
attrs==24.2.0
Automat==20.2.0
certifi==2020.12.5
cffi==1.17.1
chardet==4.0.0
constantly==15.1.0
contextvars==2.4
cryptography==43.0.1
cssselect==1.2.0
defusedxml==0.7.1
filelock==3.16.1
h2==3.2.0
hpack==3.0.0
hyperframe==5.2.0
hyperlink==21.0.0
idna==2.10
immutables==0.19
incremental==24.7.0
itemadapter==0.9.0
itemloaders==1.3.2
jmespath==1.0.1
kafka-python==2.0.2
loguru==0.7.2
lxml==5.3.0
packaging==24.1
parsel==1.9.1
priority==1.3.0
Protego==0.1.16
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycparser==2.22
PyDispatcher==2.0.5
PyHamcrest==2.0.2
pyOpenSSL==24.2.1
queuelib==1.5.0
requests==2.25.1
requests-file==2.1.0
schedule==1.1.0
Scrapy==2.11.2
service-identity==18.1.0
six==1.16.0
tldextract==5.1.2
tomli==2.0.2
Twisted==24.7.0
typing_extensions==4.2.0
urllib3==1.26.4
w3lib==2.2.1
zope.interface==7.1.0

25
scheduled_run.py

@@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import schedule
import time

from scrapy.cmdline import execute


def crawl_1_news():
    # start the news_denkbares news-site crawl
    execute(['scrapy', 'crawl', 'news_denkbares'])


def crawl_2_pot():
    # start the shop_pot drug-shop crawl
    execute(['scrapy', 'crawl', 'shop_pot'])


# Run every Friday: the news site at 05:00, the shop site at 07:00
schedule.every().friday.at('05:00').do(crawl_1_news)
schedule.every().friday.at('07:00').do(crawl_2_pot)

while True:
    schedule.run_pending()
    time.sleep(1)
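Note that scrapy.cmdline.execute() calls sys.exit() once the crawl finishes, so the loop above only survives until the first scheduled job fires. A minimal alternative sketch (not part of the repository) that launches each crawl as a child process so the scheduler keeps running:

```python
#!/usr/bin/env python
# Alternative scheduler sketch (not in the repository): run each crawl in a
# child process so scrapy's sys.exit() cannot terminate the scheduler loop.
import subprocess
import time

import schedule


def crawl_1_news():
    subprocess.run(['scrapy', 'crawl', 'news_denkbares'], check=False)


def crawl_2_pot():
    subprocess.run(['scrapy', 'crawl', 'shop_pot'], check=False)


schedule.every().friday.at('05:00').do(crawl_1_news)
schedule.every().friday.at('07:00').do(crawl_2_pot)

while True:
    schedule.run_pending()
    time.sleep(1)
```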

11
scrapy.cfg

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = dark_net_crawler.settings
[deploy]
#url = http://localhost:6800/
project = dark_net_crawler