telegram 群组监控 / 群组功能
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

218 lines
7.7 KiB

8 months ago
8 months ago
8 months ago
  1. # code=utf-8
  2. import os, sys
  3. root_path = os.path.abspath(os.path.dirname(__file__)).split('telegram_crawler')[0] + "telegram_crawler"
  4. sys.path.append(root_path)
  5. from utils.MysqlData import MysqlPoolClient
  6. from utils.push_kafka import SKafka
  7. from telethon import TelegramClient, events
  8. import json
  9. from telethon.sessions import StringSession
  10. from telethon.tl.types import User
  11. import time
  12. from config import logger, SESSION_STRING, API_ID, API_HASH, PROXY, TOPIC_ADDRESS, TOPIC_DIC, CRAWLER_DB_CONF, \
  13. TG_ROBOT_ACCOUNT_TABLE
  14. import telethon, asyncio
  15. from tg_utils.tg_api import *
  16. from utils.upload_files import upload_file
  17. """
  18. nohup python receive_message.py > rece.log 2>&1 &
  19. """
  20. def get_group(api_id):
  21. """
  22. TODO:sql读取
  23. :return:
  24. """
  25. with open("group_ids.json", 'r') as f:
  26. chat_names = json.load(f)
  27. group_ids = [group["group_id"] for group in chat_names if group["group_id"] and not group["is_group"]]
  28. # 目前是 退群了仍旧会被监控到? 采用黑名单 下列群组不再接受监控 :(
  29. group_ids = [
  30. -1001440842385, -1001121481396, -1001643896726,
  31. -1001156230593, -1001964495180, -1001510876777,
  32. -1001878335287, -1001360128984, -1001461057234,
  33. -1001144411934, -1001790944354, -1001759695404,
  34. -1001643896726, -1002001492389
  35. ]
  36. logger.success(f"load group lengths {len(group_ids) if group_ids else 0}")
  37. return group_ids
  38. def push_kafka(kp, data, topic):
  39. """
  40. kafka推送
  41. :param kp:
  42. :param data:
  43. :param topic:
  44. :return:
  45. """
  46. try:
  47. if isinstance(data, list):
  48. kp.sync_producer(topic, data)
  49. else:
  50. kp.sync_producer(topic, [data])
  51. except Exception as e:
  52. logger.error(f"数据推送失败:{data}")
  53. logger.error(f"kafka error: {e}")
  54. async def update_filename(message, media, file="resources"):
  55. """
  56. :param message:
  57. :param media:
  58. :param file:
  59. :return:
  60. """
  61. file_type = telethon.utils.get_extension(media)
  62. logger.info(f"捕获文件类型:{file_type}")
  63. if "." in file_type: # 修改文件名称
  64. time_stamp = str(int(time.time()))
  65. file = f"{file}/{time_stamp}_{message.id}{file_type}"
  66. filename = await download_media(message, file=file)
  67. upload_path = await upload_file(filename) # 上传文件
  68. return upload_path
  69. else:
  70. filename = await download_media(message)
  71. upload_path = await upload_file(filename) # 上传文件
  72. return upload_path
  73. async def download_resources(message, media, client, sender, file="resources"):
  74. """
  75. :param message:
  76. :param media:
  77. :param client:
  78. :param sender:
  79. :param file:
  80. :return:
  81. """
  82. new_file_name = None # 对于没有媒体类的消息 直接不下载了
  83. if media:
  84. file_type = telethon.utils.get_extension(media)
  85. # logger.info(f"捕获文件类型:{file_type}")
  86. if "." in file_type: # 能识别出来类型的 修改文件名称
  87. time_stamp = str(int(time.time()))
  88. new_file_name = f"{file}/{time_stamp}_{message.id}{file_type}"
  89. else:
  90. new_file_name = file
  91. # 下载telegram 媒体类文件
  92. media_path, photo_path = await asyncio.gather(
  93. download_media(message, file=new_file_name), # 媒体文件下载
  94. download_profile_photo(client, sender) # 下载头像
  95. )
  96. logger.info(f"下载媒体文件路径:{media_path} ; 下载头像文件路径:{photo_path} ;")
  97. # 上传go-fast
  98. upload_media_path, upload_photo_path = await asyncio.gather(
  99. upload_file(media_path),
  100. upload_file(photo_path)
  101. )
  102. logger.info(f"go-fast媒体文件路径:{upload_media_path} ; go-fast头像路径:{upload_photo_path} ;")
  103. return upload_media_path, upload_photo_path
  104. async def main(session_string, api_id, api_hash, kp=None, message_topic=""):
  105. session = StringSession(session_string)
  106. logger.info(f"客户端 {api_id} 启动!!!!")
  107. client = TelegramClient(session, api_id, api_hash, timeout=60, proxy=PROXY)
  108. await client.start()
  109. @client.on(events.NewMessage(chats=get_group(api_id), incoming=True, blacklist_chats=True))
  110. async def handler(event):
  111. """
  112. chats控制
  113. :param event:
  114. :return:
  115. """
  116. sender = await event.get_sender() # 群组接受的为user 频道则接受的为频道
  117. chat = await event.get_chat()
  118. # if not sender or (hasattr(sender, "bot") and sender.bot): # 群组机器人过滤
  119. # logger.info(f"过滤机器人测试: {sender}")
  120. # return
  121. if not sender: # 没有sender可能也有消息
  122. logger.info(f" 不存在发送者 群组异常: {chat}")
  123. # logger.info(f"meesages: {event}")
  124. # logger.info(f"messages: {event.message}")
  125. sender = chat # 一般此时为channel
  126. if isinstance(sender, User):
  127. sender_name = str(sender.first_name) + ' ' + str(sender.last_name)
  128. else: # 如果是频道的话 发送者是以频道为对象发送
  129. sender_name = sender.title
  130. message = event.message
  131. media = event.media
  132. other_link = await get_extra_linked(message) # 获得超链接
  133. file_list, user_photo = await download_resources(message, media, client, sender)
  134. message_text = message.message
  135. date = message.date
  136. replay = message.reply_to # MessageReplyHeader 对象
  137. data = {
  138. "group_title": chat.title,
  139. "group_id": -1000000000000 - chat.id,
  140. "username": str(chat.username),
  141. "sender_name": sender_name,
  142. "sender_id": str(sender.id),
  143. "media": file_list,
  144. "sender_photo": user_photo,
  145. "message_id": message.id,
  146. "reply_to_msg_id": replay.reply_to_msg_id if replay else None, # 这个字段是针对某一条聊天记录的回复 *****
  147. "reply_to_top_id": replay.reply_to_top_id if replay else None, # 这个字段是针对频道内的讨论组 针对的某一条聊天内容发起的讨论
  148. "message_text": message_text,
  149. "other_link": other_link, # 聊天过程中的超链接 以及对应的位置
  150. "datetime": date.strftime("%Y-%m-%d %H:%M:%S")
  151. }
  152. logger.debug(f"client: {api_id} ; data: {data}")
  153. # logger.debug(f"message: {message}")
  154. push_kafka(kp, json.dumps(data, ensure_ascii=False), message_topic)
  155. if client:
  156. logger.info("client start")
  157. await client.run_until_disconnected()
  158. async def mul_account():
  159. client_mysql = MysqlPoolClient(CRAWLER_DB_CONF)
  160. sql = f"select * from {TG_ROBOT_ACCOUNT_TABLE} where api_id!='28340634' order by update_time"
  161. results = client_mysql.getAll(sql)
  162. kp = SKafka(bootstrap_servers=TOPIC_ADDRESS)
  163. message_topic = TOPIC_DIC["testtelegram"]
  164. logger.info(f"链接 kafka {TOPIC_ADDRESS}:{message_topic} 成功")
  165. tasks = []
  166. for i in results:
  167. api_id = i["api_id"]
  168. api_hash = i["api_hash"]
  169. session_string = i["session_string"]
  170. logger.info(i)
  171. tasks.append(main(session_string, api_id, api_hash, kp, message_topic))
  172. logger.info(f"获得任务数量 {len(tasks)}")
  173. await asyncio.gather(*tasks)
  174. if __name__ == '__main__':
  175. # main()
  176. loop = asyncio.get_event_loop()
  177. try:
  178. loop.run_until_complete(mul_account())
  179. except Exception as e:
  180. print(f"An error occurred: {e}")
  181. # mul_account()
  182. # pass