话题水军识别应用
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

222 lines
9.9 KiB

  1. #coding:utf8
  2. import os, sys
  3. import io
  4. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
  5. cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
  6. par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
  7. sys.path.append(cur_dir)
  8. sys.path.append(par_dir)
  9. import json
  10. from django.http import HttpResponse
  11. from text_analysis.tools import to_kafka,tool
  12. from django.views.decorators.csrf import csrf_exempt
  13. from log_util.set_logger import set_logger
  14. logging=set_logger('logs/results.log')
  15. import traceback
  16. import queue
  17. import requests
  18. import time
  19. from datetime import datetime, timedelta
  20. import os
  21. import joblib
  22. #任务队列
  23. global task_queue
  24. task_queue = queue.Queue()
  25. global replyGraph
  26. replyGraph={}
  27. @csrf_exempt
  28. def robotIdentificationTopic(request):
  29. if request.method == 'POST':
  30. try:
  31. raw_data = json.loads(request.body)
  32. task_queue.put(raw_data)
  33. return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
  34. except:
  35. logging.error(traceback.format_exc())
  36. return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
  37. else:
  38. return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
  39. def predictTopic():
  40. while True:
  41. if task_queue.qsize() >0:
  42. try:
  43. logging.info("取任务队列长度{}".format(task_queue.qsize()))
  44. raw_data = task_queue.get()
  45. # 识别结果返回值
  46. recognition_code = "0"
  47. logging.info("原始数据-{}".format(raw_data))
  48. # 用户数据
  49. res = {"successCode": "1", "errorLog": "", "results": {}}
  50. # 获取历史数据源
  51. all_result = raw_data['data']
  52. user_data = []
  53. data=raw_data["metadata"]["admin"]
  54. # {"user_file": "9_获取用户信息", "post_file": "10_获取用户发帖信息"}
  55. user_file_result = json.loads(all_result[data['user_file']])
  56. data['user_file'] = user_file_result
  57. logging.info('用户数据:{}'.format(data['user_file']))
  58. post_file_result = json.loads(all_result[data['post_file']])
  59. if post_file_result['resultList']:
  60. data['post_file'] = post_file_result['resultList'][0]
  61. logging.info('帖子数据:{}'.format(data['post_file']))
  62. else:
  63. data['post_file'] = {}
  64. try:
  65. user_data_otherInfo_1 = 0 if data["user_file"]["otherInfo"].strip() == "" else 1
  66. except:
  67. user_data_otherInfo_1 = 0
  68. try:
  69. user_data_nickName_2 = 0 if data["user_file"]["nickName"].strip() == "" else 1
  70. except:
  71. user_data_nickName_2 = 0
  72. try:
  73. user_data_likeCount_4 = int(data["user_file"]["likeCount"])
  74. except:
  75. user_data_likeCount_4 = 0
  76. try:
  77. user_data_postCount_5 = int(data["user_file"]["postCount"])
  78. except:
  79. user_data_postCount_5 = 0
  80. try:
  81. user_data_authentication_6 = int(data["user_file"]["authentication"])
  82. except:
  83. user_data_authentication_6 = 0
  84. user_data.extend(
  85. [user_data_otherInfo_1, user_data_nickName_2, user_data_likeCount_4,user_data_postCount_5, user_data_authentication_6])
  86. logging.info("用户数据处理完毕!-{}".format(user_data))
  87. # 帖子数据
  88. post_data = []
  89. if data["post_file"]=={}:
  90. post_data=[0,0,0,0,0,0,0,0]
  91. else:
  92. try:
  93. post_data_LikeCount_1 = int(data["post_file"]["LikeCount"])
  94. except:
  95. post_data_LikeCount_1 = 0
  96. try:
  97. post_data_ShareCount_2 = int(data["post_file"]["ShareCount"])
  98. except:
  99. post_data_ShareCount_2 = 0
  100. try:
  101. post_data_emotionCount_3 = int(data["post_file"]["emotionCount"])
  102. except:
  103. post_data_emotionCount_3 = 0
  104. try:
  105. post_data_CommentsCount_4 = int(data["post_file"]["CommentsCount"])
  106. except:
  107. post_data_CommentsCount_4 = 0
  108. try:
  109. post_data_length_5 = int(data["post_file"]["length"])
  110. except:
  111. post_data_length_5 = 0
  112. try:
  113. post_data_tags_6 = int(data["post_file"]["tags"])
  114. except:
  115. post_data_tags_6 = 0
  116. try:
  117. post_data_https_7 = int(data["post_file"]["https"])
  118. except:
  119. post_data_https_7 = 0
  120. try:
  121. post_data_diffdate_8 = int(data["post_file"]["diffdate"])
  122. except:
  123. post_data_diffdate_8 = 0
  124. post_data.extend(
  125. [post_data_LikeCount_1, post_data_ShareCount_2, post_data_emotionCount_3, post_data_CommentsCount_4,
  126. post_data_length_5, post_data_tags_6, post_data_https_7, post_data_diffdate_8])
  127. logging.info("帖子数据处理完毕!-{}".format(post_data))
  128. #关系数据
  129. reply_data_1 = [0, 0, 0, 0, 0]
  130. reply_data_2 = [0, 0]
  131. try:
  132. #先判断内存中是否有该专题图信息
  133. topicID=data["reply_file"]["topicId"]
  134. if topicID not in list(replyGraph.keys()):
  135. reply_file=tool.mysqlData(raw_data,logging)
  136. if reply_file!='':
  137. graph=tool.get_replyData(reply_file)
  138. replyGraph[topicID]=graph
  139. else:
  140. graph=replyGraph[topicID]
  141. userId=data["user_file"]["accountId"]
  142. if userId in list(graph.keys()):
  143. closeness_centrality=graph["userId"]["closeness_centrality"]
  144. pagerank=graph["userId"]["pagerank"]
  145. clustering=graph["userId"]["clustering"]
  146. in_degree=graph["userId"]["in_degree"]
  147. out_degree=graph["userId"]["out_degree"]
  148. reply_data_1=[closeness_centrality,pagerank,clustering,in_degree,out_degree]
  149. user_flag_infl=graph["userId"]["user_flag_infl"]
  150. user_flag_act=graph["userId"]["user_flag_act"]
  151. reply_data_2=[user_flag_infl,user_flag_act]
  152. replyGraph[topicID]["last_operation_time"]=datetime.now()
  153. except:
  154. logging.info("专题关系数据mysql获取失败!")
  155. logging.info(traceback.format_exc())
  156. logging.info("关系数据处理完毕!{}-{}".format(reply_data_1,reply_data_2))
  157. features = [user_data + reply_data_1 + post_data + reply_data_2]
  158. bot_user = joblib.load(cur_dir+"/model/bot_topic.pkl") # 加载训练好的模型
  159. result = bot_user.predict(features)
  160. recognition_code = str(result[0])
  161. res["results"]=str(result[0])
  162. results = {}
  163. # 用户id
  164. results['accountId'] = data["user_file"]["accountId"]
  165. # 用户昵称
  166. results['nickName'] = data["user_file"]["nickName"]
  167. # 用户账号
  168. results['accountName'] = data["user_file"]["accountName"]
  169. if recognition_code == '0':
  170. results['recognitionResult'] = '非机器人'
  171. results['recognitionCode'] = recognition_code
  172. elif recognition_code == '1':
  173. results['recognitionResult'] = '机器人'
  174. results['recognitionCode'] = recognition_code
  175. else:
  176. results['recognitionResult'] = '未知识别结果'
  177. results['recognitionCode'] = recognition_code
  178. res['results'] = json.dumps(results)
  179. raw_data["result"] = res
  180. logging.info("增加预测数据-{}".format(raw_data))
  181. to_kafka.send_kafka(raw_data, logging)
  182. except:
  183. res = {"successCode": "0", "errorLog": "", "results": {}}
  184. raw_data["result"] = res
  185. raw_data["result"]["error"] = traceback.format_exc()
  186. logging.info(traceback.format_exc())
  187. to_kafka.send_kafka(raw_data, logging)
  188. else:
  189. #暂无任务,进入休眠
  190. time.sleep(10)
  191. def replyGraphThread():
  192. '''
  193. 2访
  194. :return:
  195. '''
  196. while True:
  197. try:
  198. if replyGraph!={}:
  199. # 获取当前时间
  200. current_time = datetime.now()
  201. for topicID in list(replyGraph.keys()):
  202. # 计算最后一次操作的时间与当前时间的差值
  203. time_difference = current_time - replyGraph[topicID]['last_operation_time']
  204. # 如果差值大于等于120分钟,则删除该话题图信息
  205. if time_difference >= timedelta(minutes=120):
  206. del replyGraph[topicID]
  207. except:
  208. logging.info(traceback.format_exc())
  209. finally:
  210. time.sleep(1800)