话题水军识别应用
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

222 lines
9.9 KiB

#coding:utf8
import os, sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka,tool
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging=set_logger('logs/results.log')
import traceback
import queue
import requests
import time
from datetime import datetime, timedelta
import os
import joblib
# Task queue: parsed request payloads are put here by robotIdentificationTopic
# and taken off by the predictTopic worker loop.
# (A `global` statement at module scope is a no-op, so none is needed here.)
task_queue = queue.Queue()

# In-memory cache of per-topic reply graphs, keyed by topic id. predictTopic
# fills it from tool.get_replyData and stamps each graph with a
# "last_operation_time" entry; replyGraphThread evicts graphs that have been
# idle for too long.
replyGraph = {}
@csrf_exempt
def robotIdentificationTopic(request):
    """Accept a topic-level bot-identification task over HTTP and queue it.

    The POST body is parsed as JSON and placed on ``task_queue`` for the
    ``predictTopic`` worker loop; no prediction happens in the request itself.

    :param request: the incoming Django request.
    :return: an ``HttpResponse`` whose body is a JSON object
        ``{"code": 1, "msg": ...}`` when the task was queued, or
        ``{"code": 0, "msg": ...}`` for a non-POST request or an unusable body.
    """
    if request.method != 'POST':
        return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
    try:
        raw_data = json.loads(request.body)
        # The worker indexes the payload by string keys and writes
        # raw_data["result"] back into it, so anything other than a JSON
        # object is rejected here instead of being queued.
        if not isinstance(raw_data, dict):
            raise ValueError("request body must be a JSON object")
        task_queue.put(raw_data)
    except Exception:
        logging.error(traceback.format_exc())
        return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
    return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
def _int_field(record, key):
    """Return ``int(record[key])``, or 0 when the field is missing or not convertible."""
    try:
        return int(record[key])
    except Exception:
        # Deliberate best-effort: a malformed upstream field counts as 0.
        return 0


def _nonblank_flag(record, key):
    """Return 1 when ``record[key]`` is a non-blank string, otherwise 0 (also 0 on any lookup error)."""
    try:
        return 0 if record[key].strip() == "" else 1
    except Exception:
        return 0


def _user_features(user):
    """Build the 5 user features: otherInfo present, nickName present, likeCount, postCount, authentication."""
    return [
        _nonblank_flag(user, "otherInfo"),
        _nonblank_flag(user, "nickName"),
        _int_field(user, "likeCount"),
        _int_field(user, "postCount"),
        _int_field(user, "authentication"),
    ]


def _post_features(post):
    """Build the 8 post features; an empty post record yields all zeros."""
    if post == {}:
        return [0, 0, 0, 0, 0, 0, 0, 0]
    field_names = ("LikeCount", "ShareCount", "emotionCount", "CommentsCount",
                   "length", "tags", "https", "diffdate")
    return [_int_field(post, name) for name in field_names]


def _reply_features(raw_data, data):
    """Look up the user's reply-graph features for the task's topic.

    Returns ``(graph_metrics, user_flags)``: a 5-item list (closeness
    centrality, pagerank, clustering, in-degree, out-degree) and a 2-item list
    (user_flag_infl, user_flag_act). Both stay all-zero when the topic graph
    is unavailable, the user is not in it, or anything fails along the way.
    """
    graph_metrics = [0, 0, 0, 0, 0]
    user_flags = [0, 0]
    try:
        # Use the cached topic graph if there is one; otherwise try to build it.
        topic_id = data["reply_file"]["topicId"]
        graph = replyGraph.get(topic_id)
        if graph is None:
            # NOTE(review): tool.mysqlData presumably loads the topic's reply
            # records from MySQL and returns '' when nothing is available --
            # confirm against the tool module.
            reply_file = tool.mysqlData(raw_data, logging)
            if reply_file != '':
                graph = tool.get_replyData(reply_file)
                if graph is not None:
                    replyGraph[topic_id] = graph
        if graph is None:
            # No graph for this topic: keep the zero features. The name is
            # always bound here, so a graph left over from a previous task
            # can never be used by mistake.
            logging.info("专题关系数据mysql获取失败!")
        else:
            # Stamp the graph as soon as it is touched so the eviction thread
            # always finds a timestamp, even if the user lookup below fails.
            graph["last_operation_time"] = datetime.now()
            user_id = data["user_file"]["accountId"]
            if user_id in graph:
                # Index by the actual account id, not the literal "userId".
                node = graph[user_id]
                graph_metrics = [node["closeness_centrality"], node["pagerank"],
                                 node["clustering"], node["in_degree"], node["out_degree"]]
                user_flags = [node["user_flag_infl"], node["user_flag_act"]]
    except Exception:
        logging.info("专题关系数据mysql获取失败!")
        logging.info(traceback.format_exc())
    return graph_metrics, user_flags


def _build_results(user, recognition_code):
    """Assemble the per-account result record for a prediction code ('0' = not a bot, '1' = bot)."""
    results = {}
    # Account id
    results['accountId'] = user["accountId"]
    # Account nickname
    results['nickName'] = user["nickName"]
    # Account name
    results['accountName'] = user["accountName"]
    if recognition_code == '0':
        results['recognitionResult'] = '非机器人'
    elif recognition_code == '1':
        results['recognitionResult'] = '机器人'
    else:
        results['recognitionResult'] = '未知识别结果'
    results['recognitionCode'] = recognition_code
    return results


def predictTopic():
    """Worker loop: take queued tasks, classify the account, publish the result.

    Runs forever. When ``task_queue`` is empty it sleeps 10 seconds. For each
    task it:

    1. parses the user and post payloads named in ``raw_data["metadata"]["admin"]``
       out of ``raw_data["data"]`` (the parsed values are written back into
       that ``admin`` dict, as before);
    2. builds one feature row ``user(5) + graph metrics(5) + post(8) + user flags(2)``;
    3. runs the model stored at ``<cur_dir>/model/bot_topic.pkl``;
    4. stores the outcome under ``raw_data["result"]`` and sends ``raw_data``
       through ``to_kafka.send_kafka``.

    On any failure a result with ``successCode`` "0" and the traceback under
    ``"error"`` is sent instead. The loop itself never raises for a bad task.
    """
    # Loaded lazily on the first task and then reused, instead of re-reading
    # the file for every task. A load failure is still reported per task.
    # NOTE: joblib.load unpickles the file, so it must come from a trusted source.
    model = None
    while True:
        if task_queue.qsize() <= 0:
            # No pending task: sleep before polling again.
            time.sleep(10)
            continue
        # Reset per task so the failure handler never sees a previous payload.
        raw_data = None
        try:
            logging.info("取任务队列长度{}".format(task_queue.qsize()))
            raw_data = task_queue.get()
            logging.info("原始数据-{}".format(raw_data))
            res = {"successCode": "1", "errorLog": "", "results": {}}
            # Upstream outputs, keyed by step name.
            all_result = raw_data['data']
            # admin maps logical names to step names, e.g.
            # {"user_file": "9_获取用户信息", "post_file": "10_获取用户发帖信息"}
            data = raw_data["metadata"]["admin"]
            data['user_file'] = json.loads(all_result[data['user_file']])
            logging.info('用户数据:{}'.format(data['user_file']))
            post_file_result = json.loads(all_result[data['post_file']])
            if post_file_result['resultList']:
                data['post_file'] = post_file_result['resultList'][0]
                logging.info('帖子数据:{}'.format(data['post_file']))
            else:
                data['post_file'] = {}

            # User features
            user_data = _user_features(data["user_file"])
            logging.info("用户数据处理完毕!-{}".format(user_data))
            # Post features
            post_data = _post_features(data["post_file"])
            logging.info("帖子数据处理完毕!-{}".format(post_data))
            # Reply-graph (relationship) features
            reply_data_1, reply_data_2 = _reply_features(raw_data, data)
            logging.info("关系数据处理完毕!{}-{}".format(reply_data_1, reply_data_2))

            features = [user_data + reply_data_1 + post_data + reply_data_2]
            if model is None:
                model = joblib.load(cur_dir + "/model/bot_topic.pkl")
            result = model.predict(features)
            recognition_code = str(result[0])
            res['results'] = json.dumps(_build_results(data["user_file"], recognition_code))
            raw_data["result"] = res
            logging.info("增加预测数据-{}".format(raw_data))
            to_kafka.send_kafka(raw_data, logging)
        except Exception:
            error_text = traceback.format_exc()
            logging.info(error_text)
            # Reporting the failure must not be able to kill the worker loop
            # (e.g. Kafka is down, or the payload is not a JSON object).
            try:
                if isinstance(raw_data, dict):
                    raw_data["result"] = {"successCode": "0", "errorLog": "", "results": {}}
                    raw_data["result"]["error"] = error_text
                    to_kafka.send_kafka(raw_data, logging)
                # A non-dict payload has nowhere to attach a result; the
                # traceback logged above is the only record of it.
            except Exception:
                logging.info(traceback.format_exc())
def replyGraphThread():
    """Evict idle topic graphs from the in-memory ``replyGraph`` cache.

    Runs forever, sweeping the cache once every 1800 seconds. A topic whose
    ``last_operation_time`` is at least 120 minutes old is treated as finished
    and its graph is removed.

    Each topic is handled independently, so one malformed entry cannot stop
    the sweep for the others. An entry without a timestamp is stamped with the
    current sweep time, so it ages out on a later sweep instead of staying in
    the cache forever.

    :return: never returns.
    """
    max_idle = timedelta(minutes=120)
    while True:
        try:
            # Current time, shared by every comparison in this sweep.
            current_time = datetime.now()
            # Iterate over a snapshot of the keys: entries are removed below
            # and other threads may add topics while the sweep is running.
            for topicID in list(replyGraph.keys()):
                try:
                    last_used = replyGraph[topicID].setdefault('last_operation_time', current_time)
                    # Remove the topic graph once it has been idle for 120 minutes or more.
                    if current_time - last_used >= max_idle:
                        replyGraph.pop(topicID, None)
                except Exception:
                    logging.info(traceback.format_exc())
        except Exception:
            logging.info(traceback.format_exc())
        finally:
            time.sleep(1800)