假新闻识别应用
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

233 lines
19 KiB

#coding:utf8
import re
import pymysql
import pandas as pd
import numpy as np
import networkx as nx
import traceback
import json
from jsonpath_ng import jsonpath, parse
def get_id(raw_data, taskid):
    """Resolve a ``"<data key>:<JSONPath>"`` reference against ``raw_data['data']``.

    The part before the first colon selects an entry of ``raw_data['data']``,
    which must be a JSON-encoded string. The part after it is a JSONPath
    expression evaluated against the decoded document.

    :param raw_data: payload dict containing a ``'data'`` mapping of JSON strings
    :param taskid: reference string such as ``"1_twitter采集:$.taskId"``
    :return: the value of the first JSONPath match
    :raises KeyError: if the data key is not present in ``raw_data['data']``
    :raises IndexError: if the reference has no colon or the path matches nothing
    """
    # Split on the first colon only: the JSONPath itself may contain colons
    # (for example a slice such as ``$.items[0:2]``).
    source_key, separator, path = taskid.partition(":")
    datasource = json.loads(raw_data['data'][source_key])
    if not separator:
        raise IndexError("reference {!r} has no ':<JSONPath>' part".format(taskid))

    # Evaluate the JSONPath expression and collect every matched value.
    matches = [found.value for found in parse(path).find(datasource)]
    if not matches:
        raise IndexError("JSONPath {!r} matched nothing in {!r}".format(path, source_key))
    return matches[0]
def mysqlData(raw_data, logging, dataTag, dbConfig):
    """Query the relation table for the task/post referenced by ``raw_data``.

    ``raw_data["input"]["taskId"]`` and ``raw_data["input"]["postId"]`` are
    references resolved through ``get_id``; errors raised while resolving them
    propagate to the caller. Database errors are logged and swallowed.

    :param raw_data: payload dict with ``"input"`` references and ``"data"`` documents
    :param logging: logger-like object exposing ``info``
    :param dataTag: ``'1'`` selects table ``tw_account``; anything else ``tw_deep``
    :param dbConfig: mapping with ``host``, ``username``, ``password``, ``db``, ``port``
    :return: the rows from ``fetchall()`` (dict rows, via ``DictCursor``), or
        ``''`` when the query failed
    """
    result = ''
    task_id = get_id(raw_data, raw_data["input"]["taskId"])
    post_id = get_id(raw_data, raw_data["input"]["postId"])
    table = "tw_account" if dataTag == '1' else "tw_deep"

    # The table name comes from the fixed pair above, so it is safe to format
    # in. The ids come from the payload and are passed as bound parameters.
    sql = "SELECT * FROM {} WHERE taskId=%s and ssId=%s".format(table)

    db = None
    try:
        db = pymysql.connect(host=dbConfig['host'], user=dbConfig['username'],
                             passwd=dbConfig['password'], db=dbConfig['db'],
                             port=int(dbConfig['port']), charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor,
                             connect_timeout=30)
        db.ping(reconnect=True)
        with db.cursor() as cursor:
            cursor.execute(sql, (task_id, post_id))
            result = cursor.fetchall()
    except Exception:
        logging.info("专题关系数据查询失败!")
        logging.info(traceback.format_exc())
    finally:
        # Always release the connection, including when the query raised.
        if db is not None:
            db.close()
    return result
def get_replyData(data):
    """Build a directed reply graph and return metrics for replied-to accounts.

    Each distinct input row contributes one reply from ``ReviewerAccountId`` to
    ``PostAccountId``. Replies are counted per (reviewer, post account) pair and
    the count is stored as the edge's ``weight``. Metrics are reported only for
    nodes that appear as a ``PostAccountId``.

    :param data: row records (e.g. a list of dicts) containing at least the
        ``ReviewerAccountId`` and ``PostAccountId`` columns
    :return: ``{account_id: {metric_name: float}}`` with the metrics ``degree``,
        ``in_degree``, ``out_degree``, ``degree_centrality``,
        ``closeness_centrality``, ``pagerank`` and ``clustering``; ``{}`` when
        ``data`` is empty
    :raises KeyError: if a required column is missing from non-empty ``data``
    """
    # An empty result set has no columns to select and no edges to analyse.
    if data is None or len(data) == 0:
        return {}

    reply = pd.DataFrame(data)
    # De-duplicate on the full rows first, then keep only the two id columns.
    reply = reply.drop_duplicates().reset_index(drop=True)
    reply = reply[['ReviewerAccountId', 'PostAccountId']].astype(str)

    # One row per (reviewer, post account) pair with the number of replies.
    edge_counts = (reply.groupby(['ReviewerAccountId', 'PostAccountId'])
                   .size()
                   .reset_index(name='count'))
    edges = [(source, target, int(count))
             for source, target, count
             in edge_counts.itertuples(index=False, name=None)]

    g = nx.DiGraph()
    g.add_weighted_edges_from(edges)

    degree = g.degree()
    in_degree = g.in_degree()
    out_degree = g.out_degree()
    degree_centrality = nx.degree_centrality(g)
    closeness_centrality = nx.closeness_centrality(g)
    pagerank = nx.pagerank(g)
    clustering = nx.clustering(g)

    # NOTE: the influence/activity flags that used to be computed here were
    # never part of the returned mapping, so they are not computed.

    # Only accounts that received replies (edge targets) are reported.
    targets = set(edge_counts['PostAccountId'])
    # Every value is cast to float so integer degrees are returned as floats.
    return {
        node: {
            'degree': float(degree[node]),
            'in_degree': float(in_degree[node]),
            'out_degree': float(out_degree[node]),
            'degree_centrality': float(degree_centrality[node]),
            'closeness_centrality': float(closeness_centrality[node]),
            'pagerank': float(pagerank[node]),
            'clustering': float(clustering[node]),
        }
        for node in g.nodes()
        if node in targets
    }
def _fill_placeholders(pattern, values, text):
    """Replace every ``pattern`` match in ``text`` whose captured key is in ``values``."""
    def _substitute(match):
        key = match.group(1)
        # Placeholders with an unknown key are left in the text unchanged.
        return values[key] if key in values else match.group(0)

    # A function replacement inserts the value literally; a string replacement
    # would have its backslashes processed as escapes by ``re.sub``.
    return re.sub(pattern, _substitute, text)


def get_content(inputdata, logging):
    """
    Reassemble the request parameters from the raw JSON payload.

    ``{{key}}`` placeholders in the admin prompt are filled from
    ``inputdata["metadata"]["user"]`` and ``@@key@@`` placeholders from
    ``inputdata["data"]``. Placeholders whose key is not found are kept as-is.

    :param inputdata: original JSON data (dict with ``metadata`` and ``data``)
    :param logging: logger object; accepted for interface compatibility, unused
    :return: dict with the assembled ``prompt`` plus ``authorization``,
        ``model``, ``temperature``, ``top_p`` and ``n`` copied from the admin
        settings
    """
    admin = inputdata["metadata"]["admin"]
    data = inputdata["data"]
    prompt = admin["prompt"]

    # Non-greedy captures so several placeholders on one line stay separate.
    user_pattern = r"\{\{(.*?)\}\}"
    data_pattern = r"@@(.*?)@@"

    # The user section is only read when the prompt has a user placeholder.
    if re.search(user_pattern, prompt):
        prompt = _fill_placeholders(user_pattern, inputdata["metadata"]["user"], prompt)
    prompt = _fill_placeholders(data_pattern, data, prompt)

    return {
        "prompt": prompt,
        "authorization": admin["authorization"],
        "model": admin["model"],
        "temperature": admin["temperature"],
        "top_p": admin["top_p"],
        "n": admin["n"],
    }
if __name__ == "__main__":
    # Manual smoke test for mysqlData. It needs the project's log_util package
    # and a reachable MySQL server, so it is not an automated test.
    from log_util.set_logger import set_logger

    logging = set_logger('test.log')

    # Compact sample records: only a few fields of the crawled tweet are kept,
    # including the ones the admin references below point at
    # (taskId / postId / authorId).
    twitter_record = {
        "taskId": "1034427",
        "postId": "1714272222612008988",
        "authorId": "21836801",
        "author": "之初之初(互fo)@zhichu444",
        "source": "twitter",
        "url": "https://twitter.com/zhichu444/status/1696498207377768703",
        "userUrl": "https://twitter.com/zhichu444",
        "pubTimeStr": "2023-08-29 20:21:06",
        "brandId": "5f0297539eec6000",
    }
    filtered_record = {
        "taskId": "1030851",
        "postId": "1696498207377768703",
        "authorId": "1680809976866955265",
        "author": "之初之初(互fo)@zhichu444",
        "source": "twitter",
        "url": "https://twitter.com/zhichu444/status/1696498207377768703",
        "userUrl": "https://twitter.com/zhichu444",
        "pubTimeStr": "2023-08-29 20:21:06",
        "brandId": "5f0297539eec6000",
    }
    user_post_stats = {
        "resultList": [
            {
                "count": "187",
                "LikeCount": "9.796791443850267",
                "CommentsCount": "1.53475935828877",
                "ShareCount": "0.6631016042780749",
                "length": "103.7166",
                "tags": "0.98930481",
                "https": "0.73262032",
                "at": "0.10160428",
                "diffdate": "4771",
            }
        ]
    }

    inputdata = {
        "metadata": {
            "output": {
                "output_type": "table",
                "label_col": [
                    "假新闻识别",
                ],
            },
            "input": {
                "input_type": "text",
                "label": [],
            },
            "address": "http://172.24.12.127:9030/fakeNewIdentification/",
            # NOTE(review): real-looking host and credentials are hard-coded
            # here; consider loading them from local configuration instead.
            "admin": {
                "taskId": "1_twitter采集:$.taskId",
                "Host": "172.24.12.126",
                "User": "root",
                "Password": "baifendian123",
                "Database": "analyze",
                "Port": 3306,
                "accountId": "1_twitter采集:$.authorId",
                "postId": "1_twitter采集:$.postId",
            },
            "user": {
                "tag": "",
            },
        },
        # Each upstream result is stored as a JSON-encoded string.
        "data": {
            "9_获取用户发帖信息": json.dumps(user_post_stats, ensure_ascii=False),
            "8_获取用户信息": json.dumps({"resultList": []}),
            "businessKey": "5f0297539eec6000",
            "1_twitter采集": json.dumps(twitter_record, ensure_ascii=False),
            "7_采集结束过滤": json.dumps(filtered_record, ensure_ascii=False),
        },
        "created": 1691004265000,
        "module": "假新闻识别",
        "start_tag": "false",
        "multi_branch": 0,
        "last_edit": 1696757732000,
        "next_app_id": [
            {
                "start_id": 284,
                "edge_id": 171,
                "end_id": 285,
            }
        ],
        "transfer_id": 10,
        "version": 1,
        "blueprint_id": 9,
        "scenes_id": 9,
        "scenario": {
            "dataloss": 1,
            "autoCommitTriggerLast": 1,
            "maxErrors": 3,
            "autoCommit": 1,
            "freshVariables": 1,
        },
        "wait_condition": [],
        "scheduling": {
            "interval": -1,
            "type": "single",
        },
        "name": "假新闻识别",
        "businessKey": "5f0297539eec6000",
        "id": 284,
        "position": [
            100,
            200,
        ],
        "describe": "假新闻识别",
    }

    admin = inputdata["metadata"]["admin"]
    # mysqlData reads its references from a top-level "input" section.
    # TODO confirm this matches the payload shape the real caller sends.
    inputdata["input"] = {
        "taskId": admin["taskId"],
        "postId": admin["postId"],
    }
    # mysqlData expects these exact keys in its connection settings.
    dbConfig = {
        "host": admin["Host"],
        "username": admin["User"],
        "password": admin["Password"],
        "db": admin["Database"],
        "port": admin["Port"],
    }

    dataTag = "1"
    a = mysqlData(inputdata, logging, dataTag, dbConfig)
    print(a)