假新闻识别应用
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

233 lines
19 KiB

  1. #coding:utf8
  2. import re
  3. import pymysql
  4. import pandas as pd
  5. import numpy as np
  6. import networkx as nx
  7. import traceback
  8. import json
  9. from jsonpath_ng import jsonpath, parse
  10. def get_id(raw_data,taskid):
  11. # taskid = raw_data["metadata"]["admin"]["taskId"]
  12. all_result = raw_data['data']
  13. param_split = taskid.split(":")
  14. datasourcestr = all_result[param_split[0]]
  15. datasource = json.loads(datasourcestr)
  16. # 创建 JsonPath 表达式对象
  17. expr = parse(param_split[1])
  18. # 使用表达式来选择 JSON 元素
  19. match = [match.value for match in expr.find(datasource)]
  20. val = match[0]
  21. return val
  22. def mysqlData(raw_data,logging,dataTag,dbConfig):
  23. result=''
  24. taskid=raw_data["input"]["taskId"]
  25. postid=raw_data["input"]["postId"]
  26. taskId = get_id(raw_data,taskid)
  27. postId = get_id(raw_data,postid)
  28. if dataTag=='1':
  29. table="tw_account"
  30. else:
  31. table="tw_deep"
  32. try:
  33. db = pymysql.connect(host=dbConfig['host'], user=dbConfig['username'], passwd=dbConfig['password'],
  34. db=dbConfig['db'], port=int(dbConfig['port']), charset='utf8',cursorclass=pymysql.cursors.DictCursor, connect_timeout=30)
  35. db.ping(reconnect=True)
  36. cursor = db.cursor()
  37. sql="SELECT * FROM {} WHERE taskId={} and ssId={}".format(table,taskId,postId)
  38. cursor.execute(sql)
  39. result = cursor.fetchall()
  40. db.commit()
  41. cursor.close()
  42. db.close()
  43. except:
  44. logging.info("专题关系数据查询失败!")
  45. logging.info(traceback.format_exc())
  46. return result
  47. def get_replyData(data):
  48. reply=pd.DataFrame(data)
  49. reply = reply.drop_duplicates().reset_index(drop=True) # 去重
  50. reply=reply[['ReviewerAccountId', 'PostAccountId']]
  51. # reply.columns = ['ReviewerAccountId', 'ReviewerAccountName', 'PostAccountId', 'PostAccountName',
  52. # 'ShareCount', 'LikeCount', 'CommentCount', 'CommentTime']
  53. reply = reply[['ReviewerAccountId', 'PostAccountId']]
  54. reply['ReviewerAccountId'] = reply['ReviewerAccountId'].astype(str)
  55. reply['PostAccountId'] = reply['PostAccountId'].astype(str)
  56. reply = reply.groupby(['ReviewerAccountId', 'PostAccountId']).size().reset_index()
  57. # user_net_df = user_net(reply) ##SNA数据清洗
  58. edgeweightset = reply.fillna(0)
  59. edgeweightset.columns = ['source', 'target', 'count']
  60. edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
  61. for i in range(len(edgeweightset_l)):
  62. for j in range(edgeweightset.shape[1]):
  63. edgeweightset_l[i].append(edgeweightset.iloc[i, j])
  64. g = nx.DiGraph()
  65. g.add_weighted_edges_from(edgeweightset_l)
  66. degree = [g.degree(),
  67. g.in_degree(),
  68. g.out_degree()]
  69. centrality = [nx.degree_centrality(g), # 计算图 g 中每个节点的度中心性。度中心性是指节点的度(与其他节点相连的边的数量)与图中节点总数的比值。
  70. nx.closeness_centrality(g), # 计算图 g 中每个节点的接近中心性。接近中心性是指节点到其他节点的平均最短路径长度的倒数。
  71. nx.pagerank(g), # 计算图 g 中每个节点的 PageRank 值。PageRank 是一种用于评估网页重要性的算法,也可以应用于其他网络中的节点重要性评估。
  72. nx.clustering(g)] # 计算图 g 中每个节点的聚集系数。聚集系数是指节点的邻居之间存在连接的概率。
  73. #把主贴相关信息拿出来
  74. tmp=edgeweightset["target"].values
  75. node_list = []
  76. nodes = g.nodes() # 提取网络中节点列表
  77. for node in nodes:
  78. if node not in tmp:
  79. continue
  80. node_list.append([node,
  81. degree[0][node],
  82. degree[1][node],
  83. degree[2][node],
  84. centrality[0][node],
  85. centrality[1][node],
  86. centrality[2][node],
  87. centrality[3][node]])
  88. node_list = pd.DataFrame(node_list)
  89. node_list.columns = ['Id', 'degree', 'in_degree', 'out_degree',
  90. 'degree_centrality', 'closeness_centrality', 'pagerank', 'clustering']
  91. node_list['user_flag_infl'] = 0
  92. node_list['user_flag_act'] = 0
  93. node_list.user_flag_infl[node_list['out_degree'] > np.percentile(node_list['out_degree'], 95)] = 1
  94. node_list.user_flag_act[(node_list['in_degree'] > np.percentile(node_list['in_degree'], 90)) &
  95. (node_list['closeness_centrality'] > np.percentile(node_list['closeness_centrality'],
  96. 50))] = 1
  97. node_dic=node_list.set_index('Id')[['degree', 'in_degree','out_degree','degree_centrality','closeness_centrality','pagerank','clustering']].T.to_dict()
  98. return node_dic
  99. def get_content(inputdata,logging):
  100. """
  101. :param inputdata:json数据
  102. :return: prompt及其他参数
  103. """
  104. res={}
  105. admin=inputdata["metadata"]["admin"]
  106. data=inputdata["data"]
  107. prompt=admin["prompt"]
  108. if_user=re.findall("{{(.*)}}",prompt)
  109. if_data=re.findall("@@(.*)@@",prompt)
  110. if if_user != []:
  111. user_data=inputdata["metadata"]["user"]
  112. if if_user[0] in user_data.keys():
  113. tmp=user_data[if_user[0]]
  114. prompt=re.sub("{{(.*)}}",tmp,prompt)
  115. if if_data!=[] and if_data[0] in data.keys():
  116. tmp1=data[if_data[0]]
  117. prompt=re.sub("@@(.*)@@",tmp1,prompt)
  118. res["prompt"]=prompt
  119. res["authorization"]=admin["authorization"]
  120. res["model"]=admin["model"]
  121. res["temperature"]=admin["temperature"]
  122. res["authorization"]=admin["authorization"]
  123. res["top_p"]=admin["top_p"]
  124. res["n"]=admin["n"]
  125. return res
  126. if __name__=="__main__":
  127. from log_util.set_logger import set_logger
  128. logging = set_logger('test.log')
  129. inputdata={
  130. "metadata":{
  131. "output":{
  132. "output_type":"table",
  133. "label_col":[
  134. "假新闻识别"
  135. ]
  136. },
  137. "input":{
  138. "input_type":"text",
  139. "label":[
  140. ]
  141. },
  142. "address":"http://172.24.12.127:9030/fakeNewIdentification/",
  143. "admin":{
  144. "taskId":"1_twitter采集:$.taskId",
  145. "Host":"172.24.12.126",
  146. "User":"root",
  147. "Password":"baifendian123",
  148. "Database":"analyze",
  149. "Port":3306,
  150. "accountId":"1_twitter采集:$.authorId",
  151. "postId":"1_twitter采集:$.postId"
  152. },
  153. "user":{
  154. "tag":""
  155. }
  156. },
  157. "data":{
  158. "9_获取用户发帖信息":"{\"resultList\": [{\"count\": \"187\", \"LikeCount\": \"9.796791443850267\", \"CommentsCount\": \"1.53475935828877\", \"ShareCount\": \"0.6631016042780749\", \"length\": \"103.7166\", \"tags\": \"0.98930481\", \"https\": \"0.73262032\", \"at\": \"0.10160428\", \"diffdate\": \"4771\"}]}",
  159. "8_获取用户信息":"{\"resultList\": []}",
  160. "businessKey":"5f0297539eec6000",
  161. "1_twitter采集":"{\"isDownload\":\"true\",\"imageCount\":1,\"groupRules\":[],\"commentUrl\":\"\",\"channel\":\"社交媒体\",\"readCount\":-1,\"resolution\":\"\",\"srcimagePath\":[],\"fileCount\":0,\"forwardQuoteCount\":-1,\"province\":\"\",\"crawlDataFlagType\":\"1\",\"price\":0,\"quoteCount\":0,\"translateContentLength\":\"\",\"forwardUserType\":0,\"brand\":\"\",\"createTimeStr\":\"2023-10-08 11:39:02\",\"ocrLength\":0,\"hasOCR\":0,\"authornickname\":\"\",\"contentSimHash\":\"7aa3aa7066c0a22618f401b52e02cba4\",\"dns\":\"https://twitter.com/\",\"crawlDay\":1696694400000,\"asrText\":\"\",\"nomorprice\":0,\"mentionAccountUrl\":[],\"titleSimHash\":\"\",\"videoPath\":[],\"brandId\":\"5f0297539eec6000\",\"commentId\":\"\",\"impression\":\"\",\"originalPhrase\":\"[\\\"关取\\\",\\\"互关\\\",\\\"取关\\\",\\\"一定回关没\\\",\\\"互关必\\\",\\\"关必回\\\",\\\"回关没关\\\",\\\"关上限\\\",\\\"互粉\\\",\\\"顺延一定\\\",\\\"上限会\\\",\\\"会顺延\\\",\\\"关贴\\\",\\\"关评论\\\",\\\"留言\\\",\\\"回关\\\",\\\"关注互\\\",\\\"评论区\\\",\\\"区留\\\",\\\"先主动\\\",\\\"主动关注\\\"]\",\"userTypeContent\":\"\",\"crawl_end_mark\":\"ok\",\"forwardContent\":\"\",\"hashTag\":[],\"primary\":1,\"poorrate\":-1,\"extension\":\"\",\"translateContent\":\"\",\"imagePathSize\":[{\"size\":\"\",\"videoTime\":\"\",\"resolution\":\"\",\"url\":\"/group16/default/20231008/11/29/6/1_4temp.jpeg\"}],\"mentionTopic\":[\"每日互关100\",\"互关贴\",\"互关必回\",\"互关\",\"互粉\"],\"finalPhrase\":\"[\\\"关取\\\",\\\"互关\\\",\\\"互关\\\",\\\"互关必\\\",\\\"回关\\\",\\\"取关\\\",\\\"上限会\\\",\\\"一定回关没\\\",\\\"主动关注\\\"]\",\"city\":\"\",\"availability\":1,\"mentionAccount\":[],\"dataCount\":0,\"forwardUserUrl\":\"\",\"videoCount\":0,\"pageType\":\"storyDetailPage\",\"dataId\":\"fb136a04955de4d0c0044e21b443d75c\",\"fourListBrand\":\"\",\"titleLength\":0,\"videoUrl\":\"\",\"ocrText\":[],\"mentionTopicUrl\":[\"https://twitter.com/hashtag/每日互关100\",\"https://twitter.com/hashtag/互关贴\",\"https://twitter.com/hashtag/互关必回\",\"https://twitter.com/hashtag/互关\",\"https://twitter.com/hashtag/互粉\"],\"forwardAuthor\":\
"\",\"sysAbstract\":\"\",\"forwardUrl\":\"\",\"createDate\":\"2023-10-08T11:29:25.922+08:00\",\"cate\":\"\",\"forwardPostSource\":\"\",\"expression\":[],\"docType\":\"social\",\"sex\":\"\",\"threeListBrand\":\"\",\"collectCount\":-1,\"crawlDate\":\"2023-10-08T11:29:20.266+08:00\",\"channelNum\":\"0\",\"avatar\":\"/group16/default/20230921/12/00/6/1697088541984264192temp.jpeg\",\"promotionInfo\":\"\",\"authorId\":\"21836801\",\"isVip\":0,\"url\":\"https://twitter.com/zhichu444/status/1696498207377768703\",\"skuProperties\":\"\",\"places\":[],\"hlKeywords\":[],\"createTime\":1696736342599,\"contentLength\":106,\"thumbnails\":\"\",\"downCnt\":-1,\"country\":\"\",\"forwardUserId\":\"\",\"hasASR\":0,\"imagePath\":[\"/group16/default/20231008/11/29/6/1_4temp.jpeg\"],\"pubTime\":1693311666000,\"sign\":\"\",\"asrLength\":0,\"fansCount\":\"\",\"language\":\"\",\"otherSourceJson\":\"\",\"source\":\"twitter\",\"smallImgs\":[],\"forwardAvatar\":\"\",\"goodrate\":-1,\"pageCommentCount\":1,\"crawlDataFlag\":\"account:https://twitter.com/zhichu444\",\"viewCnt\":-1,\"members\":[],\"crawl_end_message\":\"数据采集完成\",\"postCount\":\"\",\"videoTime\":\"\",\"tag\":\"\",\"filePathSize\":[],\"enSource\":\"twitter\",\"pictureList\":\"[\\\"https://pbs.twimg.com/media/F4ssE8gXQAEWyy0.jpg\\\"]\",\"userUrl\":\"https://twitter.com/zhichu444\",\"area\":\"\",\"fiveListBrand\":\"\",\"srcvideoPath\":[],\"firstListBrand\":\"\",\"forwardCommentsCount\":-1,\"contentTag\":\"nomal\",\"author\":\"之初之初(互fo)@zhichu444\",\"sysSentiment\":0.0,\"generalrate\":-1,\"attitudesCount\":3,\"createDay\":1696694400000,\"postId\":\"1714272222612008988\",\"srcfilePath\":[],\"pubDate\":\"2023-08-29T20:21:06.000+08:00\",\"sysKeywords\":\"\",\"hasFile\":0,\"translateTitle\":\"\",\"translateTitleLength\":\"\",\"getSource\":\"\",\"crawlTime\":169673576026
  162. "7_采集结束过滤":"{\"_id_\":\"24f8aa39ef3d94a51817d17d2b54bc8c\",\"age\":\"\",\"area\":\"\",\"asrLength\":0,\"asrText\":\"\",\"attitudesCount\":3,\"attr\":\"{\\\"projectName\\\":\\\"113ic\\\",\\\"cate\\\":\\\"\\\",\\\"crawlDataFlag\\\":\\\"account:https://twitter.com/zhichu444\\\",\\\"attachTag\\\":\\\"5f0297539eec6000\\\",\\\"appId\\\":\\\"113ic\\\",\\\"project_name\\\":\\\"113ic\\\"}\",\"author\":\"之初之初(互fo)@zhichu444\",\"authorId\":\"1680809976866955265\",\"authorLevel\":\"\",\"authornickname\":\"\",\"availability\":1,\"avatar\":\"/group16/default/20230921/12/00/6/1697088541984264192temp.jpeg\",\"avatarPath\":\"http://crawl-files.pontoaplus.com/group16/default/20230921/12/00/6/1697088541984264192temp.jpeg\",\"brand\":\"\",\"brandId\":\"5f0297539eec6000\",\"cate\":\"\",\"channel\":\"社交媒体\",\"channelNum\":\"0\",\"city\":\"\",\"collectCount\":-1,\"commentId\":\"\",\"commentScore\":0,\"commentUrl\":\"\",\"commentsCount\":1,\"content\":\"#每日互关100 %回关 #互关贴 1、先主动关注别人2、互关不要取关不要取关不要取关每天上限会顺延下一天但一定回关没关的评论区给我留言 #互关必回 #互关 #互粉 https://t.co/DbluriZoTx\",\"contentLength\":106,\"contentSimHash\":\"7aa3aa7066c0a22618f401b52e02cba4\",\"contentTag\":\"nomal\",\"country\":\"\",\"crawlDataFlag\":\"account:https://twitter.com/zhichu444\",\"crawlDataFlagType\":\"1\",\"crawlDate\":\"2023-10-08T11:29:20.266+08:00\",\"crawlDay\":1696694400000,\"crawlTime\":1696735760266,\"crawlTimeStr\":\"2023-10-08 11:29:20\",\"crawl_end_mark\":\"ok\",\"crawl_end_message\":\"数据采集完成\",\"createDate\":\"2023-10-08T11:29:25.922+08:00\",\"createDay\":1696694400000,\"createTime\":1696736342599,\"createTimeStr\":\"2023-10-08 
11:39:02\",\"dataCount\":0,\"dataId\":\"fb136a04955de4d0c0044e21b443d75c\",\"dns\":\"https://twitter.com/\",\"docId\":\"bfd_social_7793d00f49558319ef0db899ff6f509e\",\"docType\":\"social\",\"downCnt\":-1,\"enSource\":\"twitter\",\"expression\":[],\"extension\":\"\",\"externalId\":\"94b001e6-6c48-4e64-b238-d4497a7d5326\",\"fansCount\":\"\",\"favorCnt\":-1,\"fileCount\":0,\"filePath\":[],\"filePathSize\":[],\"finalPhrase\":\"[\\\"关取\\\",\\\"互关\\\",\\\"互关\\\",\\\"互关必\\\",\\\"回关\\\",\\\"取关\\\",\\\"上限会\\\",\\\"一定回关没\\\",\\\"主动关注\\\"]\",\"firstListBrand\":\"\",\"fiveListBrand\":\"\",\"forumScore\":\"\",\"forwardAttitudesCount\":-1,\"forwardAuthor\":\"\",\"forwardAvatar\":\"\",\"forwardCommentsCount\":-1,\"forwardContent\":\"\",\"forwardImgs\":\"\",\"forwardPostSource\":\"\",\"forwardPubTime\":0,\"forwardQuoteCount\":-1,\"forwardUrl\":\"\",\"forwardUserId\":\"\",\"forwardUserType\":0,\"forwardUserUrl\":\"\",\"fourListBrand\":\"\",\"friendsCount\":\"\",\"generalrate\":-1,\"getSource\":\"\",\"goodrate\":-1,\"groupRules\":[],\"hasASR\":0,\"hasFile\":0,\"hasImage\":1,\"hasOCR\":0,\"hasTrans\":0,\"hasVideo\":0,\"hashTag\":[],\"hlKeywords\":[],\"imageCount\":1,\"imagePath\":[\"/group16/default/20231008/11/29/6/1_4temp.jpeg\"],\"imagePathSize\":[{\"resolution\":\"\",\"size\":\"\",\"url\":\"/group16/default/20231008/11/29/6/1_4temp.jpeg\",\"videoTime\":\"\"}],\"impression\":\"\",\"isDownload\":\"true\",\"isVip\":0,\"language\":\"\",\"lastModifiedTime\":1696735765922,\"listBrand\":\"\",\"location\":\"\",\"members\":[],\"mentionAccount\":[],\"mentionAccountUrl\":[],\"mentionTopic\":[\"每日互关100\",\"互关贴\",\"互关必回\",\"互关\",\"互粉\"],\"mentionTopicUrl\":[\"https://twitter.com/hashtag/每日互关100\",\"https://twitter.com/hashtag/互关贴\",\"https://twitter.com/hashtag/互关必回\",\"https://twitter.com/hashtag/互关\",\"https://twitter.com/hashtag/互粉\"],\"nomorprice\":0,\"ocrLength\":0,\"ocrText\":[],\"opinions\":[],\"originalPhrase\":\"[\\\"关取\\\",\\\"互关\\\",\\\"取关\\\",\\\"一定回关没\\\",\\\"互关必\\\",\\\"关必回\\\",\
\\"回关没关\\\",\\\"关上限\\\",\\\"互粉\\\",\\\"顺延一定\\\",\\\"上限会\\\",\\\"会顺延\\\",\\\"关贴\\\",\\\"关评论\\\",\\\"留言\\\",\\\"回�
  163. },
  164. "created":1691004265000,
  165. "module":"假新闻识别",
  166. "start_tag":"false",
  167. "multi_branch":0,
  168. "last_edit":1696757732000,
  169. "next_app_id":[
  170. {
  171. "start_id":284,
  172. "edge_id":171,
  173. "end_id":285
  174. }
  175. ],
  176. "transfer_id":10,
  177. "version":1,
  178. "blueprint_id":9,
  179. "scenes_id":9,
  180. "scenario":{
  181. "dataloss":1,
  182. "autoCommitTriggerLast":1,
  183. "maxErrors":3,
  184. "autoCommit":1,
  185. "freshVariables":1
  186. },
  187. "wait_condition":[
  188. ],
  189. "scheduling":{
  190. "interval":-1,
  191. "type":"single"
  192. },
  193. "name":"假新闻识别",
  194. "businessKey":"5f0297539eec6000",
  195. "id":284,
  196. "position":[
  197. 100,
  198. 200
  199. ],
  200. "describe":"假新闻识别"
  201. }
  202. dataTag="1"
  203. a=mysqlData(inputdata,logging,dataTag)
  204. print(a)