#coding:utf8
"""Helpers for a social-media analysis pipeline step.

The module resolves ids from upstream step outputs (``get_id``), loads
relation rows from MySQL (``mysqlData``), derives per-account graph metrics
from reply relations (``get_replyData``) and assembles LLM request
parameters (``get_content``).
"""
import json
import re
import traceback

import networkx as nx
import numpy as np
import pandas as pd
import pymysql
from jsonpath_ng import jsonpath, parse

# ``{{key}}`` is filled from metadata.user, ``@@key@@`` from the step data.
_USER_PLACEHOLDER = re.compile(r"\{\{(.*?)\}\}")
_DATA_PLACEHOLDER = re.compile(r"@@(.*?)@@")


def get_id(raw_data, taskid):
    """Resolve a ``"<data key>:<JSONPath>"`` selector against ``raw_data["data"]``.

    The part before the first colon names an entry of ``raw_data["data"]``
    holding a JSON string; the remainder is a JSONPath expression evaluated
    against the decoded document.

    :param raw_data: pipeline payload containing a ``"data"`` mapping.
    :param taskid: selector string, e.g. ``"1_twitter采集:$.taskId"``.
    :return: the value of the first JSONPath match.
    :raises IndexError: if the selector has no colon or the path matches nothing.
    :raises KeyError: if the named entry is missing from ``raw_data["data"]``.
    """
    all_result = raw_data['data']
    # Split on the first colon only, so a colon inside the JSONPath part
    # (e.g. a slice expression) stays part of the path.
    source_key, separator, json_path = taskid.partition(":")
    if not separator:
        raise IndexError(
            "selector {!r} is not of the form '<data key>:<JSONPath>'".format(taskid))
    datasource = json.loads(all_result[source_key])
    # Build the JSONPath expression and collect every matching value.
    matches = [found.value for found in parse(json_path).find(datasource)]
    if not matches:
        raise IndexError(
            "JSONPath {!r} matched nothing in data[{!r}]".format(json_path, source_key))
    return matches[0]


def mysqlData(raw_data, logging, dataTag, dbConfig):
    """Fetch the relation rows for the task/post referenced by ``raw_data``.

    The task and post ids are resolved with ``get_id`` from the selectors in
    ``raw_data["input"]["taskId"]`` and ``raw_data["input"]["postId"]``.

    :param raw_data: pipeline payload with ``"input"`` and ``"data"`` entries.
    :param logging: logger-like object; only ``.info(str)`` is called.
    :param dataTag: ``'1'`` selects table ``tw_account``, anything else ``tw_deep``.
    :param dbConfig: mapping with ``host``, ``username``, ``password``, ``db``
        and ``port`` entries.
    :return: the rows from ``cursor.fetchall()`` (dict rows, via ``DictCursor``),
        or ``''`` when connecting or querying failed (kept for backward
        compatibility; the failure is logged, not raised).
    """
    result = ''
    taskid = raw_data["input"]["taskId"]
    postid = raw_data["input"]["postId"]
    taskId = get_id(raw_data, taskid)
    postId = get_id(raw_data, postid)
    # The table name comes from this fixed pair, never from the payload, so
    # formatting it into the statement is safe; the values are bound below.
    table = "tw_account" if dataTag == '1' else "tw_deep"
    db = None
    try:
        db = pymysql.connect(host=dbConfig['host'], user=dbConfig['username'],
                             passwd=dbConfig['password'], db=dbConfig['db'],
                             port=int(dbConfig['port']), charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor,
                             connect_timeout=30)
        db.ping(reconnect=True)
        with db.cursor() as cursor:
            sql = "SELECT * FROM {} WHERE taskId=%s and ssId=%s".format(table)
            # Ids originate from upstream step output: pass them as bound
            # parameters instead of formatting them into the SQL text.
            cursor.execute(sql, (taskId, postId))
            result = cursor.fetchall()
        db.commit()
    except Exception:
        logging.info("专题关系数据查询失败!")
        logging.info(traceback.format_exc())
    finally:
        # Release the connection on the failure path as well.
        if db is not None:
            db.close()
    return result


def get_replyData(data):
    """Compute graph metrics for the post authors found in reply relations.

    Each distinct row contributes one ``ReviewerAccountId -> PostAccountId``
    edge occurrence; the number of occurrences of a pair becomes the edge
    weight of a directed graph.

    :param data: rows convertible to a ``pandas.DataFrame`` that provide the
        ``ReviewerAccountId`` and ``PostAccountId`` columns. ``None`` or an
        empty value (including the ``''`` that ``mysqlData`` returns on
        failure) yields ``{}``.
    :return: ``{account_id: {metric: value}}`` for every node that appears as
        a ``PostAccountId``; metrics are ``degree``, ``in_degree``,
        ``out_degree``, ``degree_centrality``, ``closeness_centrality``,
        ``pagerank`` and ``clustering``.
    """
    metric_columns = ['degree', 'in_degree', 'out_degree', 'degree_centrality',
                      'closeness_centrality', 'pagerank', 'clustering']
    if data is None or len(data) == 0:
        return {}

    reply = pd.DataFrame(data)
    reply = reply.drop_duplicates().reset_index(drop=True)  # de-duplicate whole rows
    reply = reply[['ReviewerAccountId', 'PostAccountId']].astype(str)

    # One row per (reviewer, post author) pair with its occurrence count.
    edges = reply.groupby(['ReviewerAccountId', 'PostAccountId']).size().reset_index()
    edges = edges.fillna(0)
    edges.columns = ['source', 'target', 'count']

    g = nx.DiGraph()
    g.add_weighted_edges_from(edges.itertuples(index=False, name=None))

    total_degree = g.degree()
    in_degree = g.in_degree()
    out_degree = g.out_degree()
    degree_centrality = nx.degree_centrality(g)
    closeness_centrality = nx.closeness_centrality(g)
    pagerank = nx.pagerank(g)
    clustering = nx.clustering(g)

    # Only accounts that authored a post (edge targets) are reported.
    post_authors = set(edges['target'])
    rows = [
        [node, total_degree[node], in_degree[node], out_degree[node],
         degree_centrality[node], closeness_centrality[node],
         pagerank[node], clustering[node]]
        for node in g.nodes()
        if node in post_authors
    ]
    node_list = pd.DataFrame(rows, columns=['Id'] + metric_columns)
    # Transposing before to_dict keeps the historical output: keyed by
    # account id, with every metric coerced to a common (float) dtype.
    return node_list.set_index('Id')[metric_columns].T.to_dict()


def _fill_placeholders(pattern, values, text):
    """Replace each placeholder in ``text`` whose key exists in ``values``."""
    def substitute(found):
        key = found.group(1)
        # Unknown keys are left untouched. Returning the value from a function
        # keeps backslashes in it from being read as regex escapes.
        return values[key] if key in values else found.group(0)

    return pattern.sub(substitute, text)


def get_content(inputdata, logging):
    """Assemble the request parameters from the pipeline payload.

    ``{{key}}`` placeholders in ``metadata.admin.prompt`` are filled from
    ``metadata.user`` and ``@@key@@`` placeholders from ``data``; placeholders
    whose key is unknown are kept as they are.

    :param inputdata: original JSON payload with ``metadata.admin`` and ``data``.
    :param logging: unused; kept for interface compatibility.
    :return: dict with ``prompt``, ``authorization``, ``model``,
        ``temperature``, ``top_p`` and ``n``.
    """
    admin = inputdata["metadata"]["admin"]
    data = inputdata["data"]
    prompt = admin["prompt"]

    # metadata.user is only required when the prompt references it.
    if _USER_PLACEHOLDER.search(prompt):
        prompt = _fill_placeholders(
            _USER_PLACEHOLDER, inputdata["metadata"]["user"], prompt)
    prompt = _fill_placeholders(_DATA_PLACEHOLDER, data, prompt)

    return {
        "prompt": prompt,
        "authorization": admin["authorization"],
        "model": admin["model"],
        "temperature": admin["temperature"],
        "top_p": admin["top_p"],
        "n": admin["n"],
    }


if __name__ == "__main__":
    import os

    from log_util.set_logger import set_logger

    logging = set_logger('test.log')

    # Connection settings come from the environment so that no credentials
    # are stored in the source file.
    dbConfig = {
        "host": os.environ.get("ANALYZE_DB_HOST", "127.0.0.1"),
        "username": os.environ.get("ANALYZE_DB_USER", "root"),
        "password": os.environ.get("ANALYZE_DB_PASSWORD", ""),
        "db": os.environ.get("ANALYZE_DB_NAME", "analyze"),
        "port": os.environ.get("ANALYZE_DB_PORT", "3306"),
    }

    # Minimal payload: only the keys that mysqlData/get_id actually read.
    twitter_record = {
        "taskId": "1034427",
        "postId": "1714272222612008988",
        "authorId": "21836801",
    }
    inputdata = {
        "metadata": {
            "admin": {
                "taskId": "1_twitter采集:$.taskId",
                "accountId": "1_twitter采集:$.authorId",
                "postId": "1_twitter采集:$.postId",
            },
            "user": {
                "tag": "",
            },
        },
        "input": {
            "taskId": "1_twitter采集:$.taskId",
            "postId": "1_twitter采集:$.postId",
        },
        "data": {
            "businessKey": "5f0297539eec6000",
            "1_twitter采集": json.dumps(twitter_record, ensure_ascii=False),
        },
    }
    dataTag = "1"
    a = mysqlData(inputdata, logging, dataTag, dbConfig)
    print(a)