robotIdentificationTopic/text_analysis/tools/tool.py


								#coding:utf8

								import re

								import pymysql

								import pandas as pd

								import numpy as np

								import networkx as nx

								import traceback

								import json

								from jsonpath_ng import jsonpath, parse


								def parse_data(raw_data,url):
								    """Resolve a locator string against ``raw_data`` and return the first match.

								    Two locator forms are handled:

								    * ``"<key>:<jsonpath>"`` -- ``raw_data['data'][<key>]`` is decoded with
								      ``json.loads`` and ``<jsonpath>`` is evaluated against the result.
								    * ``"<key>:<jsonpath>#json#<jsonpath2>"`` -- the part before the first
								      ``#`` is resolved as above, its value is decoded with ``json.loads``
								      again, and ``<jsonpath2>`` is evaluated against that.

								    :param raw_data: mapping with a ``'data'`` entry whose values are JSON strings
								    :param url: locator string in one of the two forms above
								    :return: the first value matched by the JSONPath expression, or ``''``
								        when anything fails (missing key, invalid JSON, no match, ...); the
								        traceback is printed instead of being raised
								    """
								    try:
								        if "#json#" in url:
								            parm = url.split("#")
								            # parm[0] is a plain "<key>:<jsonpath>" locator; its value is
								            # expected to be a JSON string that is decoded once more.
								            first_pass = parse_data(raw_data, parm[0])
								            document = json.loads(first_pass)
								            expr = parse(parm[2])
								        else:
								            all_result = raw_data['data']
								            # Split on the first colon only: the JSONPath part may itself
								            # contain colons (e.g. a slice such as ``$[0:2]``).
								            param_split = str(url).split(":", 1)
								            document = json.loads(all_result[param_split[0]])
								            # Build the JsonPath expression object.
								            expr = parse(param_split[1])
								        # Use the expression to select JSON elements and keep the first one.
								        return [found.value for found in expr.find(document)][0]
								    except Exception:
								        # Best-effort lookup: report the problem and fall back to ''.
								        traceback.print_exc()
								        return ''


								def get_taskId(raw_data):
								    """Return the task id referenced by ``metadata.admin.reply_file.taskId``.

								    That entry is a locator of the form ``"<key>:<jsonpath>"``:
								    ``raw_data['data'][<key>]`` is decoded with ``json.loads`` and
								    ``<jsonpath>`` is evaluated against it.

								    :param raw_data: task JSON containing ``metadata.admin.reply_file.taskId``
								        and a ``data`` mapping of JSON strings
								    :return: the first value matched by the JSONPath expression
								    :raises KeyError: if one of the required keys is missing
								    :raises IndexError: if the locator has no ``:`` part or nothing matches
								    """
								    locator = raw_data["metadata"]["admin"]["reply_file"]["taskId"]
								    all_result = raw_data['data']
								    # Split on the first colon only: the JSONPath part may itself contain
								    # colons (e.g. a slice such as ``$[0:2]``).
								    param_split = locator.split(":", 1)
								    datasource = json.loads(all_result[param_split[0]])
								    # Build the JsonPath expression object.
								    expr = parse(param_split[1])
								    # Use the expression to select JSON elements and keep the first one.
								    return [found.value for found in expr.find(datasource)][0]


								def mysqlData(dbConfig,taskId,logging):
								    """Fetch the reply rows of one task, joined with the post author.

								    :param dbConfig: mapping with the keys ``host``, ``username``,
								        ``password``, ``db`` and ``port``
								    :param taskId: value compared against ``reply.taskId``
								    :param logging: logger-like object; only its ``info`` method is used
								    :return: the rows returned by ``cursor.fetchall()`` (dict rows, because a
								        ``DictCursor`` is used), or ``''`` when the query failed -- failures
								        are logged, never raised
								    """
								    result = ''
								    db = None
								    # The task id is passed as a query parameter so that the driver escapes
								    # it instead of it being spliced into the SQL text.
								    sql = ("select a.ReviewerAccountId,a.ReviewerAccountName,b.accountId PostAccountId,"
								           "b.accountName PostAccountName,a.ShareCount,a.LikeCount,a.CommentCount,a.CommentTime "
								           "from reply a LEFT JOIN user_post b on a.postId = b.postId "
								           "where a.taskId = %s")
								    try:
								        db = pymysql.connect(host=dbConfig["host"], user=dbConfig["username"], passwd=dbConfig["password"],
								                             db=dbConfig["db"], port=int(dbConfig["port"]), charset='utf8',
								                             cursorclass=pymysql.cursors.DictCursor, connect_timeout=30)
								        db.ping(reconnect=True)
								        with db.cursor() as cursor:
								            cursor.execute(sql, (taskId,))
								            result = cursor.fetchall()
								    except Exception:
								        logging.info("专题关系数据查询失败！")
								        logging.info(traceback.format_exc())
								    finally:
								        # Always release the connection, also when the query failed.
								        if db is not None:
								            try:
								                db.close()
								            except Exception:
								                # Closing is best-effort; the connection may already be gone.
								                logging.info(traceback.format_exc())
								    return result


								def get_replyData(data):
								    """Compute graph metrics for every account that received a reply.

								    The rows are de-duplicated, reduced to ``(ReviewerAccountId,
								    PostAccountId)`` pairs (both cast to ``str``) and counted. Every distinct
								    pair becomes a directed edge reviewer -> poster whose weight is that
								    count.

								    :param data: rows accepted by ``pandas.DataFrame`` that contain at least
								        the columns ``ReviewerAccountId`` and ``PostAccountId`` (presumably
								        the rows returned by ``mysqlData`` -- verify against the caller)
								    :return: ``{account_id: {'degree', 'in_degree', 'out_degree',
								        'degree_centrality', 'closeness_centrality', 'pagerank',
								        'clustering'}}`` for every node that is the target of at least one
								        edge, all values as ``float``; ``{}`` when there are no rows
								    :raises KeyError: if rows are present but a required column is missing
								    """
								    # ``None`` or the empty string (the failure value of ``mysqlData``)
								    # cannot be turned into a frame; treat both as "no rows".
								    if data is None or (isinstance(data, str) and not data):
								        return {}
								    reply = pd.DataFrame(data)
								    if reply.empty:
								        return {}

								    # De-duplicate on the full rows first, then keep only the two id columns.
								    reply = reply.drop_duplicates().reset_index(drop=True)
								    reply = reply[['ReviewerAccountId', 'PostAccountId']].astype(str)

								    # One row per (reviewer, poster) pair with the number of replies.
								    edges = reply.groupby(['ReviewerAccountId', 'PostAccountId']).size().reset_index()
								    edges = edges.fillna(0)
								    edges.columns = ['source', 'target', 'count']

								    g = nx.DiGraph()
								    g.add_weighted_edges_from(edges.itertuples(index=False, name=None))

								    degree = g.degree()
								    in_degree = g.in_degree()
								    out_degree = g.out_degree()
								    # Degree centrality: node degree relative to the number of other nodes.
								    degree_centrality = nx.degree_centrality(g)
								    # Closeness centrality: reciprocal of the average shortest-path distance.
								    closeness_centrality = nx.closeness_centrality(g)
								    # PageRank importance score of each node.
								    pagerank = nx.pagerank(g)
								    # Clustering coefficient: how densely a node's neighbours are connected.
								    clustering = nx.clustering(g)

								    # Only report accounts that were replied to (the "main post" side).
								    targets = set(edges['target'])
								    node_dic = {}
								    for node in g.nodes():
								        if node not in targets:
								            continue
								        node_dic[node] = {
								            'degree': float(degree[node]),
								            'in_degree': float(in_degree[node]),
								            'out_degree': float(out_degree[node]),
								            'degree_centrality': float(degree_centrality[node]),
								            'closeness_centrality': float(closeness_centrality[node]),
								            'pagerank': float(pagerank[node]),
								            'clustering': float(clustering[node]),
								        }
								    return node_dic


								def get_content(inputdata,logging):
								    """Re-assemble the request parameters from the original task JSON.

								    Placeholders in ``metadata.admin.prompt`` are filled in: ``{{key}}`` is
								    replaced by ``metadata.user[key]`` and ``@@key@@`` by ``data[key]``.
								    Placeholders whose key is unknown are left untouched. Values are
								    inserted literally, so backslashes in them are preserved.

								    :param inputdata: the original JSON data; ``metadata.user`` is only
								        required when the prompt contains a ``{{...}}`` placeholder
								    :param logging: not used by this function; kept so existing callers
								        keep working
								    :return: dict with the keys ``prompt``, ``authorization``, ``model``,
								        ``temperature``, ``top_p`` and ``n``
								    :raises KeyError: if a required entry is missing from ``inputdata``
								    """
								    user_pattern = r"\{\{(.*?)\}\}"
								    data_pattern = r"@@(.*?)@@"

								    def fill(pattern, values, text):
								        # A function replacement is used on purpose: a plain replacement
								        # string would have its backslashes interpreted as escapes or
								        # group references by ``re.sub``.
								        def substitute(found):
								            key = found.group(1)
								            return values[key] if key in values else found.group(0)
								        return re.sub(pattern, substitute, text)

								    admin = inputdata["metadata"]["admin"]
								    data = inputdata["data"]
								    prompt = admin["prompt"]

								    # Decide on the untouched prompt which kinds of placeholder it contains.
								    has_user_placeholder = re.search(user_pattern, prompt) is not None
								    has_data_placeholder = re.search(data_pattern, prompt) is not None
								    if has_user_placeholder:
								        prompt = fill(user_pattern, inputdata["metadata"]["user"], prompt)
								    if has_data_placeholder:
								        prompt = fill(data_pattern, data, prompt)

								    return {
								        "prompt": prompt,
								        "authorization": admin["authorization"],
								        "model": admin["model"],
								        "temperature": admin["temperature"],
								        "top_p": admin["top_p"],
								        "n": admin["n"],
								    }


								if __name__=="__main__":

								    # Manual smoke test: build a sample task payload and print the task id
								    # that get_taskId() resolves from it. The locator under
								    # metadata.admin.reply_file.taskId points at the "1_twitter采集" entry
								    # of "data", so this is expected to print 100.
								    # NOTE(review): reply_file below embeds what looks like real database
								    # credentials and internal hosts -- consider replacing them with
								    # placeholders. Its "user"/"passwd" keys also differ from the
								    # "username"/"password" keys that mysqlData() reads; confirm which
								    # config is actually passed to mysqlData().

								    inputdata={

								    "metadata":{

								        "output":{

								            "output_type":"table",

								            "label_col":[

								                "软件著作抽取结果"

								            ]

								        },

								        "input":{

								            "input_type":"text",

								            "label":[

								                "7_软件著作过滤器"

								            ]

								        },

								        "address":"http://172.18.1.181:9011/chatGpt/",

								        "admin":{

								            "user_file": "12_任务拆分",

								            "post_file": "13_获取发帖信息",

								            "reply_file": {

								                "taskId": "1_twitter采集:$.taskId",

								                "host": "172.24.12.126",

								                "user": "root",

								                "passwd": "baifendian123",

								                "db": "analyze",

								                "port": 3306

								            }

								        },

								        "index":1

								    },

								    "data":{

								        "sgwg":"[{ \"taskId\":\"http://172.18.1.130:9985/group33/default/20230816/16/05/1/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileId\":\"cd6592f0389bb1da25afbb44901f9cde\",\"fileName\":\"1-基于时间序列遥感 影像洪涝检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/08/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileId\":\"944eec1cf98f216ea953459dac4dd505\",\"fileName\":\"3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/09/1/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileId\":\"eb378cb9ee914323f601500378dfad76\",\"fileName\":\"4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\" }]",

								        "1_twitter采集":"{\"taskId\":100}",

								        "3_OCR识别内容":"{\"content\":\"  22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号：软著登字第1623261号\\n软件名称：\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人：中国科学院遥感与数字地球研究所\\n开发完成日期：2016年08月01日\\n首次发表日期：未发表\\n权利取得方式：原始取得\\n权利范围：全部权利\\n登记号：2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定，经中国版权保护中心审核，对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}",

								        "businessKey":"185aef3b1c810799a6be8314abf6512c",

								        "7_软件著作过滤器":"{\"content\":\"  22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号：软著登字第1623261号\\n软件名称：\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人：中国科学院遥感与数字地球研究所\\n开发完成日期：2016年08月01日\\n首次发表日期：未发表\\n权利取得方式：原始取得\\n权利范围：全部权利\\n登记号：2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定，经中国版权保护中心审核，对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}"

								    },

								    "created":1691004265000,

								    "module":"OCR",

								    "start_tag":"false",

								    "last_edit":1692464331000,

								    "next_app_id":[

								        {

								            "start_id":86,

								            "edge_id":49,

								            "end_id":90

								        }

								    ],

								    "transfer_id":11,

								    "blueprint_id":3,

								    "scenes_id":3,

								    "scenario":{

								        "dataloss":1,

								        "autoCommitTriggerLast":1,

								        "maxErrors":3,

								        "autoCommit":1,

								        "freshVariables":1

								    },

								    "wait_condition":[


								    ],

								    "scheduling":{

								        "interval":-1,

								        "type":"single"

								    },

								    "name":"软件著作抽取",

								    "businessKey":"185aef3b1c810799a6be8314abf6512c",

								    "id":86,

								    "describe":"软件著作抽取"

								}

								    # Resolve the task id from the sample payload and show it.

								    a=get_taskId(inputdata)

								    print(a)