robotIdentificationTopic/text_analysis/tools/bak/tool.py

#coding:utf8
import re
import pymysql
import pandas as pd
import numpy as np
import networkx as nx
import traceback

def _quote_identifier(name):
    """Return *name* as a backtick-quoted MySQL identifier.

    Accepts ``table``, ``db.table`` and already back-quoted forms. Each
    dot-separated part is wrapped in backticks with embedded backticks
    doubled, so the value cannot terminate the identifier and inject SQL.

    :raises ValueError: if any part of the name is empty.
    """
    quoted = []
    for part in str(name).split("."):
        part = part.strip().strip("`")
        if not part:
            raise ValueError("illegal table name: {!r}".format(name))
        quoted.append("`" + part.replace("`", "``") + "`")
    return ".".join(quoted)


def mysqlData(inputdata,logging):
    """Fetch the (ReviewerAccountId, PostAccountId) pairs of one topic.

    :param inputdata: mapping with the connection settings ``host``, ``user``,
        ``passwd``, ``db``, ``port`` plus the ``table`` to read and the
        ``topicId`` to filter on.
    :param logging: logger-like object; only its ``info`` method is used.
    :return: whatever ``cursor.fetchall()`` yields (rows as dicts, because a
        ``DictCursor`` is requested) on success, or the empty string ``''``
        when anything fails. Failures are logged, never raised.
    """
    result = ''
    db = None
    try:
        # The table name cannot be sent as a bound parameter, so it is quoted
        # as an identifier; '%' is doubled because the driver %-formats the
        # statement when arguments are supplied.
        table = _quote_identifier(inputdata["table"]).replace("%", "%%")
        sql = "SELECT ReviewerAccountId, PostAccountId FROM {} WHERE topicId=%s".format(table)
        topic_id = inputdata["topicId"]

        db = pymysql.connect(host=inputdata["host"], user=inputdata["user"], passwd=inputdata["passwd"],
                             db=inputdata["db"], port=inputdata["port"], charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor, connect_timeout=30)
        db.ping(reconnect=True)
        with db.cursor() as cursor:
            # topicId travels as a bound parameter instead of being pasted
            # into the SQL text.
            cursor.execute(sql, (topic_id,))
            result = cursor.fetchall()
        db.commit()
    except Exception:
        # Best-effort contract: report the failure and fall through to the
        # default return value.
        logging.info("专题关系数据查询失败！")
        logging.info(traceback.format_exc())
    finally:
        # Release the connection on the failure path as well.
        if db is not None and db.open:
            db.close()

    return result

def get_replyData(data):
    """Build the reviewer -> poster reply graph and return per-node metrics.

    :param data: rows (anything ``pd.DataFrame`` accepts) that carry the
        columns ``ReviewerAccountId`` and ``PostAccountId``.
    :return: ``{node_id: {metric: value}}`` for every node that occurs as a
        ``PostAccountId``; the metrics are ``degree``, ``in_degree``,
        ``out_degree``, ``degree_centrality``, ``closeness_centrality``,
        ``pagerank`` and ``clustering``. An empty dict is returned when
        *data* is ``None`` or has no rows (e.g. the ``''`` / ``()`` produced
        by a failed or empty query).
    """
    if data is None or (hasattr(data, "__len__") and len(data) == 0):
        return {}

    # De-duplicate the raw rows, then keep only the two id columns as strings.
    reply = pd.DataFrame(data).drop_duplicates().reset_index(drop=True)
    reply = reply[['ReviewerAccountId', 'PostAccountId']].astype(str)

    # One weighted edge per (reviewer, poster) pair; weight = number of rows.
    edges = reply.groupby(['ReviewerAccountId', 'PostAccountId']).size().reset_index()
    edges = edges.fillna(0)
    edges.columns = ['source', 'target', 'count']

    g = nx.DiGraph()
    g.add_weighted_edges_from(edges.itertuples(index=False, name=None))

    degree = g.degree()
    in_degree = g.in_degree()
    out_degree = g.out_degree()
    degree_centrality = nx.degree_centrality(g)
    closeness_centrality = nx.closeness_centrality(g)
    pagerank = nx.pagerank(g)
    clustering = nx.clustering(g)

    # Only report accounts that authored a post, i.e. nodes that occur as an
    # edge target. A set keeps the membership test O(1) per node.
    targets = set(edges['target'])
    rows = [[node,
             degree[node],
             in_degree[node],
             out_degree[node],
             degree_centrality[node],
             closeness_centrality[node],
             pagerank[node],
             clustering[node]]
            for node in g.nodes() if node in targets]

    metric_columns = ['degree', 'in_degree', 'out_degree',
                      'degree_centrality', 'closeness_centrality', 'pagerank', 'clustering']
    node_list = pd.DataFrame(rows, columns=['Id'] + metric_columns)

    # NOTE(review): these two flags are computed but are not part of the
    # returned dict - confirm whether callers are supposed to receive them.
    # `.loc` is used because chained assignment does not reliably write back.
    node_list['user_flag_infl'] = 0
    node_list['user_flag_act'] = 0
    influential = node_list['out_degree'] > np.percentile(node_list['out_degree'], 95)
    active = ((node_list['in_degree'] > np.percentile(node_list['in_degree'], 90)) &
              (node_list['closeness_centrality'] > np.percentile(node_list['closeness_centrality'], 50)))
    node_list.loc[influential, 'user_flag_infl'] = 1
    node_list.loc[active, 'user_flag_act'] = 1

    # Transpose-then-to_dict is kept so the output layout stays unchanged.
    node_dic = node_list.set_index('Id')[metric_columns].T.to_dict()
    return node_dic


def _fill_placeholders(pattern, values, text):
    """Replace each ``pattern`` match in *text* whose key is in *values*.

    The first capture group is the lookup key. Matches with an unknown key
    are left untouched. A replacement function is used so the substituted
    value is inserted verbatim: backslashes in it are not interpreted as
    regex replacement escapes.
    """
    def substitute(match):
        key = match.group(1)
        return values[key] if key in values else match.group(0)

    return re.sub(pattern, substitute, text)


def get_content(inputdata,logging):
    """Assemble the request parameters from the raw JSON payload.

    ``{{key}}`` placeholders in the admin prompt are filled from
    ``inputdata["metadata"]["user"]`` (if present) and ``@@key@@``
    placeholders from ``inputdata["data"]``. User placeholders are expanded
    first, then data placeholders. Placeholders whose key is unknown stay in
    the prompt unchanged.

    :param inputdata: original JSON data; must contain ``metadata.admin``
        (with ``prompt``, ``authorization``, ``model``, ``temperature``,
        ``top_p``, ``n``) and ``data``.
    :param logging: accepted for interface compatibility; not used here.
    :return: dict with the assembled ``prompt`` and the other parameters
        copied from ``metadata.admin``.
    """
    metadata = inputdata["metadata"]
    admin = metadata["admin"]
    data = inputdata["data"]
    user_data = metadata.get("user") or {}

    # Non-greedy groups so that several placeholders on one line are matched
    # individually instead of as one span.
    prompt = _fill_placeholders(r"\{\{(.*?)\}\}", user_data, admin["prompt"])
    prompt = _fill_placeholders(r"@@(.*?)@@", data, prompt)

    res = {"prompt": prompt}
    for key in ("authorization", "model", "temperature", "top_p", "n"):
        res[key] = admin[key]
    return res


if __name__=="__main__":
    # Manual smoke test: run get_content on a sample pipeline payload and
    # print the assembled request parameters.
    import os

    inputdata={
    "metadata":{
        "output":{
            "output_type":"table",
            "label_col":[
                "软件著作抽取结果"
            ]
        },
        "input":{
            "input_type":"text",
            "label":[
                "7_软件著作过滤器"
            ]
        },
        "address":"http://172.18.1.181:9011/chatGpt/",
        "admin":{
            # SECURITY: the API key used to be hard-coded here. It is now read
            # from the environment; the previously committed key should be
            # treated as leaked and revoked.
            "authorization":os.environ.get("OPENAI_API_KEY",""),
            "top_p":"1",
            "user_input":[
                {
                    "keyname":"tag",
                    "keydesc":""
                }
            ],
            "temperature":"0.2",
            "model":"gpt-3.5-turbo-16k",
            "prompt":"请在下面这句话中提取出：证书号、软件名称、著作权人，以json格式输出，找不到的字段赋值为空字符串，不要有多余的文字输出，只输出json结构。@@7_软件著作过滤器@@",
            "n":"1"
        },
        "index":1
    },
    "data":{
        "1_项目文件上传":"[{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/05/1/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileId\":\"cd6592f0389bb1da25afbb44901f9cde\",\"fileName\":\"1-基于时间序列遥感 影像洪涝检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/08/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileId\":\"944eec1cf98f216ea953459dac4dd505\",\"fileName\":\"3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/09/1/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileId\":\"eb378cb9ee914323f601500378dfad76\",\"fileName\":\"4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\" }]",
        "2_文件分类信息":"{\"软件著作\":4}",
        "3_OCR识别内容":"{\"content\":\"  22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号：软著登字第1623261号\\n软件名称：\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人：中国科学院遥感与数字地球研究所\\n开发完成日期：2016年08月01日\\n首次发表日期：未发表\\n权利取得方式：原始取得\\n权利范围：全部权利\\n登记号：2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定，经中国版权保护中心审核，对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}",
        "businessKey":"185aef3b1c810799a6be8314abf6512c",
        "7_软件著作过滤器":"{\"content\":\"  22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号：软著登字第1623261号\\n软件名称：\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人：中国科学院遥感与数字地球研究所\\n开发完成日期：2016年08月01日\\n首次发表日期：未发表\\n权利取得方式：原始取得\\n权利范围：全部权利\\n登记号：2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定，经中国版权保护中心审核，对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}"
    },
    "created":1691004265000,
    "module":"OCR",
    "start_tag":"false",
    "last_edit":1692464331000,
    "next_app_id":[
        {
            "start_id":86,
            "edge_id":49,
            "end_id":90
        }
    ],
    "transfer_id":11,
    "blueprint_id":3,
    "scenes_id":3,
    "scenario":{
        "dataloss":1,
        "autoCommitTriggerLast":1,
        "maxErrors":3,
        "autoCommit":1,
        "freshVariables":1
    },
    "wait_condition":[

    ],
    "scheduling":{
        "interval":-1,
        "type":"single"
    },
    "name":"软件著作抽取",
    "businessKey":"185aef3b1c810799a6be8314abf6512c",
    "id":86,
    "describe":"软件著作抽取"
}
    # get_content does not use its logger argument, so a placeholder is passed.
    result=get_content(inputdata,"")
    print(result)