# coding: utf-8
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
from tqdm import tqdm
import datetime
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
import joblib
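
# This script builds three groups of features for every source post in the
# propagation file -- post-network measures (layer/shape/degree, closeness
# centrality, pagerank), main-post text features (language, length, special
# tokens, sentiment, word-frequency statistics) and user features (for the
# author and the averaged repost chain) -- and then applies the pre-trained
# classifier in fake_news_model.pkl to flag suspected fake news.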


def pre_user(data_user):
    """Normalise the user table: prefix account names with '@', binarise the
    optional profile fields, drop missing/duplicate rows and cast counts to int."""
    data_user['accountName'] = data_user['accountName'].apply(lambda x: '@' + x)
    data_user['otherInfo'] = 1 - pd.isnull(data_user['otherInfo']).astype(int)
    data_user['nickName'] = 1 - pd.isnull(data_user['nickName']).astype(int)
    data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
    data_user = data_user.dropna()
    data_user = data_user.drop_duplicates().reset_index(drop=True)
    data_user['fansCount'] = data_user['fansCount'].astype(int)
    data_user['likeCount'] = data_user['likeCount'].astype(int)
    data_user['postCount'] = data_user['postCount'].astype(int)
    data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
    return data_user


def getText_count_eng(txt):
    """English word-frequency count: lower-case the text, replace punctuation
    with spaces, drop stopwords and return a DataFrame of (word, count) rows."""
    txt = txt.lower()  # lower-case everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                counts[word] = counts.get(word, 0) + 1  # count occurrences
    items = pd.DataFrame(list(counts.items()))
    return items
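
# Illustrative example (output order may vary; assumes these words are not in
# wordcloud's STOPWORDS):
#   getText_count_eng("Fake news, fake claims!")
#   -> a two-column DataFrame of (word, count) rows such as
#      ('fake', 2), ('news', 1), ('claims', 1)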


def getText_count_ch(txt):
    """Chinese word-frequency count: strip punctuation, digits and Latin letters,
    segment with jieba and return a DataFrame of (word, count) rows, keeping only
    words of two or more characters."""
    txt = txt.lower()  # lower-case everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz':  # delete special symbols, digits and Latin letters
        txt = txt.replace(ch, "")
    words = jieba.lcut(txt)
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    fin_items = []
    for item in items:
        if len(item[0]) >= 2:
            fin_items.append(item)
    fin_items = pd.DataFrame(fin_items)
    return fin_items


def getText_count_U(txt):
    """Count fully upper-case words (stopwords excluded) and return the total."""
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                if word.isupper():  # keep only fully upper-case words
                    counts[word] = counts.get(word, 0) + 1  # count occurrences
    items = pd.DataFrame(list(counts.items()))  # dict -> DataFrame of (word, count)
    if items.shape == (0, 0):
        out = 0
    else:
        out = sum(items[1])
    return out


def is_chinese(strs):
    """Return True if every character (after lower-casing) is a CJK character
    or a Latin letter a-z."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
            return False
    return True


def is_eng(strs):
    """Return True if every character (after lower-casing) is a Latin letter a-z."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar):
            return False
    return True
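
# Behaviour examples (both functions reject spaces, digits and punctuation,
# which the callers strip beforehand):
#   is_eng('Hello')       -> True
#   is_eng('你好')         -> False
#   is_chinese('你好')     -> True
#   is_chinese('你好abc')  -> True   # mixed CJK + Latin letters still passes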


def post_related(df, data_user):
    """Build one feature row per source post: propagation-network measures,
    main-post text features and (merged at the end) user features."""
    # postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
    #                                 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
    #                                 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
    #                                 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容',
    #                                 '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
    #                                 '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差',
    #                                 '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值',
    #                                 '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差'])
    postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
                                    'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
                                    '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
                                    'emotion', 'emotion_sub', '最大词频数', '重复词汇占比'])

    for post_id in tqdm(df['所属帖子id'].drop_duplicates().reset_index(drop=True)):

        data = df[df['所属帖子id'] == post_id].reset_index(drop=True)
        data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
                        'shareCount', 'url']
        data = data.drop_duplicates()
        post = data[data['传播层级'] == '1'].head(1).copy()

        ### I. News propagation -- the post network
        ## 1. layer / shape / degree
        post['layer'] = int(max(data['传播层级']))               # depth of the cascade
        post['shape'] = data.shape[0] - 1                        # number of reposts
        post['degree'] = data[data['传播层级'] == '2'].shape[0]  # direct reposts of the root

        ## 2. Overall network measures (post-network measures)
        ### 2.1 Map each 转发来源id to the user it was reposted from
        tmp_zfyh = pd.merge(data[data['传播层级'] != '1']['转发来源id'].drop_duplicates(),
                            data[data['帖子id'].notnull()][['帖子id', '用户名']],
                            left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
        tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
        data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
        post_edge = data[data['传播层级'] != '1'][['用户名', '转发来源用户名']]
        post_edge.columns = ['source', 'target']
        post_edge['count_all'] = 1
        post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
        # post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv', index=False)

        # Turn the weighted edge list into [[source, target, weight], ...] for networkx
        edgeweightset = post_edge[['source', 'target', 'count_all']]
        edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
        for k in range(len(edgeweightset_l)):
            for j in range(edgeweightset.shape[1]):
                edgeweightset_l[k].append(edgeweightset.iloc[k, j])
            # print(k / len(edgeweightset_l))

        if len(edgeweightset_l) == 0:  # no propagation chain
            post['closeness_centrality'] = 1
            post['pagerank'] = 1
        else:
            g = nx.DiGraph()
            g.add_weighted_edges_from(edgeweightset_l)
            centrality = [nx.closeness_centrality(g),
                          nx.pagerank(g)]
            results = []
            nodes = g.nodes()  # node list of the repost network
            for node in nodes:  # collect per-node results as [[node, closeness, pagerank], ...]
                results.append([node,
                                centrality[0][node],
                                centrality[1][node]])
            results = pd.DataFrame(results)
            results.columns = ['node', 'closeness_centrality', 'pagerank']

            post['closeness_centrality'] = results[results['node'] == results[
                results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][
                'closeness_centrality'].iloc[0]
            post['pagerank'] = results[results['node'] ==
                                       results[results['closeness_centrality'] == max(results['closeness_centrality'])][
                                           'node'].iloc[0]]['pagerank'].iloc[0]

            # post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
            # post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]
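
        # Note: the recorded closeness_centrality and pagerank are those of the
        # most central node in the repost network (the node with the maximum
        # closeness centrality), not necessarily of the source post itself.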

        # —————————— hh ——————————
        # Feature not used
        # ## 3. Average influence (shareCount) along the propagation chain
        # tmp = 0
        # for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
        #     tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
        # if tmp == 0:
        #     post['sub_shareCount'] = 0
        # else:
        #     post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]
        # ————————————————————————

        ## II. Main-post text
        # post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
        ## Counts of special tokens in the text (http, @, #)
        post['主贴http'] = post['发表内容'].iloc[0].count('http')
        post['主贴at'] = post['发表内容'].iloc[0].count('@')
        post['主贴tag'] = post['发表内容'].iloc[0].count('#')

        ## Detect the language of the main post
        tmp = post['发表内容'].iloc[0]
        for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| 0123456789':
            tmp = tmp.replace(ch, '')

        if is_eng(tmp):  ## main post in English

            post['语言'] = 0
            text = post['发表内容'].iloc[0]
            # text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
            text = text[0:text.rfind("http")]
            for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| ':
                text = text.replace(ch, ' ')

            ## Text length (number of words)
            words = text.split(' ')
            post['主贴长度'] = len(words)

            ## Sentiment of the text
            # post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
            emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
            post['emotion'] = emo.loc[0, 0]      # polarity
            post['emotion_sub'] = emo.loc[1, 0]  # subjectivity

            ## Word-frequency statistics
            ## 1: maximum word frequency
            ## 2: share of words appearing at least twice in the body
            items = getText_count_eng(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]

            ## 3: count of fully upper-case words
            post['大写词频'] = getText_count_U(text)

        elif is_chinese(tmp):  ## main post in Chinese

            post['语言'] = 1

            text = post['发表内容'].iloc[0]
            text = text[0:text.rfind("http")]
            post['主贴长度'] = len(text)

            post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2  # rescale to [-1, 1]
            post['emotion_sub'] = np.nan
            # post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
            # post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]

            ## Word-frequency statistics
            ## 1: maximum word frequency in the body
            ## 2: share of words appearing at least twice in the body
            items = getText_count_ch(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
            ## 3: count of fully upper-case words (not applicable to Chinese text)
            post['大写词频'] = np.nan

        else:
            post['语言'] = np.nan
            post['主贴长度'] = np.nan
            post['emotion'] = np.nan
            post['emotion_sub'] = np.nan
            post['最大词频数'] = np.nan
            post['重复词汇占比'] = np.nan
            post['大写词频'] = np.nan

        # ## 4.2 Text along the propagation chain (feature group currently disabled)
        # sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']])
        # sub_post['语言'] = np.nan
        # sub_post['文本长度'] = np.nan
        # sub_post['http'] = np.nan
        # sub_post['at'] = np.nan
        # sub_post['tag'] = np.nan
        # sub_post['emotion'] = np.nan
        # sub_post['emotion_sub'] = np.nan
        # sub_post['diffdate'] = np.nan
        #
        # for k in range(sub_post.shape[0]):
        #     ## Counts of special tokens in the text (http, @, #)
        #     sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
        #     sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
        #     sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
        #
        #     ## Time difference to the source post
        #     d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S")
        #     base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S")
        #     # now = datetime.datetime.now()
        #     sub_post['diffdate'].iloc[k] = (d1 - base).days
        #
        #     ## Detect the language
        #     tmp = sub_post['发表内容'].iloc[k]
        #     for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
        #         tmp = tmp.replace(ch, '')
        #
        #     if is_eng(tmp):  ## English content
        #         sub_post['语言'].iloc[k] = 0
        #
        #         ## Text length
        #         text = sub_post['发表内容'].iloc[k]
        #         # text = "'America is collapsing and it's China's fault' is definitely a change of direction?"
        #         text = text[0:text.rfind("http")]
        #         for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
        #             text = text.replace(ch, ' ')
        #         words = text.split(' ')
        #         sub_post['文本长度'].iloc[k] = len(words)
        #         ## Sentiment
        #         sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
        #         sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0]
        #         sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0]
        #
        #     elif is_chinese(tmp):  ## Chinese content
        #         sub_post['语言'].iloc[k] = 1
        #
        #         ## Text length
        #         text = sub_post['发表内容'].iloc[k]
        #         text = text[0:text.rfind("http")]
        #         sub_post['文本长度'].iloc[k] = len(text)
        #         ## Sentiment
        #         sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2
        #         sub_post['emotion_sub'].iloc[k] = np.nan
        #
        #     else:
        #         sub_post['语言'].iloc[k] = np.nan
        #         sub_post['文本长度'].iloc[k] = np.nan
        #         sub_post['emotion'].iloc[k] = np.nan
        #         sub_post['emotion_sub'].iloc[k] = np.nan
        #
        # if sub_post.shape[0] == 0:
        #     post['有无传播内容'] = 0
        # else:
        #     post['有无传播内容'] = 1
        #
        # post['传播链语言均值'] = sub_post['语言'].mean()
        # post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
        # post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
        #
        # ## emotion_sub: mean over the rows that have a value
        # post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
        #
        # post['传播链贴文http均值'] = sub_post['http'].mean()
        # post['传播链贴文at均值'] = sub_post['at'].mean()
        # post['传播链贴文tag均值'] = sub_post['tag'].mean()
        # post['diffdate均值'] = sub_post['diffdate'].mean()
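
        # The chain-level text features above are disabled; they are not part of
        # the feature set consumed by the current model (see `features` below).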

        ## III. User information
        ## Author of the source post
        post = pd.merge(post, data_user, how='left', on='用户名')

        ## Users along the propagation chain
        sub_user = pd.DataFrame(data[data['传播层级'] != '1'][['用户名']])
        sub_user = pd.merge(sub_user, data_user, how='left', on='用户名')
        sub_user = sub_user.dropna()

        post['nickName均值'] = sub_user['nickName'].mean()
        post['fansCount均值'] = sub_user['fansCount'].mean()
        post['likeCount均值'] = sub_user['likeCount'].mean()
        post['postCount均值'] = sub_user['postCount'].mean()
        post['otherInfo均值'] = sub_user['otherInfo'].mean()

        postset = pd.concat([postset, post]).reset_index(drop=True)

    postset = postset.fillna(0)
    postset['emotion_degree'] = abs(postset['emotion'])

    return postset


xlsx_path_po = r'假新闻数据输入\传播分析test.xlsx'
data_po = pd.read_excel(xlsx_path_po, dtype="str")
data_user = pd.read_excel(r'假新闻数据输入\用户test.xlsx', dtype="str")
data_user = pre_user(data_user)
# Example of a processed user row: ['@XHNews', 1, 878, 1178, 938, 1]
# with data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']

postset_po = post_related(data_po, data_user)  ## positive-sample file

features = postset_po[[
    # 'shareCount',
    'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
    '主贴http', '主贴at', '主贴tag',
    '主贴长度', 'emotion', 'emotion_degree',
    '最大词频数', '重复词汇占比',  # (differs strongly between Chinese and English)
    # '有无传播内容',
    'fansCount', 'likeCount', 'postCount',
    # 'sub_shareCount',
    'fansCount均值', 'postCount均值', 'otherInfo均值'
]]
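
# Note: the selected columns must match, in name and order, the features the
# classifier in fake_news_model.pkl was trained on (presumably a scikit-learn
# model, given the commented-out RandomForestClassifier/GridSearchCV imports).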

clf = joblib.load(r'fake_news_model.pkl')
clf_predict = clf.predict(features)
print(clf_predict)
res = pd.DataFrame(clf_predict)
res.columns = ['假新闻预测结果']
result = pd.concat([postset_po, res], axis=1)
result.to_excel('test_1209_1.xlsx', index=False)
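
# Illustrative sketch only (not part of this pipeline): judging from the
# commented-out sklearn imports above, fake_news_model.pkl was presumably
# produced by something like the following, where `train_features` and
# `train_labels` stand for a hypothetical labelled training table with the same
# columns as `features`:
#
#   from sklearn.model_selection import train_test_split, GridSearchCV
#   from sklearn.ensemble import RandomForestClassifier
#
#   X_train, X_test, y_train, y_test = train_test_split(
#       train_features, train_labels, test_size=0.2, random_state=42)
#   search = GridSearchCV(RandomForestClassifier(random_state=42),
#                         {'n_estimators': [100, 300], 'max_depth': [None, 10]},
#                         cv=5)
#   search.fit(X_train, y_train)
#   joblib.dump(search.best_estimator_, 'fake_news_model.pkl')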