# -*- coding: utf-8 -*-
"""
Created on Wed Sep 13 18:13:03 2023

@author: chong
"""
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# from sklearn import metrics
import joblib
# import matplotlib.pyplot as plt
# import seaborn as sns


def pre_user(data_user):
    """Clean the Twitter account table and binarise the optional profile fields."""
    data_user['accountName'] = data_user['accountName'].apply(lambda x: '@' + x)
    data_user['otherInfo'] = 1 - pd.isnull(data_user['otherInfo']).astype(int)
    data_user['nickName'] = 1 - pd.isnull(data_user['nickName']).astype(int)
    data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
    data_user = data_user.dropna()
    data_user = data_user.drop_duplicates().reset_index(drop=True)
    data_user['fansCount'] = data_user['fansCount'].astype(int)
    data_user['likeCount'] = data_user['likeCount'].astype(int)
    data_user['postCount'] = data_user['postCount'].astype(int)
    data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
    return data_user


def getText_count_eng(txt):
    """English word-frequency count."""
    txt = txt.lower()  # lowercase everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                counts[word] = counts.get(word, 0) + 1  # count occurrences
    items = pd.DataFrame(list(counts.items()))
    return items


def getText_count_ch(txt):
    """Chinese word-frequency count."""
    txt = txt.lower()  # lowercase everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz':  # remove special symbols, digits and Latin letters
        txt = txt.replace(ch, "")
    words = jieba.lcut(txt)
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    fin_items = []
    for item in items:
        if len(item[0]) >= 2:  # keep words of at least two characters
            fin_items.append(item)
    fin_items = pd.DataFrame(fin_items)
    return fin_items


def getText_count_U(txt):
    """Count all-uppercase English words."""
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                if word.isupper():  # keep only all-uppercase tokens
                    counts[word] = counts.get(word, 0) + 1
    items = pd.DataFrame(list(counts.items()))  # convert the counts dict to a DataFrame
    if items.shape == (0, 0):
        out = 0
    else:
        out = sum(items[1])
    return out


def is_chinese(strs):
    """Return True if the string contains only Chinese characters and English letters."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
            return False
    return True


def is_eng(strs):
    """Return True if the string contains only English letters."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar):
            return False
    return True


def post_related(df, data_user):
    """Build one feature row per main post from its propagation records."""
    postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间',
                                    'shareCount', 'url', 'layer', 'shape', 'degree', 'closeness_centrality',
                                    'pagerank', 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
                                    'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容',
                                    '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
                                    '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差',
                                    '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差',
                                    '传播链贴文at均值', '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差',
                                    'diffdate均值', 'diffdate标准差'])
    for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop=True):
        data = df[df['所属帖子id'] == post_id].reset_index(drop=True)
        data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', 'url']
        data = data.drop_duplicates()
        post = data[data['传播层级'] == '1'].head(1)

        ### I. News propagation -- the post network
        ## 1. layer / shape / degree
        post['layer'] = int(max(data['传播层级']))
        post['shape'] = data.shape[0] - 1
        post['degree'] = data[data['传播层级'] == '2'].shape[0]

        ## 2. Whole-network measures (post-network measures)
        ### 2.1 Map each retweet-source id to its retweet-source user
        tmp_zfyh = pd.merge(data[data['传播层级'] != '1']['转发来源id'].drop_duplicates(),
                            data[data['帖子id'].notnull()][['帖子id', '用户名']],
                            left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
        tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
        data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
        post_edge = data[data['传播层级'] != '1'][['用户名', '转发来源用户名']]
        post_edge.columns = ['source', 'target']
        # tmp1 = data[(data['帖子id'].notnull()) & (data['传播层级'] != '1')][['帖子id', '转发来源id']]
        # tmp2 = data[data['帖子id'].isnull()][['用户名', '转发来源id']]
        # tmp1.columns = ['source', 'target']
        # tmp2.columns = ['source', 'target']
        # post_edge = pd.concat([tmp1, tmp2])
        post_edge['count_all'] = 1
        post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
        # post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv', index=False)
        edgeweightset = post_edge[['source', 'target', 'count_all']]
        edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
        for k in range(len(edgeweightset_l)):
            for j in range(edgeweightset.shape[1]):
                edgeweightset_l[k].append(edgeweightset.iloc[k, j])
            # print(k / len(edgeweightset_l))
        if len(edgeweightset_l) == 0:  # no diffusion chain
            post['closeness_centrality'] = 1
            post['pagerank'] = 1
        else:
            g = nx.DiGraph()
            g.add_weighted_edges_from(edgeweightset_l)
            centrality = [nx.closeness_centrality(g), nx.pagerank(g)]
            results = []
            nodes = g.nodes()  # node list of the network
            for node in nodes:  # store each node's scores as [node, closeness, pagerank]
                results.append([node, centrality[0][node], centrality[1][node]])
            results = pd.DataFrame(results)
            results.columns = ['node', 'closeness_centrality', 'pagerank']
            top_idx = results['closeness_centrality'].idxmax()  # node with the highest closeness centrality
            post['closeness_centrality'] = results.loc[top_idx, 'closeness_centrality']
            post['pagerank'] = results.loc[top_idx, 'pagerank']
            # post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
            # post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]

        ## 3. Average influence (shareCount) along the diffusion chain
        tmp = 0
        for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
            tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
        if tmp == 0:
            post['sub_shareCount'] = 0
        else:
            post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]

        ## II. Main-post text
        # post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
        ## counts of special tokens in the main post (http, @, #)
        post['主贴http'] = post['发表内容'].iloc[0].count('http')
        post['主贴at'] = post['发表内容'].iloc[0].count('@')
        post['主贴tag'] = post['发表内容'].iloc[0].count('#')
        ## detect the language of the main post
        tmp = post['发表内容'].iloc[0]
        for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| 0123456789':
            tmp = tmp.replace(ch, '')
        if is_eng(tmp):  ## the main post is English
            post['语言'] = 0
            text = post['发表内容'].iloc[0]
            # text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
            text = text[0:text.rfind("http")]
            for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| ':
                text = text.replace(ch, ' ')
            ## text length
            words = text.split(' ')
            post['主贴长度'] = len(words)
            ## text sentiment
            # post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
            emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
            post['emotion'] = emo.loc[0, 0]
            post['emotion_sub'] = emo.loc[1, 0]
            ## word frequencies
            ## stat 1: maximum word frequency
            ## stat 2: share of words appearing at least twice in the body
            items = getText_count_eng(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
            ## stat 3: count of all-uppercase words
            post['大写词频'] = getText_count_U(text)
        elif is_chinese(tmp):  ## the main post is Chinese
            post['语言'] = 1
            text = post['发表内容'].iloc[0]
            text = text[0:text.rfind("http")]
            post['主贴长度'] = len(text)
            post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2
            post['emotion_sub'] = np.nan
            # post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
            # post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]
            ## word frequencies
            ## stat 1: maximum frequency in the body of words that also appear in the title
            ## stat 2: share of words appearing at least twice in the body
            items = getText_count_ch(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
            ## stat 3: count of all-uppercase words (not applicable to Chinese)
            post['大写词频'] = np.nan
        else:
            post['语言'] = np.nan
            post['主贴长度'] = np.nan
            post['emotion'] = np.nan
            post['emotion_sub'] = np.nan
            post['最大词频数'] = np.nan
            post['重复词汇占比'] = np.nan
            post['大写词频'] = np.nan

        ## 4.2 Text along the diffusion chain
        sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']])
        sub_post['语言'] = np.nan
        sub_post['文本长度'] = np.nan
        sub_post['http'] = np.nan
        sub_post['at'] = np.nan
        sub_post['tag'] = np.nan
        sub_post['emotion'] = np.nan
        sub_post['emotion_sub'] = np.nan
        sub_post['diffdate'] = np.nan
        for k in range(sub_post.shape[0]):
            ## counts of special tokens in the text (http, @, #)
            sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
            sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
            sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
            ## time difference (in days) from the main post
            d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S")
            base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S")
            # now = datetime.datetime.now()
            sub_post['diffdate'].iloc[k] = (d1 - base).days
            ## detect language
            tmp = sub_post['发表内容'].iloc[k]
            for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| 0123456789':
                tmp = tmp.replace(ch, '')
            if is_eng(tmp):  ## English content
                sub_post['语言'].iloc[k] = 0
                ## text length
                text = sub_post['发表内容'].iloc[k]
                # text = "'America is collapsing and it's China's fault' is definitely a change of direction?"
                text = text[0:text.rfind("http")]
                for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| ':
                    text = text.replace(ch, ' ')
                words = text.split(' ')
                sub_post['文本长度'].iloc[k] = len(words)
                ## sentiment
                sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
                sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0]
                sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0]
            elif is_chinese(tmp):  ## Chinese content
                sub_post['语言'].iloc[k] = 1
                ## text length
                text = sub_post['发表内容'].iloc[k]
                text = text[0:text.rfind("http")]
                sub_post['文本长度'].iloc[k] = len(text)
                ## sentiment
                sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2
                sub_post['emotion_sub'].iloc[k] = np.nan
            else:
                sub_post['语言'].iloc[k] = np.nan
                sub_post['文本长度'].iloc[k] = np.nan
                sub_post['emotion'].iloc[k] = np.nan
                sub_post['emotion_sub'].iloc[k] = np.nan
        if sub_post.shape[0] == 0:
            post['有无传播内容'] = 0
        else:
            post['有无传播内容'] = 1
        post['传播链语言均值'] = sub_post['语言'].mean()
        post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
        post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
        ## emotion_sub: mean over non-null values only
        post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
        post['传播链贴文http均值'] = sub_post['http'].mean()
        post['传播链贴文at均值'] = sub_post['at'].mean()
        post['传播链贴文tag均值'] = sub_post['tag'].mean()
        post['diffdate均值'] = sub_post['diffdate'].mean()

        ## III. User information
        ## user of the main post
        post = pd.merge(post, data_user, how='left', on='用户名')
        ## users along the diffusion chain
        sub_user = pd.DataFrame(data[data['传播层级'] != '1'][['用户名']])
        sub_user = pd.merge(sub_user, data_user, how='left', on='用户名')
        sub_user = sub_user.dropna()
        post['nickName均值'] = sub_user['nickName'].mean()
        post['fansCount均值'] = sub_user['fansCount'].mean()
        post['likeCount均值'] = sub_user['likeCount'].mean()
        post['postCount均值'] = sub_user['postCount'].mean()
        post['otherInfo均值'] = sub_user['otherInfo'].mean()
        postset = pd.concat([postset, post]).reset_index(drop=True)
    postset = postset.fillna(0)
    postset['emotion_degree'] = abs(postset['emotion'])
    return postset


xlsx_path_po = r'假新闻数据输入\传播分析1209.xlsx'
xlsx_path_ne = r'假新闻数据输入\传播分析1220.xlsx'
data_po = pd.read_excel(xlsx_path_po, dtype="str")
data_ne = pd.read_excel(xlsx_path_ne, dtype="str")
data_user = pd.read_excel(r'假新闻数据输入\Twitter_Account.xlsx', dtype="str")
data_user = pre_user(data_user)

postset_po = post_related(data_po, data_user)  ## positive-class file
postset_ne = post_related(data_ne, data_user)  ## negative-class file
postset_po['y'] = 1
postset_ne['y'] = 0
postset = pd.concat([postset_po, postset_ne]).drop_duplicates().reset_index(drop=True)

features = postset[[
    # 'shareCount',
    'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
    '主贴http', '主贴at', '主贴tag', '主贴长度', 'emotion', 'emotion_degree',
    '最大词频数', '重复词汇占比',  # (differs a lot between Chinese and English)
    # '有无传播内容',
    'fansCount', 'likeCount', 'postCount',
    # 'sub_shareCount',
    'fansCount均值', 'postCount均值', 'otherInfo均值'
    # , '结果'
]]
target = pd.DataFrame(postset[postset.columns[-1]], columns=[postset.columns[-1]])

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=123)
RF_model = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
params = {"n_estimators": range(10, 101, 10)}
clf = GridSearchCV(estimator=RF_model, param_grid=params, cv=10)
clf.fit(X_train, y_train.values.ravel())
print(clf.best_params_)  # best n_estimators found by the grid search
clf_predict = clf.predict(X_test)

joblib.dump(clf, r'F:\项目文件\情报\假新闻\fake_news_model.pkl')
clf = joblib.load(r'F:\项目文件\情报\假新闻\fake_news_model.pkl')
clf_predict = clf.predict(features)

# cm5 = pd.crosstab(clf_predict, target.y)
# sns.heatmap(cm5, annot=True, cmap='GnBu', fmt='d')
# plt.xlabel('Real')
# plt.ylabel('Predict')
# plt.show()
# accuracy_rate = sum(clf_predict == target.y) / len(target.y)
# target = pd.get_dummies(target)['y']
# sum((clf_predict == target) & (target == 1)) / sum(clf_predict == 1)
# sum((clf_predict == target) & (target == 0)) / sum(clf_predict == 0)
# print('Model accuracy:\n', accuracy_rate)
# print('Classification report:\n', metrics.classification_report(target, clf_predict))
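
# A minimal evaluation sketch added for illustration, not part of the original pipeline:
# it scores the grid-searched model on the held-out split produced by train_test_split
# above, rather than on the full feature table. It relies only on objects already defined
# in this script (clf, X_test, y_test) plus sklearn.metrics.
from sklearn import metrics

test_predict = clf.predict(X_test)
print('Held-out accuracy:', metrics.accuracy_score(y_test['y'], test_predict))
print('Held-out classification report:\n', metrics.classification_report(y_test['y'], test_predict))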