# coding: utf-8
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
from tqdm import tqdm
import datetime
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
import joblib
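
# This script builds three groups of features for every source post in the
# propagation file -- post-network measures (layer/shape/degree, closeness
# centrality, pagerank), main-post text features (language, length, special
# tokens, sentiment, word-frequency statistics) and user features (for the
# author and the averaged repost chain) -- and then applies the pre-trained
# classifier in fake_news_model.pkl to flag suspected fake news.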


def pre_user(data_user):
    """Normalise the user table: prefix account names with '@', binarise the
    optional profile fields, drop missing/duplicate rows and cast counts to int."""
    data_user['accountName'] = data_user['accountName'].apply(lambda x: '@' + x)
    data_user['otherInfo'] = 1 - pd.isnull(data_user['otherInfo']).astype(int)
    data_user['nickName'] = 1 - pd.isnull(data_user['nickName']).astype(int)
    data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
    data_user = data_user.dropna()
    data_user = data_user.drop_duplicates().reset_index(drop=True)
    data_user['fansCount'] = data_user['fansCount'].astype(int)
    data_user['likeCount'] = data_user['likeCount'].astype(int)
    data_user['postCount'] = data_user['postCount'].astype(int)
    data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
    return data_user


def getText_count_eng(txt):
    """English word-frequency count: lower-case the text, replace punctuation
    with spaces, drop stopwords and return a DataFrame of (word, count) rows."""
    txt = txt.lower()  # lower-case everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                counts[word] = counts.get(word, 0) + 1  # count occurrences
    items = pd.DataFrame(list(counts.items()))
    return items
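
# Illustrative example (output order may vary; assumes these words are not in
# wordcloud's STOPWORDS):
#   getText_count_eng("Fake news, fake claims!")
#   -> a two-column DataFrame of (word, count) rows such as
#      ('fake', 2), ('news', 1), ('claims', 1)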


def getText_count_ch(txt):
    """Chinese word-frequency count: strip punctuation, digits and Latin letters,
    segment with jieba and return a DataFrame of (word, count) rows, keeping only
    words of two or more characters."""
    txt = txt.lower()  # lower-case everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz':  # delete special symbols, digits and Latin letters
        txt = txt.replace(ch, "")
    words = jieba.lcut(txt)
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    fin_items = []
    for item in items:
        if len(item[0]) >= 2:
            fin_items.append(item)
    fin_items = pd.DataFrame(fin_items)
    return fin_items


def getText_count_U(txt):
    """Count fully upper-case words (stopwords excluded) and return the total."""
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                if word.isupper():  # keep only fully upper-case words
                    counts[word] = counts.get(word, 0) + 1  # count occurrences
    items = pd.DataFrame(list(counts.items()))  # dict -> DataFrame of (word, count)
    if items.shape == (0, 0):
        out = 0
    else:
        out = sum(items[1])
    return out


def is_chinese(strs):
    """Return True if every character (after lower-casing) is a CJK character
    or a Latin letter a-z."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
            return False
    return True


def is_eng(strs):
    """Return True if every character (after lower-casing) is a Latin letter a-z."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar):
            return False
    return True
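
# Behaviour examples (both functions reject spaces, digits and punctuation,
# which the callers strip beforehand):
#   is_eng('Hello')       -> True
#   is_eng('你好')         -> False
#   is_chinese('你好')     -> True
#   is_chinese('你好abc')  -> True   # mixed CJK + Latin letters still passes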


def post_related(df, data_user):
    """Build one feature row per source post: propagation-network measures,
    main-post text features and (merged at the end) user features."""
    # postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
    #                                 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
    #                                 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
    #                                 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容',
    #                                 '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
    #                                 '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差',
    #                                 '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值',
    #                                 '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差'])
    postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
                                    'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
                                    '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
                                    'emotion', 'emotion_sub', '最大词频数', '重复词汇占比'])

    for post_id in tqdm(df['所属帖子id'].drop_duplicates().reset_index(drop=True)):

        data = df[df['所属帖子id'] == post_id].reset_index(drop=True)
        data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
                        'shareCount', 'url']
        data = data.drop_duplicates()
        post = data[data['传播层级'] == '1'].head(1).copy()

        ### I. News propagation -- the post network
        ## 1. layer / shape / degree
        post['layer'] = int(max(data['传播层级']))               # depth of the cascade
        post['shape'] = data.shape[0] - 1                        # number of reposts
        post['degree'] = data[data['传播层级'] == '2'].shape[0]  # direct reposts of the root

        ## 2. Overall network measures (post-network measures)
        ### 2.1 Map each 转发来源id to the user it was reposted from
        tmp_zfyh = pd.merge(data[data['传播层级'] != '1']['转发来源id'].drop_duplicates(),
                            data[data['帖子id'].notnull()][['帖子id', '用户名']],
                            left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
        tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
        data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
        post_edge = data[data['传播层级'] != '1'][['用户名', '转发来源用户名']]
        post_edge.columns = ['source', 'target']
        post_edge['count_all'] = 1
        post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
        # post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv', index=False)

        # Turn the weighted edge list into [[source, target, weight], ...] for networkx
        edgeweightset = post_edge[['source', 'target', 'count_all']]
        edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
        for k in range(len(edgeweightset_l)):
            for j in range(edgeweightset.shape[1]):
                edgeweightset_l[k].append(edgeweightset.iloc[k, j])
            # print(k / len(edgeweightset_l))

        if len(edgeweightset_l) == 0:  # no propagation chain
            post['closeness_centrality'] = 1
            post['pagerank'] = 1
        else:
            g = nx.DiGraph()
            g.add_weighted_edges_from(edgeweightset_l)
            centrality = [nx.closeness_centrality(g),
                          nx.pagerank(g)]
            results = []
            nodes = g.nodes()  # node list of the repost network
            for node in nodes:  # collect per-node results as [[node, closeness, pagerank], ...]
                results.append([node,
                                centrality[0][node],
                                centrality[1][node]])
            results = pd.DataFrame(results)
            results.columns = ['node', 'closeness_centrality', 'pagerank']

            post['closeness_centrality'] = results[results['node'] == results[
                results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][
                'closeness_centrality'].iloc[0]
            post['pagerank'] = results[results['node'] ==
                                       results[results['closeness_centrality'] == max(results['closeness_centrality'])][
                                           'node'].iloc[0]]['pagerank'].iloc[0]

            # post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
            # post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]
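
        # Note: the recorded closeness_centrality and pagerank are those of the
        # most central node in the repost network (the node with the maximum
        # closeness centrality), not necessarily of the source post itself.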

        # —————————— hh ——————————
        # Feature not used
        # ## 3. Average influence (shareCount) along the propagation chain
        # tmp = 0
        # for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
        #     tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
        # if tmp == 0:
        #     post['sub_shareCount'] = 0
        # else:
        #     post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]
        # ————————————————————————

        ## II. Main-post text
        # post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
        ## Counts of special tokens in the text (http, @, #)
        post['主贴http'] = post['发表内容'].iloc[0].count('http')
        post['主贴at'] = post['发表内容'].iloc[0].count('@')
        post['主贴tag'] = post['发表内容'].iloc[0].count('#')

        ## Detect the language of the main post
        tmp = post['发表内容'].iloc[0]
        for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| 0123456789':
            tmp = tmp.replace(ch, '')

        if is_eng(tmp):  ## main post in English

            post['语言'] = 0
            text = post['发表内容'].iloc[0]
            # text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
            text = text[0:text.rfind("http")]
            for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| ':
                text = text.replace(ch, ' ')

            ## Text length (number of words)
            words = text.split(' ')
            post['主贴长度'] = len(words)

            ## Sentiment of the text
            # post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
            emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
            post['emotion'] = emo.loc[0, 0]      # polarity
            post['emotion_sub'] = emo.loc[1, 0]  # subjectivity

            ## Word-frequency statistics
            ## 1: maximum word frequency
            ## 2: share of words appearing at least twice in the body
            items = getText_count_eng(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]

            ## 3: count of fully upper-case words
            post['大写词频'] = getText_count_U(text)

        elif is_chinese(tmp):  ## main post in Chinese

            post['语言'] = 1

            text = post['发表内容'].iloc[0]
            text = text[0:text.rfind("http")]
            post['主贴长度'] = len(text)

            post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2  # rescale to [-1, 1]
            post['emotion_sub'] = np.nan
            # post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
            # post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]

            ## Word-frequency statistics
            ## 1: maximum word frequency in the body
            ## 2: share of words appearing at least twice in the body
            items = getText_count_ch(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
            ## 3: count of fully upper-case words (not applicable to Chinese text)
            post['大写词频'] = np.nan

        else:
            post['语言'] = np.nan
            post['主贴长度'] = np.nan
            post['emotion'] = np.nan
            post['emotion_sub'] = np.nan
            post['最大词频数'] = np.nan
            post['重复词汇占比'] = np.nan
            post['大写词频'] = np.nan

        # ## 4.2 Text along the propagation chain (feature group currently disabled)
        # sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']])
        # sub_post['语言'] = np.nan
        # sub_post['文本长度'] = np.nan
        # sub_post['http'] = np.nan
        # sub_post['at'] = np.nan
        # sub_post['tag'] = np.nan
        # sub_post['emotion'] = np.nan
        # sub_post['emotion_sub'] = np.nan
        # sub_post['diffdate'] = np.nan
        #
        # for k in range(sub_post.shape[0]):
        #     ## Counts of special tokens in the text (http, @, #)
        #     sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
        #     sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
        #     sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
        #
        #     ## Time difference to the source post
        #     d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S")
        #     base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S")
        #     # now = datetime.datetime.now()
        #     sub_post['diffdate'].iloc[k] = (d1 - base).days
        #
        #     ## Detect the language
        #     tmp = sub_post['发表内容'].iloc[k]
        #     for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
        #         tmp = tmp.replace(ch, '')
        #
        #     if is_eng(tmp):  ## English content
        #         sub_post['语言'].iloc[k] = 0
        #
        #         ## Text length
        #         text = sub_post['发表内容'].iloc[k]
        #         # text = "'America is collapsing and it's China's fault' is definitely a change of direction?"
        #         text = text[0:text.rfind("http")]
        #         for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
        #             text = text.replace(ch, ' ')
        #         words = text.split(' ')
        #         sub_post['文本长度'].iloc[k] = len(words)
        #         ## Sentiment
        #         sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
        #         sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0]
        #         sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0]
        #
        #     elif is_chinese(tmp):  ## Chinese content
        #         sub_post['语言'].iloc[k] = 1
        #
        #         ## Text length
        #         text = sub_post['发表内容'].iloc[k]
        #         text = text[0:text.rfind("http")]
        #         sub_post['文本长度'].iloc[k] = len(text)
        #         ## Sentiment
        #         sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2
        #         sub_post['emotion_sub'].iloc[k] = np.nan
        #
        #     else:
        #         sub_post['语言'].iloc[k] = np.nan
        #         sub_post['文本长度'].iloc[k] = np.nan
        #         sub_post['emotion'].iloc[k] = np.nan
        #         sub_post['emotion_sub'].iloc[k] = np.nan
        #
        # if sub_post.shape[0] == 0:
        #     post['有无传播内容'] = 0
        # else:
        #     post['有无传播内容'] = 1
        #
        # post['传播链语言均值'] = sub_post['语言'].mean()
        # post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
        # post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
        #
        # ## emotion_sub: mean over the rows that have a value
        # post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
        #
        # post['传播链贴文http均值'] = sub_post['http'].mean()
        # post['传播链贴文at均值'] = sub_post['at'].mean()
        # post['传播链贴文tag均值'] = sub_post['tag'].mean()
        # post['diffdate均值'] = sub_post['diffdate'].mean()
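
        # The chain-level text features above are disabled; they are not part of
        # the feature set consumed by the current model (see `features` below).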

        ## III. User information
        ## Author of the source post
        post = pd.merge(post, data_user, how='left', on='用户名')

        ## Users along the propagation chain
        sub_user = pd.DataFrame(data[data['传播层级'] != '1'][['用户名']])
        sub_user = pd.merge(sub_user, data_user, how='left', on='用户名')
        sub_user = sub_user.dropna()

        post['nickName均值'] = sub_user['nickName'].mean()
        post['fansCount均值'] = sub_user['fansCount'].mean()
        post['likeCount均值'] = sub_user['likeCount'].mean()
        post['postCount均值'] = sub_user['postCount'].mean()
        post['otherInfo均值'] = sub_user['otherInfo'].mean()

        postset = pd.concat([postset, post]).reset_index(drop=True)

    postset = postset.fillna(0)
    postset['emotion_degree'] = abs(postset['emotion'])

    return postset


xlsx_path_po = r'假新闻数据输入\传播分析test.xlsx'
data_po = pd.read_excel(xlsx_path_po, dtype="str")
data_user = pd.read_excel(r'假新闻数据输入\用户test.xlsx', dtype="str")
data_user = pre_user(data_user)
# Example of a processed user row: ['@XHNews', 1, 878, 1178, 938, 1]
# with data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']

postset_po = post_related(data_po, data_user)  ## positive-sample file

features = postset_po[[
    # 'shareCount',
    'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
    '主贴http', '主贴at', '主贴tag',
    '主贴长度', 'emotion', 'emotion_degree',
    '最大词频数', '重复词汇占比',  # (differs strongly between Chinese and English)
    # '有无传播内容',
    'fansCount', 'likeCount', 'postCount',
    # 'sub_shareCount',
    'fansCount均值', 'postCount均值', 'otherInfo均值'
]]
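
# Note: the selected columns must match, in name and order, the features the
# classifier in fake_news_model.pkl was trained on (presumably a scikit-learn
# model, given the commented-out RandomForestClassifier/GridSearchCV imports).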

clf = joblib.load(r'fake_news_model.pkl')
clf_predict = clf.predict(features)
print(clf_predict)
res = pd.DataFrame(clf_predict)
res.columns = ['假新闻预测结果']
result = pd.concat([postset_po, res], axis=1)
result.to_excel('test_1209_1.xlsx', index=False)
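
# Illustrative sketch only (not part of this pipeline): judging from the
# commented-out sklearn imports above, fake_news_model.pkl was presumably
# produced by something like the following, where `train_features` and
# `train_labels` stand for a hypothetical labelled training table with the same
# columns as `features`:
#
#   from sklearn.model_selection import train_test_split, GridSearchCV
#   from sklearn.ensemble import RandomForestClassifier
#
#   X_train, X_test, y_train, y_test = train_test_split(
#       train_features, train_labels, test_size=0.2, random_state=42)
#   search = GridSearchCV(RandomForestClassifier(random_state=42),
#                         {'n_estimators': [100, 300], 'max_depth': [None, 10]},
#                         cv=5)
#   search.fit(X_train, y_train)
#   joblib.dump(search.best_estimator_, 'fake_news_model.pkl')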