Fake news detection application
#coding:utf8
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
# import tool
from tqdm import tqdm
import os,sys
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import datetime
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
import joblib
def pre_user(data_user):
    data_user['accountName'] = data_user['accountName'].apply(lambda x: '@' + x)
    data_user['otherInfo'] = 1 - pd.isnull(data_user['otherInfo']).astype(int)
    data_user['nickName'] = 1 - pd.isnull(data_user['nickName']).astype(int)
    data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
    data_user = data_user.dropna()
    data_user = data_user.drop_duplicates().reset_index(drop=True)
    data_user['fansCount'] = data_user['fansCount'].astype(int)
    data_user['likeCount'] = data_user['likeCount'].astype(int)
    data_user['postCount'] = data_user['postCount'].astype(int)
    data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
    return data_user
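# Hedged usage sketch for pre_user (illustrative only; the account values below are made up).
# It shows the expected raw columns and the cleaned output: accountName gets an '@' prefix,
# nickName/otherInfo become 0/1 presence flags, and the count columns are cast to int.
# demo_users = pd.DataFrame({
#     'accountName': ['XHNews'],
#     'nickName': ['Xinhua News'],
#     'fansCount': [878],
#     'likeCount': [1178],
#     'postCount': [938],
#     'otherInfo': [None],
# })
# print(pre_user(demo_users))
# #      用户名  nickName  fansCount  likeCount  postCount  otherInfo
# # 0  @XHNews         1        878       1178        938          0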
def getText_count_eng(txt):
    """English word-frequency count."""
    txt = txt.lower()  # lower-case everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS and word != '\t':
            counts[word] = counts.get(word, 0) + 1  # count occurrences
    items = pd.DataFrame(list(counts.items()))
    return items
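# Quick illustrative check of getText_count_eng (assumed, not part of the pipeline):
# column 0 holds the token and column 1 its frequency, with wordcloud STOPWORDS filtered out.
# freq = getText_count_eng('Fake news spreads fast, fake news spreads wide.')
# # e.g. rows like ('fake', 2), ('news', 2), ('spreads', 2), ('fast', 1), ('wide', 1)
# print(freq.sort_values(by=1, ascending=False).head())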
def getText_count_ch(txt):
    """Chinese word-frequency count."""
    txt = txt.lower()  # lower-case everything
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz':  # strip special symbols, digits and latin letters
        txt = txt.replace(ch, "")
    words = jieba.lcut(txt)
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    items = list(counts.items())
    fin_items = []
    for item in items:
        if len(item[0]) >= 2:  # keep only words of two or more characters
            fin_items.append(item)
    fin_items = pd.DataFrame(fin_items)
    return fin_items
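# Illustrative call of getText_count_ch (assumed): jieba tokenizes the cleaned text and
# single-character tokens are dropped, so only multi-character words are counted.
# freq_ch = getText_count_ch('新闻传播很快，新闻影响很大。')
# # e.g. rows like ('新闻', n) and ('影响', 1); exact segmentation depends on jieba's dictionary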
def getText_count_U(txt):
    """Count fully upper-case English words."""
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()… ':  # replace special symbols with spaces
        txt = txt.replace(ch, " ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS and word != '\t':
            if word.isupper():  # keep only all-caps tokens
                counts[word] = counts.get(word, 0) + 1
    items = pd.DataFrame(list(counts.items()))  # dict -> DataFrame
    if items.shape == (0, 0):
        out = 0
    else:
        out = sum(items[1])
    return out
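# Illustrative check of getText_count_U (assumed): unlike the two functions above it
# returns a scalar, the total count of all-caps tokens.
# print(getText_count_U('BREAKING NEWS: officials deny the claim. FAKE?'))  # 3 -> BREAKING, NEWS, FAKE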
def is_chinese(strs):
    """Return True if the (lower-cased) string contains only Chinese characters and English letters."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
            return False
    return True
def is_eng(strs):
    """Return True if the (lower-cased) string contains only English letters."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar):
            return False
    return True
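# Hedged examples of the two language checks (illustrative): after punctuation and digits
# are stripped upstream, is_eng flags pure-Latin text and is_chinese allows a mix of CJK
# characters and Latin letters.
# print(is_eng('FakeNews'))        # True
# print(is_eng('假新闻'))          # False
# print(is_chinese('假新闻news'))  # True
# print(is_chinese('fake news!'))  # False (space and '!' fall outside both ranges)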
def post_related(df, data_user, logging):
    postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
                                    'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
                                    '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
                                    'emotion', 'emotion_sub', '最大词频数', '重复词汇占比'])
    for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop=True):
        data = df[df['所属帖子id'] == post_id].reset_index(drop=True)
        data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
                        'shareCount', 'url']
        data = data.drop_duplicates()
        post = data[data['传播层级'] == 1].head(1).copy()

        ### Part 1: news propagation -- the repost network
        ## 1. layer / shape / degree
        post['layer'] = int(max(data['传播层级']))
        post['shape'] = data.shape[0] - 1
        post['degree'] = data[data['传播层级'] == 2].shape[0]
        ## 2. whole-network measures of the repost network
        ### 2.1 map each 转发来源id (repost source id) to its source user
        tmp_zfyh = pd.merge(data[data['传播层级'] != 1]['转发来源id'].drop_duplicates(),
                            data[data['帖子id'].notnull()][['帖子id', '用户名']],
                            left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
        tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
        data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
        post_edge = data[data['传播层级'] != 1][['用户名', '转发来源用户名']].copy()
        post_edge.columns = ['source', 'target']
        post_edge['count_all'] = 1
        post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
        # post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv', index=False)
        edgeweightset = post_edge[['source', 'target', 'count_all']]
        edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
        for k in range(len(edgeweightset_l)):
            for j in range(edgeweightset.shape[1]):
                edgeweightset_l[k].append(edgeweightset.iloc[k, j])
        if len(edgeweightset_l) == 0:  # no propagation chain
            post['closeness_centrality'] = 1
            post['pagerank'] = 1
        else:
            g = nx.DiGraph()
            g.add_weighted_edges_from(edgeweightset_l)
            centrality = [nx.closeness_centrality(g),
                          nx.pagerank(g)]
            results = []
            nodes = g.nodes()  # node list of the network
            for node in nodes:  # collect each node's scores as [[node, closeness, pagerank], ...]
                results.append([node,
                                centrality[0][node],
                                centrality[1][node]])
            results = pd.DataFrame(results)
            results.columns = ['node', 'closeness_centrality', 'pagerank']
            # use the closeness and pagerank of the node with the highest closeness centrality
            top_node = results.loc[results['closeness_centrality'].idxmax()]
            post['closeness_centrality'] = top_node['closeness_centrality']
            post['pagerank'] = top_node['pagerank']
            # post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
            # post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]
        # —————————— unused feature ——————————
        # ## 3. average influence (shareCount) along the propagation chain
        # tmp = 0
        # for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
        #     tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
        # if tmp == 0:
        #     post['sub_shareCount'] = 0
        # else:
        #     post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]
        # ————————————————————————————————————
        ## Part 2: text of the main post
        # post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
        ## counts of special tokens in the text (http, @, #)
        # logging.info(post)
        post['主贴http'] = post['发表内容'].iloc[0].count('http')
        post['主贴at'] = post['发表内容'].iloc[0].count('@')
        post['主贴tag'] = post['发表内容'].iloc[0].count('#')

        ## detect the language of the main post
        tmp = post['发表内容'].iloc[0]
        for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| 0123456789':
            tmp = tmp.replace(ch, '')
        if is_eng(tmp):  ## main post is in English
            post['语言'] = 0
            text = post['发表内容'].iloc[0]
            # text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
            text = text[0:text.rfind("http")]  # drop everything from the last "http" link onward
            for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\\–《》()…【】|「」| ':
                text = text.replace(ch, ' ')
            ## text length
            words = text.split(' ')
            post['主贴长度'] = len(words)
            ## text sentiment
            # post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
            emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
            post['emotion'] = emo.loc[0, 0]
            post['emotion_sub'] = emo.loc[1, 0]
            ## word frequencies
            ## frequency stat 1: maximum word frequency
            ## frequency stat 2: share of words appearing at least twice in the body
            items = getText_count_eng(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
            ## frequency stat 3: count of all-caps words
            post['大写词频'] = getText_count_U(text)
        elif is_chinese(tmp):  ## main post is in Chinese
            post['语言'] = 1
            text = post['发表内容'].iloc[0]
            text = text[0:text.rfind("http")]
            post['主贴长度'] = len(text)
            post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2
            post['emotion_sub'] = np.NaN
            # post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
            # post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]
            ## word frequencies
            ## frequency stat 1: maximum word frequency in the body
            ## frequency stat 2: share of words appearing at least twice in the body
            items = getText_count_ch(text)
            if items.shape == (0, 0):
                post['最大词频数'] = 0
                post['重复词汇占比'] = 0
            else:
                post['最大词频数'] = max(items[1])
                post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
            ## frequency stat 3: count of all-caps words (not applicable to Chinese)
            post['大写词频'] = np.NaN
        else:
            post['语言'] = np.NaN
            post['主贴长度'] = np.NaN
            post['emotion'] = np.NaN
            post['emotion_sub'] = np.NaN
            post['最大词频数'] = np.NaN
            post['重复词汇占比'] = np.NaN
            post['大写词频'] = np.NaN
        # ## 4.2 text along the propagation chain (unused features)
        # sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']])
        # sub_post['语言'] = np.NaN
        # sub_post['文本长度'] = np.NaN
        # sub_post['http'] = np.NaN
        # sub_post['at'] = np.NaN
        # sub_post['tag'] = np.NaN
        # sub_post['emotion'] = np.NaN
        # sub_post['emotion_sub'] = np.NaN
        # sub_post['diffdate'] = np.NaN
        #
        # for k in range(sub_post.shape[0]):
        #     ## counts of special tokens in the text (http, @, #)
        #     sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
        #     sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
        #     sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
        #
        #     ## time difference to the main post
        #     d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S")
        #     base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S")
        #     sub_post['diffdate'].iloc[k] = (d1 - base).days
        #
        #     ## detect language
        #     tmp = sub_post['发表内容'].iloc[k]
        #     for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
        #         tmp = tmp.replace(ch, '')
        #
        #     if is_eng(tmp):  ## English content
        #         sub_post['语言'].iloc[k] = 0
        #         ## text length
        #         text = sub_post['发表内容'].iloc[k]
        #         text = text[0:text.rfind("http")]
        #         for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
        #             text = text.replace(ch, ' ')
        #         words = text.split(' ')
        #         sub_post['文本长度'].iloc[k] = len(words)
        #         ## sentiment
        #         sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
        #         sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0]
        #         sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0]
        #     elif is_chinese(tmp):  ## Chinese content
        #         sub_post['语言'].iloc[k] = 1
        #         ## text length
        #         text = sub_post['发表内容'].iloc[k]
        #         text = text[0:text.rfind("http")]
        #         sub_post['文本长度'].iloc[k] = len(text)
        #         ## sentiment
        #         sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2
        #         sub_post['emotion_sub'].iloc[k] = np.NaN
        #     else:
        #         sub_post['语言'].iloc[k] = np.NaN
        #         sub_post['文本长度'].iloc[k] = np.NaN
        #         sub_post['emotion'].iloc[k] = np.NaN
        #         sub_post['emotion_sub'].iloc[k] = np.NaN
        #
        # if sub_post.shape[0] == 0:
        #     post['有无传播内容'] = 0
        # else:
        #     post['有无传播内容'] = 1
        #
        # post['传播链语言均值'] = sub_post['语言'].mean()
        # post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
        # post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
        # ## emotion_sub: mean over non-missing values
        # post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
        # post['传播链贴文http均值'] = sub_post['http'].mean()
        # post['传播链贴文at均值'] = sub_post['at'].mean()
        # post['传播链贴文tag均值'] = sub_post['tag'].mean()
        # post['diffdate均值'] = sub_post['diffdate'].mean()
        ## Part 3: user information
        ## the posting user
        post = pd.merge(post, data_user, how='left', on='用户名')
        ## users along the propagation chain
        sub_user = pd.DataFrame(data[data['传播层级'] != 1][['用户名']])
        sub_user = pd.merge(sub_user, data_user, how='left', on='用户名')
        sub_user = sub_user.dropna()
        post['nickName均值'] = sub_user['nickName'].mean()
        post['fansCount均值'] = sub_user['fansCount'].mean()
        post['likeCount均值'] = sub_user['likeCount'].mean()
        post['postCount均值'] = sub_user['postCount'].mean()
        post['otherInfo均值'] = sub_user['otherInfo'].mean()

        postset = pd.concat([postset, post]).reset_index(drop=True)

    postset = postset.fillna(0)
    postset['emotion_degree'] = abs(postset['emotion'])
    return postset
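# Hedged sketch of post_related's inputs (illustrative values, not real data): df needs the
# ten propagation-chain columns in the order asserted inside the function, one row per
# post/repost; data_user is the output of pre_user (demo_users is the hypothetical frame
# from the pre_user sketch above).
# chain = pd.DataFrame(
#     [[1, 'p1', None, 'p1', '@alice', 'u1', 'Breaking news http://t.co/x', '2021-01-01 10:00:00', 3, 'url1'],
#      [2, 'p2', 'p1', 'p1', '@bob', 'u2', 'RT @alice Breaking news', '2021-01-01 11:00:00', 0, 'url2']],
#     columns=['层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', 'url'])
# features = post_related(chain, pre_user(demo_users), logging=None)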
def predict_news(userData, postChain, logging):
    data_po = pd.DataFrame(postChain).replace('', np.nan)
    data_po.columns = ['id', '层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', 'url', 'topicId']
    data_po = data_po[['层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', 'url']]
    if not userData:
        columns = ['topicId', 'id', 'accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'account_url', 'otherInfo']
        data_user = pd.DataFrame(columns=columns)
    else:
        data_user = pd.DataFrame(userData).replace('', np.nan)
        data_user.columns = ['topicId', 'id', 'accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'account_url', 'otherInfo']
        data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'account_url', 'otherInfo']]
    data_user = pre_user(data_user)
    # data_user = dataframe[@XHNews, 1, 878, 1178, 938, 1]
    # data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
    postset_po = post_related(data_po, data_user, logging)  ## positive-sample feature table
    features = postset_po[[
        # 'shareCount',
        'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
        '主贴http', '主贴at', '主贴tag',
        '主贴长度', 'emotion', 'emotion_degree',
        '最大词频数', '重复词汇占比',  # (differs strongly between Chinese and English)
        # '有无传播内容',
        'fansCount', 'likeCount', 'postCount',
        # 'sub_shareCount',
        'fansCount均值', 'postCount均值', 'otherInfo均值'
    ]]
    clf = joblib.load(par_dir + '/model/fake_news_model.pkl')
    clf_predict = clf.predict(features)
    res = pd.DataFrame(clf_predict)
    res.columns = ['假新闻预测结果']
    res['recognitionResult'] = res['假新闻预测结果'].apply(lambda x: '假新闻' if x == 1 else '真新闻')
    result = pd.concat([postset_po, res], axis=1)
    return result
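# Hedged call sketch (assumed input shapes, matching the column lists above): userData is a
# list of 9-field account rows and postChain a list of 12-field propagation-chain rows, e.g.
# as returned by a MySQL fetch; the trained model must exist at <par_dir>/model/fake_news_model.pkl.
# userData = [(1209, 1, 'XHNews', 'Xinhua News', 878, 1178, 938, 'https://twitter.com/XHNews', None)]
# postChain = [(1, 1, 'p1', None, 'p1', '@XHNews', 'u1', 'Breaking news http://t.co/x',
#               '2021-01-01 10:00:00', 3, 'url1', 1209)]
# result = predict_news(userData, postChain, logging=None)
# print(result[['帖子id', 'recognitionResult']])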
if __name__ == "__main__":
    print(par_dir)
    # user = {
    #     "topicId": 1209,
    #     "host": "172.26.28.30",
    #     "user": "crawl",
    #     "passwd": "crawl123",
    #     "db": "test",
    #     "port": 3306,
    #     "table": "TwitterAccount"
    # }
    # userData = tool.mysqlData(user, "")
    # # logging.info("Account data fetched!")
    # # propagation-chain data
    # # post = raw_data["metadata"]["admin"]["Twitter_chain"]
    # post = {
    #     "topicId": 1209,
    #     "host": "172.26.28.30",
    #     "user": "crawl",
    #     "passwd": "crawl123",
    #     "db": "test",
    #     "port": 3306,
    #     "table": "Twitter_chain"
    # }
    # postChain = tool.mysqlData(post, "")
    # # logging.info("Propagation-chain data fetched!")
    # predict_news(userData, postChain, "")