commit 574071d287
82 changed files with 4975 additions and 0 deletions

Additions per file (BIN = binary file):

 13  .idea/asr.iml
  4  .idea/misc.xml
  8  .idea/modules.xml
823  .idea/workspace.xml
 23  config.ini
415  inputdata/eg.py
BIN  inputdata/fake_news_model.pkl
BIN  inputdata/test.xlsx
BIN  inputdata/test_1220.xlsx
 45  inputdata/to_mysql.py
BIN  inputdata/假新闻数据输入/Twitter_Account.xlsx
BIN  inputdata/假新闻数据输入/test.xlsx
BIN  inputdata/假新闻数据输入/传播分析1209.xlsx
BIN  inputdata/假新闻数据输入/传播分析1220.xlsx
BIN  inputdata/假新闻数据输入/传播分析test.xlsx
BIN  inputdata/假新闻数据输入/用户test.xlsx
433  inputdata/假新闻识别@20230918.py
BIN  log_util/__pycache__/set_logger.cpython-36.pyc
BIN  log_util/__pycache__/set_logger.cpython-38.pyc
 33  log_util/set_logger.py
  0  logs/results.log
 18  manage.py
 35  src.py
  1  start.sh
  1  stop_uwsgi.sh
103  test.py
  0  text_analysis/__init__.py
BIN  text_analysis/__pycache__/__init__.cpython-36.pyc
BIN  text_analysis/__pycache__/__init__.cpython-38.pyc
BIN  text_analysis/__pycache__/cusException.cpython-38.pyc
BIN  text_analysis/__pycache__/read_config.cpython-38.pyc
BIN  text_analysis/__pycache__/settings.cpython-36.pyc
BIN  text_analysis/__pycache__/settings.cpython-38.pyc
BIN  text_analysis/__pycache__/urls.cpython-36.pyc
BIN  text_analysis/__pycache__/urls.cpython-38.pyc
BIN  text_analysis/__pycache__/views.cpython-36.pyc
BIN  text_analysis/__pycache__/views.cpython-38.pyc
BIN  text_analysis/__pycache__/wsgi.cpython-36.pyc
BIN  text_analysis/__pycache__/wsgi.cpython-38.pyc
108  text_analysis/bak/views.py_0226
115  text_analysis/bak/views.py_0607
117  text_analysis/bak/views_20240807.py
 10  text_analysis/cusException.py
  9  text_analysis/linshi.py
BIN  text_analysis/model/bot_user.pkl
BIN  text_analysis/model/fake_news_model.pkl
 10  text_analysis/read_config.py
 14  text_analysis/request.py
148  text_analysis/settings.py
 90  text_analysis/src.py
BIN  text_analysis/tools/__pycache__/cusException.cpython-36.pyc
BIN  text_analysis/tools/__pycache__/mysql_helper.cpython-36.pyc
BIN  text_analysis/tools/__pycache__/pred.cpython-38.pyc
BIN  text_analysis/tools/__pycache__/process.cpython-36.pyc
BIN  text_analysis/tools/__pycache__/to_kafka.cpython-36.pyc
BIN  text_analysis/tools/__pycache__/to_kafka.cpython-38.pyc
BIN  text_analysis/tools/__pycache__/tool.cpython-36.pyc
BIN  text_analysis/tools/__pycache__/tool.cpython-38.pyc
BIN  text_analysis/tools/__pycache__/tools.cpython-36.pyc
456  text_analysis/tools/bak/pred.py
220  text_analysis/tools/bak/tool.py
 25  text_analysis/tools/cusException.py
 67  text_analysis/tools/kakfa_util.py
338  text_analysis/tools/mysql_helper.py
456  text_analysis/tools/pred.py
 51  text_analysis/tools/process.py
171  text_analysis/tools/seleniumTest.py
 25  text_analysis/tools/to_kafka.py
233  text_analysis/tools/tool.py
  1  text_analysis/tools/关系链数据.txt
  1  text_analysis/tools/账号数据.txt
 13  text_analysis/urls.py
158  text_analysis/views.py
 16  text_analysis/wsgi.py
 83  txt/fakeNew.txt
  1  txt/关系链数据.txt
BIN  txt/技术部分初稿@20230302.docx
  3  txt/环境要求.txt
  1  txt/账号数据.txt
  8  uwsgi.ini
 38  wsgi.log
 34  wsgi.py
.idea/asr.iml
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/text_analysis/tools" isTestSource="false" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.8.16 (D:\LH_program\Anaconda3\envs\python38_env\python.exe)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
.idea/misc.xml
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8.16 (D:\LH_program\Anaconda3\envs\python38_env\python.exe)" project-jdk-type="Python SDK" />
</project>
.idea/modules.xml
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/asr.iml" filepath="$PROJECT_DIR$/.idea/asr.iml" />
    </modules>
  </component>
</project>
.idea/workspace.xml
@@ -0,0 +1,823 @@
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ChangeListManager"> |
|||
<list default="true" id="26e841a3-8bef-4d1d-bf9a-d6d27e32457a" name="Default" comment="" /> |
|||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> |
|||
<option name="TRACKING_ENABLED" value="true" /> |
|||
<option name="SHOW_DIALOG" value="false" /> |
|||
<option name="HIGHLIGHT_CONFLICTS" value="true" /> |
|||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" /> |
|||
<option name="LAST_RESOLUTION" value="IGNORE" /> |
|||
</component> |
|||
<component name="ExecutionTargetManager" SELECTED_TARGET="default_target" /> |
|||
<component name="FileEditorManager"> |
|||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="450"> |
|||
<file leaf-file-name="eg.py" pinned="false" current-in-tab="false"> |
|||
<entry file="file://$PROJECT_DIR$/inputdata/eg.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="423"> |
|||
<caret line="282" column="35" lean-forward="true" selection-start-line="282" selection-start-column="35" selection-end-line="282" selection-end-column="35" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
</file> |
|||
<file leaf-file-name="pred.py" pinned="false" current-in-tab="false"> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/pred.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="309"> |
|||
<caret line="127" column="0" lean-forward="true" selection-start-line="127" selection-start-column="0" selection-end-line="127" selection-end-column="0" /> |
|||
<folding> |
|||
<element signature="e#13#32#0" expanded="true" /> |
|||
</folding> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
</file> |
|||
<file leaf-file-name="views.py" pinned="false" current-in-tab="false"> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="399"> |
|||
<caret line="69" column="44" lean-forward="false" selection-start-line="69" selection-start-column="36" selection-end-line="69" selection-end-column="44" /> |
|||
<folding> |
|||
<element signature="e#13#27#0" expanded="true" /> |
|||
</folding> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
</file> |
|||
<file leaf-file-name="to_kafka.py" pinned="false" current-in-tab="true"> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="513"> |
|||
<caret line="23" column="0" lean-forward="true" selection-start-line="23" selection-start-column="0" selection-end-line="23" selection-end-column="0" /> |
|||
<folding> |
|||
<element signature="e#13#29#0" expanded="true" /> |
|||
</folding> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
</file> |
|||
<file leaf-file-name="tool.py" pinned="false" current-in-tab="false"> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/tool.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="108"> |
|||
<caret line="8" column="23" lean-forward="false" selection-start-line="8" selection-start-column="14" selection-end-line="8" selection-end-column="23" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
</file> |
|||
</leaf> |
|||
</component> |
|||
<component name="FileTemplateManagerImpl"> |
|||
<option name="RECENT_TEMPLATES"> |
|||
<list> |
|||
<option value="Python Script" /> |
|||
</list> |
|||
</option> |
|||
</component> |
|||
<component name="FindInProjectRecents"> |
|||
<findStrings> |
|||
<find>KafkaClient</find> |
|||
<find>open</find> |
|||
<find>layer</find> |
|||
<find>post</find> |
|||
<find>is_eng</find> |
|||
<find>getText_count_eng</find> |
|||
<find>fansCount</find> |
|||
<find>postset</find> |
|||
<find>columns</find> |
|||
<find>diffdate均值</find> |
|||
<find>sub_shareCount</find> |
|||
<find>pre_user</find> |
|||
<find>post_related</find> |
|||
</findStrings> |
|||
</component> |
|||
<component name="IdeDocumentHistory"> |
|||
<option name="CHANGED_PATHS"> |
|||
<list> |
|||
<option value="$PROJECT_DIR$/src.py" /> |
|||
<option value="$PROJECT_DIR$/test.py" /> |
|||
<option value="$PROJECT_DIR$/text_analysis/src.py" /> |
|||
<option value="$PROJECT_DIR$/text_analysis/linshi.py" /> |
|||
<option value="$PROJECT_DIR$/uwsgi.ini" /> |
|||
<option value="$PROJECT_DIR$/start.sh" /> |
|||
<option value="$PROJECT_DIR$/stop_uwsgi.sh" /> |
|||
<option value="$PROJECT_DIR$/wsgi.py" /> |
|||
<option value="$PROJECT_DIR$/inputdata/假新闻识别@20230918.py" /> |
|||
<option value="$PROJECT_DIR$/../robotIdentificationTopic/text_analysis/linshi.py" /> |
|||
<option value="$PROJECT_DIR$/text_analysis/urls.py" /> |
|||
<option value="$PROJECT_DIR$/manage.py" /> |
|||
<option value="$PROJECT_DIR$/text_analysis/tools/tool.py" /> |
|||
<option value="$PROJECT_DIR$/linshi.py" /> |
|||
<option value="$PROJECT_DIR$/inputdata/to_mysql.py" /> |
|||
<option value="$PROJECT_DIR$/inputdata/eg.py" /> |
|||
<option value="$PROJECT_DIR$/text_analysis/tools/pred.py" /> |
|||
<option value="$PROJECT_DIR$/text_analysis/tools/to_kafka.py" /> |
|||
<option value="$PROJECT_DIR$/text_analysis/views.py" /> |
|||
</list> |
|||
</option> |
|||
</component> |
|||
<component name="ProjectFrameBounds"> |
|||
<option name="x" value="-11" /> |
|||
<option name="y" value="-11" /> |
|||
<option name="width" value="1942" /> |
|||
<option name="height" value="1042" /> |
|||
</component> |
|||
<component name="ProjectView"> |
|||
<navigator currentView="ProjectPane" proportions="" version="1"> |
|||
<flattenPackages /> |
|||
<showMembers /> |
|||
<showModules /> |
|||
<showLibraryContents /> |
|||
<hideEmptyPackages /> |
|||
<abbreviatePackageNames /> |
|||
<autoscrollToSource /> |
|||
<autoscrollFromSource /> |
|||
<sortByType /> |
|||
<manualOrder /> |
|||
<foldersAlwaysOnTop value="true" /> |
|||
</navigator> |
|||
<panes> |
|||
<pane id="Scratches" /> |
|||
<pane id="Scope" /> |
|||
<pane id="ProjectPane"> |
|||
<subPane> |
|||
<PATH> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
</PATH> |
|||
<PATH> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="text_analysis" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
</PATH> |
|||
<PATH> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="text_analysis" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="tools" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
</PATH> |
|||
<PATH> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="fakeNewIdentification" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
<PATH_ELEMENT> |
|||
<option name="myItemId" value="inputdata" /> |
|||
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" /> |
|||
</PATH_ELEMENT> |
|||
</PATH> |
|||
</subPane> |
|||
</pane> |
|||
</panes> |
|||
</component> |
|||
<component name="PropertiesComponent"> |
|||
<property name="last_opened_file_path" value="$PROJECT_DIR$/../chatGpt" /> |
|||
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" /> |
|||
</component> |
|||
<component name="PyDebuggerOptionsProvider"> |
|||
<option name="mySupportQtDebugging" value="false" /> |
|||
</component> |
|||
<component name="RunDashboard"> |
|||
<option name="ruleStates"> |
|||
<list> |
|||
<RuleState> |
|||
<option name="name" value="ConfigurationTypeDashboardGroupingRule" /> |
|||
</RuleState> |
|||
<RuleState> |
|||
<option name="name" value="StatusDashboardGroupingRule" /> |
|||
</RuleState> |
|||
</list> |
|||
</option> |
|||
</component> |
|||
<component name="RunManager" selected="Python.to_kafka"> |
|||
<configuration default="false" name="假新闻识别@20230918" type="PythonConfigurationType" factoryName="Python" temporary="true"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs> |
|||
<env name="PYTHONUNBUFFERED" value="1" /> |
|||
</envs> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/inputdata" /> |
|||
<option name="IS_MODULE_SDK" value="true" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/inputdata/假新闻识别@20230918.py" /> |
|||
<option name="PARAMETERS" value="" /> |
|||
<option name="SHOW_COMMAND_LINE" value="false" /> |
|||
<option name="EMULATE_TERMINAL" value="false" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="false" name="linshi (1)" type="PythonConfigurationType" factoryName="Python" temporary="true"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs> |
|||
<env name="PYTHONUNBUFFERED" value="1" /> |
|||
</envs> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" /> |
|||
<option name="IS_MODULE_SDK" value="true" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/linshi.py" /> |
|||
<option name="PARAMETERS" value="" /> |
|||
<option name="SHOW_COMMAND_LINE" value="false" /> |
|||
<option name="EMULATE_TERMINAL" value="false" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="false" name="eg" type="PythonConfigurationType" factoryName="Python" temporary="true"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs> |
|||
<env name="PYTHONUNBUFFERED" value="1" /> |
|||
</envs> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/inputdata" /> |
|||
<option name="IS_MODULE_SDK" value="true" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/inputdata/eg.py" /> |
|||
<option name="PARAMETERS" value="" /> |
|||
<option name="SHOW_COMMAND_LINE" value="false" /> |
|||
<option name="EMULATE_TERMINAL" value="false" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="false" name="pred" type="PythonConfigurationType" factoryName="Python" temporary="true"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs> |
|||
<env name="PYTHONUNBUFFERED" value="1" /> |
|||
</envs> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/text_analysis/tools" /> |
|||
<option name="IS_MODULE_SDK" value="true" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/text_analysis/tools/pred.py" /> |
|||
<option name="PARAMETERS" value="" /> |
|||
<option name="SHOW_COMMAND_LINE" value="false" /> |
|||
<option name="EMULATE_TERMINAL" value="false" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="false" name="to_kafka" type="PythonConfigurationType" factoryName="Python" temporary="true"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs> |
|||
<env name="PYTHONUNBUFFERED" value="1" /> |
|||
</envs> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/text_analysis/tools" /> |
|||
<option name="IS_MODULE_SDK" value="true" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/text_analysis/tools/to_kafka.py" /> |
|||
<option name="PARAMETERS" value="" /> |
|||
<option name="SHOW_COMMAND_LINE" value="false" /> |
|||
<option name="EMULATE_TERMINAL" value="false" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="true" type="PythonConfigurationType" factoryName="Python"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs> |
|||
<env name="PYTHONUNBUFFERED" value="1" /> |
|||
</envs> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="" /> |
|||
<option name="IS_MODULE_SDK" value="false" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="SCRIPT_NAME" value="" /> |
|||
<option name="PARAMETERS" value="" /> |
|||
<option name="SHOW_COMMAND_LINE" value="false" /> |
|||
<option name="EMULATE_TERMINAL" value="false" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="true" type="Tox" factoryName="Tox"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs /> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="" /> |
|||
<option name="IS_MODULE_SDK" value="false" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="true" type="tests" factoryName="Doctests"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs /> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="" /> |
|||
<option name="IS_MODULE_SDK" value="false" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="SCRIPT_NAME" value="" /> |
|||
<option name="CLASS_NAME" value="" /> |
|||
<option name="METHOD_NAME" value="" /> |
|||
<option name="FOLDER_NAME" value="" /> |
|||
<option name="TEST_TYPE" value="TEST_SCRIPT" /> |
|||
<option name="PATTERN" value="" /> |
|||
<option name="USE_PATTERN" value="false" /> |
|||
<method /> |
|||
</configuration> |
|||
<configuration default="true" type="tests" factoryName="Unittests"> |
|||
<option name="INTERPRETER_OPTIONS" value="" /> |
|||
<option name="PARENT_ENVS" value="true" /> |
|||
<envs /> |
|||
<option name="SDK_HOME" value="" /> |
|||
<option name="WORKING_DIRECTORY" value="" /> |
|||
<option name="IS_MODULE_SDK" value="false" /> |
|||
<option name="ADD_CONTENT_ROOTS" value="true" /> |
|||
<option name="ADD_SOURCE_ROOTS" value="true" /> |
|||
<module name="asr" /> |
|||
<option name="_new_additionalArguments" value="""" /> |
|||
<option name="_new_target" value=""."" /> |
|||
<option name="_new_targetType" value=""PATH"" /> |
|||
<method /> |
|||
</configuration> |
|||
<list size="5"> |
|||
<item index="0" class="java.lang.String" itemvalue="Python.假新闻识别@20230918" /> |
|||
<item index="1" class="java.lang.String" itemvalue="Python.linshi (1)" /> |
|||
<item index="2" class="java.lang.String" itemvalue="Python.eg" /> |
|||
<item index="3" class="java.lang.String" itemvalue="Python.pred" /> |
|||
<item index="4" class="java.lang.String" itemvalue="Python.to_kafka" /> |
|||
</list> |
|||
<recent_temporary> |
|||
<list size="5"> |
|||
<item index="0" class="java.lang.String" itemvalue="Python.to_kafka" /> |
|||
<item index="1" class="java.lang.String" itemvalue="Python.pred" /> |
|||
<item index="2" class="java.lang.String" itemvalue="Python.eg" /> |
|||
<item index="3" class="java.lang.String" itemvalue="Python.假新闻识别@20230918" /> |
|||
<item index="4" class="java.lang.String" itemvalue="Python.linshi (1)" /> |
|||
</list> |
|||
</recent_temporary> |
|||
</component> |
|||
<component name="ShelveChangesManager" show_recycled="false"> |
|||
<option name="remove_strategy" value="false" /> |
|||
</component> |
|||
<component name="TaskManager"> |
|||
<task active="true" id="Default" summary="Default task"> |
|||
<changelist id="26e841a3-8bef-4d1d-bf9a-d6d27e32457a" name="Default" comment="" /> |
|||
<created>1692600024256</created> |
|||
<option name="number" value="Default" /> |
|||
<option name="presentableId" value="Default" /> |
|||
<updated>1692600024256</updated> |
|||
</task> |
|||
<servers /> |
|||
</component> |
|||
<component name="ToolWindowManager"> |
|||
<frame x="-11" y="-11" width="1942" height="1042" extended-state="7" /> |
|||
<editor active="true" /> |
|||
<layout> |
|||
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.06614583" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" /> |
|||
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" /> |
|||
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.25711036" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="true" content_ui="tabs" /> |
|||
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.17633675" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Data View" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4515625" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" /> |
|||
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> |
|||
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" /> |
|||
</layout> |
|||
</component> |
|||
<component name="VcsContentAnnotationSettings"> |
|||
<option name="myLimit" value="2678400000" /> |
|||
</component> |
|||
<component name="XDebuggerManager"> |
|||
<breakpoint-manager> |
|||
<breakpoints> |
|||
<line-breakpoint enabled="true" suspend="THREAD" type="python-line"> |
|||
<url>file://$PROJECT_DIR$/inputdata/假新闻识别@20230918.py</url> |
|||
<line>190</line> |
|||
<option name="timeStamp" value="33" /> |
|||
</line-breakpoint> |
|||
</breakpoints> |
|||
<option name="time" value="43" /> |
|||
</breakpoint-manager> |
|||
<watches-manager /> |
|||
</component> |
|||
<component name="editorHistoryManager"> |
|||
<entry file="file://$PROJECT_DIR$/test.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="3240"> |
|||
<caret line="93" column="18" lean-forward="false" selection-start-line="93" selection-start-column="14" selection-end-line="93" selection-end-column="18" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="1692"> |
|||
<caret line="54" column="0" lean-forward="false" selection-start-line="54" selection-start-column="0" selection-end-line="54" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/manage.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="324"> |
|||
<caret line="13" column="19" lean-forward="false" selection-start-line="13" selection-start-column="19" selection-end-line="13" selection-end-column="19" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="216"> |
|||
<caret line="8" column="62" lean-forward="false" selection-start-line="8" selection-start-column="62" selection-end-line="8" selection-end-column="62" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/wsgi.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="396"> |
|||
<caret line="14" column="0" lean-forward="false" selection-start-line="14" selection-start-column="0" selection-end-line="20" selection-end-column="9" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="4464"> |
|||
<caret line="125" column="51" lean-forward="false" selection-start-line="125" selection-start-column="44" selection-end-line="125" selection-end-column="51" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/src.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="1152"> |
|||
<caret line="32" column="0" lean-forward="false" selection-start-line="32" selection-start-column="0" selection-end-line="32" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/test.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="3240"> |
|||
<caret line="93" column="18" lean-forward="false" selection-start-line="93" selection-start-column="14" selection-end-line="93" selection-end-column="18" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="1692"> |
|||
<caret line="54" column="0" lean-forward="false" selection-start-line="54" selection-start-column="0" selection-end-line="54" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/manage.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="324"> |
|||
<caret line="13" column="19" lean-forward="false" selection-start-line="13" selection-start-column="19" selection-end-line="13" selection-end-column="19" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="216"> |
|||
<caret line="8" column="62" lean-forward="false" selection-start-line="8" selection-start-column="62" selection-end-line="8" selection-end-column="62" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/wsgi.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="396"> |
|||
<caret line="14" column="0" lean-forward="false" selection-start-line="14" selection-start-column="0" selection-end-line="20" selection-end-column="9" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="2916"> |
|||
<caret line="82" column="33" lean-forward="true" selection-start-line="82" selection-start-column="33" selection-end-line="82" selection-end-column="65" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/src.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="792"> |
|||
<caret line="22" column="14" lean-forward="false" selection-start-line="22" selection-start-column="14" selection-end-line="22" selection-end-column="14" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="1332"> |
|||
<caret line="38" column="4" lean-forward="true" selection-start-line="38" selection-start-column="4" selection-end-line="38" selection-end-column="4" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="612"> |
|||
<caret line="24" column="26" lean-forward="true" selection-start-line="24" selection-start-column="26" selection-end-line="24" selection-end-column="26" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="216"> |
|||
<caret line="8" column="62" lean-forward="true" selection-start-line="8" selection-start-column="62" selection-end-line="8" selection-end-column="62" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/wsgi.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="396"> |
|||
<caret line="14" column="0" lean-forward="false" selection-start-line="14" selection-start-column="0" selection-end-line="20" selection-end-column="9" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/tool.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="360"> |
|||
<caret line="10" column="0" lean-forward="false" selection-start-line="10" selection-start-column="0" selection-end-line="10" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/manage.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="324"> |
|||
<caret line="13" column="19" lean-forward="true" selection-start-line="13" selection-start-column="19" selection-end-line="13" selection-end-column="19" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://D:/LH_program/Anaconda3/envs/python3.6test/Lib/site-packages/pandas/tests/reshape/merge/test_merge_asof.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="144"> |
|||
<caret line="13" column="8" lean-forward="false" selection-start-line="13" selection-start-column="8" selection-end-line="13" selection-end-column="8" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../../../2022/空天院高分项目/Project_kongtianyuan/text_analysis/tools/tool.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="108"> |
|||
<caret line="3" column="10" lean-forward="true" selection-start-line="3" selection-start-column="10" selection-end-line="4" selection-end-column="38" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../../../2022/空天院高分项目/Project_kongtianyuan/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="288"> |
|||
<caret line="107" column="21" lean-forward="false" selection-start-line="107" selection-start-column="12" selection-end-line="107" selection-end-column="21" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../../../2022/Project_KG_Content/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="252"> |
|||
<caret line="49" column="0" lean-forward="false" selection-start-line="49" selection-start-column="0" selection-end-line="51" selection-end-column="37" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../mySql/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="108"> |
|||
<caret line="90" column="90" lean-forward="true" selection-start-line="90" selection-start-column="90" selection-end-line="90" selection-end-column="90" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/test.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="723"> |
|||
<caret line="93" column="18" lean-forward="false" selection-start-line="93" selection-start-column="14" selection-end-line="93" selection-end-column="18" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/src.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="554"> |
|||
<caret line="32" column="0" lean-forward="false" selection-start-line="32" selection-start-column="0" selection-end-line="32" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/linshi.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="3528"> |
|||
<caret line="100" column="0" lean-forward="false" selection-start-line="100" selection-start-column="0" selection-end-line="100" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/src.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="1656"> |
|||
<caret line="46" column="13" lean-forward="false" selection-start-line="46" selection-start-column="4" selection-end-line="46" selection-end-column="13" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/start.sh"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="0"> |
|||
<caret line="0" column="30" lean-forward="false" selection-start-line="0" selection-start-column="30" selection-end-line="0" selection-end-column="30" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/stop_uwsgi.sh"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="0"> |
|||
<caret line="0" column="12" lean-forward="false" selection-start-line="0" selection-start-column="12" selection-end-line="0" selection-end-column="12" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/uwsgi.ini"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="72"> |
|||
<caret line="2" column="36" lean-forward="true" selection-start-line="2" selection-start-column="36" selection-end-line="2" selection-end-column="36" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/wsgi.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="210"> |
|||
<caret line="12" column="39" lean-forward="false" selection-start-line="12" selection-start-column="39" selection-end-line="12" selection-end-column="39" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://D:/LH_program/Anaconda3/envs/python38_env/Lib/site-packages/pandas/compat/_optional.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="63"> |
|||
<caret line="138" column="23" lean-forward="true" selection-start-line="138" selection-start-column="23" selection-end-line="138" selection-end-column="23" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/linshi.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="-521"> |
|||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/src.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="-432"> |
|||
<caret line="9" column="12" lean-forward="true" selection-start-line="9" selection-start-column="12" selection-end-line="9" selection-end-column="12" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/text_analysis/linshi.py" /> |
|||
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="324"> |
|||
<caret line="10" column="29" lean-forward="false" selection-start-line="10" selection-start-column="0" selection-end-line="11" selection-end-column="0" /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/linshi.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="288"> |
|||
<caret line="9" column="0" lean-forward="false" selection-start-line="9" selection-start-column="0" selection-end-line="9" selection-end-column="0" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="144"> |
|||
<caret line="6" column="32" lean-forward="false" selection-start-line="6" selection-start-column="11" selection-end-line="6" selection-end-column="32" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/manage.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="216"> |
|||
<caret line="10" column="13" lean-forward="false" selection-start-line="10" selection-start-column="13" selection-end-line="10" selection-end-column="13" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/inputdata/to_mysql.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="1512"> |
|||
<caret line="45" column="0" lean-forward="false" selection-start-line="45" selection-start-column="0" selection-end-line="45" selection-end-column="0" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://D:/LH_program/Anaconda3/envs/python38_env/Lib/site-packages/pandas/core/internals/base.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="118"> |
|||
<caret line="68" column="0" lean-forward="false" selection-start-line="68" selection-start-column="0" selection-end-line="68" selection-end-column="0" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/inputdata/假新闻识别@20230918.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="-9209"> |
|||
<caret line="118" column="8" lean-forward="true" selection-start-line="118" selection-start-column="8" selection-end-line="118" selection-end-column="8" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/tool.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="108"> |
|||
<caret line="8" column="23" lean-forward="false" selection-start-line="8" selection-start-column="14" selection-end-line="8" selection-end-column="23" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../../假新闻识别/假新闻识别/假新闻识别@20230918.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="378"> |
|||
<caret line="382" column="0" lean-forward="true" selection-start-line="382" selection-start-column="0" selection-end-line="382" selection-end-column="0" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/inputdata/eg.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="423"> |
|||
<caret line="282" column="35" lean-forward="true" selection-start-line="282" selection-start-column="35" selection-end-line="282" selection-end-column="35" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/pred.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="309"> |
|||
<caret line="127" column="0" lean-forward="true" selection-start-line="127" selection-start-column="0" selection-end-line="127" selection-end-column="0" /> |
|||
<folding> |
|||
<element signature="e#13#32#0" expanded="true" /> |
|||
</folding> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/../chatGpt/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="432"> |
|||
<caret line="67" column="48" lean-forward="false" selection-start-line="67" selection-start-column="40" selection-end-line="67" selection-end-column="48" /> |
|||
<folding /> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/views.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="399"> |
|||
<caret line="69" column="44" lean-forward="false" selection-start-line="69" selection-start-column="36" selection-end-line="69" selection-end-column="44" /> |
|||
<folding> |
|||
<element signature="e#13#27#0" expanded="true" /> |
|||
</folding> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py"> |
|||
<provider selected="true" editor-type-id="text-editor"> |
|||
<state relative-caret-position="513"> |
|||
<caret line="23" column="0" lean-forward="true" selection-start-line="23" selection-start-column="0" selection-end-line="23" selection-end-column="0" /> |
|||
<folding> |
|||
<element signature="e#13#29#0" expanded="true" /> |
|||
</folding> |
|||
</state> |
|||
</provider> |
|||
</entry> |
|||
</component> |
|||
</project> |
config.ini
@@ -0,0 +1,23 @@
[database]
;database host
host=node-01
;port
port=3306
;username
username=root
;password
password=bw@2025
;database name
db=analyze

[zookeeper]
;ZooKeeper hosts
zkhost=node-01:12181,node-02:12181,node-03:12181
;node
node=/analyze

[kafka]
;broker addresses
bootstrap_servers=node-01:19092,node-02:19092,node-03:19092
;topic
topic=produce_analyze
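
For reference, a minimal sketch of how this file could be consumed with Python's standard configparser. The commit ships text_analysis/read_config.py for this purpose, but its contents are not shown in this excerpt, so the function name and return shape below are assumptions rather than the project's actual API.

# Hypothetical reader for config.ini; the real read_config.py may differ.
import configparser

def load_config(path="config.ini"):
    cfg = configparser.ConfigParser()
    cfg.read(path, encoding="utf-8")
    return {
        "mysql": dict(cfg["database"]),                      # host, port, username, password, db
        "zk_hosts": cfg.get("zookeeper", "zkhost"),
        "zk_node": cfg.get("zookeeper", "node"),
        "kafka_servers": cfg.get("kafka", "bootstrap_servers"),
        "kafka_topic": cfg.get("kafka", "topic"),
    }

if __name__ == "__main__":
    print(load_config())
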
inputdata/eg.py
@@ -0,0 +1,415 @@
#coding:utf8
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
from tqdm import tqdm
import datetime
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
import joblib

def pre_user(data_user):
    data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
    data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
    data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
    data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
    data_user = data_user.dropna()
    data_user = data_user.drop_duplicates().reset_index(drop = True)
    data_user['fansCount'] = data_user['fansCount'].astype(int)
    data_user['likeCount'] = data_user['likeCount'].astype(int)
    data_user['postCount'] = data_user['postCount'].astype(int)
    data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
    return data_user

def getText_count_eng(txt):
    """English word frequency count."""
    txt = txt.lower()  # convert all uppercase letters to lowercase
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ':  # replace special symbols in the text with spaces
        txt = txt.replace(ch," ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                counts[word] = counts.get(word,0) + 1  # accumulate the word count
    items = pd.DataFrame(list(counts.items()))
    return items

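A quick, illustrative call (not part of the commit) showing what getText_count_eng returns and how the frequency features computed later in post_related are derived from it; the sample sentence is made up.

# Illustrative usage only.
sample = "Fake news spreads fast, very fast, on social media."
items = getText_count_eng(sample)      # DataFrame: column 0 = word, column 1 = count (stopwords removed)
max_freq = max(items[1])               # maximum word frequency, e.g. 2 for "fast"
repeat_ratio = items[items[1] >= 2].shape[0] / items.shape[0]   # share of words appearing at least twice
print(max_freq, repeat_ratio)
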
def getText_count_ch(txt):
    """Chinese word frequency count."""
    txt = txt.lower()  # convert all uppercase letters to lowercase
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz':  # delete special symbols, digits and Latin letters from the text
        txt = txt.replace(ch,"")
    words = jieba.lcut(txt)
    counts = {}
    for word in words:
        counts[word] = counts.get(word,0) + 1
    items = list(counts.items())
    fin_items = []
    for item in items:
        if len(item[0])>=2:
            fin_items.append(item)
    fin_items = pd.DataFrame(fin_items)
    return fin_items

def getText_count_U(txt):
    """Count fully uppercase English words."""
    for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ':  # replace special symbols in the text with spaces
        txt = txt.replace(ch," ")
    words = txt.split()
    counts = {}
    for word in words:
        if word not in STOPWORDS:
            if word != '\t':
                if word.isupper():  # keep only all-uppercase words
                    counts[word] = counts.get(word,0) + 1  # accumulate the word count
    items = pd.DataFrame(list(counts.items()))  # convert the dict into a DataFrame
    if items.shape == (0,0):
        out = 0
    else:
        out = sum(items[1])
    return out

def is_chinese(strs):
    """Check whether a unicode string contains only Chinese characters / English letters."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
            return False
    return True

def is_eng(strs):
    """Check whether a unicode string contains only English letters."""
    strs = strs.lower()
    for uchar in strs:
        if (uchar < u'\u0061') or (u'\u007a' < uchar):
            return False
    return True

# NOTE: duplicate of pre_user defined above; this second definition is the one that takes effect.
def pre_user(data_user):
    data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
    data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
    data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
    data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
    data_user = data_user.dropna()
    data_user = data_user.drop_duplicates().reset_index(drop = True)
    data_user['fansCount'] = data_user['fansCount'].astype(int)
    data_user['likeCount'] = data_user['likeCount'].astype(int)
    data_user['postCount'] = data_user['postCount'].astype(int)
    data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
    return data_user

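An illustrative check (not part of the commit) of how the two language guards behave: both lowercase the input and reject any character outside a-z or the CJK block, which is why post_related below strips digits and punctuation from the text before calling them.

# Illustrative usage only.
print(is_eng("FakeNews"))       # True: only Latin letters after lowercasing
print(is_eng("Fake News"))      # False: the space falls outside a-z
print(is_chinese("假新闻"))      # True: CJK characters only
print(is_chinese("假新闻2023"))  # False: digits are rejected
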
def post_related(df, data_user):
    # postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
    #                                 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
    #                                 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
    #                                 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容',
    #                                 '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
    #                                 '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差',
    #                                 '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值',
    #                                 '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差'])
    postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id','所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
                                    'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
                                    '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
                                    'emotion', 'emotion_sub', '最大词频数', '重复词汇占比'])

    for post_id in tqdm(df['所属帖子id'].drop_duplicates().reset_index(drop=True)):

        data = df[df['所属帖子id'] == post_id].reset_index(drop=True)

        data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
                        'shareCount', 'url']

        data = data.drop_duplicates()

        post = data[data['传播层级'] == '1'].head(1)

        ### Part 1: news propagation (the post network)
        ## 1. layer / shape / degree
        post['layer'] = int(max(data['传播层级']))
        post['shape'] = data.shape[0] - 1
        post['degree'] = data[data['传播层级'] == '2'].shape[0]

        ## 2. Whole-network measures (post-network metrics)
        ### 2.1 Map each repost-source id to the repost-source user
        tmp_zfyh = pd.merge(data[data['传播层级'] != '1']['转发来源id'].drop_duplicates(),
                            data[data['帖子id'].notnull()][['帖子id', '用户名']],
                            left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
        tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
        data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
        post_edge = data.copy()
        post_edge = data[data['传播层级'] != '1'][['用户名', '转发来源用户名']]
        post_edge.columns = ['source', 'target']
        post_edge['count_all'] = 1
        post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
        # post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False)

        edgeweightset = post_edge[['source', 'target', 'count_all']]
        edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
        for k in range(len(edgeweightset_l)):
            for j in range(edgeweightset.shape[1]):
                edgeweightset_l[k].append(edgeweightset.iloc[k, j])
                # print(i/len(edgeweightset_l))

        if len(edgeweightset_l) == 0:  # no propagation chain
            post['closeness_centrality'] = 1
            post['pagerank'] = 1
        else:
            g = nx.DiGraph()
            g.add_weighted_edges_from(edgeweightset_l)
            centrality = [nx.closeness_centrality(g),
                          nx.pagerank(g)]
            results = []
            nodes = g.nodes()  # list of nodes in the network
            for node in nodes:  # for each node, collect its centrality results as [[node1, scores], [node2, scores], ...]
                results.append([node,
                                centrality[0][node],
                                centrality[1][node]])
            results = pd.DataFrame(results)
            results.columns = ['node', 'closeness_centrality', 'pagerank']

            post['closeness_centrality'] = results[results['node'] == results[
                results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][
                'closeness_centrality'].iloc[0]
            post['pagerank'] = results[results['node'] ==
                                       results[results['closeness_centrality'] == max(results['closeness_centrality'])][
                                           'node'].iloc[0]]['pagerank'].iloc[0]

            # post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
            # post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]

        #——————————hh——————————————
        # feature not used
        # ## 3. Average influence (shareCount) along the propagation chain
        # tmp = 0
        # for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
        #     tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
        # if tmp == 0:
        #     post['sub_shareCount'] = 0
        # else:
        #     post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]

        #————————————————————————


|||
##二、主贴文本 |
|||
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.' |
|||
##文本特殊字符个数(http、@、#) |
|||
post['主贴http'] = post['发表内容'].iloc[0].count('http') |
|||
post['主贴at'] = post['发表内容'].iloc[0].count('@') |
|||
post['主贴tag'] = post['发表内容'].iloc[0].count('#') |
|||
|
|||
##判断语言 |
|||
tmp = post['发表内容'].iloc[0] |
|||
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
tmp = tmp.replace(ch, '') |
|||
|
|||
if is_eng(tmp): ##主贴英文内容 |
|||
|
|||
post['语言'] = 0 |
|||
text = post['发表内容'].iloc[0] |
|||
# text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring' |
|||
text = text[0:text.rfind("http")] |
|||
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
text = text.replace(ch, ' ') |
|||
|
|||
##文本长度 |
|||
words = text.split(' ') |
|||
post['主贴长度'] = len(words) |
|||
|
|||
##文本情感 |
|||
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments) |
|||
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment) |
|||
post['emotion'] = emo.loc[0, 0] |
|||
post['emotion_sub'] = emo.loc[1, 0] |
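# TextBlob's sentiment is a (polarity, subjectivity) pair: 'emotion' is the polarity in
# [-1, 1] and 'emotion_sub' the subjectivity in [0, 1]. The Chinese branch below rescales
# SnowNLP's 0-1 score to the same [-1, 1] range and leaves 'emotion_sub' empty.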
|||
|
|||
##文本词频 |
|||
## 词频统计1:最大词频数 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_eng(text) |
|||
if items.shape == (0, 0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0] |
|||
|
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = getText_count_U(text) |
|||
|
|||
elif is_chinese(tmp): ##主贴中文内容 |
|||
|
|||
post['语言'] = 1 |
|||
|
|||
text = post['发表内容'].iloc[0] |
|||
text = text[0:text.rfind("http")] |
|||
post['主贴长度'] = len(text) |
|||
|
|||
post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2 |
|||
post['emotion_sub'] = np.NaN |
|||
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0] |
|||
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1] |
|||
|
|||
##文本词频 |
|||
## 词频统计1:标题中出现的词,在正文中出现最大词频 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_ch(text) |
|||
if items.shape == (0, 0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0] |
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = np.NaN |
|||
|
|||
else: |
|||
post['语言'] = np.NaN |
|||
post['主贴长度'] = np.NaN |
|||
post['emotion'] = np.NaN |
|||
post['emotion_sub'] = np.NaN |
|||
post['最大词频数'] = np.NaN |
|||
post['重复词汇占比'] = np.NaN |
|||
post['大写词频'] = np.NaN |
|||
|
|||
# ##4.2传播链中的文本 |
|||
# sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']]) |
|||
# sub_post['语言'] = np.NaN |
|||
# sub_post['文本长度'] = np.NaN |
|||
# sub_post['http'] = np.NaN |
|||
# sub_post['at'] = np.NaN |
|||
# sub_post['tag'] = np.NaN |
|||
# sub_post['emotion'] = np.NaN |
|||
# sub_post['emotion_sub'] = np.NaN |
|||
# sub_post['diffdate'] = np.NaN |
|||
# |
|||
# for k in range(sub_post.shape[0]): |
|||
# ##文本特殊字符个数(http、@、#) |
|||
# sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http') |
|||
# sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@') |
|||
# sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#') |
|||
# |
|||
# ##时间差 |
|||
# d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S") |
|||
# base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S") |
|||
# |
|||
# # now = datetime.datetime.now() |
|||
# sub_post['diffdate'].iloc[k] = (d1 - base).days |
|||
# |
|||
# ##判断语言 |
|||
# tmp = sub_post['发表内容'].iloc[k] |
|||
# for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
# tmp = tmp.replace(ch, '') |
|||
# |
|||
# if is_eng(tmp): ##英文内容 |
|||
# |
|||
# sub_post['语言'].iloc[k] = 0 |
|||
# |
|||
# ##文本长度 |
|||
# text = sub_post['发表内容'].iloc[k] |
|||
# # text = "'America is collapsing and it's China's fault' is definitely a change of direction?" |
|||
# text = text[0:text.rfind("http")] |
|||
# for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
# text = text.replace(ch, ' ') |
|||
# words = text.split(' ') |
|||
# sub_post['文本长度'].iloc[k] = len(words) |
|||
# ##情感 |
|||
# sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment) |
|||
# sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0] |
|||
# sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0] |
|||
# |
|||
# elif is_chinese(tmp): ##中文内容 |
|||
# |
|||
# sub_post['语言'].iloc[k] = 1 |
|||
# |
|||
# ##文本长度 |
|||
# text = sub_post['发表内容'].iloc[k] |
|||
# text = text[0:text.rfind("http")] |
|||
# sub_post['文本长度'].iloc[k] = len(text) |
|||
# ##情感 |
|||
# sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2 |
|||
# sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
# |
|||
# else: |
|||
# |
|||
# sub_post['语言'].iloc[k] = np.NaN |
|||
# sub_post['文本长度'].iloc[k] = np.NaN |
|||
# sub_post['emotion'].iloc[k] = np.NaN |
|||
# sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
# |
|||
# if sub_post.shape[0] == 0: |
|||
# post['有无传播内容'] = 0 |
|||
# else: |
|||
# post['有无传播内容'] = 1 |
|||
# |
|||
# post['传播链语言均值'] = sub_post['语言'].mean() |
|||
# post['传播链贴文长度均值'] = sub_post['文本长度'].mean() |
|||
# post['传播链贴文emotion均值'] = sub_post['emotion'].mean() |
|||
# |
|||
# ##emotion_sub取有值的均值 |
|||
# post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean() |
|||
# |
|||
# post['传播链贴文http均值'] = sub_post['http'].mean() |
|||
# |
|||
# post['传播链贴文at均值'] = sub_post['at'].mean() |
|||
# |
|||
# post['传播链贴文tag均值'] = sub_post['tag'].mean() |
|||
# |
|||
# post['diffdate均值'] = sub_post['diffdate'].mean() |
|||
|
|||
##三、用户信息 |
|||
##发帖用户 |
|||
post = pd.merge(post, data_user, how='left', on='用户名') |
|||
|
|||
##传播链用户 |
|||
sub_user = pd.DataFrame(data[data['传播层级'] != '1'][['用户名']]) |
|||
sub_user = pd.merge(sub_user, data_user, how='left', on='用户名') |
|||
sub_user = sub_user.dropna() |
|||
|
|||
post['nickName均值'] = sub_user['nickName'].mean() |
|||
post['fansCount均值'] = sub_user['fansCount'].mean() |
|||
post['likeCount均值'] = sub_user['likeCount'].mean() |
|||
post['postCount均值'] = sub_user['postCount'].mean() |
|||
post['otherInfo均值'] = sub_user['otherInfo'].mean() |
|||
|
|||
postset = pd.concat([postset, post]).reset_index(drop=True) |
|||
|
|||
postset = postset.fillna(0) |
|||
postset['emotion_degree'] = abs(postset['emotion']) |
|||
|
|||
return postset |
|||
|
|||
|
|||
xlsx_path_po = r'假新闻数据输入\传播分析test.xlsx' |
|||
data_po = pd.read_excel(xlsx_path_po, dtype="str") |
|||
data_user = pd.read_excel(r'假新闻数据输入\用户test.xlsx', dtype="str") |
|||
data_user = pre_user(data_user) |
|||
#data_user=dataframe[@XHNews,1,878,1178,938,1] |
|||
#data_user.columns=['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
|
|||
postset_po = post_related(data_po,data_user) ## 正面文件 |
|||
features = postset_po[[ |
|||
#'shareCount', |
|||
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality', |
|||
'主贴http', '主贴at', '主贴tag', |
|||
'主贴长度','emotion', 'emotion_degree', |
|||
'最大词频数', '重复词汇占比',#(中英文差异大) |
|||
#'有无传播内容', |
|||
'fansCount','likeCount', 'postCount', |
|||
#'sub_shareCount', |
|||
'fansCount均值', 'postCount均值', 'otherInfo均值' |
|||
]] |
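# Note: this 19-column selection must match the feature set and order used when
# fake_news_model.pkl was trained (the training script 假新闻识别@20230918.py selects the
# same list); the loaded classifier expects exactly these features.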
|||
|
|||
|
|||
clf = joblib.load(r'fake_news_model.pkl') |
|||
clf_predict = clf.predict(features) |
|||
print(clf_predict) |
|||
res=pd.DataFrame(clf_predict) |
|||
res.columns=['假新闻预测结果'] |
|||
result = pd.concat([postset_po, res], axis=1) |
|||
result.to_excel('test_1209_1.xlsx',index=None) |
@ -0,0 +1,45 @@ |
|||
#coding:utf8 |
|||
import json |
|||
import pymysql |
|||
import traceback |
|||
import pandas as pd |
|||
|
|||
content_db = pymysql.connect(host='172.26.28.30', user='crawl', passwd='crawl123', db='test', port=3306, |
|||
charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor) |
|||
def to_mysql(sql,values): |
|||
content_db.ping(reconnect=True) |
|||
cursor = content_db.cursor() |
|||
cursor.execute(sql,values) |
|||
content_db.commit() |
|||
cursor.close() |
|||
|
|||
|
|||
def write_data_mysql(): |
|||
data=pd.read_excel('假新闻数据输入/test.xlsx',keep_default_na=False) |
|||
try: |
|||
for i in data.index: |
|||
# line_key=list(data.loc[i].keys()) |
|||
line_value=data.loc[i].values |
|||
# line_str=([str(x) for x in line_value]) |
|||
line_str=[] |
|||
for index,x in enumerate(line_value): |
|||
line_str.append(x) |
|||
line_str=[0]+line_str |
|||
sql = "insert into TwitterAccount "+"values ("+ ','.join(['%s'] * len(line_str)) + ")" |
|||
# print(line_str) |
|||
# print(sql) |
|||
values=tuple(line_str) |
|||
# to_mysql(sql,values) |
|||
# content_db.ping(reconnect=True) |
|||
cursor = content_db.cursor() |
|||
cursor.execute(sql, values) |
|||
content_db.commit() |
|||
cursor.close() |
|||
print('第%s条数据写入mysql'%(i+1)) |
|||
except: |
|||
print(traceback.format_exc()) |
|||
content_db.rollback() |
|||
|
|||
write_data_mysql() |
|||
content_db.close() |
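# The leading 0 prepended to each row above is presumably a placeholder for the table's
# auto-increment primary key (an assumption; the TwitterAccount schema is not shown here).
# A batched sketch of the same insert using executemany:
# rows = [tuple([0] + list(data.loc[i].values)) for i in data.index]
# placeholders = ','.join(['%s'] * len(rows[0]))
# with content_db.cursor() as cursor:
#     cursor.executemany("insert into TwitterAccount values (" + placeholders + ")", rows)
#     content_db.commit()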
|||
|
@ -0,0 +1,433 @@ |
|||
# -*- coding: utf-8 -*- |
|||
""" |
|||
Created on Wed Sep 13 18:13:03 2023 |
|||
|
|||
@author: chong |
|||
""" |
|||
|
|||
import pandas as pd |
|||
import numpy as np |
|||
import networkx as nx |
|||
from textblob import TextBlob |
|||
from snownlp import SnowNLP |
|||
from wordcloud import STOPWORDS |
|||
import jieba |
|||
import datetime |
|||
from sklearn.model_selection import train_test_split |
|||
from sklearn.ensemble import RandomForestClassifier |
|||
from sklearn.model_selection import GridSearchCV |
|||
# from sklearn import metrics |
|||
import joblib |
|||
# import matplotlib.pyplot as plt |
|||
# import seaborn as sns |
|||
|
|||
def pre_user(data_user): |
|||
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x) |
|||
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int) |
|||
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int) |
|||
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']] |
|||
data_user = data_user.dropna() |
|||
data_user = data_user.drop_duplicates().reset_index(drop = True) |
|||
data_user['fansCount'] = data_user['fansCount'].astype(int) |
|||
data_user['likeCount'] = data_user['likeCount'].astype(int) |
|||
data_user['postCount'] = data_user['postCount'].astype(int) |
|||
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
return data_user |
|||
|
|||
def getText_count_eng(txt): |
|||
"""英文词频统计""" |
|||
txt = txt.lower() #将所有大写字母变成小写 |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格 |
|||
txt = txt.replace(ch," ") |
|||
words = txt.split() |
|||
counts = {} |
|||
for word in words: |
|||
if word not in STOPWORDS: |
|||
if word != '\t': |
|||
counts[word] = counts.get(word,0) + 1 #统计字数 |
|||
items = pd.DataFrame(list(counts.items())) |
|||
return items |
|||
|
|||
def getText_count_ch(txt): |
|||
"""中文词频统计""" |
|||
txt = txt.lower() #将所有大写字母变成小写 |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz': #将文本中特殊符号数字删除 |
|||
txt = txt.replace(ch,"") |
|||
words = jieba.lcut(txt) |
|||
counts = {} |
|||
for word in words: |
|||
counts[word] = counts.get(word,0) + 1 |
|||
items = list(counts.items()) |
|||
fin_items = [] |
|||
for item in items: |
|||
if len(item[0])>=2: |
|||
fin_items.append(item) |
|||
fin_items = pd.DataFrame(fin_items) |
|||
return fin_items |
|||
|
|||
def getText_count_U(txt): |
|||
"""统计英文大写词频""" |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格 |
|||
txt = txt.replace(ch," ") |
|||
words = txt.split() |
|||
counts = {} |
|||
for word in words: |
|||
if word not in STOPWORDS: |
|||
if word != '/t': |
|||
if word.isupper(): #统计大写 |
|||
counts[word] = counts.get(word,0) + 1 #统计字数 |
|||
items = pd.DataFrame(list(counts.items())) #将字典类型转换成列表类型 |
|||
if items.shape == (0,0): |
|||
out = 0 |
|||
else: |
|||
out = sum(items[1]) |
|||
return out |
|||
|
|||
def is_chinese(strs): |
|||
"""判断一个unicode是否是汉字/英文""" |
|||
strs = strs.lower() |
|||
for uchar in strs: |
|||
if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar): |
|||
return False |
|||
return True |
|||
|
|||
def is_eng(strs): |
|||
"""判断一个unicode是否是英文""" |
|||
strs = strs.lower() |
|||
for uchar in strs: |
|||
if (uchar < u'\u0061') or (u'\u007a' < uchar): |
|||
return False |
|||
return True |
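# Both checks operate on the lower-cased string: is_eng() accepts only the letters a-z,
# and is_chinese() accepts a-z plus CJK ideographs U+4E00-U+9FFF, so any other script,
# digit, space or punctuation makes them return False - which is why the callers strip
# digits and punctuation first. For example, is_eng("Fake news") is False because of the
# space, while is_eng("Fakenews") is True.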
|||
|
|||
def post_related(df,data_user): |
|||
|
|||
postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', |
|||
'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank', |
|||
'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag', |
|||
'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频','有无传播内容', |
|||
'传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差', |
|||
'传播链贴文emotion_sub均值','传播链贴文emotion_sub标准差', |
|||
'传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值', |
|||
'传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差']) |
|||
|
|||
for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop = True): |
|||
|
|||
data = df[df['所属帖子id']==post_id].reset_index(drop = True) |
|||
|
|||
data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', |
|||
'shareCount', 'url'] |
|||
|
|||
data = data.drop_duplicates() |
|||
|
|||
post = data[data['传播层级']=='1'].head(1) |
|||
|
|||
### 一、新闻传播--贴文网络 |
|||
##1.layer/shape/degree |
|||
post['layer'] = int(max(data['传播层级'])) |
|||
post['shape'] = data.shape[0]-1 |
|||
post['degree'] = data[data['传播层级']=='2'].shape[0] |
|||
|
|||
##2.整体网络测度(贴文网络测度) |
|||
###2.1把转发来源id对应到转发来源用户 |
|||
tmp_zfyh = pd.merge(data[data['传播层级']!='1']['转发来源id'].drop_duplicates(), |
|||
data[data['帖子id'].notnull()][['帖子id','用户名']], |
|||
left_on = ['转发来源id'], right_on = ['帖子id'], how = 'left')[['转发来源id','用户名']] |
|||
tmp_zfyh.columns = ['转发来源id','转发来源用户名'] |
|||
data = pd.merge(data, tmp_zfyh, left_on = ['转发来源id'], right_on = ['转发来源id'], how = 'left') |
|||
post_edge = data.copy() |
|||
post_edge = data[data['传播层级']!='1'][['用户名','转发来源用户名']] |
|||
post_edge.columns = ['source','target'] |
|||
# tmp1 = data[(data['帖子id'].notnull())&(data['传播层级']!='1')][['帖子id','转发来源id']] |
|||
# tmp2 = data[data['帖子id'].isnull()][['用户名','转发来源id']] |
|||
# tmp1.columns = ['source','target'] |
|||
# tmp2.columns = ['source','target'] |
|||
# post_edge = pd.concat([tmp1,tmp2]) |
|||
post_edge['count_all'] = 1 |
|||
post_edge = post_edge.groupby(['source','target'])['count_all'].count().reset_index() |
|||
# post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False) |
|||
|
|||
edgeweightset = post_edge[['source','target','count_all']] |
|||
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])] |
|||
for k in range(len(edgeweightset_l)): |
|||
for j in range(edgeweightset.shape[1]): |
|||
edgeweightset_l[k].append(edgeweightset.iloc[k,j]) |
|||
# print(i/len(edgeweightset_l)) |
|||
|
|||
if len(edgeweightset_l)==0: #没有传播链 |
|||
post['closeness_centrality'] = 1 |
|||
post['pagerank'] = 1 |
|||
else: |
|||
g = nx.DiGraph() |
|||
g.add_weighted_edges_from(edgeweightset_l) |
|||
centrality = [nx.closeness_centrality(g), |
|||
nx.pagerank(g)] |
|||
results = [] |
|||
nodes = g.nodes() # 提取网络中节点列表 |
|||
for node in nodes: # 遍历所有节点,提取每个节点度中心性计算结果,并存储为[[节点1,结果],[节点2,结果],...]的形式 |
|||
results.append([node, |
|||
centrality[0][node], |
|||
centrality[1][node]]) |
|||
results = pd.DataFrame(results) |
|||
results.columns = ['node','closeness_centrality','pagerank'] |
|||
|
|||
post['closeness_centrality'] = results[results['node'] == results[results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]]['closeness_centrality'].iloc[0] |
|||
post['pagerank'] = results[results['node'] == results[results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]]['pagerank'].iloc[0] |
|||
|
|||
#post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0] |
|||
#post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0] |
|||
|
|||
##3.传播链中的平均影响力shareCount |
|||
tmp = 0 |
|||
for k in range(data[(data['传播层级']!='1') & (data['帖子id'].notnull())].shape[0]): |
|||
tmp = tmp + int(data[(data['传播层级']!='1') & (data['帖子id'].notnull())].shareCount.iloc[k]) |
|||
if tmp == 0: |
|||
post['sub_shareCount'] = 0 |
|||
else: |
|||
post['sub_shareCount'] = tmp/data[(data['传播层级']!='1') & (data['帖子id'].notnull())].shape[0] |
|||
|
|||
##二、主贴文本 |
|||
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.' |
|||
##文本特殊字符个数(http、@、#) |
|||
post['主贴http'] = post['发表内容'].iloc[0].count('http') |
|||
post['主贴at'] = post['发表内容'].iloc[0].count('@') |
|||
post['主贴tag'] = post['发表内容'].iloc[0].count('#') |
|||
|
|||
##判断语言 |
|||
tmp = post['发表内容'].iloc[0] |
|||
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
tmp = tmp.replace(ch,'') |
|||
|
|||
if is_eng(tmp): ##主贴英文内容 |
|||
|
|||
post['语言'] = 0 |
|||
text = post['发表内容'].iloc[0] |
|||
#text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring' |
|||
text = text[0:text.rfind("http")] |
|||
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
text = text.replace(ch,' ') |
|||
|
|||
##文本长度 |
|||
words = text.split(' ') |
|||
post['主贴长度'] = len(words) |
|||
|
|||
##文本情感 |
|||
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments) |
|||
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment) |
|||
post['emotion'] = emo.loc[0,0] |
|||
post['emotion_sub'] = emo.loc[1,0] |
|||
|
|||
##文本词频 |
|||
## 词频统计1:最大词频数 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_eng(text) |
|||
if items.shape==(0,0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1]>=2].shape[0]/items.shape[0] |
|||
|
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = getText_count_U(text) |
|||
|
|||
elif is_chinese(tmp): ##主贴中文内容 |
|||
|
|||
post['语言'] = 1 |
|||
|
|||
text = post['发表内容'].iloc[0] |
|||
text = text[0:text.rfind("http")] |
|||
post['主贴长度'] = len(text) |
|||
|
|||
post['emotion'] = (SnowNLP(text).sentiments-0.5)*2 |
|||
post['emotion_sub'] = np.NaN |
|||
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0] |
|||
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1] |
|||
|
|||
##文本词频 |
|||
## 词频统计1:标题中出现的词,在正文中出现最大词频 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_ch(text) |
|||
if items.shape==(0,0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1]>=2].shape[0]/items.shape[0] |
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = np.NaN |
|||
|
|||
else: |
|||
post['语言'] = np.NaN |
|||
post['主贴长度'] = np.NaN |
|||
post['emotion'] = np.NaN |
|||
post['emotion_sub'] = np.NaN |
|||
post['最大词频数'] = np.NaN |
|||
post['重复词汇占比'] = np.NaN |
|||
post['大写词频'] = np.NaN |
|||
|
|||
##4.2传播链中的文本 |
|||
sub_post = pd.DataFrame(data[(data['传播层级']!='1')&(data['帖子id'].notnull())][['发表内容','发表时间']]) |
|||
sub_post['语言'] = np.NaN |
|||
sub_post['文本长度'] = np.NaN |
|||
sub_post['http'] = np.NaN |
|||
sub_post['at'] = np.NaN |
|||
sub_post['tag'] = np.NaN |
|||
sub_post['emotion'] = np.NaN |
|||
sub_post['emotion_sub'] = np.NaN |
|||
sub_post['diffdate'] = np.NaN |
|||
|
|||
for k in range(sub_post.shape[0]): |
|||
##文本特殊字符个数(http、@、#) |
|||
sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http') |
|||
sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@') |
|||
sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#') |
|||
|
|||
##时间差 |
|||
d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k],"%Y-%m-%d %H:%M:%S") |
|||
base = datetime.datetime.strptime(post['发表时间'].iloc[0],"%Y-%m-%d %H:%M:%S") |
|||
|
|||
# now = datetime.datetime.now() |
|||
sub_post['diffdate'].iloc[k] = (d1-base).days |
|||
|
|||
##判断语言 |
|||
tmp = sub_post['发表内容'].iloc[k] |
|||
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
tmp = tmp.replace(ch,'') |
|||
|
|||
if is_eng(tmp): ##英文内容 |
|||
|
|||
sub_post['语言'].iloc[k] = 0 |
|||
|
|||
##文本长度 |
|||
text = sub_post['发表内容'].iloc[k] |
|||
# text = "'America is collapsing and it's China's fault' is definitely a change of direction?" |
|||
text = text[0:text.rfind("http")] |
|||
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
text = text.replace(ch,' ') |
|||
words = text.split(' ') |
|||
sub_post['文本长度'].iloc[k] = len(words) |
|||
##情感 |
|||
sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment) |
|||
sub_post['emotion'].iloc[k] = sub_emo.loc[0,0] |
|||
sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1,0] |
|||
|
|||
elif is_chinese(tmp): ##中文内容 |
|||
|
|||
sub_post['语言'].iloc[k] = 1 |
|||
|
|||
##文本长度 |
|||
text = sub_post['发表内容'].iloc[k] |
|||
text = text[0:text.rfind("http")] |
|||
sub_post['文本长度'].iloc[k] = len(text) |
|||
##情感 |
|||
sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments-0.5)*2 |
|||
sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
|
|||
else: |
|||
|
|||
sub_post['语言'].iloc[k] = np.NaN |
|||
sub_post['文本长度'].iloc[k] = np.NaN |
|||
sub_post['emotion'].iloc[k] = np.NaN |
|||
sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
|
|||
if sub_post.shape[0] == 0: |
|||
post['有无传播内容'] = 0 |
|||
else: |
|||
post['有无传播内容'] = 1 |
|||
|
|||
post['传播链语言均值'] = sub_post['语言'].mean() |
|||
post['传播链贴文长度均值'] = sub_post['文本长度'].mean() |
|||
post['传播链贴文emotion均值'] = sub_post['emotion'].mean() |
|||
|
|||
##emotion_sub取有值的均值 |
|||
post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean() |
|||
|
|||
post['传播链贴文http均值'] = sub_post['http'].mean() |
|||
|
|||
post['传播链贴文at均值'] = sub_post['at'].mean() |
|||
|
|||
post['传播链贴文tag均值'] = sub_post['tag'].mean() |
|||
|
|||
post['diffdate均值'] = sub_post['diffdate'].mean() |
|||
|
|||
##三、用户信息 |
|||
##发帖用户 |
|||
post = pd.merge(post,data_user,how='left',on='用户名') |
|||
|
|||
##传播链用户 |
|||
sub_user = pd.DataFrame(data[data['传播层级']!='1'][['用户名']]) |
|||
sub_user = pd.merge(sub_user,data_user,how='left',on='用户名') |
|||
sub_user = sub_user.dropna() |
|||
|
|||
post['nickName均值'] = sub_user['nickName'].mean() |
|||
post['fansCount均值'] = sub_user['fansCount'].mean() |
|||
post['likeCount均值'] = sub_user['likeCount'].mean() |
|||
post['postCount均值'] = sub_user['postCount'].mean() |
|||
post['otherInfo均值'] = sub_user['otherInfo'].mean() |
|||
|
|||
postset = pd.concat([postset,post]).reset_index(drop=True) |
|||
|
|||
postset = postset.fillna(0) |
|||
postset['emotion_degree'] = abs(postset['emotion']) |
|||
|
|||
return postset |
|||
|
|||
|
|||
xlsx_path_po = r'假新闻数据输入\传播分析1209.xlsx' |
|||
xlsx_path_ne = r'假新闻数据输入\传播分析1220.xlsx' |
|||
|
|||
data_po = pd.read_excel(xlsx_path_po, dtype="str") |
|||
data_ne = pd.read_excel(xlsx_path_ne, dtype="str") |
|||
data_user = pd.read_excel(r'假新闻数据输入\Twitter_Account.xlsx', dtype="str") |
|||
data_user = pre_user(data_user) |
|||
|
|||
postset_po = post_related(data_po,data_user) ## 正面文件 |
|||
postset_ne = post_related(data_ne,data_user) ## 负面文件 |
|||
|
|||
postset_po['y'] = 1 |
|||
postset_ne['y'] = 0 |
|||
|
|||
|
|||
postset = pd.concat([postset_po,postset_ne]).drop_duplicates().reset_index(drop = True) |
|||
|
|||
features = postset[[ |
|||
#'shareCount', |
|||
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality', |
|||
'主贴http', '主贴at', '主贴tag', |
|||
'主贴长度','emotion', 'emotion_degree', |
|||
'最大词频数', '重复词汇占比',#(中英文差异大) |
|||
#'有无传播内容', |
|||
'fansCount','likeCount', 'postCount', |
|||
#'sub_shareCount', |
|||
'fansCount均值', 'postCount均值', 'otherInfo均值' |
|||
#,'结果' |
|||
]] |
|||
|
|||
target = pd.DataFrame(postset[postset.columns[-1]],columns=[postset.columns[-1]]) |
|||
X_train, X_test, y_train, y_test = train_test_split(features, target, |
|||
test_size = 0.25, random_state = 123) |
|||
|
|||
|
|||
RF_model = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0) |
|||
params = {"n_estimators":range(10,101,10)} |
|||
clf = GridSearchCV(estimator=RF_model, param_grid=params, cv=10) |
|||
clf.fit(X_train,y_train) |
|||
print(clf.best_params_)  # inspect the n_estimators chosen by the grid search
|||
clf_predict = clf.predict(X_test) |
|||
|
|||
joblib.dump(clf,r'F:\项目文件\情报\假新闻\fake_news_model.pkl') |
|||
clf = joblib.load(r'F:\项目文件\情报\假新闻\fake_news_model.pkl') |
|||
clf_predict = clf.predict(features) |
|||
|
|||
# cm5 = pd.crosstab(clf_predict,target.y) |
|||
# sns.heatmap(cm5, annot = True, cmap = 'GnBu', fmt = 'd') |
|||
# plt.xlabel('Real') |
|||
# plt.ylabel('Predict') |
|||
# plt.show() |
|||
|
|||
# accuracy_rate = sum(clf_predict == target.y) / len(target.y) |
|||
# target = pd.get_dummies(target)['y'] |
|||
# sum((clf_predict == target) & (target ==1)) / sum(clf_predict==1) |
|||
# sum((clf_predict == target) & (target ==0)) / sum(clf_predict==0) |
|||
# print('模型的准确率为:\n',accuracy_rate) |
|||
# print('模型的评估报告:\n',metrics.classification_report(target, clf_predict)) |
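# A minimal evaluation sketch on the hold-out split above, using the sklearn.metrics
# import that is commented out at the top of this script; y_test comes from the
# train_test_split call above.
from sklearn import metrics

test_pred = clf.predict(X_test)
print('accuracy:', metrics.accuracy_score(y_test.values.ravel(), test_pred))
print(metrics.classification_report(y_test.values.ravel(), test_pred))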
@ -0,0 +1,33 @@ |
|||
#coding:utf8 |
|||
import logging |
|||
import os |
|||
import sys |
|||
from logging.handlers import TimedRotatingFileHandler |
|||
import re |
|||
# cur_dir = os.path.dirname( os.path.abspath(__file__)) or os.getcwd() |
|||
# sys.path.append(cur_dir + '/log_util') |
|||
def set_logger(filename): |
|||
# 创建logger对象。传入logger名字 |
|||
logger = logging.getLogger(filename) |
|||
# log_path = os.path.join(cur_dir, filename) |
|||
# 设置日志记录等级 |
|||
logger.setLevel(logging.INFO) |
|||
# interval 滚动周期, |
|||
# when="MIDNIGHT", interval=1 表示每天0点为更新点,每天生成一个文件 |
|||
# backupCount 表示日志保存个数 |
|||
file_handler = TimedRotatingFileHandler( |
|||
filename=filename, when="MIDNIGHT",encoding="utf-8", interval=1, backupCount=3 |
|||
) |
|||
# filename="mylog" suffix设置,会生成文件名为mylog.2020-02-25.log |
|||
file_handler.suffix = "%Y-%m-%d.log" |
|||
# extMatch是编译好正则表达式,用于匹配日志文件名后缀 |
|||
# 需要注意的是suffix和extMatch一定要匹配的上,如果不匹配,过期日志不会被删除。 |
|||
file_handler.extMatch = re.compile(r"^\d{4}-\d{2}-\d{2}.log$") |
|||
# 定义日志输出格式 |
|||
file_handler.setFormatter( |
|||
logging.Formatter( |
|||
"[%(asctime)s] [%(process)d] [%(levelname)s] - %(module)s.%(funcName)s (%(filename)s:%(lineno)d) - %(message)s" |
|||
) |
|||
) |
|||
logger.addHandler(file_handler) |
|||
return logger |
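# Usage sketch: callers obtain a shared logger per log file, e.g.
#     logger = set_logger('logs/results.log')
#     logger.info("service started")
# Since logging.getLogger(filename) returns the same logger object on repeated calls,
# calling set_logger twice with the same filename attaches a second handler and
# duplicates every record; call it once per process.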
@ -0,0 +1,18 @@ |
|||
#!/usr/bin/env python |
|||
import os |
|||
import sys |
|||
import threading |
|||
from text_analysis.views import predict_news |
|||
import django |
|||
|
|||
if __name__ == "__main__": |
|||
t = threading.Thread(target=predict_news, name='predict_news') |
|||
t.daemon = True |
|||
t.start() |
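# Note: the worker thread is started with no arguments here, so predict_news as imported
# from text_analysis.views must be callable without parameters; the backup copies under
# text_analysis/bak define it as predict_news(dbConfig) and are wired up differently.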
|||
|
|||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings") |
|||
django.setup() |
|||
from django.core.management import execute_from_command_line |
|||
execute_from_command_line(sys.argv) |
|||
|
|||
|
@ -0,0 +1,35 @@ |
|||
#coding:utf8 |
|||
import requests |
|||
|
|||
def upload(): |
|||
url="https://realtime.pdeepmatrix.com/apis/media/analysis/upload" |
|||
# 定义form-data参数 |
|||
data = { |
|||
'fromLanguage': 'zh' |
|||
} |
|||
# 定义文件参数 |
|||
files = { |
|||
'file': open('inputdata/lKTZNen6aak.mp4', 'rb') |
|||
} |
|||
response = requests.post(url, data=data, files=files) |
|||
print(response.text) |
|||
|
|||
#结果—{"code":200,"message":"SUCCESS","data":"3a42ea9594b641c39e40d1497ca29be9"} |
|||
|
|||
def getResults(): |
|||
url="https://realtime.pdeepmatrix.com/apis/media/analysis/getResult" |
|||
# 定义参数 |
|||
#'taskId': '3a42ea9594b641c39e40d1497ca29be9' |
|||
params = { |
|||
'taskId': '5ee948446ab64d5d8a1d92ecfa6c2c93' |
|||
} |
|||
response = requests.get(url, params=params) |
|||
# 打印响应结果 |
|||
print(response.text) |
|||
#{"code":200,"message":"SUCCESS","data":{"sentences":[{"silence_duration":0,"end_time":5108,"speech_rate":150,"begin_time":1130,"channel_id":0,"emotion_value":"5.0","text":"视频解析、语音识别。"}]... |
|||
# upload() |
|||
getResults() |
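# A minimal end-to-end sketch combining the two calls above. It assumes, as the sample
# responses in the comments suggest, that the upload response's "data" field is the
# taskId and that getResult returns an empty "data" until the task finishes; the polling
# interval is arbitrary.
import time

def upload_and_poll(path, from_language='zh', interval=10):
    base = "https://realtime.pdeepmatrix.com/apis/media/analysis"
    with open(path, 'rb') as f:
        up = requests.post(base + "/upload", data={'fromLanguage': from_language},
                           files={'file': f}).json()
    task_id = up['data']
    while True:
        res = requests.get(base + "/getResult", params={'taskId': task_id}).json()
        if res.get('data'):
            return res
        time.sleep(interval)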
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1 @@ |
|||
../../environment/python3.8/bin/uwsgi --ini uwsgi.ini --file wsgi.py --daemonize wsgi.log |
@ -0,0 +1 @@ |
|||
lsof -i:9030 | grep -v 'PID' | awk '{print $2}' | xargs -r kill -9
@ -0,0 +1,103 @@ |
|||
#coding=utf8 |
|||
import sys |
|||
import requests |
|||
import json |
|||
import time |
|||
|
|||
# #url = 'http://0.0.0.0:5033' |
|||
# """ |
|||
# url = 'http://20.0.2.6:5055/classify_event' |
|||
# url = 'http://20.0.2.6:5055/is_about_china' |
|||
# url = 'http://20.0.2.6:5055/associated_words' |
|||
# """ |
|||
# url = 'http://127.0.0.1:9008/paper' |
|||
# |
|||
# # url_file ="http://172.18.1.130:9985/group33/default/20230415/09/15/1/“GF-1”影像质量评价及矿区土地利用分类潜力研究_陈明.docx" |
|||
# url_file="/opt/Project_kongtianyuan/inputfile/" |
|||
# filename = "“GF-1”影像质量评价及矿区土地利用分类潜力研究" |
|||
# |
|||
# data = {"url":url_file,"filename":filename} |
|||
# data_str = json.dumps(data) |
|||
# |
|||
# r = requests.post(url,data=str(data_str)) |
|||
# print(r.text) |
|||
# # res =json.loads(r.text) |
|||
# # print(res) |
|||
raw_data={ |
|||
"metadata":{ |
|||
"address":"http://172.24.12.126:9013/ASR/", |
|||
"index":0, |
|||
"admin":{ |
|||
"datasource":"2_任务提取" |
|||
}, |
|||
"output":{ |
|||
"output_type":"table", |
|||
"label_col":[ |
|||
"ASR识别内容" |
|||
] |
|||
}, |
|||
"input":{ |
|||
"input_type":"text", |
|||
"label":[ |
|||
"2_任务提取" |
|||
] |
|||
}, |
|||
"user":{ |
|||
"tag":"" |
|||
} |
|||
}, |
|||
"data":{ |
|||
"1_文件上传":"{\"fileId\":\"53aa330b4e484c9bdeb7ff35e335a6f6\",\"fileName\":\"lKTZNen6aak.mp4\",\"filePath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"fileType\":\"mp4\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"ossPath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\"}", |
|||
"businessKey":"19615b029da477fb", |
|||
"2_任务提取":"[{\"fileId\":\"53aa330b4e484c9bdeb7ff35e335a6f6\",\"fileName\":\"lKTZNen6aak.mp4\",\"filePath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"fileType\":\"mp4\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"ossPath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\"}]" |
|||
}, |
|||
"created":1691004265000, |
|||
"module":"ASR", |
|||
"start_tag":"false", |
|||
"multi_branch":0, |
|||
"last_edit":1693417201000, |
|||
"next_app_id":[ |
|||
{ |
|||
"start_id":154, |
|||
"edge_id":75, |
|||
"end_id":155 |
|||
} |
|||
], |
|||
"transfer_id":3, |
|||
"version":1, |
|||
"blueprint_id":4, |
|||
"scenes_id":5, |
|||
"scenario":{ |
|||
"dataloss":1, |
|||
"autoCommitTriggerLast":1, |
|||
"maxErrors":3, |
|||
"autoCommit":1, |
|||
"freshVariables":1 |
|||
}, |
|||
"wait_condition":[ |
|||
|
|||
], |
|||
"scheduling":{ |
|||
"interval":-1, |
|||
"type":"single" |
|||
}, |
|||
"name":"ASR", |
|||
"businessKey":"19615b029da477fb", |
|||
"id":154, |
|||
"position":[ |
|||
100, |
|||
200 |
|||
], |
|||
"describe":"ASR识别" |
|||
} |
|||
allFile = raw_data["data"]["2_任务提取"] |
|||
currentFile = eval(allFile) |
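# "2_任务提取" holds a JSON string, so json.loads(allFile) parses it directly and is the
# safer equivalent; eval() only works here because this particular payload is also valid
# Python literal syntax (no true/false/null values).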
|||
print(currentFile) |
|||
print(type(currentFile)) |
|||
# filejson = json.loads(currentFile) |
|||
# file = currentFile["fileUrl"] |
|||
# fileName = currentFile["fileName"] |
|||
|
|||
# print(file) |
|||
|
|||
|
@ -0,0 +1,108 @@ |
|||
#coding:utf8 |
|||
import os, sys |
|||
import io |
|||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import json |
|||
from django.http import HttpResponse |
|||
from text_analysis.tools import to_kafka,tool |
|||
from text_analysis.tools import pred |
|||
from django.views.decorators.csrf import csrf_exempt |
|||
from log_util.set_logger import set_logger |
|||
logging=set_logger('logs/results.log') |
|||
import traceback |
|||
import queue |
|||
from text_analysis.cusException import userFile_Exception,chainFile_Exception |
|||
import requests |
|||
import time |
|||
from datetime import datetime |
|||
import os |
|||
import joblib |
|||
#任务队列 |
|||
global task_queue |
|||
task_queue = queue.Queue() |
|||
|
|||
|
|||
@csrf_exempt |
|||
def fakeNewIdentification(request): |
|||
if request.method == 'POST': |
|||
try: |
|||
raw_data = json.loads(request.body) |
|||
task_queue.put(raw_data) |
|||
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False)) |
|||
except: |
|||
logging.error(traceback.format_exc()) |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False)) |
|||
else: |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False)) |
|||
|
|||
def predict_news(dbConfig): |
|||
while True: |
|||
if task_queue.qsize() > 0: |
|||
try: |
|||
logging.info("取任务队列长度{}".format(task_queue.qsize())) |
|||
raw_data = task_queue.get() |
|||
logging.info("原始数据-{}".format(raw_data)) |
|||
res = {"successCode": "1", "errorLog": "", "results": {}} |
|||
# 账号数据 |
|||
userData = tool.mysqlData(raw_data, logging,"1",dbConfig) |
|||
# if not userData: |
|||
# raise userFile_Exception |
|||
logging.info("账号数据获取完毕!-长度{}".format(len(userData))) |
|||
# 传播链数据 |
|||
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig) |
|||
if not postChain: |
|||
raise chainFile_Exception |
|||
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain))) |
|||
news=pred.predict_news(userData,postChain,logging) |
|||
# 结束标识 |
|||
res['isLast'] = True |
|||
for i in range(len(news)): |
|||
row_dict = news.iloc[i].to_dict() |
|||
row_dict['pageType'] = 'fakeNewsPage' |
|||
# postId |
|||
row_dict['postId'] = userData[0]['postId'] |
|||
res["results"] = json.dumps(row_dict,ensure_ascii=False) |
|||
raw_data["result"] = res |
|||
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except userFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '用户数据为空' |
|||
|
|||
res['results'] = json.dumps(results) |
|||
raw_data["result"] = res |
|||
logging.info("该条请求用户数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except chainFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '关系链数据为空' |
|||
res['results'] = json.dumps(results) |
|||
raw_data["result"] = res |
|||
logging.info("该条请求关系链数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except: |
|||
res = {"successCode": "0", "errorLog": "", "results": {}} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = "" |
|||
res['results'] = json.dumps(results) |
|||
raw_data["result"] = res |
|||
raw_data["result"]["error"] = traceback.format_exc() |
|||
logging.info(traceback.format_exc()) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
else: |
|||
# 暂无任务,进入休眠 |
|||
time.sleep(10) |
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,115 @@ |
|||
#coding:utf8 |
|||
import os, sys |
|||
import io |
|||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import json |
|||
from django.http import HttpResponse |
|||
from text_analysis.tools import to_kafka,tool |
|||
from text_analysis.tools import pred |
|||
from django.views.decorators.csrf import csrf_exempt |
|||
from log_util.set_logger import set_logger |
|||
logging=set_logger('logs/results.log') |
|||
import traceback |
|||
import queue |
|||
from text_analysis.cusException import userFile_Exception,chainFile_Exception |
|||
import requests |
|||
import time |
|||
from datetime import datetime |
|||
import os |
|||
import joblib |
|||
#任务队列 |
|||
global task_queue |
|||
task_queue = queue.Queue() |
|||
|
|||
|
|||
@csrf_exempt |
|||
def fakeNewIdentification(request): |
|||
if request.method == 'POST': |
|||
try: |
|||
raw_data = json.loads(request.body) |
|||
task_queue.put(raw_data) |
|||
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False)) |
|||
except: |
|||
logging.error(traceback.format_exc()) |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False)) |
|||
else: |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False)) |
|||
|
|||
def predict_news(dbConfig): |
|||
while True: |
|||
if task_queue.qsize() > 0: |
|||
try: |
|||
logging.info("取任务队列长度{}".format(task_queue.qsize())) |
|||
raw_data = task_queue.get() |
|||
logging.info("原始数据-{}".format(raw_data)) |
|||
res = {"successCode": "1", "errorLog": "", "results": {}} |
|||
# 账号数据 |
|||
userData = tool.mysqlData(raw_data, logging,"1",dbConfig) |
|||
# if not userData: |
|||
# raise userFile_Exception |
|||
logging.info("账号数据获取完毕!-长度{}".format(len(userData))) |
|||
# 传播链数据 |
|||
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig) |
|||
if not postChain: |
|||
raise chainFile_Exception |
|||
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain))) |
|||
news=pred.predict_news(userData,postChain,logging) |
|||
# 结束标识 |
|||
res['isLast'] = True |
|||
for i in range(len(news)): |
|||
row_dict = news.iloc[i].to_dict() |
|||
row_dict['pageType'] = 'fakeNewsPage' |
|||
# postId |
|||
row_dict['postId'] = userData[0]['postId'] |
|||
res["results"] = json.dumps(row_dict,ensure_ascii=False) |
|||
res["status"] = 1 |
|||
res["message"] = "成功" |
|||
raw_data["result"] = res |
|||
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except userFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '用户数据为空' |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "用户数据为空" |
|||
raw_data["result"] = res |
|||
logging.info("该条请求用户数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except chainFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '关系链数据为空' |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "关系链数据为空" |
|||
raw_data["result"] = res |
|||
logging.info("该条请求关系链数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except: |
|||
res = {"successCode": "0", "errorLog": "", "results": {}} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = "" |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "异常" |
|||
raw_data["result"] = res |
|||
raw_data["result"]["errorLog"] = traceback.format_exc() |
|||
logging.info(traceback.format_exc()) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
else: |
|||
# 暂无任务,进入休眠 |
|||
time.sleep(10) |
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,117 @@ |
|||
#coding:utf8 |
|||
import os, sys |
|||
import io |
|||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import json |
|||
from django.http import HttpResponse |
|||
from text_analysis.tools import to_kafka,tool |
|||
from text_analysis.tools import pred |
|||
from django.views.decorators.csrf import csrf_exempt |
|||
from log_util.set_logger import set_logger |
|||
logging=set_logger('logs/results.log') |
|||
import traceback |
|||
import queue |
|||
from text_analysis.cusException import userFile_Exception,chainFile_Exception |
|||
import requests |
|||
import time |
|||
from datetime import datetime |
|||
import os |
|||
import joblib |
|||
#任务队列 |
|||
global task_queue |
|||
task_queue = queue.Queue() |
|||
|
|||
|
|||
@csrf_exempt |
|||
def fakeNewIdentification(request): |
|||
if request.method == 'POST': |
|||
try: |
|||
raw_data = json.loads(request.body) |
|||
task_queue.put(raw_data) |
|||
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False)) |
|||
except: |
|||
logging.error(traceback.format_exc()) |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False)) |
|||
else: |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False)) |
|||
|
|||
def predict_news(dbConfig): |
|||
while True: |
|||
if task_queue.qsize() > 0: |
|||
try: |
|||
logging.info("取任务队列长度{}".format(task_queue.qsize())) |
|||
raw_data = task_queue.get() |
|||
logging.info("原始数据-{}".format(raw_data)) |
|||
res = {"successCode": "1", "errorLog": "", "results": {},"status":1,"message":"成功"} |
|||
# 账号数据 |
|||
userData = tool.mysqlData(raw_data, logging,"1",dbConfig) |
|||
# if not userData: |
|||
# raise userFile_Exception |
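# Note: with this empty-data check commented out, an empty userData still fails below at
# userData[0]['postId'], only with an IndexError instead of the dedicated
# userFile_Exception handling.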
|||
logging.info("账号数据获取完毕!-长度{}".format(len(userData))) |
|||
# 传播链数据 |
|||
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig) |
|||
if not postChain: |
|||
raise chainFile_Exception |
|||
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain))) |
|||
news=pred.predict_news(userData,postChain,logging) |
|||
# 结束标识 |
|||
res['isLast'] = True |
|||
for i in range(len(news)): |
|||
row_dict = news.iloc[i].to_dict() |
|||
row_dict['pageType'] = 'fakeNewsPage' |
|||
# postId |
|||
row_dict['postId'] = userData[0]['postId'] |
|||
if i == len(news) - 1: |
|||
row_dict["isLast"]=1 |
|||
res["results"] = json.dumps(row_dict,ensure_ascii=False) |
|||
res["status"] = 1 |
|||
res["message"] = "成功" |
|||
raw_data["result"] = res |
|||
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except userFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}, "status": 2,"message": "异常"} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '用户数据为空' |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "用户数据为空" |
|||
raw_data["result"] = res |
|||
logging.info("该条请求用户数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except chainFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}, "status": 2,"message": "异常"} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '关系链数据为空' |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "关系链数据为空" |
|||
raw_data["result"] = res |
|||
logging.info("该条请求关系链数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except: |
|||
res = {"successCode": "0", "errorLog": "", "results": {}, "status": 2,"message": "异常"} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = "" |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "异常" |
|||
raw_data["result"] = res |
|||
raw_data["result"]["errorLog"] = traceback.format_exc() |
|||
logging.info(traceback.format_exc()) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
else: |
|||
# 暂无任务,进入休眠 |
|||
time.sleep(10) |
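# A minimal sketch of wiring this worker up outside manage.py (the 'mysql' section name
# in config.ini is an assumption, as that file is not shown here):
#     import threading
#     from text_analysis.read_config import load_config
#     dbConfig = load_config()['mysql']
#     threading.Thread(target=predict_news, args=(dbConfig,), daemon=True).start()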
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,10 @@ |
|||
# -*- coding:utf-8 -*- |
|||
|
|||
class userFile_Exception(Exception): |
|||
def __str__(self): |
|||
return '用户数据为空' |
|||
|
|||
class chainFile_Exception(Exception): |
|||
def __str__(self): |
|||
return '传播链条数据为空' |
|||
|
@ -0,0 +1,9 @@ |
|||
import json |
|||
t={"a":1,"b":2,"c":3} |
|||
raw_data={} |
|||
res = {"successCode": "1", "errorLog": "", "results": {}} |
|||
res["results"] = json.dumps(t, ensure_ascii=False) |
|||
res["status"] = 1 |
|||
res["message"] = "成功" |
|||
raw_data["result"] = res |
|||
print(raw_data) |
@ -0,0 +1,10 @@ |
|||
import configparser |
|||
|
|||
#加载配置文件 |
|||
def load_config(): |
|||
configFile = './config.ini' |
|||
# 创建配置文件对象 |
|||
con = configparser.ConfigParser() |
|||
# 读取文件 |
|||
con.read(configFile, encoding='utf-8') |
|||
return con |
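# Usage sketch (section and option names are assumptions; config.ini is not shown here):
#     con = load_config()
#     host = con.get('mysql', 'host')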
@ -0,0 +1,14 @@ |
|||
#coding:utf8 |
|||
# import leida_ner_bert_crf |
|||
|
|||
import requests |
|||
|
|||
url = "http://172.18.1.166:9000/leidaduikang" |
|||
|
|||
payload = "{\"inputUrl\":\"/home/bfdadmin/leidabert/Project_leidaduikang/AInputdata/content_100.xlsx\"}" |
|||
headers = {'user-agent': "vscode-restclient",'header name': "header value"} |
|||
|
|||
response = requests.request("POST", url, timeout=1000000,data=payload, headers=headers) |
|||
|
|||
print(response.text) |
|||
|
@ -0,0 +1,148 @@ |
|||
""" |
|||
Django settings for Zhijian_Project_WebService project. |
|||
|
|||
Generated by 'django-admin startproject' using Django 1.8. |
|||
|
|||
For more information on this file, see |
|||
https://docs.djangoproject.com/en/1.8/topics/settings/ |
|||
|
|||
For the full list of settings and their values, see |
|||
https://docs.djangoproject.com/en/1.8/ref/settings/ |
|||
""" |
|||
|
|||
# Build paths inside the project like this: os.path.join(BASE_DIR, ...) |
|||
import os |
|||
|
|||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
|||
|
|||
|
|||
# Quick-start development settings - unsuitable for production |
|||
# See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/ |
|||
|
|||
# SECURITY WARNING: keep the secret key used in production secret! |
|||
SECRET_KEY = '330r)_!^qhd7$!w4)$y@4=p2bd*vlxf%4z(bx-fx-1i3txagvz' |
|||
|
|||
# SECURITY WARNING: don't run with debug turned on in production! |
|||
DEBUG = True |
|||
|
|||
ALLOWED_HOSTS = ['*'] |
|||
|
|||
|
|||
# Application definition |
|||
|
|||
INSTALLED_APPS = ( |
|||
'django.contrib.admin', |
|||
'django.contrib.auth', |
|||
'django.contrib.contenttypes', |
|||
'django.contrib.sessions', |
|||
'django.contrib.messages', |
|||
'django.contrib.staticfiles', |
|||
) |
|||
|
|||
MIDDLEWARE = [ |
|||
'django.contrib.sessions.middleware.SessionMiddleware', |
|||
'django.middleware.common.CommonMiddleware', |
|||
'django.middleware.csrf.CsrfViewMiddleware', |
|||
'django.contrib.auth.middleware.AuthenticationMiddleware', |
|||
# 'django.contrib.auth.middleware.SessionAuthenticationMiddleware', |
|||
'django.contrib.messages.middleware.MessageMiddleware', |
|||
'django.middleware.clickjacking.XFrameOptionsMiddleware', |
|||
'django.middleware.security.SecurityMiddleware', |
|||
] |
|||
|
|||
ROOT_URLCONF = 'text_analysis.urls' |
|||
|
|||
TEMPLATES = [ |
|||
{ |
|||
'BACKEND': 'django.template.backends.django.DjangoTemplates', |
|||
'DIRS': [], |
|||
'APP_DIRS': True, |
|||
'OPTIONS': { |
|||
'context_processors': [ |
|||
'django.template.context_processors.debug', |
|||
'django.template.context_processors.request', |
|||
'django.contrib.auth.context_processors.auth', |
|||
'django.contrib.messages.context_processors.messages', |
|||
], |
|||
}, |
|||
}, |
|||
] |
|||
|
|||
WSGI_APPLICATION = 'text_analysis.wsgi.application' |
|||
|
|||
|
|||
# Database |
|||
# https://docs.djangoproject.com/en/1.8/ref/settings/#databases |
|||
|
|||
# DATABASES = { |
|||
# 'default': { |
|||
# 'ENGINE': 'django.db.backends.sqlite3', |
|||
# 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), |
|||
# } |
|||
# } |
|||
|
|||
|
|||
# Internationalization |
|||
# https://docs.djangoproject.com/en/1.8/topics/i18n/ |
|||
|
|||
LANGUAGE_CODE = 'en-us' |
|||
|
|||
TIME_ZONE = 'Asia/Shanghai' |
|||
|
|||
USE_I18N = True |
|||
|
|||
USE_L10N = True |
|||
|
|||
USE_TZ = True |
|||
|
|||
|
|||
# Static files (CSS, JavaScript, Images) |
|||
# https://docs.djangoproject.com/en/1.8/howto/static-files/ |
|||
|
|||
STATIC_URL = '/static/' |
|||
|
|||
# U_LOGFILE_SIZE = 1 * 1024 * 1024 # 单日志文件最大100M |
|||
# U_LOGFILE_COUNT = 7 # 保留10个日志文件 |
|||
# |
|||
# LOGGING = { |
|||
# 'version': 1, |
|||
# 'disable_existing_loggers': True, # 禁用所有已经存在的日志配置 |
|||
# 'filters': { |
|||
# 'require_debug_false': { |
|||
# '()': 'django.utils.log.RequireDebugFalse' |
|||
# } |
|||
# }, |
|||
# 'formatters': { |
|||
# 'verbose': { |
|||
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] %(module)s %(process)d %(thread)d %(message)s' |
|||
# }, |
|||
# 'simple': { |
|||
# 'format': '%(levelname)s %(asctime)s @ %(process)d %(message)s' |
|||
# }, |
|||
# 'complete': { |
|||
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] (%(pathname)s/%(funcName)s:%(lineno)d) - %(message)s' |
|||
# }, |
|||
# 'online': { |
|||
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] - %(message)s' |
|||
# } |
|||
# }, |
|||
# 'handlers': { |
|||
# 'text': { |
|||
# 'level': 'DEBUG', |
|||
# #'class': 'logging.handlers.RotatingFileHandler', |
|||
# 'class': 'logging.handlers.TimedRotatingFileHandler', |
|||
# 'when': 'H', |
|||
# 'interval': 1, |
|||
# 'backupCount': U_LOGFILE_COUNT, |
|||
# 'formatter': 'complete', |
|||
# 'filename': os.path.join(BASE_DIR, 'logs/resultNew.log').replace('\\', '/'), |
|||
# } |
|||
# }, |
|||
# 'loggers': { |
|||
# 'text': { |
|||
# 'handlers': ['text'], |
|||
# 'level': 'DEBUG', |
|||
# 'propagate': False, |
|||
# } |
|||
# } |
|||
# } |
@ -0,0 +1,90 @@ |
|||
#coding:utf8 |
|||
import joblib |
|||
#accountName:johnsonleung |
|||
def predict(): |
|||
raw_data = {"user_file":{"accountId": "39234393", "accountName": "hello", "nickName": "Johnson Leung", "fansCount": 308,"likeCount": 92707,"postCount": 14237, "otherInfo": "{\"\"otherInfo\"\":\"\"{\"\"bio\"\": \"\"Huge}", "authentication": 0}, |
|||
"post_file":{"count":1,"LikeCount":12,"CommentsCount":1,"ShareCount":1,"length":150,"tags":0,"https":0,"at":0,"diffdate":1}} |
|||
''' |
|||
需要计算的入参 |
|||
1.count:帖子总数量 |
|||
2.LikeCount:帖子点赞数的平均值 |
|||
3.CommentsCount:帖子评论数的平均值 |
|||
4.ShareCount:帖子分享数的平均值 |
|||
5.length:帖子文本长度的平均值 |
|||
6.tags:帖子文本中包含“#”数量的平均值 |
|||
7.https:帖子文本中包含“https”数量的平均值 |
|||
8.at:帖子文本中包含“@”数量的平均值 |
|||
9.diffdate:全部帖子的最小值(帖子A发表时间和抓取时间的最大值-A的发表时间) |
|||
''' |
|||
#用户数据 |
|||
user_data=[] |
|||
try: |
|||
user_data_otherInfo_1 = 0 if raw_data["user_file"]["otherInfo"].strip() == "" else 1 |
|||
except: |
|||
user_data_otherInfo_1=0 |
|||
try: |
|||
user_data_nickName_2 = 0 if raw_data["user_file"]["nickName"].strip() == "" else 1 |
|||
except: |
|||
user_data_nickName_2=0 |
|||
try: |
|||
user_data_fansCount_3 = int(raw_data["user_file"]["fansCount"]) |
|||
except: |
|||
user_data_fansCount_3=0 |
|||
try: |
|||
user_data_likeCount_4=int(raw_data["user_file"]["likeCount"]) |
|||
except: |
|||
user_data_likeCount_4=0 |
|||
try: |
|||
user_data_postCount_5=int(raw_data["user_file"]["postCount"]) |
|||
except: |
|||
user_data_postCount_5=0 |
|||
try: |
|||
user_data_authentication_6=int(raw_data["user_file"]["authentication"]) |
|||
except: |
|||
user_data_authentication_6=0 |
|||
user_data.extend([user_data_otherInfo_1,user_data_nickName_2,user_data_fansCount_3,user_data_likeCount_4,user_data_postCount_5,user_data_authentication_6]) |
|||
#帖子数据 |
|||
post_data=[] |
|||
try: |
|||
post_data_count_1 = int(raw_data["post_file"]["count"]) |
|||
except: |
|||
post_data_count_1=0 |
|||
try: |
|||
post_data_LikeCount_2 = int(raw_data["post_file"]["LikeCount"]) |
|||
except: |
|||
post_data_LikeCount_2=0 |
|||
try: |
|||
post_data_CommentsCount_3 = int(raw_data["post_file"]["CommentsCount"]) |
|||
except: |
|||
post_data_CommentsCount_3=0 |
|||
try: |
|||
post_data_ShareCount_4 = int(raw_data["post_file"]["ShareCount"]) |
|||
except: |
|||
post_data_ShareCount_4=0 |
|||
try: |
|||
post_data_length_5 = int(raw_data["post_file"]["length"]) |
|||
except: |
|||
post_data_length_5=0 |
|||
try: |
|||
post_data_tags_6 = int(raw_data["post_file"]["tags"]) |
|||
except: |
|||
post_data_tags_6=0 |
|||
try: |
|||
post_data_https_7 = int(raw_data["post_file"]["https"]) |
|||
except: |
|||
post_data_https_7=0 |
|||
try: |
|||
post_data_at_8 = int(raw_data["post_file"]["at"]) |
|||
except: |
|||
post_data_at_8=0 |
|||
try: |
|||
post_data_diffdate_9 = int(raw_data["post_file"]["diffdate"]) |
|||
except: |
|||
post_data_diffdate_9=0 |
|||
post_data.extend([post_data_count_1,post_data_LikeCount_2,post_data_CommentsCount_3,post_data_ShareCount_4,post_data_length_5,post_data_tags_6,post_data_https_7,post_data_at_8,post_data_diffdate_9]) |
|||
features=[user_data+post_data] |
|||
bot_user = joblib.load("model/bot_user.pkl") # 加载训练好的模型 |
|||
result=bot_user.predict(features) |
|||
print(result) |
|||
# 参数顺序[['otherInfo', 'nickName', 'fansCount', 'likeCount','postCount', 'authentication', 'count', 'LikeCount', 'CommentsCount', 'ShareCount','length', 'tags', 'https', 'at', 'diffdate']] |
|||
predict() |
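# A minimal sketch of how the nine post_file aggregates described in the docstring above
# could be derived from a posts DataFrame. The column names ('likeCount',
# 'commentsCount', 'shareCount', 'content', 'pubTime', 'crawlTime') are assumptions and
# are not taken from this repo.
import pandas as pd

def build_post_features(posts):
    texts = posts['content'].fillna('')
    pub = pd.to_datetime(posts['pubTime'])
    crawl = pd.to_datetime(posts['crawlTime'])
    newest = pd.concat([pub, crawl], axis=1).max(axis=1)  # max(publish time, crawl time) per post
    return {
        'count': len(posts),
        'LikeCount': posts['likeCount'].astype(int).mean(),
        'CommentsCount': posts['commentsCount'].astype(int).mean(),
        'ShareCount': posts['shareCount'].astype(int).mean(),
        'length': texts.str.len().mean(),
        'tags': texts.str.count('#').mean(),
        'https': texts.str.count('https').mean(),
        'at': texts.str.count('@').mean(),
        'diffdate': int((newest - pub).dt.days.min()),
    }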
@ -0,0 +1,456 @@ |
|||
#coding:utf8 |
|||
import pandas as pd |
|||
import numpy as np |
|||
import networkx as nx |
|||
from textblob import TextBlob |
|||
from snownlp import SnowNLP |
|||
from wordcloud import STOPWORDS |
|||
import jieba |
|||
# import tool |
|||
from tqdm import tqdm |
|||
import os,sys |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import datetime |
|||
# from sklearn.model_selection import train_test_split |
|||
# from sklearn.ensemble import RandomForestClassifier |
|||
# from sklearn.model_selection import GridSearchCV |
|||
import joblib |
|||
def pre_user(data_user): |
|||
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x) |
|||
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int) |
|||
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int) |
|||
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']] |
|||
data_user = data_user.dropna() |
|||
data_user = data_user.drop_duplicates().reset_index(drop = True) |
|||
data_user['fansCount'] = data_user['fansCount'].astype(int) |
|||
data_user['likeCount'] = data_user['likeCount'].astype(int) |
|||
data_user['postCount'] = data_user['postCount'].astype(int) |
|||
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
return data_user |
|||
|
|||
def getText_count_eng(txt): |
|||
"""英文词频统计""" |
|||
txt = txt.lower() #将所有大写字母变成小写 |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格 |
|||
txt = txt.replace(ch," ") |
|||
words = txt.split() |
|||
counts = {} |
|||
for word in words: |
|||
if word not in STOPWORDS: |
|||
if word != '\t': |
|||
counts[word] = counts.get(word,0) + 1 #统计字数 |
|||
items = pd.DataFrame(list(counts.items())) |
|||
return items |
|||
|
|||
def getText_count_ch(txt): |
|||
"""中文词频统计""" |
|||
txt = txt.lower() #将所有大写字母变成小写 |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz': #将文本中特殊符号数字删除 |
|||
txt = txt.replace(ch,"") |
|||
words = jieba.lcut(txt) |
|||
counts = {} |
|||
for word in words: |
|||
counts[word] = counts.get(word,0) + 1 |
|||
items = list(counts.items()) |
|||
fin_items = [] |
|||
for item in items: |
|||
if len(item[0])>=2: |
|||
fin_items.append(item) |
|||
fin_items = pd.DataFrame(fin_items) |
|||
return fin_items |
|||
|
|||
def getText_count_U(txt): |
|||
"""统计英文大写词频""" |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格 |
|||
txt = txt.replace(ch," ") |
|||
words = txt.split() |
|||
counts = {} |
|||
for word in words: |
|||
if word not in STOPWORDS: |
|||
if word != '/t': |
|||
if word.isupper(): #统计大写 |
|||
counts[word] = counts.get(word,0) + 1 #统计字数 |
|||
items = pd.DataFrame(list(counts.items())) #将字典类型转换成列表类型 |
|||
if items.shape == (0,0): |
|||
out = 0 |
|||
else: |
|||
out = sum(items[1]) |
|||
return out |
|||
|
|||
def is_chinese(strs): |
|||
"""判断一个unicode是否是汉字/英文""" |
|||
strs = strs.lower() |
|||
for uchar in strs: |
|||
if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar): |
|||
return False |
|||
return True |
|||
|
|||
def is_eng(strs): |
|||
"""判断一个unicode是否是英文""" |
|||
strs = strs.lower() |
|||
for uchar in strs: |
|||
if (uchar < u'\u0061') or (u'\u007a' < uchar): |
|||
return False |
|||
return True |
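
# Quick illustration (added): is_eng only accepts strings whose lowercased characters
# all fall in a-z; is_chinese also allows CJK characters, so is_eng is tested first,
# which is the order used in post_related below.
def _example_language_checks():
    assert is_eng("FakeNews")        # pure ASCII letters
    assert not is_eng("假新闻fake")   # CJK characters fail the English check
    assert is_chinese("假新闻")       # CJK-only passes the Chinese/English check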
|||
|
|||
# def pre_user(data_user): |
|||
# data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x) |
|||
# data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int) |
|||
# data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int) |
|||
# data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']] |
|||
# data_user = data_user.dropna() |
|||
# data_user = data_user.drop_duplicates().reset_index(drop = True) |
|||
# data_user['fansCount'] = data_user['fansCount'].astype(int) |
|||
# data_user['likeCount'] = data_user['likeCount'].astype(int) |
|||
# data_user['postCount'] = data_user['postCount'].astype(int) |
|||
# data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
# return data_user |
|||
|
|||
|
|||
def post_related(df, data_user,logging): |
|||
# postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', |
|||
# 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank', |
|||
# 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag', |
|||
# 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容', |
|||
# '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差', |
|||
# '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差', |
|||
# '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值', |
|||
# '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差']) |
|||
postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id','所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', |
|||
'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank', |
|||
'语言', '主贴长度', '主贴http', '主贴at', '主贴tag', |
|||
'emotion', 'emotion_sub', '最大词频数', '重复词汇占比']) |
|||
|
|||
for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop=True): |
|||
|
|||
data = df[df['所属帖子id'] == post_id].reset_index(drop=True) |
|||
|
|||
data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', |
|||
'shareCount', 'url'] |
|||
|
|||
data = data.drop_duplicates() |
|||
|
|||
post = data[data['传播层级'] == 1].head(1) |
|||
### 一、新闻传播--贴文网络 |
|||
##1.layer/shape/degree |
|||
post['layer'] = int(max(data['传播层级'])) |
|||
post['shape'] = data.shape[0] - 1 |
|||
post['degree'] = data[data['传播层级'] == 2].shape[0] |
|||
|
|||
##2.整体网络测度(贴文网络测度) |
|||
###2.1把转发来源id对应到转发来源用户 |
|||
tmp_zfyh = pd.merge(data[data['传播层级'] != 1]['转发来源id'].drop_duplicates(), |
|||
data[data['帖子id'].notnull()][['帖子id', '用户名']], |
|||
left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']] |
|||
tmp_zfyh.columns = ['转发来源id', '转发来源用户名'] |
|||
data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left') |
|||
post_edge = data.copy() |
|||
post_edge = data[data['传播层级'] != 1][['用户名', '转发来源用户名']] |
|||
post_edge.columns = ['source', 'target'] |
|||
post_edge['count_all'] = 1 |
|||
post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index() |
|||
# post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False) |
|||
|
|||
edgeweightset = post_edge[['source', 'target', 'count_all']] |
|||
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])] |
|||
for k in range(len(edgeweightset_l)): |
|||
for j in range(edgeweightset.shape[1]): |
|||
edgeweightset_l[k].append(edgeweightset.iloc[k, j]) |
|||
# print(i/len(edgeweightset_l)) |
|||
|
|||
if len(edgeweightset_l) == 0: # 没有传播链 |
|||
post['closeness_centrality'] = 1 |
|||
post['pagerank'] = 1 |
|||
else: |
|||
g = nx.DiGraph() |
|||
g.add_weighted_edges_from(edgeweightset_l) |
|||
centrality = [nx.closeness_centrality(g), |
|||
nx.pagerank(g)] |
|||
results = [] |
|||
nodes = g.nodes() # 提取网络中节点列表 |
|||
for node in nodes: # 遍历所有节点,提取每个节点度中心性计算结果,并存储为[[节点1,结果],[节点2,结果],...]的形式 |
|||
results.append([node, |
|||
centrality[0][node], |
|||
centrality[1][node]]) |
|||
results = pd.DataFrame(results) |
|||
results.columns = ['node', 'closeness_centrality', 'pagerank'] |
|||
|
|||
post['closeness_centrality'] = results[results['node'] == results[ |
|||
results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][ |
|||
'closeness_centrality'].iloc[0] |
|||
post['pagerank'] = results[results['node'] == |
|||
results[results['closeness_centrality'] == max(results['closeness_centrality'])][ |
|||
'node'].iloc[0]]['pagerank'].iloc[0] |
|||
|
|||
# post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0] |
|||
# post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0] |
|||
|
|||
#——————————hh—————————————— |
|||
# 特征未使用 |
|||
# ##3.传播链中的平均影响力shareCount |
|||
# tmp = 0 |
|||
# for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]): |
|||
# tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k]) |
|||
# if tmp == 0: |
|||
# post['sub_shareCount'] = 0 |
|||
# else: |
|||
# post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0] |
|||
|
|||
#———————————————————————— |
|||
|
|||
|
|||
##二、主贴文本 |
|||
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.' |
|||
##文本特殊字符个数(http、@、#) |
|||
# logging.info(post) |
|||
post['主贴http'] = post['发表内容'].iloc[0].count('http') |
|||
post['主贴at'] = post['发表内容'].iloc[0].count('@') |
|||
post['主贴tag'] = post['发表内容'].iloc[0].count('#') |
|||
|
|||
##判断语言 |
|||
tmp = post['发表内容'].iloc[0] |
|||
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
tmp = tmp.replace(ch, '') |
|||
|
|||
if is_eng(tmp): ##主贴英文内容 |
|||
|
|||
post['语言'] = 0 |
|||
text = post['发表内容'].iloc[0] |
|||
# text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring' |
|||
text = text[0:text.rfind("http")] |
|||
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
text = text.replace(ch, ' ') |
|||
|
|||
##文本长度 |
|||
words = text.split(' ') |
|||
post['主贴长度'] = len(words) |
|||
|
|||
##文本情感 |
|||
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments) |
|||
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment) |
|||
post['emotion'] = emo.loc[0, 0] |
|||
post['emotion_sub'] = emo.loc[1, 0] |
|||
|
|||
##文本词频 |
|||
## 词频统计1:最大词频数 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_eng(text) |
|||
if items.shape == (0, 0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0] |
|||
|
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = getText_count_U(text) |
|||
|
|||
elif is_chinese(tmp): ##主贴中文内容 |
|||
|
|||
post['语言'] = 1 |
|||
|
|||
text = post['发表内容'].iloc[0] |
|||
text = text[0:text.rfind("http")] |
|||
post['主贴长度'] = len(text) |
|||
|
|||
post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2 |
|||
post['emotion_sub'] = np.NaN |
|||
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0] |
|||
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1] |
|||
|
|||
##文本词频 |
|||
## 词频统计1:标题中出现的词,在正文中出现最大词频 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_ch(text) |
|||
if items.shape == (0, 0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0] |
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = np.NaN |
|||
|
|||
else: |
|||
post['语言'] = np.NaN |
|||
post['主贴长度'] = np.NaN |
|||
post['emotion'] = np.NaN |
|||
post['emotion_sub'] = np.NaN |
|||
post['最大词频数'] = np.NaN |
|||
post['重复词汇占比'] = np.NaN |
|||
post['大写词频'] = np.NaN |
|||
|
|||
# ##4.2传播链中的文本 |
|||
# sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']]) |
|||
# sub_post['语言'] = np.NaN |
|||
# sub_post['文本长度'] = np.NaN |
|||
# sub_post['http'] = np.NaN |
|||
# sub_post['at'] = np.NaN |
|||
# sub_post['tag'] = np.NaN |
|||
# sub_post['emotion'] = np.NaN |
|||
# sub_post['emotion_sub'] = np.NaN |
|||
# sub_post['diffdate'] = np.NaN |
|||
# |
|||
# for k in range(sub_post.shape[0]): |
|||
# ##文本特殊字符个数(http、@、#) |
|||
# sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http') |
|||
# sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@') |
|||
# sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#') |
|||
# |
|||
# ##时间差 |
|||
# d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S") |
|||
# base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S") |
|||
# |
|||
# # now = datetime.datetime.now() |
|||
# sub_post['diffdate'].iloc[k] = (d1 - base).days |
|||
# |
|||
# ##判断语言 |
|||
# tmp = sub_post['发表内容'].iloc[k] |
|||
# for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
# tmp = tmp.replace(ch, '') |
|||
# |
|||
# if is_eng(tmp): ##英文内容 |
|||
# |
|||
# sub_post['语言'].iloc[k] = 0 |
|||
# |
|||
# ##文本长度 |
|||
# text = sub_post['发表内容'].iloc[k] |
|||
# # text = "'America is collapsing and it's China's fault' is definitely a change of direction?" |
|||
# text = text[0:text.rfind("http")] |
|||
# for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
# text = text.replace(ch, ' ') |
|||
# words = text.split(' ') |
|||
# sub_post['文本长度'].iloc[k] = len(words) |
|||
# ##情感 |
|||
# sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment) |
|||
# sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0] |
|||
# sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0] |
|||
# |
|||
# elif is_chinese(tmp): ##中文内容 |
|||
# |
|||
# sub_post['语言'].iloc[k] = 1 |
|||
# |
|||
# ##文本长度 |
|||
# text = sub_post['发表内容'].iloc[k] |
|||
# text = text[0:text.rfind("http")] |
|||
# sub_post['文本长度'].iloc[k] = len(text) |
|||
# ##情感 |
|||
# sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2 |
|||
# sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
# |
|||
# else: |
|||
# |
|||
# sub_post['语言'].iloc[k] = np.NaN |
|||
# sub_post['文本长度'].iloc[k] = np.NaN |
|||
# sub_post['emotion'].iloc[k] = np.NaN |
|||
# sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
# |
|||
# if sub_post.shape[0] == 0: |
|||
# post['有无传播内容'] = 0 |
|||
# else: |
|||
# post['有无传播内容'] = 1 |
|||
# |
|||
# post['传播链语言均值'] = sub_post['语言'].mean() |
|||
# post['传播链贴文长度均值'] = sub_post['文本长度'].mean() |
|||
# post['传播链贴文emotion均值'] = sub_post['emotion'].mean() |
|||
# |
|||
# ##emotion_sub取有值的均值 |
|||
# post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean() |
|||
# |
|||
# post['传播链贴文http均值'] = sub_post['http'].mean() |
|||
# |
|||
# post['传播链贴文at均值'] = sub_post['at'].mean() |
|||
# |
|||
# post['传播链贴文tag均值'] = sub_post['tag'].mean() |
|||
# |
|||
# post['diffdate均值'] = sub_post['diffdate'].mean() |
|||
|
|||
##三、用户信息 |
|||
##发帖用户 |
|||
post = pd.merge(post, data_user, how='left', on='用户名') |
|||
|
|||
##传播链用户 |
|||
sub_user = pd.DataFrame(data[data['传播层级'] != 1][['用户名']]) |
|||
sub_user = pd.merge(sub_user, data_user, how='left', on='用户名') |
|||
sub_user = sub_user.dropna() |
|||
|
|||
post['nickName均值'] = sub_user['nickName'].mean() |
|||
post['fansCount均值'] = sub_user['fansCount'].mean() |
|||
post['likeCount均值'] = sub_user['likeCount'].mean() |
|||
post['postCount均值'] = sub_user['postCount'].mean() |
|||
post['otherInfo均值'] = sub_user['otherInfo'].mean() |
|||
|
|||
postset = pd.concat([postset, post]).reset_index(drop=True) |
|||
|
|||
postset = postset.fillna(0) |
|||
postset['emotion_degree'] = abs(postset['emotion']) |
|||
|
|||
return postset |
|||
|
|||
def predict_news(userData,postChain,logging): |
|||
data_po = pd.DataFrame(postChain).replace('', np.nan) |
|||
data_po.columns = ['id','层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url','topicId'] |
|||
data_po=data_po[['层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url']] |
|||
if not userData: |
|||
columns=['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo']
|||
data_user=pd.DataFrame(columns=columns) |
|||
else: |
|||
data_user = pd.DataFrame(userData).replace('', np.nan) |
|||
data_user.columns = ['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo'] |
|||
data_user=data_user[['accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo']] |
|||
data_user = pre_user(data_user) |
|||
#data_user=dataframe[@XHNews,1,878,1178,938,1] |
|||
#data_user.columns=['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
|
|||
postset_po = post_related(data_po,data_user,logging) ## 正面文件 |
|||
features = postset_po[[ |
|||
#'shareCount', |
|||
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality', |
|||
'主贴http', '主贴at', '主贴tag', |
|||
'主贴长度','emotion', 'emotion_degree', |
|||
'最大词频数', '重复词汇占比',#(中英文差异大) |
|||
#'有无传播内容', |
|||
'fansCount','likeCount', 'postCount', |
|||
#'sub_shareCount', |
|||
'fansCount均值', 'postCount均值', 'otherInfo均值' |
|||
]] |
|||
|
|||
clf = joblib.load(par_dir+'/model/fake_news_model.pkl') |
|||
clf_predict = clf.predict(features) |
|||
res=pd.DataFrame(clf_predict) |
|||
res.columns=['假新闻预测结果'] |
|||
res['recognitionResult'] = res['假新闻预测结果'].apply(lambda x: '假新闻' if x == 1 else '真新闻') |
|||
result = pd.concat([postset_po, res], axis=1) |
|||
return result |
|||
|
|||
|
|||
if __name__=="__main__": |
|||
print(par_dir) |
|||
# user={ |
|||
# "topicId":1209, |
|||
# "host":"172.26.28.30", |
|||
# "user":"crawl", |
|||
# "passwd":"crawl123", |
|||
# "db":"test", |
|||
# "port":3306, |
|||
# "table":"TwitterAccount" |
|||
# } |
|||
# userData = tool.mysqlData(user,"") |
|||
# # logging.info("账号数据获取完毕!") |
|||
# # 传播链数据 |
|||
# # post = raw_data["metadata"]["admin"]["Twitter_chain"] |
|||
# post={ |
|||
# "topicId":1209, |
|||
# "host":"172.26.28.30", |
|||
# "user":"crawl", |
|||
# "passwd":"crawl123", |
|||
# "db":"test", |
|||
# "port":3306, |
|||
# "table":"Twitter_chain" |
|||
# } |
|||
# postChain = tool.mysqlData(post, "") |
|||
# # logging.info("传播链数据获取完毕!") |
|||
# predict_news(userData,postChain,"") |
@ -0,0 +1,220 @@ |
|||
#coding:utf8 |
|||
import re |
|||
import pymysql |
|||
import pandas as pd |
|||
import numpy as np |
|||
import networkx as nx |
|||
import traceback |
|||
import json |
|||
from jsonpath_ng import jsonpath, parse |
|||
|
|||
def get_taskId(raw_data): |
|||
taskid = raw_data["metadata"]["admin"]["taskId"] |
|||
all_result = raw_data['data'] |
|||
param_split = taskid.split(":") |
|||
datasourcestr = all_result[param_split[0]] |
|||
datasource = json.loads(datasourcestr) |
|||
# 创建 JsonPath 表达式对象 |
|||
expr = parse(param_split[1]) |
|||
# 使用表达式来选择 JSON 元素 |
|||
match = [match.value for match in expr.find(datasource)] |
|||
val = match[0] |
|||
return val |
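
# Illustrative sketch (hypothetical payload): taskId follows the convention
# "<key in data>:<JSONPath>", so "1_input:$.taskId" loads the JSON string stored
# under data["1_input"] and extracts its taskId field with jsonpath_ng.
def _example_get_taskId():
    raw_data = {
        "metadata": {"admin": {"taskId": "1_input:$.taskId"}},
        "data": {"1_input": json.dumps({"taskId": 42})},
    }
    return get_taskId(raw_data)  # -> 42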
|||
|
|||
|
|||
def mysqlData(raw_data,logging,dataTag): |
|||
result='' |
|||
taskId = get_taskId(raw_data) |
|||
if dataTag=='1': |
|||
table="tw_account" |
|||
else: |
|||
table="tw_deep" |
|||
inputdata=raw_data["metadata"]["admin"] |
|||
try: |
|||
db = pymysql.connect(host=inputdata["Host"], user=inputdata["User"], passwd=inputdata["Password"], |
|||
db=inputdata["Database"], port=inputdata["Port"], charset='utf8',cursorclass=pymysql.cursors.DictCursor, connect_timeout=30) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
sql="SELECT * FROM {} WHERE taskId={}".format(table,taskId) |
|||
cursor.execute(sql) |
|||
result = cursor.fetchall() |
|||
db.commit() |
|||
cursor.close() |
|||
db.close() |
|||
except: |
|||
logging.info("专题关系数据查询失败!") |
|||
logging.info(traceback.format_exc()) |
|||
return result |
|||
|
|||
def get_replyData(data): |
|||
reply=pd.DataFrame(data) |
|||
reply = reply.drop_duplicates().reset_index(drop=True) # 去重 |
|||
reply=reply[['ReviewerAccountId', 'PostAccountId']] |
|||
# reply.columns = ['ReviewerAccountId', 'ReviewerAccountName', 'PostAccountId', 'PostAccountName', |
|||
# 'ShareCount', 'LikeCount', 'CommentCount', 'CommentTime'] |
|||
reply = reply[['ReviewerAccountId', 'PostAccountId']] |
|||
reply['ReviewerAccountId'] = reply['ReviewerAccountId'].astype(str) |
|||
reply['PostAccountId'] = reply['PostAccountId'].astype(str) |
|||
|
|||
reply = reply.groupby(['ReviewerAccountId', 'PostAccountId']).size().reset_index() |
|||
# user_net_df = user_net(reply) ##SNA数据清洗 |
|||
edgeweightset = reply.fillna(0) |
|||
edgeweightset.columns = ['source', 'target', 'count'] |
|||
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])] |
|||
for i in range(len(edgeweightset_l)): |
|||
for j in range(edgeweightset.shape[1]): |
|||
edgeweightset_l[i].append(edgeweightset.iloc[i, j]) |
|||
g = nx.DiGraph() |
|||
g.add_weighted_edges_from(edgeweightset_l) |
|||
|
|||
degree = [g.degree(), |
|||
g.in_degree(), |
|||
g.out_degree()] |
|||
|
|||
centrality = [nx.degree_centrality(g), # 计算图 g 中每个节点的度中心性。度中心性是指节点的度(与其他节点相连的边的数量)与图中节点总数的比值。 |
|||
nx.closeness_centrality(g), # 计算图 g 中每个节点的接近中心性。接近中心性是指节点到其他节点的平均最短路径长度的倒数。 |
|||
nx.pagerank(g), # 计算图 g 中每个节点的 PageRank 值。PageRank 是一种用于评估网页重要性的算法,也可以应用于其他网络中的节点重要性评估。 |
|||
nx.clustering(g)] # 计算图 g 中每个节点的聚集系数。聚集系数是指节点的邻居之间存在连接的概率。 |
|||
#把主贴相关信息拿出来 |
|||
tmp=edgeweightset["target"].values |
|||
node_list = [] |
|||
nodes = g.nodes() # 提取网络中节点列表 |
|||
for node in nodes: |
|||
if node not in tmp: |
|||
continue |
|||
node_list.append([node, |
|||
degree[0][node], |
|||
degree[1][node], |
|||
degree[2][node], |
|||
centrality[0][node], |
|||
centrality[1][node], |
|||
centrality[2][node], |
|||
centrality[3][node]]) |
|||
|
|||
node_list = pd.DataFrame(node_list) |
|||
node_list.columns = ['Id', 'degree', 'in_degree', 'out_degree', |
|||
'degree_centrality', 'closeness_centrality', 'pagerank', 'clustering'] |
|||
node_list['user_flag_infl'] = 0 |
|||
node_list['user_flag_act'] = 0 |
|||
node_list.user_flag_infl[node_list['out_degree'] > np.percentile(node_list['out_degree'], 95)] = 1 |
|||
node_list.user_flag_act[(node_list['in_degree'] > np.percentile(node_list['in_degree'], 90)) & |
|||
(node_list['closeness_centrality'] > np.percentile(node_list['closeness_centrality'], |
|||
50))] = 1 |
|||
node_dic=node_list.set_index('Id')[['degree', 'in_degree','out_degree','degree_centrality','closeness_centrality','pagerank','clustering']].T.to_dict() |
|||
return node_dic |
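
# Illustrative sketch (hypothetical account ids): each record links a reviewer to
# the account being replied to; the returned dict is keyed by accounts that appear
# as reply targets and carries the per-node network metrics computed above.
def _example_get_replyData():
    sample = [
        {"ReviewerAccountId": "u1", "PostAccountId": "u2"},
        {"ReviewerAccountId": "u3", "PostAccountId": "u2"},
        {"ReviewerAccountId": "u2", "PostAccountId": "u1"},
    ]
    return get_replyData(sample)  # e.g. {'u2': {'degree': 3, ...}, 'u1': {...}}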
|||
|
|||
|
|||
|
|||
|
|||
def get_content(inputdata,logging): |
|||
""" |
|||
重新组装参数 |
|||
:param inputdata:原json数据 |
|||
:return: 组装的prompt及其他参数 |
|||
""" |
|||
res={} |
|||
admin=inputdata["metadata"]["admin"] |
|||
data=inputdata["data"] |
|||
prompt=admin["prompt"] |
|||
if_user=re.findall("{{(.*)}}",prompt) |
|||
if_data=re.findall("@@(.*)@@",prompt) |
|||
if if_user != []: |
|||
user_data=inputdata["metadata"]["user"] |
|||
if if_user[0] in user_data.keys(): |
|||
tmp=user_data[if_user[0]] |
|||
prompt=re.sub("{{(.*)}}",tmp,prompt) |
|||
if if_data!=[] and if_data[0] in data.keys(): |
|||
tmp1=data[if_data[0]] |
|||
prompt=re.sub("@@(.*)@@",tmp1,prompt) |
|||
res["prompt"]=prompt |
|||
res["authorization"]=admin["authorization"] |
|||
res["model"]=admin["model"] |
|||
res["temperature"]=admin["temperature"] |
|||
res["top_p"]=admin["top_p"] |
|||
res["n"]=admin["n"] |
|||
return res |
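
# Minimal sketch of the prompt templating (hypothetical values): "{{name}}" pulls
# from metadata["user"], "@@key@@" pulls from the data block; both are replaced
# before the request parameters are returned.
def _example_get_content():
    sample = {
        "metadata": {
            "admin": {"prompt": "Summarize: @@body@@", "authorization": "sk-xxx",
                      "model": "gpt-3.5-turbo", "temperature": "0.2",
                      "top_p": "1", "n": "1"},
            "user": {}
        },
        "data": {"body": "hello world"}
    }
    return get_content(sample, None)  # res["prompt"] -> "Summarize: hello world"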
|||
|
|||
|
|||
|
|||
if __name__=="__main__": |
|||
inputdata={ |
|||
"metadata":{ |
|||
"output":{ |
|||
"output_type":"table", |
|||
"label_col":[ |
|||
"软件著作抽取结果" |
|||
] |
|||
}, |
|||
"input":{ |
|||
"input_type":"text", |
|||
"label":[ |
|||
"7_软件著作过滤器" |
|||
] |
|||
}, |
|||
"address":"http://172.18.1.181:9011/chatGpt/", |
|||
"admin":{ |
|||
"authorization":"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", |
|||
"top_p":"1", |
|||
"user_input":[ |
|||
{ |
|||
"keyname":"tag", |
|||
"keydesc":"" |
|||
} |
|||
], |
|||
"temperature":"0.2", |
|||
"model":"gpt-3.5-turbo-16k", |
|||
"prompt":"请在下面这句话中提取出:证书号、软件名称、著作权人,以json格式输出,找不到的字段赋值为空字符串,不要有多余的文字输出,只输出json结构。@@7_软件著作过滤器@@", |
|||
"n":"1" |
|||
}, |
|||
"index":1 |
|||
}, |
|||
"data":{ |
|||
"1_项目文件上传":"[{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/05/1/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileId\":\"cd6592f0389bb1da25afbb44901f9cde\",\"fileName\":\"1-基于时间序列遥感 影像洪涝检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/08/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileId\":\"944eec1cf98f216ea953459dac4dd505\",\"fileName\":\"3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/09/1/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileId\":\"eb378cb9ee914323f601500378dfad76\",\"fileName\":\"4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\" }]", |
|||
"2_文件分类信息":"{\"软件著作\":4}", |
|||
"3_OCR识别内容":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}", |
|||
"businessKey":"185aef3b1c810799a6be8314abf6512c", |
|||
"7_软件著作过滤器":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}" |
|||
}, |
|||
"created":1691004265000, |
|||
"module":"OCR", |
|||
"start_tag":"false", |
|||
"last_edit":1692464331000, |
|||
"next_app_id":[ |
|||
{ |
|||
"start_id":86, |
|||
"edge_id":49, |
|||
"end_id":90 |
|||
} |
|||
], |
|||
"transfer_id":11, |
|||
"blueprint_id":3, |
|||
"scenes_id":3, |
|||
"scenario":{ |
|||
"dataloss":1, |
|||
"autoCommitTriggerLast":1, |
|||
"maxErrors":3, |
|||
"autoCommit":1, |
|||
"freshVariables":1 |
|||
}, |
|||
"wait_condition":[ |
|||
|
|||
], |
|||
"scheduling":{ |
|||
"interval":-1, |
|||
"type":"single" |
|||
}, |
|||
"name":"软件著作抽取", |
|||
"businessKey":"185aef3b1c810799a6be8314abf6512c", |
|||
"id":86, |
|||
"describe":"软件著作抽取" |
|||
} |
|||
a=get_content(inputdata,"") |
|||
print(a) |
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
|||
|
@ -0,0 +1,25 @@ |
|||
# -*- coding:utf-8 -*- |
|||
|
|||
class pt_v_Exception(Exception): |
|||
def __str__(self): |
|||
return 'pt规则未在缓存中命中' |
|||
|
|||
class dt_v_Exception(Exception): |
|||
def __str__(self): |
|||
return 'dt规则未在缓存中命中' |
|||
|
|||
class dt_v_attr_Exception(Exception): |
|||
def __str__(self): |
|||
return 'dt_attrcode规则未在缓存中命中' |
|||
|
|||
class dt_v_codeid_Exception(Exception): |
|||
def __str__(self): |
|||
return 'dt_codeid规则未在缓存中命中' |
|||
|
|||
class dt_v_senti_Exception(Exception): |
|||
def __str__(self): |
|||
return 'dt_senti规则未在缓存中命中' |
|||
|
|||
class dt_v_res_Exception(Exception): |
|||
def __str__(self): |
|||
return 'dt_resverse规则未在缓存中命中' |
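
# Illustrative usage (added): these cache-miss exceptions carry no state; raising
# one and printing it yields the fixed message defined in its __str__.
def _example_cache_miss():
    try:
        raise pt_v_Exception()
    except pt_v_Exception as e:
        return str(e)  # -> 'pt规则未在缓存中命中'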
@ -0,0 +1,67 @@ |
|||
# coding=utf-8 |
|||
from kafka import KafkaProducer |
|||
from kafka import KafkaConsumer |
|||
import json |
|||
import traceback |
|||
import time |
|||
import datetime |
|||
import queue |
|||
from logUtil import get_logger |
|||
|
|||
logger = get_logger("crawlWebsrcCode.log") |
|||
""" |
|||
写到kafka |
|||
""" |
|||
def kafkaProduce(topic,resultData,address): |
|||
producer = KafkaProducer(bootstrap_servers = '{}'.format(address),request_timeout_ms=120000) |
|||
topics = topic.split(',') |
|||
for tc in topics: |
|||
future = producer.send(tc,resultData) |
|||
result = future.get(timeout=60) |
|||
producer.flush() |
|||
print (result) |
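
# Illustrative call (not executed here; broker address and topic are hypothetical).
# resultData must already be bytes, matching producer.send() above:
# kafkaProduce('demo-topic', json.dumps({"id": 1}).encode('utf-8'), '127.0.0.1:9092')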
|||
|
|||
#写入文件 |
|||
def writeTxt(filePath,result): |
|||
with open(filePath,'a',encoding='utf-8') as f:
    f.write(result.encode('utf-8').decode('unicode_escape')+'\n')
|||
|
|||
def KafkaConsume(topic,address,group_id,task_queue,logger): |
|||
''' |
|||
监控kafka,读取数据写到任务队列 |
|||
:param topic: |
|||
:param address: |
|||
:param group_id: |
|||
:param task_queue: |
|||
:return: |
|||
''' |
|||
try: |
|||
consumer = KafkaConsumer(topic, auto_offset_reset='earliest',fetch_max_bytes=1024768000,fetch_max_wait_ms=5000, bootstrap_servers=address,group_id = group_id) |
|||
i = 1 |
|||
while True: |
|||
for msg in consumer: |
|||
print('第{}条数据'.format(i)) |
|||
data = str(msg.value, encoding = "utf-8") |
|||
print(data) |
|||
task_queue.put(data) |
|||
i = i+1 |
|||
else: |
|||
print('暂无任务------') |
|||
time.sleep(10) |
|||
except Exception as e: |
|||
print('kafka未知异常----') |
|||
traceback.print_exc() |
|||
|
|||
def writeTxt(filePath,result): |
|||
with open(filePath,'a') as f:
    f.write(result+'\n')
|||
|
|||
if __name__ == '__main__': |
|||
# resultData = {'id': '中文', 'url': 'https://zh.wikipedia.org/zh/%E8%94%A1%E8%8B%B1%E6%96%87'} |
|||
# kafkaProduce('test', json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),'121.4.41.194:8008') |
|||
task_queue = queue.Queue() |
|||
KafkaConsume('fq-Taobao-eccontent','39.129.129.172:6666,39.129.129.172:6668,39.129.129.172:6669,39.129.129.172:6670,39.129.129.172:6671','news_sche_8',task_queue,logger) |
|||
# KafkaConsume('zxbnewstopic','120.133.14.71:9992','group3',task_queue,logger) |
@ -0,0 +1,338 @@ |
|||
# coding:utf8 |
|||
import os, sys |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import json |
|||
import re |
|||
# from log_util.set_logger import set_logger |
|||
# logging = set_logger('logs/error.log') |
|||
import pymysql.cursors |
|||
import traceback |
|||
|
|||
def mysqlConn(data,logging): |
|||
res={"successCode":"1","errorLog":"","results":""} |
|||
p_host=data["Host"] |
|||
p_port=int(data["Port"]) |
|||
p_db=data["Database"] |
|||
p_user=data["User"] |
|||
p_password=data["Password"] |
|||
try: |
|||
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port, |
|||
charset='utf8', cursorclass=pymysql.cursors.DictCursor) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
sql = "SHOW TABLES" |
|||
cursor.execute(sql) |
|||
tables = cursor.fetchall() |
|||
if tables: |
|||
table_names = list(map(lambda x: list(x.values())[0], tables)) |
|||
res["results"] = table_names |
|||
else: |
|||
res["successCode"] = "0" |
|||
cursor.close() |
|||
db.close() |
|||
return res |
|||
except: |
|||
res["successCode"] = "0" |
|||
res["errorLog"]=traceback.format_exc() |
|||
logging.error(traceback.format_exc()) |
|||
return res |
|||
|
|||
def getTableColumnNames(data,logging): |
|||
res={"successCode":"1","errorLog":"","results":""} |
|||
p_host=data["Host"] |
|||
p_port=int(data["Port"]) |
|||
p_db=data["Database"] |
|||
p_user=data["User"] |
|||
p_password=data["Password"] |
|||
p_table=data["Table"] |
|||
try: |
|||
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port, |
|||
charset='utf8', cursorclass=pymysql.cursors.DictCursor) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
sql = "DESCRIBE "+p_table |
|||
cursor.execute(sql) |
|||
tables = cursor.fetchall() |
|||
if tables: |
|||
table_names = list(map(lambda x: x['Field'], tables)) |
|||
res["results"] = table_names |
|||
else: |
|||
res["successCode"] = "0" |
|||
cursor.close() |
|||
db.close() |
|||
return res |
|||
except: |
|||
res["successCode"] = "0" |
|||
res["errorLog"]=traceback.format_exc() |
|||
logging.error(traceback.format_exc()) |
|||
return res |
|||
|
|||
def mysqlInsert(input,logging): |
|||
res={"successCode":"1","errorLog":"","results":""} |
|||
data=input["metadata"]["admin"] |
|||
p_host=data["Host"] |
|||
p_port=int(data["Port"]) |
|||
p_db=data["Database"] |
|||
p_user=data["User"] |
|||
p_password=data["Password"] |
|||
p_table=data["Table"] |
|||
p_columnName=data["columnName"] |
|||
cN='('+','.join(p_columnName)+') ' |
|||
p_values=data["values"] |
|||
val=tuple(p_values) |
|||
try: |
|||
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port, |
|||
charset='utf8', cursorclass=pymysql.cursors.DictCursor) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
sql = "insert into " + p_table + cN + "values ("+ ','.join(['%s'] * len(val)) + ")" |
|||
cursor.execute(sql,val) |
|||
db.commit() |
|||
cursor.close() |
|||
db.close() |
|||
return res |
|||
except: |
|||
res["successCode"] = "0" |
|||
res["errorLog"]=traceback.format_exc() |
|||
logging.error(traceback.format_exc()) |
|||
return res |
|||
|
|||
def mysqlUpdate(input,logging): |
|||
res={"successCode":"1","errorLog":"","results":""} |
|||
data=input["metadata"]["admin"] |
|||
p_host=data["Host"] |
|||
p_port=int(data["Port"]) |
|||
p_db=data["Database"] |
|||
p_user=data["User"] |
|||
p_password=data["Password"] |
|||
p_table=data["Table"] |
|||
# p_set=data["Set"] |
|||
p_set=get_updateSet(input) |
|||
# where=process_where(data["Filter"]) |
|||
where=get_filter(data["Filter"]) |
|||
try: |
|||
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port, |
|||
charset='utf8', cursorclass=pymysql.cursors.DictCursor) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
sql = "UPDATE " + p_table + p_set + where |
|||
print(sql) |
|||
cursor.execute(sql) |
|||
db.commit() |
|||
cursor.close() |
|||
db.close() |
|||
return res |
|||
except: |
|||
res["successCode"] = "0" |
|||
res["errorLog"]=traceback.format_exc() |
|||
logging.error(traceback.format_exc()) |
|||
return res |
|||
|
|||
def mysqlExecute(input,logging): |
|||
res={"successCode":"1","errorLog":"","results":""} |
|||
data=input["metadata"]["admin"] |
|||
p_host=data["Host"] |
|||
p_port=int(data["Port"]) |
|||
p_db=data["Database"] |
|||
p_user=data["User"] |
|||
p_password=data["Password"] |
|||
execute=data["Execute"] |
|||
try: |
|||
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port, |
|||
charset='utf8', cursorclass=pymysql.cursors.DictCursor) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
cursor.execute(execute) |
|||
if 'select' in execute.lower(): |
|||
result = cursor.fetchall() |
|||
res["results"]=json.dumps(result,ensure_ascii=False) |
|||
else: |
|||
db.commit() |
|||
cursor.close() |
|||
db.close() |
|||
return res |
|||
except: |
|||
res["successCode"] = "0" |
|||
res["errorLog"]=traceback.format_exc() |
|||
logging.error(traceback.format_exc()) |
|||
return res |
|||
|
|||
# def process_where(data): |
|||
# ''' |
|||
# 组装where |
|||
# :param data: data["Filter"],{"key":"age","value":"20","operator":">"},{"logicalSymbol":"and"},{"key":"weight","value":"50","operator":"<"} |
|||
# :return: WHERE age>20 and weight<50 |
|||
# ''' |
|||
# if data=="" or data==[]: |
|||
# return "" |
|||
# where = " WHERE " |
|||
# for line in data: |
|||
# if "key" in line.keys(): |
|||
# val = line["value"] |
|||
# if isinstance(val, str): |
|||
# val = "\'" + val + "\'" |
|||
# tmp = str(line["key"]) + " " + line["operator"] + " " + str(val) |
|||
# where += tmp |
|||
# else: |
|||
# where += " " + line["logicalSymbol"] + " " |
|||
# return where |
|||
# |
|||
# def process_filter(data): |
|||
# ''' |
|||
# 组装key,value,operator |
|||
# :param data: data["Filter"],{"key":"age",value:"20","operator":"="} |
|||
# :return: age=20 |
|||
# ''' |
|||
# if data=="" or data==[]: |
|||
# return "" |
|||
# res=data["key"]+" "+data["operator"]+" "+data["value"] |
|||
# return res |
|||
|
|||
def get_updateSet(input): |
|||
metadata=input["metadata"] |
|||
user=metadata["user"] |
|||
sets=metadata["admin"]["Set"] |
|||
res=[] |
|||
for line in sets: |
|||
part=line.split("=") |
|||
tmp = [] |
|||
for p in part: |
|||
user_match=re.findall('##(.*?)##', p) |
|||
if user_match!=[]: |
|||
tmp.append(user[user_match[0]]) |
|||
res.append(str(tmp[0])+"="+str(tmp[1])) |
|||
result=" SET "+",".join(res) |
|||
return result |
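
# Worked example (hypothetical input): placeholders of the form ##name## on both
# sides of '=' are resolved from metadata["user"] before the SET clause is built.
def _example_get_updateSet():
    sample = {"metadata": {"admin": {"Set": ["##tag1##=##value1##"]},
                           "user": {"tag1": "age", "value1": 2}}}
    return get_updateSet(sample)  # -> ' SET age=2'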
|||
|
|||
def get_filter(data): |
|||
if "OR" not in data.keys(): |
|||
return "" |
|||
op_or=data["OR"] |
|||
res = "" |
|||
if len(op_or) == 1: |
|||
tmp = [] |
|||
line = op_or[0]["AND"] |
|||
for single_line in line: |
|||
val = single_line["value"] |
|||
if isinstance(val, str): |
|||
val = "\'" + val + "\'" |
|||
tmp.append(str(single_line["key"]) + single_line["operator"] + str(val)) |
|||
if single_line != line[-1]: |
|||
tmp.append("and") |
|||
res = " WHERE "+" ".join(tmp) |
|||
elif len(op_or) > 1: |
|||
tmp = [] |
|||
for single_and in op_or: |
|||
line = single_and["AND"] |
|||
for sigle_line in line: |
|||
val = sigle_line["value"] |
|||
if isinstance(val, str): |
|||
val = "\'" + val + "\'" |
|||
tmp.append(str(sigle_line["key"]) + sigle_line["operator"] + str(val)) |
|||
if sigle_line != line[-1]: |
|||
tmp.append("and") |
|||
if single_and != op_or[-1]: |
|||
tmp.append("or") |
|||
res = " WHERE "+" ".join(tmp) |
|||
return res |
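
# Worked example (hypothetical filter): inner AND lists are joined with "and",
# the outer OR list with "or"; string values are quoted, numbers are not.
def _example_get_filter():
    flt = {"OR": [
        {"AND": [{"key": "age", "value": 20, "operator": ">"}]},
        {"AND": [{"key": "name", "value": "ff", "operator": "="}]},
    ]}
    return get_filter(flt)  # -> " WHERE age>20 or name='ff'"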
|||
|
|||
|
|||
def mysqlQuery(input,logging): |
|||
res={"successCode":"1","errorLog":"","results":""} |
|||
data=input["metadata"]["admin"] |
|||
p_host=data["Host"] |
|||
p_port=int(data["Port"]) |
|||
p_db=data["Database"] |
|||
p_user=data["User"] |
|||
p_password=data["Password"] |
|||
p_table=data["Table"] |
|||
p_columnNames=data["columnNames"] |
|||
# p_filter=data["Filter"] |
|||
column='*' |
|||
if len(p_columnNames)==1: |
|||
column=p_columnNames[0] |
|||
elif len(p_columnNames)>1: |
|||
column=','.join(p_columnNames) |
|||
where=get_filter(data["Filter"]) |
|||
try: |
|||
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port, |
|||
charset='utf8', cursorclass=pymysql.cursors.DictCursor) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
sql = "SELECT " + column +" From "+ p_table + where |
|||
# print(sql) |
|||
cursor.execute(sql) |
|||
result = cursor.fetchall() |
|||
res["results"]=json.dumps(result,ensure_ascii=False) |
|||
cursor.close() |
|||
db.close() |
|||
return res |
|||
except: |
|||
res["successCode"] = "0" |
|||
res["errorLog"]=traceback.format_exc() |
|||
logging.error(traceback.format_exc()) |
|||
return res |
|||
|
|||
def mysqlDelete(input,logging): |
|||
res={"successCode":"1","errorLog":"","results":""} |
|||
data=input["metadata"]["admin"] |
|||
p_host=data["Host"] |
|||
p_port=int(data["Port"]) |
|||
p_db=data["Database"] |
|||
p_user=data["User"] |
|||
p_password=data["Password"] |
|||
p_table=data["Table"] |
|||
# where=process_where(data["Filter"]) |
|||
where=get_filter(data["Filter"]) |
|||
try: |
|||
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port, |
|||
charset='utf8', cursorclass=pymysql.cursors.DictCursor) |
|||
db.ping(reconnect=True) |
|||
cursor = db.cursor() |
|||
sql = "DELETE From "+ p_table + where |
|||
cursor.execute(sql) |
|||
db.commit() |
|||
cursor.close() |
|||
db.close() |
|||
return res |
|||
except: |
|||
res["successCode"] = "0" |
|||
res["errorLog"]=traceback.format_exc() |
|||
logging.error(traceback.format_exc()) |
|||
return res |
|||
|
|||
|
|||
if __name__=="__main__": |
|||
input={"metadata":{"admin":{ |
|||
"type":"query", |
|||
"Table":"student", |
|||
"columnNames":["name","age"], |
|||
"Set":["##tag1##=##value1##","##tag2##=##value2##"], |
|||
"Filter":{ |
|||
"OR":[ |
|||
{ |
|||
"AND":[{"key":"age","value":20,"operator":">"},{"key":"weight","value":50,"operator":"<"}] |
|||
}, |
|||
{ |
|||
"AND":[{"key":"name","value":"ff","operator":"="}] |
|||
} |
|||
] |
|||
}, |
|||
"Host":"172.26.28.30", |
|||
"Port":"3306", |
|||
"Database":"test", |
|||
"User":"crawl", |
|||
"Password":"crawl123" |
|||
}}, |
|||
"user": { |
|||
"tag1": "age", |
|||
"tag2": "weight", |
|||
"value1": 2, |
|||
"value2": 100 |
|||
} |
|||
} |
|||
res=mysqlUpdate(input,"") |
|||
print(res) |
@ -0,0 +1,456 @@ |
|||
#coding:utf8 |
|||
import pandas as pd |
|||
import numpy as np |
|||
import networkx as nx |
|||
from textblob import TextBlob |
|||
from snownlp import SnowNLP |
|||
from wordcloud import STOPWORDS |
|||
import jieba |
|||
# import tool |
|||
from tqdm import tqdm |
|||
import os,sys |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import datetime |
|||
# from sklearn.model_selection import train_test_split |
|||
# from sklearn.ensemble import RandomForestClassifier |
|||
# from sklearn.model_selection import GridSearchCV |
|||
import joblib |
|||
def pre_user(data_user): |
|||
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x) |
|||
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int) |
|||
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int) |
|||
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']] |
|||
data_user = data_user.dropna() |
|||
data_user = data_user.drop_duplicates().reset_index(drop = True) |
|||
data_user['fansCount'] = data_user['fansCount'].astype(int) |
|||
data_user['likeCount'] = data_user['likeCount'].astype(int) |
|||
data_user['postCount'] = data_user['postCount'].astype(int) |
|||
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
return data_user |
|||
|
|||
def getText_count_eng(txt): |
|||
"""英文词频统计""" |
|||
txt = txt.lower() #将所有大写字母变成小写 |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格 |
|||
txt = txt.replace(ch," ") |
|||
words = txt.split() |
|||
counts = {} |
|||
for word in words: |
|||
if word not in STOPWORDS: |
|||
if word != '\t': |
|||
counts[word] = counts.get(word,0) + 1 #统计字数 |
|||
items = pd.DataFrame(list(counts.items())) |
|||
return items |
|||
|
|||
def getText_count_ch(txt): |
|||
"""中文词频统计""" |
|||
txt = txt.lower() #将所有大写字母变成小写 |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz': #将文本中特殊符号数字删除 |
|||
txt = txt.replace(ch,"") |
|||
words = jieba.lcut(txt) |
|||
counts = {} |
|||
for word in words: |
|||
counts[word] = counts.get(word,0) + 1 |
|||
items = list(counts.items()) |
|||
fin_items = [] |
|||
for item in items: |
|||
if len(item[0])>=2: |
|||
fin_items.append(item) |
|||
fin_items = pd.DataFrame(fin_items) |
|||
return fin_items |
|||
|
|||
def getText_count_U(txt): |
|||
"""统计英文大写词频""" |
|||
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格 |
|||
txt = txt.replace(ch," ") |
|||
words = txt.split() |
|||
counts = {} |
|||
for word in words: |
|||
if word not in STOPWORDS: |
|||
if word != '\t':
|||
if word.isupper(): #统计大写 |
|||
counts[word] = counts.get(word,0) + 1 #统计字数 |
|||
items = pd.DataFrame(list(counts.items())) #将字典类型转换成列表类型 |
|||
if items.shape == (0,0): |
|||
out = 0 |
|||
else: |
|||
out = sum(items[1]) |
|||
return out |
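
# Illustrative sketch (made-up headline): only fully upper-case tokens outside
# STOPWORDS are counted, and the total count is returned as a single number.
def _example_getText_count_U():
    return getText_count_U("BREAKING news from NATO HQ")  # -> 3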
|||
|
|||
def is_chinese(strs): |
|||
"""判断一个unicode是否是汉字/英文""" |
|||
strs = strs.lower() |
|||
for uchar in strs: |
|||
if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar): |
|||
return False |
|||
return True |
|||
|
|||
def is_eng(strs): |
|||
"""判断一个unicode是否是英文""" |
|||
strs = strs.lower() |
|||
for uchar in strs: |
|||
if (uchar < u'\u0061') or (u'\u007a' < uchar): |
|||
return False |
|||
return True |
|||
|
|||
# def pre_user(data_user): |
|||
# data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x) |
|||
# data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int) |
|||
# data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int) |
|||
# data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']] |
|||
# data_user = data_user.dropna() |
|||
# data_user = data_user.drop_duplicates().reset_index(drop = True) |
|||
# data_user['fansCount'] = data_user['fansCount'].astype(int) |
|||
# data_user['likeCount'] = data_user['likeCount'].astype(int) |
|||
# data_user['postCount'] = data_user['postCount'].astype(int) |
|||
# data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
# return data_user |
|||
|
|||
|
|||
def post_related(df, data_user,logging): |
|||
# postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', |
|||
# 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank', |
|||
# 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag', |
|||
# 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容', |
|||
# '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差', |
|||
# '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差', |
|||
# '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值', |
|||
# '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差']) |
|||
postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id','所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount', |
|||
'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank', |
|||
'语言', '主贴长度', '主贴http', '主贴at', '主贴tag', |
|||
'emotion', 'emotion_sub', '最大词频数', '重复词汇占比']) |
|||
|
|||
for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop=True): |
|||
|
|||
data = df[df['所属帖子id'] == post_id].reset_index(drop=True) |
|||
|
|||
data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间', |
|||
'shareCount', 'url'] |
|||
|
|||
data = data.drop_duplicates() |
|||
|
|||
post = data[data['传播层级'] == 1].head(1) |
|||
### 一、新闻传播--贴文网络 |
|||
##1.layer/shape/degree |
|||
post['layer'] = int(max(data['传播层级'])) |
|||
post['shape'] = data.shape[0] - 1 |
|||
post['degree'] = data[data['传播层级'] == 2].shape[0] |
|||
|
|||
##2.整体网络测度(贴文网络测度) |
|||
###2.1把转发来源id对应到转发来源用户 |
|||
tmp_zfyh = pd.merge(data[data['传播层级'] != 1]['转发来源id'].drop_duplicates(), |
|||
data[data['帖子id'].notnull()][['帖子id', '用户名']], |
|||
left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']] |
|||
tmp_zfyh.columns = ['转发来源id', '转发来源用户名'] |
|||
data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left') |
|||
post_edge = data.copy() |
|||
post_edge = data[data['传播层级'] != 1][['用户名', '转发来源用户名']] |
|||
post_edge.columns = ['source', 'target'] |
|||
post_edge['count_all'] = 1 |
|||
post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index() |
|||
# post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False) |
|||
|
|||
edgeweightset = post_edge[['source', 'target', 'count_all']] |
|||
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])] |
|||
for k in range(len(edgeweightset_l)): |
|||
for j in range(edgeweightset.shape[1]): |
|||
edgeweightset_l[k].append(edgeweightset.iloc[k, j]) |
|||
# print(i/len(edgeweightset_l)) |
|||
|
|||
if len(edgeweightset_l) == 0: # 没有传播链 |
|||
post['closeness_centrality'] = 1 |
|||
post['pagerank'] = 1 |
|||
else: |
|||
g = nx.DiGraph() |
|||
g.add_weighted_edges_from(edgeweightset_l) |
|||
centrality = [nx.closeness_centrality(g), |
|||
nx.pagerank(g)] |
|||
results = [] |
|||
nodes = g.nodes() # 提取网络中节点列表 |
|||
for node in nodes: # 遍历所有节点,提取每个节点度中心性计算结果,并存储为[[节点1,结果],[节点2,结果],...]的形式 |
|||
results.append([node, |
|||
centrality[0][node], |
|||
centrality[1][node]]) |
|||
results = pd.DataFrame(results) |
|||
results.columns = ['node', 'closeness_centrality', 'pagerank'] |
|||
|
|||
post['closeness_centrality'] = results[results['node'] == results[ |
|||
results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][ |
|||
'closeness_centrality'].iloc[0] |
|||
post['pagerank'] = results[results['node'] == |
|||
results[results['closeness_centrality'] == max(results['closeness_centrality'])][ |
|||
'node'].iloc[0]]['pagerank'].iloc[0] |
|||
|
|||
# post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0] |
|||
# post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0] |
|||
|
|||
#——————————hh—————————————— |
|||
# 特征未使用 |
|||
# ##3.传播链中的平均影响力shareCount |
|||
# tmp = 0 |
|||
# for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]): |
|||
# tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k]) |
|||
# if tmp == 0: |
|||
# post['sub_shareCount'] = 0 |
|||
# else: |
|||
# post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0] |
|||
|
|||
#———————————————————————— |
|||
|
|||
|
|||
##二、主贴文本 |
|||
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.' |
|||
##文本特殊字符个数(http、@、#) |
|||
# logging.info(post) |
|||
post['主贴http'] = post['发表内容'].iloc[0].count('http') |
|||
post['主贴at'] = post['发表内容'].iloc[0].count('@') |
|||
post['主贴tag'] = post['发表内容'].iloc[0].count('#') |
|||
|
|||
##判断语言 |
|||
tmp = post['发表内容'].iloc[0] |
|||
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
tmp = tmp.replace(ch, '') |
|||
|
|||
if is_eng(tmp): ##主贴英文内容 |
|||
|
|||
post['语言'] = 0 |
|||
text = post['发表内容'].iloc[0] |
|||
# text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring' |
|||
text = text[0:text.rfind("http")] |
|||
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
text = text.replace(ch, ' ') |
|||
|
|||
##文本长度 |
|||
words = text.split(' ') |
|||
post['主贴长度'] = len(words) |
|||
|
|||
##文本情感 |
|||
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments) |
|||
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment) |
|||
post['emotion'] = emo.loc[0, 0] |
|||
post['emotion_sub'] = emo.loc[1, 0] |
|||
|
|||
##文本词频 |
|||
## 词频统计1:最大词频数 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_eng(text) |
|||
if items.shape == (0, 0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0] |
|||
|
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = getText_count_U(text) |
|||
|
|||
elif is_chinese(tmp): ##主贴中文内容 |
|||
|
|||
post['语言'] = 1 |
|||
|
|||
text = post['发表内容'].iloc[0] |
|||
text = text[0:text.rfind("http")] |
|||
post['主贴长度'] = len(text) |
|||
|
|||
post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2 |
|||
post['emotion_sub'] = np.NaN |
|||
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0] |
|||
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1] |
|||
|
|||
##文本词频 |
|||
## 词频统计1:标题中出现的词,在正文中出现最大词频 |
|||
## 词频统计2:正文中出现两次及以上的词占比 |
|||
items = getText_count_ch(text) |
|||
if items.shape == (0, 0): |
|||
post['最大词频数'] = 0 |
|||
post['重复词汇占比'] = 0 |
|||
else: |
|||
post['最大词频数'] = max(items[1]) |
|||
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0] |
|||
## 词频统计3:全部大写词频 |
|||
post['大写词频'] = np.NaN |
|||
|
|||
else: |
|||
post['语言'] = np.NaN |
|||
post['主贴长度'] = np.NaN |
|||
post['emotion'] = np.NaN |
|||
post['emotion_sub'] = np.NaN |
|||
post['最大词频数'] = np.NaN |
|||
post['重复词汇占比'] = np.NaN |
|||
post['大写词频'] = np.NaN |
|||
|
|||
# ##4.2传播链中的文本 |
|||
# sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']]) |
|||
# sub_post['语言'] = np.NaN |
|||
# sub_post['文本长度'] = np.NaN |
|||
# sub_post['http'] = np.NaN |
|||
# sub_post['at'] = np.NaN |
|||
# sub_post['tag'] = np.NaN |
|||
# sub_post['emotion'] = np.NaN |
|||
# sub_post['emotion_sub'] = np.NaN |
|||
# sub_post['diffdate'] = np.NaN |
|||
# |
|||
# for k in range(sub_post.shape[0]): |
|||
# ##文本特殊字符个数(http、@、#) |
|||
# sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http') |
|||
# sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@') |
|||
# sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#') |
|||
# |
|||
# ##时间差 |
|||
# d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S") |
|||
# base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S") |
|||
# |
|||
# # now = datetime.datetime.now() |
|||
# sub_post['diffdate'].iloc[k] = (d1 - base).days |
|||
# |
|||
# ##判断语言 |
|||
# tmp = sub_post['发表内容'].iloc[k] |
|||
# for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789': |
|||
# tmp = tmp.replace(ch, '') |
|||
# |
|||
# if is_eng(tmp): ##英文内容 |
|||
# |
|||
# sub_post['语言'].iloc[k] = 0 |
|||
# |
|||
# ##文本长度 |
|||
# text = sub_post['发表内容'].iloc[k] |
|||
# # text = "'America is collapsing and it's China's fault' is definitely a change of direction?" |
|||
# text = text[0:text.rfind("http")] |
|||
# for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ': |
|||
# text = text.replace(ch, ' ') |
|||
# words = text.split(' ') |
|||
# sub_post['文本长度'].iloc[k] = len(words) |
|||
# ##情感 |
|||
# sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment) |
|||
# sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0] |
|||
# sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0] |
|||
# |
|||
# elif is_chinese(tmp): ##中文内容 |
|||
# |
|||
# sub_post['语言'].iloc[k] = 1 |
|||
# |
|||
# ##文本长度 |
|||
# text = sub_post['发表内容'].iloc[k] |
|||
# text = text[0:text.rfind("http")] |
|||
# sub_post['文本长度'].iloc[k] = len(text) |
|||
# ##情感 |
|||
# sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2 |
|||
# sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
# |
|||
# else: |
|||
# |
|||
# sub_post['语言'].iloc[k] = np.NaN |
|||
# sub_post['文本长度'].iloc[k] = np.NaN |
|||
# sub_post['emotion'].iloc[k] = np.NaN |
|||
# sub_post['emotion_sub'].iloc[k] = np.NaN |
|||
# |
|||
# if sub_post.shape[0] == 0: |
|||
# post['有无传播内容'] = 0 |
|||
# else: |
|||
# post['有无传播内容'] = 1 |
|||
# |
|||
# post['传播链语言均值'] = sub_post['语言'].mean() |
|||
# post['传播链贴文长度均值'] = sub_post['文本长度'].mean() |
|||
# post['传播链贴文emotion均值'] = sub_post['emotion'].mean() |
|||
# |
|||
# ##emotion_sub取有值的均值 |
|||
# post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean() |
|||
# |
|||
# post['传播链贴文http均值'] = sub_post['http'].mean() |
|||
# |
|||
# post['传播链贴文at均值'] = sub_post['at'].mean() |
|||
# |
|||
# post['传播链贴文tag均值'] = sub_post['tag'].mean() |
|||
# |
|||
# post['diffdate均值'] = sub_post['diffdate'].mean() |
|||
|
|||
##三、用户信息 |
|||
##发帖用户 |
|||
post = pd.merge(post, data_user, how='left', on='用户名') |
|||
|
|||
##传播链用户 |
|||
sub_user = pd.DataFrame(data[data['传播层级'] != 1][['用户名']]) |
|||
sub_user = pd.merge(sub_user, data_user, how='left', on='用户名') |
|||
sub_user = sub_user.dropna() |
|||
|
|||
post['nickName均值'] = sub_user['nickName'].mean() |
|||
post['fansCount均值'] = sub_user['fansCount'].mean() |
|||
post['likeCount均值'] = sub_user['likeCount'].mean() |
|||
post['postCount均值'] = sub_user['postCount'].mean() |
|||
post['otherInfo均值'] = sub_user['otherInfo'].mean() |
|||
|
|||
postset = pd.concat([postset, post]).reset_index(drop=True) |
|||
|
|||
postset = postset.fillna(0) |
|||
postset['emotion_degree'] = abs(postset['emotion']) |
|||
|
|||
return postset |
|||
|
|||
def predict_news(userData,postChain,logging): |
|||
data_po = pd.DataFrame(postChain).replace('', np.nan) |
|||
data_po.columns = ['id','层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url','topicId'] |
|||
data_po=data_po[['层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url']] |
|||
if not userData: |
|||
columns=['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo','postId','ssId'] |
|||
data_user=pd.DataFrame(columns=columns) |
|||
else: |
|||
data_user = pd.DataFrame(userData).replace('', np.nan) |
|||
data_user.columns = ['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo','postId','ssId'] |
|||
data_user=data_user[['accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo']] |
|||
data_user = pre_user(data_user) |
|||
#data_user=dataframe[@XHNews,1,878,1178,938,1] |
|||
#data_user.columns=['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo'] |
|||
|
|||
postset_po = post_related(data_po,data_user,logging) ## 正面文件 |
|||
features = postset_po[[ |
|||
#'shareCount', |
|||
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality', |
|||
'主贴http', '主贴at', '主贴tag', |
|||
'主贴长度','emotion', 'emotion_degree', |
|||
'最大词频数', '重复词汇占比',#(中英文差异大) |
|||
#'有无传播内容', |
|||
'fansCount','likeCount', 'postCount', |
|||
#'sub_shareCount', |
|||
'fansCount均值', 'postCount均值', 'otherInfo均值' |
|||
]] |
|||
|
|||
clf = joblib.load(par_dir+'/model/fake_news_model.pkl') |
|||
clf_predict = clf.predict(features) |
|||
res=pd.DataFrame(clf_predict) |
|||
res.columns=['假新闻预测结果'] |
|||
res['recognitionResult'] = res['假新闻预测结果'].apply(lambda x: '真新闻' if x == 1 else '假新闻') |
|||
result = pd.concat([postset_po, res], axis=1) |
|||
return result |
|||
|
|||
|
|||
if __name__=="__main__": |
|||
print(par_dir) |
|||
# user={ |
|||
# "topicId":1209, |
|||
# "host":"172.26.28.30", |
|||
# "user":"crawl", |
|||
# "passwd":"crawl123", |
|||
# "db":"test", |
|||
# "port":3306, |
|||
# "table":"TwitterAccount" |
|||
# } |
|||
# userData = tool.mysqlData(user,"") |
|||
# # logging.info("账号数据获取完毕!") |
|||
# # 传播链数据 |
|||
# # post = raw_data["metadata"]["admin"]["Twitter_chain"] |
|||
# post={ |
|||
# "topicId":1209, |
|||
# "host":"172.26.28.30", |
|||
# "user":"crawl", |
|||
# "passwd":"crawl123", |
|||
# "db":"test", |
|||
# "port":3306, |
|||
# "table":"Twitter_chain" |
|||
# } |
|||
# postChain = tool.mysqlData(post, "") |
|||
# # logging.info("传播链数据获取完毕!") |
|||
# predict_news(userData,postChain,"") |
@ -0,0 +1,51 @@ |
|||
#coding:utf8 |
|||
import os, sys |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import json |
|||
from text_analysis.tools import to_kafka |
|||
from tools.mysql_helper import mysqlConn,mysqlInsert,mysqlQuery,mysqlExecute,mysqlUpdate,mysqlDelete,getTableColumnNames |
|||
import traceback |
|||
import time |
|||
from log_util.set_logger import set_logger |
|||
logging=set_logger('results.log') |
|||
|
|||
from views import task_queue |
|||
|
|||
def process_data(): |
|||
while True: |
|||
try: |
|||
# print("task_queue:",task_queue) |
|||
if task_queue.qsize() >0: |
|||
try: |
|||
raw_data = task_queue.get() |
|||
res = "" |
|||
logging.info("启动数据处理线程——") |
|||
logging.info(raw_data) |
|||
flag = raw_data["metadata"]["admin"]["type"] |
|||
# type分为execute、query、insert、update、delete |
|||
if flag == 'insert': |
|||
res = mysqlInsert(raw_data, logging) |
|||
elif flag == 'execute': |
|||
res = mysqlExecute(raw_data, logging) |
|||
elif flag == 'update': |
|||
res = mysqlUpdate(raw_data, logging) |
|||
elif flag == 'query': |
|||
res = mysqlQuery(raw_data, logging) |
|||
elif flag == 'delete': |
|||
res = mysqlDelete(raw_data, logging) |
|||
raw_data["result"] = res |
|||
logging.info("************写入kafka***********") |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except: |
|||
raw_data["result"] = {"successCode": "0", "errorLog": "", "results": ""} |
|||
raw_data["result"]["errorLog"] = traceback.format_exc() |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
else: |
|||
logging.info("暂无任务,进入休眠--") |
|||
print("222222222222222222222222") |
|||
time.sleep(10) |
|||
except: |
|||
logging.error(traceback.format_exc()) |
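For reference, a minimal sketch of the task message that process_data() above expects. The values are illustrative; only metadata.admin.type is inspected here, and the whole message is forwarded to Kafka together with the added result field.

# Illustrative task payload (hypothetical values); process_data() only reads
# raw_data["metadata"]["admin"]["type"] to choose the mysql_helper function to run.
example_task = {
    "metadata": {
        "admin": {
            "type": "query"   # one of: execute / query / insert / update / delete
            # connection details consumed by tools.mysql_helper would also live here
        }
    },
    "data": {}
}
task_queue.put(example_task)  # the loop above picks it up and writes the result to Kafka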
@ -0,0 +1,171 @@ |
|||
# -*- coding: utf-8 -*- |
|||
import time |
|||
import threading |
|||
from selenium import webdriver |
|||
import json |
|||
from urllib.parse import urljoin |
|||
from kakfa_util import KafkaConsume |
|||
from kakfa_util import kafkaProduce |
|||
from logUtil import get_logger |
|||
from Go_fastDfs import uploadFile |
|||
import traceback |
|||
import queue |
|||
import configparser |
|||
import os, sys |
|||
import re |
|||
logger = get_logger("./logs/crawlWebsrcCode.log") |
|||
# Load the configuration file |
|||
configFile = './config.ini' |
|||
# Create the config parser object |
|||
con = configparser.ConfigParser() |
|||
# Read the config file |
|||
con.read(configFile, encoding='utf-8') |
|||
kafkaConfig = dict(con.items('kafka'))  # Kafka configuration |
|||
goFastdfsConfig = dict(con.items('goFastdfs'))  # goFastdfs configuration |
|||
class Spider(object): |
|||
def __init__(self,url): |
|||
self.chromeOptions = self.get_profile() |
|||
self.browser = self.get_browser() |
|||
self.url = url |
|||
def get_profile(self): |
|||
chromeOptions = webdriver.ChromeOptions() |
|||
chromeOptions.add_argument('--headless')  # run Chrome in headless mode |
|||
chromeOptions.add_argument('--disable-gpu')  # disable the GPU |
|||
# chromeOptions.add_argument('window-size=1280,800')  # set the browser resolution |
|||
chromeOptions.add_argument("--no-sandbox") |
|||
return chromeOptions |
|||
|
|||
def get_browser(self): |
|||
browser = webdriver.Chrome("D:\\工作使用\\zhaoshang\\chromedriver.exe",chrome_options=self.chromeOptions) |
|||
return browser |
|||
|
|||
def _get_page(self,path): |
|||
''' |
|||
Fetch the page in its original format, write it to a file, and return the file path. |
|||
:param path: |
|||
:return: |
|||
''' |
|||
self.browser.get(self.url) |
|||
time.sleep(5) |
|||
logger.info("休眠结束") |
|||
# Scroll down 10000 pixels per step to reach the bottom of the page. |
|||
scrollTop = 10000 |
|||
for num in range(1,10): |
|||
js = "var q=document.documentElement.scrollTop={}".format(scrollTop*num) |
|||
logger.info("第{}次滚动".format(num)) |
|||
self.browser.execute_script(js) |
|||
time.sleep(5) |
|||
# Run the Chrome DevTools command to capture the page as MHTML |
|||
res = self.browser.execute_cdp_cmd('Page.captureSnapshot', {}) |
|||
# Get the article title |
|||
title = '无标题' |
|||
try: |
|||
title = self.browser.find_element_by_css_selector("title").get_attribute("textContent") |
|||
except Exception as e: |
|||
logger.error('获取标题异常----') |
|||
traceback.print_exc() |
|||
pathName = '{}{}.mhtml'.format(path,title) |
|||
with open(pathName, 'w',newline='') as f: |
|||
f.write(res['data']) |
|||
return pathName,title |
|||
if __name__ == '__main__': |
|||
# Initialize the task queue |
|||
task_queue = queue.Queue() |
|||
# Start the Kafka reader thread |
|||
logger.info("开启读取kafka线程---") |
|||
t = threading.Thread(target=KafkaConsume, name='LoopThread',args=(kafkaConfig['read_topic'], kafkaConfig['address'], kafkaConfig['group_id'], task_queue,logger)) |
|||
t.daemon = True |
|||
t.start() |
|||
# Consume tasks and archive each page in its original format |
|||
|
|||
|
|||
|
|||
|
|||
|
|||
while True: |
|||
try: |
|||
if task_queue.qsize() >0: |
|||
taskStr = task_queue.get() |
|||
logger.info('当前任务:{}'.format(taskStr)) |
|||
task = json.loads(taskStr) |
|||
p1 = u'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' |
|||
pattern1 = re.compile(p1) |
|||
matcher1 = re.search(p1, task['url']) |
|||
if matcher1: |
|||
l = Spider(task['url']) |
|||
pathName,title = l._get_page(goFastdfsConfig['path']) |
|||
l.browser.quit() |
|||
# Upload via go-fastdfs and write the result to Kafka |
|||
if '404 Not Found' in title: |
|||
logger.error('页面404,无效') |
|||
resultData = { |
|||
'code': 500, |
|||
'id': task['id'], |
|||
'message': '页面404' |
|||
} |
|||
kafkaProduce(kafkaConfig['data_topics'], |
|||
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(), |
|||
kafkaConfig['address']) |
|||
time.sleep(2) |
|||
continue |
|||
try: |
|||
uploadStr = uploadFile('{}upload'.format(goFastdfsConfig['uploadaddress']),pathName,logger) |
|||
uploadJson = json.loads(uploadStr) |
|||
except Exception as e: |
|||
logger.error('文件上传异常----') |
|||
traceback.print_exc() |
|||
resultData = { |
|||
'code': 500, |
|||
'id': task['id'], |
|||
'message': '文件上传失败' |
|||
} |
|||
kafkaProduce(kafkaConfig['data_topics'], |
|||
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(), |
|||
kafkaConfig['address']) |
|||
time.sleep(2) |
|||
continue |
|||
resultData = { |
|||
'code':200, |
|||
'id':task['id'], |
|||
'url':goFastdfsConfig['downloadaddress']+uploadJson['path'], |
|||
'title':title, |
|||
'delMd5':uploadJson['md5'], |
|||
'uploadTime':uploadJson['mtime'], |
|||
'message':'成功' |
|||
} |
|||
kafkaProduce(kafkaConfig['data_topics'],json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),kafkaConfig['address']) |
|||
logger.info('数据写入成功') |
|||
# Delete the local file |
|||
if (os.path.exists(pathName)): |
|||
os.remove(pathName) |
|||
logger.info('清除文件:{}'.format(pathName)) |
|||
else: |
|||
logger.info('要删除的文件不存在:{}'.format(pathName)) |
|||
else: |
|||
logger.error('非正确url:{}'.format(task['url'])) |
|||
resultData = { |
|||
'code': 500, |
|||
'id': task['id'], |
|||
'message': '非正确url' |
|||
} |
|||
kafkaProduce(kafkaConfig['data_topics'], |
|||
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(), |
|||
kafkaConfig['address']) |
|||
time.sleep(2) |
|||
continue |
|||
else: |
|||
logger.info("暂无任务,进入休眠--") |
|||
time.sleep(10) |
|||
except Exception as e: |
|||
logger.error('未知异常----') |
|||
traceback.print_exc() |
|||
resultData = { |
|||
'code': 500, |
|||
'id': task['id'], |
|||
'message': '未知异常' |
|||
} |
|||
kafkaProduce(kafkaConfig['data_topics'], |
|||
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(), |
|||
kafkaConfig['address']) |
|||
time.sleep(2) |
|||
|
@ -0,0 +1,25 @@ |
|||
#coding:utf8 |
|||
import traceback |
|||
import json |
|||
from kafka import KafkaProducer |
|||
from text_analysis.read_config import load_config |
|||
config=load_config() |
|||
|
|||
def send_kafka(data,logging): |
|||
try: |
|||
producer = None |
|||
topic = config["kafka"]["topic"] |
|||
data1=json.dumps(data,ensure_ascii=False) |
|||
kafkaProduce(topic,bytes(data1, encoding='utf-8')) |
|||
logging.info("数据推入kafka!") |
|||
|
|||
except Exception as e: |
|||
logging.info(traceback.format_exc()) |
|||
logging.info('写入kafka失败') |
|||
|
|||
def kafkaProduce(topic,resultData): |
|||
producer = KafkaProducer(bootstrap_servers = '{}'.format(config["kafka"]["bootstrap_servers"]),max_request_size=52428800) |
|||
topics = topic.split(',') |
|||
for tc in topics: |
|||
future = producer.send(tc,resultData) |
|||
producer.flush() |
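A minimal usage sketch for send_kafka(); it assumes config.ini provides [kafka] topic and bootstrap_servers, and the logger path is illustrative.

# Hedged usage sketch (assumes config.ini defines [kafka] topic and bootstrap_servers).
from log_util.set_logger import set_logger
from text_analysis.tools import to_kafka

logging = set_logger('logs/results.log')
payload = {"result": {"successCode": "1", "errorLog": "", "results": "{}"}}
to_kafka.send_kafka(payload, logging)  # serialized with ensure_ascii=False and sent to every topic listed in [kafka] topic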
233
text_analysis/tools/tool.py
File diff suppressed because it is too large
1
text_analysis/tools/关系链数据.txt
File diff suppressed because it is too large
1
text_analysis/tools/账号数据.txt
File diff suppressed because it is too large
@ -0,0 +1,13 @@ |
|||
from django.conf.urls import include, url |
|||
from django.contrib import admin |
|||
from text_analysis import views |
|||
|
|||
urlpatterns = [ |
|||
|
|||
url(r'^fakeNewIdentification',views.fakeNewIdentification, name='fakeNewIdentification'), |
|||
# url(r'^mysqlConnection',views.mysqlConnection, name='mysqlConnection'), |
|||
# url(r'^mysqlField', views.mysqlField, name='mysqlField') |
|||
|
|||
] |
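The route above only queues the request body; a hedged client example of exercising it follows (the port comes from uwsgi.ini, while the host and payload fields mirror txt/fakeNew.txt and are assumptions).

# Hedged client sketch: POST a task to the fakeNewIdentification endpoint.
import requests

resp = requests.post(
    "http://127.0.0.1:9030/fakeNewIdentification",   # port from uwsgi.ini; host is an assumption
    json={"metadata": {"admin": {}}, "scenes_id": 5, "version": 1},  # minimal fields later read by predict_news
    timeout=10,
)
print(resp.json())  # {"code": 1, "msg": "请求正常!"} once the task is queued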
|||
|
|||
|
@ -0,0 +1,158 @@ |
|||
#coding:utf8 |
|||
import os, sys |
|||
import io |
|||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') |
|||
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd() |
|||
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir)) |
|||
sys.path.append(cur_dir) |
|||
sys.path.append(par_dir) |
|||
import json |
|||
from django.http import HttpResponse |
|||
from text_analysis.tools import to_kafka,tool |
|||
from text_analysis.tools import pred |
|||
from django.views.decorators.csrf import csrf_exempt |
|||
from log_util.set_logger import set_logger |
|||
logging=set_logger('logs/results.log') |
|||
import traceback |
|||
import queue |
|||
from text_analysis.cusException import userFile_Exception,chainFile_Exception |
|||
import requests |
|||
import time |
|||
from kazoo.client import KazooClient |
|||
from kazoo.protocol.states import EventType |
|||
# Task queue |
|||
import queue |
|||
task_queue = queue.PriorityQueue() |
|||
stop_dict={} |
|||
from text_analysis.read_config import load_config |
|||
config=load_config() |
|||
|
|||
|
|||
@csrf_exempt |
|||
def fakeNewIdentification(request): |
|||
if request.method == 'POST': |
|||
try: |
|||
raw_data = json.loads(request.body) |
|||
if "trace" in raw_data.keys() and raw_data["trace"]==True: |
|||
task_queue.put((-1,time.time(), raw_data)) |
|||
else: |
|||
task_queue.put((1,time.time(), raw_data)) |
|||
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False)) |
|||
except: |
|||
logging.error(traceback.format_exc()) |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False)) |
|||
else: |
|||
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False)) |
|||
|
|||
def predict_news(): |
|||
dbConfig = dict(config.items('database')) |
|||
while True: |
|||
try: |
|||
if task_queue.qsize()>0: |
|||
p,t,raw_data = task_queue.get(timeout=1) |
|||
logging.info("当前任务队列长度{}".format(task_queue.qsize()+1)) |
|||
logging.info("任务数据-{}".format(raw_data)) |
|||
task_id=raw_data["scenes_id"] |
|||
task_version=raw_data["version"] |
|||
logging.info("当前version信息为:{}".format(stop_dict)) |
|||
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]: |
|||
logging.info("已暂停任务,数据过滤掉") |
|||
continue |
|||
res = {"successCode": "1", "errorLog": "", "results": {},"status":1,"message":"成功"} |
|||
# Account data |
|||
userData = tool.mysqlData(raw_data, logging,"1",dbConfig) |
|||
# if not userData: |
|||
# raise userFile_Exception |
|||
logging.info("账号数据获取完毕!-长度{}".format(len(userData))) |
|||
# Propagation chain data |
|||
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig) |
|||
if not postChain: |
|||
raise chainFile_Exception |
|||
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain))) |
|||
news=pred.predict_news(userData,postChain,logging) |
|||
# End-of-results flag |
|||
res['isLast'] = True |
|||
for i in range(len(news)): |
|||
row_dict = news.iloc[i].to_dict() |
|||
row_dict['pageType'] = 'fakeNewsPage' |
|||
# postId |
|||
row_dict['postId'] = userData[0]['postId'] |
|||
if i == len(news) - 1: |
|||
row_dict["isLast"]=1 |
|||
res["results"] = json.dumps(row_dict,ensure_ascii=False) |
|||
res["status"] = 1 |
|||
res["message"] = "成功" |
|||
raw_data["result"] = res |
|||
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
else: |
|||
# No pending tasks; sleep |
|||
time.sleep(10) |
|||
except userFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}, "status": 2,"message": "异常"} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '用户数据为空' |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "用户数据为空" |
|||
raw_data["result"] = res |
|||
logging.info("该条请求用户数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except chainFile_Exception: |
|||
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}, "status": 2,"message": "异常"} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = '关系链数据为空' |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "关系链数据为空" |
|||
raw_data["result"] = res |
|||
logging.info("该条请求关系链数据为空-{}".format(raw_data)) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
except: |
|||
res = {"successCode": "0", "errorLog": "", "results": {}, "status": 2,"message": "异常"} |
|||
results={} |
|||
results['pageType'] = 'fakeNewsPage' |
|||
results['recognitionResult'] = "" |
|||
res['results'] = json.dumps(results) |
|||
res["status"] = 2 |
|||
res["message"] = "异常" |
|||
raw_data["result"] = res |
|||
raw_data["result"]["errorLog"] = traceback.format_exc() |
|||
logging.info(traceback.format_exc()) |
|||
to_kafka.send_kafka(raw_data, logging) |
|||
|
|||
|
|||
def zk_monitoring(): |
|||
try: |
|||
# Production environment |
|||
zk = KazooClient(hosts=config['zookeeper']['zkhost']) |
|||
# Test environment |
|||
# zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181') |
|||
zk.start() |
|||
# Register the data watcher |
|||
@zk.DataWatch("/analyze") |
|||
def watch_node(data, stat, event): |
|||
if event is not None and event.type == EventType.CHANGED: |
|||
data, stat = zk.get("/analyze") |
|||
logging.info("执行删除操作:{}".format(data)) |
|||
d = json.loads(data) |
|||
id = d["scenes_id"] |
|||
stop_dict[id] = {} |
|||
stop_dict[id]["version"] = d["version"] |
|||
stop_dict[id]["operation"] = d["operation"] |
|||
# Keep running so the watcher keeps receiving node changes |
|||
try: |
|||
while True: |
|||
time.sleep(1) |
|||
except: |
|||
logging.info("Stopping...") |
|||
# Close the connection |
|||
zk.stop() |
|||
zk.close() |
|||
except: |
|||
logging.error(traceback.format_exc()) |
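For context, a sketch of the payload the /analyze watcher above expects; the field names (scenes_id, version, operation) come from the handler, and the values are illustrative.

# Illustrative /analyze znode payload; once it changes, predict_news() drops queued tasks
# whose scenes_id matches but whose version differs from the recorded one.
znode_payload = {"scenes_id": 5, "version": 2, "operation": "stop"}
# e.g. zk.set("/analyze", json.dumps(znode_payload, ensure_ascii=False).encode("utf-8"))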
|||
|
|||
|
|||
|
@ -0,0 +1,16 @@ |
|||
""" |
|||
WSGI config for Zhijian_Project_WebService project. |
|||
|
|||
It exposes the WSGI callable as a module-level variable named ``application``. |
|||
|
|||
For more information on this file, see |
|||
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/ |
|||
""" |
|||
|
|||
import os |
|||
|
|||
from django.core.wsgi import get_wsgi_application |
|||
|
|||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings") |
|||
|
|||
application = get_wsgi_application() |
@ -0,0 +1,83 @@ |
|||
Fake-news labels: the English samples are all 1 (fake news); the Chinese samples are all 0 (not fake news). |
|||
{ |
|||
"metadata":{ |
|||
"address":"http://172.24.12.127:9025/robotIdentificationTopic/", |
|||
"index":0, |
|||
"admin":{ |
|||
"TwitterAccount":{ |
|||
"topicId":1209, |
|||
"host":"172.26.28.30", |
|||
"user":"crawl", |
|||
"passwd":"crawl123", |
|||
"db":"test", |
|||
"port":3306, |
|||
"table":"TwitterAccount" |
|||
}, |
|||
"Twitter_chain":{ |
|||
"topicId":1209, |
|||
"host":"172.26.28.30", |
|||
"user":"crawl", |
|||
"passwd":"crawl123", |
|||
"db":"test", |
|||
"port":3306, |
|||
"table":"Twitter_chain" |
|||
} |
|||
} |
|||
}, |
|||
"output":{ |
|||
"output_type":"table", |
|||
"label_col":[ |
|||
|
|||
] |
|||
}, |
|||
"input":{ |
|||
"input_type":"text", |
|||
"label":[ |
|||
"2_任务提取" |
|||
] |
|||
}, |
|||
"user":{ |
|||
"tag":"" |
|||
}, |
|||
"data":{ |
|||
|
|||
}, |
|||
"created":1691004265000, |
|||
"module":"robotIdentification", |
|||
"start_tag":false, |
|||
"multi_branch":0, |
|||
"last_edit":1693417201000, |
|||
"next_app_id":[ |
|||
{ |
|||
"start_id":154, |
|||
"edge_id":75, |
|||
"end_id":155 |
|||
} |
|||
], |
|||
"transfer_id":3, |
|||
"version":1, |
|||
"blueprint_id":4, |
|||
"scenes_id":5, |
|||
"scenario":{ |
|||
"dataloss":1, |
|||
"autoCommitTriggerLast":1, |
|||
"maxErrors":3, |
|||
"autoCommit":1, |
|||
"freshVariables":1 |
|||
}, |
|||
"wait_condition":[ |
|||
|
|||
], |
|||
"scheduling":{ |
|||
"interval":-1, |
|||
"type":"single" |
|||
}, |
|||
"name":"robotIdentification", |
|||
"businessKey":"19615b029da477fb", |
|||
"id":154, |
|||
"position":[ |
|||
100, |
|||
200 |
|||
], |
|||
"describe":"" |
|||
} |
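A hedged sketch of how one of the metadata.admin blocks above maps onto a database query; pymysql and the inline SQL are assumptions for illustration only, since the project's own helper is tool.mysqlData in text_analysis/tools.

# Sketch only: turn the TwitterAccount entry from the sample payload into a query.
# pymysql and the SQL text are assumptions, not the project's tool.mysqlData implementation.
import pymysql

acct = task["metadata"]["admin"]["TwitterAccount"]   # task = the JSON document above
conn = pymysql.connect(host=acct["host"], user=acct["user"], password=acct["passwd"],
                       database=acct["db"], port=acct["port"])
with conn.cursor() as cur:
    cur.execute("SELECT * FROM {} WHERE topicId = %s".format(acct["table"]), (acct["topicId"],))
    rows = cur.fetchall()
conn.close()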
1
txt/关系链数据.txt
File diff suppressed because it is too large
@ -0,0 +1,3 @@ |
|||
1.python>3.7 |
|||
2.pandas=1.4.4 |
|||
3.sklearn=0.24.2 |
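A small, hedged runtime check against these requirements; importlib.metadata needs Python 3.8+, and "scikit-learn" is assumed to be the installed distribution name behind sklearn.

# Hedged environment check for txt/环境要求.txt (distribution names are assumptions).
import sys
from importlib.metadata import version

assert sys.version_info >= (3, 8), "Python > 3.7 required (3.8+ for importlib.metadata)"
print("pandas", version("pandas"))              # expected 1.4.4
print("scikit-learn", version("scikit-learn"))  # expected 0.24.2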
1
txt/账号数据.txt
File diff suppressed because it is too large
@ -0,0 +1,8 @@ |
|||
[uwsgi] |
|||
http = 0.0.0.0:9030 |
|||
chdir = ../fakeNewIdentification |
|||
wsgi-file = ../fakeNewIdentification/wsgi.py |
|||
processes = 1 |
|||
threads = 2 |
|||
listen = 1024 |
|||
http-timeout=21600 |
@ -0,0 +1,38 @@ |
|||
*** Starting uWSGI 2.0.21 (64bit) on [Fri Jan 3 09:27:26 2025] *** |
|||
compiled with version: 11.2.0 on 24 October 2023 19:53:56 |
|||
os: Linux-3.10.0-1127.19.1.el7.x86_64 #1 SMP Tue Aug 25 17:23:54 UTC 2020 |
|||
nodename: node-04 |
|||
machine: x86_64 |
|||
clock source: unix |
|||
pcre jit disabled |
|||
detected number of CPU cores: 64 |
|||
current working directory: /opt/analyze/apps/fakeNewIdentification |
|||
detected binary path: /opt/analyze/environment/python3.8/bin/uwsgi |
|||
uWSGI running as root, you can use --uid/--gid/--chroot options |
|||
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) *** |
|||
chdir() to ../fakeNewIdentification |
|||
*** WARNING: you are running uWSGI without its master process manager *** |
|||
your processes number limit is 1031041 |
|||
your memory page size is 4096 bytes |
|||
detected max file descriptor number: 65535 |
|||
lock engine: pthread robust mutexes |
|||
thunder lock: disabled (you can enable it with --thunder-lock) |
|||
uWSGI http bound on 0.0.0.0:9030 fd 4 |
|||
spawned uWSGI http 1 (pid: 51183) |
|||
uwsgi socket 0 bound to TCP address 127.0.0.1:42416 (port auto-assigned) fd 3 |
|||
uWSGI running as root, you can use --uid/--gid/--chroot options |
|||
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) *** |
|||
Python version: 3.8.16 (default, Jun 12 2023, 18:09:05) [GCC 11.2.0] |
|||
Python main interpreter initialized at 0x1f87250 |
|||
uWSGI running as root, you can use --uid/--gid/--chroot options |
|||
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) *** |
|||
python threads support enabled |
|||
your server socket listen backlog is limited to 1024 connections |
|||
your mercy for graceful operations on workers is 60 seconds |
|||
mapped 83376 bytes (81 KB) for 2 cores |
|||
*** Operational MODE: threaded *** |
|||
WSGI app 0 (mountpoint='') ready in 8 seconds on interpreter 0x1f87250 pid: 51182 (default app) |
|||
uWSGI running as root, you can use --uid/--gid/--chroot options |
|||
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) *** |
|||
*** uWSGI is running in multiple interpreter mode *** |
|||
spawned uWSGI worker 1 (and the only) (pid: 51182, cores: 2) |
@ -0,0 +1,34 @@ |
|||
""" |
|||
WSGI config for Zhijian_Project_WebService project. |
|||
|
|||
It exposes the WSGI callable as a module-level variable named ``application``. |
|||
|
|||
For more information on this file, see |
|||
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/ |
|||
""" |
|||
|
|||
import os |
|||
import configparser |
|||
import threading |
|||
from text_analysis.views import predict_news |
|||
# Load the configuration file |
|||
# configFile = './config.ini' |
|||
# # Create the config parser object |
|||
# con = configparser.ConfigParser() |
|||
# # Read the config file |
|||
# con.read(configFile, encoding='utf-8') |
|||
# # Database configuration |
|||
# dbConfig = dict(con.items('database')) |
|||
|
|||
t = threading.Thread(target=predict_news, name='predict_news') |
|||
t.daemon = True |
|||
t.start() |
|||
|
|||
from django.core.wsgi import get_wsgi_application |
|||
|
|||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings") |
|||
application = get_wsgi_application() |
|||
|
|||
|
|||
|
|||
|