
Fake news identification application

Branch: master
Commit 574071d287 by maojian, 7 months ago
  1. .idea/asr.iml (+13)
  2. .idea/misc.xml (+4)
  3. .idea/modules.xml (+8)
  4. .idea/workspace.xml (+823)
  5. config.ini (+23)
  6. inputdata/eg.py (+415)
  7. inputdata/fake_news_model.pkl (BIN)
  8. inputdata/test.xlsx (BIN)
  9. inputdata/test_1220.xlsx (BIN)
  10. inputdata/to_mysql.py (+45)
  11. inputdata/假新闻数据输入/Twitter_Account.xlsx (BIN)
  12. inputdata/假新闻数据输入/test.xlsx (BIN)
  13. inputdata/假新闻数据输入/传播分析1209.xlsx (BIN)
  14. inputdata/假新闻数据输入/传播分析1220.xlsx (BIN)
  15. inputdata/假新闻数据输入/传播分析test.xlsx (BIN)
  16. inputdata/假新闻数据输入/用户test.xlsx (BIN)
  17. inputdata/假新闻识别@20230918.py (+433)
  18. log_util/__pycache__/set_logger.cpython-36.pyc (BIN)
  19. log_util/__pycache__/set_logger.cpython-38.pyc (BIN)
  20. log_util/set_logger.py (+33)
  21. logs/results.log (+0)
  22. manage.py (+18)
  23. src.py (+35)
  24. start.sh (+1)
  25. stop_uwsgi.sh (+1)
  26. test.py (+103)
  27. text_analysis/__init__.py (+0)
  28. text_analysis/__pycache__/__init__.cpython-36.pyc (BIN)
  29. text_analysis/__pycache__/__init__.cpython-38.pyc (BIN)
  30. text_analysis/__pycache__/cusException.cpython-38.pyc (BIN)
  31. text_analysis/__pycache__/read_config.cpython-38.pyc (BIN)
  32. text_analysis/__pycache__/settings.cpython-36.pyc (BIN)
  33. text_analysis/__pycache__/settings.cpython-38.pyc (BIN)
  34. text_analysis/__pycache__/urls.cpython-36.pyc (BIN)
  35. text_analysis/__pycache__/urls.cpython-38.pyc (BIN)
  36. text_analysis/__pycache__/views.cpython-36.pyc (BIN)
  37. text_analysis/__pycache__/views.cpython-38.pyc (BIN)
  38. text_analysis/__pycache__/wsgi.cpython-36.pyc (BIN)
  39. text_analysis/__pycache__/wsgi.cpython-38.pyc (BIN)
  40. text_analysis/bak/views.py_0226 (+108)
  41. text_analysis/bak/views.py_0607 (+115)
  42. text_analysis/bak/views_20240807.py (+117)
  43. text_analysis/cusException.py (+10)
  44. text_analysis/linshi.py (+9)
  45. text_analysis/model/bot_user.pkl (BIN)
  46. text_analysis/model/fake_news_model.pkl (BIN)
  47. text_analysis/read_config.py (+10)
  48. text_analysis/request.py (+14)
  49. text_analysis/settings.py (+148)
  50. text_analysis/src.py (+90)
  51. text_analysis/tools/__pycache__/cusException.cpython-36.pyc (BIN)
  52. text_analysis/tools/__pycache__/mysql_helper.cpython-36.pyc (BIN)
  53. text_analysis/tools/__pycache__/pred.cpython-38.pyc (BIN)
  54. text_analysis/tools/__pycache__/process.cpython-36.pyc (BIN)
  55. text_analysis/tools/__pycache__/to_kafka.cpython-36.pyc (BIN)
  56. text_analysis/tools/__pycache__/to_kafka.cpython-38.pyc (BIN)
  57. text_analysis/tools/__pycache__/tool.cpython-36.pyc (BIN)
  58. text_analysis/tools/__pycache__/tool.cpython-38.pyc (BIN)
  59. text_analysis/tools/__pycache__/tools.cpython-36.pyc (BIN)
  60. text_analysis/tools/bak/pred.py (+456)
  61. text_analysis/tools/bak/tool.py (+220)
  62. text_analysis/tools/cusException.py (+25)
  63. text_analysis/tools/kakfa_util.py (+67)
  64. text_analysis/tools/mysql_helper.py (+338)
  65. text_analysis/tools/pred.py (+456)
  66. text_analysis/tools/process.py (+51)
  67. text_analysis/tools/seleniumTest.py (+171)
  68. text_analysis/tools/to_kafka.py (+25)
  69. text_analysis/tools/tool.py (+233)
  70. text_analysis/tools/关系链数据.txt (+1)
  71. text_analysis/tools/账号数据.txt (+1)
  72. text_analysis/urls.py (+13)
  73. text_analysis/views.py (+158)
  74. text_analysis/wsgi.py (+16)
  75. txt/fakeNew.txt (+83)
  76. txt/关系链数据.txt (+1)
  77. txt/技术部分初稿@20230302.docx (BIN)
  78. txt/环境要求.txt (+3)
  79. txt/账号数据.txt (+1)
  80. uwsgi.ini (+8)
  81. wsgi.log (+38)
  82. wsgi.py (+34)

.idea/asr.iml (+13)

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/text_analysis/tools" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.8.16 (D:\LH_program\Anaconda3\envs\python38_env\python.exe)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

.idea/misc.xml (+4)

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8.16 (D:\LH_program\Anaconda3\envs\python38_env\python.exe)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml (+8)

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/asr.iml" filepath="$PROJECT_DIR$/.idea/asr.iml" />
</modules>
</component>
</project>

.idea/workspace.xml (+823)

@@ -0,0 +1,823 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="26e841a3-8bef-4d1d-bf9a-d6d27e32457a" name="Default" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="ExecutionTargetManager" SELECTED_TARGET="default_target" />
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="450">
<file leaf-file-name="eg.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/inputdata/eg.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="423">
<caret line="282" column="35" lean-forward="true" selection-start-line="282" selection-start-column="35" selection-end-line="282" selection-end-column="35" />
<folding />
</state>
</provider>
</entry>
</file>
<file leaf-file-name="pred.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/text_analysis/tools/pred.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="309">
<caret line="127" column="0" lean-forward="true" selection-start-line="127" selection-start-column="0" selection-end-line="127" selection-end-column="0" />
<folding>
<element signature="e#13#32#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="views.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="399">
<caret line="69" column="44" lean-forward="false" selection-start-line="69" selection-start-column="36" selection-end-line="69" selection-end-column="44" />
<folding>
<element signature="e#13#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="to_kafka.py" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="513">
<caret line="23" column="0" lean-forward="true" selection-start-line="23" selection-start-column="0" selection-end-line="23" selection-end-column="0" />
<folding>
<element signature="e#13#29#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="tool.py" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/text_analysis/tools/tool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="108">
<caret line="8" column="23" lean-forward="false" selection-start-line="8" selection-start-column="14" selection-end-line="8" selection-end-column="23" />
<folding />
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>KafkaClient</find>
<find>open</find>
<find>layer</find>
<find>post</find>
<find>is_eng</find>
<find>getText_count_eng</find>
<find>fansCount</find>
<find>postset</find>
<find>columns</find>
<find>diffdate均值</find>
<find>sub_shareCount</find>
<find>pre_user</find>
<find>post_related</find>
</findStrings>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/src.py" />
<option value="$PROJECT_DIR$/test.py" />
<option value="$PROJECT_DIR$/text_analysis/src.py" />
<option value="$PROJECT_DIR$/text_analysis/linshi.py" />
<option value="$PROJECT_DIR$/uwsgi.ini" />
<option value="$PROJECT_DIR$/start.sh" />
<option value="$PROJECT_DIR$/stop_uwsgi.sh" />
<option value="$PROJECT_DIR$/wsgi.py" />
<option value="$PROJECT_DIR$/inputdata/假新闻识别@20230918.py" />
<option value="$PROJECT_DIR$/../robotIdentificationTopic/text_analysis/linshi.py" />
<option value="$PROJECT_DIR$/text_analysis/urls.py" />
<option value="$PROJECT_DIR$/manage.py" />
<option value="$PROJECT_DIR$/text_analysis/tools/tool.py" />
<option value="$PROJECT_DIR$/linshi.py" />
<option value="$PROJECT_DIR$/inputdata/to_mysql.py" />
<option value="$PROJECT_DIR$/inputdata/eg.py" />
<option value="$PROJECT_DIR$/text_analysis/tools/pred.py" />
<option value="$PROJECT_DIR$/text_analysis/tools/to_kafka.py" />
<option value="$PROJECT_DIR$/text_analysis/views.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="-11" />
<option name="y" value="-11" />
<option name="width" value="1942" />
<option name="height" value="1042" />
</component>
<component name="ProjectView">
<navigator currentView="ProjectPane" proportions="" version="1">
<flattenPackages />
<showMembers />
<showModules />
<showLibraryContents />
<hideEmptyPackages />
<abbreviatePackageNames />
<autoscrollToSource />
<autoscrollFromSource />
<sortByType />
<manualOrder />
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scratches" />
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="text_analysis" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="text_analysis" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="tools" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fakeNewIdentification" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="inputdata" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="last_opened_file_path" value="$PROJECT_DIR$/../chatGpt" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="PyDebuggerOptionsProvider">
<option name="mySupportQtDebugging" value="false" />
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.to_kafka">
<configuration default="false" name="假新闻识别@20230918" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/inputdata" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/inputdata/假新闻识别@20230918.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<method />
</configuration>
<configuration default="false" name="linshi (1)" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/linshi.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<method />
</configuration>
<configuration default="false" name="eg" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/inputdata" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/inputdata/eg.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<method />
</configuration>
<configuration default="false" name="pred" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/text_analysis/tools" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/text_analysis/tools/pred.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<method />
</configuration>
<configuration default="false" name="to_kafka" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/text_analysis/tools" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/text_analysis/tools/to_kafka.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<method />
</configuration>
<configuration default="true" type="PythonConfigurationType" factoryName="Python">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="SCRIPT_NAME" value="" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<method />
</configuration>
<configuration default="true" type="Tox" factoryName="Tox">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<method />
</configuration>
<configuration default="true" type="tests" factoryName="Doctests">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="SCRIPT_NAME" value="" />
<option name="CLASS_NAME" value="" />
<option name="METHOD_NAME" value="" />
<option name="FOLDER_NAME" value="" />
<option name="TEST_TYPE" value="TEST_SCRIPT" />
<option name="PATTERN" value="" />
<option name="USE_PATTERN" value="false" />
<method />
</configuration>
<configuration default="true" type="tests" factoryName="Unittests">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="asr" />
<option name="_new_additionalArguments" value="&quot;&quot;" />
<option name="_new_target" value="&quot;.&quot;" />
<option name="_new_targetType" value="&quot;PATH&quot;" />
<method />
</configuration>
<list size="5">
<item index="0" class="java.lang.String" itemvalue="Python.假新闻识别@20230918" />
<item index="1" class="java.lang.String" itemvalue="Python.linshi (1)" />
<item index="2" class="java.lang.String" itemvalue="Python.eg" />
<item index="3" class="java.lang.String" itemvalue="Python.pred" />
<item index="4" class="java.lang.String" itemvalue="Python.to_kafka" />
</list>
<recent_temporary>
<list size="5">
<item index="0" class="java.lang.String" itemvalue="Python.to_kafka" />
<item index="1" class="java.lang.String" itemvalue="Python.pred" />
<item index="2" class="java.lang.String" itemvalue="Python.eg" />
<item index="3" class="java.lang.String" itemvalue="Python.假新闻识别@20230918" />
<item index="4" class="java.lang.String" itemvalue="Python.linshi (1)" />
</list>
</recent_temporary>
</component>
<component name="ShelveChangesManager" show_recycled="false">
<option name="remove_strategy" value="false" />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="26e841a3-8bef-4d1d-bf9a-d6d27e32457a" name="Default" comment="" />
<created>1692600024256</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1692600024256</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="-11" y="-11" width="1942" height="1042" extended-state="7" />
<editor active="true" />
<layout>
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.06614583" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="true" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.25711036" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="7" side_tool="false" content_ui="tabs" />
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="true" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.17633675" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Data View" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4515625" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
</layout>
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/inputdata/假新闻识别@20230918.py</url>
<line>190</line>
<option name="timeStamp" value="33" />
</line-breakpoint>
</breakpoints>
<option name="time" value="43" />
</breakpoint-manager>
<watches-manager />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="3240">
<caret line="93" column="18" lean-forward="false" selection-start-line="93" selection-start-column="14" selection-end-line="93" selection-end-column="18" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1692">
<caret line="54" column="0" lean-forward="false" selection-start-line="54" selection-start-column="0" selection-end-line="54" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/manage.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="13" column="19" lean-forward="false" selection-start-line="13" selection-start-column="19" selection-end-line="13" selection-end-column="19" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="8" column="62" lean-forward="false" selection-start-line="8" selection-start-column="62" selection-end-line="8" selection-end-column="62" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/wsgi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="396">
<caret line="14" column="0" lean-forward="false" selection-start-line="14" selection-start-column="0" selection-end-line="20" selection-end-column="9" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="4464">
<caret line="125" column="51" lean-forward="false" selection-start-line="125" selection-start-column="44" selection-end-line="125" selection-end-column="51" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1152">
<caret line="32" column="0" lean-forward="false" selection-start-line="32" selection-start-column="0" selection-end-line="32" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="3240">
<caret line="93" column="18" lean-forward="false" selection-start-line="93" selection-start-column="14" selection-end-line="93" selection-end-column="18" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1692">
<caret line="54" column="0" lean-forward="false" selection-start-line="54" selection-start-column="0" selection-end-line="54" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/manage.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="13" column="19" lean-forward="false" selection-start-line="13" selection-start-column="19" selection-end-line="13" selection-end-column="19" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="8" column="62" lean-forward="false" selection-start-line="8" selection-start-column="62" selection-end-line="8" selection-end-column="62" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/wsgi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="396">
<caret line="14" column="0" lean-forward="false" selection-start-line="14" selection-start-column="0" selection-end-line="20" selection-end-column="9" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2916">
<caret line="82" column="33" lean-forward="true" selection-start-line="82" selection-start-column="33" selection-end-line="82" selection-end-column="65" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="792">
<caret line="22" column="14" lean-forward="false" selection-start-line="22" selection-start-column="14" selection-end-line="22" selection-end-column="14" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1332">
<caret line="38" column="4" lean-forward="true" selection-start-line="38" selection-start-column="4" selection-end-line="38" selection-end-column="4" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="612">
<caret line="24" column="26" lean-forward="true" selection-start-line="24" selection-start-column="26" selection-end-line="24" selection-end-column="26" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="8" column="62" lean-forward="true" selection-start-line="8" selection-start-column="62" selection-end-line="8" selection-end-column="62" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/wsgi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="396">
<caret line="14" column="0" lean-forward="false" selection-start-line="14" selection-start-column="0" selection-end-line="20" selection-end-column="9" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/tools/tool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="360">
<caret line="10" column="0" lean-forward="false" selection-start-line="10" selection-start-column="0" selection-end-line="10" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/manage.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="13" column="19" lean-forward="true" selection-start-line="13" selection-start-column="19" selection-end-line="13" selection-end-column="19" />
</state>
</provider>
</entry>
<entry file="file://D:/LH_program/Anaconda3/envs/python3.6test/Lib/site-packages/pandas/tests/reshape/merge/test_merge_asof.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="144">
<caret line="13" column="8" lean-forward="false" selection-start-line="13" selection-start-column="8" selection-end-line="13" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../../../2022/空天院高分项目/Project_kongtianyuan/text_analysis/tools/tool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="108">
<caret line="3" column="10" lean-forward="true" selection-start-line="3" selection-start-column="10" selection-end-line="4" selection-end-column="38" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../../../2022/空天院高分项目/Project_kongtianyuan/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="288">
<caret line="107" column="21" lean-forward="false" selection-start-line="107" selection-start-column="12" selection-end-line="107" selection-end-column="21" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../../../2022/Project_KG_Content/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="252">
<caret line="49" column="0" lean-forward="false" selection-start-line="49" selection-start-column="0" selection-end-line="51" selection-end-column="37" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../mySql/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="108">
<caret line="90" column="90" lean-forward="true" selection-start-line="90" selection-start-column="90" selection-end-line="90" selection-end-column="90" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/test.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="723">
<caret line="93" column="18" lean-forward="false" selection-start-line="93" selection-start-column="14" selection-end-line="93" selection-end-column="18" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="554">
<caret line="32" column="0" lean-forward="false" selection-start-line="32" selection-start-column="0" selection-end-line="32" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/linshi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="3528">
<caret line="100" column="0" lean-forward="false" selection-start-line="100" selection-start-column="0" selection-end-line="100" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/src.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1656">
<caret line="46" column="13" lean-forward="false" selection-start-line="46" selection-start-column="4" selection-end-line="46" selection-end-column="13" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/start.sh">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="30" lean-forward="false" selection-start-line="0" selection-start-column="30" selection-end-line="0" selection-end-column="30" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/stop_uwsgi.sh">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="12" lean-forward="false" selection-start-line="0" selection-start-column="12" selection-end-line="0" selection-end-column="12" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/uwsgi.ini">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="72">
<caret line="2" column="36" lean-forward="true" selection-start-line="2" selection-start-column="36" selection-end-line="2" selection-end-column="36" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/wsgi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="210">
<caret line="12" column="39" lean-forward="false" selection-start-line="12" selection-start-column="39" selection-end-line="12" selection-end-column="39" />
</state>
</provider>
</entry>
<entry file="file://D:/LH_program/Anaconda3/envs/python38_env/Lib/site-packages/pandas/compat/_optional.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="63">
<caret line="138" column="23" lean-forward="true" selection-start-line="138" selection-start-column="23" selection-end-line="138" selection-end-column="23" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/linshi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-521">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/src.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-432">
<caret line="9" column="12" lean-forward="true" selection-start-line="9" selection-start-column="12" selection-end-line="9" selection-end-column="12" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/text_analysis/linshi.py" />
<entry file="file://$PROJECT_DIR$/../robotIdentificationTopic/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="324">
<caret line="10" column="29" lean-forward="false" selection-start-line="10" selection-start-column="0" selection-end-line="11" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/linshi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="288">
<caret line="9" column="0" lean-forward="false" selection-start-line="9" selection-start-column="0" selection-end-line="9" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/urls.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="144">
<caret line="6" column="32" lean-forward="false" selection-start-line="6" selection-start-column="11" selection-end-line="6" selection-end-column="32" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/manage.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="10" column="13" lean-forward="false" selection-start-line="10" selection-start-column="13" selection-end-line="10" selection-end-column="13" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/inputdata/to_mysql.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1512">
<caret line="45" column="0" lean-forward="false" selection-start-line="45" selection-start-column="0" selection-end-line="45" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://D:/LH_program/Anaconda3/envs/python38_env/Lib/site-packages/pandas/core/internals/base.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="118">
<caret line="68" column="0" lean-forward="false" selection-start-line="68" selection-start-column="0" selection-end-line="68" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/inputdata/假新闻识别@20230918.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-9209">
<caret line="118" column="8" lean-forward="true" selection-start-line="118" selection-start-column="8" selection-end-line="118" selection-end-column="8" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/tools/tool.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="108">
<caret line="8" column="23" lean-forward="false" selection-start-line="8" selection-start-column="14" selection-end-line="8" selection-end-column="23" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../../假新闻识别/假新闻识别/假新闻识别@20230918.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="378">
<caret line="382" column="0" lean-forward="true" selection-start-line="382" selection-start-column="0" selection-end-line="382" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/inputdata/eg.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="423">
<caret line="282" column="35" lean-forward="true" selection-start-line="282" selection-start-column="35" selection-end-line="282" selection-end-column="35" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/tools/pred.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="309">
<caret line="127" column="0" lean-forward="true" selection-start-line="127" selection-start-column="0" selection-end-line="127" selection-end-column="0" />
<folding>
<element signature="e#13#32#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../chatGpt/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="432">
<caret line="67" column="48" lean-forward="false" selection-start-line="67" selection-start-column="40" selection-end-line="67" selection-end-column="48" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/views.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="399">
<caret line="69" column="44" lean-forward="false" selection-start-line="69" selection-start-column="36" selection-end-line="69" selection-end-column="44" />
<folding>
<element signature="e#13#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/text_analysis/tools/to_kafka.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="513">
<caret line="23" column="0" lean-forward="true" selection-start-line="23" selection-start-column="0" selection-end-line="23" selection-end-column="0" />
<folding>
<element signature="e#13#29#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>

config.ini (+23)

@@ -0,0 +1,23 @@
[database]
;database host
host=node-01
;port
port=3306
;username
username=root
;password
password=bw@2025
;database name
db=analyze
[zookeeper]
;ZooKeeper address
zkhost=node-01:12181,node-02:12181,node-03:12181
;node
node=/analyze
[kafka]
;Kafka bootstrap servers
bootstrap_servers=node-01:19092,node-02:19092,node-03:19092
;topic
topic=produce_analyze
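
For orientation: config.ini above can be consumed with Python's standard configparser, which skips the full-line ";" comments by default. The repository ships its own loader (text_analysis/read_config.py, contents not shown in this commit), so the following is only a minimal sketch under stated assumptions, not the project's actual code; the section and key names come from the file above, while the helper name and return shape are invented for illustration.

# Minimal sketch (assumption: not the project's read_config.py): load config.ini with configparser.
import configparser

def load_config(path="config.ini"):  # hypothetical helper name
    cfg = configparser.ConfigParser()
    cfg.read(path, encoding="utf-8")
    database = {
        "host": cfg.get("database", "host"),
        "port": cfg.getint("database", "port"),
        "user": cfg.get("database", "username"),
        "password": cfg.get("database", "password"),
        "db": cfg.get("database", "db"),
    }
    kafka = {
        "bootstrap_servers": cfg.get("kafka", "bootstrap_servers").split(","),
        "topic": cfg.get("kafka", "topic"),
    }
    zookeeper = {
        "hosts": cfg.get("zookeeper", "zkhost"),
        "node": cfg.get("zookeeper", "node"),
    }
    return database, kafka, zookeeper

if __name__ == "__main__":
    print(load_config())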

inputdata/eg.py (+415)

@@ -0,0 +1,415 @@
#coding:utf8
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
from tqdm import tqdm
import datetime
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
import joblib
def pre_user(data_user):
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
data_user = data_user.dropna()
data_user = data_user.drop_duplicates().reset_index(drop = True)
data_user['fansCount'] = data_user['fansCount'].astype(int)
data_user['likeCount'] = data_user['likeCount'].astype(int)
data_user['postCount'] = data_user['postCount'].astype(int)
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
return data_user
def getText_count_eng(txt):
"""英文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '\t':
counts[word] = counts.get(word,0) + 1 #count word occurrences
items = pd.DataFrame(list(counts.items()))
return items
def getText_count_ch(txt):
"""中文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz': #将文本中特殊符号数字删除
txt = txt.replace(ch,"")
words = jieba.lcut(txt)
counts = {}
for word in words:
counts[word] = counts.get(word,0) + 1
items = list(counts.items())
fin_items = []
for item in items:
if len(item[0])>=2:
fin_items.append(item)
fin_items = pd.DataFrame(fin_items)
return fin_items
def getText_count_U(txt):
"""统计英文大写词频"""
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '/t':
if word.isupper(): #keep only all-uppercase words
counts[word] = counts.get(word,0) + 1 #count word occurrences
items = pd.DataFrame(list(counts.items())) #convert the dict into a DataFrame
if items.shape == (0,0):
out = 0
else:
out = sum(items[1])
return out
def is_chinese(strs):
"""判断一个unicode是否是汉字/英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
return False
return True
def is_eng(strs):
"""判断一个unicode是否是英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar):
return False
return True
def pre_user(data_user):
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
data_user = data_user.dropna()
data_user = data_user.drop_duplicates().reset_index(drop = True)
data_user['fansCount'] = data_user['fansCount'].astype(int)
data_user['likeCount'] = data_user['likeCount'].astype(int)
data_user['postCount'] = data_user['postCount'].astype(int)
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
return data_user
def post_related(df, data_user):
# postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
# 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
# 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
# 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容',
# '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
# '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差',
# '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值',
# '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差'])
postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id','所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
'语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
'emotion', 'emotion_sub', '最大词频数', '重复词汇占比'])
for post_id in tqdm(df['所属帖子id'].drop_duplicates().reset_index(drop=True)):
data = df[df['所属帖子id'] == post_id].reset_index(drop=True)
data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
'shareCount', 'url']
data = data.drop_duplicates()
post = data[data['传播层级'] == '1'].head(1)
### Part 1: news propagation -- the post network
##1. layer / shape / degree
post['layer'] = int(max(data['传播层级']))
post['shape'] = data.shape[0] - 1
post['degree'] = data[data['传播层级'] == '2'].shape[0]
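# layer: maximum propagation depth; shape: number of reposts in the chain (all rows minus the root post);
# degree: number of direct level-2 reposts of the root post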
##2. overall network measures (post-network measures)
###2.1 map each repost-source id to its repost-source user
tmp_zfyh = pd.merge(data[data['传播层级'] != '1']['转发来源id'].drop_duplicates(),
data[data['帖子id'].notnull()][['帖子id', '用户名']],
left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
post_edge = data.copy()
post_edge = data[data['传播层级'] != '1'][['用户名', '转发来源用户名']]
post_edge.columns = ['source', 'target']
post_edge['count_all'] = 1
post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
# post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False)
edgeweightset = post_edge[['source', 'target', 'count_all']]
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
for k in range(len(edgeweightset_l)):
for j in range(edgeweightset.shape[1]):
edgeweightset_l[k].append(edgeweightset.iloc[k, j])
# print(i/len(edgeweightset_l))
if len(edgeweightset_l) == 0: # no propagation chain
post['closeness_centrality'] = 1
post['pagerank'] = 1
else:
g = nx.DiGraph()
g.add_weighted_edges_from(edgeweightset_l)
centrality = [nx.closeness_centrality(g),
nx.pagerank(g)]
results = []
nodes = g.nodes() # list of nodes in the network
for node in nodes: # iterate over all nodes and store each node's centrality results as [[node1, results], [node2, results], ...]
results.append([node,
centrality[0][node],
centrality[1][node]])
results = pd.DataFrame(results)
results.columns = ['node', 'closeness_centrality', 'pagerank']
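# keep, as post-level features, the closeness_centrality and pagerank of the node with the
# highest closeness centrality in the repost network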
post['closeness_centrality'] = results[results['node'] == results[
results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][
'closeness_centrality'].iloc[0]
post['pagerank'] = results[results['node'] ==
results[results['closeness_centrality'] == max(results['closeness_centrality'])][
'node'].iloc[0]]['pagerank'].iloc[0]
# post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
# post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]
#——————————hh——————————————
# feature not used
# ##3. average influence (shareCount) along the propagation chain
# tmp = 0
# for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
# tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
# if tmp == 0:
# post['sub_shareCount'] = 0
# else:
# post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]
#————————————————————————
##Part 2: main-post text
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
##counts of special tokens in the text (http, @, #)
post['主贴http'] = post['发表内容'].iloc[0].count('http')
post['主贴at'] = post['发表内容'].iloc[0].count('@')
post['主贴tag'] = post['发表内容'].iloc[0].count('#')
##detect the language
tmp = post['发表内容'].iloc[0]
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
tmp = tmp.replace(ch, '')
if is_eng(tmp): ##main post is in English
post['语言'] = 0
text = post['发表内容'].iloc[0]
# text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
text = text[0:text.rfind("http")]
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
text = text.replace(ch, ' ')
##text length
words = text.split(' ')
post['主贴长度'] = len(words)
##text sentiment
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
post['emotion'] = emo.loc[0, 0]
post['emotion_sub'] = emo.loc[1, 0]
##text word frequencies
## word-frequency stat 1: maximum word frequency
## word-frequency stat 2: share of words appearing at least twice in the body
items = getText_count_eng(text)
if items.shape == (0, 0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
## word-frequency stat 3: frequency of all-uppercase words
post['大写词频'] = getText_count_U(text)
elif is_chinese(tmp): ##main post is in Chinese
post['语言'] = 1
text = post['发表内容'].iloc[0]
text = text[0:text.rfind("http")]
post['主贴长度'] = len(text)
post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2
post['emotion_sub'] = np.NaN
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]
##text word frequencies
## word-frequency stat 1: maximum body frequency of words that also appear in the title
## word-frequency stat 2: share of words appearing at least twice in the body
items = getText_count_ch(text)
if items.shape == (0, 0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
## word-frequency stat 3: frequency of all-uppercase words
post['大写词频'] = np.NaN
else:
post['语言'] = np.NaN
post['主贴长度'] = np.NaN
post['emotion'] = np.NaN
post['emotion_sub'] = np.NaN
post['最大词频数'] = np.NaN
post['重复词汇占比'] = np.NaN
post['大写词频'] = np.NaN
# ##4.2 text along the propagation chain
# sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']])
# sub_post['语言'] = np.NaN
# sub_post['文本长度'] = np.NaN
# sub_post['http'] = np.NaN
# sub_post['at'] = np.NaN
# sub_post['tag'] = np.NaN
# sub_post['emotion'] = np.NaN
# sub_post['emotion_sub'] = np.NaN
# sub_post['diffdate'] = np.NaN
#
# for k in range(sub_post.shape[0]):
# ##counts of special tokens in the text (http, @, #)
# sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
# sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
# sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
#
# ##time difference
# d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S")
# base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S")
#
# # now = datetime.datetime.now()
# sub_post['diffdate'].iloc[k] = (d1 - base).days
#
# ##detect the language
# tmp = sub_post['发表内容'].iloc[k]
# for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
# tmp = tmp.replace(ch, '')
#
# if is_eng(tmp): ##English content
#
# sub_post['语言'].iloc[k] = 0
#
# ##text length
# text = sub_post['发表内容'].iloc[k]
# # text = "'America is collapsing and it's China's fault' is definitely a change of direction?"
# text = text[0:text.rfind("http")]
# for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
# text = text.replace(ch, ' ')
# words = text.split(' ')
# sub_post['文本长度'].iloc[k] = len(words)
# ##sentiment
# sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
# sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0]
# sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0]
#
# elif is_chinese(tmp): ##Chinese content
#
# sub_post['语言'].iloc[k] = 1
#
# ##text length
# text = sub_post['发表内容'].iloc[k]
# text = text[0:text.rfind("http")]
# sub_post['文本长度'].iloc[k] = len(text)
# ##sentiment
# sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2
# sub_post['emotion_sub'].iloc[k] = np.NaN
#
# else:
#
# sub_post['语言'].iloc[k] = np.NaN
# sub_post['文本长度'].iloc[k] = np.NaN
# sub_post['emotion'].iloc[k] = np.NaN
# sub_post['emotion_sub'].iloc[k] = np.NaN
#
# if sub_post.shape[0] == 0:
# post['有无传播内容'] = 0
# else:
# post['有无传播内容'] = 1
#
# post['传播链语言均值'] = sub_post['语言'].mean()
# post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
# post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
#
# ##mean of emotion_sub over non-null values
# post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
#
# post['传播链贴文http均值'] = sub_post['http'].mean()
#
# post['传播链贴文at均值'] = sub_post['at'].mean()
#
# post['传播链贴文tag均值'] = sub_post['tag'].mean()
#
# post['diffdate均值'] = sub_post['diffdate'].mean()
##Part 3: user information
##the posting user
post = pd.merge(post, data_user, how='left', on='用户名')
##users along the propagation chain
sub_user = pd.DataFrame(data[data['传播层级'] != '1'][['用户名']])
sub_user = pd.merge(sub_user, data_user, how='left', on='用户名')
sub_user = sub_user.dropna()
post['nickName均值'] = sub_user['nickName'].mean()
post['fansCount均值'] = sub_user['fansCount'].mean()
post['likeCount均值'] = sub_user['likeCount'].mean()
post['postCount均值'] = sub_user['postCount'].mean()
post['otherInfo均值'] = sub_user['otherInfo'].mean()
postset = pd.concat([postset, post]).reset_index(drop=True)
postset = postset.fillna(0)
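# emotion_degree: absolute sentiment polarity, i.e. sentiment strength regardless of sign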
postset['emotion_degree'] = abs(postset['emotion'])
return postset
xlsx_path_po = r'假新闻数据输入\传播分析test.xlsx'
data_po = pd.read_excel(xlsx_path_po, dtype="str")
data_user = pd.read_excel(r'假新闻数据输入\用户test.xlsx', dtype="str")
data_user = pre_user(data_user)
#data_user=dataframe[@XHNews,1,878,1178,938,1]
#data_user.columns=['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
postset_po = post_related(data_po,data_user) ## 正面文件
features = postset_po[[
#'shareCount',
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
'主贴http', '主贴at', '主贴tag',
'主贴长度','emotion', 'emotion_degree',
'最大词频数', '重复词汇占比',#(中英文差异大)
#'有无传播内容',
'fansCount','likeCount', 'postCount',
#'sub_shareCount',
'fansCount均值', 'postCount均值', 'otherInfo均值'
]]
clf = joblib.load(r'fake_news_model.pkl')
clf_predict = clf.predict(features)
print(clf_predict)
res=pd.DataFrame(clf_predict)
res.columns=['假新闻预测结果']
result = pd.concat([postset_po, res], axis=1)
result.to_excel('test_1209_1.xlsx',index=None)
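# Note: scikit-learn predicts by column position, so the order of the columns selected into
# `features` above must match the order used when fake_news_model.pkl was trained.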

BIN
inputdata/fake_news_model.pkl

BIN
inputdata/test.xlsx

BIN
inputdata/test_1220.xlsx

45
inputdata/to_mysql.py

@ -0,0 +1,45 @@
#coding:utf8
import json
import pymysql
import traceback
import pandas as pd
content_db = pymysql.connect(host='172.26.28.30', user='crawl', passwd='crawl123', db='test', port=3306,
charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
def to_mysql(sql,values):
content_db.ping(reconnect=True)
cursor = content_db.cursor()
cursor.execute(sql,values)
content_db.commit()
cursor.close()
def write_data_mysql():
data=pd.read_excel('假新闻数据输入/test.xlsx',keep_default_na=False)
try:
for i in data.index:
# line_key=list(data.loc[i].keys())
line_value=data.loc[i].values
# line_str=([str(x) for x in line_value])
line_str=[]
for index,x in enumerate(line_value):
line_str.append(x)
line_str=[0]+line_str
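# The leading 0 is presumably a placeholder for the table's auto-increment id column (assumption).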
sql = "insert into TwitterAccount "+"values ("+ ','.join(['%s'] * len(line_str)) + ")"
# print(line_str)
# print(sql)
values=tuple(line_str)
# to_mysql(sql,values)
# content_db.ping(reconnect=True)
cursor = content_db.cursor()
cursor.execute(sql, values)
content_db.commit()
cursor.close()
print('%s条数据写入mysql'%(i+1))
except:
print(traceback.format_exc())
content_db.rollback()
write_data_mysql()
content_db.close()
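# Note: the loop above opens its own cursor for each row; the to_mysql(sql, values) helper defined
# earlier is an equivalent (currently unused) way to run the same parameterised insert.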

BIN
inputdata/假新闻数据输入/Twitter_Account.xlsx

BIN
inputdata/假新闻数据输入/test.xlsx

BIN
inputdata/假新闻数据输入/传播分析1209.xlsx

BIN
inputdata/假新闻数据输入/传播分析1220.xlsx

BIN
inputdata/假新闻数据输入/传播分析test.xlsx

BIN
inputdata/假新闻数据输入/用户test.xlsx

433
inputdata/假新闻识别@20230918.py

@ -0,0 +1,433 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 13 18:13:03 2023
@author: chong
"""
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# from sklearn import metrics
import joblib
# import matplotlib.pyplot as plt
# import seaborn as sns
def pre_user(data_user):
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
data_user = data_user.dropna()
data_user = data_user.drop_duplicates().reset_index(drop = True)
data_user['fansCount'] = data_user['fansCount'].astype(int)
data_user['likeCount'] = data_user['likeCount'].astype(int)
data_user['postCount'] = data_user['postCount'].astype(int)
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
return data_user
def getText_count_eng(txt):
"""英文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '\t':
counts[word] = counts.get(word,0) + 1 #统计字数
items = pd.DataFrame(list(counts.items()))
return items
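# getText_count_eng returns a two-column DataFrame: column 0 is the word, column 1 its frequency
# (stopwords and tab tokens are excluded).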
def getText_count_ch(txt):
"""中文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz': #将文本中特殊符号数字删除
txt = txt.replace(ch,"")
words = jieba.lcut(txt)
counts = {}
for word in words:
counts[word] = counts.get(word,0) + 1
items = list(counts.items())
fin_items = []
for item in items:
if len(item[0])>=2:
fin_items.append(item)
fin_items = pd.DataFrame(fin_items)
return fin_items
def getText_count_U(txt):
"""统计英文大写词频"""
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '\t':
if word.isupper(): #统计大写
counts[word] = counts.get(word,0) + 1 #统计字数
items = pd.DataFrame(list(counts.items())) #将字典类型转换成列表类型
if items.shape == (0,0):
out = 0
else:
out = sum(items[1])
return out
def is_chinese(strs):
"""判断一个unicode是否是汉字/英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
return False
return True
def is_eng(strs):
"""判断一个unicode是否是英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar):
return False
return True
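# Note: is_chinese() also returns True for strings made up purely of ASCII letters, so the code
# below tests is_eng() first and only falls back to is_chinese() for genuinely Chinese text.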
def post_related(df,data_user):
postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频','有无传播内容',
'传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
'传播链贴文emotion_sub均值','传播链贴文emotion_sub标准差',
'传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值',
'传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差'])
for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop = True):
data = df[df['所属帖子id']==post_id].reset_index(drop = True)
data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
'shareCount', 'url']
data = data.drop_duplicates()
post = data[data['传播层级']=='1'].head(1)
### 一、新闻传播--贴文网络
##1.layer/shape/degree
post['layer'] = int(max(data['传播层级']))
post['shape'] = data.shape[0]-1
post['degree'] = data[data['传播层级']=='2'].shape[0]
##2.整体网络测度(贴文网络测度)
###2.1把转发来源id对应到转发来源用户
tmp_zfyh = pd.merge(data[data['传播层级']!='1']['转发来源id'].drop_duplicates(),
data[data['帖子id'].notnull()][['帖子id','用户名']],
left_on = ['转发来源id'], right_on = ['帖子id'], how = 'left')[['转发来源id','用户名']]
tmp_zfyh.columns = ['转发来源id','转发来源用户名']
data = pd.merge(data, tmp_zfyh, left_on = ['转发来源id'], right_on = ['转发来源id'], how = 'left')
post_edge = data.copy()
post_edge = data[data['传播层级']!='1'][['用户名','转发来源用户名']]
post_edge.columns = ['source','target']
# tmp1 = data[(data['帖子id'].notnull())&(data['传播层级']!='1')][['帖子id','转发来源id']]
# tmp2 = data[data['帖子id'].isnull()][['用户名','转发来源id']]
# tmp1.columns = ['source','target']
# tmp2.columns = ['source','target']
# post_edge = pd.concat([tmp1,tmp2])
post_edge['count_all'] = 1
post_edge = post_edge.groupby(['source','target'])['count_all'].count().reset_index()
# post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False)
edgeweightset = post_edge[['source','target','count_all']]
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
for k in range(len(edgeweightset_l)):
for j in range(edgeweightset.shape[1]):
edgeweightset_l[k].append(edgeweightset.iloc[k,j])
# print(i/len(edgeweightset_l))
if len(edgeweightset_l)==0: #没有传播链
post['closeness_centrality'] = 1
post['pagerank'] = 1
else:
g = nx.DiGraph()
g.add_weighted_edges_from(edgeweightset_l)
centrality = [nx.closeness_centrality(g),
nx.pagerank(g)]
results = []
nodes = g.nodes() # 提取网络中节点列表
for node in nodes: # 遍历所有节点,提取每个节点度中心性计算结果,并存储为[[节点1,结果],[节点2,结果],...]的形式
results.append([node,
centrality[0][node],
centrality[1][node]])
results = pd.DataFrame(results)
results.columns = ['node','closeness_centrality','pagerank']
post['closeness_centrality'] = results[results['node'] == results[results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]]['closeness_centrality'].iloc[0]
post['pagerank'] = results[results['node'] == results[results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]]['pagerank'].iloc[0]
#post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
#post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]
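# The post keeps the closeness centrality and pagerank of the most central node in the repost
# network (the node with the highest closeness centrality) as its network features.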
##3.传播链中的平均影响力shareCount
tmp = 0
for k in range(data[(data['传播层级']!='1') & (data['帖子id'].notnull())].shape[0]):
tmp = tmp + int(data[(data['传播层级']!='1') & (data['帖子id'].notnull())].shareCount.iloc[k])
if tmp == 0:
post['sub_shareCount'] = 0
else:
post['sub_shareCount'] = tmp/data[(data['传播层级']!='1') & (data['帖子id'].notnull())].shape[0]
##二、主贴文本
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
##文本特殊字符个数(http、@、#)
post['主贴http'] = post['发表内容'].iloc[0].count('http')
post['主贴at'] = post['发表内容'].iloc[0].count('@')
post['主贴tag'] = post['发表内容'].iloc[0].count('#')
##判断语言
tmp = post['发表内容'].iloc[0]
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
tmp = tmp.replace(ch,'')
if is_eng(tmp): ##主贴英文内容
post['语言'] = 0
text = post['发表内容'].iloc[0]
#text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
text = text[0:text.rfind("http")]
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
text = text.replace(ch,' ')
##文本长度
words = text.split(' ')
post['主贴长度'] = len(words)
##文本情感
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
post['emotion'] = emo.loc[0,0]
post['emotion_sub'] = emo.loc[1,0]
##文本词频
## 词频统计1:最大词频数
## 词频统计2:正文中出现两次及以上的词占比
items = getText_count_eng(text)
if items.shape==(0,0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1]>=2].shape[0]/items.shape[0]
## 词频统计3:全部大写词频
post['大写词频'] = getText_count_U(text)
elif is_chinese(tmp): ##主贴中文内容
post['语言'] = 1
text = post['发表内容'].iloc[0]
text = text[0:text.rfind("http")]
post['主贴长度'] = len(text)
post['emotion'] = (SnowNLP(text).sentiments-0.5)*2
post['emotion_sub'] = np.NaN
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]
##文本词频
## 词频统计1:标题中出现的词,在正文中出现最大词频
## 词频统计2:正文中出现两次及以上的词占比
items = getText_count_ch(text)
if items.shape==(0,0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1]>=2].shape[0]/items.shape[0]
## 词频统计3:全部大写词频
post['大写词频'] = np.NaN
else:
post['语言'] = np.NaN
post['主贴长度'] = np.NaN
post['emotion'] = np.NaN
post['emotion_sub'] = np.NaN
post['最大词频数'] = np.NaN
post['重复词汇占比'] = np.NaN
post['大写词频'] = np.NaN
##4.2传播链中的文本
sub_post = pd.DataFrame(data[(data['传播层级']!='1')&(data['帖子id'].notnull())][['发表内容','发表时间']])
sub_post['语言'] = np.NaN
sub_post['文本长度'] = np.NaN
sub_post['http'] = np.NaN
sub_post['at'] = np.NaN
sub_post['tag'] = np.NaN
sub_post['emotion'] = np.NaN
sub_post['emotion_sub'] = np.NaN
sub_post['diffdate'] = np.NaN
for k in range(sub_post.shape[0]):
##文本特殊字符个数(http、@、#)
sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
##时间差
d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k],"%Y-%m-%d %H:%M:%S")
base = datetime.datetime.strptime(post['发表时间'].iloc[0],"%Y-%m-%d %H:%M:%S")
# now = datetime.datetime.now()
sub_post['diffdate'].iloc[k] = (d1-base).days
##判断语言
tmp = sub_post['发表内容'].iloc[k]
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
tmp = tmp.replace(ch,'')
if is_eng(tmp): ##英文内容
sub_post['语言'].iloc[k] = 0
##文本长度
text = sub_post['发表内容'].iloc[k]
# text = "'America is collapsing and it's China's fault' is definitely a change of direction?"
text = text[0:text.rfind("http")]
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
text = text.replace(ch,' ')
words = text.split(' ')
sub_post['文本长度'].iloc[k] = len(words)
##情感
sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
sub_post['emotion'].iloc[k] = sub_emo.loc[0,0]
sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1,0]
elif is_chinese(tmp): ##中文内容
sub_post['语言'].iloc[k] = 1
##文本长度
text = sub_post['发表内容'].iloc[k]
text = text[0:text.rfind("http")]
sub_post['文本长度'].iloc[k] = len(text)
##情感
sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments-0.5)*2
sub_post['emotion_sub'].iloc[k] = np.NaN
else:
sub_post['语言'].iloc[k] = np.NaN
sub_post['文本长度'].iloc[k] = np.NaN
sub_post['emotion'].iloc[k] = np.NaN
sub_post['emotion_sub'].iloc[k] = np.NaN
if sub_post.shape[0] == 0:
post['有无传播内容'] = 0
else:
post['有无传播内容'] = 1
post['传播链语言均值'] = sub_post['语言'].mean()
post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
##emotion_sub取有值的均值
post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
post['传播链贴文http均值'] = sub_post['http'].mean()
post['传播链贴文at均值'] = sub_post['at'].mean()
post['传播链贴文tag均值'] = sub_post['tag'].mean()
post['diffdate均值'] = sub_post['diffdate'].mean()
##三、用户信息
##发帖用户
post = pd.merge(post,data_user,how='left',on='用户名')
##传播链用户
sub_user = pd.DataFrame(data[data['传播层级']!='1'][['用户名']])
sub_user = pd.merge(sub_user,data_user,how='left',on='用户名')
sub_user = sub_user.dropna()
post['nickName均值'] = sub_user['nickName'].mean()
post['fansCount均值'] = sub_user['fansCount'].mean()
post['likeCount均值'] = sub_user['likeCount'].mean()
post['postCount均值'] = sub_user['postCount'].mean()
post['otherInfo均值'] = sub_user['otherInfo'].mean()
postset = pd.concat([postset,post]).reset_index(drop=True)
postset = postset.fillna(0)
postset['emotion_degree'] = abs(postset['emotion'])
return postset
xlsx_path_po = r'假新闻数据输入\传播分析1209.xlsx'
xlsx_path_ne = r'假新闻数据输入\传播分析1220.xlsx'
data_po = pd.read_excel(xlsx_path_po, dtype="str")
data_ne = pd.read_excel(xlsx_path_ne, dtype="str")
data_user = pd.read_excel(r'假新闻数据输入\Twitter_Account.xlsx', dtype="str")
data_user = pre_user(data_user)
postset_po = post_related(data_po,data_user) ## 正面文件
postset_ne = post_related(data_ne,data_user) ## 负面文件
postset_po['y'] = 1
postset_ne['y'] = 0
postset = pd.concat([postset_po,postset_ne]).drop_duplicates().reset_index(drop = True)
features = postset[[
#'shareCount',
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
'主贴http', '主贴at', '主贴tag',
'主贴长度','emotion', 'emotion_degree',
'最大词频数', '重复词汇占比',#(中英文差异大)
#'有无传播内容',
'fansCount','likeCount', 'postCount',
#'sub_shareCount',
'fansCount均值', 'postCount均值', 'otherInfo均值'
#,'结果'
]]
target = pd.DataFrame(postset[postset.columns[-1]],columns=[postset.columns[-1]])
X_train, X_test, y_train, y_test = train_test_split(features, target,
test_size = 0.25, random_state = 123)
RF_model = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=0)
params = {"n_estimators":range(10,101,10)}
clf = GridSearchCV(estimator=RF_model, param_grid=params, cv=10)
clf.fit(X_train,y_train)
clf.best_params_
clf_predict = clf.predict(X_test)
joblib.dump(clf,r'F:\项目文件\情报\假新闻\fake_news_model.pkl')
clf = joblib.load(r'F:\项目文件\情报\假新闻\fake_news_model.pkl')
clf_predict = clf.predict(features)
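# Note: this re-predicts on the full feature set, which includes the training rows; a held-out
# evaluation would compare clf.predict(X_test) against y_test instead.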
# cm5 = pd.crosstab(clf_predict,target.y)
# sns.heatmap(cm5, annot = True, cmap = 'GnBu', fmt = 'd')
# plt.xlabel('Real')
# plt.ylabel('Predict')
# plt.show()
# accuracy_rate = sum(clf_predict == target.y) / len(target.y)
# target = pd.get_dummies(target)['y']
# sum((clf_predict == target) & (target ==1)) / sum(clf_predict==1)
# sum((clf_predict == target) & (target ==0)) / sum(clf_predict==0)
# print('模型的准确率为:\n',accuracy_rate)
# print('模型的评估报告:\n',metrics.classification_report(target, clf_predict))

BIN
log_util/__pycache__/set_logger.cpython-36.pyc

BIN
log_util/__pycache__/set_logger.cpython-38.pyc

33
log_util/set_logger.py

@ -0,0 +1,33 @@
#coding:utf8
import logging
import os
import sys
from logging.handlers import TimedRotatingFileHandler
import re
# cur_dir = os.path.dirname( os.path.abspath(__file__)) or os.getcwd()
# sys.path.append(cur_dir + '/log_util')
def set_logger(filename):
# 创建logger对象。传入logger名字
logger = logging.getLogger(filename)
# log_path = os.path.join(cur_dir, filename)
# 设置日志记录等级
logger.setLevel(logging.INFO)
# interval 滚动周期,
# when="MIDNIGHT", interval=1 表示每天0点为更新点,每天生成一个文件
# backupCount 表示日志保存个数
file_handler = TimedRotatingFileHandler(
filename=filename, when="MIDNIGHT",encoding="utf-8", interval=1, backupCount=3
)
# filename="mylog" suffix设置,会生成文件名为mylog.2020-02-25.log
file_handler.suffix = "%Y-%m-%d.log"
# extMatch是编译好正则表达式,用于匹配日志文件名后缀
# 需要注意的是suffix和extMatch一定要匹配的上,如果不匹配,过期日志不会被删除。
file_handler.extMatch = re.compile(r"^\d{4}-\d{2}-\d{2}.log$")
# 定义日志输出格式
file_handler.setFormatter(
logging.Formatter(
"[%(asctime)s] [%(process)d] [%(levelname)s] - %(module)s.%(funcName)s (%(filename)s:%(lineno)d) - %(message)s"
)
)
logger.addHandler(file_handler)
return logger
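# Minimal usage sketch (assumes a writable logs/ directory, as used elsewhere in this project):
# logger = set_logger('logs/results.log')
# logger.info('service started')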

0
logs/results.log

18
manage.py

@ -0,0 +1,18 @@
#!/usr/bin/env python
import os
import sys
import threading
from text_analysis.views import predict_news
import django
if __name__ == "__main__":
t = threading.Thread(target=predict_news, name='predict_news')
t.daemon = True
t.start()
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
django.setup()
from django.core.management import execute_from_command_line
execute_from_command_line(sys.argv)
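# Development entry point: the daemon thread above drains the task queue filled by the HTTP view,
# while execute_from_command_line serves requests (e.g. runserver); production deployment appears
# to go through uwsgi/wsgi.py instead (see start.sh).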

35
src.py

@ -0,0 +1,35 @@
#coding:utf8
import requests
def upload():
url="https://realtime.pdeepmatrix.com/apis/media/analysis/upload"
# 定义form-data参数
data = {
'fromLanguage': 'zh'
}
# 定义文件参数
files = {
'file': open('inputdata/lKTZNen6aak.mp4', 'rb')
}
response = requests.post(url, data=data, files=files)
print(response.text)
#结果—{"code":200,"message":"SUCCESS","data":"3a42ea9594b641c39e40d1497ca29be9"}
def getResults():
url="https://realtime.pdeepmatrix.com/apis/media/analysis/getResult"
# 定义参数
#'taskId': '3a42ea9594b641c39e40d1497ca29be9'
params = {
'taskId': '5ee948446ab64d5d8a1d92ecfa6c2c93'
}
response = requests.get(url, params=params)
# 打印响应结果
print(response.text)
#{"code":200,"message":"SUCCESS","data":{"sentences":[{"silence_duration":0,"end_time":5108,"speech_rate":150,"begin_time":1130,"channel_id":0,"emotion_value":"5.0","text":"视频解析、语音识别。"}]...
# upload()
getResults()
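# Intended flow (from the sample responses above): upload() returns a taskId in the "data" field,
# which is then passed to getResults() to fetch the ASR transcript once processing finishes.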

1
start.sh

@ -0,0 +1 @@
../../environment/python3.8/bin/uwsgi --ini uwsgi.ini --file wsgi.py --daemonize wsgi.log

1
stop_uwsgi.sh

@ -0,0 +1 @@
lsof -i:9030 |grep -v 'PID' | awk '{print $2}'| xargs kill -9

103
test.py

@ -0,0 +1,103 @@
#coding=utf8
import sys
import requests
import json
import time
# #url = 'http://0.0.0.0:5033'
# """
# url = 'http://20.0.2.6:5055/classify_event'
# url = 'http://20.0.2.6:5055/is_about_china'
# url = 'http://20.0.2.6:5055/associated_words'
# """
# url = 'http://127.0.0.1:9008/paper'
#
# # url_file ="http://172.18.1.130:9985/group33/default/20230415/09/15/1/“GF-1”影像质量评价及矿区土地利用分类潜力研究_陈明.docx"
# url_file="/opt/Project_kongtianyuan/inputfile/"
# filename = "“GF-1”影像质量评价及矿区土地利用分类潜力研究"
#
# data = {"url":url_file,"filename":filename}
# data_str = json.dumps(data)
#
# r = requests.post(url,data=str(data_str))
# print(r.text)
# # res =json.loads(r.text)
# # print(res)
raw_data={
"metadata":{
"address":"http://172.24.12.126:9013/ASR/",
"index":0,
"admin":{
"datasource":"2_任务提取"
},
"output":{
"output_type":"table",
"label_col":[
"ASR识别内容"
]
},
"input":{
"input_type":"text",
"label":[
"2_任务提取"
]
},
"user":{
"tag":""
}
},
"data":{
"1_文件上传":"{\"fileId\":\"53aa330b4e484c9bdeb7ff35e335a6f6\",\"fileName\":\"lKTZNen6aak.mp4\",\"filePath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"fileType\":\"mp4\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"ossPath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\"}",
"businessKey":"19615b029da477fb",
"2_任务提取":"[{\"fileId\":\"53aa330b4e484c9bdeb7ff35e335a6f6\",\"fileName\":\"lKTZNen6aak.mp4\",\"filePath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"fileType\":\"mp4\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\",\"ossPath\":\"/group33/default/20230828/15/48/1/lKTZNen6aak.mp4\"}]"
},
"created":1691004265000,
"module":"ASR",
"start_tag":"false",
"multi_branch":0,
"last_edit":1693417201000,
"next_app_id":[
{
"start_id":154,
"edge_id":75,
"end_id":155
}
],
"transfer_id":3,
"version":1,
"blueprint_id":4,
"scenes_id":5,
"scenario":{
"dataloss":1,
"autoCommitTriggerLast":1,
"maxErrors":3,
"autoCommit":1,
"freshVariables":1
},
"wait_condition":[
],
"scheduling":{
"interval":-1,
"type":"single"
},
"name":"ASR",
"businessKey":"19615b029da477fb",
"id":154,
"position":[
100,
200
],
"describe":"ASR识别"
}
allFile = raw_data["data"]["2_任务提取"]
currentFile = eval(allFile)
print(currentFile)
print(type(currentFile))
# filejson = json.loads(currentFile)
# file = currentFile["fileUrl"]
# fileName = currentFile["fileName"]
# print(file)
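# Note: "2_任务提取" holds a JSON array string, so json.loads(allFile) (as in the commented line
# above) is the safer equivalent of eval() for parsing it.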

0
text_analysis/__init__.py

BIN
text_analysis/__pycache__/__init__.cpython-36.pyc

BIN
text_analysis/__pycache__/__init__.cpython-38.pyc

BIN
text_analysis/__pycache__/cusException.cpython-38.pyc

BIN
text_analysis/__pycache__/read_config.cpython-38.pyc

BIN
text_analysis/__pycache__/settings.cpython-36.pyc

BIN
text_analysis/__pycache__/settings.cpython-38.pyc

BIN
text_analysis/__pycache__/urls.cpython-36.pyc

BIN
text_analysis/__pycache__/urls.cpython-38.pyc

BIN
text_analysis/__pycache__/views.cpython-36.pyc

BIN
text_analysis/__pycache__/views.cpython-38.pyc

BIN
text_analysis/__pycache__/wsgi.cpython-36.pyc

BIN
text_analysis/__pycache__/wsgi.cpython-38.pyc

108
text_analysis/bak/views.py_0226

@ -0,0 +1,108 @@
#coding:utf8
import os, sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka,tool
from text_analysis.tools import pred
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging=set_logger('logs/results.log')
import traceback
import queue
from text_analysis.cusException import userFile_Exception,chainFile_Exception
import requests
import time
from datetime import datetime
import os
import joblib
#任务队列
global task_queue
task_queue = queue.Queue()
@csrf_exempt
def fakeNewIdentification(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
task_queue.put(raw_data)
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def predict_news(dbConfig):
while True:
if task_queue.qsize() > 0:
try:
logging.info("取任务队列长度{}".format(task_queue.qsize()))
raw_data = task_queue.get()
logging.info("原始数据-{}".format(raw_data))
res = {"successCode": "1", "errorLog": "", "results": {}}
# 账号数据
userData = tool.mysqlData(raw_data, logging,"1",dbConfig)
# if not userData:
# raise userFile_Exception
logging.info("账号数据获取完毕!-长度{}".format(len(userData)))
# 传播链数据
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig)
if not postChain:
raise chainFile_Exception
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain)))
news=pred.predict_news(userData,postChain,logging)
# 结束标识
res['isLast'] = True
for i in range(len(news)):
row_dict = news.iloc[i].to_dict()
row_dict['pageType'] = 'fakeNewsPage'
# postId
row_dict['postId'] = userData[0]['postId']
res["results"] = json.dumps(row_dict,ensure_ascii=False)
raw_data["result"] = res
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data))
to_kafka.send_kafka(raw_data, logging)
except userFile_Exception:
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '用户数据为空'
res['results'] = json.dumps(results)
raw_data["result"] = res
logging.info("该条请求用户数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except chainFile_Exception:
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '关系链数据为空'
res['results'] = json.dumps(results)
raw_data["result"] = res
logging.info("该条请求关系链数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except:
res = {"successCode": "0", "errorLog": "", "results": {}}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = ""
res['results'] = json.dumps(results)
raw_data["result"] = res
raw_data["result"]["error"] = traceback.format_exc()
logging.info(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
else:
# 暂无任务,进入休眠
time.sleep(10)

115
text_analysis/bak/views.py_0607

@ -0,0 +1,115 @@
#coding:utf8
import os, sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka,tool
from text_analysis.tools import pred
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging=set_logger('logs/results.log')
import traceback
import queue
from text_analysis.cusException import userFile_Exception,chainFile_Exception
import requests
import time
from datetime import datetime
import os
import joblib
#任务队列
global task_queue
task_queue = queue.Queue()
@csrf_exempt
def fakeNewIdentification(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
task_queue.put(raw_data)
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def predict_news(dbConfig):
while True:
if task_queue.qsize() > 0:
try:
logging.info("取任务队列长度{}".format(task_queue.qsize()))
raw_data = task_queue.get()
logging.info("原始数据-{}".format(raw_data))
res = {"successCode": "1", "errorLog": "", "results": {}}
# 账号数据
userData = tool.mysqlData(raw_data, logging,"1",dbConfig)
# if not userData:
# raise userFile_Exception
logging.info("账号数据获取完毕!-长度{}".format(len(userData)))
# 传播链数据
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig)
if not postChain:
raise chainFile_Exception
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain)))
news=pred.predict_news(userData,postChain,logging)
# 结束标识
res['isLast'] = True
for i in range(len(news)):
row_dict = news.iloc[i].to_dict()
row_dict['pageType'] = 'fakeNewsPage'
# postId
row_dict['postId'] = userData[0]['postId']
res["results"] = json.dumps(row_dict,ensure_ascii=False)
res["status"] = 1
res["message"] = "成功"
raw_data["result"] = res
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data))
to_kafka.send_kafka(raw_data, logging)
except userFile_Exception:
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '用户数据为空'
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "用户数据为空"
raw_data["result"] = res
logging.info("该条请求用户数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except chainFile_Exception:
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '关系链数据为空'
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "关系链数据为空"
raw_data["result"] = res
logging.info("该条请求关系链数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except:
res = {"successCode": "0", "errorLog": "", "results": {}}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = ""
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "异常"
raw_data["result"] = res
raw_data["result"]["errorLog"] = traceback.format_exc()
logging.info(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
else:
# 暂无任务,进入休眠
time.sleep(10)

117
text_analysis/bak/views_20240807.py

@ -0,0 +1,117 @@
#coding:utf8
import os, sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka,tool
from text_analysis.tools import pred
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging=set_logger('logs/results.log')
import traceback
import queue
from text_analysis.cusException import userFile_Exception,chainFile_Exception
import requests
import time
from datetime import datetime
import os
import joblib
#任务队列
global task_queue
task_queue = queue.Queue()
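# In-memory FIFO shared by the HTTP view (producer) and the predict_news worker loop below
# (started as a background thread from manage.py).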
@csrf_exempt
def fakeNewIdentification(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
task_queue.put(raw_data)
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def predict_news(dbConfig):
while True:
if task_queue.qsize() > 0:
try:
logging.info("取任务队列长度{}".format(task_queue.qsize()))
raw_data = task_queue.get()
logging.info("原始数据-{}".format(raw_data))
res = {"successCode": "1", "errorLog": "", "results": {},"status":1,"message":"成功"}
# 账号数据
userData = tool.mysqlData(raw_data, logging,"1",dbConfig)
# if not userData:
# raise userFile_Exception
logging.info("账号数据获取完毕!-长度{}".format(len(userData)))
# 传播链数据
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig)
if not postChain:
raise chainFile_Exception
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain)))
news=pred.predict_news(userData,postChain,logging)
# 结束标识
res['isLast'] = True
for i in range(len(news)):
row_dict = news.iloc[i].to_dict()
row_dict['pageType'] = 'fakeNewsPage'
# postId
row_dict['postId'] = userData[0]['postId']
if i == len(news) - 1:
row_dict["isLast"]=1
res["results"] = json.dumps(row_dict,ensure_ascii=False)
res["status"] = 1
res["message"] = "成功"
raw_data["result"] = res
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data))
to_kafka.send_kafka(raw_data, logging)
except userFile_Exception:
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}, "status": 2,"message": "异常"}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '用户数据为空'
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "用户数据为空"
raw_data["result"] = res
logging.info("该条请求用户数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except chainFile_Exception:
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}, "status": 2,"message": "异常"}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '关系链数据为空'
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "关系链数据为空"
raw_data["result"] = res
logging.info("该条请求关系链数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except:
res = {"successCode": "0", "errorLog": "", "results": {}, "status": 2,"message": "异常"}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = ""
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "异常"
raw_data["result"] = res
raw_data["result"]["errorLog"] = traceback.format_exc()
logging.info(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
else:
# 暂无任务,进入休眠
time.sleep(10)

10
text_analysis/cusException.py

@ -0,0 +1,10 @@
# -*- coding:utf-8 -*-
class userFile_Exception(Exception):
def __str__(self):
return '用户数据为空'
class chainFile_Exception(Exception):
def __str__(self):
return '传播链条数据为空'
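# Raised by the prediction worker in text_analysis/views when the account or propagation-chain
# data comes back empty (the account check is commented out in the bundled bak versions).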

9
text_analysis/linshi.py

@ -0,0 +1,9 @@
import json
t={"a":1,"b":2,"c":3}
raw_data={}
res = {"successCode": "1", "errorLog": "", "results": {}}
res["results"] = json.dumps(t, ensure_ascii=False)
res["status"] = 1
res["message"] = "成功"
raw_data["result"] = res
print(raw_data)

BIN
text_analysis/model/bot_user.pkl

BIN
text_analysis/model/fake_news_model.pkl

10
text_analysis/read_config.py

@ -0,0 +1,10 @@
import configparser
#加载配置文件
def load_config():
configFile = './config.ini'
# 创建配置文件对象
con = configparser.ConfigParser()
# 读取文件
con.read(configFile, encoding='utf-8')
return con
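# Usage sketch (the section/option names below are assumptions; they depend on config.ini):
# con = load_config()
# host = con.get('mysql', 'host')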

14
text_analysis/request.py

@ -0,0 +1,14 @@
#coding:utf8
# import leida_ner_bert_crf
import requests
url = "http://172.18.1.166:9000/leidaduikang"
payload = "{\"inputUrl\":\"/home/bfdadmin/leidabert/Project_leidaduikang/AInputdata/content_100.xlsx\"}"
headers = {'user-agent': "vscode-restclient",'header name': "header value"}
response = requests.request("POST", url, timeout=1000000,data=payload, headers=headers)
print(response.text)

148
text_analysis/settings.py

@ -0,0 +1,148 @@
"""
Django settings for Zhijian_Project_WebService project.
Generated by 'django-admin startproject' using Django 1.8.
For more information on this file, see
https://docs.djangoproject.com/en/1.8/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.8/ref/settings/
"""
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = '330r)_!^qhd7$!w4)$y@4=p2bd*vlxf%4z(bx-fx-1i3txagvz'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = ['*']
# Application definition
INSTALLED_APPS = (
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
)
MIDDLEWARE = [
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
# 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'django.middleware.security.SecurityMiddleware',
]
ROOT_URLCONF = 'text_analysis.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'text_analysis.wsgi.application'
# Database
# https://docs.djangoproject.com/en/1.8/ref/settings/#databases
# DATABASES = {
# 'default': {
# 'ENGINE': 'django.db.backends.sqlite3',
# 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
# }
# }
# Internationalization
# https://docs.djangoproject.com/en/1.8/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_L10N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.8/howto/static-files/
STATIC_URL = '/static/'
# U_LOGFILE_SIZE = 1 * 1024 * 1024 # 单日志文件最大100M
# U_LOGFILE_COUNT = 7 # 保留10个日志文件
#
# LOGGING = {
# 'version': 1,
# 'disable_existing_loggers': True, # 禁用所有已经存在的日志配置
# 'filters': {
# 'require_debug_false': {
# '()': 'django.utils.log.RequireDebugFalse'
# }
# },
# 'formatters': {
# 'verbose': {
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] %(module)s %(process)d %(thread)d %(message)s'
# },
# 'simple': {
# 'format': '%(levelname)s %(asctime)s @ %(process)d %(message)s'
# },
# 'complete': {
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] (%(pathname)s/%(funcName)s:%(lineno)d) - %(message)s'
# },
# 'online': {
# 'format': '[%(levelname)s %(asctime)s @ %(process)d] - %(message)s'
# }
# },
# 'handlers': {
# 'text': {
# 'level': 'DEBUG',
# #'class': 'logging.handlers.RotatingFileHandler',
# 'class': 'logging.handlers.TimedRotatingFileHandler',
# 'when': 'H',
# 'interval': 1,
# 'backupCount': U_LOGFILE_COUNT,
# 'formatter': 'complete',
# 'filename': os.path.join(BASE_DIR, 'logs/resultNew.log').replace('\\', '/'),
# }
# },
# 'loggers': {
# 'text': {
# 'handlers': ['text'],
# 'level': 'DEBUG',
# 'propagate': False,
# }
# }
# }

90
text_analysis/src.py

@ -0,0 +1,90 @@
#coding:utf8
import joblib
#accountName:johnsonleung
def predict():
raw_data = {"user_file":{"accountId": "39234393", "accountName": "hello", "nickName": "Johnson Leung", "fansCount": 308,"likeCount": 92707,"postCount": 14237, "otherInfo": "{\"\"otherInfo\"\":\"\"{\"\"bio\"\": \"\"Huge}", "authentication": 0},
"post_file":{"count":1,"LikeCount":12,"CommentsCount":1,"ShareCount":1,"length":150,"tags":0,"https":0,"at":0,"diffdate":1}}
'''
Features expected in raw_data["post_file"]:
1. count
2. LikeCount
3. CommentsCount
4. ShareCount
5. length
6. tags: average number of "#" tags
7. https: number of "http(s)" occurrences
8. at: number of "@" mentions
9. diffdate: max(publish time, crawl time) of post A minus A's publish time
'''
#用户数据
user_data=[]
try:
user_data_otherInfo_1 = 0 if raw_data["user_file"]["otherInfo"].strip() == "" else 1
except:
user_data_otherInfo_1=0
try:
user_data_nickName_2 = 0 if raw_data["user_file"]["nickName"].strip() == "" else 1
except:
user_data_nickName_2=0
try:
user_data_fansCount_3 = int(raw_data["user_file"]["fansCount"])
except:
user_data_fansCount_3=0
try:
user_data_likeCount_4=int(raw_data["user_file"]["likeCount"])
except:
user_data_likeCount_4=0
try:
user_data_postCount_5=int(raw_data["user_file"]["postCount"])
except:
user_data_postCount_5=0
try:
user_data_authentication_6=int(raw_data["user_file"]["authentication"])
except:
user_data_authentication_6=0
user_data.extend([user_data_otherInfo_1,user_data_nickName_2,user_data_fansCount_3,user_data_likeCount_4,user_data_postCount_5,user_data_authentication_6])
#帖子数据
post_data=[]
try:
post_data_count_1 = int(raw_data["post_file"]["count"])
except:
post_data_count_1=0
try:
post_data_LikeCount_2 = int(raw_data["post_file"]["LikeCount"])
except:
post_data_LikeCount_2=0
try:
post_data_CommentsCount_3 = int(raw_data["post_file"]["CommentsCount"])
except:
post_data_CommentsCount_3=0
try:
post_data_ShareCount_4 = int(raw_data["post_file"]["ShareCount"])
except:
post_data_ShareCount_4=0
try:
post_data_length_5 = int(raw_data["post_file"]["length"])
except:
post_data_length_5=0
try:
post_data_tags_6 = int(raw_data["post_file"]["tags"])
except:
post_data_tags_6=0
try:
post_data_https_7 = int(raw_data["post_file"]["https"])
except:
post_data_https_7=0
try:
post_data_at_8 = int(raw_data["post_file"]["at"])
except:
post_data_at_8=0
try:
post_data_diffdate_9 = int(raw_data["post_file"]["diffdate"])
except:
post_data_diffdate_9=0
post_data.extend([post_data_count_1,post_data_LikeCount_2,post_data_CommentsCount_3,post_data_ShareCount_4,post_data_length_5,post_data_tags_6,post_data_https_7,post_data_at_8,post_data_diffdate_9])
features=[user_data+post_data]
bot_user = joblib.load("model/bot_user.pkl") # 加载训练好的模型
result=bot_user.predict(features)
print(result)
# 参数顺序[['otherInfo', 'nickName', 'fansCount', 'likeCount','postCount', 'authentication', 'count', 'LikeCount', 'CommentsCount', 'ShareCount','length', 'tags', 'https', 'at', 'diffdate']]
predict()

BIN
text_analysis/tools/__pycache__/cusException.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/mysql_helper.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/pred.cpython-38.pyc

BIN
text_analysis/tools/__pycache__/process.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/to_kafka.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/to_kafka.cpython-38.pyc

BIN
text_analysis/tools/__pycache__/tool.cpython-36.pyc

BIN
text_analysis/tools/__pycache__/tool.cpython-38.pyc

BIN
text_analysis/tools/__pycache__/tools.cpython-36.pyc

456
text_analysis/tools/bak/pred.py

@ -0,0 +1,456 @@
#coding:utf8
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
# import tool
from tqdm import tqdm
import os,sys
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import datetime
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
import joblib
def pre_user(data_user):
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
data_user = data_user.dropna()
data_user = data_user.drop_duplicates().reset_index(drop = True)
data_user['fansCount'] = data_user['fansCount'].astype(int)
data_user['likeCount'] = data_user['likeCount'].astype(int)
data_user['postCount'] = data_user['postCount'].astype(int)
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
return data_user
def getText_count_eng(txt):
"""英文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '\t':
counts[word] = counts.get(word,0) + 1 #统计字数
items = pd.DataFrame(list(counts.items()))
return items
def getText_count_ch(txt):
"""中文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz': #将文本中特殊符号数字删除
txt = txt.replace(ch,"")
words = jieba.lcut(txt)
counts = {}
for word in words:
counts[word] = counts.get(word,0) + 1
items = list(counts.items())
fin_items = []
for item in items:
if len(item[0])>=2:
fin_items.append(item)
fin_items = pd.DataFrame(fin_items)
return fin_items
def getText_count_U(txt):
"""统计英文大写词频"""
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '\t':
if word.isupper(): #统计大写
counts[word] = counts.get(word,0) + 1 #统计字数
items = pd.DataFrame(list(counts.items())) #将字典类型转换成列表类型
if items.shape == (0,0):
out = 0
else:
out = sum(items[1])
return out
def is_chinese(strs):
"""判断一个unicode是否是汉字/英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
return False
return True
def is_eng(strs):
"""判断一个unicode是否是英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar):
return False
return True
# def pre_user(data_user):
# data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
# data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
# data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
# data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
# data_user = data_user.dropna()
# data_user = data_user.drop_duplicates().reset_index(drop = True)
# data_user['fansCount'] = data_user['fansCount'].astype(int)
# data_user['likeCount'] = data_user['likeCount'].astype(int)
# data_user['postCount'] = data_user['postCount'].astype(int)
# data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
# return data_user
def post_related(df, data_user,logging):
# postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
# 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
# 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
# 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容',
# '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
# '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差',
# '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值',
# '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差'])
postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id','所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
'语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
'emotion', 'emotion_sub', '最大词频数', '重复词汇占比'])
for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop=True):
data = df[df['所属帖子id'] == post_id].reset_index(drop=True)
data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
'shareCount', 'url']
data = data.drop_duplicates()
post = data[data['传播层级'] == 1].head(1)
### 一、新闻传播--贴文网络
##1.layer/shape/degree
post['layer'] = int(max(data['传播层级']))
post['shape'] = data.shape[0] - 1
post['degree'] = data[data['传播层级'] == 2].shape[0]
##2.整体网络测度(贴文网络测度)
###2.1把转发来源id对应到转发来源用户
tmp_zfyh = pd.merge(data[data['传播层级'] != 1]['转发来源id'].drop_duplicates(),
data[data['帖子id'].notnull()][['帖子id', '用户名']],
left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
post_edge = data.copy()
post_edge = data[data['传播层级'] != 1][['用户名', '转发来源用户名']]
post_edge.columns = ['source', 'target']
post_edge['count_all'] = 1
post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
# post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False)
edgeweightset = post_edge[['source', 'target', 'count_all']]
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
for k in range(len(edgeweightset_l)):
for j in range(edgeweightset.shape[1]):
edgeweightset_l[k].append(edgeweightset.iloc[k, j])
# print(i/len(edgeweightset_l))
if len(edgeweightset_l) == 0: # 没有传播链
post['closeness_centrality'] = 1
post['pagerank'] = 1
else:
g = nx.DiGraph()
g.add_weighted_edges_from(edgeweightset_l)
centrality = [nx.closeness_centrality(g),
nx.pagerank(g)]
results = []
nodes = g.nodes() # 提取网络中节点列表
for node in nodes: # 遍历所有节点,提取每个节点度中心性计算结果,并存储为[[节点1,结果],[节点2,结果],...]的形式
results.append([node,
centrality[0][node],
centrality[1][node]])
results = pd.DataFrame(results)
results.columns = ['node', 'closeness_centrality', 'pagerank']
post['closeness_centrality'] = results[results['node'] == results[
results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][
'closeness_centrality'].iloc[0]
post['pagerank'] = results[results['node'] ==
results[results['closeness_centrality'] == max(results['closeness_centrality'])][
'node'].iloc[0]]['pagerank'].iloc[0]
# post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
# post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]
#——————————hh——————————————
# 特征未使用
# ##3.传播链中的平均影响力shareCount
# tmp = 0
# for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
# tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
# if tmp == 0:
# post['sub_shareCount'] = 0
# else:
# post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]
#————————————————————————
##二、主贴文本
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
##文本特殊字符个数(http、@、#)
# logging.info(post)
post['主贴http'] = post['发表内容'].iloc[0].count('http')
post['主贴at'] = post['发表内容'].iloc[0].count('@')
post['主贴tag'] = post['发表内容'].iloc[0].count('#')
##判断语言
tmp = post['发表内容'].iloc[0]
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
tmp = tmp.replace(ch, '')
if is_eng(tmp): ##主贴英文内容
post['语言'] = 0
text = post['发表内容'].iloc[0]
# text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
text = text[0:text.rfind("http")]
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
text = text.replace(ch, ' ')
##文本长度
words = text.split(' ')
post['主贴长度'] = len(words)
##文本情感
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
post['emotion'] = emo.loc[0, 0]
post['emotion_sub'] = emo.loc[1, 0]
##文本词频
## 词频统计1:最大词频数
## 词频统计2:正文中出现两次及以上的词占比
items = getText_count_eng(text)
if items.shape == (0, 0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
## 词频统计3:全部大写词频
post['大写词频'] = getText_count_U(text)
elif is_chinese(tmp): ##主贴中文内容
post['语言'] = 1
text = post['发表内容'].iloc[0]
text = text[0:text.rfind("http")]
post['主贴长度'] = len(text)
post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2
post['emotion_sub'] = np.NaN
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]
##文本词频
## 词频统计1:标题中出现的词,在正文中出现最大词频
## 词频统计2:正文中出现两次及以上的词占比
items = getText_count_ch(text)
if items.shape == (0, 0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
## 词频统计3:全部大写词频
post['大写词频'] = np.NaN
else:
post['语言'] = np.NaN
post['主贴长度'] = np.NaN
post['emotion'] = np.NaN
post['emotion_sub'] = np.NaN
post['最大词频数'] = np.NaN
post['重复词汇占比'] = np.NaN
post['大写词频'] = np.NaN
# ##4.2传播链中的文本
# sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']])
# sub_post['语言'] = np.NaN
# sub_post['文本长度'] = np.NaN
# sub_post['http'] = np.NaN
# sub_post['at'] = np.NaN
# sub_post['tag'] = np.NaN
# sub_post['emotion'] = np.NaN
# sub_post['emotion_sub'] = np.NaN
# sub_post['diffdate'] = np.NaN
#
# for k in range(sub_post.shape[0]):
# ##文本特殊字符个数(http、@、#)
# sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
# sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
# sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
#
# ##时间差
# d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S")
# base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S")
#
# # now = datetime.datetime.now()
# sub_post['diffdate'].iloc[k] = (d1 - base).days
#
# ##判断语言
# tmp = sub_post['发表内容'].iloc[k]
# for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
# tmp = tmp.replace(ch, '')
#
# if is_eng(tmp): ##英文内容
#
# sub_post['语言'].iloc[k] = 0
#
# ##文本长度
# text = sub_post['发表内容'].iloc[k]
# # text = "'America is collapsing and it's China's fault' is definitely a change of direction?"
# text = text[0:text.rfind("http")]
# for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
# text = text.replace(ch, ' ')
# words = text.split(' ')
# sub_post['文本长度'].iloc[k] = len(words)
# ##情感
# sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
# sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0]
# sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0]
#
# elif is_chinese(tmp): ##中文内容
#
# sub_post['语言'].iloc[k] = 1
#
# ##文本长度
# text = sub_post['发表内容'].iloc[k]
# text = text[0:text.rfind("http")]
# sub_post['文本长度'].iloc[k] = len(text)
# ##情感
# sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2
# sub_post['emotion_sub'].iloc[k] = np.NaN
#
# else:
#
# sub_post['语言'].iloc[k] = np.NaN
# sub_post['文本长度'].iloc[k] = np.NaN
# sub_post['emotion'].iloc[k] = np.NaN
# sub_post['emotion_sub'].iloc[k] = np.NaN
#
# if sub_post.shape[0] == 0:
# post['有无传播内容'] = 0
# else:
# post['有无传播内容'] = 1
#
# post['传播链语言均值'] = sub_post['语言'].mean()
# post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
# post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
#
# ##emotion_sub取有值的均值
# post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
#
# post['传播链贴文http均值'] = sub_post['http'].mean()
#
# post['传播链贴文at均值'] = sub_post['at'].mean()
#
# post['传播链贴文tag均值'] = sub_post['tag'].mean()
#
# post['diffdate均值'] = sub_post['diffdate'].mean()
##三、用户信息
##发帖用户
post = pd.merge(post, data_user, how='left', on='用户名')
##传播链用户
sub_user = pd.DataFrame(data[data['传播层级'] != 1][['用户名']])
sub_user = pd.merge(sub_user, data_user, how='left', on='用户名')
sub_user = sub_user.dropna()
post['nickName均值'] = sub_user['nickName'].mean()
post['fansCount均值'] = sub_user['fansCount'].mean()
post['likeCount均值'] = sub_user['likeCount'].mean()
post['postCount均值'] = sub_user['postCount'].mean()
post['otherInfo均值'] = sub_user['otherInfo'].mean()
postset = pd.concat([postset, post]).reset_index(drop=True)
postset = postset.fillna(0)
postset['emotion_degree'] = abs(postset['emotion'])
return postset
def predict_news(userData,postChain,logging):
data_po = pd.DataFrame(postChain).replace('', np.nan)
data_po.columns = ['id','层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url','topicId']
data_po=data_po[['层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url']]
if not userData:
columns=['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo']
data_user=pd.DataFrame(columns=columns)
else:
data_user = pd.DataFrame(userData).replace('', np.nan)
data_user.columns = ['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo']
data_user=data_user[['accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo']]
data_user = pre_user(data_user)
#data_user=dataframe[@XHNews,1,878,1178,938,1]
#data_user.columns=['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
postset_po = post_related(data_po,data_user,logging) ## 正面文件
features = postset_po[[
#'shareCount',
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
'主贴http', '主贴at', '主贴tag',
'主贴长度','emotion', 'emotion_degree',
'最大词频数', '重复词汇占比',#(中英文差异大)
#'有无传播内容',
'fansCount','likeCount', 'postCount',
#'sub_shareCount',
'fansCount均值', 'postCount均值', 'otherInfo均值'
]]
clf = joblib.load(par_dir+'/model/fake_news_model.pkl')
clf_predict = clf.predict(features)
res=pd.DataFrame(clf_predict)
res.columns=['假新闻预测结果']
res['recognitionResult'] = res['假新闻预测结果'].apply(lambda x: '假新闻' if x == 1 else '真新闻')
result = pd.concat([postset_po, res], axis=1)
return result
if __name__=="__main__":
print(par_dir)
# user={
# "topicId":1209,
# "host":"172.26.28.30",
# "user":"crawl",
# "passwd":"crawl123",
# "db":"test",
# "port":3306,
# "table":"TwitterAccount"
# }
# userData = tool.mysqlData(user,"")
# # logging.info("账号数据获取完毕!")
# # 传播链数据
# # post = raw_data["metadata"]["admin"]["Twitter_chain"]
# post={
# "topicId":1209,
# "host":"172.26.28.30",
# "user":"crawl",
# "passwd":"crawl123",
# "db":"test",
# "port":3306,
# "table":"Twitter_chain"
# }
# postChain = tool.mysqlData(post, "")
# # logging.info("传播链数据获取完毕!")
# predict_news(userData,postChain,"")

220
text_analysis/tools/bak/tool.py

@ -0,0 +1,220 @@
#coding:utf8
import re
import pymysql
import pandas as pd
import numpy as np
import networkx as nx
import traceback
import json
from jsonpath_ng import jsonpath, parse
def get_taskId(raw_data):
taskid = raw_data["metadata"]["admin"]["taskId"]
all_result = raw_data['data']
param_split = taskid.split(":")
datasourcestr = all_result[param_split[0]]
datasource = json.loads(datasourcestr)
# 创建 JsonPath 表达式对象
expr = parse(param_split[1])
# 使用表达式来选择 JSON 元素
match = [match.value for match in expr.find(datasource)]
val = match[0]
return val
def mysqlData(raw_data,logging,dataTag):
result=''
taskId = get_taskId(raw_data)
if dataTag=='1':
table="tw_account"
else:
table="tw_deep"
inputdata=raw_data["metadata"]["admin"]
try:
db = pymysql.connect(host=inputdata["Host"], user=inputdata["User"], passwd=inputdata["Password"],
db=inputdata["Database"], port=inputdata["Port"], charset='utf8',cursorclass=pymysql.cursors.DictCursor, connect_timeout=30)
db.ping(reconnect=True)
cursor = db.cursor()
sql="SELECT * FROM {} WHERE taskId={}".format(table,taskId)
cursor.execute(sql)
result = cursor.fetchall()
db.commit()
cursor.close()
db.close()
except:
logging.info("专题关系数据查询失败!")
logging.info(traceback.format_exc())
return result
def get_replyData(data):
reply=pd.DataFrame(data)
reply = reply.drop_duplicates().reset_index(drop=True) # 去重
reply=reply[['ReviewerAccountId', 'PostAccountId']]
# reply.columns = ['ReviewerAccountId', 'ReviewerAccountName', 'PostAccountId', 'PostAccountName',
# 'ShareCount', 'LikeCount', 'CommentCount', 'CommentTime']
reply = reply[['ReviewerAccountId', 'PostAccountId']]
reply['ReviewerAccountId'] = reply['ReviewerAccountId'].astype(str)
reply['PostAccountId'] = reply['PostAccountId'].astype(str)
reply = reply.groupby(['ReviewerAccountId', 'PostAccountId']).size().reset_index()
# user_net_df = user_net(reply) ##SNA数据清洗
edgeweightset = reply.fillna(0)
edgeweightset.columns = ['source', 'target', 'count']
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
for i in range(len(edgeweightset_l)):
for j in range(edgeweightset.shape[1]):
edgeweightset_l[i].append(edgeweightset.iloc[i, j])
g = nx.DiGraph()
g.add_weighted_edges_from(edgeweightset_l)
degree = [g.degree(),
g.in_degree(),
g.out_degree()]
centrality = [nx.degree_centrality(g), # 计算图 g 中每个节点的度中心性。度中心性是指节点的度(与其他节点相连的边的数量)与图中节点总数的比值。
nx.closeness_centrality(g), # 计算图 g 中每个节点的接近中心性。接近中心性是指节点到其他节点的平均最短路径长度的倒数。
nx.pagerank(g), # 计算图 g 中每个节点的 PageRank 值。PageRank 是一种用于评估网页重要性的算法,也可以应用于其他网络中的节点重要性评估。
nx.clustering(g)] # 计算图 g 中每个节点的聚集系数。聚集系数是指节点的邻居之间存在连接的概率。
#把主贴相关信息拿出来
tmp=edgeweightset["target"].values
node_list = []
nodes = g.nodes() # 提取网络中节点列表
for node in nodes:
if node not in tmp:
continue
node_list.append([node,
degree[0][node],
degree[1][node],
degree[2][node],
centrality[0][node],
centrality[1][node],
centrality[2][node],
centrality[3][node]])
node_list = pd.DataFrame(node_list)
node_list.columns = ['Id', 'degree', 'in_degree', 'out_degree',
'degree_centrality', 'closeness_centrality', 'pagerank', 'clustering']
node_list['user_flag_infl'] = 0
node_list['user_flag_act'] = 0
node_list.loc[node_list['out_degree'] > np.percentile(node_list['out_degree'], 95), 'user_flag_infl'] = 1
node_list.loc[(node_list['in_degree'] > np.percentile(node_list['in_degree'], 90)) &
(node_list['closeness_centrality'] > np.percentile(node_list['closeness_centrality'], 50)), 'user_flag_act'] = 1
node_dic=node_list.set_index('Id')[['degree', 'in_degree','out_degree','degree_centrality','closeness_centrality','pagerank','clustering']].T.to_dict()
return node_dic
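# Illustrative usage sketch (added for clarity; assumes a minimal reply list -- real rows come
# from the reply table and carry more fields):
#   data = [{"ReviewerAccountId": "u1", "PostAccountId": "u3"},
#           {"ReviewerAccountId": "u2", "PostAccountId": "u3"}]
#   get_replyData(data)
#   # -> {"u3": {"degree": 2, "in_degree": 2, "out_degree": 0, "degree_centrality": ...,
#   #            "closeness_centrality": ..., "pagerank": ..., "clustering": ...}}
#   # i.e. metrics are returned only for accounts that appear as reply targets.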
def get_content(inputdata,logging):
"""
:param inputdata:json数据
:return: prompt及其他参数
"""
res={}
admin=inputdata["metadata"]["admin"]
data=inputdata["data"]
prompt=admin["prompt"]
if_user=re.findall("{{(.*)}}",prompt)
if_data=re.findall("@@(.*)@@",prompt)
if if_user != []:
user_data=inputdata["metadata"]["user"]
if if_user[0] in user_data.keys():
tmp=user_data[if_user[0]]
prompt=re.sub("{{(.*)}}",tmp,prompt)
if if_data!=[] and if_data[0] in data.keys():
tmp1=data[if_data[0]]
prompt=re.sub("@@(.*)@@",tmp1,prompt)
res["prompt"]=prompt
res["authorization"]=admin["authorization"]
res["model"]=admin["model"]
res["temperature"]=admin["temperature"]
res["authorization"]=admin["authorization"]
res["top_p"]=admin["top_p"]
res["n"]=admin["n"]
return res
if __name__=="__main__":
inputdata={
"metadata":{
"output":{
"output_type":"table",
"label_col":[
"软件著作抽取结果"
]
},
"input":{
"input_type":"text",
"label":[
"7_软件著作过滤器"
]
},
"address":"http://172.18.1.181:9011/chatGpt/",
"admin":{
"authorization":"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",
"top_p":"1",
"user_input":[
{
"keyname":"tag",
"keydesc":""
}
],
"temperature":"0.2",
"model":"gpt-3.5-turbo-16k",
"prompt":"请在下面这句话中提取出:证书号、软件名称、著作权人,以json格式输出,找不到的字段赋值为空字符串,不要有多余的文字输出,只输出json结构。@@7_软件著作过滤器@@",
"n":"1"
},
"index":1
},
"data":{
"1_项目文件上传":"[{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/05/1/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1-基于时间序列遥感 影像洪涝检测系统.jpg\",\"fileId\":\"cd6592f0389bb1da25afbb44901f9cde\",\"fileName\":\"1-基于时间序列遥感 影像洪涝检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/08/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/1/3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\",\"fileId\":\"944eec1cf98f216ea953459dac4dd505\",\"fileName\":\"3-基于时空模型的遥感时间序列森林火灾检测系统.jpg\" },{ \"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/09/1/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileType\":\"jpg\", \"filePath\":\"/软件著作/4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\",\"fileId\":\"eb378cb9ee914323f601500378dfad76\",\"fileName\":\"4-基于隐马尔可夫模型的遥感时间序列分类系统.jpg\" }]",
"2_文件分类信息":"{\"软件著作\":4}",
"3_OCR识别内容":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}",
"businessKey":"185aef3b1c810799a6be8314abf6512c",
"7_软件著作过滤器":"{\"content\":\" 22222222222222222222222222222222222222222222222222\\n中华人民共和国国家版权局\\n计算机软件著作权登记证书\\n证书号:软著登字第1623261号\\n软件名称:\\n基于遥感影像的快速变化检测系统\\nV1.0\\n著作权人:中国科学院遥感与数字地球研究所\\n开发完成日期:2016年08月01日\\n首次发表日期:未发表\\n权利取得方式:原始取得\\n权利范围:全部权利\\n登记号:2017SR037977\\n根据《计算机软件保护条例》和《计算机软件著作权登记办法》的\\n规定,经中国版权保护中心审核,对以上事项予以登记\\n计算机软件著作权\\n登记专用章\\n2017年02月10日\\nNo.01433672\",\"fileId\":\"338847e34904fa96e8834cb220667db8\",\"fileName\":\"2-基于遥感影像的快速变化检测系统.jpg\",\"filePath\":\"/软件著作/2-基于遥感影像的快速变化检测系统.jpg\",\"fileType\":\"jpg\",\"fileUrl\":\"http://172.18.1.130:9985/group33/default/20230816/16/06/1/2-基于遥感影像的快速变化检测系统.jpg\",\"pageNum\":1}"
},
"created":1691004265000,
"module":"OCR",
"start_tag":"false",
"last_edit":1692464331000,
"next_app_id":[
{
"start_id":86,
"edge_id":49,
"end_id":90
}
],
"transfer_id":11,
"blueprint_id":3,
"scenes_id":3,
"scenario":{
"dataloss":1,
"autoCommitTriggerLast":1,
"maxErrors":3,
"autoCommit":1,
"freshVariables":1
},
"wait_condition":[
],
"scheduling":{
"interval":-1,
"type":"single"
},
"name":"软件著作抽取",
"businessKey":"185aef3b1c810799a6be8314abf6512c",
"id":86,
"describe":"软件著作抽取"
}
a=get_content(inputdata,"")
print(a)

25
text_analysis/tools/cusException.py

@@ -0,0 +1,25 @@
# -*- coding:utf-8 -*-
class pt_v_Exception(Exception):
def __str__(self):
return 'pt规则未在缓存中命中'
class dt_v_Exception(Exception):
def __str__(self):
return 'dt规则未在缓存中命中'
class dt_v_attr_Exception(Exception):
def __str__(self):
return 'dt_attrcode规则未在缓存中命中'
class dt_v_codeid_Exception(Exception):
def __str__(self):
return 'dt_codeid规则未在缓存中命中'
class dt_v_senti_Exception(Exception):
def __str__(self):
return 'dt_senti规则未在缓存中命中'
class dt_v_res_Exception(Exception):
def __str__(self):
return 'dt_resverse规则未在缓存中命中'

67
text_analysis/tools/kakfa_util.py

@@ -0,0 +1,67 @@
# coding=utf-8
from kafka import KafkaProducer
from kafka import KafkaConsumer
import json
import traceback
import time
import traceback
import datetime
import queue
from logUtil import get_logger
logger = get_logger("crawlWebsrcCode.log")
"""
kafka
"""
def kafkaProduce(topic,resultData,address):
producer = KafkaProducer(bootstrap_servers = '{}'.format(address),request_timeout_ms=120000)
topics = topic.split(',')
for tc in topics:
future = producer.send(tc,resultData)
result = future.get(timeout=60)
producer.flush()
print (result)
#写入文件
def writeTxt(filePath,result):
f = open(filePath,'a',encoding='utf-8')
f.write(result.encode('utf-8').decode('unicode_escape')+'\n')
f.close()
def KafkaConsume(topic,address,group_id,task_queue,logger):
'''
kafka
:param topic:
:param address:
:param group_id:
:param task_queue:
:return:
'''
try:
consumer = KafkaConsumer(topic, auto_offset_reset='earliest',fetch_max_bytes=1024768000,fetch_max_wait_ms=5000, bootstrap_servers=address,group_id = group_id)
i = 1
while True:
for msg in consumer:
print('第{}条数据'.format(i))
data = str(msg.value, encoding = "utf-8")
print(data)
task_queue.put(data)
i = i+1
else:
print('暂无任务------')
time.sleep(10)
except Exception as e:
print('kafka未知异常----')
traceback.print_exc()
def writeTxt(filePath,result):
f = open(filePath,'a')
f.write(result+'\n')
f.close()
if __name__ == '__main__':
# resultData = {'id': '中文', 'url': 'https://zh.wikipedia.org/zh/%E8%94%A1%E8%8B%B1%E6%96%87'}
# kafkaProduce('test', json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),'121.4.41.194:8008')
task_queue = queue.Queue()
KafkaConsume('fq-Taobao-eccontent','39.129.129.172:6666,39.129.129.172:6668,39.129.129.172:6669,39.129.129.172:6670,39.129.129.172:6671','news_sche_8',task_queue,logger)
# KafkaConsume('zxbnewstopic','120.133.14.71:9992','group3',task_queue,logger)

338
text_analysis/tools/mysql_helper.py

@@ -0,0 +1,338 @@
# coding:utf8
import os, sys
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
import re
# from log_util.set_logger import set_logger
# logging = set_logger('logs/error.log')
import pymysql.cursors
import traceback
def mysqlConn(data,logging):
res={"successCode":"1","errorLog":"","results":""}
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "SHOW TABLES"
cursor.execute(sql)
tables = cursor.fetchall()
if tables:
table_names = list(map(lambda x: list(x.values())[0], tables))
res["results"] = table_names
else:
res["successCode"] = "0"
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def getTableColumnNames(data,logging):
res={"successCode":"1","errorLog":"","results":""}
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "DESCRIBE "+p_table
cursor.execute(sql)
tables = cursor.fetchall()
if tables:
table_names = list(map(lambda x: x['Field'], tables))
res["results"] = table_names
else:
res["successCode"] = "0"
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlInsert(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
p_columnName=data["columnName"]
cN='('+','.join(p_columnName)+') '
p_values=data["values"]
val=tuple(p_values)
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "insert into " + p_table + cN + "values ("+ ','.join(['%s'] * len(val)) + ")"
cursor.execute(sql,val)
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlUpdate(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
# p_set=data["Set"]
p_set=get_updateSet(input)
# where=process_where(data["Filter"])
where=get_filter(data["Filter"])
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "UPDATE " + p_table + p_set + where
print(sql)
cursor.execute(sql)
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlExecute(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
execute=data["Execute"]
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
cursor.execute(execute)
if 'select' in execute.lower():
result = cursor.fetchall()
res["results"]=json.dumps(result,ensure_ascii=False)
else:
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
# def process_where(data):
# '''
# 组装where
# :param data: data["Filter"],{"key":"age","value":"20","operator":">"},{"logicalSymbol":"and"},{"key":"weight","value":"50","operator":"<"}
# :return: WHERE age>20 and weight<50
# '''
# if data=="" or data==[]:
# return ""
# where = " WHERE "
# for line in data:
# if "key" in line.keys():
# val = line["value"]
# if isinstance(val, str):
# val = "\'" + val + "\'"
# tmp = str(line["key"]) + " " + line["operator"] + " " + str(val)
# where += tmp
# else:
# where += " " + line["logicalSymbol"] + " "
# return where
#
# def process_filter(data):
# '''
# 组装key,value,operator
# :param data: data["Filter"],{"key":"age",value:"20","operator":"="}
# :return: age=20
# '''
# if data=="" or data==[]:
# return ""
# res=data["key"]+" "+data["operator"]+" "+data["value"]
# return res
def get_updateSet(input):
metadata=input["metadata"]
user=metadata["user"]
sets=metadata["admin"]["Set"]
res=[]
for line in sets:
part=line.split("=")
tmp = []
for p in part:
user_match=re.findall('##(.*?)##', p)
if user_match!=[]:
tmp.append(user[user_match[0]])
res.append(str(tmp[0])+"="+str(tmp[1]))
result=" SET "+",".join(res)
return result
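# Illustrative example (added for clarity): with Set = ["##tag1##=##value1##","##tag2##=##value2##"]
# and metadata["user"] = {"tag1":"age","value1":2,"tag2":"weight","value2":100} -- the layout the
# function expects -- get_updateSet returns " SET age=2,weight=100".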
def get_filter(data):
if "OR" not in data.keys():
return ""
op_or=data["OR"]
res = ""
if len(op_or) == 1:
tmp = []
line = op_or[0]["AND"]
for single_line in line:
val = single_line["value"]
if isinstance(val, str):
val = "\'" + val + "\'"
tmp.append(str(single_line["key"]) + single_line["operator"] + str(val))
if single_line != line[-1]:
tmp.append("and")
res = " WHERE "+" ".join(tmp)
elif len(op_or) > 1:
tmp = []
for single_and in op_or:
line = single_and["AND"]
for sigle_line in line:
val = sigle_line["value"]
if isinstance(val, str):
val = "\'" + val + "\'"
tmp.append(str(sigle_line["key"]) + sigle_line["operator"] + str(val))
if sigle_line != line[-1]:
tmp.append("and")
if single_and != op_or[-1]:
tmp.append("or")
res = " WHERE "+" ".join(tmp)
return res
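# Illustrative example (added for clarity): for the Filter value used in __main__ below,
# {"OR":[{"AND":[{"key":"age","value":20,"operator":">"},{"key":"weight","value":50,"operator":"<"}]},
#        {"AND":[{"key":"name","value":"ff","operator":"="}]}]}
# get_filter returns " WHERE age>20 and weight<50 or name='ff'", relying on SQL's AND-before-OR precedence.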
def mysqlQuery(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
p_columnNames=data["columnNames"]
# p_filter=data["Filter"]
column='*'
if len(p_columnNames)==1:
column=p_columnNames[0]
elif len(p_columnNames)>1:
column=','.join(p_columnNames)
where=get_filter(data["Filter"])
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "SELECT " + column +" From "+ p_table + where
# print(sql)
cursor.execute(sql)
result = cursor.fetchall()
res["results"]=json.dumps(result,ensure_ascii=False)
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
def mysqlDelete(input,logging):
res={"successCode":"1","errorLog":"","results":""}
data=input["metadata"]["admin"]
p_host=data["Host"]
p_port=int(data["Port"])
p_db=data["Database"]
p_user=data["User"]
p_password=data["Password"]
p_table=data["Table"]
# where=process_where(data["Filter"])
where=get_filter(data["Filter"])
try:
db = pymysql.connect(host=p_host, user=p_user, passwd=p_password, db=p_db, port=p_port,
charset='utf8', cursorclass=pymysql.cursors.DictCursor)
db.ping(reconnect=True)
cursor = db.cursor()
sql = "DELETE From "+ p_table + where
cursor.execute(sql)
db.commit()
cursor.close()
db.close()
return res
except:
res["successCode"] = "0"
res["errorLog"]=traceback.format_exc()
logging.error(traceback.format_exc())
return res
if __name__=="__main__":
input={"metadata":{"admin":{
"type":"query",
"Table":"student",
"columnNames":["name","age"],
"Set":["##tag1##=##value1##","##tag2##=##value2##"],
"Filter":{
"OR":[
{
"AND":[{"key":"age","value":20,"operator":">"},{"key":"weight","value":50,"operator":"<"}]
},
{
"AND":[{"key":"name","value":"ff","operator":"="}]
}
]
},
"Host":"172.26.28.30",
"Port":"3306",
"Database":"test",
"User":"crawl",
"Password":"crawl123"
}},
"user": {
"tag1": "age",
"tag2": "weight",
"value1": 2,
"value2": 100
}
}
res=mysqlUpdate(input,"")
print(res)

456
text_analysis/tools/pred.py

@@ -0,0 +1,456 @@
#coding:utf8
import pandas as pd
import numpy as np
import networkx as nx
from textblob import TextBlob
from snownlp import SnowNLP
from wordcloud import STOPWORDS
import jieba
# import tool
from tqdm import tqdm
import os,sys
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import datetime
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
import joblib
def pre_user(data_user):
data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
data_user = data_user.dropna()
data_user = data_user.drop_duplicates().reset_index(drop = True)
data_user['fansCount'] = data_user['fansCount'].astype(int)
data_user['likeCount'] = data_user['likeCount'].astype(int)
data_user['postCount'] = data_user['postCount'].astype(int)
data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
return data_user
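# Note (added for clarity): pre_user prefixes accountName with '@' so it can be merged with the
# '用户名' column of the propagation-chain data, and reduces nickName/otherInfo to 0/1 presence
# flags before casting the count columns to int.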
def getText_count_eng(txt):
"""英文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '\t':
counts[word] = counts.get(word,0) + 1 #统计字数
items = pd.DataFrame(list(counts.items()))
return items
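# Illustrative example (added for clarity): getText_count_eng("Fake news spreads fast, fake news!")
# lowercases the text, strips punctuation and wordcloud STOPWORDS, and returns a two-column
# DataFrame of word/count pairs -- here fake=2, news=2, spreads=1, fast=1.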
def getText_count_ch(txt):
"""中文词频统计"""
txt = txt.lower() #将所有大写字母变成小写
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… 0123456789abcdefghijklmnopqrstuvwxyz': #将文本中特殊符号数字删除
txt = txt.replace(ch,"")
words = jieba.lcut(txt)
counts = {}
for word in words:
counts[word] = counts.get(word,0) + 1
items = list(counts.items())
fin_items = []
for item in items:
if len(item[0])>=2:
fin_items.append(item)
fin_items = pd.DataFrame(fin_items)
return fin_items
def getText_count_U(txt):
"""统计英文大写词频"""
for ch in '!\'"#$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()… ': #将文本中特殊符号替换为空格
txt = txt.replace(ch," ")
words = txt.split()
counts = {}
for word in words:
if word not in STOPWORDS:
if word != '/t':
if word.isupper(): #统计大写
counts[word] = counts.get(word,0) + 1 #统计字数
items = pd.DataFrame(list(counts.items())) #将字典类型转换成列表类型
if items.shape == (0,0):
out = 0
else:
out = sum(items[1])
return out
def is_chinese(strs):
"""判断一个unicode是否是汉字/英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar < u'\u4e00') or (u'\u9fff' < uchar):
return False
return True
def is_eng(strs):
"""判断一个unicode是否是英文"""
strs = strs.lower()
for uchar in strs:
if (uchar < u'\u0061') or (u'\u007a' < uchar):
return False
return True
# def pre_user(data_user):
# data_user['accountName'] = data_user['accountName'].apply(lambda x:'@'+x)
# data_user['otherInfo'] = 1-pd.isnull(data_user['otherInfo']).astype(int)
# data_user['nickName'] = 1-pd.isnull(data_user['nickName']).astype(int)
# data_user = data_user[['accountName', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']]
# data_user = data_user.dropna()
# data_user = data_user.drop_duplicates().reset_index(drop = True)
# data_user['fansCount'] = data_user['fansCount'].astype(int)
# data_user['likeCount'] = data_user['likeCount'].astype(int)
# data_user['postCount'] = data_user['postCount'].astype(int)
# data_user.columns = ['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
# return data_user
def post_related(df, data_user,logging):
# postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
# 'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
# 'sub_shareCount', '语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
# 'emotion', 'emotion_sub', '最大词频数', '重复词汇占比', '大写词频', '有无传播内容',
# '传播链语言均值', '传播链语言标准差', '传播链贴文emotion均值', '传播链贴文emotion标准差',
# '传播链贴文emotion_sub均值', '传播链贴文emotion_sub标准差',
# '传播链贴文长度均值', '传播链贴文长度标准差', '传播链贴文http均值', '传播链贴文http标准差', '传播链贴文at均值',
# '传播链贴文at标准差', '传播链贴文tag均值', '传播链贴文tag标准差', 'diffdate均值', 'diffdate标准差'])
postset = pd.DataFrame(columns=['传播层级', '帖子id', '转发来源id','所属帖子id', '用户名', '用户id', '发表内容', '发表时间', 'shareCount',
'url', 'layer', 'shape', 'degree', 'closeness_centrality', 'pagerank',
'语言', '主贴长度', '主贴http', '主贴at', '主贴tag',
'emotion', 'emotion_sub', '最大词频数', '重复词汇占比'])
for post_id in df['所属帖子id'].drop_duplicates().reset_index(drop=True):
data = df[df['所属帖子id'] == post_id].reset_index(drop=True)
data.columns = ['传播层级', '帖子id', '转发来源id', '所属帖子id', '用户名', '用户id', '发表内容', '发表时间',
'shareCount', 'url']
data = data.drop_duplicates()
post = data[data['传播层级'] == 1].head(1)
### 一、新闻传播--贴文网络
##1.layer/shape/degree
post['layer'] = int(max(data['传播层级']))
post['shape'] = data.shape[0] - 1
post['degree'] = data[data['传播层级'] == 2].shape[0]
##2.整体网络测度(贴文网络测度)
###2.1把转发来源id对应到转发来源用户
tmp_zfyh = pd.merge(data[data['传播层级'] != 1]['转发来源id'].drop_duplicates(),
data[data['帖子id'].notnull()][['帖子id', '用户名']],
left_on=['转发来源id'], right_on=['帖子id'], how='left')[['转发来源id', '用户名']]
tmp_zfyh.columns = ['转发来源id', '转发来源用户名']
data = pd.merge(data, tmp_zfyh, left_on=['转发来源id'], right_on=['转发来源id'], how='left')
post_edge = data.copy()
post_edge = data[data['传播层级'] != 1][['用户名', '转发来源用户名']]
post_edge.columns = ['source', 'target']
post_edge['count_all'] = 1
post_edge = post_edge.groupby(['source', 'target'])['count_all'].count().reset_index()
# post_edge.to_csv(r'E:\项目文件\情报\假新闻\数据\画图\post_edge_tmp.csv',index=False)
edgeweightset = post_edge[['source', 'target', 'count_all']]
edgeweightset_l = [[] for _ in range(edgeweightset.shape[0])]
for k in range(len(edgeweightset_l)):
for j in range(edgeweightset.shape[1]):
edgeweightset_l[k].append(edgeweightset.iloc[k, j])
# print(i/len(edgeweightset_l))
if len(edgeweightset_l) == 0: # 没有传播链
post['closeness_centrality'] = 1
post['pagerank'] = 1
else:
g = nx.DiGraph()
g.add_weighted_edges_from(edgeweightset_l)
centrality = [nx.closeness_centrality(g),
nx.pagerank(g)]
results = []
nodes = g.nodes() # 提取网络中节点列表
for node in nodes: # 遍历所有节点,提取每个节点度中心性计算结果,并存储为[[节点1,结果],[节点2,结果],...]的形式
results.append([node,
centrality[0][node],
centrality[1][node]])
results = pd.DataFrame(results)
results.columns = ['node', 'closeness_centrality', 'pagerank']
post['closeness_centrality'] = results[results['node'] == results[
results['closeness_centrality'] == max(results['closeness_centrality'])]['node'].iloc[0]][
'closeness_centrality'].iloc[0]
post['pagerank'] = results[results['node'] ==
results[results['closeness_centrality'] == max(results['closeness_centrality'])][
'node'].iloc[0]]['pagerank'].iloc[0]
# post['closeness_centrality'] = results[results['node'] == post['帖子id'].iloc[0]]['closeness_centrality'].iloc[0]
# post['pagerank'] = results[results['node'] == post['帖子id'].iloc[0]]['pagerank'].iloc[0]
#——————————hh——————————————
# 特征未使用
# ##3.传播链中的平均影响力shareCount
# tmp = 0
# for k in range(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]):
# tmp = tmp + int(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shareCount.iloc[k])
# if tmp == 0:
# post['sub_shareCount'] = 0
# else:
# post['sub_shareCount'] = tmp / data[(data['传播层级'] != '1') & (data['帖子id'].notnull())].shape[0]
#————————————————————————
##二、主贴文本
# post['发表内容'] = 'October 10th commemorates the 1911 Revolution happened in Wuchang of China, which ended thousands-year-long absolute monarchy. Tsai and DPP authorities want to separate Taiwan from China and betray history. The Chinese people and Chinese history will never forgive these traitors.'
##文本特殊字符个数(http、@、#)
# logging.info(post)
post['主贴http'] = post['发表内容'].iloc[0].count('http')
post['主贴at'] = post['发表内容'].iloc[0].count('@')
post['主贴tag'] = post['发表内容'].iloc[0].count('#')
##判断语言
tmp = post['发表内容'].iloc[0]
for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
tmp = tmp.replace(ch, '')
if is_eng(tmp): ##主贴英文内容
post['语言'] = 0
text = post['发表内容'].iloc[0]
# text = '#Americans,for the first time in their lives,are seeing empty shelves in the stores.This isn’t right.We need to cut #China out of our supply chains by producing locally.#onshoring'
text = text[0:text.rfind("http")]
for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
text = text.replace(ch, ' ')
##文本长度
words = text.split(' ')
post['主贴长度'] = len(words)
##文本情感
# post['emotion'] = post['发表内容'].apply(lambda x: SnowNLP(x).sentiments)
emo = pd.DataFrame(TextBlob(post['发表内容'].iloc[0]).sentiment)
post['emotion'] = emo.loc[0, 0]
post['emotion_sub'] = emo.loc[1, 0]
##文本词频
## 词频统计1:最大词频数
## 词频统计2:正文中出现两次及以上的词占比
items = getText_count_eng(text)
if items.shape == (0, 0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
## 词频统计3:全部大写词频
post['大写词频'] = getText_count_U(text)
elif is_chinese(tmp): ##主贴中文内容
post['语言'] = 1
text = post['发表内容'].iloc[0]
text = text[0:text.rfind("http")]
post['主贴长度'] = len(text)
post['emotion'] = (SnowNLP(text).sentiments - 0.5) * 2
post['emotion_sub'] = np.NaN
# post['emotion_blob'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[0]
# post['emotion_sub'] = pd.DataFrame(post['发表内容'].apply(lambda x: TextBlob(x).sentiment).iloc[0]).loc[1]
##文本词频
## 词频统计1:标题中出现的词,在正文中出现最大词频
## 词频统计2:正文中出现两次及以上的词占比
items = getText_count_ch(text)
if items.shape == (0, 0):
post['最大词频数'] = 0
post['重复词汇占比'] = 0
else:
post['最大词频数'] = max(items[1])
post['重复词汇占比'] = items[items[1] >= 2].shape[0] / items.shape[0]
## 词频统计3:全部大写词频
post['大写词频'] = np.NaN
else:
post['语言'] = np.NaN
post['主贴长度'] = np.NaN
post['emotion'] = np.NaN
post['emotion_sub'] = np.NaN
post['最大词频数'] = np.NaN
post['重复词汇占比'] = np.NaN
post['大写词频'] = np.NaN
# ##4.2传播链中的文本
# sub_post = pd.DataFrame(data[(data['传播层级'] != '1') & (data['帖子id'].notnull())][['发表内容', '发表时间']])
# sub_post['语言'] = np.NaN
# sub_post['文本长度'] = np.NaN
# sub_post['http'] = np.NaN
# sub_post['at'] = np.NaN
# sub_post['tag'] = np.NaN
# sub_post['emotion'] = np.NaN
# sub_post['emotion_sub'] = np.NaN
# sub_post['diffdate'] = np.NaN
#
# for k in range(sub_post.shape[0]):
# ##文本特殊字符个数(http、@、#)
# sub_post['http'].iloc[k] = sub_post['发表内容'].iloc[k].count('http')
# sub_post['at'].iloc[k] = sub_post['发表内容'].iloc[k].count('@')
# sub_post['tag'].iloc[k] = sub_post['发表内容'].iloc[k].count('#')
#
# ##时间差
# d1 = datetime.datetime.strptime(sub_post['发表时间'].iloc[k], "%Y-%m-%d %H:%M:%S")
# base = datetime.datetime.strptime(post['发表时间'].iloc[0], "%Y-%m-%d %H:%M:%S")
#
# # now = datetime.datetime.now()
# sub_post['diffdate'].iloc[k] = (d1 - base).days
#
# ##判断语言
# tmp = sub_post['发表内容'].iloc[k]
# for ch in 'éí!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| 0123456789':
# tmp = tmp.replace(ch, '')
#
# if is_eng(tmp): ##英文内容
#
# sub_post['语言'].iloc[k] = 0
#
# ##文本长度
# text = sub_post['发表内容'].iloc[k]
# # text = "'America is collapsing and it's China's fault' is definitely a change of direction?"
# text = text[0:text.rfind("http")]
# for ch in '!\'"##$%&()*+,-.:;<=>?@[\\]^_`{|}~。,:;!?’‘“”、·——/\–《》()…【】|「」| ':
# text = text.replace(ch, ' ')
# words = text.split(' ')
# sub_post['文本长度'].iloc[k] = len(words)
# ##情感
# sub_emo = pd.DataFrame(TextBlob(sub_post['发表内容'].iloc[k]).sentiment)
# sub_post['emotion'].iloc[k] = sub_emo.loc[0, 0]
# sub_post['emotion_sub'].iloc[k] = sub_emo.loc[1, 0]
#
# elif is_chinese(tmp): ##中文内容
#
# sub_post['语言'].iloc[k] = 1
#
# ##文本长度
# text = sub_post['发表内容'].iloc[k]
# text = text[0:text.rfind("http")]
# sub_post['文本长度'].iloc[k] = len(text)
# ##情感
# sub_post['emotion'].iloc[k] = (SnowNLP(sub_post['发表内容'].iloc[k]).sentiments - 0.5) * 2
# sub_post['emotion_sub'].iloc[k] = np.NaN
#
# else:
#
# sub_post['语言'].iloc[k] = np.NaN
# sub_post['文本长度'].iloc[k] = np.NaN
# sub_post['emotion'].iloc[k] = np.NaN
# sub_post['emotion_sub'].iloc[k] = np.NaN
#
# if sub_post.shape[0] == 0:
# post['有无传播内容'] = 0
# else:
# post['有无传播内容'] = 1
#
# post['传播链语言均值'] = sub_post['语言'].mean()
# post['传播链贴文长度均值'] = sub_post['文本长度'].mean()
# post['传播链贴文emotion均值'] = sub_post['emotion'].mean()
#
# ##emotion_sub取有值的均值
# post['传播链贴文emotion_sub均值'] = sub_post['emotion_sub'].mean()
#
# post['传播链贴文http均值'] = sub_post['http'].mean()
#
# post['传播链贴文at均值'] = sub_post['at'].mean()
#
# post['传播链贴文tag均值'] = sub_post['tag'].mean()
#
# post['diffdate均值'] = sub_post['diffdate'].mean()
##三、用户信息
##发帖用户
post = pd.merge(post, data_user, how='left', on='用户名')
##传播链用户
sub_user = pd.DataFrame(data[data['传播层级'] != 1][['用户名']])
sub_user = pd.merge(sub_user, data_user, how='left', on='用户名')
sub_user = sub_user.dropna()
post['nickName均值'] = sub_user['nickName'].mean()
post['fansCount均值'] = sub_user['fansCount'].mean()
post['likeCount均值'] = sub_user['likeCount'].mean()
post['postCount均值'] = sub_user['postCount'].mean()
post['otherInfo均值'] = sub_user['otherInfo'].mean()
postset = pd.concat([postset, post]).reset_index(drop=True)
postset = postset.fillna(0)
postset['emotion_degree'] = abs(postset['emotion'])
return postset
def predict_news(userData,postChain,logging):
data_po = pd.DataFrame(postChain).replace('', np.nan)
data_po.columns = ['id','层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url','topicId']
data_po=data_po[['层级','帖子id','转发来源id','所属帖子id','用户名','用户id','发表内容','发表时间','shareCount','url']]
if not userData:
columns=['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo','postId','ssId']
data_user=pd.DataFrame(columns=columns)
else:
data_user = pd.DataFrame(userData).replace('', np.nan)
data_user.columns = ['topicId','id','accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo','postId','ssId']
data_user=data_user[['accountName','nickName','fansCount','likeCount','postCount','account_url','otherInfo']]
data_user = pre_user(data_user)
#data_user=dataframe[@XHNews,1,878,1178,938,1]
#data_user.columns=['用户名', 'nickName', 'fansCount', 'likeCount', 'postCount', 'otherInfo']
postset_po = post_related(data_po,data_user,logging) ## 正面文件
features = postset_po[[
#'shareCount',
'layer', 'shape', 'degree', 'pagerank', 'closeness_centrality',
'主贴http', '主贴at', '主贴tag',
'主贴长度','emotion', 'emotion_degree',
'最大词频数', '重复词汇占比',#(中英文差异大)
#'有无传播内容',
'fansCount','likeCount', 'postCount',
#'sub_shareCount',
'fansCount均值', 'postCount均值', 'otherInfo均值'
]]
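# Note (added for clarity): the selected columns and their order are assumed to match the feature
# matrix that fake_news_model.pkl was trained on; if post_related's output schema changes, the
# model presumably needs to be retrained.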
clf = joblib.load(par_dir+'/model/fake_news_model.pkl')
clf_predict = clf.predict(features)
res=pd.DataFrame(clf_predict)
res.columns=['假新闻预测结果']
# label convention per txt/fakeNew.txt: 1 = 假新闻 (fake news), 0 = 非假新闻
res['recognitionResult'] = res['假新闻预测结果'].apply(lambda x: '假新闻' if x == 1 else '真新闻')
result = pd.concat([postset_po, res], axis=1)
return result
if __name__=="__main__":
print(par_dir)
# user={
# "topicId":1209,
# "host":"172.26.28.30",
# "user":"crawl",
# "passwd":"crawl123",
# "db":"test",
# "port":3306,
# "table":"TwitterAccount"
# }
# userData = tool.mysqlData(user,"")
# # logging.info("账号数据获取完毕!")
# # 传播链数据
# # post = raw_data["metadata"]["admin"]["Twitter_chain"]
# post={
# "topicId":1209,
# "host":"172.26.28.30",
# "user":"crawl",
# "passwd":"crawl123",
# "db":"test",
# "port":3306,
# "table":"Twitter_chain"
# }
# postChain = tool.mysqlData(post, "")
# # logging.info("传播链数据获取完毕!")
# predict_news(userData,postChain,"")

51
text_analysis/tools/process.py

@@ -0,0 +1,51 @@
#coding:utf8
import os, sys
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from text_analysis.tools import to_kafka
from tools.mysql_helper import mysqlConn,mysqlInsert,mysqlQuery,mysqlExecute,mysqlUpdate,mysqlDelete,getTableColumnNames
import traceback
import time
from log_util.set_logger import set_logger
logging=set_logger('results.log')
from views import task_queue
def process_data():
while True:
try:
# print("task_queue:",task_queue)
if task_queue.qsize() >0:
try:
raw_data = task_queue.get()
res = ""
logging.info("启动数据处理线程——")
logging.info(raw_data)
flag = raw_data["metadata"]["admin"]["type"]
# type分为execute、query、insert、update、delete
if flag == 'insert':
res = mysqlInsert(raw_data, logging)
elif flag == 'execute':
res = mysqlExecute(raw_data, logging)
elif flag == 'update':
res = mysqlUpdate(raw_data, logging)
elif flag == 'query':
res = mysqlQuery(raw_data, logging)
elif flag == 'delete':
res = mysqlDelete(raw_data, logging)
raw_data["result"] = res
logging.info("************写入kafka***********")
to_kafka.send_kafka(raw_data)
except:
raw_data["result"] = {"successCode": "0", "errorLog": "", "results": ""}
raw_data["result"]["errorLog"] = traceback.format_exc()
to_kafka.send_kafka(raw_data)
else:
logging.info("暂无任务,进入休眠--")
print("222222222222222222222222")
time.sleep(10)
except:
logging.error(traceback.format_exc())

171
text_analysis/tools/seleniumTest.py

@@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
import time
import threading
from selenium import webdriver
import json
from urllib.parse import urljoin
from kakfa_util import KafkaConsume
from kakfa_util import kafkaProduce
from logUtil import get_logger
from Go_fastDfs import uploadFile
import traceback
import queue
import configparser
import os, sys
import re
logger = get_logger("./logs/crawlWebsrcCode.log")
#加载配置文件
configFile = './config.ini'
# 创建配置文件对象
con = configparser.ConfigParser()
# 读取文件
con.read(configFile, encoding='utf-8')
kafkaConfig = dict(con.items('kafka'))#kafka配置信息
goFastdfsConfig = dict(con.items('goFastdfs'))#goFastdfs配置信息
class Spider(object):
def __init__(self,url):
self.chromeOptions = self.get_profile()
self.browser = self.get_browser()
self.url = url
def get_profile(self):
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument('--headless') # 谷歌无头模式
chromeOptions.add_argument('--disable-gpu') # 禁用显卡
# chromeOptions.add_argument('window-size=1280,800') # 指定浏览器分辨率
chromeOptions.add_argument("--no-sandbox")
return chromeOptions
def get_browser(self):
browser = webdriver.Chrome("D:\\工作使用\\zhaoshang\\chromedriver.exe",chrome_options=self.chromeOptions)
return browser
def _get_page(self,path):
'''
:param path:
:return:
'''
self.browser.get(self.url)
time.sleep(5)
logger.info("休眠结束")
# 向下偏移了10000个像素,到达底部。
scrollTop = 10000
for num in range(1,10):
js = "var q=document.documentElement.scrollTop={}".format(scrollTop*num)
logger.info("第{}次滚动".format(num))
self.browser.execute_script(js)
time.sleep(5)
# 执行 Chome 开发工具命令,得到mhtml内容
res = self.browser.execute_cdp_cmd('Page.captureSnapshot', {})
#获取文章标题
title = '无标题'
try:
title = self.browser.find_element_by_css_selector("title").get_attribute("textContent")
except Exception as e:
logger.error('获取标题异常----')
traceback.print_exc()
pathName = '{}{}.mhtml'.format(path,title)
with open(pathName, 'w',newline='') as f:
f.write(res['data'])
return pathName,title
if __name__ == '__main__':
#初始化任务队列
task_queue = queue.Queue()
#跟读kafka线程
logger.info("开启读取kafka线程---")
t = threading.Thread(target=KafkaConsume, name='LoopThread',args=(kafkaConfig['read_topic'], kafkaConfig['address'], kafkaConfig['group_id'], task_queue,logger))
t.daemon = True
t.start()
#获取任务执行页面原格式保留
while True:
try:
if task_queue.qsize() >0:
taskStr = task_queue.get()
logger.info('当前任务:{}'.format(taskStr))
task = json.loads(taskStr)
p1 = u'(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
pattern1 = re.compile(p1)
matcher1 = re.search(p1, task['url'])
if matcher1:
l = Spider(task['url'])
pathName,title = l._get_page(goFastdfsConfig['path'])
l.browser.quit()
#gofast 上传,写入kafka
if '404 Not Found' in title:
logger.error('页面404,无效')
resultData = {
'code': 500,
'id': task['id'],
'message': '页面404'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)
continue
try:
uploadStr = uploadFile('{}upload'.format(goFastdfsConfig['uploadaddress']),pathName,logger)
uploadJson = json.loads(uploadStr)
except Exception as e:
logger.error('文件上传异常----')
traceback.print_exc()
resultData = {
'code': 500,
'id': task['id'],
'message': '文件上传失败'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)
continue
resultData = {
'code':200,
'id':task['id'],
'url':goFastdfsConfig['downloadaddress']+uploadJson['path'],
'title':title,
'delMd5':uploadJson['md5'],
'uploadTime':uploadJson['mtime'],
'message':'成功'
}
kafkaProduce(kafkaConfig['data_topics'],json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),kafkaConfig['address'])
logger.info('数据写入成功')
#删除文件
if (os.path.exists(pathName)):
os.remove(pathName)
logger.info('清除文件:{}'.format(pathName))
else:
logger.info('要删除的文件不存在:{}'.format(pathName))
else:
logger.error('非正确url:{}'.format(task['url']))
resultData = {
'code': 500,
'id': task['id'],
'message': '非正确url'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)
continue
else:
logger.info("暂无任务,进入休眠--")
time.sleep(10)
except Exception as e:
logger.error('未知异常----')
traceback.print_exc()
resultData = {
'code': 500,
'id': task['id'],
'message': '未知异常'
}
kafkaProduce(kafkaConfig['data_topics'],
json.dumps(resultData).encode('utf-8').decode('unicode_escape').encode(),
kafkaConfig['address'])
time.sleep(2)

25
text_analysis/tools/to_kafka.py

@@ -0,0 +1,25 @@
#coding:utf8
import traceback
import json
from kafka import KafkaProducer
from text_analysis.read_config import load_config
config=load_config()
def send_kafka(data,logging):
try:
producer = None
topic = config["kafka"]["topic"]
data1=json.dumps(data,ensure_ascii=False)
kafkaProduce(topic,bytes(data1, encoding='utf-8'))
logging.info("数据推入kafka!")
except Exception as e:
logging.info(traceback.format_exc())
logging.info('写入kafka失败')
def kafkaProduce(topic,resultData):
producer = KafkaProducer(bootstrap_servers = '{}'.format(config["kafka"]["bootstrap_servers"]),max_request_size=52428800)
topics = topic.split(',')
for tc in topics:
future = producer.send(tc,resultData)
producer.flush()

233
text_analysis/tools/tool.py
File diff suppressed because it is too large

1
text_analysis/tools/关系链数据.txt
File diff suppressed because it is too large

1
text_analysis/tools/账号数据.txt
File diff suppressed because it is too large

13
text_analysis/urls.py

@@ -0,0 +1,13 @@
from django.conf.urls import include, url
from django.contrib import admin
from text_analysis import views
urlpatterns = [
url(r'^fakeNewIdentification',views.fakeNewIdentification, name='fakeNewIdentification'),
# url(r'^mysqlConnection',views.mysqlConnection, name='mysqlConnection'),
# url(r'^mysqlField', views.mysqlField, name='mysqlField')
]
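# Minimal client sketch (added for clarity; assumes the service listens on the uwsgi port 9030
# from uwsgi.ini and a payload shaped like the sample in txt/fakeNew.txt):
#   import requests
#   payload = {...}  # metadata.admin DB settings, scenes_id, version, ... as in txt/fakeNew.txt
#   r = requests.post("http://127.0.0.1:9030/fakeNewIdentification", json=payload)
#   print(r.text)    # '{"code": 1, "msg": "请求正常!"}' on success; results are pushed to Kafka asynchronously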

158
text_analysis/views.py

@@ -0,0 +1,158 @@
#coding:utf8
import os, sys
import io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
cur_dir = os.path.dirname(os.path.abspath(__file__)) or os.getcwd()
par_dir = os.path.abspath(os.path.join(cur_dir, os.path.pardir))
sys.path.append(cur_dir)
sys.path.append(par_dir)
import json
from django.http import HttpResponse
from text_analysis.tools import to_kafka,tool
from text_analysis.tools import pred
from django.views.decorators.csrf import csrf_exempt
from log_util.set_logger import set_logger
logging=set_logger('logs/results.log')
import traceback
import queue
from text_analysis.cusException import userFile_Exception,chainFile_Exception
import requests
import time
from kazoo.client import KazooClient
from kazoo.protocol.states import EventType
#任务队列
import queue
task_queue = queue.PriorityQueue()
stop_dict={}
from text_analysis.read_config import load_config
config=load_config()
@csrf_exempt
def fakeNewIdentification(request):
if request.method == 'POST':
try:
raw_data = json.loads(request.body)
if "trace" in raw_data.keys() and raw_data["trace"]==True:
task_queue.put((-1,time.time(), raw_data))
else:
task_queue.put((1,time.time(), raw_data))
return HttpResponse(json.dumps({"code": 1, "msg": "请求正常!"}, ensure_ascii=False))
except:
logging.error(traceback.format_exc())
return HttpResponse(json.dumps({"code": 0, "msg": "请求json格式不正确!"}, ensure_ascii=False))
else:
return HttpResponse(json.dumps({"code": 0, "msg": "请求方式错误,改为post请求"}, ensure_ascii=False))
def predict_news():
dbConfig = dict(config.items('database'))
while True:
try:
if task_queue.qsize()>0:
p,t,raw_data = task_queue.get(timeout=1)
logging.info("当前任务队列长度{}".format(task_queue.qsize()+1))
logging.info("任务数据-{}".format(raw_data))
task_id=raw_data["scenes_id"]
task_version=raw_data["version"]
logging.info("当前version信息为:{}".format(stop_dict))
if task_id in stop_dict.keys() and task_version!=stop_dict[task_id]["version"]:
logging.info("已暂停任务,数据过滤掉")
continue
res = {"successCode": "1", "errorLog": "", "results": {},"status":1,"message":"成功"}
# 账号数据
userData = tool.mysqlData(raw_data, logging,"1",dbConfig)
# if not userData:
# raise userFile_Exception
logging.info("账号数据获取完毕!-长度{}".format(len(userData)))
# 传播链数据
postChain=tool.mysqlData(raw_data, logging,"0",dbConfig)
if not postChain:
raise chainFile_Exception
logging.info("传播链数据获取完毕!-长度{}".format(len(postChain)))
news=pred.predict_news(userData,postChain,logging)
# 结束标识
res['isLast'] = True
for i in range(len(news)):
row_dict = news.iloc[i].to_dict()
row_dict['pageType'] = 'fakeNewsPage'
# postId
row_dict['postId'] = userData[0]['postId']
if i == len(news) - 1:
row_dict["isLast"]=1
res["results"] = json.dumps(row_dict,ensure_ascii=False)
res["status"] = 1
res["message"] = "成功"
raw_data["result"] = res
logging.info("共{}条数据,第{}条数据输出-{}".format(len(news),i+1,raw_data))
to_kafka.send_kafka(raw_data, logging)
else:
# 暂无任务,进入休眠
time.sleep(10)
except userFile_Exception:
res = {"successCode": "0", "errorLog": "用户数据为空!", "results": {}, "status": 2,"message": "异常"}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '用户数据为空'
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "用户数据为空"
raw_data["result"] = res
logging.info("该条请求用户数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except chainFile_Exception:
res = {"successCode": "0", "errorLog": "关系链数据为空!", "results": {}, "status": 2,"message": "异常"}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = '关系链数据为空'
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "关系链数据为空"
raw_data["result"] = res
logging.info("该条请求关系链数据为空-{}".format(raw_data))
to_kafka.send_kafka(raw_data, logging)
except:
res = {"successCode": "0", "errorLog": "", "results": {}, "status": 2,"message": "异常"}
results={}
results['pageType'] = 'fakeNewsPage'
results['recognitionResult'] = ""
res['results'] = json.dumps(results)
res["status"] = 2
res["message"] = "异常"
raw_data["result"] = res
raw_data["result"]["errorLog"] = traceback.format_exc()
logging.info(traceback.format_exc())
to_kafka.send_kafka(raw_data, logging)
def zk_monitoring():
try:
#线上环境
zk = KazooClient(hosts=config['zookeeper']['zkhost'])
#测试环境
# zk = KazooClient(hosts='172.16.12.55:2181,172.16.12.56:2181,172.16.12.57:2181')
zk.start()
# 设置监听器
@zk.DataWatch("/analyze")
def watch_node(data, stat, event):
if event is not None and event.type == EventType.CHANGED:
data, stat = zk.get("/analyze")
logging.info("执行删除操作:{}".format(data))
d = json.loads(data)
id = d["scenes_id"]
stop_dict[id] = {}
stop_dict[id]["version"] = d["version"]
stop_dict[id]["operation"] = d["operation"]
# 保持程序运行以监听节点变化
try:
while True:
time.sleep(1)
except:
logging.info("Stopping...")
# 关闭连接
zk.stop()
zk.close()
except:
logging.error(traceback.format_exc())

16
text_analysis/wsgi.py

@@ -0,0 +1,16 @@
"""
WSGI config for Zhijian_Project_WebService project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
application = get_wsgi_application()

83
txt/fakeNew.txt

@@ -0,0 +1,83 @@
Fake-news label convention: the English samples are all 1 (假新闻 / fake news); the Chinese samples are all 0 (非假新闻 / not fake news).
{
"metadata":{
"address":"http://172.24.12.127:9025/robotIdentificationTopic/",
"index":0,
"admin":{
"TwitterAccount":{
"topicId":1209,
"host":"172.26.28.30",
"user":"crawl",
"passwd":"crawl123",
"db":"test",
"port":3306,
"table":"TwitterAccount"
},
"Twitter_chain":{
"topicId":1209,
"host":"172.26.28.30",
"user":"crawl",
"passwd":"crawl123",
"db":"test",
"port":3306,
"table":"Twitter_chain"
}
}
},
"output":{
"output_type":"table",
"label_col":[
]
},
"input":{
"input_type":"text",
"label":[
"2_任务提取"
]
},
"user":{
"tag":""
},
"data":{
},
"created":1691004265000,
"module":"robotIdentification",
"start_tag":false,
"multi_branch":0,
"last_edit":1693417201000,
"next_app_id":[
{
"start_id":154,
"edge_id":75,
"end_id":155
}
],
"transfer_id":3,
"version":1,
"blueprint_id":4,
"scenes_id":5,
"scenario":{
"dataloss":1,
"autoCommitTriggerLast":1,
"maxErrors":3,
"autoCommit":1,
"freshVariables":1
},
"wait_condition":[
],
"scheduling":{
"interval":-1,
"type":"single"
},
"name":"robotIdentification",
"businessKey":"19615b029da477fb",
"id":154,
"position":[
100,
200
],
"describe":""
}

1
txt/关系链数据.txt
File diff suppressed because it is too large

BIN
txt/技术部分初稿@20230302.docx

3
txt/环境要求.txt

@@ -0,0 +1,3 @@
1.python>3.7
2.pandas=1.4.4
3.sklearn=0.24.2
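For reference, the code additionally imports django, pymysql, kafka-python, networkx, textblob,
snownlp, jieba, wordcloud, joblib, kazoo, jsonpath-ng, requests, tqdm and selenium, and is served
with uwsgi; the sklearn entry corresponds to the pip package scikit-learn, e.g.
pip install pandas==1.4.4 scikit-learn==0.24.2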

1
txt/账号数据.txt
File diff suppressed because it is too large

8
uwsgi.ini

@@ -0,0 +1,8 @@
[uwsgi]
http = 0.0.0.0:9030
chdir = ../fakeNewIdentification
wsgi-file = ../fakeNewIdentification/wsgi.py
processes = 1
threads = 2
listen = 1024
http-timeout=21600
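# Note (added for clarity): a typical way to launch this configuration would be
#   uwsgi --ini uwsgi.ini
# run from the project root (assumption; the startup script itself is not shown here). With
# processes = 1 and threads = 2 a single worker process handles all requests.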

38
wsgi.log

@@ -0,0 +1,38 @@
*** Starting uWSGI 2.0.21 (64bit) on [Fri Jan 3 09:27:26 2025] ***
compiled with version: 11.2.0 on 24 October 2023 19:53:56
os: Linux-3.10.0-1127.19.1.el7.x86_64 #1 SMP Tue Aug 25 17:23:54 UTC 2020
nodename: node-04
machine: x86_64
clock source: unix
pcre jit disabled
detected number of CPU cores: 64
current working directory: /opt/analyze/apps/fakeNewIdentification
detected binary path: /opt/analyze/environment/python3.8/bin/uwsgi
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
chdir() to ../fakeNewIdentification
*** WARNING: you are running uWSGI without its master process manager ***
your processes number limit is 1031041
your memory page size is 4096 bytes
detected max file descriptor number: 65535
lock engine: pthread robust mutexes
thunder lock: disabled (you can enable it with --thunder-lock)
uWSGI http bound on 0.0.0.0:9030 fd 4
spawned uWSGI http 1 (pid: 51183)
uwsgi socket 0 bound to TCP address 127.0.0.1:42416 (port auto-assigned) fd 3
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
Python version: 3.8.16 (default, Jun 12 2023, 18:09:05) [GCC 11.2.0]
Python main interpreter initialized at 0x1f87250
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
python threads support enabled
your server socket listen backlog is limited to 1024 connections
your mercy for graceful operations on workers is 60 seconds
mapped 83376 bytes (81 KB) for 2 cores
*** Operational MODE: threaded ***
WSGI app 0 (mountpoint='') ready in 8 seconds on interpreter 0x1f87250 pid: 51182 (default app)
uWSGI running as root, you can use --uid/--gid/--chroot options
*** WARNING: you are running uWSGI as root !!! (use the --uid flag) ***
*** uWSGI is running in multiple interpreter mode ***
spawned uWSGI worker 1 (and the only) (pid: 51182, cores: 2)

34
wsgi.py

@@ -0,0 +1,34 @@
"""
WSGI config for Zhijian_Project_WebService project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
"""
import os
import configparser
import threading
from text_analysis.views import predict_news
#加载配置文件
# configFile = './config.ini'
# # 创建配置文件对象
# con = configparser.ConfigParser()
# # 读取文件
# con.read(configFile, encoding='utf-8')
# #数据库配置信息
# dbConfig = dict(con.items('database'))
t = threading.Thread(target=predict_news, name='predict_news')
t.daemon = True
t.start()
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
application = get_wsgi_application()
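# Note (added for clarity): importing this module starts the predict_news consumer as a daemon
# thread, so the background worker lives inside the single uwsgi worker process configured in
# uwsgi.ini; raising the process count would give each worker its own thread and its own
# in-process task_queue.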