From 53059d68fe6face910b8dee8ec800dc4f183a651 Mon Sep 17 00:00:00 2001 From: maojian <550076202@qq.com> Date: Wed, 8 Jan 2025 15:21:10 +0800 Subject: [PATCH] =?UTF-8?q?=E7=9F=A5=E7=BD=91=E9=87=87=E9=9B=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .classpath | 40 ++ .gitignore | 3 + .idea/.gitignore | 8 + .idea/compiler.xml | 17 + .idea/encodings.xml | 6 + .idea/inspectionProfiles/Project_Default.xml | 36 ++ .idea/jarRepositories.xml | 20 + .idea/libraries/Maven__cglib_cglib_nodep_2_1_3.xml | 13 + ...aven__ch_qos_logback_logback_classic_1_1_11.xml | 13 + .../Maven__ch_qos_logback_logback_core_1_1_11.xml | 13 + .../libraries/Maven__com_101tec_zkclient_0_10.xml | 13 + .../Maven__com_alibaba_fastjson_1_1_22.xml | 13 + ...Maven__com_alibaba_otter_canal_client_1_1_4.xml | 13 + .../Maven__com_fasterxml_classmate_1_3_3.xml | 13 + ...rxml_jackson_core_jackson_annotations_2_8_0.xml | 13 + ...m_fasterxml_jackson_core_jackson_core_2_8_9.xml | 13 + ...sterxml_jackson_core_jackson_databind_2_8_9.xml | 13 + ...on_dataformat_jackson_dataformat_yaml_2_8_9.xml | 13 + .../Maven__com_github_virtuald_curvesapi_1_05.xml | 13 + .../Maven__com_google_code_gson_gson_2_8_1.xml | 13 + .../Maven__com_google_guava_guava_18_0.xml | 13 + ...en__com_google_protobuf_protobuf_java_3_6_1.xml | 13 + .../libraries/Maven__com_mchange_c3p0_0_9_5_5.xml | 13 + ...en__com_mchange_mchange_commons_java_0_2_19.xml | 13 + .../Maven__com_oracle_database_ha_ons_12_2_0_1.xml | 13 + ...__com_oracle_database_ha_simplefan_12_2_0_1.xml | 13 + ...n__com_oracle_database_jdbc_ojdbc8_12_2_0_1.xml | 13 + ...aven__com_oracle_database_jdbc_ucp_12_2_0_1.xml | 13 + ...oracle_database_security_oraclepki_12_2_0_1.xml | 13 + ...oracle_database_security_osdt_cert_12_2_0_1.xml | 13 + ...oracle_database_security_osdt_core_12_2_0_1.xml | 13 + .../Maven__com_squareup_okhttp3_okhttp_3_9_1.xml | 13 + .../Maven__com_squareup_okio_okio_1_13_0.xml | 13 + 
...aven__com_yammer_metrics_metrics_core_2_2_0.xml | 13 + .../Maven__commons_cli_commons_cli_1_2.xml | 13 + .../Maven__commons_codec_commons_codec_1_10.xml | 13 + .../libraries/Maven__commons_io_commons_io_1_4.xml | 13 + .../Maven__commons_lang_commons_lang_2_6.xml | 13 + .../Maven__io_netty_netty_3_5_7_Final.xml | 13 + .../Maven__io_netty_netty_all_4_1_36_Final.xml | 13 + .../Maven__io_netty_netty_buffer_4_1_54_Final.xml | 13 + .../Maven__io_netty_netty_codec_4_1_54_Final.xml | 13 + ...aven__io_netty_netty_codec_dns_4_1_54_Final.xml | 13 + .../Maven__io_netty_netty_common_4_1_54_Final.xml | 13 + .../Maven__io_netty_netty_handler_4_1_54_Final.xml | 13 + ...Maven__io_netty_netty_resolver_4_1_54_Final.xml | 13 + ...n__io_netty_netty_resolver_dns_4_1_54_Final.xml | 13 + ...tty_tcnative_boringssl_static_1_1_33_Fork26.xml | 13 + ...aven__io_netty_netty_transport_4_1_54_Final.xml | 13 + ...o_projectreactor_reactor_core_2_0_8_RELEASE.xml | 13 + .../Maven__io_reactivex_rxjava3_rxjava_3_0_7.xml | 13 + .../Maven__javax_cache_cache_api_1_0_0.xml | 13 + ...javax_validation_validation_api_1_1_0_Final.xml | 13 + .idea/libraries/Maven__junit_junit_4_11.xml | 13 + .idea/libraries/Maven__log4j_log4j_1_2_14.xml | 13 + .../Maven__mysql_mysql_connector_java_8_0_29.xml | 13 + .../Maven__net_bytebuddy_byte_buddy_1_8_15.xml | 13 + .../Maven__net_java_dev_jna_jna_4_2_2.xml | 13 + .../Maven__net_java_dev_jna_jna_platform_4_1_0.xml | 13 + ...Maven__net_sf_jopt_simple_jopt_simple_5_0_4.xml | 13 + ...org_apache_commons_commons_collections4_4_2.xml | 13 + ...n__org_apache_commons_commons_compress_1_18.xml | 13 + .../Maven__org_apache_commons_commons_exec_1_3.xml | 13 + ...ven__org_apache_commons_commons_lang3_3_8_1.xml | 13 + ...ven__org_apache_commons_commons_math3_3_6_1.xml | 13 + ...ven__org_apache_commons_commons_pool2_2_4_2.xml | 13 + ..._org_apache_httpcomponents_httpclient_4_5_3.xml | 13 + ...n__org_apache_httpcomponents_httpcore_4_4_6.xml | 13 + 
.../Maven__org_apache_kafka_kafka_2_12_1_0_2.xml | 13 + ...Maven__org_apache_kafka_kafka_clients_2_0_1.xml | 13 + .../libraries/Maven__org_apache_poi_poi_4_0_1.xml | 13 + .../Maven__org_apache_poi_poi_ooxml_4_0_1.xml | 13 + ...ven__org_apache_poi_poi_ooxml_schemas_4_0_1.xml | 13 + ...ven__org_apache_rocketmq_rocketmq_acl_4_5_2.xml | 13 + ...__org_apache_rocketmq_rocketmq_client_4_5_2.xml | 13 + ...__org_apache_rocketmq_rocketmq_common_4_5_2.xml | 13 + ..._org_apache_rocketmq_rocketmq_logging_4_5_2.xml | 13 + ...org_apache_rocketmq_rocketmq_remoting_4_5_2.xml | 13 + ..._org_apache_rocketmq_rocketmq_srvutil_4_5_2.xml | 13 + ...pache_tomcat_embed_tomcat_embed_core_8_5_16.xml | 13 + ..._apache_tomcat_embed_tomcat_embed_el_8_5_16.xml | 13 + ..._tomcat_embed_tomcat_embed_websocket_8_5_16.xml | 13 + .../Maven__org_apache_xmlbeans_xmlbeans_3_0_2.xml | 13 + ...Maven__org_apache_zookeeper_zookeeper_3_4_5.xml | 13 + .../Maven__org_hamcrest_hamcrest_core_1_3.xml | 13 + ...g_hibernate_hibernate_validator_5_3_5_Final.xml | 13 + ...org_jboss_logging_jboss_logging_3_3_1_Final.xml | 13 + ..._marshalling_jboss_marshalling_2_0_10_Final.xml | 13 + ...alling_jboss_marshalling_river_2_0_10_Final.xml | 13 + .../Maven__org_jboss_netty_netty_3_2_2_Final.xml | 13 + .../libraries/Maven__org_jodd_jodd_bean_5_1_6.xml | 13 + .../libraries/Maven__org_jodd_jodd_core_5_1_6.xml | 13 + .idea/libraries/Maven__org_jsoup_jsoup_1_8_1.xml | 13 + .idea/libraries/Maven__org_lz4_lz4_java_1_4_1.xml | 13 + .../Maven__org_projectlombok_lombok_1_16_18.xml | 13 + ..._org_reactivestreams_reactive_streams_1_0_0.xml | 13 + .../Maven__org_redisson_redisson_3_14_1.xml | 13 + .../Maven__org_scala_lang_scala_library_2_12_4.xml | 13 + ...org_seleniumhq_selenium_selenium_api_2_53_1.xml | 13 + ...umhq_selenium_selenium_chrome_driver_2_53_1.xml | 13 + ...umhq_selenium_selenium_edge_driver_3_141_59.xml | 13 + ...mhq_selenium_selenium_firefox_driver_2_53_1.xml | 13 + ...leniumhq_selenium_selenium_ie_driver_2_53_1.xml | 13 + 
..._seleniumhq_selenium_selenium_java_3_141_59.xml | 13 + ...mhq_selenium_selenium_opera_driver_3_141_59.xml | 13 + ...umhq_selenium_selenium_remote_driver_2_53_1.xml | 13 + ...umhq_selenium_selenium_safari_driver_2_53_1.xml | 13 + ...seleniumhq_selenium_selenium_support_2_53_1.xml | 13 + .../Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml | 13 + .../Maven__org_slf4j_jul_to_slf4j_1_7_25.xml | 13 + .../Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml | 13 + .../Maven__org_slf4j_slf4j_api_1_7_25.xml | 13 + .../Maven__org_slf4j_slf4j_log4j12_1_7_25.xml | 13 + ...ingframework_boot_spring_boot_1_5_6_RELEASE.xml | 13 + ...oot_spring_boot_autoconfigure_1_5_6_RELEASE.xml | 13 + ...work_boot_spring_boot_starter_1_5_6_RELEASE.xml | 13 + ...pring_boot_starter_data_redis_1_5_6_RELEASE.xml | 13 + ...t_spring_boot_starter_logging_1_5_6_RELEASE.xml | 13 + ...ot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml | 13 + ..._boot_spring_boot_starter_web_1_5_6_RELEASE.xml | 13 + ...ork_data_spring_data_commons_1_13_6_RELEASE.xml | 13 + ...ork_data_spring_data_keyvalue_1_2_6_RELEASE.xml | 13 + ...mework_data_spring_data_redis_1_8_6_RELEASE.xml | 13 + ...g_springframework_spring_aop_4_3_10_RELEASE.xml | 13 + ...springframework_spring_beans_4_3_10_RELEASE.xml | 13 + ...ringframework_spring_context_4_3_10_RELEASE.xml | 13 + ...ework_spring_context_support_4_3_10_RELEASE.xml | 13 + ..._springframework_spring_core_4_3_10_RELEASE.xml | 13 + ...gframework_spring_expression_4_3_10_RELEASE.xml | 13 + ..._springframework_spring_jdbc_4_3_10_RELEASE.xml | 13 + ...g_springframework_spring_orm_4_3_10_RELEASE.xml | 13 + ...g_springframework_spring_oxm_4_3_10_RELEASE.xml | 13 + ...rg_springframework_spring_tx_4_3_10_RELEASE.xml | 13 + ...g_springframework_spring_web_4_3_10_RELEASE.xml | 13 + ...pringframework_spring_webmvc_4_3_10_RELEASE.xml | 13 + ...aven__org_xerial_snappy_snappy_java_1_1_7_1.xml | 13 + .idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml | 13 + .../libraries/Maven__redis_clients_jedis_2_9_0.xml | 
13 + .idea/misc.xml | 11 + .idea/modules.xml | 8 + .idea/uiDesigner.xml | 124 +++++ .idea/vcs.xml | 6 + .project | 23 + .settings/org.eclipse.core.resources.prefs | 7 + .settings/org.eclipse.jdt.core.prefs | 8 + .settings/org.eclipse.m2e.core.prefs | 4 + README.md | 1 + cnki_crawl.iml | 169 ++++++ config.properties | 15 + dddd_ocr.py | 28 + pom.xml | 250 +++++++++ src/main/java/com/bfd/cnki/App.java | 13 + .../java/com/bfd/cnki/crawl/cache/ConfigCache.java | 26 + .../java/com/bfd/cnki/crawl/entity/Constants.java | 62 +++ .../bfd/cnki/crawl/process/CnkiCrawlServer.java | 247 +++++++++ .../bfd/cnki/crawl/process/ConditionalClick.java | 149 +++++ .../bfd/cnki/crawl/process/KyyzContentParse.java | 609 +++++++++++++++++++++ .../com/bfd/cnki/crawl/process/KyyzProcess.java | 15 + .../bfd/cnki/crawl/process/KyyzProcessImpl.java | 322 +++++++++++ .../cnki/crawl/process/KyyzProcessImplTest.java | 361 ++++++++++++ .../java/com/bfd/cnki/crawl/process/ListParse.java | 105 ++++ .../bfd/cnki/crawl/process/OilContentParse.java | 230 ++++++++ .../com/bfd/cnki/crawl/process/OilProcess.java | 13 + .../com/bfd/cnki/crawl/process/OilProcessImpl.java | 482 ++++++++++++++++ .../bfd/cnki/crawl/process/SfgzContentParse.java | 580 ++++++++++++++++++++ .../com/bfd/cnki/crawl/process/SfgzProcess.java | 12 + .../bfd/cnki/crawl/process/SfgzProcessImpl.java | 387 +++++++++++++ src/main/java/com/bfd/cnki/crawl/test/Demo.java | 203 +++++++ .../com/bfd/cnki/crawl/test/TestListDownload.java | 79 +++ .../java/com/bfd/cnki/crawl/util/DateUtil.java | 240 ++++++++ .../com/bfd/cnki/crawl/util/DesDecryption.java | 92 ++++ .../bfd/cnki/crawl/util/DownlodContentHtml.java | 350 ++++++++++++ .../com/bfd/cnki/crawl/util/EjdDownloadHtml.java | 460 ++++++++++++++++ .../java/com/bfd/cnki/crawl/util/GetDriver.java | 23 + .../java/com/bfd/cnki/crawl/util/Kafkautils.java | 45 ++ .../bfd/cnki/crawl/util/ParametricAssembly.java | 145 +++++ .../java/com/bfd/cnki/crawl/util/PicCheckUtil.java | 75 +++ 
.../java/com/bfd/cnki/crawl/util/QueueUtils.java | 19 + .../java/com/bfd/cnki/crawl/util/SeleniumTest.java | 64 +++ src/main/java/com/bfd/cnki/crawl/util/Test.java | 27 + .../com/bfd/cnki/crawl/util/ThrowMessageUtil.java | 18 + src/main/java/com/bfd/cnki/crawl/util/UseDb.java | 194 +++++++ src/main/java/com/bfd/cnki/main/Application.java | 169 ++++++ src/main/resources/application.properties | 6 + src/main/resources/banner.txt | 20 + src/main/resources/logback-spring.xml | 38 ++ src/test/java/com/bfd/cnki/AppTest.java | 20 + 187 files changed, 8387 insertions(+) create mode 100644 .classpath create mode 100644 .gitignore create mode 100644 .idea/.gitignore create mode 100644 .idea/compiler.xml create mode 100644 .idea/encodings.xml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/jarRepositories.xml create mode 100644 .idea/libraries/Maven__cglib_cglib_nodep_2_1_3.xml create mode 100644 .idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml create mode 100644 .idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml create mode 100644 .idea/libraries/Maven__com_101tec_zkclient_0_10.xml create mode 100644 .idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml create mode 100644 .idea/libraries/Maven__com_alibaba_otter_canal_client_1_1_4.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_8_0.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_8_9.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_8_9.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_jackson_dataformat_jackson_dataformat_yaml_2_8_9.xml create mode 100644 .idea/libraries/Maven__com_github_virtuald_curvesapi_1_05.xml create mode 100644 .idea/libraries/Maven__com_google_code_gson_gson_2_8_1.xml create mode 100644 
.idea/libraries/Maven__com_google_guava_guava_18_0.xml create mode 100644 .idea/libraries/Maven__com_google_protobuf_protobuf_java_3_6_1.xml create mode 100644 .idea/libraries/Maven__com_mchange_c3p0_0_9_5_5.xml create mode 100644 .idea/libraries/Maven__com_mchange_mchange_commons_java_0_2_19.xml create mode 100644 .idea/libraries/Maven__com_oracle_database_ha_ons_12_2_0_1.xml create mode 100644 .idea/libraries/Maven__com_oracle_database_ha_simplefan_12_2_0_1.xml create mode 100644 .idea/libraries/Maven__com_oracle_database_jdbc_ojdbc8_12_2_0_1.xml create mode 100644 .idea/libraries/Maven__com_oracle_database_jdbc_ucp_12_2_0_1.xml create mode 100644 .idea/libraries/Maven__com_oracle_database_security_oraclepki_12_2_0_1.xml create mode 100644 .idea/libraries/Maven__com_oracle_database_security_osdt_cert_12_2_0_1.xml create mode 100644 .idea/libraries/Maven__com_oracle_database_security_osdt_core_12_2_0_1.xml create mode 100644 .idea/libraries/Maven__com_squareup_okhttp3_okhttp_3_9_1.xml create mode 100644 .idea/libraries/Maven__com_squareup_okio_okio_1_13_0.xml create mode 100644 .idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml create mode 100644 .idea/libraries/Maven__commons_cli_commons_cli_1_2.xml create mode 100644 .idea/libraries/Maven__commons_codec_commons_codec_1_10.xml create mode 100644 .idea/libraries/Maven__commons_io_commons_io_1_4.xml create mode 100644 .idea/libraries/Maven__commons_lang_commons_lang_2_6.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_3_5_7_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_all_4_1_36_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_buffer_4_1_54_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_codec_4_1_54_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_codec_dns_4_1_54_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_common_4_1_54_Final.xml create mode 100644 
.idea/libraries/Maven__io_netty_netty_handler_4_1_54_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_resolver_4_1_54_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_resolver_dns_4_1_54_Final.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_tcnative_boringssl_static_1_1_33_Fork26.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_transport_4_1_54_Final.xml create mode 100644 .idea/libraries/Maven__io_projectreactor_reactor_core_2_0_8_RELEASE.xml create mode 100644 .idea/libraries/Maven__io_reactivex_rxjava3_rxjava_3_0_7.xml create mode 100644 .idea/libraries/Maven__javax_cache_cache_api_1_0_0.xml create mode 100644 .idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml create mode 100644 .idea/libraries/Maven__junit_junit_4_11.xml create mode 100644 .idea/libraries/Maven__log4j_log4j_1_2_14.xml create mode 100644 .idea/libraries/Maven__mysql_mysql_connector_java_8_0_29.xml create mode 100644 .idea/libraries/Maven__net_bytebuddy_byte_buddy_1_8_15.xml create mode 100644 .idea/libraries/Maven__net_java_dev_jna_jna_4_2_2.xml create mode 100644 .idea/libraries/Maven__net_java_dev_jna_jna_platform_4_1_0.xml create mode 100644 .idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_5_0_4.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_collections4_4_2.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_compress_1_18.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_exec_1_3.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_lang3_3_8_1.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_math3_3_6_1.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_pool2_2_4_2.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_3.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_6.xml create mode 100644 
.idea/libraries/Maven__org_apache_kafka_kafka_2_12_1_0_2.xml create mode 100644 .idea/libraries/Maven__org_apache_kafka_kafka_clients_2_0_1.xml create mode 100644 .idea/libraries/Maven__org_apache_poi_poi_4_0_1.xml create mode 100644 .idea/libraries/Maven__org_apache_poi_poi_ooxml_4_0_1.xml create mode 100644 .idea/libraries/Maven__org_apache_poi_poi_ooxml_schemas_4_0_1.xml create mode 100644 .idea/libraries/Maven__org_apache_rocketmq_rocketmq_acl_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_apache_rocketmq_rocketmq_client_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_apache_rocketmq_rocketmq_common_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_apache_rocketmq_rocketmq_logging_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_apache_rocketmq_rocketmq_remoting_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_apache_rocketmq_rocketmq_srvutil_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml create mode 100644 .idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml create mode 100644 .idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml create mode 100644 .idea/libraries/Maven__org_apache_xmlbeans_xmlbeans_3_0_2.xml create mode 100644 .idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5.xml create mode 100644 .idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml create mode 100644 .idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml create mode 100644 .idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml create mode 100644 .idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_2_0_10_Final.xml create mode 100644 .idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_river_2_0_10_Final.xml create mode 100644 .idea/libraries/Maven__org_jboss_netty_netty_3_2_2_Final.xml create mode 100644 .idea/libraries/Maven__org_jodd_jodd_bean_5_1_6.xml create mode 100644 
.idea/libraries/Maven__org_jodd_jodd_core_5_1_6.xml create mode 100644 .idea/libraries/Maven__org_jsoup_jsoup_1_8_1.xml create mode 100644 .idea/libraries/Maven__org_lz4_lz4_java_1_4_1.xml create mode 100644 .idea/libraries/Maven__org_projectlombok_lombok_1_16_18.xml create mode 100644 .idea/libraries/Maven__org_reactivestreams_reactive_streams_1_0_0.xml create mode 100644 .idea/libraries/Maven__org_redisson_redisson_3_14_1.xml create mode 100644 .idea/libraries/Maven__org_scala_lang_scala_library_2_12_4.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_api_2_53_1.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_chrome_driver_2_53_1.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_edge_driver_3_141_59.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_firefox_driver_2_53_1.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_ie_driver_2_53_1.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_java_3_141_59.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_opera_driver_3_141_59.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_remote_driver_2_53_1.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_safari_driver_2_53_1.xml create mode 100644 .idea/libraries/Maven__org_seleniumhq_selenium_selenium_support_2_53_1.xml create mode 100644 .idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_slf4j_slf4j_api_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml create mode 100644 
.idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_data_redis_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_data_spring_data_commons_1_13_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_data_spring_data_keyvalue_1_2_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_data_spring_data_redis_1_8_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_context_support_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_jdbc_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_orm_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_oxm_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_tx_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml 
create mode 100644 .idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_7_1.xml create mode 100644 .idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml create mode 100644 .idea/libraries/Maven__redis_clients_jedis_2_9_0.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/uiDesigner.xml create mode 100644 .idea/vcs.xml create mode 100644 .project create mode 100644 .settings/org.eclipse.core.resources.prefs create mode 100644 .settings/org.eclipse.jdt.core.prefs create mode 100644 .settings/org.eclipse.m2e.core.prefs create mode 100644 README.md create mode 100644 cnki_crawl.iml create mode 100644 config.properties create mode 100644 dddd_ocr.py create mode 100644 pom.xml create mode 100644 src/main/java/com/bfd/cnki/App.java create mode 100644 src/main/java/com/bfd/cnki/crawl/cache/ConfigCache.java create mode 100644 src/main/java/com/bfd/cnki/crawl/entity/Constants.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/CnkiCrawlServer.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/ConditionalClick.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/KyyzContentParse.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/KyyzProcess.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImpl.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImplTest.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/ListParse.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/OilContentParse.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/OilProcess.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/OilProcessImpl.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/SfgzContentParse.java create mode 100644 src/main/java/com/bfd/cnki/crawl/process/SfgzProcess.java create mode 
100644 src/main/java/com/bfd/cnki/crawl/process/SfgzProcessImpl.java create mode 100644 src/main/java/com/bfd/cnki/crawl/test/Demo.java create mode 100644 src/main/java/com/bfd/cnki/crawl/test/TestListDownload.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/DateUtil.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/DesDecryption.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/DownlodContentHtml.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/EjdDownloadHtml.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/GetDriver.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/Kafkautils.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/ParametricAssembly.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/PicCheckUtil.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/QueueUtils.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/SeleniumTest.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/Test.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/ThrowMessageUtil.java create mode 100644 src/main/java/com/bfd/cnki/crawl/util/UseDb.java create mode 100644 src/main/java/com/bfd/cnki/main/Application.java create mode 100644 src/main/resources/application.properties create mode 100644 src/main/resources/banner.txt create mode 100644 src/main/resources/logback-spring.xml create mode 100644 src/test/java/com/bfd/cnki/AppTest.java diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..1a0a8d2 --- /dev/null +++ b/.classpath @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..26c4637 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target/ +/logs/ +/target/ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default 
ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..920839d --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..63e9001 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..6560a98 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,36 @@ + + + + \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 0000000..712ab9d --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__cglib_cglib_nodep_2_1_3.xml b/.idea/libraries/Maven__cglib_cglib_nodep_2_1_3.xml new file mode 100644 index 0000000..55692b9 --- /dev/null +++ b/.idea/libraries/Maven__cglib_cglib_nodep_2_1_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml b/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml new file mode 100644 index 0000000..c6ea67e --- /dev/null +++ b/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml b/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml new file mode 100644 index 0000000..f538e36 --- /dev/null +++ b/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml @@ -0,0 
+1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_101tec_zkclient_0_10.xml b/.idea/libraries/Maven__com_101tec_zkclient_0_10.xml new file mode 100644 index 0000000..a2b08a2 --- /dev/null +++ b/.idea/libraries/Maven__com_101tec_zkclient_0_10.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml b/.idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml new file mode 100644 index 0000000..24e5ac0 --- /dev/null +++ b/.idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_alibaba_otter_canal_client_1_1_4.xml b/.idea/libraries/Maven__com_alibaba_otter_canal_client_1_1_4.xml new file mode 100644 index 0000000..51eedf1 --- /dev/null +++ b/.idea/libraries/Maven__com_alibaba_otter_canal_client_1_1_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml b/.idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml new file mode 100644 index 0000000..e1e9ace --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_8_0.xml b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_8_0.xml new file mode 100644 index 0000000..49b4ec7 --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_8_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_8_9.xml b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_8_9.xml new file mode 100644 index 0000000..0e50bdf --- /dev/null +++ 
b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_8_9.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_8_9.xml b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_8_9.xml new file mode 100644 index 0000000..879b200 --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_8_9.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_jackson_dataformat_jackson_dataformat_yaml_2_8_9.xml b/.idea/libraries/Maven__com_fasterxml_jackson_dataformat_jackson_dataformat_yaml_2_8_9.xml new file mode 100644 index 0000000..1b8e09b --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_jackson_dataformat_jackson_dataformat_yaml_2_8_9.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_github_virtuald_curvesapi_1_05.xml b/.idea/libraries/Maven__com_github_virtuald_curvesapi_1_05.xml new file mode 100644 index 0000000..d18709e --- /dev/null +++ b/.idea/libraries/Maven__com_github_virtuald_curvesapi_1_05.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_code_gson_gson_2_8_1.xml b/.idea/libraries/Maven__com_google_code_gson_gson_2_8_1.xml new file mode 100644 index 0000000..ef489bc --- /dev/null +++ b/.idea/libraries/Maven__com_google_code_gson_gson_2_8_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_guava_guava_18_0.xml b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml new file mode 100644 index 0000000..bbd71d7 --- /dev/null +++ b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__com_google_protobuf_protobuf_java_3_6_1.xml b/.idea/libraries/Maven__com_google_protobuf_protobuf_java_3_6_1.xml new file mode 100644 index 0000000..233670d --- /dev/null +++ b/.idea/libraries/Maven__com_google_protobuf_protobuf_java_3_6_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_mchange_c3p0_0_9_5_5.xml b/.idea/libraries/Maven__com_mchange_c3p0_0_9_5_5.xml new file mode 100644 index 0000000..4831186 --- /dev/null +++ b/.idea/libraries/Maven__com_mchange_c3p0_0_9_5_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_mchange_mchange_commons_java_0_2_19.xml b/.idea/libraries/Maven__com_mchange_mchange_commons_java_0_2_19.xml new file mode 100644 index 0000000..b5ee94d --- /dev/null +++ b/.idea/libraries/Maven__com_mchange_mchange_commons_java_0_2_19.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_oracle_database_ha_ons_12_2_0_1.xml b/.idea/libraries/Maven__com_oracle_database_ha_ons_12_2_0_1.xml new file mode 100644 index 0000000..c113d31 --- /dev/null +++ b/.idea/libraries/Maven__com_oracle_database_ha_ons_12_2_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_oracle_database_ha_simplefan_12_2_0_1.xml b/.idea/libraries/Maven__com_oracle_database_ha_simplefan_12_2_0_1.xml new file mode 100644 index 0000000..23ed94a --- /dev/null +++ b/.idea/libraries/Maven__com_oracle_database_ha_simplefan_12_2_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_oracle_database_jdbc_ojdbc8_12_2_0_1.xml b/.idea/libraries/Maven__com_oracle_database_jdbc_ojdbc8_12_2_0_1.xml new file mode 100644 index 0000000..25b447d --- /dev/null +++ b/.idea/libraries/Maven__com_oracle_database_jdbc_ojdbc8_12_2_0_1.xml @@ -0,0 +1,13 @@ + + + 
+ + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_oracle_database_jdbc_ucp_12_2_0_1.xml b/.idea/libraries/Maven__com_oracle_database_jdbc_ucp_12_2_0_1.xml new file mode 100644 index 0000000..e043f95 --- /dev/null +++ b/.idea/libraries/Maven__com_oracle_database_jdbc_ucp_12_2_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_oracle_database_security_oraclepki_12_2_0_1.xml b/.idea/libraries/Maven__com_oracle_database_security_oraclepki_12_2_0_1.xml new file mode 100644 index 0000000..c5e2148 --- /dev/null +++ b/.idea/libraries/Maven__com_oracle_database_security_oraclepki_12_2_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_oracle_database_security_osdt_cert_12_2_0_1.xml b/.idea/libraries/Maven__com_oracle_database_security_osdt_cert_12_2_0_1.xml new file mode 100644 index 0000000..86c003d --- /dev/null +++ b/.idea/libraries/Maven__com_oracle_database_security_osdt_cert_12_2_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_oracle_database_security_osdt_core_12_2_0_1.xml b/.idea/libraries/Maven__com_oracle_database_security_osdt_core_12_2_0_1.xml new file mode 100644 index 0000000..9d0bacf --- /dev/null +++ b/.idea/libraries/Maven__com_oracle_database_security_osdt_core_12_2_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_squareup_okhttp3_okhttp_3_9_1.xml b/.idea/libraries/Maven__com_squareup_okhttp3_okhttp_3_9_1.xml new file mode 100644 index 0000000..ba9cb81 --- /dev/null +++ b/.idea/libraries/Maven__com_squareup_okhttp3_okhttp_3_9_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_squareup_okio_okio_1_13_0.xml b/.idea/libraries/Maven__com_squareup_okio_okio_1_13_0.xml new file mode 
100644 index 0000000..cb99e2f --- /dev/null +++ b/.idea/libraries/Maven__com_squareup_okio_okio_1_13_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml b/.idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml new file mode 100644 index 0000000..fc159c5 --- /dev/null +++ b/.idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml b/.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml new file mode 100644 index 0000000..cec2493 --- /dev/null +++ b/.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_codec_commons_codec_1_10.xml b/.idea/libraries/Maven__commons_codec_commons_codec_1_10.xml new file mode 100644 index 0000000..27424a1 --- /dev/null +++ b/.idea/libraries/Maven__commons_codec_commons_codec_1_10.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_io_commons_io_1_4.xml b/.idea/libraries/Maven__commons_io_commons_io_1_4.xml new file mode 100644 index 0000000..054eda8 --- /dev/null +++ b/.idea/libraries/Maven__commons_io_commons_io_1_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_lang_commons_lang_2_6.xml b/.idea/libraries/Maven__commons_lang_commons_lang_2_6.xml new file mode 100644 index 0000000..2ec8376 --- /dev/null +++ b/.idea/libraries/Maven__commons_lang_commons_lang_2_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_3_5_7_Final.xml b/.idea/libraries/Maven__io_netty_netty_3_5_7_Final.xml new file mode 100644 index 0000000..fee6f09 --- /dev/null +++ 
b/.idea/libraries/Maven__io_netty_netty_3_5_7_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_all_4_1_36_Final.xml b/.idea/libraries/Maven__io_netty_netty_all_4_1_36_Final.xml new file mode 100644 index 0000000..f1d5d7c --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_all_4_1_36_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_buffer_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_buffer_4_1_54_Final.xml new file mode 100644 index 0000000..f0fcfbc --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_buffer_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_codec_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_codec_4_1_54_Final.xml new file mode 100644 index 0000000..b9b5f75 --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_codec_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_codec_dns_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_codec_dns_4_1_54_Final.xml new file mode 100644 index 0000000..abfb0a3 --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_codec_dns_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_common_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_common_4_1_54_Final.xml new file mode 100644 index 0000000..df27ddc --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_common_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_handler_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_handler_4_1_54_Final.xml new file mode 100644 index 0000000..b874cd6 --- 
/dev/null +++ b/.idea/libraries/Maven__io_netty_netty_handler_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_resolver_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_resolver_4_1_54_Final.xml new file mode 100644 index 0000000..0026ccb --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_resolver_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_resolver_dns_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_resolver_dns_4_1_54_Final.xml new file mode 100644 index 0000000..08001ef --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_resolver_dns_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_tcnative_boringssl_static_1_1_33_Fork26.xml b/.idea/libraries/Maven__io_netty_netty_tcnative_boringssl_static_1_1_33_Fork26.xml new file mode 100644 index 0000000..0230ffc --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_tcnative_boringssl_static_1_1_33_Fork26.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_transport_4_1_54_Final.xml b/.idea/libraries/Maven__io_netty_netty_transport_4_1_54_Final.xml new file mode 100644 index 0000000..5fc96fd --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_transport_4_1_54_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_projectreactor_reactor_core_2_0_8_RELEASE.xml b/.idea/libraries/Maven__io_projectreactor_reactor_core_2_0_8_RELEASE.xml new file mode 100644 index 0000000..a6a5387 --- /dev/null +++ b/.idea/libraries/Maven__io_projectreactor_reactor_core_2_0_8_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__io_reactivex_rxjava3_rxjava_3_0_7.xml b/.idea/libraries/Maven__io_reactivex_rxjava3_rxjava_3_0_7.xml new file mode 100644 index 0000000..02dc925 --- /dev/null +++ b/.idea/libraries/Maven__io_reactivex_rxjava3_rxjava_3_0_7.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_cache_cache_api_1_0_0.xml b/.idea/libraries/Maven__javax_cache_cache_api_1_0_0.xml new file mode 100644 index 0000000..4ff335f --- /dev/null +++ b/.idea/libraries/Maven__javax_cache_cache_api_1_0_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml b/.idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml new file mode 100644 index 0000000..940ce73 --- /dev/null +++ b/.idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__junit_junit_4_11.xml b/.idea/libraries/Maven__junit_junit_4_11.xml new file mode 100644 index 0000000..f33320d --- /dev/null +++ b/.idea/libraries/Maven__junit_junit_4_11.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__log4j_log4j_1_2_14.xml b/.idea/libraries/Maven__log4j_log4j_1_2_14.xml new file mode 100644 index 0000000..2825a67 --- /dev/null +++ b/.idea/libraries/Maven__log4j_log4j_1_2_14.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__mysql_mysql_connector_java_8_0_29.xml b/.idea/libraries/Maven__mysql_mysql_connector_java_8_0_29.xml new file mode 100644 index 0000000..10a3857 --- /dev/null +++ b/.idea/libraries/Maven__mysql_mysql_connector_java_8_0_29.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_bytebuddy_byte_buddy_1_8_15.xml 
b/.idea/libraries/Maven__net_bytebuddy_byte_buddy_1_8_15.xml new file mode 100644 index 0000000..c7e234c --- /dev/null +++ b/.idea/libraries/Maven__net_bytebuddy_byte_buddy_1_8_15.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_java_dev_jna_jna_4_2_2.xml b/.idea/libraries/Maven__net_java_dev_jna_jna_4_2_2.xml new file mode 100644 index 0000000..9cedeae --- /dev/null +++ b/.idea/libraries/Maven__net_java_dev_jna_jna_4_2_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_java_dev_jna_jna_platform_4_1_0.xml b/.idea/libraries/Maven__net_java_dev_jna_jna_platform_4_1_0.xml new file mode 100644 index 0000000..148c52a --- /dev/null +++ b/.idea/libraries/Maven__net_java_dev_jna_jna_platform_4_1_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_5_0_4.xml b/.idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_5_0_4.xml new file mode 100644 index 0000000..1ef5173 --- /dev/null +++ b/.idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_5_0_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_collections4_4_2.xml b/.idea/libraries/Maven__org_apache_commons_commons_collections4_4_2.xml new file mode 100644 index 0000000..1779477 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_collections4_4_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_compress_1_18.xml b/.idea/libraries/Maven__org_apache_commons_commons_compress_1_18.xml new file mode 100644 index 0000000..cdc29f4 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_compress_1_18.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__org_apache_commons_commons_exec_1_3.xml b/.idea/libraries/Maven__org_apache_commons_commons_exec_1_3.xml new file mode 100644 index 0000000..e064a82 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_exec_1_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_lang3_3_8_1.xml b/.idea/libraries/Maven__org_apache_commons_commons_lang3_3_8_1.xml new file mode 100644 index 0000000..33b78e9 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_lang3_3_8_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_math3_3_6_1.xml b/.idea/libraries/Maven__org_apache_commons_commons_math3_3_6_1.xml new file mode 100644 index 0000000..ebfe0a8 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_math3_3_6_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_4_2.xml b/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_4_2.xml new file mode 100644 index 0000000..2970b5e --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_4_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_3.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_3.xml new file mode 100644 index 0000000..c82bb7c --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_6.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_6.xml new file mode 100644 index 0000000..a9f6e19 --- /dev/null +++ 
b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_2_12_1_0_2.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_2_12_1_0_2.xml new file mode 100644 index 0000000..2957338 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_kafka_kafka_2_12_1_0_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_clients_2_0_1.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_clients_2_0_1.xml new file mode 100644 index 0000000..56bbf8a --- /dev/null +++ b/.idea/libraries/Maven__org_apache_kafka_kafka_clients_2_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_poi_poi_4_0_1.xml b/.idea/libraries/Maven__org_apache_poi_poi_4_0_1.xml new file mode 100644 index 0000000..84c04a4 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_poi_poi_4_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_poi_poi_ooxml_4_0_1.xml b/.idea/libraries/Maven__org_apache_poi_poi_ooxml_4_0_1.xml new file mode 100644 index 0000000..65821fb --- /dev/null +++ b/.idea/libraries/Maven__org_apache_poi_poi_ooxml_4_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_poi_poi_ooxml_schemas_4_0_1.xml b/.idea/libraries/Maven__org_apache_poi_poi_ooxml_schemas_4_0_1.xml new file mode 100644 index 0000000..8ffe78f --- /dev/null +++ b/.idea/libraries/Maven__org_apache_poi_poi_ooxml_schemas_4_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_acl_4_5_2.xml b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_acl_4_5_2.xml new file mode 100644 index 0000000..63e10c4 --- /dev/null 
+++ b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_acl_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_client_4_5_2.xml b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_client_4_5_2.xml new file mode 100644 index 0000000..6b4881b --- /dev/null +++ b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_client_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_common_4_5_2.xml b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_common_4_5_2.xml new file mode 100644 index 0000000..22ddf44 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_common_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_logging_4_5_2.xml b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_logging_4_5_2.xml new file mode 100644 index 0000000..40ae89e --- /dev/null +++ b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_logging_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_remoting_4_5_2.xml b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_remoting_4_5_2.xml new file mode 100644 index 0000000..80f5479 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_remoting_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_srvutil_4_5_2.xml b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_srvutil_4_5_2.xml new file mode 100644 index 0000000..f970ae2 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_rocketmq_rocketmq_srvutil_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml new file mode 100644 index 0000000..97aab57 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml new file mode 100644 index 0000000..92d54fb --- /dev/null +++ b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml new file mode 100644 index 0000000..b43f570 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_xmlbeans_xmlbeans_3_0_2.xml b/.idea/libraries/Maven__org_apache_xmlbeans_xmlbeans_3_0_2.xml new file mode 100644 index 0000000..566350f --- /dev/null +++ b/.idea/libraries/Maven__org_apache_xmlbeans_xmlbeans_3_0_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5.xml b/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5.xml new file mode 100644 index 0000000..3019b0a --- /dev/null +++ b/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml b/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml new file mode 100644 index 0000000..f58bbc1 --- /dev/null 
+++ b/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml b/.idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml new file mode 100644 index 0000000..4bd0591 --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml b/.idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml new file mode 100644 index 0000000..6dc7a37 --- /dev/null +++ b/.idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_2_0_10_Final.xml b/.idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_2_0_10_Final.xml new file mode 100644 index 0000000..59e5d7c --- /dev/null +++ b/.idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_2_0_10_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_river_2_0_10_Final.xml b/.idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_river_2_0_10_Final.xml new file mode 100644 index 0000000..d7337a4 --- /dev/null +++ b/.idea/libraries/Maven__org_jboss_marshalling_jboss_marshalling_river_2_0_10_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jboss_netty_netty_3_2_2_Final.xml b/.idea/libraries/Maven__org_jboss_netty_netty_3_2_2_Final.xml new file mode 100644 index 0000000..a5ce0d2 --- /dev/null +++ b/.idea/libraries/Maven__org_jboss_netty_netty_3_2_2_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file 
diff --git a/.idea/libraries/Maven__org_jodd_jodd_bean_5_1_6.xml b/.idea/libraries/Maven__org_jodd_jodd_bean_5_1_6.xml new file mode 100644 index 0000000..38aaf15 --- /dev/null +++ b/.idea/libraries/Maven__org_jodd_jodd_bean_5_1_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jodd_jodd_core_5_1_6.xml b/.idea/libraries/Maven__org_jodd_jodd_core_5_1_6.xml new file mode 100644 index 0000000..97d32c6 --- /dev/null +++ b/.idea/libraries/Maven__org_jodd_jodd_core_5_1_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jsoup_jsoup_1_8_1.xml b/.idea/libraries/Maven__org_jsoup_jsoup_1_8_1.xml new file mode 100644 index 0000000..90eb1bc --- /dev/null +++ b/.idea/libraries/Maven__org_jsoup_jsoup_1_8_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_lz4_lz4_java_1_4_1.xml b/.idea/libraries/Maven__org_lz4_lz4_java_1_4_1.xml new file mode 100644 index 0000000..51c2c7d --- /dev/null +++ b/.idea/libraries/Maven__org_lz4_lz4_java_1_4_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_projectlombok_lombok_1_16_18.xml b/.idea/libraries/Maven__org_projectlombok_lombok_1_16_18.xml new file mode 100644 index 0000000..8503cb4 --- /dev/null +++ b/.idea/libraries/Maven__org_projectlombok_lombok_1_16_18.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_reactivestreams_reactive_streams_1_0_0.xml b/.idea/libraries/Maven__org_reactivestreams_reactive_streams_1_0_0.xml new file mode 100644 index 0000000..afef071 --- /dev/null +++ b/.idea/libraries/Maven__org_reactivestreams_reactive_streams_1_0_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_redisson_redisson_3_14_1.xml 
b/.idea/libraries/Maven__org_redisson_redisson_3_14_1.xml new file mode 100644 index 0000000..b19b616 --- /dev/null +++ b/.idea/libraries/Maven__org_redisson_redisson_3_14_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_scala_lang_scala_library_2_12_4.xml b/.idea/libraries/Maven__org_scala_lang_scala_library_2_12_4.xml new file mode 100644 index 0000000..bfe26d5 --- /dev/null +++ b/.idea/libraries/Maven__org_scala_lang_scala_library_2_12_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_api_2_53_1.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_api_2_53_1.xml new file mode 100644 index 0000000..d464f52 --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_api_2_53_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_chrome_driver_2_53_1.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_chrome_driver_2_53_1.xml new file mode 100644 index 0000000..2a3b7b3 --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_chrome_driver_2_53_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_edge_driver_3_141_59.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_edge_driver_3_141_59.xml new file mode 100644 index 0000000..ecf2da2 --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_edge_driver_3_141_59.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_firefox_driver_2_53_1.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_firefox_driver_2_53_1.xml new file mode 100644 index 0000000..443d041 --- /dev/null +++ 
b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_firefox_driver_2_53_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_ie_driver_2_53_1.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_ie_driver_2_53_1.xml new file mode 100644 index 0000000..aa4deef --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_ie_driver_2_53_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_java_3_141_59.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_java_3_141_59.xml new file mode 100644 index 0000000..bd12142 --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_java_3_141_59.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_opera_driver_3_141_59.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_opera_driver_3_141_59.xml new file mode 100644 index 0000000..e3736ac --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_opera_driver_3_141_59.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_remote_driver_2_53_1.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_remote_driver_2_53_1.xml new file mode 100644 index 0000000..3608d9a --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_remote_driver_2_53_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_safari_driver_2_53_1.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_safari_driver_2_53_1.xml new file mode 100644 index 0000000..112e25a --- /dev/null +++ 
b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_safari_driver_2_53_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_support_2_53_1.xml b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_support_2_53_1.xml new file mode 100644 index 0000000..7e216f2 --- /dev/null +++ b/.idea/libraries/Maven__org_seleniumhq_selenium_selenium_support_2_53_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml new file mode 100644 index 0000000..bae9949 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml new file mode 100644 index 0000000..6073e53 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml new file mode 100644 index 0000000..a14ac63 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_25.xml new file mode 100644 index 0000000..20e8163 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_25.xml new file mode 100644 index 0000000..f073fd8 
--- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml new file mode 100644 index 0000000..7de775a --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml new file mode 100644 index 0000000..ababaf5 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml new file mode 100644 index 0000000..fdb29ec --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_data_redis_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_data_redis_1_5_6_RELEASE.xml new file mode 100644 index 0000000..ee41d66 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_data_redis_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml 
b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml new file mode 100644 index 0000000..8fbadc8 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml new file mode 100644 index 0000000..eed0d9d --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml new file mode 100644 index 0000000..20ebb83 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_data_spring_data_commons_1_13_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_data_spring_data_commons_1_13_6_RELEASE.xml new file mode 100644 index 0000000..ae30a32 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_data_spring_data_commons_1_13_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_data_spring_data_keyvalue_1_2_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_data_spring_data_keyvalue_1_2_6_RELEASE.xml new file mode 100644 index 0000000..5f40815 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_data_spring_data_keyvalue_1_2_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file 
diff --git a/.idea/libraries/Maven__org_springframework_data_spring_data_redis_1_8_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_data_spring_data_redis_1_8_6_RELEASE.xml new file mode 100644 index 0000000..d7869ab --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_data_spring_data_redis_1_8_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml new file mode 100644 index 0000000..6e14e11 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml new file mode 100644 index 0000000..014e25c --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml new file mode 100644 index 0000000..7e7126e --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_context_support_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_context_support_4_3_10_RELEASE.xml new file mode 100644 index 0000000..d7b9867 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_context_support_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml new file mode 100644 index 0000000..9ea2a4f --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml new file mode 100644 index 0000000..d124628 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_jdbc_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_jdbc_4_3_10_RELEASE.xml new file mode 100644 index 0000000..5725fe6 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_jdbc_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_orm_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_orm_4_3_10_RELEASE.xml new file mode 100644 index 0000000..cdd5c3f --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_orm_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_oxm_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_oxm_4_3_10_RELEASE.xml new file mode 100644 index 0000000..c510755 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_oxm_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_tx_4_3_10_RELEASE.xml 
b/.idea/libraries/Maven__org_springframework_spring_tx_4_3_10_RELEASE.xml new file mode 100644 index 0000000..315f091 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_tx_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml new file mode 100644 index 0000000..0c4a084 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml new file mode 100644 index 0000000..9cc6fbc --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_7_1.xml b/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_7_1.xml new file mode 100644 index 0000000..06fc4e8 --- /dev/null +++ b/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_7_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml b/.idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml new file mode 100644 index 0000000..20e2920 --- /dev/null +++ b/.idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__redis_clients_jedis_2_9_0.xml b/.idea/libraries/Maven__redis_clients_jedis_2_9_0.xml new file mode 100644 index 0000000..77144ff --- /dev/null +++ b/.idea/libraries/Maven__redis_clients_jedis_2_9_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..4361200 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,11 @@ + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..c367e55 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000..e96534f --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.project b/.project new file mode 100644 index 0000000..09459d8 --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + cnki_crawl + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..f4f6d8c --- /dev/null +++ b/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,7 @@ +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding//src/main/resources=UTF-8 +encoding//src/main/resources/application.properties=UTF-8 +encoding//src/test/java=UTF-8 +encoding/=UTF-8 +encoding/config.properties=UTF-8 diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..2f5cc74 --- /dev/null +++ b/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,8 @@ +eclipse.preferences.version=1 
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 +org.eclipse.jdt.core.compiler.compliance=1.8 +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=1.8 diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000..f897a7f --- /dev/null +++ b/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..4186187 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +知网定制采集引擎 diff --git a/cnki_crawl.iml b/cnki_crawl.iml new file mode 100644 index 0000000..e73963b --- /dev/null +++ b/cnki_crawl.iml @@ -0,0 +1,169 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/config.properties b/config.properties new file mode 100644 index 0000000..1c9f961 --- /dev/null +++ b/config.properties @@ -0,0 +1,15 @@ +#mysql连接配置 +driver = com.mysql.cj.jdbc.Driver +jdbcurl = jdbc:mysql://172.18.1.180:3306/bfd_crawl_list?allowMultiQueries=true&rewriteBatchedStatements=true&useUnicode=true&characterEncoding=utf8 +user = crawl +password = crawl +#kafka配置 +brokers = 172.18.1.101:9092,172.18.1.102:9092,172.18.1.104:9092 +topic = newsTopicdata2 +brokersTwo = 172.18.1.146:9092,172.18.1.147:9092,172.18.1.148:9092 +topicTwo = zhiWangTest2 +#topic = testoil +errortopic = zhiWangErrorTest +#时间倒序排列为1(1/0) +orderBy = 1 +threadNum = 30 
\ No newline at end of file diff --git a/dddd_ocr.py b/dddd_ocr.py new file mode 100644 index 0000000..0065588 --- /dev/null +++ b/dddd_ocr.py @@ -0,0 +1,28 @@ +import ddddocr + +ocr = ddddocr.DdddOcr() +import requests +import sys +headerCookie = sys.argv[1] +headers = { + 'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Connection': 'keep-alive', + 'Cookie': headerCookie, + 'Referer': 'https://ref.cnki.net/REF/AdvSearch', + 'Sec-Fetch-Dest': 'image', + 'Sec-Fetch-Mode': 'no-cors', + 'Sec-Fetch-Site': 'same-origin', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36', + 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"' +} + +url_code = 'https://ie.cnki.net/kns/checkcode.aspx?t=0.15957984515339407' +rep_code=requests.get(url_code,headers=headers) +code_bytes=rep_code.content +code = ocr.classification(code_bytes) +print(code) + + diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..6ea088b --- /dev/null +++ b/pom.xml @@ -0,0 +1,250 @@ + + + + 4.0.0 + + + org.springframework.boot + spring-boot-starter-parent + 1.5.6.RELEASE + + com.bfd + cnki_crawl_kyyz + 0.0.1-SNAPSHOT + + cnki_crawl + + http://www.example.com + + + UTF-8 + 1.8 + 1.8 + + + + + junit + junit + 4.11 + test + + + org.springframework.boot + spring-boot-starter-web + + + com.alibaba.otter + canal.client + 1.1.4 + + + log4j + log4j + 1.2.14 + + + com.alibaba + fastjson + 1.1.22 + + + + + + commons-io + commons-io + 1.4 + + + + org.apache.httpcomponents + httpclient + 4.5.3 + + + + com.squareup.okhttp3 + okhttp + 3.9.1 + + + + org.apache.commons + commons-lang3 + 3.8.1 + + + + com.oracle.database.jdbc + ojdbc8 + 12.2.0.1 + + + + com.mchange + c3p0 + 0.9.5.5 + + + + + + + + + mysql + mysql-connector-java + 8.0.29 + + + + org.apache.kafka + kafka_2.12 
+ 1.0.2 + + + + org.apache.kafka + kafka-clients + 2.0.1 + + + + + org.jsoup + jsoup + 1.8.1 + + + org.apache.poi + poi + 4.0.1 + + + org.apache.poi + poi-ooxml + 4.0.1 + + + + org.seleniumhq.selenium + selenium-java + 3.141.59 + + + + org.projectlombok + lombok + true + + + + org.springframework.boot + spring-boot-starter-data-redis + + + io.netty + netty-all + 4.1.36.Final + + + org.redisson + redisson + 3.14.1 + + + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 2.22.1 + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + org.springframework.boot + spring-boot-maven-plugin + + com.bfd.cnki.main.Application + ZIP + + + ${project.groupId} + ${project.artifactId} + + + + + + + repackage + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy + package + + copy-dependencies + + + jar + jar + runtime + ${project.build.directory}/libs + + + + + + + + diff --git a/src/main/java/com/bfd/cnki/App.java b/src/main/java/com/bfd/cnki/App.java new file mode 100644 index 0000000..6ec3ecb --- /dev/null +++ b/src/main/java/com/bfd/cnki/App.java @@ -0,0 +1,13 @@ +package com.bfd.cnki; + +/** + * Hello world! 
+ * + */ +public class App +{ + public static void main( String[] args ) + { + System.out.println( "ready go" ); + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/cache/ConfigCache.java b/src/main/java/com/bfd/cnki/crawl/cache/ConfigCache.java new file mode 100644 index 0000000..c728fee --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/cache/ConfigCache.java @@ -0,0 +1,26 @@ +package com.bfd.cnki.crawl.cache; + +import org.openqa.selenium.WebDriver; +import org.redisson.api.RBloomFilter; +import org.redisson.api.RedissonClient; + +import java.sql.Driver; +import java.util.HashMap; +import java.util.Map; + +/** + * @author jian.mao + * @date 2022年11月11日 + * @description 静态变量类 + */ +public class ConfigCache { + + /**总配置对象**/ + public static Map mainConfig = new HashMap(16); + public static RBloomFilter OilBloomFilter; + public static RBloomFilter KyyBloomFilter; + public static RBloomFilter SfgzBloomFilter; + public static RedissonClient redisson; + public static boolean isExec = true; +// public static WebDriver driver = null; +} diff --git a/src/main/java/com/bfd/cnki/crawl/entity/Constants.java b/src/main/java/com/bfd/cnki/crawl/entity/Constants.java new file mode 100644 index 0000000..a67170c --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/entity/Constants.java @@ -0,0 +1,62 @@ +package com.bfd.cnki.crawl.entity; + +import org.apache.kafka.clients.producer.KafkaProducer; + +/** + * @author jian.mao + * @date 2022年11月11日 + * @description 常量类 + */ +public class Constants { + + public final static String JDBC_DRIVER = "driver"; + public final static String JDBC_URL = "jdbcurl"; + public final static String JDBC_USER = "user"; + public final static String JDBC_PASSWORD = "password"; + public final static String ATTACH_TAG = "attachTag"; + public final static String KEYWORD = "keyword"; + public final static String URL = "url"; + public final static String DETAILURL = "detailUrl"; + public final static String PURL = "purl"; + public final static String 
CHANNELNAME = "channelName"; + public final static String BROKERS = "brokers"; + public final static String BROKERSTWO = "brokersTwo"; + public final static String TOPIC = "topic"; + public final static String TOPICTWO = "topicTwo"; + public final static String ERRORTOPIC = "errortopic"; + public final static String EMPTY = ""; + public final static String WXTYPE = "wxtype"; + public final static String CHINESE = "中文"; + public final static String FOREIGNLANGUAGE = "外文"; + public final static String TITLE = "title"; + public final static String AUTHOR = "author"; + public final static String SOURCE = "source"; + public final static String DATE = "date"; + public final static String RID = "rid"; + public final static String OIL_URL_PREFIX = "https://ie.cnki.net/KCMS/detail/detail.aspx?dbcode=SYHJ&"; + public final static String DBNAME_IS = "dbname="; + public final static String FILENAME_IS = "filename="; + public final static String AND = "&"; + public final static String BRIEF = "brief"; + public final static String MAPDATA = "mapData"; + public final static String MAINKEY = "mainKey"; + public final static String ERRORTIME = "errortime"; + public final static String CONTENT = "content"; + public final static String ORDERBY = "orderBy"; + public final static String THREADNUM = "threadNum"; + public final static String CRAWLSTARTTIME = "crawlStartTime"; + public final static String CRAWLENDTIME = "crawlEndTime"; + public final static String SUCCESS = "success"; + public final static String NAME = "NAME"; + public final static String ORG = "ORG"; + public final static String YJLY = "YJLY"; + public final static String TI = "TI"; + public final static String AU = "AU"; + public final static String BKLY = "BKLY"; + public final static String LISTTYPE = "176"; + public final static String CONTENTTYPE = "177"; + public final static String DOCUMENTTYPE_ONE = "1"; + public final static String DOCUMENTTYPE_TWO = "2"; + public final static String DOCUMENTTYPE_THREE = "3"; + 
+} diff --git a/src/main/java/com/bfd/cnki/crawl/process/CnkiCrawlServer.java b/src/main/java/com/bfd/cnki/crawl/process/CnkiCrawlServer.java new file mode 100644 index 0000000..cfa5611 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/CnkiCrawlServer.java @@ -0,0 +1,247 @@ +package com.bfd.cnki.crawl.process; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.bfd.cnki.crawl.util.DateUtil; +import com.bfd.cnki.crawl.util.UseDb; +import lombok.extern.slf4j.Slf4j; +import org.redisson.Redisson; +import org.redisson.config.Config; + +import java.util.*; + +/** + * 抓取执行静态入口 + * + * @author jian.mao + * @date 2022年11月14日 + * @description + */ +@Slf4j +public class CnkiCrawlServer { +// @Autowired +// private static OilProcessImpl oilProcessImpl; + + + static { + //redis连接 + Config config = new Config(); +// config.useSingleServer().setAddress("redis://172.18.1.101:6379"); + config.useSingleServer().setAddress("redis://node-01:6379"); + //构造Redisson + ConfigCache.redisson = Redisson.create(config); + ConfigCache.OilBloomFilter = ConfigCache.redisson.getBloomFilter("oilFilter"); + ConfigCache.KyyBloomFilter = ConfigCache.redisson.getBloomFilter("kkyzFilter_kyyz_5"); + ConfigCache.SfgzBloomFilter = ConfigCache.redisson.getBloomFilter("SfgzBloomFilter0521"); + //初始化布隆过滤器:预计元素为100000000L,误差率为3% + ConfigCache.OilBloomFilter.tryInit(100000000L, 0.03); + ConfigCache.KyyBloomFilter.tryInit(100000000L, 0.03); + ConfigCache.SfgzBloomFilter.tryInit(100000000L, 0.03); + + } + + public static void exec() { + //定时扫表 + TimerTask timerTask = new TimerTask() { + @Override + public void run() { + loadTasks(); + } + }; + Timer timer = new Timer(); + timer.schedule(timerTask, 1000 * 10, 1000 * 10); + } + + public static void loadTasks() { + // mysql查询 + Map keyMap = new HashMap(16); + Map mysqlTask = new HashMap<>(16); + List params = new ArrayList<>(); + params.add("rid"); + 
params.add("url"); + params.add("keyword"); + params.add("attachTag"); + params.add("channelName"); + params.add("pageTypeID"); + params.add("nextPageTime"); + //任务开始时间 + params.add("crawlStartTime"); + //任务结束时间 + params.add("crawlEndTime"); + //加分布式锁 start + String mysqlSelect = "select * from newslist_000 where cid = 'Nzhiwang' and `status`=1 and (`nextcrawltime` is null or `nextcrawltime` <= now()) limit 5"; +// String mysqlSelect = "select * from newslist_000 where cid = 'Nzhiwang' AND url = 'A' AND STATUS = 1 limit 1;"; + mysqlTask.put("select", mysqlSelect); + List> results = new ArrayList>(); + results = UseDb.queryBySql(mysqlTask, params); + log.info("查询数据库结果 results={}", JSONObject.toJSONString(results)); + if (results.size() > 0) { + //修改任务状态为2 + String mysqlUpdate = "UPDATE newslist_000 SET status = 2 where `rid` = '" + results.get(0).get(Constants.RID) + "'"; + mysqlTask.put("update", mysqlUpdate); + boolean iscon = UseDb.writeUpdate(mysqlTask); + if (iscon) { + log.info("任务{}已被扫走并状态更新成功", JSONObject.toJSONString(results)); + } else { + log.info("任务{}已被扫走但状态更新失败", JSONObject.toJSONString(results)); + } + } else { + log.info("无可执行任务。"); + return; + } + + //加分布式锁 end + + //任务判断 + for (Map map : results) { +// {"crawlDataFlag":"keyword:新能源","attachTag":"{\"field\":10371}","appId":"kyyz","project_name":"kyyz"} + String attachTag = map.get(Constants.ATTACH_TAG).toString(); + String url = map.get(Constants.URL).toString(); + String channelName = map.get(Constants.CHANNELNAME).toString(); + String keyword = map.get(Constants.KEYWORD).toString(); + String crawlStartTime = map.get(Constants.CRAWLSTARTTIME).toString(); + String crawlEndTime = map.get(Constants.CRAWLENDTIME).toString(); + String pageTypeID = map.get("pageTypeID").toString(); + String nextPageTime = map.get("nextPageTime").toString(); +// if (channelName.contains(Constants.KEYWORD)||pageTypeID.equals(Constants.CONTENTTYPE)) { + if (!keyword.equals(Constants.EMPTY) || 
pageTypeID.equals(Constants.CONTENTTYPE)) { + //是关键词任务-科研有知 + log.info("是关键词任务-科研有知/身份感知"); +// String shenfenganzhi = "身份感知"; + String shenfenganzhi = "oi"; + if (attachTag.toLowerCase().contains(shenfenganzhi)) { + log.info("身份感知"); + try { + SfgzProcess sfgzProcess = new SfgzProcessImpl(); + Boolean code = sfgzProcess.process(attachTag, url, channelName, keyword, crawlStartTime, pageTypeID); + if (code) { + // 任务执行成功 --单词任务 + String mysqlUpdate = "UPDATE newslist_000 SET status = 3 where `rid` = '" + results.get(0).get(Constants.RID) + "'"; + mysqlTask.put("update", mysqlUpdate); + boolean iscon = UseDb.writeUpdate(mysqlTask); + if (iscon) { + log.info("任务{}完成并状态更新成功", url); + } else { + log.info("任务{}完成但状态更新失败", url); + } + } else { + log.info("任务{}执行意外中断,稍后继续执行", url); + String mysqlUpdate = "UPDATE newslist_000 SET status = 1 where `rid` = '" + results.get(0).get(Constants.RID) + "'"; + mysqlTask.put("update", mysqlUpdate); + boolean iscon = UseDb.writeUpdate(mysqlTask); + if (iscon) { + log.info("任务{}执行意外中断更新状态为1", url); + } else { + log.info("任务{}执行意外中断更新状态为1失败", url); + } + } + } catch (Exception e) { + e.printStackTrace(); + } + } else { + log.info("科研有知"); + try { +// KyyzProcess kyyzProcess = new KyyzProcessImpl(); +// Boolean code = kyyzProcess.process(attachTag, url, channelName, keyword, crawlStartTime,nextPageTime); + Map result = new HashMap<>(16); + Map attachTagMap = JSONObject.parseObject(attachTag); +// String att = attachTagMap.get("attachTag").toString(); +// Map attMap = JSONObject.parseObject(att); + String documentTypeKey = "documentType"; + String documentType = ""; +// if (attMap.containsKey(documentTypeKey)) { +// documentType = attMap.get("documentType").toString(); +// } + if (attachTagMap.containsKey(documentTypeKey)) { + documentType = attachTagMap.get("documentType").toString(); + } + log.info("任务参数============documentType:{}", documentType); + String type = ""; + if (!documentType.equals(Constants.EMPTY)) { + String[] 
documentTypes = documentType.split(","); + for (String s : documentTypes) { + type = s; + log.info("任务拆分============s:{}", s); + KyyzProcessImplTest kyyzProcessImplTest = new KyyzProcessImplTest(); + result = kyyzProcessImplTest.processTest(attachTag, url, channelName, keyword, crawlStartTime, crawlEndTime, nextPageTime, s); + + } + } else { + log.info("没传参数,任务不需要拆分"); + KyyzProcessImplTest kyyzProcessImplTest = new KyyzProcessImplTest(); + result = kyyzProcessImplTest.processTest(attachTag, url, channelName, keyword, crawlStartTime, crawlEndTime, nextPageTime, documentType); + } + Boolean code = (Boolean) result.get("code"); + if (code) { + // 任务执行成功 --单词任务 + String mysqlUpdate = "UPDATE newslist_000 SET status = 3 where `rid` = '" + results.get(0).get(Constants.RID) + "'"; + mysqlTask.put("update", mysqlUpdate); + boolean iscon = UseDb.writeUpdate(mysqlTask); + if (iscon) { + log.info("任务{}完成并状态更新成功", url); + } else { + log.info("任务{}完成但状态更新失败", url); + } + } else { + String nextcrawltime = DateUtil.getCurrentTimePlusMinutes(30); + log.info("任务{}执行意外中断,稍后继续执行", url); + url = result.get("page").toString(); + String mysqlUpdate = "UPDATE newslist_000 SET status = 1 , crawl_account = '" + type + "' ,nextcrawltime = '" + nextcrawltime + "' , url = '" + url + "' where `rid` = '" + results.get(0).get(Constants.RID) + "'"; + mysqlTask.put("update", mysqlUpdate); + boolean iscon = UseDb.writeUpdate(mysqlTask); + if (iscon) { + log.info("任务{}执行意外中断更新状态为1", url); + } else { + log.info("任务{}执行意外中断更新状态为1失败", url); + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + + + } + } +// else +// if (channelName.contains(Constants.URL)) { +// //是模块任务-石油 +// log.info("是模块任务-石油"); +// try { +// OilProcess oilProcess = new OilProcessImpl(); +// Boolean code = oilProcess.process(attachTag, url, channelName); +// if (code) { +// // 任务执行成功 --单词任务 +// String mysqlUpdate = "UPDATE newslist_000 SET status = 3 where `rid` = '" + results.get(0).get(Constants.RID) + "'"; +// 
mysqlTask.put("update", mysqlUpdate); +// boolean iscon = UseDb.writeUpdate(mysqlTask); +// if (iscon) { +// log.info("任务{}完成并状态更新成功", url); +// } else { +// log.info("任务{}完成但状态更新失败", url); +// } +// } else { +// log.info("任务{}执行意外中断,稍后继续执行", url); +// String mysqlUpdate = "UPDATE newslist_000 SET status = 1 where `rid` = '" + results.get(0).get(Constants.RID) + "'"; +// mysqlTask.put("update", mysqlUpdate); +// boolean iscon = UseDb.writeUpdate(mysqlTask); +// if (iscon) { +// log.info("任务{}执行意外中断更新状态为1", url); +// } else { +// log.info("任务{}执行意外中断更新状态为1失败", url); +// } +// } +// } catch (Exception e) { +// e.printStackTrace(); +// } +// } + } + } + + public static void main(String[] args) { + String a = ""; + String[] b = a.split(","); + System.out.println(b.length); + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/process/ConditionalClick.java b/src/main/java/com/bfd/cnki/crawl/process/ConditionalClick.java new file mode 100644 index 0000000..e06cf79 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/ConditionalClick.java @@ -0,0 +1,149 @@ +package com.bfd.cnki.crawl.process; + +import lombok.extern.slf4j.Slf4j; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.interactions.Actions; + +/** + * @author:zhaoying + * @className:ConditionalClick + * @version:1.0 + * @description: + * @Date:2022-11-18 14:08:59 + */ +@Slf4j +public class ConditionalClick { + public static void main(String[] args) { + String keyword = "火电"; + String splitKey = " "; + String and = "AND"; + String or = "OR"; + String[] keywords = keyword.split(splitKey); + System.out.println("keywords=" + keywords.length); + } + + /** + * 点击任务需要的检索条件 + */ + public WebDriver clickRetrieveCondition(WebDriver driver, String keyword) { + String splitKey = " "; + String and = "AND"; + String or = "OR"; + String[] keywords = keyword.split(splitKey); + if (keywords.length == 1) { + //只有一个关键词的情况 + //输入关键词 + log.info("只有一个关键词"); + String keywordPath = 
"//*[@id=\"gradetxt\"]/dd[1]/div[2]/input"; + driver.findElement(By.xpath(keywordPath)).sendKeys(keywords[0]); + } +// else if(keywords.length<=5) { +// //关键词不超过三个 +// } + else { + int five = 5; + if (keywords.length > five) { + //关键词超过三个,需要点击加号 + //*[@id="gradetxt"]/dt/a + //*[@id="gradetxt"]/dt/a + //*[@id="gradetxt"]/dt/a + String addBottonPath = "//*[@id=\"gradetxt\"]/dt/a"; + for (int i = 0; i < keywords.length - five; i++) { + driver.findElement(By.xpath(addBottonPath)).click(); + log.info("点击第" + i + "个加号"); + try { + Thread.sleep(1000 * 5); + } catch (InterruptedException e) { + log.error(e.getMessage()); + } + } + } + try { + int two = 2; + for (int i = 0; i <= two; i++) { + //选择主题 + //*[@id="gradetxt"]/dd[1]/div[2]/div[1]/div[1]/i + //*[@id="gradetxt"]/dd[2]/div[2]/div[1]/div[1]/i + //*[@id="gradetxt"]/dd[3]/div[2]/div[1]/div[1]/i + String themeBottonPath = "//*[@id=\"gradetxt\"]/dd[" + (i + 1) + "]/div[2]/div[1]/div[1]/i"; + driver.findElement(By.xpath(themeBottonPath)).click(); + log.info("点击第" + (i + 1) + "个按钮"); + Thread.sleep(1000 * 10); + //*[@id="gradetxt"]/dd[1]/div[2]/div[1]/div[2]/ul/li[1]/a + //*[@id="gradetxt"]/dd[2]/div[2]/div[1]/div[2]/ul/li[1]/a + //*[@id="gradetxt"]/dd[3]/div[2]/div[1]/div[2]/ul/li[1]/a + String themePath = "//*[@id=\"gradetxt\"]/dd[" + (i + 1) + "]/div[2]/div[1]/div[2]/ul/li[1]/a"; + driver.findElement(By.xpath(themePath)).click(); + driver.findElement(By.xpath("//*[@id=\"gradetxt\"]/dd[2]/div[2]/input")).click(); + log.info("第" + (i + 1) + "次选择主题 并点击一下空白处"); + Thread.sleep(1000 * 5); + } + } catch (Exception e) { + log.error(e.getMessage()); + } + int index = (keywords.length - 1) / 2; + log.info("一共有关键词" + index); + for (int i = 0; i <= index; i++) { + log.info("当前i=" + i); + //输入关键词 + //*[@id="gradetxt"]/dd[1]/div[2]/input + //*[@id="gradetxt"]/dd[2]/div[2]/input + //*[@id="gradetxt"]/dd[3]/div[2]/input + String keywordPath = "//*[@id=\"gradetxt\"]/dd[" + (i + 1) + "]/div[2]/input"; + 
driver.findElement(By.xpath(keywordPath)).sendKeys(keywords[i * 2]); + log.info("成功输入第" + (i + 1) + "个关键词:{}", keywords[i * 2]); + //选择条件是and还是or + try { + if (keywords[i * 2 + 1].contains(and)) { + log.info("需要选择AND"); + continue; + } else if (keywords[i * 2 + 1].contains(or)) { + log.info("需要选择OR"); + //需要选择OR + //*[@id="gradetxt"]/dd[2]/div[1]/div/em + //*[@id="gradetxt"]/dd[3]/div[1]/div/em + //*[@id="gradetxt"]/dd[4]/div[1]/div/em + String conditionBottonPath = "//*[@id=\"gradetxt\"]/dd[" + (i + 2) + "]/div[1]/div/em"; + driver.findElement(By.xpath(conditionBottonPath)).click(); + Thread.sleep(1000 * 10); + //*[@id="gradetxt"]/dd[2]/div[1]/ul/li[2] + //*[@id="gradetxt"]/dd[3]/div[1]/ul/li[2] + String conditionOrPath = "//*[@id=\"gradetxt\"]/dd[" + (i + 2) + "]/div[1]/ul/li[2]/a"; + driver.findElement(By.xpath(conditionOrPath)).click(); + driver.findElement(By.xpath("//*[@id=\"gradetxt\"]/dd[2]/div[2]/input")).click(); + Thread.sleep(1000 * 5); + } + } catch (Exception e) { + log.info("循环结束"); + log.error("异常:", e); + } + } + } + + //点击检索 + try { + log.info("点击检索"); +// String retrievePath = "/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input"; +// String retrievePath = "//*[@id=\"ModuleSearch\"]/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/input"; + //*[@id="ModuleSearch"]/div[1]/div/div[2]/div/div[1]/div[1]/div[2]/div[3]/input +// driver.findElement(By.xpath(retrievePath)).click(); +// driver.findElement(By.className("search-buttons")).findElement(By.className("btn-search")).click(); + driver.findElement(By.cssSelector(".search-buttons .btn-search")).click(); + Thread.sleep(1000 * 20); + Actions actions = new Actions(driver); +// driver.findElement(By.xpath("//*[@id=\"DivDisplayMode\"]/li[2]")); + //选择每页显示50条 +// actions.sendKeys(Keys.ARROW_UP).perform(); +// Thread.sleep(1000 * 10); +// String numBottonPath = "//*[@id=\"perPageDiv\"]"; +// driver.findElement(By.xpath(numBottonPath)).click(); +// Thread.sleep(1000 * 10); +// String 
numPath = "//*[@id=\"perPageDiv\"]/ul/li[3]"; +// driver.findElement(By.xpath(numPath)).click(); + } catch (Exception e) { + log.error("点击检索异常:", e); + } + return driver; + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/process/KyyzContentParse.java b/src/main/java/com/bfd/cnki/crawl/process/KyyzContentParse.java new file mode 100644 index 0000000..3ecc242 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/KyyzContentParse.java @@ -0,0 +1,609 @@ +package com.bfd.cnki.crawl.process; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.bfd.cnki.crawl.util.DateUtil; +import com.bfd.cnki.crawl.util.EjdDownloadHtml; +import com.bfd.cnki.crawl.util.Kafkautils; +import com.bfd.cnki.crawl.util.QueueUtils; +import lombok.extern.slf4j.Slf4j; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author:zhaoying + * @className:ContentParse + * @version:1.0 + * @description: + * @Date:2022-11-18 11:52:56 + */ +@Slf4j +public class KyyzContentParse implements Runnable { + private static KafkaProducer producer = Kafkautils.getKafkaProdect(ConfigCache.mainConfig.get(Constants.BROKERS).toString()); + private static KafkaProducer producerTwo = Kafkautils.getKafkaProdect(ConfigCache.mainConfig.get(Constants.BROKERSTWO).toString()); + public static final Pattern PATTERN = Pattern.compile("(?<=TurnPageToKnetV\\().*?(?=\\))"); + public Matcher matcher; + + @Override + public void run() { + while (true) { + 
log.info("当前KyyzTaskQueue队列长度={}",QueueUtils.kyyzTaskQueue.size()); + try{ + if (QueueUtils.kyyzTaskQueue.size() > 0) { + Map resultData = null; + try { + resultData = QueueUtils.kyyzTaskQueue.take(); + Thread.sleep(1000*10); + doContentParse(resultData); + } catch (InterruptedException e) { + log.warn("detail task is:{}", resultData); + e.printStackTrace(); + } + } else { + try { + Thread.sleep(1000 * 10); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + }catch (Throwable e){ + log.error("未知异常!!!!!!!!!!!", e); + } + + } + } + + /** + * 解析详情 + */ + public void doContentParse(Map resultData) throws InterruptedException { +// Map briefMap = new HashMap<>(16); + Map map = null; + try { + map = (Map) resultData.get(Constants.BRIEF); + } catch (Exception e) { + map = JSONObject.parseObject(resultData.get(Constants.BRIEF).toString()); + } + Map headers = new HashMap(16); + headers.put("Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); + headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); + headers.put("User-Agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"); + String detailUrl = map.get(Constants.DETAILURL).toString(); + String downLoadError = "Download failed error is:"; + String html = null; + String content = ""; + String forwardcontent = ""; + try { +// html = DownlodContentHtml.doGet(detailUrl, headers); + html = EjdDownloadHtml.okHttpProxyGetRe(detailUrl, headers); + } catch (Exception e) { + html = downLoadError; + log.info("页面下载失败", e); + } + if (html.contains(downLoadError)) { + log.info("download fial"); + if (resultData.containsKey(Constants.ERRORTIME)) { + int errortime = (int) resultData.get(Constants.ERRORTIME); + int four = 4; + if (errortime >= four) { + log.error("重试次数已用尽{}", JSONObject.toJSONString(resultData)); + 
QueueUtils.errorKyyzTaskQueue.put(resultData); + log.info("下载失败重试次数已用尽,休眠60s"); + Thread.sleep(1000 * 60); + log.info("休眠结束,继续执行"); + return; + } + resultData.put(Constants.ERRORTIME, errortime + 1); + } else { + resultData.put(Constants.ERRORTIME, 1); + } + QueueUtils.kyyzTaskQueue.put(resultData); + } + if (map.get(Constants.WXTYPE).equals(Constants.CHINESE)) { + List> agencysList = new ArrayList<>(); + String agencyName = ""; + String agencyUrl = ""; + try { + Document doc = Jsoup.parse(html); + content = doc.select(".wrapper").text(); + forwardcontent = doc.select(".wrapper").html(); + Elements elements4 = doc.select("h3"); + String agency = ""; + List agencyList = new ArrayList<>(); + List authorAgencyUrls = new ArrayList(); + if (elements4 != null && elements4.size() > 0) { + if (elements4.size() > 1) { + Elements agencys = elements4.get(1).select("span"); + if (agencys.size() > 0) { + for (Element element : agencys) { + Map agencysListMap = new HashMap<>(16); + agencyName = element.select("a").text(); + agencyList.add(agencyName); + String params = element.select("a").attr("href"); + log.info("作者 参数链接:{}", params); + String authorAgencyUrl = getAbsPath(detailUrl, params); + if (!authorAgencyUrl.equals(Constants.EMPTY)) { + authorAgencyUrls.add(authorAgencyUrl); + agencyUrl = authorAgencyUrl; + } + if (!agencyUrl.equals(Constants.EMPTY) && !agencyName.equals(Constants.EMPTY)) { + agencysListMap.put("name", agencyName); + agencysListMap.put("url", agencyUrl); + agencysList.add(agencysListMap); + } + } + + } + agency = String.join(",", agencyList); +// Elements elements5 = elements4.get(1).select("a"); +// if (elements5 != null && elements5.size() > 0) { +// for (Element element : elements5) { +// String params = element.attr("href"); +// log.info("作者机构参数链接:{}", params); +//// matcher = PATTERN.matcher(params); +//// if (matcher.find()) { +//// String[] paramList = matcher.group().replace("'", "") +//// .split(","); +//// String authorAgencyUrl = 
"https://kns.cnki.net/kcms/detail/knetsearch.aspx?sfield=" +//// + paramList[0] + "&skey=" + paramList[1] +//// + "&code=" + paramList[2] + "&v=" +//// + paramList[3]; +//// authorAgencyUrls.add(authorAgencyUrl); +//// } +//// } +// String authorAgencyUrl = getAbsPath(detailUrl, params); +// if (!authorAgencyUrl.equals(Constants.EMPTY)) { +// authorAgencyUrls.add(authorAgencyUrl); +// } +// } +// +// } + } else { + Elements agencys = elements4.get(0).select("span"); + if (agencys.size() > 0) { + for (Element element : agencys) { + Map agencysListMap = new HashMap<>(16); + agencyName = element.select("a").text(); + agencyList.add(agencyName); + String params = element.select("a").attr("href"); + log.info("作者机构参数链接:{}", params); + String authorAgencyUrl = getAbsPath(detailUrl, params); + if (!authorAgencyUrl.equals(Constants.EMPTY)) { + authorAgencyUrls.add(authorAgencyUrl); + agencyUrl = authorAgencyUrl; + } + if (!agencyUrl.equals(Constants.EMPTY) && !agencyName.equals(Constants.EMPTY)) { + agencysListMap.put("name", agencyName); + agencysListMap.put("url", agencyUrl); + agencysList.add(agencysListMap); + } + } + } + agency = String.join(",", agencyList); + } + } + log.info("authorAgencyUrls:{}", authorAgencyUrls); + map.put("author_agency_urls", authorAgencyUrls); + map.put("agency", agency); + map.put("agencys", agencysList); + + Elements author = doc.select(".author"); + List> authorAndAgency = new ArrayList<>(); + List authorList = new ArrayList<>(); + if (author.size() > 0) { + if (author.size() == 1) { + log.info("只有作者"); + Map authorAndAgencyMap = new HashMap<>(16); + String authorN = doc.select(".author").get(0).select("span").text().trim(); + String agencyN = ""; + authorList.add(authorN); + authorAndAgencyMap.put("author", authorN); + authorAndAgencyMap.put("agency", agencyN); + authorAndAgency.add(authorAndAgencyMap); + } else if (author.size() == 2) { + log.info("有作者和机构"); + Elements authorNames = author.get(0).select("span"); + Elements agencyNames = 
author.get(1).select("span"); + if (agencyNames.size() == authorNames.size() && authorNames.size() == 1) { + log.info("作者和机构是1:1的"); + Map authorAndAgencyMap = new HashMap<>(16); + String authorN = authorNames.get(0).text().trim(); + String agencyN = agencyNames.get(0).text().trim(); + authorList.add(authorN); + authorAndAgencyMap.put("author", authorN); + authorAndAgencyMap.put("agency", agencyN); + authorAndAgency.add(authorAndAgencyMap); + } else if (agencyNames.size() == 1 && authorNames.size() > 1) { + log.info("多个作者所属同一个机构"); + for (Element element : authorNames) { + Map authorAndAgencyMap = new HashMap<>(16); + String authorN = element.text().trim(); + String agencyN = agencyNames.get(0).text().trim(); + authorList.add(authorN); + authorAndAgencyMap.put("author", authorN); + authorAndAgencyMap.put("agency", agencyN); + authorAndAgency.add(authorAndAgencyMap); + } + } else { + Map agencyNamesMap = new HashMap<>(16); + for (Element element : agencyNames) { + String text = element.text().trim(); + String num = DateUtil.getStrByPattern(text, "(?<=^\\s*)\\d+(?=\\.)"); + String agencyN = text.replace(num, "").trim().replace(".", ""); + agencyNamesMap.put(num, agencyN); + log.info("agencyNamesMap:{}", JSONObject.toJSONString(agencyNamesMap)); + } + for (Element element : authorNames) { + String num = element.select("sup").text().trim(); + log.info("作者角标的num:{}", num); + if (num.contains(",")) { + String[] nums = num.split(","); + for (String s : nums) { + Map authorAndAgencyMap = new HashMap<>(16); + String authorN = element.text().replace(num, "").trim(); + String agencyN = ""; + if (agencyNamesMap.containsKey(s)) { + agencyN = agencyNamesMap.get(s); + } + authorList.add(authorN); + authorAndAgencyMap.put("author", authorN); + authorAndAgencyMap.put("agency", agencyN); + authorAndAgency.add(authorAndAgencyMap); + } + } else { + Map authorAndAgencyMap = new HashMap<>(16); + String authorN = element.text().replace(num, "").trim(); + String agencyN = ""; + if 
(agencyNamesMap.containsKey(num)) { + agencyN = agencyNamesMap.get(num); + } + authorList.add(authorN); + authorAndAgencyMap.put("author", authorN); + authorAndAgencyMap.put("agency", agencyN); + authorAndAgency.add(authorAndAgencyMap); + } + + } + } + } + } + map.put("authorAndAgency", authorAndAgency); + // 使用 HashSet 去重 + Set set = new HashSet<>(authorList); + authorList.clear(); + authorList.addAll(set); + map.put("author", String.join(";", authorList)); + +// resultData.put("authorAndAgency", authorAndAgency); + + if (map.get(Constants.TITLE).toString().equals(Constants.EMPTY)) { + map.put("title", doc.select(".wx-tit").select("h1").text()); + } + Map paramter = new HashMap(16); + Elements elements2 = doc.select(".row"); + for (Element element : elements2) { + String key = element.select(".rowtit").text(); + Elements elements3 = element.select(".top-space"); + if (elements3 != null && elements3.size() > 0) { + for (Element element2 : elements3) { + key = element2.select(".rowtit").text(); + if (!key.equals(Constants.EMPTY)) { + paramter.put(key, + element2.text().replace(key, "").replace("更多还原", "").replace("更多 还原", "")); + } + } + } else { + if (!key.equals(Constants.EMPTY)) { + paramter.put(key, + element.text().replace(key, "").replace("更多还原", "").replace("更多 还原", "")); + } + } + } + map.put("paramter", paramter); + List authorUrls = new ArrayList(); + Elements elements3 = doc.select("#authorpart").select("span").select("a"); + if (elements3 != null && elements3.size() > 0) { + for (Element element : elements3) { +// String params = element.attr("onclick"); + String params = element.attr("href"); + log.info(":{}", params); +// matcher = PATTERN.matcher(params); +// if (matcher.find()) { +// String[] paramList = matcher.group().replace("'", "").split(","); +// if (paramList.length == 4) { +// String authorUrl = "https://kns.cnki.net/kcms/detail/knetsearch.aspx?sfield=" +// + paramList[0] + "&skey=" + paramList[1] + "&code=" +// + paramList[2] + "&v=" + 
paramList[3]; +// authorUrls.add(authorUrl); +// } +// } + String authorUrl = getAbsPath(detailUrl, params); + if (!authorUrl.equals(Constants.EMPTY)) { + authorUrls.add(authorUrl); + } + + } + } + log.info("authorUrls:{}", authorUrls); + map.put("author_urls", authorUrls); + List publishAgencyUrls = new ArrayList(); + Elements elements5 = doc.select(".top-tip").select("span").select("a"); + if (elements5 != null && elements5.size() > 0) { + for (Element element : elements5) { + String publishAgencyUrl = element.attr("href"); + log.info("publishAgencyUrl:{}", publishAgencyUrl); + String title = element.attr("title"); + String shiyongbangzhu = "使用帮助"; + if (!title.equals(shiyongbangzhu)) { + publishAgencyUrls.add(publishAgencyUrl); + } + } + +// if (elements5 != null && elements5.size() > 0) { +// Pattern pattern1 = Pattern.compile("(?<=getKns8NaviLink\\().*?(?=\\))"); +// Matcher matcher1 = null; +// for (Element element : elements5) { +//// matcher1 = pattern1.matcher(element.attr("onclick")); +// matcher1 = pattern1.matcher(element.attr("href")); +// if (matcher1.find()) { +// String[] paramList = matcher1.group().replace("'", "") +// .split(","); +// if (paramList.length == 2) { +// String publishAgencyUrl = "https://kns.cnki.net/kcms/detail/navipage.aspx?dbcode=" +// + paramList[0] + "&baseid=" + paramList[1]; +// publishAgencyUrls.add(publishAgencyUrl); +// } +// } +// } + +// } + } + log.info("publishAgencyUrls:{}", publishAgencyUrls); + map.put("publish_agency_urls", publishAgencyUrls); + Elements elements1 = doc.select(".top-tip").select(".type"); + map.put("eisci", ""); + if (elements1 != null && elements1.size() > 0) { + String eisci = ""; + for (Element element : elements1) { + if (eisci.equals(Constants.EMPTY)) { + eisci = element.text(); + } else { + eisci += "," + element.text(); + } + } + map.put("eisci", eisci); + } + map.put("initial_mark", doc.select(".head-time").text()); + map.put("language", "中文"); + try{ + String fileUrl = 
doc.select(".btn-dlpdf").get(0).select("a").attr("href"); + map.put("fileUrl",fileUrl ); + }catch (Exception e){ + log.info("没找到pdf按钮{}", detailUrl); + } + } catch (Exception e) { + // TODO: handle exception + e.printStackTrace(); + log.info("详情页面不正常,丢掉:{}", detailUrl); + } + } else { + // 外文 + Document doc = Jsoup.parse(html); + content = doc.select(".detail_content-left__2vUAX").text(); + forwardcontent = doc.select(".detail_content-left__2vUAX").html(); + if (map.get(Constants.TITLE).equals(Constants.EMPTY)) { + map.put("title", doc.select("#doc-title").text()); + } + Map paramter = new HashMap(16); + String summary = ""; + summary = doc.select("#doc-summary-content-text").text(); + paramter.put("摘要", summary); + String keywords = null; + keywords = doc.select("#doc-keyword-text") + .select(".detail_keyword-context-ori__32xuT").text(); + if (keywords.equals(Constants.EMPTY)) { + keywords = doc.select("#doc-keyword-text").select("a").text(); + } + paramter.put("关键词", keywords); + Elements doc_indexedBy = doc.select("#doc-indexedBy"); + if (doc_indexedBy.size() > 0) { + String indexedBy = doc.select("#doc-indexedBy").select("div").text() + .replace("Indexed by / 核心评价", ""); + paramter.put("核心评价", indexedBy); + System.out.println("核心评价" + indexedBy); + } else { + paramter.put("核心评价", ""); + } + Elements authors = doc.select("#doc-author-text"); + if (authors.size() > 0) { + paramter.put("Author", authors.select("a").text()); + System.out.println("作者" + authors.select("a").text()); + } + + Elements dois = doc.select(".detail_doc-doi__VX6o2.detail_doc-item__2l-2B"); + if (dois.size() > 0) { + paramter.put("DOI", dois.select("a").text()); + } + Elements doc_affi_text = doc.select("#doc-affi-text"); + if (doc_affi_text.size() > 0) { + paramter.put("Affiliations", doc_affi_text.select("a").text()); + } + map.put("paramter", paramter); + List authorAgencyUrls = new ArrayList(); + map.put("author_agency_urls", authorAgencyUrls); + map.put("agency", ""); + List authorUrls 
= new ArrayList(); + map.put("author_urls", authorUrls); + List publishAgencyUrls = new ArrayList(); + map.put("publish_agency_urls", publishAgencyUrls); + map.put("eisci", ""); + map.put("initial_mark", ""); + map.put("language", "外文"); + try{ + String fileUrl = doc.select(".btn-dlpdf").get(0).select("a").attr("href"); + map.put("fileUrl",fileUrl ); + }catch (Exception e){ + log.info("没找到pdf按钮{}", detailUrl); + } + } + try { + resultData.put("post_time", formatTime(map.get(Constants.DATE).toString(), "yyyy-MM-dd HH:mm")); + } catch (Exception e) { + try { + resultData.put("post_time", formatTime(map.get(Constants.DATE).toString(), "yyyy-MM-dd")); + } catch (Exception e2) { + try { + resultData.put("post_time", formatTime(map.get(Constants.DATE).toString(), "yyyy")); + } catch (ParseException parseException) { + log.error("时间解析失败-->{}", map.get(Constants.DATE).toString()); + } + + } + } + resultData.put("content", content); + resultData.put("forwardcontent", forwardcontent); + resultData.put("news_id", DateUtil.getMd5(detailUrl)); + resultData.put("author", map.get(Constants.AUTHOR).toString()); +// resultData.put("source", Constants.SOURCE); + resultData.put("cid", "Nzhiwang"); + resultData.put("url", detailUrl); + resultData.put("type", "newscontent"); + resultData.put("isDownload", false); + resultData.put("iid", DateUtil.getMd5(detailUrl)); + resultData.put("tasks", new ArrayList<>()); + resultData.put("version", "3"); + inserContent(map); + resultData.put("brief", JSONObject.toJSONString(map)); + if (resultData.get(Constants.TITLE).equals(Constants.EMPTY) || resultData.get(Constants.CONTENT).equals(Constants.EMPTY)) { + log.error("解析失败的页面={}", detailUrl); +// if (resultData.containsKey(Constants.ERRORTIME)) { +// int errortime = (int) resultData.get(Constants.ERRORTIME); +// int four = 4; +// if (errortime >= four) { +// log.error("重试次数已用尽{}", JSONObject.toJSONString(resultData)); +// QueueUtils.errorKyyzTaskQueue.put(resultData); +// 
log.info("失败重试次数已用尽,休眠60s"); +// Thread.sleep(1000 * 60); +// log.info("休眠结束,继续执行"); +// return; +// } +// resultData.put(Constants.ERRORTIME, errortime + 1); +// } else { +// resultData.put(Constants.ERRORTIME, 1); +// } +// QueueUtils.kyyzTaskQueue.put(resultData); + } else { + // 发送103kafka + producer.send(new ProducerRecord(ConfigCache.mainConfig.get(Constants.TOPIC).toString(), + JSONObject.toJSONString(resultData))); + producer.flush(); + // 发送147kafka + producerTwo.send(new ProducerRecord(ConfigCache.mainConfig.get(Constants.TOPICTWO).toString(), + JSONObject.toJSONString(resultData))); + producerTwo.flush(); + log.info("发送kafka:{}", detailUrl); +// //存入过滤器 +// ConfigCache.KyyBloomFilter.add(resultData.get("dedupKey").toString()); + } + } + + /** + * 1. @Description: + * 2. @Param: + * 3. @return: + * 4. @Author: ying.zhao + * 5. @Date: 2022/11/18 + */ + public static String formatTime(String date, String format) throws ParseException { + SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH); + String format1 = ""; + Date parse = sdf.parse(date); + SimpleDateFormat sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + format1 = sdf1.format(parse); + return format1; + } + + + public void inserContent(Map map) { + String id = DateUtil.getMd5(map.get("detailUrl").toString()); + String code = map.get("id").toString(); + String url = map.get("detailUrl").toString(); + String keyword = map.get("keyword").toString().replace("'", "''"); + String title = map.get("title").toString().replace("'", "''"); + String author = map.get("author").toString().replace("'", "''"); + String source = map.get("source").toString().replace("'", "''"); + String date = map.get("date").toString(); + String data = map.get("data").toString().replace("'", "''"); + String quote = map.get("quote").toString().replace("'", "''"); + String download = map.get("download").toString().replace("'", "''"); + String agency = ""; + try { + agency = map.get("agency").toString().replace("'", 
"''"); + } catch (Exception e) { + // TODO: handle exception + } + + Map paramterMap = (Map) map.get("paramter"); + // 基金资助 + String funding = ""; + // 文中关键词 + String keywords = ""; + // 摘要 + String summary = ""; + for (String key : paramterMap.keySet()) { + String value = (String) paramterMap.get(key); + paramterMap.put(key, value.replace("\"", "\\\"")); + if (key.contains("基金")) { + funding = value.replace("'", "''"); + ; + } else if (key.contains("摘要")) { + summary = value.replace("'", "''"); + ; + } else if (key.contains("关键词")) { + keywords = value.replace("'", "''"); + ; + } + } + map.put("funding", funding); + map.put("keywords", keywords); + map.put("summary", summary); + } + + + public static String getAbsPath(String uri, String urlPath) { + String abURL = ""; + + try { + URI base = new URI(uri); + URI abs = base.resolve(urlPath); + URL absURL = abs.toURL(); + abURL = absURL.toString(); + return abURL; + } catch (URISyntaxException var10) { + var10.printStackTrace(); + return abURL; + } catch (MalformedURLException var11) { + var11.printStackTrace(); + return abURL; + } finally { + ; + } + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/process/KyyzProcess.java b/src/main/java/com/bfd/cnki/crawl/process/KyyzProcess.java new file mode 100644 index 0000000..ce85ef5 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/KyyzProcess.java @@ -0,0 +1,15 @@ +package com.bfd.cnki.crawl.process; + +import java.util.Map; + +/** + * @author:zhaoying + * @className:KyyzProcess + * @version:1.0 + * @description: + * @Date:2022-11-16 11:39:43 + */ +public interface KyyzProcess { + Boolean process(String attachTag, String url, String channeName, String keyword, String crawlStartTime,String crawlEndTime, String nextPageTime); + Map processTest(String attachTag, String url, String channeName, String keyword, String crawlStartTime,String crawlEndTime, String nextPageTime,String documentType); +} diff --git 
a/src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImpl.java b/src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImpl.java new file mode 100644 index 0000000..9370375 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImpl.java @@ -0,0 +1,322 @@ +package com.bfd.cnki.crawl.process; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.bfd.cnki.crawl.util.DateUtil; +import com.bfd.cnki.crawl.util.GetDriver; +import com.bfd.cnki.crawl.util.QueueUtils; +import lombok.extern.slf4j.Slf4j; +import org.openqa.selenium.*; +import org.openqa.selenium.interactions.Actions; +import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; +import org.springframework.stereotype.Service; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static com.bfd.cnki.crawl.util.PicCheckUtil.getCodeByPhoto; + +/** + * @author:zhaoying + * @className:KyyzProcessImpl + * @version:1.0 + * @description: + * @Date:2022-11-18 11:41:30 + */ +@Slf4j +@Service +public class KyyzProcessImpl implements KyyzProcess { + /** + * 这里指的是python环境,不需要改动 + * 定义成常量 + */ + public final static String PPATH = "python"; + /** + * py脚本路径 + * 常量 + */ + public final static String PY_DDDD_OCR = "./dddd_ocr.py"; + protected ThreadPoolTaskExecutor kyyzExecutor; + + @Override + public Boolean process(String attachTag, String url, String channeName, String keyword, String crawlStartTime, String crawlEndTime,String nextPageTime) { + Boolean code = true; + WebDriver driver = null; + try { + ConditionalClick conditionalClick = new ConditionalClick(); + GetDriver getDriver = new GetDriver(); + driver = getDriver.getWebDriverDriver(); + Actions actions = new Actions(driver); + Map attachTagMap = (Map) JSONObject.parse(attachTag); + attachTag = 
attachTagMap.get(Constants.ATTACH_TAG).toString(); + String listUrl = "https://kns.cnki.net/kns8/AdvSearch?dbcode=CFLS"; + String shenfenganzhi = "身份感知"; + if(attachTag.contains(shenfenganzhi)){ + log.info("身份感知需求"); + listUrl = "https://www.cnki.net/"; + } + driver.get(listUrl); + Thread.sleep(1000 * 10); + driver = conditionalClick.clickRetrieveCondition(driver, keyword); + log.info("返回driver"); + // 获取当前请求的请求头cookie + String headerCookie = ""; + Set cookies = driver.manage().getCookies(); + String denghao = "="; + String fenhao = "; "; + for (Cookie cookie : cookies) { + headerCookie = headerCookie + cookie.getName() + denghao + cookie.getValue() + fenhao; + } + headerCookie = headerCookie.substring(0, headerCookie.length() - 2); + log.info("headerCookie{}", headerCookie); + // 把cookie放入参数数组 + int parmasSize = 1; + String[] parmas = new String[parmasSize]; + parmas[0] = headerCookie; + while (true) { + List elements = null; + elements = driver.findElements(By.cssSelector(".result-table-list tr")); + log.info("长度::::::" + elements.size()); + Thread.sleep(1000*10); + try{ + log.info("当前正在抓取的是{}",driver.findElement(By.className("countPageMark")).getText()); + }catch (Exception e){ + if(elements != null && elements.size() == 0){ + log.info("关键词-{}-无搜索结果,跳出循环!!!!!!!",keyword); + break; + }else{ + log.info("没定位到页码位置"); + } + } + + String lastOneTime = ""; + if (elements != null && elements.size() > 0) { + //不是空列表,进行列表解析 + for (WebElement webElement : elements) { + ListParse listParse = new ListParse(); + Map map = new HashMap(16); + Map resultData = new HashMap(16); + try { + String id = webElement.findElement(By.cssSelector(".seq")).getText(); + map.put("id",id); + } catch (Exception e) { + log.info("没有id"); + } + String title = ""; + try { + title = webElement.findElement(By.cssSelector(".name")).getText(); + } catch (Exception e) { + log.info("没有title"); + } + map.put("title", title); + resultData.put("title", title); + String author = ""; + try { + author = 
webElement.findElement(By.cssSelector(".author")).getText(); + } catch (Exception e) { + // TODO: handle exception + } + map.put("author", author); + resultData.put("author", author); + String source = ""; + try { + source = webElement.findElement(By.cssSelector(".source")).getText(); + } catch (Exception e) { + // TODO: handle exception + } + map.put("source", source); + resultData.put("source", source); + String date = ""; + try { + date = webElement.findElement(By.cssSelector(".date")).getText(); + lastOneTime = date; + } catch (Exception e) { + // TODO: handle exception + } + map.put("date", date); + String data = ""; + try { + data = webElement.findElement(By.cssSelector(".data")).getText(); + } catch (Exception e) { + log.info("没找到data"); + e.printStackTrace(); + } + map.put("data", data); + String quote = ""; + try { + quote = webElement.findElement(By.cssSelector(".quote")).getText(); + } catch (Exception e) { + // TODO: handle exception + } + map.put("quote", quote); + String download = ""; + try { + download = webElement.findElement(By.cssSelector(".download")).getText(); + } catch (Exception e) { + // TODO: handle exception + } + map.put("download", download); + //详情链接 + String detailUrl = ""; + String wxtype = ""; + url = "https://kns.cnki.net/kcms/detail/detail.aspx?"; + try { + detailUrl = webElement.findElement(By.cssSelector(".fz14")).getAttribute("href") + .replaceAll(".*Detail\\?", url); + wxtype = "中文"; + } catch (Exception e) { + try { + detailUrl = webElement.findElement(By.cssSelector(".fz14-w")).getAttribute("href") + .replaceAll(".*Detail\\?", url); + wxtype = "外文"; + } catch (Exception e2) { + log.info("无详情链接"); + continue; + } + } + map.put("detailUrl", detailUrl); + map.put("wxtype", wxtype); + map.put("keyword", keyword); + String pdfPath = "//*[@id=\"pdfDown\"]"; + log.info("详情链接=" + detailUrl); + String newsId = DateUtil.getMd5(detailUrl); + if (ConfigCache.KyyBloomFilter.contains(newsId)) { + log.info("重复数据"); + continue; + } else { + 
Map attrMap = new HashMap(16); + attrMap.put("crawlDataFlag", channeName); + Map attachMap = new HashMap(16); + attachMap.put("crawlDataFlag", channeName); + attachMap.put("attachTag", ""); + attachMap.put("appId", "kyyz"); + attachMap.put("project_name", "kyyz"); + attrMap.put("attachtag", attachMap); + attrMap.put("appId", "kyyz"); + attrMap.put("attachTag", attachTag); + attrMap.put("tname", channeName); + attrMap.put("keyword", keyword); + attrMap.put("project_name", "kyyz"); + resultData.put("attr", attrMap); + resultData.put("brief",map); + QueueUtils.kyyzTaskQueue.add(resultData); + } + } + } else { + // 触发验证了 + try { + if (driver.findElements(By.id("vericode")).size() > 0) { + WebElement texts = driver.findElement(By.id("vericode")); + String text = texts.getText(); + System.out.println("触发验证了"); + List imgs = driver.findElements(By.id("changeVercode")); + if (imgs.size() > 0) { + // 确定有图片验证码,调用识别图片验证码方法,传入参数—上面获取到的cookie + String vericode = getCodeByPhoto(parmas[0]); + System.out.println("识别到的验证码=" + vericode); + WebElement inputElem = driver.findElement(By.id("vericode")); + // 输入验证码 + inputElem.sendKeys(vericode); + // 点击提交 + driver.findElement(By.id("checkCodeBtn")).click(); + actions.sendKeys(Keys.END).perform(); + Thread.sleep(10000); + + } + } + continue; + } catch (Exception e) { + log.error("识别图片验证码失败", e); + try { + if (driver.findElements(By.id("vericode")).size() > 0) { + WebElement texts = driver.findElement(By.id("vericode")); + String text = texts.getText(); + System.out.println("触发验证了"); + List imgs = driver.findElements(By.id("changeVercode")); + if (imgs.size() > 0) { + // 确定有图片验证码,调用识别图片验证码方法,传入参数—上面获取到的cookie + String vericode = getCodeByPhoto(parmas[0]); + System.out.println("识别到的验证码=" + vericode); + WebElement inputElem = driver.findElement(By.id("vericode")); + // 输入验证码 + inputElem.sendKeys(vericode); + // 点击提交 + driver.findElement(By.id("checkCodeBtn")).click(); + actions.sendKeys(Keys.END).perform(); + Thread.sleep(10000); + + } + } 
+ continue; + } catch (Exception e1) { + log.error("第二次识别图片验证码失败", e); + break; + } + } + } + boolean isnext = isJudgingElement(driver, By.id("PageNext")); + //判断任务开始时间 + SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat formatTwo = new SimpleDateFormat("yyyy-MM-dd"); + long detailLastOneTime = 0; + long detailCrawlStartTime = 0; + try { + detailLastOneTime = format.parse(lastOneTime).getTime(); + } catch (ParseException e) { + detailLastOneTime = formatTwo.parse(lastOneTime).getTime(); + } + try { + detailCrawlStartTime = format.parse(crawlStartTime).getTime(); + } catch (ParseException e) { + detailCrawlStartTime = formatTwo.parse(crawlStartTime).getTime(); + } + boolean istime = detailLastOneTime > detailCrawlStartTime; + if (isnext||istime) { + System.out.println("下一页"); + driver.findElement(By.id("PageNext")).click(); + DateUtil.sleep(3000); + } else { + log.info("已经是最后一页了"); + break; + } + DateUtil.sleep(1000 * 20); + System.gc(); + } + } catch (InterruptedException | ParseException e) { + log.error("任务异常中断,扫表进行下一条任务的处理", e); + code = false; + } finally { + if (driver != null) { + driver.quit(); + } + } + return code; + } + + @Override + public Map processTest(String attachTag, String url, String channeName, String keyword, String crawlStartTime, String crawlEndTime, String nextPageTime, String documentType) { + return null; + } + /** + * 判断某个元素是否存在 + */ + public static boolean isJudgingElement(WebDriver webDriver, By by) { + try { +// webDriver.findElements(by).get(2); + webDriver.findElement(By.id("PageNext")); + return true; + } catch (Exception e) { + log.info("不存在此元素"); + return false; + } + } + +// public void main(String[] args) throws ParseException { +// 
process("{\"crawlDataFlag\":\"url:https://ie.cnki.net/kns/brief/result.aspx?dbprefix=SYSTZK&kw=&korder=2&other=&sel=1&NaviDatabaseName=SYST_042_CLS&NaviField=%e8%a1%8c%e4%b8%9a%e5%88%86%e7%b1%bb%e4%bb%a3%e7%a0%81&systemno=04201&DSCode=04201/石油与天然气工程\",\"attachTag\":\"石油与天然气工程\",\"appId\":\"113ic\",\"project_name\":\"113ic\"}","https://ie.cnki.net/kns/brief/result.aspx?dbprefix=SYSTZK&kw=&korder=2&other=&sel=1&NaviDatabaseName=SYST_042_CLS&NaviField=%e8%a1%8c%e4%b8%9a%e5%88%86%e7%b1%bb%e4%bb%a3%e7%a0%81&systemno=04201&DSCode=04201/石油与天然气工程","url:https://ie.cnki.net/kns/brief/result.aspx?dbprefix=SYSTZK&kw=&korder=2&other=&sel=1&NaviDatabaseName=SYST_042_CLS&NaviField=%e8%a1%8c%e4%b8%9a%e5%88%86%e7%b1%bb%e4%bb%a3%e7%a0%81&systemno=04201&DSCode=04201/石油与天然气工程","","2022-12-30 00:00:00"); +// } +} diff --git a/src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImplTest.java b/src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImplTest.java new file mode 100644 index 0000000..9671373 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/KyyzProcessImplTest.java @@ -0,0 +1,361 @@ +package com.bfd.cnki.crawl.process; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.bfd.cnki.crawl.util.DateUtil; +import com.bfd.cnki.crawl.util.EjdDownloadHtml; +import com.bfd.cnki.crawl.util.ParametricAssembly; +import com.bfd.cnki.crawl.util.QueueUtils; +import lombok.extern.slf4j.Slf4j; +import okhttp3.*; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.springframework.stereotype.Service; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + 
/**
 * Keyword-driven crawler for the CNKI search-result grid (list pages only).
 *
 * <p>Each result row is converted into a task map and pushed to
 * {@code QueueUtils.kyyzTaskQueue}; rows already seen are skipped through
 * {@code ConfigCache.KyyBloomFilter}.
 *
 * @author zhaoying
 * @version 1.0
 */
@Slf4j
@Service
public class KyyzProcessImplTest implements KyyzProcess {

    /** Endpoint returning one page of the search-result grid. */
    private static final String GRID_URL = "https://kns.cnki.net/kns8s/brief/grid";
    /** Canonical prefix that replaces everything up to {@code Detail?} in a row's href. */
    private static final String DETAIL_URL_PREFIX = "https://kns.cnki.net/kcms/detail/detail.aspx?";
    private static final Pattern DETAIL_HREF_PREFIX = Pattern.compile(".*Detail\\?");
    private static final Pattern PAGE_NUM = Pattern.compile("pageNum=(\\d+)");
    /** Date layouts tried in order for the grid's date column. */
    private static final String[] DATE_FORMATS = {"yyyy-MM-dd HH:mm", "yyyy-MM-dd", "yyyy"};
    /** Page cap applied when the task asks for "all pages" ({@code -1}). */
    private static final int DEFAULT_STOP_PAGE = 10;
    private static final String CRAWL_DATA_FLAG = "keyword:数据汇总";

    // NOTE(review): proxy endpoint and credentials are hard-coded in source; they should be
    // moved to external configuration. Values are left unchanged here to preserve behavior.
    static java.net.Proxy proxy = new java.net.Proxy(java.net.Proxy.Type.HTTP, new InetSocketAddress("u270.40.tp.16yun.cn", 6448));
    final static String USERNAME = "16HFBVJC";
    final static String PASSWORD = "897944";
    static Authenticator proxyAuthenticator = new Authenticator() {
        @Override
        public Request authenticate(Route route, Response response) throws IOException {
            // Answer the proxy's 407 challenge with HTTP basic credentials.
            String credential = Credentials.basic(USERNAME, PASSWORD);
            return response.request().newBuilder().header("proxy-authorization", credential).build();
        }
    };

    /**
     * Not implemented in this class.
     *
     * @return always {@code null}; callers must not unbox the result
     */
    @Override
    public Boolean process(String attachTag, String url, String channeName, String keyword, String crawlStartTime, String crawlEndTime, String nextPageTime) {
        return null;
    }

    /**
     * Crawls the result grid page by page and enqueues every new row that falls inside the
     * requested time window.
     *
     * @param attachTag      JSON string; its {@code field} entry is copied into every task
     * @param url            if it contains no {@code '#'} it is parsed as the first page number
     * @param channeName     unused by this implementation
     * @param keyword        search keyword
     * @param crawlStartTime lower bound, epoch milliseconds as a decimal string
     * @param crawlEndTime   upper bound, epoch milliseconds as a decimal string
     * @param nextPageTime   last page to fetch; {@code -1} means "use the default cap"
     * @param documentType   document-type selector passed to the parameter assembler
     * @return map with {@code code} (false when a list page could not be downloaded) and
     *         {@code page} (the page counter when crawling stopped)
     * @throws NumberFormatException if {@code nextPageTime} (or a '#'-less {@code url}) is not an integer
     */
    @Override
    @SuppressWarnings({"rawtypes", "unchecked"})
    public Map processTest(String attachTag, String url, String channeName, String keyword, String crawlStartTime,
                           String crawlEndTime, String nextPageTime, String documentType) {
        Map result = new HashMap(16);
        int stopPage = Integer.parseInt(nextPageTime);
        if (stopPage == -1) {
            log.info("翻页到底");
            stopPage = DEFAULT_STOP_PAGE;
        }
        int num = 1;
        if (!url.contains("#")) {
            num = Integer.parseInt(url);
        }
        log.info("任务翻页为:{}", stopPage);
        String field = JSONObject.parseObject(attachTag).get("field").toString();
        log.info("field:{}", field);
        log.info("任务参数============documentType:{}", documentType);

        // An unparsable bound disables that side of the window instead of failing per row.
        long startMillis = parseEpochMillis(crawlStartTime, Long.MIN_VALUE);
        long endMillis = parseEpochMillis(crawlEndTime, Long.MAX_VALUE);
        log.info("crawlStartTime:{}", startMillis);
        log.info("crawlEndTime:{}", endMillis);

        String params = ParametricAssembly.getParams(keyword, String.valueOf(num), documentType);
        while (num <= stopPage) {
            log.info("第 {} 页", num);
            log.info("组装完成的参数为:{}", params);
            String html = null;
            try {
                html = EjdDownloadHtml.okHttpProxyPostRe(GRID_URL, params);
            } catch (Exception e) {
                log.error("列表页下载异常 params:{}", params, e);
            }
            // Jsoup.parse rejects null, so treat a missing body as an empty page.
            Document doc = Jsoup.parse(html == null ? "" : html);
            Elements rows = doc.select("tbody").select("tr");
            if (rows.isEmpty()) {
                log.info("页面异常 html:{}", html);
                log.warn("列表页下载失败!!!!!!!!!!!!!!!!!!!");
                result.put("code", false);
                result.put("page", num);
                return result;
            }
            log.info("下载成功");

            boolean hasNext = true;
            for (Element row : rows) {
                log.info("解析列表==========================================");
                String date = row.select(".date").text();
                // NOTE(review): an unparsable date yields 0 and therefore ends the crawl when a
                // start bound is set — behavior kept from the original; confirm it is intended.
                long dateTimestamp = parseDateMillis(date);
                if (dateTimestamp < startMillis) {
                    log.info("date:{},dateTimestamp:{}数据不在时间范围内了 结束任务!!!!!!", date, dateTimestamp);
                    hasNext = false;
                    break;
                }
                if (dateTimestamp > endMillis) {
                    log.info("不符合时间范围-大于任务结束时间, date:{},dateTimestamp:{}", date, dateTimestamp);
                    continue;
                }
                log.info("符合时间范围 date:{},dateTimestamp:{}", date, dateTimestamp);

                Map brief = parseBrief(row, date, keyword, documentType);
                String title = (String) brief.get("title");
                String author = (String) brief.get("author");
                String newsId = DateUtil.getMd5(title + author + field);
                log.info("解析到 newsId:{}", newsId);
                if (ConfigCache.KyyBloomFilter.contains(newsId)) {
                    log.info("重复数据");
                    continue;
                }

                Map resultData = new HashMap(16);
                resultData.put("searchKeyword", keyword);
                resultData.put("field", field);
                resultData.put("title", title);
                resultData.put("author", author);
                resultData.put("source", brief.get("source"));
                resultData.put("dedupKey", newsId);
                resultData.put("attr", buildAttr(attachTag));
                resultData.put("brief", brief);
                QueueUtils.kyyzTaskQueue.add(resultData);
                ConfigCache.KyyBloomFilter.add(newsId);
                log.info("存入消重队列 newsId:{}", newsId);
            }

            num++;
            if (doc.select("#Page_next_top").isEmpty() || !hasNext) {
                log.info("关键词:{} 翻页结束", keyword);
                break;
            }
            log.info("还有下一页");
            Matcher matcher = PAGE_NUM.matcher(params);
            if (matcher.find()) {
                int nextPageNum = Integer.parseInt(matcher.group(1)) + 1;
                params = params.replace(matcher.group(), "pageNum=" + nextPageNum);
            }
        }
        result.put("code", true);
        result.put("page", num);
        return result;
    }

    /** Extracts the list-page fields of one grid row into the "brief" map. */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static Map parseBrief(Element row, String date, String keyword, String documentType) {
        Map brief = new HashMap(16);
        // jsoup returns "" (never throws) when a selector matches nothing.
        brief.put("id", row.select(".seq").text());
        brief.put("title", row.select(".name").select(".fz14").text());

        List<String> authors = new ArrayList<>();
        for (Element authorLink : row.select(".author").select("a")) {
            authors.add(authorLink.text().trim());
        }
        brief.put("author", String.join(";", authors));
        brief.put("source", row.select(".source").text());
        brief.put("date", date);

        String data = resolveDocumentLabel(documentType, row);
        log.info("data================================{}", data);
        brief.put("data", data);
        brief.put("quote", row.select(".quote").text());
        brief.put("download", row.select(".download").text());

        // Chinese titles link through ".fz14"; foreign ones through ".fz14-w". Because jsoup's
        // attr() returns "" instead of throwing, the fallback must test for emptiness.
        String wxtype = "中文";
        String detailUrl = toDetailUrl(row.select(".fz14").attr("href"));
        if (detailUrl.isEmpty()) {
            String foreignUrl = toDetailUrl(row.select(".fz14-w").attr("href"));
            if (foreignUrl.isEmpty()) {
                log.info("无详情链接");
            } else {
                detailUrl = foreignUrl;
                wxtype = "外文";
            }
        }
        log.info("详情链接={}", detailUrl);
        brief.put("detailUrl", detailUrl);
        brief.put("wxtype", wxtype);
        brief.put("keyword", keyword);
        return brief;
    }

    /** Rewrites a row href so that everything up to {@code Detail?} becomes the canonical prefix. */
    private static String toDetailUrl(String href) {
        return DETAIL_HREF_PREFIX.matcher(href).replaceAll(DETAIL_URL_PREFIX);
    }

    /**
     * Maps the task's document type to its display label; when the task does not name one of
     * the three known types the row's own {@code .data} cell is used instead.
     */
    private static String resolveDocumentLabel(String documentType, Element row) {
        boolean known = documentType.contains(Constants.DOCUMENTTYPE_ONE)
                || documentType.contains(Constants.DOCUMENTTYPE_TWO)
                || documentType.contains(Constants.DOCUMENTTYPE_THREE);
        if (!known) {
            return row.select(".data").text();
        }
        if (documentType.equals(Constants.DOCUMENTTYPE_ONE)) {
            return "学术期刊";
        }
        if (documentType.equals(Constants.DOCUMENTTYPE_TWO)) {
            return "学位论文";
        }
        if (documentType.equals(Constants.DOCUMENTTYPE_THREE)) {
            return "会议";
        }
        // A combined selector such as "1,2" has no single label.
        return "";
    }

    /** Builds the fixed routing attributes attached to every task. */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static Map buildAttr(String attachTag) {
        Map attachMap = new HashMap(16);
        attachMap.put("crawlDataFlag", CRAWL_DATA_FLAG);
        attachMap.put("attachTag", "");
        attachMap.put("appId", "ic");
        attachMap.put("project_name", "ic");

        Map attrMap = new HashMap(16);
        attrMap.put("crawlDataFlag", CRAWL_DATA_FLAG);
        attrMap.put("attachtag", attachMap);
        attrMap.put("appId", "ic");
        attrMap.put("attachTag", attachTag);
        attrMap.put("tname", CRAWL_DATA_FLAG);
        attrMap.put("keyword", "数据汇总");
        attrMap.put("project_name", "ic");
        return attrMap;
    }

    /** Parses an epoch-millisecond string, returning {@code fallback} when it is not a number. */
    private static long parseEpochMillis(String value, long fallback) {
        try {
            return Long.parseLong(value);
        } catch (NumberFormatException e) {
            log.warn("时间范围参数无法解析:{}", value);
            return fallback;
        }
    }

    /** Tries each known date layout in turn; returns 0 when none matches. */
    private static long parseDateMillis(String date) {
        for (String format : DATE_FORMATS) {
            try {
                return DateUtil.timeToTimestamp(date, format);
            } catch (ParseException ignored) {
                // fall through to the next, less specific layout
            }
        }
        log.info("时间解析失败 date:{}", date);
        return 0L;
    }

    /**
     * Posts the form-encoded {@code params} to the grid endpoint through the authenticated proxy.
     *
     * @param params form-encoded request body
     * @return response body as text
     * @throws IOException on network failure
     */
    public static String doDownload(String params) throws IOException {
        OkHttpClient client = new OkHttpClient.Builder()
                .readTimeout(200, TimeUnit.SECONDS)
                .connectTimeout(200, TimeUnit.SECONDS)
                .proxy(proxy)
                .proxyAuthenticator(proxyAuthenticator)
                .retryOnConnectionFailure(true)
                .build();
        MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded; charset=UTF-8");
        RequestBody body = RequestBody.create(mediaType, params);
        Request request = new Request.Builder()
                .url(GRID_URL)
                .method("POST", body)
                .addHeader("Accept", "*/*")
                .addHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
                .addHeader("Cache-Control", "no-cache")
                .addHeader("Connection", "keep-alive")
                .addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
                .addHeader("Origin", "https://kns.cnki.net")
                .addHeader("Pragma", "no-cache")
                .addHeader("Referer", "https://kns.cnki.net/kns8s/AdvSearch?classid=WD0FTY92")
                .addHeader("Sec-Fetch-Dest", "empty")
                .addHeader("Sec-Fetch-Mode", "cors")
                .addHeader("Sec-Fetch-Site", "same-origin")
                .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36")
                .addHeader("X-Requested-With", "XMLHttpRequest")
                .addHeader("sec-ch-ua", "\"Google Chrome\";v=\"125\", \"Chromium\";v=\"125\", \"Not.A/Brand\";v=\"24\"")
                .addHeader("sec-ch-ua-mobile", "?0")
                .addHeader("sec-ch-ua-platform", "\"Windows\"")
                .build();
        // try-with-resources releases the connection even when reading the body fails.
        try (Response response = client.newCall(request).execute()) {
            return response.body().string();
        }
    }

    /** Manual smoke check of the time-window comparison; prints "1" outside, "2" inside. */
    public static void main(String[] args) {
        long sample = 1652976000000L;
        long lower = Long.parseLong("1577836800000");
        long upper = Long.parseLong("1724025600000");
        System.out.println(sample < lower || sample > upper ? "1" : "2");
    }
}

// ---- file: src/main/java/com/bfd/cnki/crawl/process/ListParse.java ----
package com.bfd.cnki.crawl.process;

import lombok.extern.slf4j.Slf4j;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;

import java.util.HashMap;
import java.util.Map;

/**
 * Parses one Selenium-rendered result row of the list page.
 *
 * @author zhaoying
 * @version 1.0
 */
@Slf4j
public class ListParse {

    private static final String DETAIL_URL_PREFIX = "https://kns.cnki.net/kcms/detail/detail.aspx?";

    /**
     * Reads the visible columns of a result row.
     *
     * @param webElement the row element
     * @return map with {@code title}, {@code author}, {@code source} and a nested {@code brief}
     *         map; fields whose element is missing are empty strings ({@code id} is omitted)
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public Map doListParse(WebElement webElement) {
        Map brief = new HashMap(16);
        Map resultData = new HashMap(16);

        String id = textOrNull(webElement, ".seq");
        if (id != null) {
            brief.put("id", id);
        }
        String title = textOrEmpty(webElement, ".name");
        String author = textOrEmpty(webElement, ".author");
        String source = textOrEmpty(webElement, ".source");
        brief.put("title", title);
        brief.put("author", author);
        brief.put("source", source);
        brief.put("date", textOrEmpty(webElement, ".date"));
        brief.put("data", textOrEmpty(webElement, ".data"));
        brief.put("quote", textOrEmpty(webElement, ".quote"));
        brief.put("download", textOrEmpty(webElement, ".download"));

        // Chinese titles link through ".fz14"; foreign ones through ".fz14-w".
        String wxtype = "中文";
        String detailUrl = detailUrlOrNull(webElement, ".fz14");
        if (detailUrl == null) {
            wxtype = "外文";
            detailUrl = detailUrlOrNull(webElement, ".fz14-w");
            if (detailUrl == null) {
                log.info("无详情链接");
                detailUrl = "";
                wxtype = "";
            }
        }
        brief.put("detailUrl", detailUrl);
        brief.put("wxtype", wxtype);

        resultData.put("title", title);
        resultData.put("author", author);
        resultData.put("source", source);
        resultData.put("brief", brief);
        return resultData;
    }

    /** Text of the first child matching {@code css}, or {@code null} when it cannot be read. */
    private static String textOrNull(WebElement parent, String css) {
        try {
            return parent.findElement(By.cssSelector(css)).getText();
        } catch (RuntimeException e) {
            // A missing column is expected for some rows; record it and carry on.
            log.info("列表项缺少元素 {}: {}", css, e.getMessage());
            return null;
        }
    }

    private static String textOrEmpty(WebElement parent, String css) {
        String text = textOrNull(parent, css);
        return text == null ? "" : text;
    }

    /** Canonical detail URL taken from the href of {@code css}, or {@code null} when absent. */
    private static String detailUrlOrNull(WebElement parent, String css) {
        try {
            return parent.findElement(By.cssSelector(css)).getAttribute("href")
                    .replaceAll(".*Detail\\?", DETAIL_URL_PREFIX);
        } catch (RuntimeException e) {
            return null;
        }
    }
}
diff --git a/src/main/java/com/bfd/cnki/crawl/process/OilContentParse.java b/src/main/java/com/bfd/cnki/crawl/process/OilContentParse.java new file mode 100644 index 0000000..2a86e0f --- /dev/null +++
// ---- file: src/main/java/com/bfd/cnki/crawl/process/OilContentParse.java ----
package com.bfd.cnki.crawl.process;

import com.alibaba.fastjson.JSONObject;
import com.bfd.cnki.crawl.cache.ConfigCache;
import com.bfd.cnki.crawl.entity.Constants;
import com.bfd.cnki.crawl.util.DateUtil;
import com.bfd.cnki.crawl.util.DownlodContentHtml;
import com.bfd.cnki.crawl.util.Kafkautils;
import com.bfd.cnki.crawl.util.QueueUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

/**
 * Worker that drains {@code QueueUtils.oilTaskQueue}, downloads each detail page, enriches
 * the task with the fields found there and publishes the result to Kafka.
 *
 * @author zhaoying
 * @version 1.0
 */
@Slf4j
public class OilContentParse implements Runnable {

    /** Marker that identifies a failed download. */
    private static final String DOWNLOAD_ERROR_MARKER = "Download failed error is:";
    /** Text of the site's "too many requests" page. */
    private static final String BLOCKED_MARKER = "由于访问过于频繁,本次请求已被阻止";
    private static final int MAX_RETRIES = 4;
    private static final long IDLE_SLEEP_MILLIS = 10_000L;
    private static final long BACKOFF_MILLIS = 60_000L;

    private static KafkaProducer producer = Kafkautils.getKafkaProdect(ConfigCache.mainConfig.get(Constants.BROKERS).toString());

    /** Polls the task queue until {@code ConfigCache.isExec} is cleared or the thread is interrupted. */
    @Override
    public void run() {
        while (ConfigCache.isExec) {
            log.info("当前oilTaskQueue队列长度={}", QueueUtils.oilTaskQueue.size());
            try {
                if (QueueUtils.oilTaskQueue.size() > 0) {
                    processNextTask();
                } else {
                    Thread.sleep(IDLE_SLEEP_MILLIS);
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop; swallowing it would make shutdown impossible.
                Thread.currentThread().interrupt();
                log.warn("oil detail worker interrupted, stopping");
                return;
            } catch (Throwable e) {
                log.error("未知异常!!!!!!!!!!!", e);
            }
        }
    }

    /** Takes one task and parses it; a failing task is logged and dropped, not rethrown. */
    @SuppressWarnings("rawtypes")
    private static void processNextTask() throws InterruptedException {
        Map resultData = QueueUtils.oilTaskQueue.take();
        try {
            ContentParse(resultData);
        } catch (InterruptedException e) {
            throw e;
        } catch (Exception e) {
            log.warn("detail task is:{}", resultData, e);
        }
    }

    /**
     * Downloads and parses one detail page.
     *
     * <p>On download or parse failure the task is re-queued once per call, up to
     * {@value #MAX_RETRIES} attempts, after which it goes to the error queue. On success the
     * task is sent to Kafka and its key is added to the Bloom filter.
     *
     * @param resultData task map produced by the list crawler; mutated in place
     * @throws InterruptedException if interrupted while sleeping or queueing
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static void ContentParse(Map resultData) throws InterruptedException {
        String url = resultData.get(Constants.URL).toString();
        String purl = resultData.get(Constants.PURL).toString();
        Map briefMap = briefOf(resultData);

        Map header = new HashMap(16);
        header.put("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        header.put("accept-language", "zh-CN,zh;q=0.9");
        header.put("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36");
        header.put("Referer", purl);

        String html = null;
        try {
            html = DownlodContentHtml.doGet(url, header);
        } catch (Exception e) {
            log.info("页面下载失败", e);
        }
        if (html == null || html.contains(DOWNLOAD_ERROR_MARKER)) {
            log.info("download fial");
            retryOrGiveUp(resultData, "下载失败重试次数已用尽,休眠60s");
            // Stop here: parsing the error placeholder would only re-queue the task a second time.
            return;
        }
        if (html.contains(BLOCKED_MARKER)) {
            // NOTE(review): the task is not re-queued in this branch (kept from the original);
            // it is only picked up again if the list crawler re-discovers it.
            log.info("由于访问过于频繁,本次请求已被阻止,休眠60s");
            Thread.sleep(BACKOFF_MILLIS);
            log.info("休眠结束,继续执行");
            return;
        }

        Document doc = Jsoup.parse(html);

        // Keep whichever author string is longer: the detail page's or the list page's.
        String pageAuthor = doc.select("#aulist").text().replace("【作者】", "").trim();
        Object listed = resultData.get("author");
        String listAuthor = listed == null ? "" : listed.toString();
        resultData.put("author", pageAuthor.length() > listAuthor.length() ? pageAuthor : listAuthor);

        putLabelled(briefMap, doc.select("#au_en").text(), "【Author】");

        Elements paragraphs = doc.select(".author.summaryRight").select("p");
        if (paragraphs.isEmpty()) {
            log.info("页面没有-机构,摘要,Abstract字段{}", url);
        }
        for (Element paragraph : paragraphs) {
            putLabelled(briefMap, paragraph.text(), "【机构】", "【摘要】", "【Abstract】");
        }

        Elements keywordBlocks = doc.select(".keywords.int5.summaryRight");
        if (keywordBlocks.isEmpty()) {
            log.info("页面没有-关键词,Key words,基金字段{}", url);
        }
        for (Element block : keywordBlocks) {
            putLabelled(briefMap, block.text(), "【关键词】", "【Key words】", "【基金】");
        }

        Elements breakItems = doc.select(".break").select("li");
        if (breakItems.isEmpty()) {
            log.info("页面没有-DOI{}", url);
        }
        for (Element item : breakItems) {
            putLabelled(briefMap, item.text(), "【DOI】");
        }

        String urlMd5 = DateUtil.getMd5(url);
        resultData.put("url", url);
        resultData.put("iid", urlMd5);
        resultData.put("news_id", urlMd5);
        resultData.put("charset", "UTF-8");
        resultData.put("type", "newscontent");
        resultData.put("tasks", new ArrayList<>());
        resultData.put("version", "1");
        resultData.put("needDoDown", false);
        resultData.put("isDownload", false);
        resultData.put("cid", "Nzhiwang");
        resultData.put("content", doc.select("#main").text());
        resultData.put("forwardcontent", doc.select("#main").html());
        resultData.put("brief", JSONObject.toJSONString(briefMap));

        if (isEmptyValue(resultData.get(Constants.TITLE)) || isEmptyValue(resultData.get(Constants.CONTENT))) {
            log.error("解析失败的页面={}", url);
            retryOrGiveUp(resultData, "失败重试次数已用尽,休眠60s");
            return;
        }
        producer.send(new ProducerRecord(ConfigCache.mainConfig.get(Constants.TOPIC).toString(),
                JSONObject.toJSONString(resultData)));
        producer.flush();
        // Only mark the item as seen once it has actually been published.
        ConfigCache.OilBloomFilter.add(resultData.get(Constants.MAINKEY).toString());
    }

    /**
     * Returns the task's brief map. A retried task carries the brief as a JSON string (it is
     * serialised at the end of every parse), so that form is decoded instead of discarded.
     */
    @SuppressWarnings("rawtypes")
    private static Map briefOf(Map resultData) {
        Object brief = resultData.get(Constants.BRIEF);
        if (brief instanceof Map) {
            return (Map) brief;
        }
        if (brief instanceof String) {
            try {
                JSONObject parsed = JSONObject.parseObject((String) brief);
                if (parsed != null) {
                    return parsed;
                }
            } catch (RuntimeException e) {
                log.info("brief is not valid JSON, starting from an empty map", e);
            }
        }
        log.info("列表传过来的是空的breifMap");
        return new HashMap(16);
    }

    /**
     * Stores {@code text} under the first label it contains. The key is the label without its
     * 【】 brackets and the value is the text with the label removed.
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static void putLabelled(Map brief, String text, String... labels) {
        for (String label : labels) {
            if (text.contains(label)) {
                brief.put(label.replace("【", "").replace("】", ""), text.replace(label, ""));
                return;
            }
        }
    }

    /**
     * Re-queues the task with an incremented attempt counter, or moves it to the error queue
     * and backs off for a minute once {@value #MAX_RETRIES} attempts have been used.
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static void retryOrGiveUp(Map resultData, String exhaustedMessage) throws InterruptedException {
        int attempts = resultData.containsKey(Constants.ERRORTIME) ? (int) resultData.get(Constants.ERRORTIME) : 0;
        if (attempts >= MAX_RETRIES) {
            log.error("重试次数已用尽{}", JSONObject.toJSONString(resultData));
            QueueUtils.errorOilTaskQueue.put(resultData);
            log.info(exhaustedMessage);
            Thread.sleep(BACKOFF_MILLIS);
            log.info("休眠结束,继续执行");
            return;
        }
        resultData.put(Constants.ERRORTIME, attempts + 1);
        QueueUtils.oilTaskQueue.put(resultData);
    }

    private static boolean isEmptyValue(Object value) {
        return value == null || value.equals(Constants.EMPTY);
    }
}

// ---- file: src/main/java/com/bfd/cnki/crawl/process/OilProcess.java ----
package com.bfd.cnki.crawl.process;

import org.redisson.api.RBloomFilter;

/**
 * Contract for the list-page crawler of the oil and gas channel.
 *
 * @author zhaoying
 * @version 1.0
 */
public interface OilProcess {

    /**
     * Runs one list-crawl task.
     *
     * @param attachTag  JSON string describing the task
     * @param url        entry URL of the result list
     * @param channeName channel name copied into every produced record
     * @return {@code true} when the crawl ran to completion, {@code false} otherwise
     */
    Boolean process(String attachTag, String url, String channeName);
}

// ---- file: src/main/java/com/bfd/cnki/crawl/process/OilProcessImpl.java ----
package com.bfd.cnki.crawl.process;

import com.alibaba.fastjson.JSONObject;
import com.bfd.cnki.crawl.cache.ConfigCache;
import com.bfd.cnki.crawl.entity.Constants;
import com.bfd.cnki.crawl.util.DateUtil;
import com.bfd.cnki.crawl.util.GetDriver;
import com.bfd.cnki.crawl.util.Kafkautils;
import com.bfd.cnki.crawl.util.QueueUtils;
import com.bfd.cnki.crawl.util.UseDb;

import lombok.extern.slf4j.Slf4j;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.openqa.selenium.*;
import org.openqa.selenium.interactions.Actions;
import org.redisson.api.RBloomFilter;
import org.springframework.stereotype.Service;

import java.util.*;

import static com.bfd.cnki.crawl.util.PicCheckUtil.getCodeByPhoto;

/**
 * Selenium-driven list crawler: opens the result list, applies the search filters, walks the
 * pages and pushes one detail task per new item to {@code QueueUtils.oilTaskQueue}.
 *
 * @author zhaoying
 * @version 1.0
 */
@Slf4j
@Service
public class OilProcessImpl implements OilProcess {

    private static final long SHORT_WAIT_MILLIS = 5_000L;
    private static final long LONG_WAIT_MILLIS = 10_000L;
    /** Pause after turning a page (5 s for the click to settle plus 20 s of throttling). */
    private static final long PAGE_TURN_WAIT_MILLIS = 25_000L;
    /** A page of exactly this many items aborts the task (see the check in {@link #process}). */
    private static final int DEFAULT_PAGE_SIZE = 20;
    /** Upper bound on consecutive empty pages before the crawl gives up. */
    private static final int MAX_EMPTY_PAGE_RETRIES = 10;
    private static final String SORT_BY_DATE_XPATH = "//*[@id=\"divSearchResult\"]/div[1]/div[1]/a[2]";
    private static final By PAGER = By.className("page-prev");

    // Not used by the active code path; kept because class initialisation creates the producer.
    private static KafkaProducer producer = Kafkautils.getKafkaProdect(ConfigCache.mainConfig.get(Constants.BROKERS).toString());

    /**
     * Crawls every result page for the category named in {@code attachTag}.
     *
     * @param attachTag  JSON string; its {@code Constants.ATTACH_TAG} entry names the category
     * @param url        entry URL of the result list
     * @param channeName channel name copied into every produced task
     * @return {@code true} when paging finished normally; {@code false} when the task was
     *         aborted by an error or by a page of exactly {@value #DEFAULT_PAGE_SIZE} items
     */
    @Override
    @SuppressWarnings({"rawtypes", "unchecked"})
    public Boolean process(String attachTag, String url, String channeName) {
        boolean success = true;
        WebDriver driver = null;
        try {
            driver = new GetDriver().getWebDriverDriver();
            Actions actions = new Actions(driver);
            Map attachTagMap = (Map) JSONObject.parse(attachTag);
            String tag = attachTagMap.get(Constants.ATTACH_TAG).toString();
            log.info("当前任务的attachTag={}", tag);
            driver.get(url);
            Thread.sleep(LONG_WAIT_MILLIS);

            // The session cookie is needed later by the captcha recogniser.
            String headerCookie = cookieHeader(driver);
            log.info("headerCookie{}", headerCookie);

            selectCategory(driver, tag);
            log.info("当前的翻页规则是==={}", ConfigCache.mainConfig.get(Constants.ORDERBY));
            boolean reverseOrder = ConfigCache.mainConfig.get(Constants.ORDERBY).toString().contains("1");
            prepareResultList(driver, reverseOrder);
            log.info("以上完成了所有条件的点击--开始执行解析");

            String purl = url.substring(0, url.lastIndexOf("/"));
            int emptyPages = 0;
            while (true) {
                List<WebElement> items = driver.findElements(By.cssSelector(".s-item.clearfix"));
                log.info("列表长度={}", items.size());
                try {
                    String page = driver.findElement(By.xpath("//*[@id=\"divSearchResult\"]/div[1]/div[2]/span[4]")).getText();
                    log.info("当前抓取的是第{}页==================================", page);
                } catch (Exception e) {
                    log.info("获取当前页码信息失败");
                }
                // NOTE(review): presumably 20 items means the "50 per page" click did not take
                // effect, so the task is failed and retried later — confirm with the author.
                if (items.size() == DEFAULT_PAGE_SIZE) {
                    return false;
                }

                if (items.isEmpty()) {
                    log.info("空页面---------");
                    // Bound the retries: an empty page without a captcha would otherwise spin forever.
                    if (++emptyPages > MAX_EMPTY_PAGE_RETRIES) {
                        log.warn("连续空页面次数超过上限,结束任务 url={}", url);
                        break;
                    }
                    if (!solveCaptchaWithRetry(driver, actions, headerCookie)) {
                        break;
                    }
                    continue;
                }
                emptyPages = 0;

                for (WebElement item : items) {
                    Map task = buildTask(item, tag, channeName, purl);
                    if (task != null) {
                        QueueUtils.oilTaskQueue.put(task);
                    }
                }

                // In reverse order the crawl walks backwards, so the pager link sits at index 1.
                int pagerIndex = reverseOrder ? 1 : 2;
                boolean hasNext = reverseOrder ? upPage(driver, PAGER) : isJudgingElement(driver, PAGER);
                if (!hasNext) {
                    log.info("已经是最后一页了");
                    break;
                }
                log.info("有下一页");
                driver.findElements(PAGER).get(pagerIndex).click();
                Thread.sleep(PAGE_TURN_WAIT_MILLIS);
            }
        } catch (Throwable e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            log.error("任务异常中断,扫表进行下一条任务的处理", e);
            success = false;
        } finally {
            if (driver != null) {
                driver.quit();
            }
        }
        return success;
    }

    /** Serialises the browser's cookies as a {@code Cookie} request-header value. */
    private static String cookieHeader(WebDriver driver) {
        StringJoiner joiner = new StringJoiner("; ");
        for (Cookie cookie : driver.manage().getCookies()) {
            joiner.add(cookie.getName() + "=" + cookie.getValue());
        }
        return joiner.toString();
    }

    /** Clicks the first left-hand category whose text contains {@code tag}. */
    private static void selectCategory(WebDriver driver, String tag) {
        for (WebElement entry : driver.findElements(By.className("leftlist_1"))) {
            String label = entry.getText();
            log.info("leftlist={}", label);
            if (label.contains(tag)) {
                entry.click();
                return;
            }
        }
    }

    /** Applies the publish-date filter, the sort order and the 50-per-page setting. */
    private static void prepareResultList(WebDriver driver, boolean reverseOrder) throws InterruptedException {
        driver.findElement(By.xpath("//*[@id=\"publishdate_from\"]")).sendKeys("2018-01-01");
        log.info("成功输入开始时间");
        driver.findElement(By.xpath("//*[@id=\"divSearchResult\"]/div[1]/div[1]")).click();
        driver.findElement(By.xpath("//*[@id=\"btnSearch\"]")).click();
        log.info("成功点击检索");
        Thread.sleep(LONG_WAIT_MILLIS);

        driver.findElement(By.xpath(SORT_BY_DATE_XPATH)).click();
        log.info("成功点击按发表时间排序");
        Thread.sleep(LONG_WAIT_MILLIS);
        if (reverseOrder) {
            Thread.sleep(LONG_WAIT_MILLIS);
            log.info("需要点击按倒序排列");
            // A second click on the same link flips the sort direction.
            driver.findElement(By.xpath(SORT_BY_DATE_XPATH)).click();
            log.info("成功点击按发表时间倒序排序");
            Thread.sleep(LONG_WAIT_MILLIS);
        }

        driver.findElement(By.xpath("//*[@id=\"divSearchResult\"]/div[1]/div[1]/div/a[4]")).click();
        log.info("成功点击每页显示50条");
        Thread.sleep(SHORT_WAIT_MILLIS);
    }

    /**
     * Converts one list item into a detail task.
     *
     * @return the task map, or {@code null} when the item has no title/link or is a duplicate
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static Map buildTask(WebElement item, String tag, String channeName, String purl) {
        Map resultData = new HashMap(16);
        Map briefMap = new HashMap(16);

        String author = null;
        try {
            author = item.findElements(By.className("s-author")).get(0).getText();
            if (author != null) {
                author = DateUtil.getStrByPattern(author, "(?<=作者:).*");
            }
        } catch (Exception e) {
            log.error("没有定位到作者");
        }
        resultData.put("author", author);

        try {
            String source = item.findElements(By.className("s-author")).get(1).getText();
            if (source != null) {
                resultData.put("source", DateUtil.getStrByPattern(source, "(?<=来源:).*"));
            }
        } catch (Exception e) {
            log.error("没有定位到来源");
        }

        try {
            String sourceDatabaseSign = "来源数据库:";
            String sourceDatabase = item.findElements(By.className("s-author")).get(2).getText();
            if (sourceDatabase.contains(sourceDatabaseSign)) {
                briefMap.put("sourceDatabase", sourceDatabase.replace(sourceDatabaseSign, ""));
            }
        } catch (Exception e) {
            log.error("没有定位到来源数据库");
        }

        String title;
        String pageUrl;
        try {
            WebElement link = item.findElement(By.className("s-title")).findElement(By.tagName("a"));
            title = link.getText();
            pageUrl = link.getAttribute("href");
        } catch (Exception e) {
            log.error("没有定位标题");
            return null;
        }
        if (pageUrl == null) {
            log.info("没获取到详情链接,进行下一条");
            return null;
        }
        resultData.put("title", title);

        String postTime = null;
        try {
            postTime = item.findElement(By.cssSelector(".fr")).getText();
            if (postTime != null) {
                String time = DateUtil.getStrByPattern(postTime, "[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] [0-9][0-9]:[0-9][0-9]");
                if (time != null) {
                    postTime = DateUtil.formatTime(time, "yyyy-MM-dd HH:mm");
                } else {
                    time = DateUtil.getStrByPattern(postTime, "[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]");
                    postTime = DateUtil.formatTime(time, "yyyy-MM-dd");
                }
            } else {
                log.info("没获取到时间");
            }
        } catch (Exception e) {
            log.info("没定位到时间");
        }

        // Rebuild the detail URL from only the dbname and filename query fragments.
        StringBuilder detail = new StringBuilder(Constants.OIL_URL_PREFIX);
        for (String fragment : pageUrl.split(Constants.AND)) {
            if (fragment.contains(Constants.DBNAME_IS)) {
                detail.append(fragment);
            } else if (fragment.contains(Constants.FILENAME_IS)) {
                detail.append(Constants.AND);
                detail.append(fragment);
            }
        }
        resultData.put("post_time", postTime);
        resultData.put("url", detail.toString());
        resultData.put("brief", briefMap);
        resultData.put("attr", buildAttr(tag, channeName));

        String mainKey = DateUtil.getMd5(title + postTime);
        if (ConfigCache.OilBloomFilter.contains(mainKey)) {
            log.info("重复数据");
            return null;
        }
        resultData.put("mainKey", mainKey);
        resultData.put("purl", purl);
        return resultData;
    }

    /** Builds the routing attributes attached to every task. */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static Map buildAttr(String tag, String channeName) {
        Map attachMap = new HashMap(16);
        attachMap.put("crawlDataFlag", channeName);
        attachMap.put("attachTag", tag);
        attachMap.put("appId", "ic");
        attachMap.put("project_name", "ic");

        Map attrMap = new HashMap(16);
        attrMap.put("crawlDataFlag", channeName);
        attrMap.put("attachtag", attachMap);
        attrMap.put("appId", "ic");
        attrMap.put("attachTag", tag);
        attrMap.put("tname", channeName);
        attrMap.put("keyword", "");
        attrMap.put("project_name", "ic");
        return attrMap;
    }

    /**
     * Tries to clear a paging captcha, with one retry.
     *
     * @return {@code false} only when both attempts threw
     */
    private static boolean solveCaptchaWithRetry(WebDriver driver, Actions actions, String cookie) throws InterruptedException {
        try {
            solveCaptchaIfPresent(driver, actions, cookie);
            return true;
        } catch (InterruptedException e) {
            throw e;
        } catch (Exception first) {
            log.error("识别图片验证码失败", first);
        }
        try {
            solveCaptchaIfPresent(driver, actions, cookie);
            return true;
        } catch (InterruptedException e) {
            throw e;
        } catch (Exception second) {
            log.error("第二次识别图片验证码失败", second);
            return false;
        }
    }

    /** Recognises and submits the image captcha when the verification panel is shown. */
    private static void solveCaptchaIfPresent(WebDriver driver, Actions actions, String cookie) throws Exception {
        if (driver.findElements(By.id("divVeriCode")).isEmpty()) {
            // No captcha: give the page a moment before the caller re-reads the list.
            Thread.sleep(SHORT_WAIT_MILLIS);
            return;
        }
        log.info("触发验证了");
        if (driver.findElements(By.id("CheckCodeImg")).isEmpty()) {
            return;
        }
        String vericode = getCodeByPhoto(cookie);
        log.info("识别到的验证码={}", vericode);
        driver.findElement(By.id("CheckCode")).sendKeys(vericode);
        driver.findElement(By.xpath("//*[@id=\"divVeriCode\"]/p[1]/input[2]")).click();
        actions.sendKeys(Keys.END).perform();
        Thread.sleep(LONG_WAIT_MILLIS);
    }

    /**
     * Reports whether a third element matching {@code by} exists (the forward pager link).
     */
    public boolean isJudgingElement(WebDriver webDriver, By by) {
        return hasElementAt(webDriver, by, 2);
    }

    /**
     * Reports whether a second element matching {@code by} exists (the backward pager link).
     */
    public boolean upPage(WebDriver webDriver, By by) {
        return hasElementAt(webDriver, by, 1);
    }

    private static boolean hasElementAt(WebDriver webDriver, By by, int index) {
        try {
            webDriver.findElements(by).get(index).getAttribute("onclick");
            return true;
        } catch (Exception e) {
            log.info("不存在此元素");
            return false;
        }
    }
}
diff --git a/src/main/java/com/bfd/cnki/crawl/process/SfgzContentParse.java b/src/main/java/com/bfd/cnki/crawl/process/SfgzContentParse.java new file mode 100644 index 0000000..9a2254b --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/SfgzContentParse.java @@ -0,0 +1,580 @@ +package com.bfd.cnki.crawl.process; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.bfd.cnki.crawl.util.DateUtil; +import com.bfd.cnki.crawl.util.DownlodContentHtml; +import com.bfd.cnki.crawl.util.Kafkautils; +import com.bfd.cnki.crawl.util.QueueUtils; +import lombok.extern.slf4j.Slf4j; +import org.apache.kafka.clients.producer.KafkaProducer; +import
org.apache.kafka.clients.producer.ProducerRecord; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * @author:zhaoying + * @className:ContentParse + * @version:1.0 + * @description: + * @Date:2022-11-18 11:52:56 + */ +@Slf4j +public class SfgzContentParse implements Runnable { + private static KafkaProducer producer = Kafkautils.getKafkaProdect(ConfigCache.mainConfig.get(Constants.BROKERS).toString()); + public static final Pattern PATTERN = Pattern.compile("(?<=TurnPageToKnetV\\().*?(?=\\))"); + public Matcher matcher; + + @Override + public void run() { + while (true) { + log.info("当前SfgzTaskQueue队列长度={}", QueueUtils.sfgzTaskQueue.size()); + try { + if (QueueUtils.sfgzTaskQueue.size() > 0) { + Map resultData = null; + try { + resultData = QueueUtils.sfgzTaskQueue.take(); + Thread.sleep(1000 * 10); + doContentParse(resultData); + } catch (InterruptedException e) { + log.warn("detail task is:{}", resultData); + e.printStackTrace(); + } + } else { + try { + Thread.sleep(1000 * 10); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } catch (Throwable e) { + log.error("未知异常!!!!!!!!!!!", e); + } + + } + } + + /** + * 解析详情 + */ + public void doContentParse(Map resultData) throws InterruptedException { +// Map briefMap = new HashMap<>(16); + Map map = (Map) resultData.get(Constants.MAPDATA); + Map headers = new HashMap(16); + headers.put("Accept", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"); + headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); + headers.put("User-Agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"); + String detailUrl = map.get(Constants.DETAILURL).toString(); + String downLoadError = 
"Download failed error is:"; + String html = null; + String content = ""; + String forwardcontent = ""; + String firstUrlKey = "firstUrl"; + String fhtml = ""; + String ghtml = ""; + String zhtml = ""; + String hhtml = ""; + String jhtml = ""; + String dhtml = ""; + if (!map.containsKey(firstUrlKey)) { + log.info("失败"); + return; + } + Map briefMap = new HashMap<>(16); + briefMap.put("url", detailUrl); + try { + fhtml = DownlodContentHtml.doGet(map.get("firstUrl").toString(), headers); + log.info("fhtml:{}",fhtml); + Map fMap = JSONObject.parseObject(fhtml); + String message = fMap.get("message").toString(); + if (message.equals(Constants.SUCCESS)) { + Map data = (Map) fMap.get("data"); + List> metadata = (List>) data.get("metadata"); + if (metadata.size() > 0) { + for (Map meta : metadata) { + String name = meta.get("name").toString(); + if (name.equals(Constants.NAME)) { + //名称 + String value = meta.get("value").toString(); + briefMap.put("authorName", value); + resultData.put("title", value); + } + if (name.equals(Constants.ORG)) { + //机构 + String value = meta.get("value").toString(); + briefMap.put("institution", value); + } + if (name.equals(Constants.YJLY)) { + //领域 + String value = meta.get("value").toString(); + briefMap.put("area", value); + } + } + } + } + } catch (Exception e) { + log.info("fhtml 页面解析失败", e); + } + Thread.sleep(2*1000); + try { + //作者关注领域 + ghtml = DownlodContentHtml.doGet(map.get("guanzhulingyu").toString(), headers); + Map fMap = JSONObject.parseObject(ghtml); + String message = fMap.get("message").toString(); + if (message.equals(Constants.SUCCESS)) { + Map data = (Map) fMap.get("data"); + List> metadata = (List>) data.get("data"); + if (metadata.size() > 0) { + List itemsList = new ArrayList<>(); + for (Map meta : metadata) { + String item = meta.get("item").toString(); + itemsList.add(item); + } + briefMap.put("focusAareas", itemsList); + } + } + } catch (Exception e) { + log.info("ghtml 页面解析失败", e); + } + Thread.sleep(2*1000); + 
try { + //作者文献 + zhtml = DownlodContentHtml.doGet(map.get("zuigaobeiyin").toString(), headers); + Map fMap = JSONObject.parseObject(zhtml); + String message = fMap.get("message").toString(); + if (message.equals(Constants.SUCCESS)) { + Map data = (Map) fMap.get("data"); + List> dataList = (List>) data.get("data"); + if (dataList.size() > 0) { + List itemsList = new ArrayList<>(); + for (Map dataMap : dataList) { + String value = ""; + List> metadata = (List>) dataMap.get("metadata"); + if (metadata.size() > 0) { + for (Map meta : metadata) { + String name = meta.get("name").toString(); + if (name.equals(Constants.TI)) { + //文献名 + value = meta.get("value").toString() + "."; + } + if (name.equals(Constants.AU)) { + //作者名 + value = value.concat(meta.get("value").toString()).concat("."); + } + } + } + Map source = (Map) dataMap.get("source"); + value = value + source.get("title").toString() + "."; + String issue = ""; + try { + issue = source.get("issue").toString(); + issue = "(" + issue + ")"; + } catch (Exception e) { + log.info("作者文献-没有编号"); + } + value = value + source.get("year").toString() + issue; + itemsList.add(value); + } + briefMap.put("contribution", itemsList); + } + } + } catch (Exception e) { + log.info("zhtml 页面解析失败", e); + } + Thread.sleep(2*1000); + try { + //作者导师 + dhtml = DownlodContentHtml.doGet(map.get("daoshi").toString(), headers); + Map fMap = JSONObject.parseObject(dhtml); + String message = fMap.get("message").toString(); + if (message.equals(Constants.SUCCESS)) { + List> dataList = (List>) fMap.get("data"); + if (dataList.size() > 0) { + List itemsList = new ArrayList<>(); + for (Map dataMap : dataList) { + String item = dataMap.get("title").toString(); + itemsList.add(item); + } + briefMap.put("tutors", itemsList); + } + } + } catch (Exception e) { + log.info("dhtml 页面解析失败", e); + } + Thread.sleep(2*1000); + try { + //合作作者 + hhtml = DownlodContentHtml.doGet(map.get("hezuo").toString(), headers); + Map fMap = JSONObject.parseObject(hhtml); 
+ String message = fMap.get("message").toString(); + if (message.equals(Constants.SUCCESS)) { + Map data = (Map) fMap.get("data"); + List> dataList = (List>) data.get("data"); + if (dataList.size() > 0) { + List itemsList = new ArrayList<>(); + for (Map dataMap : dataList) { + String key = dataMap.get("title").toString(); + String value = ""; + List> affiliations = (List>) dataMap.get("affiliations"); + if (affiliations.size() > 0) { + for (Map affiliation : affiliations) { + value = affiliation.get("title").toString(); + } + } + String item = key + " " + value; + itemsList.add(item); + } + briefMap.put("coauthor", itemsList); + } + } + } catch (Exception e) { + log.info("hhtml 页面解析失败", e); + } + Thread.sleep(2*1000); + try { + // Funds that supported the author + jhtml = DownlodContentHtml.doGet(map.get("jijin").toString(), headers); + Map fMap = JSONObject.parseObject(jhtml); + String message = fMap.get("message").toString(); + if (message.equals(Constants.SUCCESS)) { + Map data = (Map) fMap.get("data"); + List> dataList = (List>) data.get("data"); + if (dataList.size() > 0) { + List itemsList = new ArrayList<>(); + for (Map dataMap : dataList) { + String title = dataMap.get("title").toString(); + itemsList.add(title); + } + briefMap.put("funds", itemsList); + } + } + } catch (Exception e) { + log.info("jhtml 页面解析失败", e); + } + Thread.sleep(2*1000); + // TODO: 2024/3/19 supervised students + briefMap.put("students", "未找到相关数据"); + // Probe the keys the values are actually stored under ("institution" / "area"); probing "institutionKey" / "areaKey" never matched, so parsed values were always overwritten by the placeholder + String institutionKey = "institution"; + if (!briefMap.containsKey(institutionKey)) { + briefMap.put("institution", "未找到相关数据"); + } + String areaKey = "area"; + if (!briefMap.containsKey(areaKey)) { + briefMap.put("area", "未找到相关数据"); + } + String focusAareasKey = "focusAareas"; + if (!briefMap.containsKey(focusAareasKey)) { + briefMap.put("focusAareas", "未找到相关数据"); + } + String contributionKey = "contribution"; + if (!briefMap.containsKey(contributionKey)) { + briefMap.put("contribution", "未找到相关数据"); + } + String coauthorKey = "coauthor"; + if
(!briefMap.containsKey(coauthorKey)) { + briefMap.put("coauthor", "未找到相关数据"); + } + String fundsKey = "funds"; + if (!briefMap.containsKey(fundsKey)) { + briefMap.put("funds", "未找到相关数据"); + } + String tutorsKey = "tutors"; + if (!briefMap.containsKey(tutorsKey)) { + briefMap.put("tutors", "未找到相关数据"); + } + String studentsKey = "students"; + if (!briefMap.containsKey(studentsKey)) { + briefMap.put("students", "未找到相关数据"); + } + log.info("briefMap={}", JSONObject.toJSONString(briefMap)); + + + try { + html = DownlodContentHtml.doGet(detailUrl, headers); + } catch (Exception e) { + log.info("html 页面解析失败", e); + } + if (html.contains(downLoadError)) { + log.info("download fial"); + if (resultData.containsKey(Constants.ERRORTIME)) { + int errortime = (int) resultData.get(Constants.ERRORTIME); + int four = 4; + if (errortime >= four) { + log.error("重试次数已用尽{}", JSONObject.toJSONString(resultData)); + QueueUtils.errorSfgzTaskQueue.put(resultData); + log.info("下载失败重试次数已用尽,休眠60s"); + Thread.sleep(1000 * 60); + log.info("休眠结束,继续执行"); + return; + } + resultData.put(Constants.ERRORTIME, errortime + 1); + } else { + resultData.put(Constants.ERRORTIME, 1); + } + QueueUtils.sfgzTaskQueue.put(resultData); + } + log.info("身份感知 html:{}",html); +// try { + Document doc = Jsoup.parse(html); + content = doc.select(".wrapper").text(); + forwardcontent = doc.select(".wrapper").html(); +// Elements elements4 = doc.select("h3"); +// String agency = ""; +// List authorAgencyUrls = new ArrayList(); +// if (elements4 != null && elements4.size() > 0) { +// if (elements4.size() > 1) { +// agency = elements4.get(1).text(); +// Elements elements5 = elements4.get(1).select("a"); +// if (elements5 != null && elements5.size() > 0) { +// for (Element element : elements5) { +// String params = element.attr("onclick"); +// matcher = PATTERN.matcher(params); +// if (matcher.find()) { +// String[] paramList = matcher.group().replace("'", "") +// .split(","); +// if (paramList.length == 4) { +// String 
authorAgencyUrl = "https://kns.cnki.net/kcms/detail/knetsearch.aspx?sfield=" +// + paramList[0] + "&skey=" + paramList[1] +// + "&code=" + paramList[2] + "&v=" +// + paramList[3]; +// authorAgencyUrls.add(authorAgencyUrl); +// } +// } +// } +// } +// } else { +// agency = elements4.get(0).text(); +// Elements elements5 = elements4.get(0).select("span").select("a"); +// if (elements5 != null && elements5.size() > 0) { +// for (Element element : elements5) { +// String params = element.attr("onclick"); +// matcher = PATTERN.matcher(params); +// if (matcher.find()) { +// String[] paramList = matcher.group().replace("'", "") +// .split(","); +// if (paramList.length == 4) { +// String authorAgencyUrl = "https://kns.cnki.net/kcms/detail/knetsearch.aspx?sfield=" +// + paramList[0] + "&skey=" + paramList[1] +// + "&code=" + paramList[2] + "&v=" +// + paramList[3]; +// authorAgencyUrls.add(authorAgencyUrl); +// } +// } +// } +// } +// } +// } +// map.put("author_agency_urls", authorAgencyUrls); +// map.put("agency", agency); +// if (map.get(Constants.TITLE).toString().equals(Constants.EMPTY)) { +// map.put("title", doc.select(".wx-tit").select("h1").text()); +// } +// Map paramter = new HashMap(16); +// Elements elements2 = doc.select(".row"); +// for (Element element : elements2) { +// String key = element.select(".rowtit").text(); +// Elements elements3 = element.select(".top-space"); +// if (elements3 != null && elements3.size() > 0) { +// for (Element element2 : elements3) { +// key = element2.select(".rowtit").text(); +// if (!key.equals(Constants.EMPTY)) { +// paramter.put(key, +// element2.text().replace(key, "").replace("更多还原", "")); +// } +// } +// } else { +// if (!key.equals(Constants.EMPTY)) { +// paramter.put(key, +// element.text().replace(key, "").replace("更多还原", "")); +// } +// } +// } +// map.put("paramter", paramter); +// List authorUrls = new ArrayList(); +// Elements elements3 = doc.select("#authorpart").select("span").select("a"); +// if (elements3 != 
null && elements3.size() > 0) { +// for (Element element : elements3) { +// String params = element.attr("onclick"); +// matcher = PATTERN.matcher(params); +// if (matcher.find()) { +// String[] paramList = matcher.group().replace("'", "").split(","); +// if (paramList.length == 4) { +// String authorUrl = "https://kns.cnki.net/kcms/detail/knetsearch.aspx?sfield=" +// + paramList[0] + "&skey=" + paramList[1] + "&code=" +// + paramList[2] + "&v=" + paramList[3]; +// authorUrls.add(authorUrl); +// } +// } +// } +// } +// map.put("author_urls", authorUrls); +// List publishAgencyUrls = new ArrayList(); +// Elements elements5 = doc.select(".top-tip").select("span").select("a"); +// if (elements5 != null && elements5.size() > 0) { +// if (elements5 != null && elements5.size() > 0) { +// Pattern pattern1 = Pattern.compile("(?<=getKns8NaviLink\\().*?(?=\\))"); +// Matcher matcher1 = null; +// for (Element element : elements5) { +// matcher1 = pattern1.matcher(element.attr("onclick")); +// if (matcher1.find()) { +// String[] paramList = matcher1.group().replace("'", "") +// .split(","); +// if (paramList.length == 2) { +// String publishAgencyUrl = "https://kns.cnki.net/kcms/detail/navipage.aspx?dbcode=" +// + paramList[0] + "&baseid=" + paramList[1]; +// publishAgencyUrls.add(publishAgencyUrl); +// } +// } +// } +// } +// } +// map.put("publish_agency_urls", publishAgencyUrls); +// Elements elements1 = doc.select(".top-tip").select(".type"); +// map.put("eisci", ""); +// if (elements1 != null && elements1.size() > 0) { +// String eisci = ""; +// for (Element element : elements1) { +// if (eisci.equals(Constants.EMPTY)) { +// eisci = element.text(); +// } else { +// eisci += "," + element.text(); +// } +// } +// map.put("eisci", eisci); +// } +// map.put("initial_mark", doc.select(".head-time").text()); +// map.put("language", "中文"); +// try { +// String fileUrl = doc.select(".btn-dlpdf").get(0).select("a").attr("href"); +// map.put("fileUrl", fileUrl); +// } catch 
(Exception e) { +// log.info("没找到pdf按钮{}", detailUrl); +// } +// } catch (Exception e) { +// // TODO: handle exception +// e.printStackTrace(); +// log.info("详情页面不正常,丢掉:{}", detailUrl); +// } +// try { +// resultData.put("post_time", formatTime(map.get(Constants.DATE).toString(), "yyyy-MM-dd HH:mm")); +// } catch (Exception e) { +// try { +// resultData.put("post_time", formatTime(map.get(Constants.DATE).toString(), "yyyy-MM-dd")); +// } catch (Exception e2) { +// log.error("时间解析失败-->{}", map.get(Constants.DATE).toString()); +// } +// } + resultData.put("content", content); + resultData.put("forwardcontent", forwardcontent); + resultData.put("news_id", DateUtil.getMd5(detailUrl)); + resultData.put("author", Constants.AUTHOR); + resultData.put("source", Constants.SOURCE); + resultData.put("cid", "Nzhiwang"); + resultData.put("url", detailUrl); + resultData.put("type", "newscontent"); + resultData.put("isDownload", false); + resultData.put("iid", DateUtil.getMd5(detailUrl)); + resultData.put("tasks", new ArrayList<>()); + resultData.put("version", "3"); +// inserContent(map); + resultData.put("brief", JSONObject.toJSONString(briefMap)); + if (resultData.get(Constants.TITLE).equals(Constants.EMPTY) || resultData.get(Constants.CONTENT).equals(Constants.EMPTY)) { + log.error("解析失败的页面={}", detailUrl); + if (resultData.containsKey(Constants.ERRORTIME)) { + int errortime = (int) resultData.get(Constants.ERRORTIME); + int four = 4; + if (errortime >= four) { + log.error("重试次数已用尽{}", JSONObject.toJSONString(resultData)); + QueueUtils.errorSfgzTaskQueue.put(resultData); + log.info("失败重试次数已用尽,休眠60s"); + Thread.sleep(1000 * 60); + log.info("休眠结束,继续执行"); + return; + } + resultData.put(Constants.ERRORTIME, errortime + 1); + } else { + resultData.put(Constants.ERRORTIME, 1); + } + QueueUtils.sfgzTaskQueue.put(resultData); + } else { + log.info("resultData={}", JSONObject.toJSONString(resultData)); +// // 发送kafka + producer.send(new 
ProducerRecord(ConfigCache.mainConfig.get(Constants.TOPIC).toString(), + JSONObject.toJSONString(resultData))); + producer.flush(); + //存入过滤器 + ConfigCache.SfgzBloomFilter.add(DateUtil.getMd5(detailUrl)); + } + } + + /** + * 1. @Description: + * 2. @Param: + * 3. @return: + * 4. @Author: ying.zhao + * 5. @Date: 2022/11/18 + */ + public static String formatTime(String date, String format) throws ParseException { + SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH); + String format1 = ""; + Date parse = sdf.parse(date); + SimpleDateFormat sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + format1 = sdf1.format(parse); + return format1; + } + + +// public void inserContent(Map map) { +// String id = DateUtil.getMd5(map.get("detailUrl").toString()); +// String code = map.get("id").toString(); +// String url = map.get("detailUrl").toString(); +// String keyword = map.get("keyword").toString().replace("'", "''"); +// String title = map.get("title").toString().replace("'", "''"); +// String author = map.get("author").toString().replace("'", "''"); +// String source = map.get("source").toString().replace("'", "''"); +// String date = map.get("date").toString(); +// String data = map.get("data").toString().replace("'", "''"); +// String quote = map.get("quote").toString().replace("'", "''"); +// String download = map.get("download").toString().replace("'", "''"); +// String agency = ""; +// try { +// agency = map.get("agency").toString().replace("'", "''"); +// } catch (Exception e) { +// // TODO: handle exception +// } +// +// Map paramterMap = (Map) map.get("paramter"); +// // 基金资助 +// String funding = ""; +// // 文中关键词 +// String keywords = ""; +// // 摘要 +// String summary = ""; +// for (String key : paramterMap.keySet()) { +// String value = (String) paramterMap.get(key); +// paramterMap.put(key, value.replace("\"", "\\\"")); +// if (key.contains("基金资助")) { +// funding = value.replace("'", "''"); +// ; +// } else if (key.contains("摘要")) { +// summary 
= value.replace("'", "''"); +// ; +// } else if (key.contains("关键词")) { +// keywords = value.replace("'", "''"); +// ; +// } +// } +// map.put("funding", funding); +// map.put("keywords", keywords); +// map.put("summary", summary); +// } +} diff --git a/src/main/java/com/bfd/cnki/crawl/process/SfgzProcess.java b/src/main/java/com/bfd/cnki/crawl/process/SfgzProcess.java new file mode 100644 index 0000000..9065273 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/SfgzProcess.java @@ -0,0 +1,12 @@ +package com.bfd.cnki.crawl.process; + +/** + * @author:zhaoying + * @className:KyyzProcess + * @version:1.0 + * @description: + * @Date:2022-11-16 11:39:43 + */ +public interface SfgzProcess { + Boolean process(String attachTag, String url, String channeName, String keyword, String crawlStartTime,String pageTypeID); +} diff --git a/src/main/java/com/bfd/cnki/crawl/process/SfgzProcessImpl.java b/src/main/java/com/bfd/cnki/crawl/process/SfgzProcessImpl.java new file mode 100644 index 0000000..3867abc --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/process/SfgzProcessImpl.java @@ -0,0 +1,387 @@ +package com.bfd.cnki.crawl.process; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.bfd.cnki.crawl.util.DateUtil; +import com.bfd.cnki.crawl.util.DesDecryption; +import com.bfd.cnki.crawl.util.GetDriver; +import com.bfd.cnki.crawl.util.QueueUtils; +import lombok.extern.slf4j.Slf4j; +import org.openqa.selenium.*; +import org.openqa.selenium.interactions.Actions; +import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; +import org.springframework.stereotype.Service; + +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static 
com.bfd.cnki.crawl.util.PicCheckUtil.getCodeByPhoto; + +/** + * @author:zhaoying + * @className:KyyzProcessImpl + * @version:1.0 + * @description: + * @Date:2022-11-18 11:41:30 + */ +@Slf4j +@Service +public class SfgzProcessImpl implements SfgzProcess { + /** + * 这里指的是python环境,不需要改动 + * 定义成常量 + */ + public final static String PPATH = "python"; + /** + * py脚本路径 + * 常量 + */ + public final static String PY_DDDD_OCR = "./dddd_ocr.py"; + protected ThreadPoolTaskExecutor kyyzExecutor; + + @Override + public Boolean process(String attachTag, String url, String channeName, String keyword, String crawlStartTime,String pageTypeID) { + Boolean code = true; + WebDriver driver = null; + try { + if(pageTypeID.equals(Constants.LISTTYPE)){ + log.info("列表页任务"); + ConditionalClick conditionalClick = new ConditionalClick(); + GetDriver getDriver = new GetDriver(); + driver = getDriver.getWebDriverDriver(); + Actions actions = new Actions(driver); + Map attachTagMap = (Map) JSONObject.parse(attachTag); + attachTag = attachTagMap.get(Constants.ATTACH_TAG).toString(); + String listUrl = "https://kns.cnki.net/kns8s/defaultresult/index"; + driver.get(listUrl); + Thread.sleep(1000 * 60); + log.info("点击选择搜索分类"); + driver.findElement(By.xpath("//*[@id=\"ModuleSearch\"]/div[1]/div/div/div[2]/div[1]/div[1]/div[1]/i")).click(); + Thread.sleep(1000 * 10); + log.info("点击作者"); + driver.findElement(By.xpath("//*[@id=\"ModuleSearch\"]/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/ul/li[6]")).click(); + Thread.sleep(1000 * 2); + driver.findElement(By.xpath("//*[@id=\"ModuleSearch\"]/div[1]/div/div/div[2]/div[1]/input[1]")).sendKeys(keyword); + driver.findElement(By.xpath("//*[@id=\"ModuleSearch\"]/div[1]/div/div/div[2]/div[1]/input[2]")).click(); + log.info("点击搜索"); + // 获取当前请求的请求头cookie + String headerCookie = ""; + Set cookies = driver.manage().getCookies(); + String denghao = "="; + String fenhao = "; "; + for (Cookie cookie : cookies) { + headerCookie = headerCookie + cookie.getName() + denghao + 
cookie.getValue() + fenhao; + } + headerCookie = headerCookie.substring(0, headerCookie.length() - 2); + log.info("headerCookie{}", headerCookie); + // 把cookie放入参数数组 + int parmasSize = 1; + String[] parmas = new String[parmasSize]; + parmas[0] = headerCookie; + while (true) { + List elements = null; + elements = driver.findElements(By.cssSelector(".result-table-list tr")); + log.info("长度::::::" + elements.size()); + Thread.sleep(1000 * 10); + try { + log.info("当前正在抓取的是{}", driver.findElement(By.className("countPageMark")).getText()); + } catch (Exception e) { + if (elements != null && elements.size() == 0) { + log.info("关键词-{}-无搜索结果,跳出循环!!!!!!!", keyword); + log.info("页面html:{}", driver.getPageSource()); + break; + } else { + log.info("没定位到页码位置"); + } + } + String lastOneTime = ""; + if (elements != null && elements.size() > 0) { + //不是空列表,进行列表解析 + for (WebElement webElement : elements) { + ListParse listParse = new ListParse(); + Map resultData = new HashMap(16); +// List showAll = webElement.findElements(By.cssSelector(".showAllAuthors")); +// if (showAll.size() > 0) { +// log.info("点击显示全部作者"); +// webElement.findElement(By.cssSelector(".showAllAuthors")).click(); +// Thread.sleep(1000 * 2); +// } + try { + lastOneTime = webElement.findElements(By.className("date")).get(0).getText(); + } catch (Exception e) { + log.info("跳过第一条"); + continue; + } + log.info("lastOneTime={}",lastOneTime); + List authors = null; + try { + authors = webElement.findElements(By.className("KnowledgeNetLink")); + } catch (Exception e) { + log.info("跳过第一条"); + continue; + } + if (authors.size() > 0) { + for (WebElement author : authors) { + Map map = new HashMap(16); + //详情链接 + String detailUrl = author.getAttribute("href"); + log.info("作者-详情链接:{}", detailUrl); + String id = ""; + if (!detailUrl.equals(Constants.EMPTY)) { + id = DateUtil.getStrByPattern(detailUrl, "(?<=\\?v=).*?(?=&)"); + } + log.info("id={}", id); + if (!id.equals(Constants.EMPTY)) { + String firstUrl = 
"https://kns.cnki.net/restapi/knowledge-api/v1/experts/detail?v=" + id; + map.put("firstUrl", firstUrl); + //作者关注领域 + String guanzhulingyu = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/domains?v=" + id + "&resource=SCDB&size=20&start=1"; + map.put("guanzhulingyu", guanzhulingyu); + //合作作者 + String hezuo = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/coauthors?v=" + id + "&resource=SCDB&size=20&start=1"; + map.put("hezuo", hezuo); + //获得支持基金 + String jijin = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/funds?v=" + id + "&resource=SCDB&size=20&start=1"; + map.put("jijin", jijin); + //指导的学生 + String students = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/students?v=" + id; + map.put("students", students); + //作者导师 + String daoshi = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/tutors?v=" + id; + map.put("daoshi", daoshi); + //作者文献 + // TODO: 2024/3/19-目前只抓最高被引 + String zuigaobeiyin = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/resources?v="+id+"&resource=SCDB&sequence=CF&size=10&sort=desc&start=1"; + map.put("zuigaobeiyin", zuigaobeiyin); + } + + map.put("detailUrl", detailUrl); + map.put("keyword", keyword); + String newsId = DateUtil.getMd5(detailUrl); + if (ConfigCache.SfgzBloomFilter.contains(newsId)) { + log.info("重复数据"); + continue; + } else { + Map attrMap = new HashMap(16); + attrMap.put("crawlDataFlag", channeName); + Map attachMap = new HashMap(16); + attachMap.put("crawlDataFlag", channeName); + attachMap.put("attachTag", ""); + attachMap.put("appId", "sfgz"); + attachMap.put("project_name", "sfgz"); + attrMap.put("attachtag", attachMap); + attrMap.put("appId", "sfgz"); + attrMap.put("attachTag", attachTag); + attrMap.put("tname", channeName); + attrMap.put("keyword", keyword); + attrMap.put("project_name", "sfgz"); + resultData.put("attr", attrMap); + resultData.put("mapData", map); + QueueUtils.sfgzTaskQueue.add(resultData); + } + } + } 
+ + } + } else { + // 触发验证了 + try { + if (driver.findElements(By.id("vericode")).size() > 0) { + WebElement texts = driver.findElement(By.id("vericode")); + String text = texts.getText(); + log.info("触发验证了"); + List imgs = driver.findElements(By.id("changeVercode")); + if (imgs.size() > 0) { + // 确定有图片验证码,调用识别图片验证码方法,传入参数—上面获取到的cookie + String vericode = getCodeByPhoto(parmas[0]); + log.info("识别到的验证码=" + vericode); + WebElement inputElem = driver.findElement(By.id("vericode")); + // 输入验证码 + inputElem.sendKeys(vericode); + // 点击提交 + driver.findElement(By.id("checkCodeBtn")).click(); + actions.sendKeys(Keys.END).perform(); + Thread.sleep(10000); + + } + } + continue; + } catch (Exception e) { + log.error("识别图片验证码失败", e); + try { + if (driver.findElements(By.id("vericode")).size() > 0) { + WebElement texts = driver.findElement(By.id("vericode")); + String text = texts.getText(); + log.info("触发验证了"); + List imgs = driver.findElements(By.id("changeVercode")); + if (imgs.size() > 0) { + // 确定有图片验证码,调用识别图片验证码方法,传入参数—上面获取到的cookie + String vericode = getCodeByPhoto(parmas[0]); + log.info("识别到的验证码=" + vericode); + WebElement inputElem = driver.findElement(By.id("vericode")); + // 输入验证码 + inputElem.sendKeys(vericode); + // 点击提交 + driver.findElement(By.id("checkCodeBtn")).click(); + actions.sendKeys(Keys.END).perform(); + Thread.sleep(10000); + + } + } + continue; + } catch (Exception e1) { + log.error("第二次识别图片验证码失败", e); + break; + } + } + } + boolean isnext = isJudgingElement(driver, By.id("PageNext")); + //判断任务开始时间 + SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat formatTwo = new SimpleDateFormat("yyyy-MM-dd"); + long detailLastOneTime = 0; + long detailCrawlStartTime = 0; + try { + detailLastOneTime = format.parse(lastOneTime).getTime(); + } catch (ParseException e) { + detailLastOneTime = formatTwo.parse(lastOneTime).getTime(); + } + try { + detailCrawlStartTime = format.parse(crawlStartTime).getTime(); + } catch (ParseException 
e) { + detailCrawlStartTime = formatTwo.parse(crawlStartTime).getTime(); + } + boolean istime = detailLastOneTime > detailCrawlStartTime; + if (isnext || istime) { + System.out.println("下一页"); + driver.findElement(By.id("PageNext")).click(); + DateUtil.sleep(3000); + } else { + log.info("已经是最后一页了"); + break; + } + DateUtil.sleep(1000 * 20); + System.gc(); + } + } else { + log.info("详情任务"); + Thread.sleep(10 * 1000); + String v = DateUtil.getStrByPattern(url, "(?<=v=).*?(?=&)"); + log.info("原链接v{}", v); + Instant now = Instant.now(); + Instant oneHourAgo = now.minus(1, ChronoUnit.HOURS); + long ago = DateUtil.getOneHourAgoTimestamp(now); + long later = DateUtil.getOneHourLaterTimestamp(now); + //des解密 + String plaintext = ""; + String newV = ""; + plaintext = DesDecryption.decryptByDES(v); + log.info("解密结果为:{}", plaintext); + //st=1721036509624&dbcode=CJFN&filename=DFSN202404011&dbname=CJFDLASN2024&resource=JOURNAL&src=SEARCH&tag=KRDS&seq=true&order=1&et=1721040109624 + String st = DateUtil.getStrByPattern(plaintext, "(?<=st=).*?(?=&)"); + log.info("st:{}", st); + String et = DateUtil.getStrByPattern(plaintext, "(?<=et=).*"); + log.info("et:{}", et); + plaintext = plaintext.replace(st, String.valueOf(ago)).replace(et, String.valueOf(later)); + newV = DesDecryption.encryptByDES(plaintext); + url = url.replace(v, newV); + log.info("新链接URL:{}", url); + Map resultData = new HashMap(16); + Map map = new HashMap(16); + //详情链接 + String detailUrl = url; + log.info("作者-详情链接:{}", detailUrl); + String id = ""; + if (!detailUrl.equals(Constants.EMPTY)) { + id = DateUtil.getStrByPattern(detailUrl, "(?<=\\?v=).*?(?=&)"); + } + log.info("id={}", id); + if (!id.equals(Constants.EMPTY)) { + String firstUrl = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/detail?v=" + id; + map.put("firstUrl", firstUrl); + //作者关注领域 + String guanzhulingyu = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/domains?v=" + id + "&resource=SCDB&size=20&start=1"; + 
map.put("guanzhulingyu", guanzhulingyu); + //合作作者 + String hezuo = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/coauthors?v=" + id + "&resource=SCDB&size=20&start=1"; + map.put("hezuo", hezuo); + //获得支持基金 + String jijin = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/funds?v=" + id + "&resource=SCDB&size=20&start=1"; + map.put("jijin", jijin); + //指导的学生 + String students = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/students?v=" + id; + map.put("students", students); + //作者导师 + String daoshi = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/tutors?v=" + id; + map.put("daoshi", daoshi); + //作者文献 + // TODO: 2024/3/19-目前只抓最高被引 + String zuigaobeiyin = "https://kns.cnki.net/restapi/knowledge-api/v1/experts/relations/resources?v="+id+"&resource=SCDB&sequence=CF&size=10&sort=desc&start=1"; + map.put("zuigaobeiyin", zuigaobeiyin); + } + + map.put("detailUrl", detailUrl); + map.put("keyword", keyword); + String newsId = DateUtil.getMd5(detailUrl); + if (ConfigCache.SfgzBloomFilter.contains(newsId)) { + log.info("重复数据"); +// continue; + } else { + Map attrMap = new HashMap(16); + attrMap.put("crawlDataFlag", channeName); + Map attachMap = new HashMap(16); + attachMap.put("crawlDataFlag", channeName); + attachMap.put("attachTag", ""); + attachMap.put("appId", "sfgz"); + attachMap.put("project_name", "sfgz"); + attrMap.put("attachtag", attachMap); + attrMap.put("appId", "sfgz"); + attrMap.put("attachTag", attachTag); + attrMap.put("tname", channeName); + attrMap.put("keyword", keyword); + attrMap.put("project_name", "sfgz"); + resultData.put("attr", attrMap); + resultData.put("mapData", map); + QueueUtils.sfgzTaskQueue.add(resultData); + } + } + + } catch (InterruptedException | ParseException e) { + log.error("任务异常中断,扫表进行下一条任务的处理", e); + code = false; + log.error("加密解密异常{}", e); + } catch (Exception e) { + e.printStackTrace(); + } finally { + if (driver != null) { + driver.quit(); + } + } + return 
code; + } + + /** + * 判断某个元素是否存在 + */ + public static boolean isJudgingElement(WebDriver webDriver, By by) { + try { +// webDriver.findElements(by).get(2); + webDriver.findElement(By.id("PageNext")); + return true; + } catch (Exception e) { + log.info("不存在此元素"); + return false; + } + } + +// public void main(String[] args) throws ParseException { +// process("{\"crawlDataFlag\":\"url:https://ie.cnki.net/kns/brief/result.aspx?dbprefix=SYSTZK&kw=&korder=2&other=&sel=1&NaviDatabaseName=SYST_042_CLS&NaviField=%e8%a1%8c%e4%b8%9a%e5%88%86%e7%b1%bb%e4%bb%a3%e7%a0%81&systemno=04201&DSCode=04201/石油与天然气工程\",\"attachTag\":\"石油与天然气工程\",\"appId\":\"113ic\",\"project_name\":\"113ic\"}","https://ie.cnki.net/kns/brief/result.aspx?dbprefix=SYSTZK&kw=&korder=2&other=&sel=1&NaviDatabaseName=SYST_042_CLS&NaviField=%e8%a1%8c%e4%b8%9a%e5%88%86%e7%b1%bb%e4%bb%a3%e7%a0%81&systemno=04201&DSCode=04201/石油与天然气工程","url:https://ie.cnki.net/kns/brief/result.aspx?dbprefix=SYSTZK&kw=&korder=2&other=&sel=1&NaviDatabaseName=SYST_042_CLS&NaviField=%e8%a1%8c%e4%b8%9a%e5%88%86%e7%b1%bb%e4%bb%a3%e7%a0%81&systemno=04201&DSCode=04201/石油与天然气工程","","2022-12-30 00:00:00"); +// } +} diff --git a/src/main/java/com/bfd/cnki/crawl/test/Demo.java b/src/main/java/com/bfd/cnki/crawl/test/Demo.java new file mode 100644 index 0000000..fc26d3a --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/test/Demo.java @@ -0,0 +1,203 @@ +package com.bfd.cnki.crawl.test; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.Proxy; +import java.nio.charset.StandardCharsets; + +import org.apache.http.HttpHost; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.CredentialsProvider; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.impl.client.CloseableHttpClient; +import 
org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; + +import okhttp3.Authenticator; +import okhttp3.Credentials; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.Response; +import okhttp3.Route; + +public final class Demo { + + public static void main(String[] args) throws IOException { + String ip = "123.184.59.61"; // 代理主机地址 + int port = 50035; // 代理主机端口 + + // 使用OKHttp库 + OKHttpProxy.build(ip, port).test(); + // OKHttpProxy.build(ip, port, "aaaaaa", "bbbbbb").test(); // 代理认证 + + // 使用HttpClient库 + // HttpClientProxy.build(ip, port).test(); + // HttpClientProxy.build(ip, port, "aaaaaa", "bbbbbb").test(); // 代理认证 + } + + /** + * OKHttp库使用代理 + */ + static class OKHttpProxy { + + String proxyHost; + int proxyPort; + String proxyAccount; + String proxyPwd; + + /** + * @param host 代理主机地址 + * @param port 代理主机端口 + */ + public static OKHttpProxy build(String host, int port) { + OKHttpProxy proxy = new OKHttpProxy(); + proxy.proxyHost = host; + proxy.proxyPort = port; + return proxy; + } + + /** + * @param host 代理主机地址 + * @param port 代理主机端口 + * @param acc 代理认证账号 + * @param pwd 代理认证口令 + */ + public static OKHttpProxy build(String host, int port, String acc, String pwd) { + OKHttpProxy proxy = new OKHttpProxy(); + proxy.proxyHost = host; + proxy.proxyPort = port; + proxy.proxyAccount = acc; + proxy.proxyPwd = pwd; + return proxy; + } + + public void test() throws IOException { + String targetUrl = "https://kns.cnki.net/kcms2/article/abstract?v=sMQVub3UVPh6FSsnungdYXEt6VUemJudPQojOTW9UDlyBfPDst0awWIbvKBJRwrSPj8ysxSpyx5Eaz7e79tSNj_aoAoIw_MGLw7FTd9NPTbiYXt_YvDyjb_UgdDZBuebAhrAaKCkAZmcsXHuE8WKWX8xXonECGAkIOZSSzzffPYlA4myNjxAo4JyQsUWmbrl8gMuQRv8W5pfivMtBMpS2kEMkRGaZfZ3JiB0NRzNMGWoJZKycw7h5PeqvAeoBphSZzpyW7iqF6YlX-AB-CUq0pkXY8z2rZC63GSO4djrHskHGFzZjyebS5V69phJofPBsKxHj_jUBhGAT31SF3gQSlFUJ_qlO6nlxrYd1NRoYsD6dRtNTfqxt7op8OyJrYpM1-2jhOt_h4A=&uniplatform=NZKPT&language=CHS"; + + OkHttpClient client = null; + if (proxyAccount == 
null || proxyPwd == null) { + client = getHttpClient(proxyHost, proxyPort); + } else { + // 账号密码验证 + client = getHttpClient(proxyHost, proxyPort, proxyAccount, proxyPwd); + } + + Request request = new Request.Builder() + .url(targetUrl) + .build(); + Response response = client.newCall(request).execute(); + System.out.println(response.body().string()); + } + + /** + * 代理不需要账号密码认证的httpClient + */ + private static OkHttpClient getHttpClient(String proxyHost, int proxyPort) { + Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort)); + return new OkHttpClient.Builder() + .proxy(proxy) + .build(); + } + + /** + * 代理需要账号密码认证的httpClient + */ + private static OkHttpClient getHttpClient(String proxyHost, int proxyPort, String acc, String pwd) { + Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort)); + // 账号密码验证 + Authenticator authenticator = new Authenticator() { + @Override + public Request authenticate(Route route, Response resp) throws IOException { + String credential = Credentials.basic(acc, pwd); + return resp.request().newBuilder().header("Proxy-Authorization", credential).build(); + } + }; + return new OkHttpClient.Builder() + .proxy(proxy) + .proxyAuthenticator(authenticator) + .build(); + } + + } + + /** + * HttpClient库使用代理 + */ + static class HttpClientProxy { + String proxyHost; + int proxyPort; + String proxyAccount; + String proxyPwd; + + /** + * @param host 代理主机地址 + * @param port 代理主机端口 + */ + public static HttpClientProxy build(String host, int port) { + HttpClientProxy proxy = new HttpClientProxy(); + proxy.proxyHost = host; + proxy.proxyPort = port; + return proxy; + } + + /** + * @param host 代理主机地址 + * @param port 代理主机端口 + * @param acc 代理认证账号 + * @param pwd 代理认证口令 + */ + public static HttpClientProxy build(String host, int port, String acc, String pwd) { + HttpClientProxy proxy = new HttpClientProxy(); + proxy.proxyHost = host; + proxy.proxyPort = port; + proxy.proxyAccount = acc; + 
proxy.proxyPwd = pwd; + return proxy; + } + + public void test() throws IOException { + String targetUrl = "http://myip.ipip.net"; + + CloseableHttpClient client = null; + if (proxyAccount == null || proxyPwd == null) { + client = getHttpClient(proxyHost, proxyPort); + } else { + // 账号密码验证 + client = getHttpClient(proxyHost, proxyPort, proxyAccount, proxyPwd); + } + HttpGet httpGet = new HttpGet(targetUrl); + CloseableHttpResponse response = client.execute(httpGet); + String resultStr = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8); + System.out.println(resultStr); + } + + /** + * 代理不需要账号密码认证的httpClient + */ + private static CloseableHttpClient getHttpClient(String proxyHost, int proxyPort) { + HttpHost proxy = new HttpHost(proxyHost, proxyPort, "HTTP"); + return HttpClients.custom() + .setProxy(proxy) + .build(); + } + + /** + * 代理需要账号密码认证的httpClient + */ + private static CloseableHttpClient getHttpClient(String proxyHost, int proxyPort, String acc, String pwd) { + HttpHost proxy = new HttpHost(proxyHost, proxyPort, "HTTP"); + CredentialsProvider provider = new BasicCredentialsProvider(); + provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials(acc, pwd)); + + return HttpClients.custom() + .setProxy(proxy) + .setDefaultCredentialsProvider(provider) + .build(); + } + + } + +} diff --git a/src/main/java/com/bfd/cnki/crawl/test/TestListDownload.java b/src/main/java/com/bfd/cnki/crawl/test/TestListDownload.java new file mode 100644 index 0000000..86cbc19 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/test/TestListDownload.java @@ -0,0 +1,79 @@ +package com.bfd.cnki.crawl.test; + +import okhttp3.*; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; + +import java.io.IOException; + +/** + * @PROJECT_NAME: cnki_crawl + * @DESCRIPTION: + * @AUTHOR: ying.zhao + * @DATE: 2024/5/30 10:24 + */ +public class TestListDownload { + public static void main(String[] args) { + boolean flag = 
true; + + int num = 1; + while (flag) { + System.out.println("第 "+num+" 页"); + String html = ""; + try { + html = doDownload(num); + } catch (IOException e) { + e.printStackTrace(); + } + Document doc = Jsoup.parse(html); + Elements elemens = doc.select(".fz14"); + if (elemens.size() < 50) { + System.out.println("页面异常 html=" + html); + flag = false; + } else { + System.out.println("下载成功"); + num++; + try { + Thread.sleep(5*1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + + } + + public static String doDownload(int num) throws IOException { + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded; charset=UTF-8"); + RequestBody body = RequestBody.create(mediaType, "boolSearch=false&QueryJson=%7B%22Platform%22%3A%22%22%2C%22Resource%22%3A%22CROSSDB%22%2C%22Classid%22%3A%22WD0FTY92%22%2C%22Products%22%3A%22%22%2C%22QNode%22%3A%7B%22QGroup%22%3A%5B%7B%22Key%22%3A%22Subject%22%2C%22Title%22%3A%22%22%2C%22Logic%22%3A0%2C%22Items%22%3A%5B%5D%2C%22ChildItems%22%3A%5B%7B%22Key%22%3A%22input%5Bdata-tipid%3Dgradetxt-1%5D%22%2C%22Title%22%3A%22%E4%B8%BB%E9%A2%98%22%2C%22Logic%22%3A0%2C%22Items%22%3A%5B%7B%22Key%22%3A%22input%5Bdata-tipid%3Dgradetxt-1%5D%22%2C%22Title%22%3A%22%E4%B8%BB%E9%A2%98%22%2C%22Logic%22%3A0%2C%22Field%22%3A%22SU%22%2C%22Operator%22%3A%22TOPRANK%22%2C%22Value%22%3A%22%E7%94%B5%E5%8A%9B%22%2C%22Value2%22%3A%22%22%7D%5D%2C%22ChildItems%22%3A%5B%5D%7D%2C%7B%22Key%22%3A%22input%5Bdata-tipid%3Dgradetxt-2%5D%22%2C%22Title%22%3A%22%E4%B8%BB%E9%A2%98%22%2C%22Logic%22%3A0%2C%22Items%22%3A%5B%7B%22Key%22%3A%22input%5Bdata-tipid%3Dgradetxt-2%5D%22%2C%22Title%22%3A%22%E4%B8%BB%E9%A2%98%22%2C%22Logic%22%3A0%2C%22Field%22%3A%22SU%22%2C%22Operator%22%3A%22TOPRANK%22%2C%22Value%22%3A%22%E6%8A%80%E6%9C%AF%22%2C%22Value2%22%3A%22%22%7D%5D%2C%22ChildItems%22%3A%5B%5D%7D%5D%7D%2C%7B%22Key%22%3A%22ControlGroup%22%2C%22Title%22%3A%22%22%2C%22Logic%22%3A0%2C%2
2Items%22%3A%5B%5D%2C%22ChildItems%22%3A%5B%5D%7D%5D%7D%2C%22ExScope%22%3A%221%22%2C%22SearchType%22%3A7%2C%22Rlang%22%3A%22CHINESE%22%2C%22KuaKuCode%22%3A%22YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CEMRPGLPA%2CWQ0UVIAA%2CBLZOG7CK%2CPWFIRAGL%2CNN3FJMUV%2CNLBO1Z6R%22%7D&pageNum=" + num + "&pageSize=50&sortField=PT&sortType=desc&dstyle=listmode&boolSortSearch=false&sentenceSearch=false&productStr=YSTT4HG0%2CLSTPFY1C%2CRMJLXHZ3%2CJQIRZIYA%2CJUP3MUPD%2C1UR4K4HZ%2CBPBAFJ5S%2CR79MZMCB%2CMPMFIG1A%2CEMRPGLPA%2CJ708GVCE%2CML4DRIDX%2CWQ0UVIAA%2CNB3BWEHK%2CXVLO76FD%2CHR1YT1Z9%2CBLZOG7CK%2CPWFIRAGL%2CNN3FJMUV%2CNLBO1Z6R%2C&aside=&searchFrom=%E8%B5%84%E6%BA%90%E8%8C%83%E5%9B%B4%EF%BC%9A%E6%80%BB%E5%BA%93%3B++%E4%B8%AD%E8%8B%B1%E6%96%87%E6%89%A9%E5%B1%95%3B++%E6%97%B6%E9%97%B4%E8%8C%83%E5%9B%B4%EF%BC%9A%E6%9B%B4%E6%96%B0%E6%97%B6%E9%97%B4%EF%BC%9A%E4%B8%8D%E9%99%90%3B++"); + Request request = new Request.Builder() + .url("https://kns.cnki.net/kns8s/brief/grid") + .method("POST", body) + .addHeader("Accept", "*/*") + .addHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") + .addHeader("Cache-Control", "no-cache") + .addHeader("Connection", "keep-alive") + .addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8") + .addHeader("Origin", "https://kns.cnki.net") + .addHeader("Pragma", "no-cache") + .addHeader("Referer", "https://kns.cnki.net/kns8s/AdvSearch?classid=WD0FTY92") + .addHeader("Sec-Fetch-Dest", "empty") + .addHeader("Sec-Fetch-Mode", "cors") + .addHeader("Sec-Fetch-Site", "same-origin") + .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36") + .addHeader("X-Requested-With", "XMLHttpRequest") + .addHeader("sec-ch-ua", "\"Google Chrome\";v=\"125\", \"Chromium\";v=\"125\", \"Not.A/Brand\";v=\"24\"") + .addHeader("sec-ch-ua-mobile", "?0") + .addHeader("sec-ch-ua-platform", "\"Windows\"") +// .addHeader("Cookie", 
"KNS2COOKIE=1717035822.285.24942.769684|b25e41a932fd162af3b8c5cff4059fc3; SID_kns_new=kns15128006") + .build(); + Response response = client.newCall(request).execute(); + String html = response.body().string(); + response.close(); + return html; + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/DateUtil.java b/src/main/java/com/bfd/cnki/crawl/util/DateUtil.java new file mode 100644 index 0000000..94179af --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/DateUtil.java @@ -0,0 +1,240 @@ +package com.bfd.cnki.crawl.util; + +import java.security.MessageDigest; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.time.temporal.ChronoUnit; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +/** + * @author:jian.mao + * @className:DateUtil + * @version:1.0 + * @description: + * @Date:2022-11-16 11:38:49 + */ +public class DateUtil { + + + /** + * 英文日期转换 + * @param time + * @param format + * @return + */ + public static String formatTimeEn(String time, String format) { + SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH); + String format1 = ""; + try { + Date parse = sdf.parse(time); + SimpleDateFormat sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + format1 = sdf1.format(parse); + } catch (Exception e) { + e.printStackTrace(); + } + return format1; + } + /** + * 日期格式话 + * @param format 日期格式 + * @param date 要转换的日期 + * @return + */ + public static String formatTime(String date, String format) throws ParseException { + SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH); + String format1 = ""; + Date parse = sdf.parse(date); + SimpleDateFormat sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + format1 = sdf1.format(parse); + return format1; + } + /** + * 正则匹配并返回参数 + * @param str 字符串 + * @param regex 正则表达式 + * @return + */ + public static String 
/**
 * Date/time, regular-expression and hashing helpers shared by the crawler.
 *
 * <p>All methods are static and create their own formatter instances, so no
 * {@link SimpleDateFormat} is ever shared between threads.
 *
 * @author jian.mao
 */
public class DateUtil {

    /** Pattern every "formatted" result of this class uses. */
    private static final String STANDARD_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /** Thread-safe formatter for {@link #STANDARD_PATTERN}. */
    private static final DateTimeFormatter STANDARD_FORMATTER = DateTimeFormatter.ofPattern(STANDARD_PATTERN);

    /** Weekday names indexed by {@code Calendar.DAY_OF_WEEK - 1} (Sunday first). */
    private static final String[] WEEKDAY_NAMES = {
        "星期日", "星期一", "星期二", "星期三", "星期四", "星期五", "星期六"
    };

    /**
     * Re-formats an English-locale date string as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param time   text to parse
     * @param format pattern {@code time} is written in
     * @return the re-formatted date, or an empty string when {@code time} cannot be parsed
     */
    public static String formatTimeEn(String time, String format) {
        SimpleDateFormat parser = new SimpleDateFormat(format, Locale.ENGLISH);
        try {
            return new SimpleDateFormat(STANDARD_PATTERN).format(parser.parse(time));
        } catch (Exception e) {
            // Best effort: callers treat "" as "no usable date".
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Re-formats an English-locale date string as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param date   text to parse
     * @param format pattern {@code date} is written in
     * @return the re-formatted date
     * @throws ParseException if {@code date} does not match {@code format}
     */
    public static String formatTime(String date, String format) throws ParseException {
        Date parsed = new SimpleDateFormat(format, Locale.ENGLISH).parse(date);
        return new SimpleDateFormat(STANDARD_PATTERN).format(parsed);
    }

    /**
     * Returns the first region of {@code str} matched by {@code regex}.
     *
     * @param str   text to search
     * @param regex regular expression
     * @return the whole first match, or {@code null} when nothing matches
     */
    public static String getStrByPattern(String str, String regex) {
        Matcher m = Pattern.compile(regex).matcher(str);
        return m.find() ? m.group(0) : null;
    }

    /**
     * Parses a date string.
     *
     * @param format pattern {@code date} is written in
     * @param date   text to parse; {@code null} or empty means "now"
     * @return the parsed date, the current time for a {@code null}/empty input, or
     *         {@code null} when parsing fails
     */
    public static Date strToDate(String format, String date) {
        SimpleDateFormat sdf = new SimpleDateFormat(format);
        if (date == null || date.equals("")) {
            return new Date();
        }
        try {
            return sdf.parse(date);
        } catch (ParseException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sleeps for the given time. If the thread is interrupted the method returns early and
     * re-asserts the interrupt flag so callers can still observe the interruption.
     *
     * @param millis time to sleep in milliseconds
     */
    public static void sleep(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Formats an epoch-millisecond timestamp.
     *
     * @param time   milliseconds since the epoch
     * @param format output pattern
     * @return the formatted timestamp
     */
    public static String loJngTimeToDateStr(long time, String format) {
        return new SimpleDateFormat(format).format(new Date(time));
    }

    /**
     * Returns the Chinese weekday name of {@code date} in the default time zone.
     *
     * @param date date to inspect
     * @return a name from "星期日" (Sunday) to "星期六" (Saturday)
     */
    public static String getWeekOne(Date date) {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        return WEEKDAY_NAMES[calendar.get(Calendar.DAY_OF_WEEK) - 1];
    }

    /**
     * Returns the current time shifted by a number of days.
     *
     * @param daycount days to add; negative values move into the past
     * @return the shifted time
     */
    public static Date getAfterTime(int daycount) {
        return getSpecifyAfterTime(new Date(), daycount);
    }

    /**
     * Returns {@code date} shifted by a number of days.
     *
     * @param date     starting point
     * @param daycount days to add; negative values move into the past
     * @return the shifted time
     */
    public static Date getSpecifyAfterTime(Date date, int daycount) {
        Calendar calendar = Calendar.getInstance();
        calendar.setTime(date);
        calendar.add(Calendar.DATE, daycount);
        return calendar.getTime();
    }

    /**
     * Tells whether {@code last} is a strictly earlier day than {@code now}.
     *
     * @param last earlier candidate, {@code yyyy-MM-dd}
     * @param now  later candidate, {@code yyyy-MM-dd}
     * @return {@code true} only when {@code last} is before {@code now}; {@code false} when
     *         they are equal, when {@code last} is later, or when either cannot be parsed
     */
    public static boolean compareToForBBS(String last, String now) {
        if (last.equals(now)) {
            return false;
        }
        try {
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
            return formatter.parse(last).before(formatter.parse(now));
        } catch (ParseException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Placeholder for a quarter lookup; not implemented.
     *
     * @param date ignored
     * @return always an empty string
     */
    public static String getQuarterly(String date) {
        return "";
    }

    /**
     * Computes the lower-case hexadecimal MD5 digest of the UTF-8 bytes of {@code string}.
     *
     * @param string text to hash
     * @return the 32-character digest, or {@code "nceaform"} followed by the current time in
     *         milliseconds when hashing fails (for example for a {@code null} input)
     */
    public static String getMd5(String string) {
        try {
            MessageDigest md5 = MessageDigest.getInstance("MD5");
            byte[] digest = md5.digest(string.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            StringBuilder hex = new StringBuilder(digest.length * 2);
            for (byte b : digest) {
                int unsigned = b & 0xff;
                if (unsigned < 0x10) {
                    hex.append('0'); // keep two characters per byte
                }
                hex.append(Integer.toHexString(unsigned));
            }
            return hex.toString();
        } catch (Exception e) {
            // Deliberate fallback kept from the original: callers always get some id.
            return "nceaform" + System.currentTimeMillis();
        }
    }

    /**
     * Returns the local time {@code min} minutes from now as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param min minutes to add; negative values move into the past
     * @return the formatted time
     */
    public static String getCurrentTimePlusMinutes(int min) {
        return LocalDateTime.now().plusMinutes(min).format(STANDARD_FORMATTER);
    }

    /**
     * @param now reference instant
     * @return epoch milliseconds of the instant one hour before {@code now}
     */
    public static long getOneHourAgoTimestamp(Instant now) {
        return now.minus(1, ChronoUnit.HOURS).toEpochMilli();
    }

    /**
     * @param now reference instant
     * @return epoch milliseconds of the instant one hour after {@code now}
     */
    public static long getOneHourLaterTimestamp(Instant now) {
        return now.plus(1, ChronoUnit.HOURS).toEpochMilli();
    }

    /**
     * Parses an English-locale date string into epoch milliseconds and echoes the value to
     * standard output.
     *
     * @param date   text to parse
     * @param format pattern {@code date} is written in
     * @return milliseconds since the epoch
     * @throws ParseException if {@code date} does not match {@code format}
     */
    public static long timeToTimestamp(String date, String format) throws ParseException {
        long timestamp = new SimpleDateFormat(format, Locale.ENGLISH).parse(date).getTime();
        System.out.println("13位时间戳: " + timestamp);
        return timestamp;
    }
}
/**
 * DES/CBC helpers for the {@code v=} query parameter used in the crawler's CNKI links: the
 * value is Base64url text wrapping a DES-encrypted query string.
 *
 * <p>Key and IV are fixed constants, so the same plaintext always yields the same ciphertext.
 *
 * @author ying.zhao
 */
public class DesDecryption {

    private static final String ALGORITHM = "DES";
    private static final String TRANSFORMATION = "DES/CBC/PKCS5Padding";

    /** Fixed 8-byte DES key. */
    private static final String KEY = "CNKIKCMS";

    /**
     * Fixed initialisation vector. The earlier "add 256 to negative bytes" loop was removed:
     * a {@code byte} wraps around, so it never changed any value.
     */
    private static final byte[] IV_BYTES = {22, 52, 86, 88, -120, -85, -51, -17};

    /**
     * Decrypts a Base64url-encoded DES/CBC ciphertext.
     *
     * @param ciphertext Base64url text as found in the {@code v=} parameter
     * @return the decrypted UTF-8 text
     * @throws Exception if the text is not valid Base64url or cannot be decrypted
     */
    public static String decryptByDES(String ciphertext) throws Exception {
        byte[] encrypted = Base64.getUrlDecoder().decode(ciphertext);
        byte[] plain = newCipher(Cipher.DECRYPT_MODE).doFinal(encrypted);
        return new String(plain, java.nio.charset.StandardCharsets.UTF_8);
    }

    /**
     * Encrypts text with DES/CBC and encodes the result as Base64url (with padding).
     *
     * @param message text to encrypt
     * @return Base64url ciphertext suitable for the {@code v=} parameter
     * @throws Exception if the cipher cannot be initialised or encryption fails
     */
    public static String encryptByDES(String message) throws Exception {
        byte[] plain = message.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        byte[] encrypted = newCipher(Cipher.ENCRYPT_MODE).doFinal(plain);
        return Base64.getUrlEncoder().encodeToString(encrypted);
    }

    /** Creates a cipher for the fixed key/IV in the requested mode. */
    private static Cipher newCipher(int mode) throws Exception {
        SecretKeySpec keySpec = new SecretKeySpec(KEY.getBytes(java.nio.charset.StandardCharsets.UTF_8), ALGORITHM);
        Cipher cipher = Cipher.getInstance(TRANSFORMATION);
        cipher.init(mode, keySpec, new IvParameterSpec(IV_BYTES));
        return cipher;
    }

    /** Encrypts one sample query string and prints the result. */
    public static void main(String[] args) {
        try {
            String x = "st=1721290265765&code=000041881595&skey=%E6%B1%A4%E8%B6%85&sfield=au&et=1721297465765";
            String encryptedText = encryptByDES(x);
            // The label used to say "Decrypted" although the value is the ciphertext.
            System.out.println("Encrypted text: " + encryptedText);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
org.apache.http.HttpEntity; +import org.apache.http.HttpHost; +import org.apache.http.StatusLine; +import org.apache.http.auth.AuthScope; +import org.apache.http.auth.UsernamePasswordCredentials; +import org.apache.http.client.AuthCache; +import org.apache.http.client.CredentialsProvider; +import org.apache.http.client.HttpRequestRetryHandler; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.config.Registry; +import org.apache.http.config.RegistryBuilder; +import org.apache.http.config.SocketConfig; +import org.apache.http.conn.socket.ConnectionSocketFactory; +import org.apache.http.conn.socket.LayeredConnectionSocketFactory; +import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.SSLConnectionSocketFactory; +import org.apache.http.impl.auth.BasicScheme; +import org.apache.http.impl.client.BasicAuthCache; +import org.apache.http.impl.client.BasicCredentialsProvider; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.http.util.EntityUtils; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.Proxy; +import java.util.*; +import java.util.concurrent.TimeUnit; + +/** + * @author:zhaoying + * @className:DownlodContentHtml + * @version:1.0 + * @description: + * @Date:2022-11-18 11:38:53 + */ +public class DownlodContentHtml { + /** 代理服务器(产品官网 www.16yun.cn) **/ + final static String PROXYHOST = "u270.40.tp.16yun.cn"; + final static Integer PROXYPORT = 6448; + /** 代理验证信息 **/ + final static String PROXYUSER = "16HFBVJC"; + final static String PROXYPASS = "897944"; + + private static PoolingHttpClientConnectionManager cm = null; + private static 
HttpRequestRetryHandler httpRequestRetryHandler = null; + private static HttpHost proxy = null; + + private static CredentialsProvider credsProvider = null; + private static RequestConfig reqConfig = null; + + static { + ConnectionSocketFactory plainsf = PlainConnectionSocketFactory + .getSocketFactory(); + LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory + .getSocketFactory(); + + Registry registry = RegistryBuilder.create().register("http", plainsf) + .register("https", sslsf).build(); + + cm = new PoolingHttpClientConnectionManager(registry); + cm.setMaxTotal(20); + cm.setDefaultMaxPerRoute(5); + + proxy = new HttpHost(PROXYHOST, PROXYPORT, "https"); + + credsProvider = new BasicCredentialsProvider(); + credsProvider.setCredentials(AuthScope.ANY, + new UsernamePasswordCredentials(PROXYUSER, PROXYPASS)); + + reqConfig = RequestConfig.custom().setConnectionRequestTimeout(5000) + .setConnectTimeout(5000).setSocketTimeout(5000) + .setExpectContinueEnabled(false) + .setProxy(new HttpHost(PROXYHOST, PROXYPORT)).build(); + } + + /** + * 模拟客户端get请求 + * + * @param url + * 模拟请求得url + * @param headers + * 头部信息,没有可以不传 + * @return + */ + public static String doGet(String url, Map... 
headers) { + // 设置超时时间 + int timeout = 30; + RequestConfig config = RequestConfig.custom() + .setConnectTimeout(timeout * 1000) + .setConnectionRequestTimeout(timeout * 1000) + .setSocketTimeout(timeout * 1000).build(); + SocketConfig socketConfig = SocketConfig.custom() + .setSoKeepAlive(false) + .setSoLinger(1) + .setSoReuseAddress(true) + .setSoTimeout(timeout * 1000) + .setTcpNoDelay(true).build(); + AuthCache authCache = new BasicAuthCache(); + authCache.put(proxy, new BasicScheme()); + HttpClientContext localContext = HttpClientContext.create(); + localContext.setAuthCache(authCache); + HttpClientBuilder httpBuilder = HttpClientBuilder.create(); + CloseableHttpClient httpClient = httpBuilder + .setDefaultSocketConfig(socketConfig) + .setDefaultRequestConfig(config) + .setDefaultCredentialsProvider(credsProvider).build(); + HttpGet httpGet = new HttpGet(url); + httpGet.setConfig(reqConfig); + if (headers != null && headers.length > 0) { + Map tempHeaders = headers[0]; + for (String key : tempHeaders.keySet()) { + httpGet.setHeader(key, tempHeaders.get(key).toString()); + } + } else { + httpGet.setHeader("Accept", + "application/json, text/javascript, */*; q=0.01"); + httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"); + } + CloseableHttpResponse response = null; + String html = ""; + int notFundCode = 404; + int successCode = 200; + try { + response = httpClient.execute(httpGet, localContext); + // 从响应模型中获取响应实体 + HttpEntity responseEntity = response.getEntity(); + StatusLine statusLine = response.getStatusLine(); + System.out.println("响应状态为:" + response.getStatusLine()); + if (statusLine.getStatusCode() == successCode) { + if (responseEntity != null) { + html = EntityUtils.toString(responseEntity, "utf-8"); + System.out.println("响应内容长度为:" + + responseEntity.getContentLength()); + // 下载结果为空不正常 + if (html.equals(Constants.EMPTY)) { + html = "Download failed error is:reslut is null"; + } + } + } else if (statusLine.getStatusCode() == notFundCode) { + 
html = "

页面404,正常结束请求即可

"; + } else { + throw new Exception("请求错误,code码为:" + statusLine.getStatusCode()); + } + } catch (Exception e) { + e.printStackTrace(); + html = "Download failed error is:reslut is null"; + }finally{ + try { + response.close(); + httpClient.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + return html; + + } + + +// final static String PROXYHOST = "172.18.128.225"; +// private static List prots = new ArrayList(Arrays.asList( +// 45007, +// 45008, +// 45009, +// 45010, +// 45011, +// 45012, +// 45013, +// 45014, +// 45015, +// 45016, +// 45017, +// 45018, +// 45019, +// 45020, +// 45021, +// 45022, +// 45023, +// 45024, +// 45025, +// 45026, +// 45027, +// 45028 +// )); +// +// private static RequestConfig reqConfig = null; +// private static HttpHost proxy = null; +// +// +// public static String doGet(String url, Map... headers) { +// int i = new Random().nextInt(20); +// int prot = prots.get(i); +// proxy = new HttpHost(PROXYHOST, prot, "https"); +// reqConfig = RequestConfig.custom().setConnectionRequestTimeout(5000) +// .setConnectTimeout(5000).setSocketTimeout(5000) +// .setExpectContinueEnabled(false) +// .setProxy(new HttpHost(PROXYHOST, prot)).build(); +// // 设置超时时间 +// int timeout = 30; +// RequestConfig config = RequestConfig.custom() +// .setConnectTimeout(timeout * 1000) +// .setConnectionRequestTimeout(timeout * 1000) +// .setSocketTimeout(timeout * 1000).build(); +// SocketConfig socketConfig = SocketConfig.custom() +// .setSoKeepAlive(false) +// .setSoLinger(1) +// .setSoReuseAddress(true) +// .setSoTimeout(timeout * 1000) +// .setTcpNoDelay(true).build(); +// AuthCache authCache = new BasicAuthCache(); +// authCache.put(proxy, new BasicScheme()); +// HttpClientContext localContext = HttpClientContext.create(); +// localContext.setAuthCache(authCache); +// HttpClientBuilder httpBuilder = HttpClientBuilder.create(); +//// CloseableHttpClient httpClient = 
httpBuilder.setDefaultSocketConfig(socketConfig).setDefaultRequestConfig(config).setDefaultCredentialsProvider(credsProvider).build(); +// CloseableHttpClient httpClient = httpBuilder.setDefaultSocketConfig(socketConfig).setDefaultRequestConfig(config).build(); +// HttpGet httpGet = new HttpGet(url); +// httpGet.setConfig(reqConfig); +// if (headers != null && headers.length > 0) { +// Map tempHeaders = headers[0]; +// for (String key : tempHeaders.keySet()) { +// httpGet.setHeader(key, tempHeaders.get(key).toString()); +// } +// } else { +// httpGet.setHeader("Accept", "application/json, text/plain, */*"); +// httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9"); +// httpGet.setHeader("Cache-Control", "no-cache"); +// httpGet.setHeader("Connection", "keep-alive"); +// httpGet.setHeader("Content-Type", "application/json"); +//// httpGet.setHeader("Origin", "https://www.tianyancha.com"); +// httpGet.setHeader("Pragma", "no-cache"); +// httpGet.setHeader("Host", "kns.cnki.net"); +// httpGet.setHeader("Sec-Fetch-Dest", "empty"); +// httpGet.setHeader("Sec-Fetch-Mode", "cors"); +// httpGet.setHeader("Sec-Fetch-Site", "same-site"); +// httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"); +// httpGet.setHeader("sec-ch-ua", "\"Google Chrome\";v=\"111\", \"Not(A:Brand\";v=\"8\", \"Chromium\";v=\"111\""); +// httpGet.setHeader("sec-ch-ua-mobile", "?0"); +// httpGet.setHeader("sec-ch-ua-platform", "\"Windows\""); +// httpGet.setHeader("Cookie", "Ecp_ClientId=7231017185000317816; Ecp_loginuserjf=17804240679; Ecp_ClientIp=120.133.68.37; cnkiUserKey=d51b4bb3-80b6-e81f-73fe-491769540f89; SID_kns_new=kns15128003; KNS2COOKIE=1721013277.576.54758.952051|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; SID_sug=018105; Ecp_IpLoginFail=240715120.133.68.37; SID_restapi=018106; Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1719914523,1721023923; Hm_lpvt_dcec09ba2227fd02c55623c1bb82776a=1721023923; 
HMACCOUNT=C387EECC72C26564; tfstk=fUkoNzarcbP7aTIprx2WOhcuZyAxVawQz2BLJJUegrzbeWU-Yycnz4VU93aKoe0EoTEUUQ6UY4gbvz8BVkynyqu3O3aL8y0KYbKvBdnSVJwEKFp9Bb885qHu8weUedGV-FL9BKdheOTeWXUM3U20ArrPz9rym-r_xJ5PU2z4glrOzJyE8m-0xuWPUurUgZr-gs6USvkj3UY4-cqohGubq74hprXH5HEuZvnzoF8ZnLPu70zcKFRXKOz3lA8vimgia4Ej-dY0QfgmK5yHENZogc24yRJP38HS4fPEQe1E4-zu_XPD4Lo0EzViHSSOG8cEPvVSZMts08uj1PNyGEZmeAF35Sb2zmn0z5cnyE6Uyfmqo5HAkOeZ1qD3_RSPDs5ZLbXQ0HHVO6Nzco46RkzJoCp-Jz-Dm1O7aoZJWvKctmPzcuzMmnf1o7rb2j5.."); +// +// } +// CloseableHttpResponse response = null; +// String html = ""; +// int notFundCode = 404; +// int successCode = 200; +// try { +// response = httpClient.execute(httpGet, localContext); +// // 从响应模型中获取响应实体 +// HttpEntity responseEntity = response.getEntity(); +// StatusLine statusLine = response.getStatusLine(); +// System.out.println("响应状态为:" + response.getStatusLine()); +// if (statusLine.getStatusCode() == successCode) { +// if (responseEntity != null) { +// html = EntityUtils.toString(responseEntity, "utf-8"); +// System.out.println("响应内容长度为:" +// + responseEntity.getContentLength()); +// // 下载结果为空不正常 +// if (html.equals(Constants.EMPTY)) { +// html = "Download failed error is:reslut is null"; +// } +// } +// } else if (statusLine.getStatusCode() == notFundCode) { +// html = "

页面404,正常结束请求即可

"; +// } else { +// throw new Exception("请求错误,code码为:" + statusLine.getStatusCode()); +// } +// } catch (Exception e) { +// e.printStackTrace(); +// html = "Download failed error is:reslut is null"; +// } finally { +// try { +// response.close(); +// httpClient.close(); +// } catch (Exception e) { +// e.printStackTrace(); +// } +// } +// return html; +// +// } + + + + + + + +// static Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("u270.40.tp.16yun.cn", 6448)); +// final static String USERNAME = "16HFBVJC"; +// final static String PASSWORD = "897944"; +// static Authenticator proxyAuthenticator = new Authenticator() { +// public Request authenticate(Route route, Response response) throws IOException { +// // 设置代理服务器账号密码 +// String credential = Credentials.basic(USERNAME, PASSWORD); +// return response.request().newBuilder().header("proxy-authorization", credential).build(); +// } +// }; + +// public static String doGet(String url, Map... headers) throws IOException { +// OkHttpClient client = new OkHttpClient().newBuilder() +// .build(); +// OkHttpClient.Builder builder = new OkHttpClient.Builder(); +// builder.readTimeout(180, TimeUnit.SECONDS); +// builder.connectTimeout(180, TimeUnit.SECONDS); +// builder.proxy(proxy); +// builder.proxyAuthenticator(proxyAuthenticator); +//// OkHttpClient client = builder.retryOnConnectionFailure(true).build(); +// Request request = new Request.Builder() +// .url(url) +// .method("GET", null) +// .addHeader("Host", "kns.cnki.net") +// .addHeader("accept", "application/json, text/javascript, */*; q=0.01") +// .addHeader("accept-language", "zh-CN,zh;q=0.9,en;q=0.8") +// .addHeader("cache-control", "no-cache") +// .addHeader("pragma", "no-cache") +// .addHeader("sec-ch-ua", "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"") +// .addHeader("sec-ch-ua-mobile", "?0") +// .addHeader("sec-ch-ua-platform", "\"Windows\"") +// .addHeader("sec-fetch-dest", "document") +// 
.addHeader("sec-fetch-mode", "navigate") +// .addHeader("sec-fetch-site", "none") +// .addHeader("sec-fetch-user", "?1") +// .addHeader("upgrade-insecure-requests", "1") +// .addHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36") +// .build(); +// Response response = client.newCall(request).execute(); +// String html = response.body().string(); +// response.close(); +// return html; +// } +// public static void main(String[] args) { +// String url = +// "https://ie.cnki.net/KCMS/detail/detail.aspx?dbcode=SYHJ&dbname=SYSTLKCCJDLAST2&filename=FDGJ202102010"; +// Map header = new HashMap(16); +// header.put("Referer", +// "https://ie.cnki.net/kns/brief/result.aspx?dbprefix=SYSTZK&kw=&korder=2&other=&sel=1&NaviDatabaseName=SYST_042_CLS&NaviField=%E8%A1%8C%E4%B8%9A%E5%88%86%E7%B1%BB%E4%BB%A3%E7%A0%81&systemno=04201&DSCode=04201"); +// String html = doGet(url,header); +// System.out.println(html); +// } +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/EjdDownloadHtml.java b/src/main/java/com/bfd/cnki/crawl/util/EjdDownloadHtml.java new file mode 100644 index 0000000..9cc2977 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/EjdDownloadHtml.java @@ -0,0 +1,460 @@ +package com.bfd.cnki.crawl.util; + +import com.bfd.cnki.crawl.entity.Constants; +import lombok.extern.slf4j.Slf4j; +import okhttp3.*; +import org.apache.http.HttpHost; +import org.apache.http.client.config.RequestConfig; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.net.Proxy; +import java.util.*; +import java.util.concurrent.TimeUnit; + +/** + * @PROJECT_NAME: cnki_crawl_kyyz + * @DESCRIPTION: + * @AUTHOR: ying.zhao + * @DATE: 2024/8/29 16:42 + */ +@Slf4j +public class EjdDownloadHtml { + final static String PROXYHOST = "172.24.12.23"; + private static List prots = new ArrayList(Arrays.asList( + 45151, + 45152, + 45153, + 45154, + 45155, + 45156, + 45157, + 45158, + 45159, + 45160, 
+ 45161, + 45162, + 45163, + 45164, + 45165, + 45166, + 45167, + 45168, + 45169, + 45170, + 45171, + 45172, + 45173, + 45174, + 45175, + 45176, + 45177, + 45178, + 45179, + 45180, + 45181, + 45182, + 45183, + 45184, + 45185, + 45186, + 45187, + 45188, + 45189, + 45190, + 45191, + 45192, + 45193, + 45194, + 45195, + 45196, + 45197, + 45198, + 45199, + 45200, + 45201, + 45202, + 45203, + 45204, + 45205, + 45206, + 45207, + 45208, + 45209, + 45210, + 45211, + 45212, + 45213, + 45214, + 45215, + 45216, + 45217, + 45218, + 45219, + 45220, + 45221, + 45222, + 45223, + 45224, + 45225, + 45226, + 45227, + 45228, + 45229, + 45230, + 45231, + 45232, + 45233, + 45234, + 45235, + 45236, + 45237, + 45238, + 45239, + 45240, + 45241, + 45242, + 45243, + 45244, + 45245, + 45246, + 45247, + 45248, + 45249, + 45250 + )); + private static List iplist = new ArrayList(); + +// public static void exec() { +// //定时扫表 +// TimerTask timerTask = new TimerTask() { +// @Override +// public void run() { +// getIpList(); +// } +// }; +// Timer timer = new Timer(); +// timer.schedule(timerTask, 1000 * 10, 1000 * 60); +// } + + public static String getIp() { + OkHttpClient.Builder builder = new OkHttpClient.Builder(); + builder.readTimeout(200, TimeUnit.SECONDS); + builder.connectTimeout(200, TimeUnit.SECONDS); + OkHttpClient client = new OkHttpClient().newBuilder() + .build(); + MediaType mediaType = MediaType.parse("text/plain"); + RequestBody body = RequestBody.create(mediaType, ""); + Request request = new Request.Builder() + .url("http://172.18.1.148:5476/getIp") + .method("GET", null) + .build(); + Response response = null; + String html = ""; + try { + response = client.newCall(request).execute(); + html = response.body().string(); + } catch (IOException e) { + e.printStackTrace(); + } + if (!html.equals("")) { + log.info("请求得到代理:{}", html); + } + + return html; + } + + private static RequestConfig reqConfig = null; + private static HttpHost proxy = null; + + public static String 
proxyDoPost(String url, String params, Map... headers) { +// int i = new Random().nextInt(10); +// int port = prots.get(i); +// String ip = ips.get(i); +// String ipAndPort = getIp().trim(); +// log.info("当前使用的是:{}", ipAndPort); + String p = ""; + String ip = ""; + int port = 0; + while (p.equals("")) { + String ipAndPort = getIp().trim(); + log.info("当前使用的是:{}", ipAndPort); + if (!ipAndPort.equals("")) { + if (!ipAndPort.contains("msg")) { + try { + p = ipAndPort.substring(ipAndPort.indexOf(":") + 1); + log.info("当前使用的port:{}。", p); + port = Integer.valueOf(p); + ip = ipAndPort.substring(0, ipAndPort.indexOf(":")); + } catch (Exception e) { + p = ""; + try { + Thread.sleep(1000 * 5); + } catch (InterruptedException interruptedException) { + interruptedException.printStackTrace(); + } +// e.printStackTrace(); + } + } else { + log.info("ip请求异常,更换E京东代理下载方式"); + int i = new Random().nextInt(10); + port = prots.get(i); + log.info("当前使用的port:{}。", port); + ip = PROXYHOST; + break; + } + + } else { + p = ""; + try { + Thread.sleep(1000 * 5); + } catch (InterruptedException interruptedException) { + interruptedException.printStackTrace(); + } + } + + } + log.info("当前使用的ip:{}", ip); + // 创建代理 + Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port)); + + // 创建OkHttpClient并配置代理和超时 + OkHttpClient client = new OkHttpClient.Builder() + .proxy(proxy) + .connectTimeout(30, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS) + .readTimeout(30, TimeUnit.SECONDS) + .retryOnConnectionFailure(true) // 配置失败重试 + .build(); + + // 设置请求体的MediaType + MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded; charset=UTF-8"); + RequestBody body = RequestBody.create(mediaType, params); + + // 构建请求头 + Request.Builder requestBuilder = new Request.Builder() + .url(url) + .post(body); // 使用POST请求 + + if (headers != null && headers.length > 0) { + Map tempHeaders = headers[0]; + for (String key : tempHeaders.keySet()) { + requestBuilder.addHeader(key, 
tempHeaders.get(key).toString()); + } + } else { + requestBuilder.addHeader("Accept", "application/json, text/plain, */*") + .addHeader("Accept", "*/*") + .addHeader("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") + .addHeader("Cache-Control", "no-cache") + .addHeader("Connection", "keep-alive") + .addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8") + .addHeader("Origin", "https://kns.cnki.net") + .addHeader("Pragma", "no-cache") + .addHeader("Referer", "https://kns.cnki.net/kns8s/AdvSearch?classid=WD0FTY92") + .addHeader("Sec-Fetch-Dest", "empty") + .addHeader("Sec-Fetch-Mode", "cors") + .addHeader("Sec-Fetch-Site", "same-origin") + .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36") + .addHeader("X-Requested-With", "XMLHttpRequest") + .addHeader("sec-ch-ua", "\"Google Chrome\";v=\"125\", \"Chromium\";v=\"125\", \"Not.A/Brand\";v=\"24\"") + .addHeader("sec-ch-ua-mobile", "?0") + .addHeader("sec-ch-ua-platform", "\"Windows\""); + } + + Request request = requestBuilder.build(); + + // 发送请求并处理响应 + try (Response response = client.newCall(request).execute()) { + if (response.isSuccessful()) { + return response.body().string(); + } else if (response.code() == 404) { + return "

页面404,正常结束请求即可

"; + } else { + throw new IOException("请求错误,code码为:" + response.code()); + } + } catch (IOException e) { + e.printStackTrace(); + return "Download failed error is:"; + } + } + + public static String proxyDoGet(String url, Map... headers) { +// int i = new Random().nextInt(10); +// int port = prots.get(i); +// String ip = ips.get(i); +// String ipAndPort = getIp().trim(); +// log.info("当前使用的是:{}", ipAndPort); +// String p = ipAndPort.substring(ipAndPort.indexOf(":") + 1); +// log.info("当前使用的port:{}。", p); +// int port = Integer.valueOf(p); +// String ip = ipAndPort.substring(0, ipAndPort.indexOf(":")); + String p = ""; + String ip = ""; + int port = 0; + while (p.equals("")) { + String ipAndPort = getIp().trim(); + log.info("当前使用的是:{}", ipAndPort); + if (!ipAndPort.equals("")) { + if(!ipAndPort.contains("msg")){ + try { + p = ipAndPort.substring(ipAndPort.indexOf(":") + 1); + log.info("当前使用的port:{}。", p); + port = Integer.valueOf(p); + ip = ipAndPort.substring(0, ipAndPort.indexOf(":")); + } catch (Exception e) { + p = ""; + try { + Thread.sleep(1000 * 5); + } catch (InterruptedException interruptedException) { + interruptedException.printStackTrace(); + } +// e.printStackTrace(); + } + }else{ + log.info("ip请求异常,更换E京东代理下载方式"); + int i = new Random().nextInt(10); + port = prots.get(i); + log.info("当前使用的port:{}。", port); + ip = PROXYHOST; + break; + } + + } else { + p = ""; + try { + Thread.sleep(1000 * 5); + } catch (InterruptedException interruptedException) { + interruptedException.printStackTrace(); + } + } + + } + log.info("当前使用的ip:{}", ip); + // 创建代理 + Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port)); + + // 创建OkHttpClient并配置代理和超时 + OkHttpClient client = new OkHttpClient.Builder() + .proxy(proxy) + .connectTimeout(30, TimeUnit.SECONDS) + .writeTimeout(30, TimeUnit.SECONDS) + .readTimeout(30, TimeUnit.SECONDS) + .retryOnConnectionFailure(true) // 配置失败重试 + .build(); + + // 构建GET请求 + Request.Builder requestBuilder = new Request.Builder() + 
.url(url) + .get(); // 使用GET请求 + + if (headers != null && headers.length > 0) { + Map tempHeaders = headers[0]; + for (String key : tempHeaders.keySet()) { + requestBuilder.addHeader(key, tempHeaders.get(key).toString()); + } + } else { + requestBuilder.addHeader("Accept", "application/json, text/plain, */*") + .addHeader("Accept-Language", "zh-CN,zh;q=0.9") + .addHeader("Cache-Control", "no-cache") + .addHeader("Connection", "keep-alive") +// .addHeader("Origin", "https://www.tianyancha.com") + .addHeader("Pragma", "no-cache") +// .addHeader("Referer", "https://www.tianyancha.com/") + .addHeader("Sec-Fetch-Dest", "empty") + .addHeader("Sec-Fetch-Mode", "cors") + .addHeader("Sec-Fetch-Site", "same-site") + .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36") + .addHeader("sec-ch-ua", "\"Google Chrome\";v=\"111\", \"Not(A:Brand\";v=\"8\", \"Chromium\";v=\"111\"") + .addHeader("sec-ch-ua-mobile", "?0") + .addHeader("sec-ch-ua-platform", "\"Windows\""); +// .addHeader("Cookie", "TYCID=e3a3a110909211eda1f8e55a7c39bb06; ssuid=6524549272; _ga=GA1.2.1408723786.1677737989; sensorsdata2015jssdkcross=..."); + } + + Request request = requestBuilder.build(); + + // 发送请求并处理响应 + try (Response response = client.newCall(request).execute()) { + if (response.isSuccessful()) { + return response.body().string(); + } else if (response.code() == 404) { + return "

页面404,正常结束请求即可

"; + } else { + throw new IOException("请求错误,code码为:" + response.code()); + } + } catch (IOException e) { + e.printStackTrace(); + return "Download failed error is:result is null"; + } + } + + public static String okHttpProxyPostRe(String url, String params, Map... headers) { + String html = ""; + try { + html = proxyDoPost(url, params, headers); + } catch (Exception e) { + log.info("下载错误", e); + } + int i = 1; + while (true) { + try { + if (html.contains("Download failed error is:")) { + log.error("第:{} 次下载失败 params:{}", i, params); + DateUtil.sleep(30 * 1000); + i++; + } else { + break; + } + if (i > 3) { + break; + } + html = proxyDoPost(url, params, headers); + } catch (Throwable e) { + log.info("下载错误", e); + } + + } + if (html.equals(Constants.EMPTY)) { + html = "Download failed error is:empty"; + } + return html; + } + + public static String okHttpProxyGetRe(String url, Map... headers) { + String html = ""; + try { + html = proxyDoGet(url, headers); + } catch (Exception e) { + log.info("下载错误", e); + } + int i = 1; + while (true) { + try { + if (html.contains("Download failed error is:")) { + log.error("第:{} 次下载失败", i); + DateUtil.sleep(30 * 1000); + i++; + } else { + break; + } + if (i > 3) { + break; + } + html = proxyDoGet(url, headers); + } catch (Throwable e) { + log.info("下载错误", e); + } + + } + if (html.equals(Constants.EMPTY)) { + html = "Download failed error is:empty"; + } + return html; + } + + + public static void main(String[] args) { +// String html = proxyDoGet("https://kns.cnki.net/kcms2/article/abstract?v=8pq0kR8SZyUA-jjvMOfvmV7784d1-xILPZ3ulEGwTSjGpgw17JHozrriMOEcbDXqj4ApyT1cXXNNH3SKYvSMz1oIAnKYsu_bsrWH5IL3EdD75CotnJjOHITk0QS0CqtbwGnNH-G65oPw6ay8qoHaKpUAeSSkMsZJXP7NLM3zVqKNbKIV3AucukUbkpMNEBq2ZWSCHcl2DZxL1ZFGdUK2NQ==&uniplatform=NZKPT"); +// System.out.println("html=" + html); + String p = "".substring("".indexOf(":") + 1); + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/GetDriver.java 
b/src/main/java/com/bfd/cnki/crawl/util/GetDriver.java new file mode 100644 index 0000000..b71f71e --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/GetDriver.java @@ -0,0 +1,23 @@ +package com.bfd.cnki.crawl.util; + +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +/** + * @author:zhaoying + * @className:GetDriver + * @version:1.0 + * @description: + * @Date:2022-11-16 16:36:07 + */ +public class GetDriver{ + public WebDriver getWebDriverDriver() { + WebDriver driver; + ChromeOptions chromeOptions = new ChromeOptions(); + //无界面参数 + chromeOptions.addArguments("headless"); + // 禁用沙盒 就是被这个参数搞了一天 + chromeOptions.addArguments("no-sandbox"); + return driver = new ChromeDriver(chromeOptions); + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/Kafkautils.java b/src/main/java/com/bfd/cnki/crawl/util/Kafkautils.java new file mode 100644 index 0000000..3e566bc --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/Kafkautils.java @@ -0,0 +1,45 @@ +package com.bfd.cnki.crawl.util; + +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.StringSerializer; + +import java.util.Properties; + +/** + * @author:zhaoying + * @className:Kafkautils + * @version:1.0 + * @description: + * @Date:2022-11-16 18:22:33 + */ +public class Kafkautils { + public static KafkaProducer getKafkaProdect(String booystrapServers) { + Properties props = new Properties(); + //xxx服务器ip + props.put("bootstrap.servers", booystrapServers); + //所有follower都响应了才认为消息提交成功,即"committed" + props.put("acks", "all"); + //retries = MAX 无限重试,直到你意识到出现了问题:) + props.put("retries", 3); + //producer将试图批处理消息记录,以减少请求次数.默认的批量处理消息字节数 + //batch.size当批量的数据大小达到设定值后,就会立即发送,不顾下面的linger.ms + props.put("batch.size", 16384); + //延迟1ms发送,这项设置将通过增加小的延迟来完成--即,不是立即发送一条记录,producer将会等待给定的延迟时间以允许其他消息记录发送,这些消息记录可以批量处理 + 
props.put("linger.ms", 1); + //producer可以用来缓存数据的内存大小。 + props.put("buffer.memory", 33554432); + props.put("key.serializer", + StringSerializer.class.getName()); + props.put("value.serializer", + StringSerializer.class.getName()); + KafkaProducer producer = new KafkaProducer(props); + return producer; + } + + public static void main(String[] args) { + KafkaProducer producer = getKafkaProdect("172.18.1.114:9092"); + producer.send(new ProducerRecord("lcrtest", "123321")); + producer.close(); + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/ParametricAssembly.java b/src/main/java/com/bfd/cnki/crawl/util/ParametricAssembly.java new file mode 100644 index 0000000..fdccf99 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/ParametricAssembly.java @@ -0,0 +1,145 @@ +package com.bfd.cnki.crawl.util; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.entity.Constants; +import lombok.extern.slf4j.Slf4j; +import org.springframework.stereotype.Service; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * @PROJECT_NAME: cnki_crawl + * @DESCRIPTION: 参数组装 + * @AUTHOR: ying.zhao + * @DATE: 2024/5/30 11:16 + */ +@Slf4j +@Service +public class ParametricAssembly { + public static String getParams(String keyword, String pageNum,String documentType) { + String [] documentTypes = documentType.split(","); + String resource = ""; + String classid = ""; + String searchFrom = ""; + String kuaKuCode = ""; + String productStr = ""; + if(documentTypes[0].equals(Constants.DOCUMENTTYPE_ONE)){ + resource = "JOURNAL"; + classid = "YSTT4HG0"; + searchFrom = "资源范围:学术期刊; 仅看有全文,中英文扩展; 时间范围:更新时间:不限; 来源类别:全部期刊;"; + }else if(documentTypes[0].equals(Constants.DOCUMENTTYPE_TWO)){ + resource = "DISSERTATION"; + classid = "LSTPFY1C"; + searchFrom = "资源范围:学位论文; 中英文扩展; 时间范围:更新时间:不限; 优秀论文级别:不限;"; + }else if(documentTypes[0].equals(Constants.DOCUMENTTYPE_THREE)){ + resource = "CONFERENCE"; + classid = "JUP3MUPD"; + 
searchFrom = "资源范围:会议; 中英文扩展; 时间范围:更新时间:不限; 会议级别:全部;报告级别:全部;论文集类型:不限;语种:不限;"; + }else{ + resource = "CROSSDB"; + classid = "WD0FTY92"; + searchFrom = "资源范围:总库; 中英文扩展; 时间范围:更新时间:不限;"; + kuaKuCode = "YSTT4HG0,LSTPFY1C,JUP3MUPD,MPMFIG1A,EMRPGLPA,WQ0UVIAA,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R"; + productStr = "YSTT4HG0,LSTPFY1C,RMJLXHZ3,JQIRZIYA,JUP3MUPD,1UR4K4HZ,BPBAFJ5S,R79MZMCB,MPMFIG1A,EMRPGLPA,J708GVCE,ML4DRIDX,WQ0UVIAA,NB3BWEHK,XVLO76FD,HR1YT1Z9,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R"; + + } + String aside = keyword; + String params = ""; + String [] keywords= keyword.split(" "); + Map paramsMap = new HashMap(16); + paramsMap.put("Platform", ""); + paramsMap.put("Resource", resource); + paramsMap.put("Classid", classid); + paramsMap.put("Products", ""); + paramsMap.put("ExScope", "1"); + paramsMap.put("SearchType", 1); + paramsMap.put("Rlang", "CHINESE"); + paramsMap.put("KuaKuCode", kuaKuCode); + Map qNode = new HashMap(16); + List> qGroup = new ArrayList>(); + int keySize = keywords.length; + if (keySize == 1) { + log.info("只有一个关键词"); + params = "boolSearch=false&QueryJson={\"Platform\":\"\",\"Resource\":\""+resource+"\",\"Classid\":\""+classid+"\",\"Products\":\"\",\"QNode\":{\"QGroup\":[{\"Key\":\"Subject\",\"Title\":\"\",\"Logic\":0,\"Items\":[],\"ChildItems\":[{\"Key\":\"input[data-tipid=gradetxt-1]\",\"Title\":\"主题\",\"Logic\":0,\"Items\":[{\"Key\":\"input[data-tipid=gradetxt-1]\",\"Title\":\"主题\",\"Logic\":0,\"Field\":\"SU\",\"Operator\":\"TOPRANK\",\"Value\":\"".concat(keyword).concat("\",\"Value2\":\"\"}],\"ChildItems\":[]}]},{\"Key\":\"ControlGroup\",\"Title\":\"\",\"Logic\":0,\"Items\":[],\"ChildItems\":[]}]},\"ExScope\":\"1\",\"SearchType\":7,\"Rlang\":\"CHINESE\",\"KuaKuCode\":\""+kuaKuCode+"\"}&pageNum=").concat(pageNum).concat("&pageSize=50&sortField=PT&sortType=desc&dstyle=listmode&boolSortSearch=false&sentenceSearch=false&productStr="+productStr+"&aside=&searchFrom="+searchFrom); + return params; + } else { + log.info("有多个关键词"); + int logic = 0; + 
String or = "OR"; + String and = "AND"; + String not = "NOT"; + Map map = new HashMap(16); + map.put("Key", "Subject"); + map.put("Title", ""); + map.put("Logic", logic); + map.put("Items", new ArrayList<>()); + List> childItems = new ArrayList>(); + int index = 0; + int two =2; + for (int i = 0; i < keywords.length; i = i + two) { + index++; + log.info("第 " + i + " 个关键词是:" + keywords[i]); + keyword = keywords[i]; + //条件 + String condition = ""; + try { + condition = keywords[i - 1]; + } catch (Exception e) { + condition = "AND"; + } + log.info("condition :" + condition); + if (condition.contains(or)) { + logic = 1; + } else if(condition.contains(and)) { + logic = 0; + }else if(condition.contains(not)) { + logic = 2; + } + Map child = new HashMap(16); + String key = "input[data-tipid=gradetxt-" + index + "]"; + child.put("Key", key); + child.put("Title", "主题"); + child.put("Logic", logic); + List> items = new ArrayList>(); + Map itemsMap = new HashMap(16); + itemsMap.put("Key", key); + itemsMap.put("Title", "主题"); + itemsMap.put("Logic", logic); + itemsMap.put("Field", "SU"); + itemsMap.put("Operator", "TOPRANK"); + itemsMap.put("Value", keyword); + itemsMap.put("Value2", ""); + items.add(itemsMap); + child.put("Items", items); + child.put("ChildItems", new ArrayList<>()); + childItems.add(child); + } + map.put("ChildItems", childItems); + qGroup.add(map); + } + qNode.put("QGroup", qGroup); + paramsMap.put("QNode", qNode); +// log.info("paramsMap="+ JSONObject.toJSONString(paramsMap)); +// (主题:金融科技)OR(主题:金融创新)OR(主题:数字货币) + params = "boolSearch=false&QueryJson=".concat(JSONObject.toJSONString(paramsMap)).concat("&pageNum=").concat(pageNum).concat("&pageSize=50&dstyle=listmode&boolSortSearch=false&sentenceSearch=false&productStr="+productStr+"&aside==&searchFrom="+searchFrom); + return params; + } + + public static void main(String[] args) { +// String keyword = "芯片"; +// String keywords[] = keyword.split(" "); +// System.out.println(keywords.length); +// String 
keyword = "金融科技 OR 金融创新 AND 数字"; +// getParams(keyword, "1"); +// String keywords[] = keyword.split(" "); +// System.out.println(keywords.length); +// int a = -1; +// System.out.println(Integer.valueOf(a)); + String params = getParams("雷达 NOT 航母","1",""); + System.out.println(params); + } + +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/PicCheckUtil.java b/src/main/java/com/bfd/cnki/crawl/util/PicCheckUtil.java new file mode 100644 index 0000000..8eec51c --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/PicCheckUtil.java @@ -0,0 +1,75 @@ +package com.bfd.cnki.crawl.util; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; + +/** + * @author:zhaoying + * @className:PicCheckUtil + * @version:1.0 + * @description: + * @Date:2022-11-16 16:36:04 + */ +public class PicCheckUtil { + /** + * 这里指的是python环境,不需要改动 + */ + public static String ppath = "/opt/env/py36/bin/python3"; + /** + * py脚本路径 + */ + public static String py_dddd_ocr = "./dddd_ocr.py"; + public static String getCodeByPhoto(String... 
params) throws IOException { + String line = null; + try { + String[] args = new String[2 + params.length]; + args[0] = ppath; + args[1] = py_dddd_ocr; + for (int x = 0; x < args.length; x++) { + if (x > 1) { + args[x] = params[x - 2]; + } + } + // 执行py文件 + Process proc = Runtime.getRuntime().exec(args); + Runnable errorRunable = new Runnable() { + @Override + public void run() { + String errorline = null; + BufferedReader errorbufferedReader = null; + try { + errorbufferedReader = new BufferedReader(new InputStreamReader(proc.getErrorStream(), "GBK")); + while ((errorline = errorbufferedReader.readLine()) != null) { + System.out.println("命令执行错误日志信息:" + errorline); + } + } catch (Exception e) { + e.printStackTrace(); + } finally { + if (errorbufferedReader != null) { + try { + errorbufferedReader.close(); + System.out.println("关闭错误流对象!"); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + }; + new Thread(errorRunable, "error-execlog").start(); + BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream())); + String tempLine = null; + while ((tempLine = in.readLine()) != null) { + System.out.println("执行命令----" + tempLine); + line = tempLine; + // break; + } + in.close(); + proc.waitFor(); + } catch (Exception e) { + e.printStackTrace(); + } + return line; + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/QueueUtils.java b/src/main/java/com/bfd/cnki/crawl/util/QueueUtils.java new file mode 100644 index 0000000..491ba2d --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/QueueUtils.java @@ -0,0 +1,19 @@ +package com.bfd.cnki.crawl.util; + +import java.util.Map; +import java.util.concurrent.LinkedBlockingQueue; +/** + * @author:zhaoying + * @className:QueueUtils + * @version:1.0 + * @description: + * @Date:2022-11-21 17:53:52 + */ +public class QueueUtils { + public static LinkedBlockingQueue> kyyzTaskQueue = new LinkedBlockingQueue>(); + public static LinkedBlockingQueue> sfgzTaskQueue = new 
LinkedBlockingQueue>(); + public static LinkedBlockingQueue> oilTaskQueue = new LinkedBlockingQueue>(); + public static LinkedBlockingQueue> errorOilTaskQueue = new LinkedBlockingQueue>(); + public static LinkedBlockingQueue> errorKyyzTaskQueue = new LinkedBlockingQueue>(); + public static LinkedBlockingQueue> errorSfgzTaskQueue = new LinkedBlockingQueue>(); +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/SeleniumTest.java b/src/main/java/com/bfd/cnki/crawl/util/SeleniumTest.java new file mode 100644 index 0000000..95a6302 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/SeleniumTest.java @@ -0,0 +1,64 @@ +package com.bfd.cnki.crawl.util; + +import org.openqa.selenium.Cookie; +import org.openqa.selenium.JavascriptExecutor; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; + +import java.io.IOException; +import java.util.Set; +/** + * @author:zhaoying + * @className:seleniumTest + * @version:1.0 + * @description: + * @Date:2024-06-20 14:52:34 + */ +public class SeleniumTest { + + public static void verticaltoMiddle(WebDriver driver) { + JavascriptExecutor js = (JavascriptExecutor) driver; + // 上下拉到中间 + js.executeScript("window.scrollBy(0, 0-document.body.scrollHeight *1/2)"); + } + + static String savePath = "C:\\Users\\BFD-LT-0\\Desktop\\checkPic"; + /***这里指的是python环境,不需要改动***/ + public static String ppath = "python"; + public static String py_dddd_ocr = "C:\\Users\\BFD-LT-0\\Desktop\\dddd_ocr.py"; + public static void main(String[] args) throws InterruptedException, IOException { + System.setProperty("webdriver.chrome.driver", + "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe"); +// // 1.创建webdriver驱动 +// WebDriver driver = new ChromeDriver(); +// // 2.打开百度首页 +// driver.get("https://www.baidu.com/"); +// //3.找打搜索框,输入内容selenium +// driver.findElement(By.id("kw")).sendKeys("selenium"); +// //4.获取“百度一下”按钮,进行搜索 +// 
driver.findElement(By.id("su")).click(); +// 5.退出浏览器 +// System.out.println(driver.getPageSource()); + +// FileWriter fWriter = new FileWriter("C:\\Users\\BFD-LT-0\\Desktop\\cookie.txt"); +// FileWriter fWriter2 = new FileWriter("C:\\Users\\BFD-LT-0\\Desktop\\JDerrorUrl0417.txt"); + String nextUrl = "https://www.baidu.com"; + int i = 0; + int j = 0; + String line = ""; + ChromeOptions options = new ChromeOptions(); +// options.addArguments("–incognito"); + options.addArguments("no-sandbox"); + WebDriver driver = new ChromeDriver(options); + driver.get(nextUrl); + String headerCookie = ""; + Set cookies=driver.manage().getCookies(); + for (Cookie cookie : cookies) { + System.out.println(cookie.getName()+":"+cookie.getValue()); + headerCookie = headerCookie.concat(cookie.getName()).concat("=").concat(cookie.getValue()).concat("; "); + } + headerCookie = headerCookie.substring(0,headerCookie.length()-2); + System.out.println("headerCookie="+headerCookie); + } +} \ No newline at end of file diff --git a/src/main/java/com/bfd/cnki/crawl/util/Test.java b/src/main/java/com/bfd/cnki/crawl/util/Test.java new file mode 100644 index 0000000..48fdb08 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/Test.java @@ -0,0 +1,27 @@ +package com.bfd.cnki.crawl.util; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.redisson.Redisson; +import org.redisson.api.RBloomFilter; +import org.redisson.api.RedissonClient; +import org.redisson.config.Config; + +import java.util.Timer; +import java.util.TimerTask; + +/** + * @author:zhaoying + * @className:test + * @version:1.0 + * @description: + * @Date:2022-11-16 11:30:57 + */ +public class Test { + public static void main(String[] args) { + String html=" 呼探1井φ139.7 mm尾管精细动态控压固井技术 - 期刊

期刊

节点文献

(排版定稿)网络首发时间:2022-11-11 09:52:35

呼探1井φ139.7 mm尾管精细动态控压固井技术

【作者】 王敬朋张伟吴继伟魏瑞华马锦明杨虎

【Author】 WANG Jingpeng;ZHANG Wei;WU Jiwei;WEI Ruihua;MA Jinming;YANG Hu;

【机构】 “油气藏地质及开发工程”国家重点实验室(西南石油大学)中国石油新疆油田分公司中石化石油工程技术研究院有限公司中国石油大学(北京)克拉玛依校区

【摘要】 呼探1井φ139.7 mm尾管固井由于封固段长、井底温度高,造成存在漏失与溢流风险大、对水泥浆性能要求高及水泥浆稠化时间不易控制等技术难点。为解决上述技术难点,该井φ139.7 mm尾管固井进行了精细动态控压固井技术试验。该井通过优化水泥浆配方、精细设计浆柱和优化设计套管扶正器安放位置,制定确保井筒动态压力介于地层孔隙压力与漏失压力之间等技术措施,利用精细控压钻井装备,实现了控压下尾管、注水泥和水泥浆候凝,最终实现全过程精细动态控压固井,该井φ139.7 mm尾管固井质量合格。呼探1井φ139.7 mm尾管精细控压固井的成功表明,精细控压固井能够有效提高超深井长封固段窄安全压力窗口地层的固井质量,为准噶尔盆地南缘深层油气勘探提供技术保障。更多还原

    查询失败,请刷新
    跳转检索
    数据加载中,请稍候!
      查询失败,请刷新
      跳转检索
      数据加载中,请稍候!
        查询失败,请刷新
        跳转检索
        数据加载中,请稍候!

        【Abstract】 There are some cementing problems in the deep layer of the southern margin of Junggar basin, such as active oil, gas and water, deep burial, high temperature and high pressure, narrow safety pressure window and so on. Using conventional cementing technology is easy to lose circulation, resulting in cementing failure or unqualified quality. In Hutan-1 well with multiple reservoirs, coexistence of high and low pressure, long sealing and solid section (4069 m) and long open hole section (1907 m), the fine dynamic pressure control cementing technology is applied to successfully solve the problem of multi pressure system cementing in the oil and gas well section of the same open hole section. The fine pressure control cementing technology test includes: designing the construction scheme and parameters from the aspects of wellbore preparation, casing running technical measures, annulus slurry column structure design, injection displacement parameters, loss of circulation well control risk prediction, solid completion annulus plus back pressure, etc., and implementing the wellhead fine pressure control by reducing the hydrostatic column pressure and using the fine pressure control drilling equipment to ensure that the wellbore dynamic pressure is between the formation pore pressure and leakage pressure, Finally, the whole process of fine dynamic pressure control cementing is realized. Hutan 1 well Φ The successful cementing test of 139.7mm liner shows that fine pressure control cementing can effectively improve the cementing quality of ultra deep wells with long sealing section and narrow safety pressure window, and provide technical support for deep oil and gas exploration in the southern margin of Junggar Basin.更多还原

          查询失败,请刷新
          跳转检索
          数据加载中,请稍候!
            查询失败,请刷新
            跳转检索
            数据加载中,请稍候!
              查询失败,请刷新
              跳转检索
              数据加载中,请稍候!

              【基金】 中国石油重大专项“准噶尔南缘和玛湖等重点地区优快钻完井技术集成与试验”(编号:202019F-33)联合资助
              【文内图片】
              • 【分类号】TE256
              • 【下载频次】13

              中国知网独家网络首发,未经许可,禁止转载、摘编。

              节点文献中: 

              本文链接的文献网络图示:

              "; + Document doc = Jsoup.parse(html); + System.out.println(doc.select("#main").text()); + } + +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/ThrowMessageUtil.java b/src/main/java/com/bfd/cnki/crawl/util/ThrowMessageUtil.java new file mode 100644 index 0000000..14ee9a3 --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/ThrowMessageUtil.java @@ -0,0 +1,18 @@ +package com.bfd.cnki.crawl.util; + +import java.io.PrintWriter; +import java.io.StringWriter; +/** + * @author:zhaoying + * @className:ThrowMessageUtil + * @version:1.0 + * @description: + * @Date:2022-11-18 11:41:55 + */ +public class ThrowMessageUtil { + public static String getErrmessage(Throwable t){ + StringWriter stringWriter=new StringWriter(); + t.printStackTrace(new PrintWriter(stringWriter,true)); + return stringWriter.getBuffer().toString(); + } +} diff --git a/src/main/java/com/bfd/cnki/crawl/util/UseDb.java b/src/main/java/com/bfd/cnki/crawl/util/UseDb.java new file mode 100644 index 0000000..dd2b37d --- /dev/null +++ b/src/main/java/com/bfd/cnki/crawl/util/UseDb.java @@ -0,0 +1,194 @@ +package com.bfd.cnki.crawl.util; + +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.mchange.v2.c3p0.ComboPooledDataSource; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.sql.*; +import java.util.*; +/** + * @author:zhaoying + * @className:UseDB + * @version:1.0 + * @description: + * @Date:2022-11-16 11:34:38 + */ +public class UseDb { + private static ComboPooledDataSource ds; + public static ComboPooledDataSource init(){ + try{ + ds = new ComboPooledDataSource(); + ds.setDriverClass(ConfigCache.mainConfig.get(Constants.JDBC_DRIVER).toString()); + ds.setJdbcUrl(ConfigCache.mainConfig.get(Constants.JDBC_URL).toString()); + ds.setUser(ConfigCache.mainConfig.get(Constants.JDBC_USER).toString()); + ds.setPassword(ConfigCache.mainConfig.get(Constants.JDBC_PASSWORD).toString()); + 
ds.setMaxIdleTime(1800); + ds.setAcquireIncrement(3); + ds.setIdleConnectionTestPeriod(60); + ds.setAcquireRetryAttempts(30); + ds.setBreakAfterAcquireFailure(true); + ds.setTestConnectionOnCheckout(false); + ds.setInitialPoolSize(5); + ds.setMinPoolSize(5); + ds.setMaxPoolSize(10); + }catch (Exception e) { + e.printStackTrace(); + } + return ds; + } + public static Connection getConnection(){ + Connection conn = null; + try { + conn = ds.getConnection(); + } catch (Exception e) { + e.printStackTrace(); + } + return conn; + + } + public static void writeMethod(String json, String fileName) { + try { + FileWriter writer = new FileWriter(fileName, true); + writer.write(json + "\n"); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + public static void delFile(String fileName){ + try{ + File file = new File(fileName); + if(file.delete()){ + System.out.println(fileName + " 文件已被删除!"); + }else{ + System.out.println(fileName + "文件删除失败!"); + } + }catch(Exception e){ + e.printStackTrace(); + } + } + public static boolean writeDb(Connection conn,Map task){ + boolean isauto = false; + Statement stat = null; + try { + stat = conn.createStatement(); + + } catch (SQLException e1) { + e1.printStackTrace(); + } + try { + String sql = (String) task.get("insert"); + try { + stat.execute(sql); + System.out.println("插入成功"); + isauto = true; + } catch (Exception e) { + e.printStackTrace(); + System.out.println("sql----------"+sql); +// e.printStackTrace(); + sql = (String) task.get("update"); + stat.execute(sql); + System.out.println("更新成功"); + } + } catch (SQLException e) { + e.printStackTrace(); + }finally{ + try { + stat.close(); + } catch (SQLException e) { + e.printStackTrace(); + } + } + return isauto; + } + + public static List> queryBySql(Map mysqlTask,List params) { + List> results = new ArrayList<>(); + Connection conn = getConnection(); + Statement stat = null; + try { + stat = conn.createStatement(); + ResultSet rs = stat.executeQuery((String) 
mysqlTask.get("select")); + while (rs.next()) { + Map map = new HashMap<>(16); + for (String key : params) { + map.put(key, rs.getString(key)); + } + results.add(map); + } + } catch (SQLException e) { + e.printStackTrace(); + } finally { + try { + if(stat != null){ + stat.close(); + } + if(conn != null){ + conn.close(); + } + } catch (Exception e2) { + e2.printStackTrace(); + } + } + return results; + } + + public static boolean writeUpdate(Map mysqlTask) { + Connection conn = getConnection(); + Statement stat = null; + boolean isauto = false; + String sql = ""; + try { + stat = conn.createStatement(); + sql = (String) mysqlTask.get("update"); + stat.execute(sql); + isauto = true; + } catch (SQLException e) { + // TODO Auto-generated catch block + System.out.println("更新失败" + sql); + sql = (String) mysqlTask.get("update"); + System.out.println("修改状态=========" + sql); + try { + stat.execute(sql); + } catch (SQLException e1) { + // TODO Auto-generated catch block + System.out.println("状态更新失败!!!!!!!!!!!"); + e1.printStackTrace(); + } + e.printStackTrace(); + } finally { + try { + if(stat != null){ + stat.close(); + } + if(conn != null){ + conn.close(); + } + } catch (SQLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + return isauto; + } + public static final String URL = "jdbc:mysql://172.18.1.180:3306/bfd_crawl_list"; + public static final String USER = "crawl"; + public static final String PASSWORD = "crawl"; + + public static void main(String[] args) throws Exception { + //1.加载驱动程序 + Class.forName("com.mysql.jdbc.Driver"); + //2. 
获得数据库连接 + Connection conn = DriverManager.getConnection(URL, USER, PASSWORD); + //3.操作数据库,实现增删改查 + Statement stmt = conn.createStatement(); + ResultSet rs = stmt.executeQuery("SELECT * FROM newslist_000 WHERE cid = 'Nzhiwang' AND `status` = 1"); + //如果有数据,rs.next()返回true + while(rs.next()){ + System.out.println(rs.getString("url")); + } + } +} diff --git a/src/main/java/com/bfd/cnki/main/Application.java b/src/main/java/com/bfd/cnki/main/Application.java new file mode 100644 index 0000000..4843e50 --- /dev/null +++ b/src/main/java/com/bfd/cnki/main/Application.java @@ -0,0 +1,169 @@ +package com.bfd.cnki.main; + +import com.alibaba.fastjson.JSONObject; +import com.bfd.cnki.crawl.cache.ConfigCache; +import com.bfd.cnki.crawl.entity.Constants; +import com.bfd.cnki.crawl.process.CnkiCrawlServer; +import com.bfd.cnki.crawl.process.KyyzContentParse; +import com.bfd.cnki.crawl.util.EjdDownloadHtml; +import com.bfd.cnki.crawl.util.QueueUtils; +import com.bfd.cnki.crawl.util.UseDb; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FileUtils; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.builder.SpringApplicationBuilder; +import org.springframework.boot.web.support.SpringBootServletInitializer; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.concurrent.LinkedBlockingQueue; +/** + * @author:zhaoying + * @className:Application + * @version:1.0 + * @description: + * @Date:2022-11-16 10:59:36 + */ +@Slf4j +@SpringBootApplication +public class Application extends SpringBootServletInitializer { + public static void main(String[] args) { + SpringApplication.run(Application.class, args); + log.info("Application------>Main program start..."); + //加载相关配置信息 + loadProperties(); + //初始化数据库连接池 + log.info("UseDB--------->db init"); + UseDb.init(); + String 
threadNum = ConfigCache.mainConfig.get(Constants.THREADNUM).toString(); + log.info("KyyzContentParseThread线程数为:{}", threadNum); + for (int i = 0; i < Integer.valueOf(threadNum); i++) { + KyyzContentParse kyyzContentParse = new KyyzContentParse(); + Thread kyyzContentParseThread = new Thread(kyyzContentParse); + kyyzContentParseThread.setName("kyyzContentParseThread" + i); + kyyzContentParseThread.start(); + log.info("KyyzContentParseThread线程-{}-启动", i); + } + //文件任务加载 + readTask("kyyzTaskQueue.txt", QueueUtils.kyyzTaskQueue); + //初始化driver +// GetDriver getDriver = new GetDriver(); +// ConfigCache.driver = getDriver.getWebDriverDriver(); + //结束前处理 + waitDown(); + //执行入口 + CnkiCrawlServer.exec(); +// EjdDownloadHtml.exec(); + + } + public static void waitDown(){ + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { + //停止详情任务线程 + ConfigCache.isExec = false; +// ConfigCache.driver.quit(); + log.info("the load task thread is stop"); + try { + Thread.sleep(1000*10); + } catch (InterruptedException e) { + e.printStackTrace(); + } + //写文件 + writeTsskToFile(); + log.info("tasks save to disk"); + System.out.println("Shutdown hook ran!"); + log.info("Service exit safely"); + } + }); +// while (true) { +// try { +// Thread.sleep(1000); +// } catch (InterruptedException e) { +// e.printStackTrace(); +// } +// } + } + private static void loadProperties(){ + try { + Properties properties = new Properties(); + FileInputStream inputStream = new FileInputStream("./config.properties"); + properties.load(new InputStreamReader(inputStream, StandardCharsets.UTF_8)); + //加载到静态变量区 + ConfigCache.mainConfig.put(Constants.JDBC_DRIVER, properties.getProperty(Constants.JDBC_DRIVER)); + ConfigCache.mainConfig.put(Constants.JDBC_URL, properties.getProperty(Constants.JDBC_URL)); + ConfigCache.mainConfig.put(Constants.JDBC_USER, properties.getProperty(Constants.JDBC_USER)); + ConfigCache.mainConfig.put(Constants.JDBC_PASSWORD, 
properties.getProperty(Constants.JDBC_PASSWORD)); + ConfigCache.mainConfig.put(Constants.BROKERS, properties.getProperty(Constants.BROKERS)); + ConfigCache.mainConfig.put(Constants.BROKERSTWO, properties.getProperty(Constants.BROKERSTWO)); + ConfigCache.mainConfig.put(Constants.TOPIC, properties.getProperty(Constants.TOPIC)); + ConfigCache.mainConfig.put(Constants.TOPICTWO, properties.getProperty(Constants.TOPICTWO)); + ConfigCache.mainConfig.put(Constants.ERRORTOPIC, properties.getProperty(Constants.ERRORTOPIC)); + ConfigCache.mainConfig.put(Constants.ORDERBY, properties.getProperty(Constants.ORDERBY)); + ConfigCache.mainConfig.put(Constants.THREADNUM, properties.getProperty(Constants.THREADNUM)); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * 任务持久化到硬盘 + */ + public static void writeTsskToFile(){ + while(true) { + if (QueueUtils.kyyzTaskQueue.size() > 0) { + try { + Map task = QueueUtils.kyyzTaskQueue.take(); + writeMethod(JSONObject.toJSONString(task), "./data/kyyzTaskQueue.txt"); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } else { + log.info("tasks write is file end"); + break; + } + } + } + + public static void readTask(String fileName,LinkedBlockingQueue> oilTaskQueue){ + File file = new File("./data/"+fileName); + if(file.exists()){ + List tasks = null; + try { + tasks = FileUtils.readLines(file,"UTF-8"); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + for (String taskStr : tasks) { + Map task = JSONObject.parseObject(taskStr); + try { + oilTaskQueue.put(task); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + file.delete(); + } + } + + public static void writeMethod(String json, String fileName) { + try { + FileWriter writer = new FileWriter(fileName, true); + writer.write(json + "\n"); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + @Override + protected SpringApplicationBuilder configure( + SpringApplicationBuilder 
builder) { + return builder.sources(Application.class); + } + +} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties new file mode 100644 index 0000000..0970114 --- /dev/null +++ b/src/main/resources/application.properties @@ -0,0 +1,6 @@ +#设置应用的日志级别 +logging.level.com.bfd=INFO +#路径 +logging.path=./logs + +spring.main.web-environment=false diff --git a/src/main/resources/banner.txt b/src/main/resources/banner.txt new file mode 100644 index 0000000..9251cba --- /dev/null +++ b/src/main/resources/banner.txt @@ -0,0 +1,20 @@ +${AnsiColor.RED} + .::::. + .::::::::. + ::::::::::: + ..:::::::::::' + '::::::::::::' + .:::::::::: + '::::::::::::::.. + ..::::::::::::. + ``:::::::::::::::: + ::::``:::::::::' .:::. + ::::' ':::::' .::::::::. + .::::' :::: .:::::::'::::. + .:::' ::::: .:::::::::' ':::::. + .::' :::::.:::::::::' ':::::. + .::' ::::::::::::::' ``::::. + ...::: ::::::::::::' ``::. + ```` ':. ':::::::::' ::::.. + '.:::::' ':'````.. +:: Spring Boot :: (v2.0.0.RELEASE) \ No newline at end of file diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml new file mode 100644 index 0000000..b5f3414 --- /dev/null +++ b/src/main/resources/logback-spring.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + true + + ${logging.level} + + + ${logging.path}/crawlSchedule.log + + + + ${logging.path}/crawlSchedule.log.%d{yyyy-MM-dd} + + 7 + + + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %line %-5level %logger{50} - %msg%n + UTF-8 + + + + + + + + diff --git a/src/test/java/com/bfd/cnki/AppTest.java b/src/test/java/com/bfd/cnki/AppTest.java new file mode 100644 index 0000000..887150e --- /dev/null +++ b/src/test/java/com/bfd/cnki/AppTest.java @@ -0,0 +1,20 @@ +package com.bfd.cnki; + +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +/** + * Unit test for simple App. 
+ */ +public class AppTest +{ + /** + * Rigorous Test :-) + */ + @Test + public void shouldAnswerWithTrue() + { + assertTrue( true ); + } +}