From a7e55a9060bf3e83d82ec0f110adaa0696237ea1 Mon Sep 17 00:00:00 2001 From: guanjz <1826473923@qq.com> Date: Mon, 19 May 2025 10:19:56 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .classpath | 28 + .gitignore | 3 + .idea/ParsePlugin2.4.iml | 571 +++++++ .idea/compiler.xml | 16 + .idea/encodings.xml | 6 + .idea/jarRepositories.xml | 25 + .idea/libraries/Maven__ant_ant_1_6_5.xml | 13 + .idea/libraries/Maven__antlr_antlr_2_7_6.xml | 13 + .idea/libraries/Maven__asm_asm_1_5_3.xml | 13 + .idea/libraries/Maven__asm_asm_attrs_1_5_3.xml | 13 + .idea/libraries/Maven__c3p0_c3p0_0_9_1_2.xml | 13 + .idea/libraries/Maven__cglib_cglib_2_1_3.xml | 13 + ...aven__ch_qos_logback_logback_classic_1_1_11.xml | 13 + .../Maven__ch_qos_logback_logback_core_1_1_11.xml | 13 + .idea/libraries/Maven__com_101tec_zkclient_0_3.xml | 13 + .../Maven__com_alibaba_fastjson_1_1_22.xml | 13 + .../Maven__com_fasterxml_classmate_1_3_3.xml | 13 + ...rxml_jackson_core_jackson_annotations_2_0_0.xml | 13 + ...m_fasterxml_jackson_core_jackson_core_2_0_0.xml | 13 + ...sterxml_jackson_core_jackson_databind_2_0_0.xml | 13 + ...tephenc_high_scale_lib_high_scale_lib_1_1_1.xml | 13 + .../Maven__com_google_guava_guava_14_0_1.xml | 13 + ...en__com_google_protobuf_protobuf_java_2_4_1.xml | 13 + .../Maven__com_sun_jersey_jersey_core_1_8.xml | 13 + .../Maven__com_sun_jersey_jersey_json_1_8.xml | 13 + .../Maven__com_sun_jersey_jersey_server_1_8.xml | 13 + .../Maven__com_sun_xml_bind_jaxb_impl_2_2_3_1.xml | 13 + .../Maven__com_wandoulabs_jodis_jodis_0_1_2.xml | 13 + ...aven__com_yammer_metrics_metrics_core_2_2_0.xml | 13 + ...__commons_beanutils_commons_beanutils_1_7_0.xml | 13 + ...mons_beanutils_commons_beanutils_core_1_8_0.xml | 13 + .../Maven__commons_cli_commons_cli_1_2.xml | 13 + .../Maven__commons_codec_commons_codec_1_4.xml | 13 + ...commons_collections_commons_collections_3_1.xml | 13 + 
...ons_configuration_commons_configuration_1_6.xml | 13 + ...aven__commons_digester_commons_digester_2_1.xml | 13 + .../libraries/Maven__commons_el_commons_el_1_0.xml | 13 + ...commons_httpclient_commons_httpclient_3_0_1.xml | 13 + .../libraries/Maven__commons_io_commons_io_1_4.xml | 13 + .../Maven__commons_lang_commons_lang_2_4.xml | 13 + ...aven__commons_logging_commons_logging_1_1_1.xml | 13 + ...__commons_logging_commons_logging_api_1_0_4.xml | 13 + .../Maven__commons_net_commons_net_1_4_1.xml | 13 + .idea/libraries/Maven__dom4j_dom4j_1_6_1.xml | 13 + .idea/libraries/Maven__hsqldb_hsqldb_1_8_0_10.xml | 13 + .../Maven__io_netty_netty_3_7_0_Final.xml | 13 + .../Maven__javassist_javassist_3_4_GA.xml | 13 + .../Maven__javax_activation_activation_1_1.xml | 13 + ...aven__javax_persistence_persistence_api_1_0.xml | 13 + .../libraries/Maven__javax_transaction_jta_1_1.xml | 13 + ...javax_validation_validation_api_1_1_0_Final.xml | 13 + .../Maven__javax_xml_bind_jaxb_api_2_1.xml | 13 + .idea/libraries/Maven__jline_jline_0_9_94.xml | 13 + .idea/libraries/Maven__junit_junit_4_11.xml | 13 + .idea/libraries/Maven__log4j_log4j_1_2_14.xml | 13 + .../Maven__mysql_mysql_connector_java_5_1_6.xml | 13 + .idea/libraries/Maven__nekohtml_nekohtml_0_9_5.xml | 13 + .../Maven__net_java_dev_jets3t_jets3t_0_7_1.xml | 13 + .../libraries/Maven__net_jpountz_lz4_lz4_1_2_0.xml | 13 + .../Maven__net_sf_ehcache_ehcache_1_2_3.xml | 13 + .../Maven__net_sf_ezmorph_ezmorph_1_0_6.xml | 13 + .../Maven__net_sf_jopt_simple_jopt_simple_3_2.xml | 13 + .idea/libraries/Maven__net_sf_kosmosfs_kfs_0_3.xml | 13 + ...net_sourceforge_htmlcleaner_htmlcleaner_2_4.xml | 13 + .../Maven__org_apache_avro_avro_1_5_3.xml | 13 + .../Maven__org_apache_avro_avro_ipc_1_5_3.xml | 13 + .../Maven__org_apache_commons_commons_math_2_1.xml | 13 + ...Maven__org_apache_commons_commons_pool2_2_3.xml | 13 + ...en__org_apache_curator_curator_client_2_7_0.xml | 13 + ..._org_apache_curator_curator_framework_2_7_0.xml | 13 + 
...n__org_apache_curator_curator_recipes_2_7_0.xml | 13 + .../Maven__org_apache_hadoop_hadoop_core_1_0_4.xml | 13 + .../Maven__org_apache_hbase_hbase_0_94_9.xml | 13 + ..._org_apache_httpcomponents_httpclient_4_3_5.xml | 13 + ...n__org_apache_httpcomponents_httpcore_4_3_2.xml | 13 + .../Maven__org_apache_kafka_kafka_2_10_0_8_2_1.xml | 13 + ...ven__org_apache_kafka_kafka_clients_0_8_2_1.xml | 13 + .../Maven__org_apache_thrift_libthrift_0_8_0.xml | 13 + .../Maven__org_apache_tika_tika_core_0_9.xml | 13 + ...pache_tomcat_embed_tomcat_embed_core_8_5_16.xml | 13 + ..._apache_tomcat_embed_tomcat_embed_el_8_5_16.xml | 13 + ..._tomcat_embed_tomcat_embed_websocket_8_5_16.xml | 13 + .../Maven__org_apache_velocity_velocity_1_7.xml | 13 + ...Maven__org_apache_zookeeper_zookeeper_3_4_6.xml | 13 + ...org_codehaus_jackson_jackson_core_asl_1_8_8.xml | 13 + ...n__org_codehaus_jackson_jackson_jaxrs_1_8_8.xml | 13 + ...g_codehaus_jackson_jackson_mapper_asl_1_0_1.xml | 13 + ...aven__org_codehaus_jackson_jackson_xc_1_8_8.xml | 13 + .../Maven__org_codehaus_jettison_jettison_1_1.xml | 13 + .../Maven__org_eclipse_jdt_core_3_1_1.xml | 13 + .../Maven__org_hamcrest_hamcrest_core_1_3.xml | 13 + ...en__org_hibernate_ejb3_persistence_1_0_2_GA.xml | 13 + .../Maven__org_hibernate_hibernate_3_2_1_ga.xml | 13 + ...rg_hibernate_hibernate_annotations_3_4_0_GA.xml | 13 + ...nate_hibernate_commons_annotations_3_3_0_ga.xml | 13 + ...aven__org_hibernate_hibernate_core_3_3_2_GA.xml | 13 + ..._hibernate_hibernate_entitymanager_3_4_0_GA.xml | 13 + ...g_hibernate_hibernate_validator_5_3_5_Final.xml | 13 + .../Maven__org_jamon_jamon_runtime_2_3_1.xml | 13 + ...org_jboss_logging_jboss_logging_3_3_1_Final.xml | 13 + .../Maven__org_jboss_netty_netty_3_2_4_Final.xml | 13 + .idea/libraries/Maven__org_jdom_jdom2_2_0_6.xml | 13 + .../Maven__org_jruby_jruby_complete_1_6_5.xml | 13 + .idea/libraries/Maven__org_jsoup_jsoup_1_7_3.xml | 13 + .../Maven__org_mortbay_jetty_jetty_6_1_26.xml | 13 + 
.../Maven__org_mortbay_jetty_jetty_util_6_1_26.xml | 13 + .../Maven__org_mortbay_jetty_jsp_2_1_6_1_14.xml | 13 + ...Maven__org_mortbay_jetty_jsp_api_2_1_6_1_14.xml | 13 + ..._org_mortbay_jetty_servlet_api_2_5_20081211.xml | 13 + ...n__org_mortbay_jetty_servlet_api_2_5_6_1_14.xml | 13 + .../libraries/Maven__org_mybatis_mybatis_3_1_1.xml | 13 + .../Maven__org_scala_lang_scala_library_2_10_4.xml | 13 + .../Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml | 13 + .../Maven__org_slf4j_jul_to_slf4j_1_7_25.xml | 13 + .../Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml | 13 + .../libraries/Maven__org_slf4j_slf4j_api_1_6_1.xml | 13 + .../Maven__org_slf4j_slf4j_log4j12_1_6_1.xml | 13 + ...ingframework_boot_spring_boot_1_5_6_RELEASE.xml | 13 + ...oot_spring_boot_autoconfigure_1_5_6_RELEASE.xml | 13 + ...work_boot_spring_boot_starter_1_5_6_RELEASE.xml | 13 + ...t_spring_boot_starter_logging_1_5_6_RELEASE.xml | 13 + ...ot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml | 13 + ..._boot_spring_boot_starter_web_1_5_6_RELEASE.xml | 13 + ...g_springframework_spring_aop_4_3_10_RELEASE.xml | 13 + ...springframework_spring_beans_4_3_10_RELEASE.xml | 13 + ...ringframework_spring_context_4_3_10_RELEASE.xml | 13 + ..._springframework_spring_core_4_3_10_RELEASE.xml | 13 + ...gframework_spring_expression_4_3_10_RELEASE.xml | 13 + ...g_springframework_spring_web_4_3_10_RELEASE.xml | 13 + ...pringframework_spring_webmvc_4_3_10_RELEASE.xml | 13 + ...aven__org_xerial_snappy_snappy_java_1_0_4_1.xml | 13 + .idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml | 13 + .idea/libraries/Maven__oro_oro_2_0_8.xml | 13 + .../libraries/Maven__redis_clients_jedis_2_6_0.xml | 13 + .idea/libraries/Maven__stax_stax_api_1_0_1.xml | 13 + .../Maven__tomcat_jasper_compiler_5_5_12.xml | 13 + .../Maven__tomcat_jasper_runtime_5_5_12.xml | 13 + .idea/libraries/Maven__xalan_serializer_2_7_1.xml | 13 + .idea/libraries/Maven__xalan_xalan_2_7_1.xml | 13 + .idea/libraries/Maven__xerces_xercesImpl_2_9_1.xml | 13 + 
.../Maven__xerces_xmlParserAPIs_2_6_2.xml | 13 + .../libraries/Maven__xml_apis_xml_apis_1_3_04.xml | 13 + .idea/libraries/Maven__xmlenc_xmlenc_0_52.xml | 13 + .idea/misc.xml | 11 + .idea/modules.xml | 8 + .idea/uiDesigner.xml | 124 ++ .idea/workspace.xml | 164 ++ .project | 23 + pom.xml | 793 ++++++++++ src/com/bfd/parse/AutoRuleParser.java | 271 ++++ src/com/bfd/parse/Constants.java | 154 ++ src/com/bfd/parse/Constants_TraceTask.java | 43 + src/com/bfd/parse/DataSaver.java | 126 ++ src/com/bfd/parse/DomParser.java | 677 ++++++++ src/com/bfd/parse/FileParser.java | 284 ++++ src/com/bfd/parse/ParseResult.java | 237 +++ src/com/bfd/parse/ParseStat.java | 184 +++ src/com/bfd/parse/ParseTestForPlugin.java | 176 +++ src/com/bfd/parse/ParserFace.java | 646 ++++++++ src/com/bfd/parse/client/AbstractClient.java | 172 +++ src/com/bfd/parse/client/ConfigClient.java | 94 ++ src/com/bfd/parse/client/DataOperatorClient.java | 90 ++ src/com/bfd/parse/client/DeduplicatorClient.java | 100 ++ src/com/bfd/parse/client/DownloadClient.java | 274 ++++ src/com/bfd/parse/client/DownloadClientTest.java | 40 + src/com/bfd/parse/client/IceClient.java | 19 + src/com/bfd/parse/client/LoginManagerClient.java | 47 + src/com/bfd/parse/client/TraceTaskClient.java | 87 ++ src/com/bfd/parse/client/URLNormalizerClient.java | 163 ++ src/com/bfd/parse/config/AConfig.java | 109 ++ src/com/bfd/parse/config/Config.java | 13 + src/com/bfd/parse/config/ConfigFactory.java | 37 + src/com/bfd/parse/config/ConfigLoader.java | 37 + .../config/FieldDefine/FieldDefineConfig.java | 40 + .../parse/config/PageDefine/PageDefineConfig.java | 93 ++ src/com/bfd/parse/config/dom/DomCFGBlock.java | 133 ++ src/com/bfd/parse/config/dom/DomCFGElement.java | 45 + src/com/bfd/parse/config/dom/DomCFGField.java | 220 +++ src/com/bfd/parse/config/dom/DomCFGTree.java | 245 +++ src/com/bfd/parse/config/dom/DomConfig.java | 321 ++++ src/com/bfd/parse/config/dom/DomSearch.java | 1630 ++++++++++++++++++++ 
src/com/bfd/parse/config/dom/DomTemplate.java | 220 +++ src/com/bfd/parse/config/dom/FieldRs.java | 62 + .../bfd/parse/config/fldmap/BfdItemFldMapRule.java | 171 ++ .../bfd/parse/config/fldmap/ItemFldFormator.java | 291 ++++ .../bfd/parse/config/fldmap/ItemFldMapConfig.java | 105 ++ .../bfd/parse/config/fldmap/ItemInfoParser.java | 435 ++++++ .../bfd/parse/config/fldmap/ItemInfoParser2.java | 425 +++++ src/com/bfd/parse/config/iid/ParseCfg.java | 94 ++ src/com/bfd/parse/config/iid/ParseConfigure.java | 105 ++ src/com/bfd/parse/config/iid/ParseReProcessor.java | 63 + .../config/parseplugin/PluginClassLoader.java | 10 + .../bfd/parse/config/parseplugin/PluginConfig.java | 163 ++ .../config/parseplugin/PluginRuntimeException.java | 14 + .../config/parsetemplate/ParseTemplateConfig.java | 24 + src/com/bfd/parse/config/shelf/EcConfigCache.java | 63 + src/com/bfd/parse/config/shelf/JudgeRule.java | 110 ++ .../bfd/parse/config/shelf/JudgeRuleConfig.java | 160 ++ src/com/bfd/parse/config/shelf/JudgeStatue.java | 110 ++ .../config/sitepageconfig/SitePageConfigCache.java | 65 + src/com/bfd/parse/config/website/AutoRuleWrap.java | 61 + src/com/bfd/parse/config/website/WebsiteCache.java | 107 ++ src/com/bfd/parse/entity/BaseEntity.java | 29 + src/com/bfd/parse/entity/CreateTaskEntity.java | 28 + src/com/bfd/parse/entity/ECConfigEntity.java | 61 + src/com/bfd/parse/entity/FielddefineEntity.java | 192 +++ src/com/bfd/parse/entity/FlowquotaEntity.java | 213 +++ src/com/bfd/parse/entity/PagedefineEntity.java | 120 ++ src/com/bfd/parse/entity/ParsePluginEntity.java | 294 ++++ src/com/bfd/parse/entity/ParsetemplateEntity.java | 186 +++ src/com/bfd/parse/entity/SitepageconfigEntity.java | 247 +++ src/com/bfd/parse/entity/WebsiteEntity.java | 232 +++ src/com/bfd/parse/etlchain/EtlChain.java | 8 + src/com/bfd/parse/facade/IParse.java | 15 + src/com/bfd/parse/facade/ParserFactory.java | 23 + src/com/bfd/parse/facade/autoparse/AutoParser.java | 212 +++ 
.../bfd/parse/facade/jspageparse/JsPageParser.java | 164 ++ src/com/bfd/parse/facade/parseunit/ParseUnit.java | 465 ++++++ src/com/bfd/parse/facade/tmplparse/TmplParser.java | 117 ++ src/com/bfd/parse/file/DataLoader.java | 211 +++ src/com/bfd/parse/file/FileParseWorker.java | 67 + src/com/bfd/parse/file/SourceDataLoader.java | 269 ++++ src/com/bfd/parse/json/AMJsonParser.java | 190 +++ src/com/bfd/parse/json/JsonData.java | 73 + src/com/bfd/parse/json/JsonParser.java | 26 + src/com/bfd/parse/json/JsonParserException.java | 22 + src/com/bfd/parse/json/JsonParserFactory.java | 86 ++ src/com/bfd/parse/json/JsonParserNotFound.java | 20 + src/com/bfd/parse/json/JsonParserResult.java | 44 + src/com/bfd/parse/preprocess/PreProcessor.java | 19 + .../bfd/parse/preprocess/PreProcessorFactory.java | 15 + .../bfd/parse/preprocess/PreProcessorNotFound.java | 20 + src/com/bfd/parse/preprocess/Test.java | 32 + src/com/bfd/parse/reprocess/AMReProcessor.java | 122 ++ src/com/bfd/parse/reprocess/ReProcessCode.java | 9 + src/com/bfd/parse/reprocess/ReProcessResult.java | 39 + src/com/bfd/parse/reprocess/ReProcessor.java | 20 + .../bfd/parse/reprocess/ReProcessorFactory.java | 71 + .../bfd/parse/reprocess/ReProcessorNotFound.java | 20 + src/com/bfd/parse/reprocess/RegexRule.java | 9 + src/com/bfd/parse/reprocess/ReplaceRule.java | 20 + src/com/bfd/parse/reprocess/ReprocessHelper.java | 42 + src/com/bfd/parse/service/DataOperatorService.java | 5 + .../bfd/parse/service/proxy/AbstractIceProxy.java | 127 ++ .../bfd/parse/service/proxy/AlarmServiceProxy.java | 23 + .../bfd/parse/service/proxy/BizConfigureProxy.java | 76 + .../bfd/parse/service/proxy/DataOperatorProxy.java | 85 + .../bfd/parse/service/proxy/DownloadClient.java | 103 ++ src/com/bfd/parse/service/proxy/IceProxy.java | 19 + src/com/bfd/parse/task/ConfigSynchronizer.java | 47 + src/com/bfd/parse/task/HookTask.java | 5 + src/com/bfd/parse/task/PageParserI.java | 121 ++ src/com/bfd/parse/task/ParseJob.java | 240 +++ 
src/com/bfd/parse/task/ParseQueue.java | 130 ++ src/com/bfd/parse/task/Server.java | 224 +++ src/com/bfd/parse/task/ServerTest.java | 23 + src/com/bfd/parse/test/BfdItemTester.java | 118 ++ src/com/bfd/parse/test/DomPathTester.java | 22 + src/com/bfd/parse/test/DomTmplTester.java | 194 +++ src/com/bfd/parse/test/JudgeRuleTester.java | 29 + src/com/bfd/parse/test/ParseTester.java | 277 ++++ src/com/bfd/parse/test/Test.java | 25 + src/com/bfd/parse/test/TestRequest.java | 127 ++ src/com/bfd/parse/test/TestResponse.java | 79 + src/com/bfd/parse/test/TestTagBalance.java | 36 + src/com/bfd/parse/test/Tester.java | 5 + src/com/bfd/parse/test/TesterFactory.java | 23 + .../parse/test/weibosinaparser/AWeiBoParser.java | 270 ++++ .../parse/test/weibosinaparser/CommentParser.java | 339 ++++ .../bfd/parse/test/weibosinaparser/Constants.java | 48 + .../test/weibosinaparser/FensiListParser.java | 148 ++ .../parse/test/weibosinaparser/IWeiBoparser.java | 17 + .../test/weibosinaparser/LoginServiceClient.java | 115 ++ .../parse/test/weibosinaparser/RepostParser.java | 194 +++ .../parse/test/weibosinaparser/SearchParser.java | 404 +++++ src/com/bfd/parse/test/weibosinaparser/Task.java | 76 + src/com/bfd/parse/test/weibosinaparser/Test.java | 27 + .../test/weibosinaparser/TestByHttpclient.java | 11 + .../parse/test/weibosinaparser/UserInfoParser.java | 298 ++++ src/com/bfd/parse/test/weibosinaparser/Utils.java | 65 + .../parse/test/weibosinaparser/WeiboParser.java | 360 +++++ .../test/weibosinaparser/WeiboParserFactory.java | 19 + src/com/bfd/parse/threadmanager/ParseWorker.java | 89 ++ src/com/bfd/parse/util/JsonUtil.java | 207 +++ src/com/bfd/parse/util/KfkUtils.java | 17 + src/com/bfd/parse/util/MyDateUtil.java | 940 +++++++++++ src/com/bfd/parse/util/ObjectCache.java | 38 + src/com/bfd/parse/util/ParseUtils.java | 152 ++ src/com/bfd/parse/util/Test.java | 240 +++ src/com/bfd/parse/util/TextUtil.java | 615 ++++++++ src/com/bfd/parse/util/UrlNormalizerUtil.java | 147 ++ 
src/com/bfd/parse/util/XiciHotBoardAnalysis.java | 149 ++ src/com/bfd/parse/vo/FiveTuple.java | 48 + src/com/bfd/parse/vo/FourTuple.java | 47 + src/com/bfd/parse/vo/ThreeTuple.java | 51 + src/com/bfd/parse/vo/Tuple.java | 20 + src/com/bfd/parse/vo/TwoTuple.java | 64 + src/com/bfd/parse/zkmonitor/ParseMonitor.java | 36 + .../download/plugin/NanzctrContentDownload.java | 63 + .../bfd/download/plugin/NanzctrListDownload.java | 208 +++ .../com/bfd/download/plugin/NcsiroDownload.java | 45 + .../bfd/download/plugin/NctrinicListDownload.java | 214 +++ .../com/bfd/download/plugin/NdrksListDownload.java | 340 ++++ .../com/bfd/download/plugin/NdsscuDownload.java | 79 + .../com/bfd/download/plugin/NgovListDownload.java | 207 +++ .../java/com/bfd/download/plugin/NicmDownload.java | 20 + .../bfd/download/plugin/NirbbarcelonaDownload.java | 42 + .../java/com/bfd/download/plugin/NissDownload.java | 45 + .../com/bfd/download/plugin/NkdcaDownload.java | 45 + .../com/bfd/download/plugin/NplymouthDownload.java | 61 + .../NpublichealthontarioContentDownload.java | 45 + .../plugin/NpublichealthontarioDownload.java | 143 ++ .../com/bfd/download/plugin/NrikenDownload.java | 62 + .../com/bfd/download/plugin/NumeduDownload.java | 27 + .../com/bfd/download/plugin/NunilDonwload.java | 72 + .../download/plugin/NutsouthwesternDownload.java | 63 + .../com/bfd/download/plugin/NvirologyDownload.java | 48 + .../com/bfd/download/plugin/NweizmannDownload.java | 45 + .../com/bfd/download/plugin/NwrairDownload.java | 60 + .../bfd/download/plugin/NwwwcdcgovDownload.java | 48 + .../bfd/download/plugin/NwwwiranintlDownload.java | 48 + .../bfd/download/plugin/PluginResultEntity.java | 100 ++ .../java/com/bfd/download/plugin/pluginDemo.java | 163 ++ src/main/java/com/bfd/main/Application.java | 26 + .../com/bfd/parse/preprocess/TestPreprocess.java | 28 + .../java/com/bfd/parse/reprocess/DemoListRe.java | 40 + .../bfd/parse/reprocess/DouBanChongWuDetail.java | 315 ++++ 
.../com/bfd/parse/reprocess/NanzctrContentRe.java | 149 ++ .../com/bfd/parse/reprocess/NanzctrListRe.java | 84 + .../com/bfd/parse/reprocess/NastarContentRe.java | 120 ++ .../java/com/bfd/parse/reprocess/NastarListRe.java | 98 ++ .../bfd/parse/reprocess/NcanadaenContentRe.java | 97 ++ .../com/bfd/parse/reprocess/NcanadaenListRe.java | 72 + .../com/bfd/parse/reprocess/NchariteContentRe.java | 119 ++ .../com/bfd/parse/reprocess/NchariteListRe.java | 98 ++ .../com/bfd/parse/reprocess/NcsiroContentRe.java | 89 ++ .../java/com/bfd/parse/reprocess/NcsiroListRe.java | 105 ++ .../com/bfd/parse/reprocess/NctrinicContentRe.java | 144 ++ .../com/bfd/parse/reprocess/NctrinicListRe.java | 192 +++ .../com/bfd/parse/reprocess/NdrksContentRe.java | 128 ++ .../java/com/bfd/parse/reprocess/NdrksListRe.java | 101 ++ .../com/bfd/parse/reprocess/NdsscuContentRe.java | 100 ++ .../java/com/bfd/parse/reprocess/NdsscuListRe.java | 97 ++ .../com/bfd/parse/reprocess/NfiocruzContentRe.java | 117 ++ .../com/bfd/parse/reprocess/NfiocruzListRe.java | 93 ++ .../com/bfd/parse/reprocess/NhqsmmContentRe.java | 166 ++ .../java/com/bfd/parse/reprocess/NhqsmmListRe.java | 65 + .../com/bfd/parse/reprocess/NicmContentRe.java | 137 ++ .../java/com/bfd/parse/reprocess/NicmListRe.java | 140 ++ .../bfd/parse/reprocess/NifengNewsContentRe.java | 334 ++++ .../parse/reprocess/NirbbarcelonaContentRe.java | 114 ++ .../bfd/parse/reprocess/NirbbarcelonaListRe.java | 109 ++ .../com/bfd/parse/reprocess/NissContentRe.java | 104 ++ .../java/com/bfd/parse/reprocess/NissListRe.java | 148 ++ .../com/bfd/parse/reprocess/NkdcaContentRe.java | 66 + .../java/com/bfd/parse/reprocess/NkdcaListRe.java | 95 ++ .../bfd/parse/reprocess/NkuleuvenContentRe.java | 113 ++ .../com/bfd/parse/reprocess/NkuleuvenListRe.java | 112 ++ .../bfd/parse/reprocess/NmanchesterContentRe.java | 111 ++ .../com/bfd/parse/reprocess/NmanchesterListRe.java | 103 ++ .../bfd/parse/reprocess/NmedicinaContentRe.java | 127 ++ 
.../com/bfd/parse/reprocess/NmedicinaListRe.java | 100 ++ .../com/bfd/parse/reprocess/NmofgovConentRe.java | 372 +++++ .../com/bfd/parse/reprocess/NncngovListRe.java | 103 ++ .../com/bfd/parse/reprocess/NnicdContentRe.java | 114 ++ .../java/com/bfd/parse/reprocess/NnicdListRe.java | 95 ++ .../com/bfd/parse/reprocess/NpasteurContentRe.java | 69 + .../com/bfd/parse/reprocess/NpasteurListRe.java | 103 ++ .../bfd/parse/reprocess/NplymouthContentRe.java | 125 ++ .../com/bfd/parse/reprocess/NplymouthListRe.java | 108 ++ .../reprocess/NpublichealthontarioContentRe.java | 102 ++ .../reprocess/NpublichealthontarioListRe.java | 99 ++ .../com/bfd/parse/reprocess/NrikContentRe.java | 70 + .../java/com/bfd/parse/reprocess/NrikListRe.java | 98 ++ .../com/bfd/parse/reprocess/NrikenContentRe.java | 113 ++ .../java/com/bfd/parse/reprocess/NrikenListRe.java | 98 ++ .../com/bfd/parse/reprocess/NsantegouvListRe.java | 83 + .../com/bfd/parse/reprocess/NsustechContentRe.java | 121 ++ .../com/bfd/parse/reprocess/NsustechListRe.java | 112 ++ .../com/bfd/parse/reprocess/NszuContentRe.java | 86 ++ .../java/com/bfd/parse/reprocess/NszuListRe.java | 103 ++ .../com/bfd/parse/reprocess/NumeduContentRe.java | 145 ++ .../java/com/bfd/parse/reprocess/NumeduListRe.java | 107 ++ .../parse/reprocess/NumontpellierContentRe.java | 63 + .../bfd/parse/reprocess/NumontpellierListRe.java | 90 ++ .../com/bfd/parse/reprocess/NunibasContentRe.java | 123 ++ .../com/bfd/parse/reprocess/NunibasListRe.java | 100 ++ .../com/bfd/parse/reprocess/NunilContentRe.java | 115 ++ .../java/com/bfd/parse/reprocess/NunilListRe.java | 92 ++ .../com/bfd/parse/reprocess/NunmContentRe.java | 114 ++ .../java/com/bfd/parse/reprocess/NunmListRe.java | 97 ++ .../parse/reprocess/NutsouthwesternContentRe.java | 100 ++ .../bfd/parse/reprocess/NutsouthwesternListRe.java | 74 + .../java/com/bfd/parse/reprocess/NuuContentRe.java | 112 ++ .../java/com/bfd/parse/reprocess/NuuListRe.java | 105 ++ 
.../com/bfd/parse/reprocess/NvniivContentRe.java | 84 + .../java/com/bfd/parse/reprocess/NvniivListRe.java | 72 + .../bfd/parse/reprocess/NwashingtonContentRe.java | 114 ++ .../com/bfd/parse/reprocess/NwashingtonListRe.java | 100 ++ .../bfd/parse/reprocess/NweizmannContentRe.java | 124 ++ .../com/bfd/parse/reprocess/NweizmannListRe.java | 112 ++ .../java/com/bfd/parse/reprocess/NwrairListRe.java | 111 ++ .../bfd/parse/reprocess/NwwwcdcgovContentRe.java | 82 + .../com/bfd/parse/reprocess/NwwwcdcgovListRe.java | 115 ++ .../bfd/parse/reprocess/NwwwiranintlContentRe.java | 110 ++ .../bfd/parse/reprocess/NwwwiranintlListRe.java | 95 ++ src/main/resources/application.properties | 6 + src/main/resources/banner.txt | 20 + src/main/resources/logback-spring.xml | 36 + 409 files changed, 36325 insertions(+) create mode 100644 .classpath create mode 100644 .gitignore create mode 100644 .idea/ParsePlugin2.4.iml create mode 100644 .idea/compiler.xml create mode 100644 .idea/encodings.xml create mode 100644 .idea/jarRepositories.xml create mode 100644 .idea/libraries/Maven__ant_ant_1_6_5.xml create mode 100644 .idea/libraries/Maven__antlr_antlr_2_7_6.xml create mode 100644 .idea/libraries/Maven__asm_asm_1_5_3.xml create mode 100644 .idea/libraries/Maven__asm_asm_attrs_1_5_3.xml create mode 100644 .idea/libraries/Maven__c3p0_c3p0_0_9_1_2.xml create mode 100644 .idea/libraries/Maven__cglib_cglib_2_1_3.xml create mode 100644 .idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml create mode 100644 .idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml create mode 100644 .idea/libraries/Maven__com_101tec_zkclient_0_3.xml create mode 100644 .idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_0_0.xml create mode 100644 .idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_0_0.xml create mode 
100644 .idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_0_0.xml create mode 100644 .idea/libraries/Maven__com_github_stephenc_high_scale_lib_high_scale_lib_1_1_1.xml create mode 100644 .idea/libraries/Maven__com_google_guava_guava_14_0_1.xml create mode 100644 .idea/libraries/Maven__com_google_protobuf_protobuf_java_2_4_1.xml create mode 100644 .idea/libraries/Maven__com_sun_jersey_jersey_core_1_8.xml create mode 100644 .idea/libraries/Maven__com_sun_jersey_jersey_json_1_8.xml create mode 100644 .idea/libraries/Maven__com_sun_jersey_jersey_server_1_8.xml create mode 100644 .idea/libraries/Maven__com_sun_xml_bind_jaxb_impl_2_2_3_1.xml create mode 100644 .idea/libraries/Maven__com_wandoulabs_jodis_jodis_0_1_2.xml create mode 100644 .idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml create mode 100644 .idea/libraries/Maven__commons_beanutils_commons_beanutils_1_7_0.xml create mode 100644 .idea/libraries/Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml create mode 100644 .idea/libraries/Maven__commons_cli_commons_cli_1_2.xml create mode 100644 .idea/libraries/Maven__commons_codec_commons_codec_1_4.xml create mode 100644 .idea/libraries/Maven__commons_collections_commons_collections_3_1.xml create mode 100644 .idea/libraries/Maven__commons_configuration_commons_configuration_1_6.xml create mode 100644 .idea/libraries/Maven__commons_digester_commons_digester_2_1.xml create mode 100644 .idea/libraries/Maven__commons_el_commons_el_1_0.xml create mode 100644 .idea/libraries/Maven__commons_httpclient_commons_httpclient_3_0_1.xml create mode 100644 .idea/libraries/Maven__commons_io_commons_io_1_4.xml create mode 100644 .idea/libraries/Maven__commons_lang_commons_lang_2_4.xml create mode 100644 .idea/libraries/Maven__commons_logging_commons_logging_1_1_1.xml create mode 100644 .idea/libraries/Maven__commons_logging_commons_logging_api_1_0_4.xml create mode 100644 .idea/libraries/Maven__commons_net_commons_net_1_4_1.xml create mode 
100644 .idea/libraries/Maven__dom4j_dom4j_1_6_1.xml create mode 100644 .idea/libraries/Maven__hsqldb_hsqldb_1_8_0_10.xml create mode 100644 .idea/libraries/Maven__io_netty_netty_3_7_0_Final.xml create mode 100644 .idea/libraries/Maven__javassist_javassist_3_4_GA.xml create mode 100644 .idea/libraries/Maven__javax_activation_activation_1_1.xml create mode 100644 .idea/libraries/Maven__javax_persistence_persistence_api_1_0.xml create mode 100644 .idea/libraries/Maven__javax_transaction_jta_1_1.xml create mode 100644 .idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml create mode 100644 .idea/libraries/Maven__javax_xml_bind_jaxb_api_2_1.xml create mode 100644 .idea/libraries/Maven__jline_jline_0_9_94.xml create mode 100644 .idea/libraries/Maven__junit_junit_4_11.xml create mode 100644 .idea/libraries/Maven__log4j_log4j_1_2_14.xml create mode 100644 .idea/libraries/Maven__mysql_mysql_connector_java_5_1_6.xml create mode 100644 .idea/libraries/Maven__nekohtml_nekohtml_0_9_5.xml create mode 100644 .idea/libraries/Maven__net_java_dev_jets3t_jets3t_0_7_1.xml create mode 100644 .idea/libraries/Maven__net_jpountz_lz4_lz4_1_2_0.xml create mode 100644 .idea/libraries/Maven__net_sf_ehcache_ehcache_1_2_3.xml create mode 100644 .idea/libraries/Maven__net_sf_ezmorph_ezmorph_1_0_6.xml create mode 100644 .idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_3_2.xml create mode 100644 .idea/libraries/Maven__net_sf_kosmosfs_kfs_0_3.xml create mode 100644 .idea/libraries/Maven__net_sourceforge_htmlcleaner_htmlcleaner_2_4.xml create mode 100644 .idea/libraries/Maven__org_apache_avro_avro_1_5_3.xml create mode 100644 .idea/libraries/Maven__org_apache_avro_avro_ipc_1_5_3.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_math_2_1.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_pool2_2_3.xml create mode 100644 .idea/libraries/Maven__org_apache_curator_curator_client_2_7_0.xml create mode 100644 
.idea/libraries/Maven__org_apache_curator_curator_framework_2_7_0.xml create mode 100644 .idea/libraries/Maven__org_apache_curator_curator_recipes_2_7_0.xml create mode 100644 .idea/libraries/Maven__org_apache_hadoop_hadoop_core_1_0_4.xml create mode 100644 .idea/libraries/Maven__org_apache_hbase_hbase_0_94_9.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_3_5.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_3_2.xml create mode 100644 .idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_8_2_1.xml create mode 100644 .idea/libraries/Maven__org_apache_kafka_kafka_clients_0_8_2_1.xml create mode 100644 .idea/libraries/Maven__org_apache_thrift_libthrift_0_8_0.xml create mode 100644 .idea/libraries/Maven__org_apache_tika_tika_core_0_9.xml create mode 100644 .idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml create mode 100644 .idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml create mode 100644 .idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml create mode 100644 .idea/libraries/Maven__org_apache_velocity_velocity_1_7.xml create mode 100644 .idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_6.xml create mode 100644 .idea/libraries/Maven__org_codehaus_jackson_jackson_core_asl_1_8_8.xml create mode 100644 .idea/libraries/Maven__org_codehaus_jackson_jackson_jaxrs_1_8_8.xml create mode 100644 .idea/libraries/Maven__org_codehaus_jackson_jackson_mapper_asl_1_0_1.xml create mode 100644 .idea/libraries/Maven__org_codehaus_jackson_jackson_xc_1_8_8.xml create mode 100644 .idea/libraries/Maven__org_codehaus_jettison_jettison_1_1.xml create mode 100644 .idea/libraries/Maven__org_eclipse_jdt_core_3_1_1.xml create mode 100644 .idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml create mode 100644 .idea/libraries/Maven__org_hibernate_ejb3_persistence_1_0_2_GA.xml create mode 100644 
.idea/libraries/Maven__org_hibernate_hibernate_3_2_1_ga.xml create mode 100644 .idea/libraries/Maven__org_hibernate_hibernate_annotations_3_4_0_GA.xml create mode 100644 .idea/libraries/Maven__org_hibernate_hibernate_commons_annotations_3_3_0_ga.xml create mode 100644 .idea/libraries/Maven__org_hibernate_hibernate_core_3_3_2_GA.xml create mode 100644 .idea/libraries/Maven__org_hibernate_hibernate_entitymanager_3_4_0_GA.xml create mode 100644 .idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml create mode 100644 .idea/libraries/Maven__org_jamon_jamon_runtime_2_3_1.xml create mode 100644 .idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml create mode 100644 .idea/libraries/Maven__org_jboss_netty_netty_3_2_4_Final.xml create mode 100644 .idea/libraries/Maven__org_jdom_jdom2_2_0_6.xml create mode 100644 .idea/libraries/Maven__org_jruby_jruby_complete_1_6_5.xml create mode 100644 .idea/libraries/Maven__org_jsoup_jsoup_1_7_3.xml create mode 100644 .idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26.xml create mode 100644 .idea/libraries/Maven__org_mortbay_jetty_jetty_util_6_1_26.xml create mode 100644 .idea/libraries/Maven__org_mortbay_jetty_jsp_2_1_6_1_14.xml create mode 100644 .idea/libraries/Maven__org_mortbay_jetty_jsp_api_2_1_6_1_14.xml create mode 100644 .idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_20081211.xml create mode 100644 .idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_6_1_14.xml create mode 100644 .idea/libraries/Maven__org_mybatis_mybatis_3_1_1.xml create mode 100644 .idea/libraries/Maven__org_scala_lang_scala_library_2_10_4.xml create mode 100644 .idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml create mode 100644 .idea/libraries/Maven__org_slf4j_slf4j_api_1_6_1.xml create mode 100644 
.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_6_1.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml create mode 100644 .idea/libraries/Maven__org_xerial_snappy_snappy_java_1_0_4_1.xml create mode 100644 .idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml create mode 100644 .idea/libraries/Maven__oro_oro_2_0_8.xml create mode 100644 .idea/libraries/Maven__redis_clients_jedis_2_6_0.xml create mode 100644 .idea/libraries/Maven__stax_stax_api_1_0_1.xml create mode 100644 .idea/libraries/Maven__tomcat_jasper_compiler_5_5_12.xml create mode 100644 .idea/libraries/Maven__tomcat_jasper_runtime_5_5_12.xml create mode 100644 .idea/libraries/Maven__xalan_serializer_2_7_1.xml create mode 100644 
.idea/libraries/Maven__xalan_xalan_2_7_1.xml create mode 100644 .idea/libraries/Maven__xerces_xercesImpl_2_9_1.xml create mode 100644 .idea/libraries/Maven__xerces_xmlParserAPIs_2_6_2.xml create mode 100644 .idea/libraries/Maven__xml_apis_xml_apis_1_3_04.xml create mode 100644 .idea/libraries/Maven__xmlenc_xmlenc_0_52.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/uiDesigner.xml create mode 100644 .idea/workspace.xml create mode 100644 .project create mode 100644 pom.xml create mode 100644 src/com/bfd/parse/AutoRuleParser.java create mode 100644 src/com/bfd/parse/Constants.java create mode 100644 src/com/bfd/parse/Constants_TraceTask.java create mode 100644 src/com/bfd/parse/DataSaver.java create mode 100644 src/com/bfd/parse/DomParser.java create mode 100644 src/com/bfd/parse/FileParser.java create mode 100644 src/com/bfd/parse/ParseResult.java create mode 100644 src/com/bfd/parse/ParseStat.java create mode 100644 src/com/bfd/parse/ParseTestForPlugin.java create mode 100644 src/com/bfd/parse/ParserFace.java create mode 100644 src/com/bfd/parse/client/AbstractClient.java create mode 100644 src/com/bfd/parse/client/ConfigClient.java create mode 100644 src/com/bfd/parse/client/DataOperatorClient.java create mode 100644 src/com/bfd/parse/client/DeduplicatorClient.java create mode 100644 src/com/bfd/parse/client/DownloadClient.java create mode 100644 src/com/bfd/parse/client/DownloadClientTest.java create mode 100644 src/com/bfd/parse/client/IceClient.java create mode 100644 src/com/bfd/parse/client/LoginManagerClient.java create mode 100644 src/com/bfd/parse/client/TraceTaskClient.java create mode 100644 src/com/bfd/parse/client/URLNormalizerClient.java create mode 100644 src/com/bfd/parse/config/AConfig.java create mode 100644 src/com/bfd/parse/config/Config.java create mode 100644 src/com/bfd/parse/config/ConfigFactory.java create mode 100644 src/com/bfd/parse/config/ConfigLoader.java create mode 100644 
src/com/bfd/parse/config/FieldDefine/FieldDefineConfig.java create mode 100644 src/com/bfd/parse/config/PageDefine/PageDefineConfig.java create mode 100644 src/com/bfd/parse/config/dom/DomCFGBlock.java create mode 100644 src/com/bfd/parse/config/dom/DomCFGElement.java create mode 100644 src/com/bfd/parse/config/dom/DomCFGField.java create mode 100644 src/com/bfd/parse/config/dom/DomCFGTree.java create mode 100644 src/com/bfd/parse/config/dom/DomConfig.java create mode 100644 src/com/bfd/parse/config/dom/DomSearch.java create mode 100644 src/com/bfd/parse/config/dom/DomTemplate.java create mode 100644 src/com/bfd/parse/config/dom/FieldRs.java create mode 100644 src/com/bfd/parse/config/fldmap/BfdItemFldMapRule.java create mode 100644 src/com/bfd/parse/config/fldmap/ItemFldFormator.java create mode 100644 src/com/bfd/parse/config/fldmap/ItemFldMapConfig.java create mode 100644 src/com/bfd/parse/config/fldmap/ItemInfoParser.java create mode 100644 src/com/bfd/parse/config/fldmap/ItemInfoParser2.java create mode 100644 src/com/bfd/parse/config/iid/ParseCfg.java create mode 100644 src/com/bfd/parse/config/iid/ParseConfigure.java create mode 100644 src/com/bfd/parse/config/iid/ParseReProcessor.java create mode 100644 src/com/bfd/parse/config/parseplugin/PluginClassLoader.java create mode 100644 src/com/bfd/parse/config/parseplugin/PluginConfig.java create mode 100644 src/com/bfd/parse/config/parseplugin/PluginRuntimeException.java create mode 100644 src/com/bfd/parse/config/parsetemplate/ParseTemplateConfig.java create mode 100644 src/com/bfd/parse/config/shelf/EcConfigCache.java create mode 100644 src/com/bfd/parse/config/shelf/JudgeRule.java create mode 100644 src/com/bfd/parse/config/shelf/JudgeRuleConfig.java create mode 100644 src/com/bfd/parse/config/shelf/JudgeStatue.java create mode 100644 src/com/bfd/parse/config/sitepageconfig/SitePageConfigCache.java create mode 100644 src/com/bfd/parse/config/website/AutoRuleWrap.java create mode 100644 
src/com/bfd/parse/config/website/WebsiteCache.java create mode 100644 src/com/bfd/parse/entity/BaseEntity.java create mode 100644 src/com/bfd/parse/entity/CreateTaskEntity.java create mode 100644 src/com/bfd/parse/entity/ECConfigEntity.java create mode 100644 src/com/bfd/parse/entity/FielddefineEntity.java create mode 100644 src/com/bfd/parse/entity/FlowquotaEntity.java create mode 100644 src/com/bfd/parse/entity/PagedefineEntity.java create mode 100644 src/com/bfd/parse/entity/ParsePluginEntity.java create mode 100644 src/com/bfd/parse/entity/ParsetemplateEntity.java create mode 100644 src/com/bfd/parse/entity/SitepageconfigEntity.java create mode 100644 src/com/bfd/parse/entity/WebsiteEntity.java create mode 100644 src/com/bfd/parse/etlchain/EtlChain.java create mode 100644 src/com/bfd/parse/facade/IParse.java create mode 100644 src/com/bfd/parse/facade/ParserFactory.java create mode 100644 src/com/bfd/parse/facade/autoparse/AutoParser.java create mode 100644 src/com/bfd/parse/facade/jspageparse/JsPageParser.java create mode 100644 src/com/bfd/parse/facade/parseunit/ParseUnit.java create mode 100644 src/com/bfd/parse/facade/tmplparse/TmplParser.java create mode 100644 src/com/bfd/parse/file/DataLoader.java create mode 100644 src/com/bfd/parse/file/FileParseWorker.java create mode 100644 src/com/bfd/parse/file/SourceDataLoader.java create mode 100644 src/com/bfd/parse/json/AMJsonParser.java create mode 100644 src/com/bfd/parse/json/JsonData.java create mode 100644 src/com/bfd/parse/json/JsonParser.java create mode 100644 src/com/bfd/parse/json/JsonParserException.java create mode 100644 src/com/bfd/parse/json/JsonParserFactory.java create mode 100644 src/com/bfd/parse/json/JsonParserNotFound.java create mode 100644 src/com/bfd/parse/json/JsonParserResult.java create mode 100644 src/com/bfd/parse/preprocess/PreProcessor.java create mode 100644 src/com/bfd/parse/preprocess/PreProcessorFactory.java create mode 100644 
src/com/bfd/parse/preprocess/PreProcessorNotFound.java create mode 100644 src/com/bfd/parse/preprocess/Test.java create mode 100644 src/com/bfd/parse/reprocess/AMReProcessor.java create mode 100644 src/com/bfd/parse/reprocess/ReProcessCode.java create mode 100644 src/com/bfd/parse/reprocess/ReProcessResult.java create mode 100644 src/com/bfd/parse/reprocess/ReProcessor.java create mode 100644 src/com/bfd/parse/reprocess/ReProcessorFactory.java create mode 100644 src/com/bfd/parse/reprocess/ReProcessorNotFound.java create mode 100644 src/com/bfd/parse/reprocess/RegexRule.java create mode 100644 src/com/bfd/parse/reprocess/ReplaceRule.java create mode 100644 src/com/bfd/parse/reprocess/ReprocessHelper.java create mode 100644 src/com/bfd/parse/service/DataOperatorService.java create mode 100644 src/com/bfd/parse/service/proxy/AbstractIceProxy.java create mode 100644 src/com/bfd/parse/service/proxy/AlarmServiceProxy.java create mode 100644 src/com/bfd/parse/service/proxy/BizConfigureProxy.java create mode 100644 src/com/bfd/parse/service/proxy/DataOperatorProxy.java create mode 100644 src/com/bfd/parse/service/proxy/DownloadClient.java create mode 100644 src/com/bfd/parse/service/proxy/IceProxy.java create mode 100644 src/com/bfd/parse/task/ConfigSynchronizer.java create mode 100644 src/com/bfd/parse/task/HookTask.java create mode 100644 src/com/bfd/parse/task/PageParserI.java create mode 100644 src/com/bfd/parse/task/ParseJob.java create mode 100644 src/com/bfd/parse/task/ParseQueue.java create mode 100644 src/com/bfd/parse/task/Server.java create mode 100644 src/com/bfd/parse/task/ServerTest.java create mode 100644 src/com/bfd/parse/test/BfdItemTester.java create mode 100644 src/com/bfd/parse/test/DomPathTester.java create mode 100644 src/com/bfd/parse/test/DomTmplTester.java create mode 100644 src/com/bfd/parse/test/JudgeRuleTester.java create mode 100644 src/com/bfd/parse/test/ParseTester.java create mode 100644 src/com/bfd/parse/test/Test.java create mode 100644 
src/com/bfd/parse/test/TestRequest.java create mode 100644 src/com/bfd/parse/test/TestResponse.java create mode 100644 src/com/bfd/parse/test/TestTagBalance.java create mode 100644 src/com/bfd/parse/test/Tester.java create mode 100644 src/com/bfd/parse/test/TesterFactory.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/AWeiBoParser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/CommentParser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/Constants.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/FensiListParser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/IWeiBoparser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/LoginServiceClient.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/RepostParser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/SearchParser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/Task.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/Test.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/TestByHttpclient.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/UserInfoParser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/Utils.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/WeiboParser.java create mode 100644 src/com/bfd/parse/test/weibosinaparser/WeiboParserFactory.java create mode 100644 src/com/bfd/parse/threadmanager/ParseWorker.java create mode 100644 src/com/bfd/parse/util/JsonUtil.java create mode 100644 src/com/bfd/parse/util/KfkUtils.java create mode 100644 src/com/bfd/parse/util/MyDateUtil.java create mode 100644 src/com/bfd/parse/util/ObjectCache.java create mode 100644 src/com/bfd/parse/util/ParseUtils.java create mode 100644 src/com/bfd/parse/util/Test.java create mode 100644 src/com/bfd/parse/util/TextUtil.java create mode 100644 src/com/bfd/parse/util/UrlNormalizerUtil.java create mode 100644 
src/com/bfd/parse/util/XiciHotBoardAnalysis.java create mode 100644 src/com/bfd/parse/vo/FiveTuple.java create mode 100644 src/com/bfd/parse/vo/FourTuple.java create mode 100644 src/com/bfd/parse/vo/ThreeTuple.java create mode 100644 src/com/bfd/parse/vo/Tuple.java create mode 100644 src/com/bfd/parse/vo/TwoTuple.java create mode 100644 src/com/bfd/parse/zkmonitor/ParseMonitor.java create mode 100644 src/main/java/com/bfd/download/plugin/NanzctrContentDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NanzctrListDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NcsiroDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NctrinicListDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NdrksListDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NdsscuDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NgovListDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NicmDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NirbbarcelonaDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NissDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NkdcaDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NplymouthDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NpublichealthontarioContentDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NpublichealthontarioDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NrikenDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NumeduDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NunilDonwload.java create mode 100644 src/main/java/com/bfd/download/plugin/NutsouthwesternDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NvirologyDownload.java create mode 100644 
src/main/java/com/bfd/download/plugin/NweizmannDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NwrairDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NwwwcdcgovDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/NwwwiranintlDownload.java create mode 100644 src/main/java/com/bfd/download/plugin/PluginResultEntity.java create mode 100644 src/main/java/com/bfd/download/plugin/pluginDemo.java create mode 100644 src/main/java/com/bfd/main/Application.java create mode 100644 src/main/java/com/bfd/parse/preprocess/TestPreprocess.java create mode 100644 src/main/java/com/bfd/parse/reprocess/DemoListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/DouBanChongWuDetail.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NanzctrContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NanzctrListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NastarContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NastarListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NcanadaenContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NcanadaenListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NchariteContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NchariteListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NcsiroContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NcsiroListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NctrinicContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NctrinicListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NdrksContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NdrksListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NdsscuContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NdsscuListRe.java create 
mode 100644 src/main/java/com/bfd/parse/reprocess/NfiocruzContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NfiocruzListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NhqsmmContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NhqsmmListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NicmContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NicmListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NifengNewsContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NirbbarcelonaContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NirbbarcelonaListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NissContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NissListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NkdcaContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NkdcaListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NkuleuvenContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NkuleuvenListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NmanchesterContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NmanchesterListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NmedicinaContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NmedicinaListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NmofgovConentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NncngovListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NnicdContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NnicdListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NpasteurContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NpasteurListRe.java create mode 100644 
src/main/java/com/bfd/parse/reprocess/NplymouthContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NplymouthListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NpublichealthontarioContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NpublichealthontarioListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NrikContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NrikListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NrikenContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NrikenListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NsantegouvListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NsustechContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NsustechListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NszuContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NszuListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NumeduContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NumeduListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NumontpellierContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NumontpellierListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NunibasContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NunibasListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NunilContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NunilListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NunmContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NunmListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NutsouthwesternContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NutsouthwesternListRe.java create mode 100644 
src/main/java/com/bfd/parse/reprocess/NuuContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NuuListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NvniivContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NvniivListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NwashingtonContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NwashingtonListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NweizmannContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NweizmannListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NwrairListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NwwwcdcgovContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NwwwcdcgovListRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NwwwiranintlContentRe.java create mode 100644 src/main/java/com/bfd/parse/reprocess/NwwwiranintlListRe.java create mode 100644 src/main/resources/application.properties create mode 100644 src/main/resources/banner.txt create mode 100644 src/main/resources/logback-spring.xml diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..a27c6d0 --- /dev/null +++ b/.classpath @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..40cbd51 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target/ +/logs/ +/.settings/ \ No newline at end of file diff --git a/.idea/ParsePlugin2.4.iml b/.idea/ParsePlugin2.4.iml new file mode 100644 index 0000000..52572de --- /dev/null +++ b/.idea/ParsePlugin2.4.iml @@ -0,0 +1,571 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..8d4d511 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..63e9001 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/jarRepositories.xml b/.idea/jarRepositories.xml new file mode 100644 index 0000000..6c23d30 --- /dev/null +++ b/.idea/jarRepositories.xml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__ant_ant_1_6_5.xml b/.idea/libraries/Maven__ant_ant_1_6_5.xml new file mode 100644 index 0000000..180bb2d --- /dev/null +++ b/.idea/libraries/Maven__ant_ant_1_6_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__antlr_antlr_2_7_6.xml b/.idea/libraries/Maven__antlr_antlr_2_7_6.xml new file mode 100644 index 0000000..c427128 --- /dev/null +++ 
b/.idea/libraries/Maven__antlr_antlr_2_7_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__asm_asm_1_5_3.xml b/.idea/libraries/Maven__asm_asm_1_5_3.xml new file mode 100644 index 0000000..c0d6922 --- /dev/null +++ b/.idea/libraries/Maven__asm_asm_1_5_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__asm_asm_attrs_1_5_3.xml b/.idea/libraries/Maven__asm_asm_attrs_1_5_3.xml new file mode 100644 index 0000000..3e9481d --- /dev/null +++ b/.idea/libraries/Maven__asm_asm_attrs_1_5_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__c3p0_c3p0_0_9_1_2.xml b/.idea/libraries/Maven__c3p0_c3p0_0_9_1_2.xml new file mode 100644 index 0000000..b0fbb97 --- /dev/null +++ b/.idea/libraries/Maven__c3p0_c3p0_0_9_1_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__cglib_cglib_2_1_3.xml b/.idea/libraries/Maven__cglib_cglib_2_1_3.xml new file mode 100644 index 0000000..885c7c3 --- /dev/null +++ b/.idea/libraries/Maven__cglib_cglib_2_1_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml b/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml new file mode 100644 index 0000000..2080e00 --- /dev/null +++ b/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_11.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml b/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml new file mode 100644 index 0000000..2dd2052 --- /dev/null +++ b/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_11.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_101tec_zkclient_0_3.xml 
b/.idea/libraries/Maven__com_101tec_zkclient_0_3.xml new file mode 100644 index 0000000..96be526 --- /dev/null +++ b/.idea/libraries/Maven__com_101tec_zkclient_0_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml b/.idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml new file mode 100644 index 0000000..a0730aa --- /dev/null +++ b/.idea/libraries/Maven__com_alibaba_fastjson_1_1_22.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml b/.idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml new file mode 100644 index 0000000..2686533 --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_classmate_1_3_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_0_0.xml b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_0_0.xml new file mode 100644 index 0000000..a7d07cb --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_0_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_0_0.xml b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_0_0.xml new file mode 100644 index 0000000..ef3d856 --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_0_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_0_0.xml b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_0_0.xml new file mode 100644 index 0000000..2e09727 --- /dev/null +++ b/.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_0_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline 
at end of file diff --git a/.idea/libraries/Maven__com_github_stephenc_high_scale_lib_high_scale_lib_1_1_1.xml b/.idea/libraries/Maven__com_github_stephenc_high_scale_lib_high_scale_lib_1_1_1.xml new file mode 100644 index 0000000..a4c0756 --- /dev/null +++ b/.idea/libraries/Maven__com_github_stephenc_high_scale_lib_high_scale_lib_1_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_guava_guava_14_0_1.xml b/.idea/libraries/Maven__com_google_guava_guava_14_0_1.xml new file mode 100644 index 0000000..153eb73 --- /dev/null +++ b/.idea/libraries/Maven__com_google_guava_guava_14_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_protobuf_protobuf_java_2_4_1.xml b/.idea/libraries/Maven__com_google_protobuf_protobuf_java_2_4_1.xml new file mode 100644 index 0000000..2a614e0 --- /dev/null +++ b/.idea/libraries/Maven__com_google_protobuf_protobuf_java_2_4_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_jersey_jersey_core_1_8.xml b/.idea/libraries/Maven__com_sun_jersey_jersey_core_1_8.xml new file mode 100644 index 0000000..cf9dd1f --- /dev/null +++ b/.idea/libraries/Maven__com_sun_jersey_jersey_core_1_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_jersey_jersey_json_1_8.xml b/.idea/libraries/Maven__com_sun_jersey_jersey_json_1_8.xml new file mode 100644 index 0000000..632eada --- /dev/null +++ b/.idea/libraries/Maven__com_sun_jersey_jersey_json_1_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_jersey_jersey_server_1_8.xml b/.idea/libraries/Maven__com_sun_jersey_jersey_server_1_8.xml new file mode 100644 index 0000000..e12b39d --- /dev/null +++ b/.idea/libraries/Maven__com_sun_jersey_jersey_server_1_8.xml @@ 
-0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_sun_xml_bind_jaxb_impl_2_2_3_1.xml b/.idea/libraries/Maven__com_sun_xml_bind_jaxb_impl_2_2_3_1.xml new file mode 100644 index 0000000..fc3eb62 --- /dev/null +++ b/.idea/libraries/Maven__com_sun_xml_bind_jaxb_impl_2_2_3_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_wandoulabs_jodis_jodis_0_1_2.xml b/.idea/libraries/Maven__com_wandoulabs_jodis_jodis_0_1_2.xml new file mode 100644 index 0000000..089851b --- /dev/null +++ b/.idea/libraries/Maven__com_wandoulabs_jodis_jodis_0_1_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml b/.idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml new file mode 100644 index 0000000..e876f6a --- /dev/null +++ b/.idea/libraries/Maven__com_yammer_metrics_metrics_core_2_2_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_beanutils_commons_beanutils_1_7_0.xml b/.idea/libraries/Maven__commons_beanutils_commons_beanutils_1_7_0.xml new file mode 100644 index 0000000..2c610a4 --- /dev/null +++ b/.idea/libraries/Maven__commons_beanutils_commons_beanutils_1_7_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml b/.idea/libraries/Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml new file mode 100644 index 0000000..a195a86 --- /dev/null +++ b/.idea/libraries/Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml b/.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml new file mode 100644 index 0000000..90d277d --- /dev/null +++ 
b/.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_codec_commons_codec_1_4.xml b/.idea/libraries/Maven__commons_codec_commons_codec_1_4.xml new file mode 100644 index 0000000..fbc04aa --- /dev/null +++ b/.idea/libraries/Maven__commons_codec_commons_codec_1_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_collections_commons_collections_3_1.xml b/.idea/libraries/Maven__commons_collections_commons_collections_3_1.xml new file mode 100644 index 0000000..b264d07 --- /dev/null +++ b/.idea/libraries/Maven__commons_collections_commons_collections_3_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_configuration_commons_configuration_1_6.xml b/.idea/libraries/Maven__commons_configuration_commons_configuration_1_6.xml new file mode 100644 index 0000000..8c12b80 --- /dev/null +++ b/.idea/libraries/Maven__commons_configuration_commons_configuration_1_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_digester_commons_digester_2_1.xml b/.idea/libraries/Maven__commons_digester_commons_digester_2_1.xml new file mode 100644 index 0000000..b0e6b12 --- /dev/null +++ b/.idea/libraries/Maven__commons_digester_commons_digester_2_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_el_commons_el_1_0.xml b/.idea/libraries/Maven__commons_el_commons_el_1_0.xml new file mode 100644 index 0000000..71eae92 --- /dev/null +++ b/.idea/libraries/Maven__commons_el_commons_el_1_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_httpclient_commons_httpclient_3_0_1.xml 
b/.idea/libraries/Maven__commons_httpclient_commons_httpclient_3_0_1.xml new file mode 100644 index 0000000..1e2c155 --- /dev/null +++ b/.idea/libraries/Maven__commons_httpclient_commons_httpclient_3_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_io_commons_io_1_4.xml b/.idea/libraries/Maven__commons_io_commons_io_1_4.xml new file mode 100644 index 0000000..fc4ca30 --- /dev/null +++ b/.idea/libraries/Maven__commons_io_commons_io_1_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_lang_commons_lang_2_4.xml b/.idea/libraries/Maven__commons_lang_commons_lang_2_4.xml new file mode 100644 index 0000000..4125d79 --- /dev/null +++ b/.idea/libraries/Maven__commons_lang_commons_lang_2_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_logging_commons_logging_1_1_1.xml b/.idea/libraries/Maven__commons_logging_commons_logging_1_1_1.xml new file mode 100644 index 0000000..0f3aa32 --- /dev/null +++ b/.idea/libraries/Maven__commons_logging_commons_logging_1_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_logging_commons_logging_api_1_0_4.xml b/.idea/libraries/Maven__commons_logging_commons_logging_api_1_0_4.xml new file mode 100644 index 0000000..0ccadbc --- /dev/null +++ b/.idea/libraries/Maven__commons_logging_commons_logging_api_1_0_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_net_commons_net_1_4_1.xml b/.idea/libraries/Maven__commons_net_commons_net_1_4_1.xml new file mode 100644 index 0000000..ea4bd93 --- /dev/null +++ b/.idea/libraries/Maven__commons_net_commons_net_1_4_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__dom4j_dom4j_1_6_1.xml 
b/.idea/libraries/Maven__dom4j_dom4j_1_6_1.xml new file mode 100644 index 0000000..ebde198 --- /dev/null +++ b/.idea/libraries/Maven__dom4j_dom4j_1_6_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__hsqldb_hsqldb_1_8_0_10.xml b/.idea/libraries/Maven__hsqldb_hsqldb_1_8_0_10.xml new file mode 100644 index 0000000..87e28b4 --- /dev/null +++ b/.idea/libraries/Maven__hsqldb_hsqldb_1_8_0_10.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__io_netty_netty_3_7_0_Final.xml b/.idea/libraries/Maven__io_netty_netty_3_7_0_Final.xml new file mode 100644 index 0000000..09b9c46 --- /dev/null +++ b/.idea/libraries/Maven__io_netty_netty_3_7_0_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javassist_javassist_3_4_GA.xml b/.idea/libraries/Maven__javassist_javassist_3_4_GA.xml new file mode 100644 index 0000000..8c61bc4 --- /dev/null +++ b/.idea/libraries/Maven__javassist_javassist_3_4_GA.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_activation_activation_1_1.xml b/.idea/libraries/Maven__javax_activation_activation_1_1.xml new file mode 100644 index 0000000..9aebda8 --- /dev/null +++ b/.idea/libraries/Maven__javax_activation_activation_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_persistence_persistence_api_1_0.xml b/.idea/libraries/Maven__javax_persistence_persistence_api_1_0.xml new file mode 100644 index 0000000..9fa58e6 --- /dev/null +++ b/.idea/libraries/Maven__javax_persistence_persistence_api_1_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_transaction_jta_1_1.xml b/.idea/libraries/Maven__javax_transaction_jta_1_1.xml new file mode 100644 index 0000000..c327d08 --- 
/dev/null +++ b/.idea/libraries/Maven__javax_transaction_jta_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml b/.idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml new file mode 100644 index 0000000..7cf2760 --- /dev/null +++ b/.idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_1.xml b/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_1.xml new file mode 100644 index 0000000..bbb4a3e --- /dev/null +++ b/.idea/libraries/Maven__javax_xml_bind_jaxb_api_2_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__jline_jline_0_9_94.xml b/.idea/libraries/Maven__jline_jline_0_9_94.xml new file mode 100644 index 0000000..88a4b55 --- /dev/null +++ b/.idea/libraries/Maven__jline_jline_0_9_94.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__junit_junit_4_11.xml b/.idea/libraries/Maven__junit_junit_4_11.xml new file mode 100644 index 0000000..5d14edd --- /dev/null +++ b/.idea/libraries/Maven__junit_junit_4_11.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__log4j_log4j_1_2_14.xml b/.idea/libraries/Maven__log4j_log4j_1_2_14.xml new file mode 100644 index 0000000..019e7bd --- /dev/null +++ b/.idea/libraries/Maven__log4j_log4j_1_2_14.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__mysql_mysql_connector_java_5_1_6.xml b/.idea/libraries/Maven__mysql_mysql_connector_java_5_1_6.xml new file mode 100644 index 0000000..37c10e9 --- /dev/null +++ b/.idea/libraries/Maven__mysql_mysql_connector_java_5_1_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end 
of file diff --git a/.idea/libraries/Maven__nekohtml_nekohtml_0_9_5.xml b/.idea/libraries/Maven__nekohtml_nekohtml_0_9_5.xml new file mode 100644 index 0000000..fd52c65 --- /dev/null +++ b/.idea/libraries/Maven__nekohtml_nekohtml_0_9_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_java_dev_jets3t_jets3t_0_7_1.xml b/.idea/libraries/Maven__net_java_dev_jets3t_jets3t_0_7_1.xml new file mode 100644 index 0000000..673fbdb --- /dev/null +++ b/.idea/libraries/Maven__net_java_dev_jets3t_jets3t_0_7_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_jpountz_lz4_lz4_1_2_0.xml b/.idea/libraries/Maven__net_jpountz_lz4_lz4_1_2_0.xml new file mode 100644 index 0000000..6436bb7 --- /dev/null +++ b/.idea/libraries/Maven__net_jpountz_lz4_lz4_1_2_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_sf_ehcache_ehcache_1_2_3.xml b/.idea/libraries/Maven__net_sf_ehcache_ehcache_1_2_3.xml new file mode 100644 index 0000000..bbf9774 --- /dev/null +++ b/.idea/libraries/Maven__net_sf_ehcache_ehcache_1_2_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_sf_ezmorph_ezmorph_1_0_6.xml b/.idea/libraries/Maven__net_sf_ezmorph_ezmorph_1_0_6.xml new file mode 100644 index 0000000..56a5ab6 --- /dev/null +++ b/.idea/libraries/Maven__net_sf_ezmorph_ezmorph_1_0_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_3_2.xml b/.idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_3_2.xml new file mode 100644 index 0000000..338b509 --- /dev/null +++ b/.idea/libraries/Maven__net_sf_jopt_simple_jopt_simple_3_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_sf_kosmosfs_kfs_0_3.xml 
b/.idea/libraries/Maven__net_sf_kosmosfs_kfs_0_3.xml new file mode 100644 index 0000000..9d3d50d --- /dev/null +++ b/.idea/libraries/Maven__net_sf_kosmosfs_kfs_0_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__net_sourceforge_htmlcleaner_htmlcleaner_2_4.xml b/.idea/libraries/Maven__net_sourceforge_htmlcleaner_htmlcleaner_2_4.xml new file mode 100644 index 0000000..54897e3 --- /dev/null +++ b/.idea/libraries/Maven__net_sourceforge_htmlcleaner_htmlcleaner_2_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_avro_avro_1_5_3.xml b/.idea/libraries/Maven__org_apache_avro_avro_1_5_3.xml new file mode 100644 index 0000000..18fa866 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_avro_avro_1_5_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_avro_avro_ipc_1_5_3.xml b/.idea/libraries/Maven__org_apache_avro_avro_ipc_1_5_3.xml new file mode 100644 index 0000000..254c8ec --- /dev/null +++ b/.idea/libraries/Maven__org_apache_avro_avro_ipc_1_5_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_math_2_1.xml b/.idea/libraries/Maven__org_apache_commons_commons_math_2_1.xml new file mode 100644 index 0000000..265e810 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_math_2_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_3.xml b/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_3.xml new file mode 100644 index 0000000..ee6a100 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_client_2_7_0.xml 
b/.idea/libraries/Maven__org_apache_curator_curator_client_2_7_0.xml new file mode 100644 index 0000000..320a667 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_curator_curator_client_2_7_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_framework_2_7_0.xml b/.idea/libraries/Maven__org_apache_curator_curator_framework_2_7_0.xml new file mode 100644 index 0000000..f1d9b6a --- /dev/null +++ b/.idea/libraries/Maven__org_apache_curator_curator_framework_2_7_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_7_0.xml b/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_7_0.xml new file mode 100644 index 0000000..e86e716 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_curator_curator_recipes_2_7_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_1_0_4.xml b/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_1_0_4.xml new file mode 100644 index 0000000..816e22f --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hadoop_hadoop_core_1_0_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_hbase_hbase_0_94_9.xml b/.idea/libraries/Maven__org_apache_hbase_hbase_0_94_9.xml new file mode 100644 index 0000000..409c51f --- /dev/null +++ b/.idea/libraries/Maven__org_apache_hbase_hbase_0_94_9.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_3_5.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_3_5.xml new file mode 100644 index 0000000..8453c07 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_3_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of 
file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_3_2.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_3_2.xml new file mode 100644 index 0000000..cdf0027 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_3_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_8_2_1.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_8_2_1.xml new file mode 100644 index 0000000..4b8a310 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_kafka_kafka_2_10_0_8_2_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_8_2_1.xml b/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_8_2_1.xml new file mode 100644 index 0000000..17c2168 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_kafka_kafka_clients_0_8_2_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_thrift_libthrift_0_8_0.xml b/.idea/libraries/Maven__org_apache_thrift_libthrift_0_8_0.xml new file mode 100644 index 0000000..de76732 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_thrift_libthrift_0_8_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_tika_tika_core_0_9.xml b/.idea/libraries/Maven__org_apache_tika_tika_core_0_9.xml new file mode 100644 index 0000000..2046779 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_tika_tika_core_0_9.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml new file mode 100644 index 0000000..d134fb2 --- /dev/null +++ 
b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_core_8_5_16.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml new file mode 100644 index 0000000..450b4fc --- /dev/null +++ b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_el_8_5_16.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml new file mode 100644 index 0000000..ff7f032 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_tomcat_embed_tomcat_embed_websocket_8_5_16.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_velocity_velocity_1_7.xml b/.idea/libraries/Maven__org_apache_velocity_velocity_1_7.xml new file mode 100644 index 0000000..f002ed9 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_velocity_velocity_1_7.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_6.xml b/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_6.xml new file mode 100644 index 0000000..5eb6e9d --- /dev/null +++ b/.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_codehaus_jackson_jackson_core_asl_1_8_8.xml b/.idea/libraries/Maven__org_codehaus_jackson_jackson_core_asl_1_8_8.xml new file mode 100644 index 0000000..9502227 --- /dev/null +++ b/.idea/libraries/Maven__org_codehaus_jackson_jackson_core_asl_1_8_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__org_codehaus_jackson_jackson_jaxrs_1_8_8.xml b/.idea/libraries/Maven__org_codehaus_jackson_jackson_jaxrs_1_8_8.xml new file mode 100644 index 0000000..5f9ebb8 --- /dev/null +++ b/.idea/libraries/Maven__org_codehaus_jackson_jackson_jaxrs_1_8_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_codehaus_jackson_jackson_mapper_asl_1_0_1.xml b/.idea/libraries/Maven__org_codehaus_jackson_jackson_mapper_asl_1_0_1.xml new file mode 100644 index 0000000..7ce1068 --- /dev/null +++ b/.idea/libraries/Maven__org_codehaus_jackson_jackson_mapper_asl_1_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_codehaus_jackson_jackson_xc_1_8_8.xml b/.idea/libraries/Maven__org_codehaus_jackson_jackson_xc_1_8_8.xml new file mode 100644 index 0000000..cd73942 --- /dev/null +++ b/.idea/libraries/Maven__org_codehaus_jackson_jackson_xc_1_8_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_codehaus_jettison_jettison_1_1.xml b/.idea/libraries/Maven__org_codehaus_jettison_jettison_1_1.xml new file mode 100644 index 0000000..17b8b44 --- /dev/null +++ b/.idea/libraries/Maven__org_codehaus_jettison_jettison_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_eclipse_jdt_core_3_1_1.xml b/.idea/libraries/Maven__org_eclipse_jdt_core_3_1_1.xml new file mode 100644 index 0000000..2beff06 --- /dev/null +++ b/.idea/libraries/Maven__org_eclipse_jdt_core_3_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml b/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml new file mode 100644 index 0000000..13adb58 --- /dev/null +++ b/.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + 
+ \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_ejb3_persistence_1_0_2_GA.xml b/.idea/libraries/Maven__org_hibernate_ejb3_persistence_1_0_2_GA.xml new file mode 100644 index 0000000..ced7444 --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_ejb3_persistence_1_0_2_GA.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_hibernate_3_2_1_ga.xml b/.idea/libraries/Maven__org_hibernate_hibernate_3_2_1_ga.xml new file mode 100644 index 0000000..8de4bb9 --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_hibernate_3_2_1_ga.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_hibernate_annotations_3_4_0_GA.xml b/.idea/libraries/Maven__org_hibernate_hibernate_annotations_3_4_0_GA.xml new file mode 100644 index 0000000..7b54f79 --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_hibernate_annotations_3_4_0_GA.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_hibernate_commons_annotations_3_3_0_ga.xml b/.idea/libraries/Maven__org_hibernate_hibernate_commons_annotations_3_3_0_ga.xml new file mode 100644 index 0000000..99de916 --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_hibernate_commons_annotations_3_3_0_ga.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_hibernate_core_3_3_2_GA.xml b/.idea/libraries/Maven__org_hibernate_hibernate_core_3_3_2_GA.xml new file mode 100644 index 0000000..91f8247 --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_hibernate_core_3_3_2_GA.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_hibernate_entitymanager_3_4_0_GA.xml b/.idea/libraries/Maven__org_hibernate_hibernate_entitymanager_3_4_0_GA.xml new file mode 100644 
index 0000000..47fc00d --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_hibernate_entitymanager_3_4_0_GA.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml b/.idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml new file mode 100644 index 0000000..6469e7f --- /dev/null +++ b/.idea/libraries/Maven__org_hibernate_hibernate_validator_5_3_5_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jamon_jamon_runtime_2_3_1.xml b/.idea/libraries/Maven__org_jamon_jamon_runtime_2_3_1.xml new file mode 100644 index 0000000..a0639ce --- /dev/null +++ b/.idea/libraries/Maven__org_jamon_jamon_runtime_2_3_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml b/.idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml new file mode 100644 index 0000000..bbe33d8 --- /dev/null +++ b/.idea/libraries/Maven__org_jboss_logging_jboss_logging_3_3_1_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jboss_netty_netty_3_2_4_Final.xml b/.idea/libraries/Maven__org_jboss_netty_netty_3_2_4_Final.xml new file mode 100644 index 0000000..ef65ddf --- /dev/null +++ b/.idea/libraries/Maven__org_jboss_netty_netty_3_2_4_Final.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jdom_jdom2_2_0_6.xml b/.idea/libraries/Maven__org_jdom_jdom2_2_0_6.xml new file mode 100644 index 0000000..a6351fa --- /dev/null +++ b/.idea/libraries/Maven__org_jdom_jdom2_2_0_6.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jruby_jruby_complete_1_6_5.xml b/.idea/libraries/Maven__org_jruby_jruby_complete_1_6_5.xml 
new file mode 100644 index 0000000..8fb36d0 --- /dev/null +++ b/.idea/libraries/Maven__org_jruby_jruby_complete_1_6_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jsoup_jsoup_1_7_3.xml b/.idea/libraries/Maven__org_jsoup_jsoup_1_7_3.xml new file mode 100644 index 0000000..53ac5b9 --- /dev/null +++ b/.idea/libraries/Maven__org_jsoup_jsoup_1_7_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26.xml b/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26.xml new file mode 100644 index 0000000..53c7ef5 --- /dev/null +++ b/.idea/libraries/Maven__org_mortbay_jetty_jetty_6_1_26.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_jetty_util_6_1_26.xml b/.idea/libraries/Maven__org_mortbay_jetty_jetty_util_6_1_26.xml new file mode 100644 index 0000000..4f5aa5a --- /dev/null +++ b/.idea/libraries/Maven__org_mortbay_jetty_jetty_util_6_1_26.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_jsp_2_1_6_1_14.xml b/.idea/libraries/Maven__org_mortbay_jetty_jsp_2_1_6_1_14.xml new file mode 100644 index 0000000..d6e455b --- /dev/null +++ b/.idea/libraries/Maven__org_mortbay_jetty_jsp_2_1_6_1_14.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_jsp_api_2_1_6_1_14.xml b/.idea/libraries/Maven__org_mortbay_jetty_jsp_api_2_1_6_1_14.xml new file mode 100644 index 0000000..27879d6 --- /dev/null +++ b/.idea/libraries/Maven__org_mortbay_jetty_jsp_api_2_1_6_1_14.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_20081211.xml b/.idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_20081211.xml new file mode 
100644 index 0000000..20f3e35 --- /dev/null +++ b/.idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_20081211.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_6_1_14.xml b/.idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_6_1_14.xml new file mode 100644 index 0000000..44bbab0 --- /dev/null +++ b/.idea/libraries/Maven__org_mortbay_jetty_servlet_api_2_5_6_1_14.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mybatis_mybatis_3_1_1.xml b/.idea/libraries/Maven__org_mybatis_mybatis_3_1_1.xml new file mode 100644 index 0000000..8844867 --- /dev/null +++ b/.idea/libraries/Maven__org_mybatis_mybatis_3_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_4.xml b/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_4.xml new file mode 100644 index 0000000..298b490 --- /dev/null +++ b/.idea/libraries/Maven__org_scala_lang_scala_library_2_10_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml new file mode 100644 index 0000000..28e52ba --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml new file mode 100644 index 0000000..0f3f4c4 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml b/.idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml new file mode 100644 
index 0000000..899d14f --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_log4j_over_slf4j_1_7_25.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_api_1_6_1.xml b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_6_1.xml new file mode 100644 index 0000000..797e3b4 --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_api_1_6_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_6_1.xml b/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_6_1.xml new file mode 100644 index 0000000..17f504a --- /dev/null +++ b/.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_6_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml new file mode 100644 index 0000000..7925ee5 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml new file mode 100644 index 0000000..9bfbfdb --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_autoconfigure_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml new file mode 100644 index 0000000..03671b5 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No 
newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml new file mode 100644 index 0000000..e901e8e --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_logging_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml new file mode 100644 index 0000000..26897bf --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_tomcat_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml new file mode 100644 index 0000000..9cb55af --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_boot_spring_boot_starter_web_1_5_6_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml new file mode 100644 index 0000000..45449e1 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_aop_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml new file mode 100644 index 0000000..3a34e64 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_beans_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + 
+ + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml new file mode 100644 index 0000000..a90e953 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_context_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml new file mode 100644 index 0000000..15923ad --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_core_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml new file mode 100644 index 0000000..071c63d --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_expression_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml new file mode 100644 index 0000000..75afec0 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_web_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml b/.idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml new file mode 100644 index 0000000..1a7f1d3 --- /dev/null +++ b/.idea/libraries/Maven__org_springframework_spring_webmvc_4_3_10_RELEASE.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_0_4_1.xml b/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_0_4_1.xml new file mode 100644 index 0000000..9e6fd51 --- /dev/null +++ b/.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_0_4_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml b/.idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml new file mode 100644 index 0000000..29798fa --- /dev/null +++ b/.idea/libraries/Maven__org_yaml_snakeyaml_1_17.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__oro_oro_2_0_8.xml b/.idea/libraries/Maven__oro_oro_2_0_8.xml new file mode 100644 index 0000000..0e0021d --- /dev/null +++ b/.idea/libraries/Maven__oro_oro_2_0_8.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__redis_clients_jedis_2_6_0.xml b/.idea/libraries/Maven__redis_clients_jedis_2_6_0.xml new file mode 100644 index 0000000..40ccb16 --- /dev/null +++ b/.idea/libraries/Maven__redis_clients_jedis_2_6_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__stax_stax_api_1_0_1.xml b/.idea/libraries/Maven__stax_stax_api_1_0_1.xml new file mode 100644 index 0000000..2670168 --- /dev/null +++ b/.idea/libraries/Maven__stax_stax_api_1_0_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__tomcat_jasper_compiler_5_5_12.xml b/.idea/libraries/Maven__tomcat_jasper_compiler_5_5_12.xml new file mode 100644 index 0000000..3c67312 --- /dev/null +++ b/.idea/libraries/Maven__tomcat_jasper_compiler_5_5_12.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__tomcat_jasper_runtime_5_5_12.xml b/.idea/libraries/Maven__tomcat_jasper_runtime_5_5_12.xml new file mode 100644 index 
0000000..90fb110 --- /dev/null +++ b/.idea/libraries/Maven__tomcat_jasper_runtime_5_5_12.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__xalan_serializer_2_7_1.xml b/.idea/libraries/Maven__xalan_serializer_2_7_1.xml new file mode 100644 index 0000000..cc926a4 --- /dev/null +++ b/.idea/libraries/Maven__xalan_serializer_2_7_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__xalan_xalan_2_7_1.xml b/.idea/libraries/Maven__xalan_xalan_2_7_1.xml new file mode 100644 index 0000000..5f1e78f --- /dev/null +++ b/.idea/libraries/Maven__xalan_xalan_2_7_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__xerces_xercesImpl_2_9_1.xml b/.idea/libraries/Maven__xerces_xercesImpl_2_9_1.xml new file mode 100644 index 0000000..dca351c --- /dev/null +++ b/.idea/libraries/Maven__xerces_xercesImpl_2_9_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__xerces_xmlParserAPIs_2_6_2.xml b/.idea/libraries/Maven__xerces_xmlParserAPIs_2_6_2.xml new file mode 100644 index 0000000..47d5a41 --- /dev/null +++ b/.idea/libraries/Maven__xerces_xmlParserAPIs_2_6_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__xml_apis_xml_apis_1_3_04.xml b/.idea/libraries/Maven__xml_apis_xml_apis_1_3_04.xml new file mode 100644 index 0000000..d37192f --- /dev/null +++ b/.idea/libraries/Maven__xml_apis_xml_apis_1_3_04.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__xmlenc_xmlenc_0_52.xml b/.idea/libraries/Maven__xmlenc_xmlenc_0_52.xml new file mode 100644 index 0000000..4cfced5 --- /dev/null +++ b/.idea/libraries/Maven__xmlenc_xmlenc_0_52.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml 
b/.idea/misc.xml new file mode 100644 index 0000000..aacd5e9 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,11 @@ + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..d0d23da --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000..e96534f --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..8e06818 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,164 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1698050750617 + + + + + + \ No newline at end of file diff --git a/.project b/.project new file mode 100644 index 0000000..e3cf2b3 --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + ParsePlugin2.4 + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..8d1a8b2 --- /dev/null +++ b/pom.xml @@ -0,0 +1,793 @@ + + + + 4.0.0 + + com.bfd + ParsePlugin2.4 + 0.0.1-SNAPSHOT + + + org.springframework.boot + spring-boot-starter-parent + 1.5.6.RELEASE + + ParsePlugin2.4 + + http://www.example.com + + + UTF-8 + 1.8 + 1.8 + + + + + junit + junit + 4.11 + test + + + org.springframework.boot + spring-boot-starter-web + + + antlr + antlr + 2.7.6 + + + asm + asm + 1.5.3 + + + asm + asm-attrs + 1.5.3 + + + c3p0 + c3p0 + 0.9.1.2 + + + cglib + cglib 
+ 2.1_3 + + + commons-beanutils + commons-beanutils + 1.7.0 + + + commons-cli + commons-cli + 1.2 + + + commons-codec + commons-codec + 1.4 + + + commons-collections + commons-collections + 3.1 + + + commons-configuration + commons-configuration + 1.6 + + + commons-el + commons-el + 1.0 + + + commons-io + commons-io + 1.4 + + + commons-lang + commons-lang + 2.4 + + + commons-logging + commons-logging + 1.1.1 + + + commons-logging + commons-logging-api + 1.0.4 + + + commons-net + commons-net + 1.4.1 + + + org.apache.commons + commons-pool2 + 2.3 + + + org.apache.curator + curator-client + 2.7.0 + + + org.apache.curator + curator-framework + 2.7.0 + + + org.apache.curator + curator-recipes + 2.7.0 + + + dom4j + dom4j + 1.6.1 + + + net.sf.ehcache + ehcache + 1.2.3 + + + org.hibernate + ejb3-persistence + 1.0.2.GA + pom + + + net.sf.ezmorph + ezmorph + 1.0.6 + + + com.alibaba + fastjson + 1.1.22 + + + com.google.guava + guava + 14.0.1 + + + org.apache.hadoop + hadoop-core + 1.0.4 + + + org.hamcrest + hamcrest-core + 1.3 + + + org.apache.hbase + hbase + 0.94.9 + + + org.hibernate + hibernate + 3.2.1.ga + + + org.hibernate + hibernate-annotations + 3.4.0.GA + + + org.hibernate + hibernate-commons-annotations + 3.3.0.ga + + + org.hibernate + hibernate-entitymanager + 3.4.0.GA + + + org.hibernate + hibernate-core + 3.3.2.GA + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.4 + + + org.apache.httpcomponents + httpclient + 4.3.5 + + + org.apache.httpcomponents + httpcore + 4.3.2 + + + + com.fasterxml.jackson.core + jackson-annotations + 2.0.0 + + + com.fasterxml.jackson.core + jackson-core + 2.0.0 + + + com.fasterxml.jackson.core + jackson-databind + 2.0.0 + + + javassist + javassist + 3.4.GA + + + redis.clients + jedis + 2.6.0 + + + jline + jline + 0.9.94 + + + com.wandoulabs.jodis + jodis + 0.1.2 + + + org.jsoup + jsoup + 1.7.3 + + + org.apache.kafka + kafka_2.10 + 0.8.2.1 + + + org.apache.kafka + kafka-clients + 0.8.2.1 + + + com.yammer.metrics + metrics-core + 2.2.0 + 
+ + org.mybatis + mybatis + 3.1.1 + + + + mysql + mysql-connector-java + 5.1.6 + + + + + nekohtml + nekohtml + 0.9.5 + + + javax.persistence + persistence-api + 1.0 + + + com.google.protobuf + protobuf-java + 2.4.1 + + + org.scala-lang + scala-library + 2.10.4 + + + org.slf4j + slf4j-api + 1.6.1 + + + org.slf4j + slf4j-log4j12 + 1.6.1 + + + org.xerial.snappy + snappy-java + 1.0.4.1 + + + org.apache.tika + tika-core + 0.9 + + + xalan + xalan + 2.7.1 + + + xerces + xercesImpl + 2.9.1 + + + xerces + xercesImpl + 2.9.1 + + + xml-apis + xml-apis + 1.0.b2 + + + xml-apis + xml-apis + 1.3.04 + + + xmlenc + xmlenc + 0.52 + + + xerces + xmlParserAPIs + 2.6.2 + + + com.101tec + zkclient + 0.3 + + + org.apache.zookeeper + zookeeper + 3.4.6 + + + javax.transaction + jta + 1.1 + + + log4j + log4j + 1.2.14 + + + + bfd_crawler_slice_configure_3.2.1 + bfd_crawler_slice_configure_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_configure_3.2.1.jar + + + bfd_crawler_slice_datasaver_3.2.1 + bfd_crawler_slice_datasaver_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_datasaver_3.2.1.jar + + + bfd_crawler_slice_deduplicator_3.2.1 + bfd_crawler_slice_deduplicator_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_deduplicator_3.2.1.jar + + + bfd_crawler_slice_download_3.2.1 + bfd_crawler_slice_download_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_download_3.2.1.jar + + + bfd_crawler_slice_jsEngine_3.2.1 + bfd_crawler_slice_jsEngine_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_jsEngine_3.2.1.jar + + + bfd_crawler_slice_kafkaproxy_3.2.1 + bfd_crawler_slice_kafkaproxy_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_kafkaproxy_3.2.1.jar + + + bfd_crawler_slice_loginmanager_3.2.1 + bfd_crawler_slice_loginmanager_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_loginmanager_3.2.1.jar + + + bfd_crawler_slice_msgQueue_3.2.1 + 
bfd_crawler_slice_msgQueue_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_msgQueue_3.2.1.jar + + + bfd_crawler_slice_pageparser_3.2.1 + bfd_crawler_slice_pageparser_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_pageparser_3.2.1.jar + + + bfd_crawler_slice_resource_3.2.1 + bfd_crawler_slice_resource_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_resource_3.2.1.jar + + + bfd_crawler_slice_scheduler_3.2.1 + bfd_crawler_slice_scheduler_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_scheduler_3.2.1.jar + + + bfd_crawler_slice_statistics_3.2.1 + bfd_crawler_slice_statistics_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_statistics_3.2.1.jar + + + bfd_crawler_slice_traceservice_3.2.1 + bfd_crawler_slice_traceservice_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_traceservice_3.2.1.jar + + + bfd_crawler_slice_urlHandler_3.2.1 + bfd_crawler_slice_urlHandler_3.2.1 + 1.0.0 + system + ${project.basedir}/../jarlib/bfd_crawler_slice_urlHandler_3.2.1.jar + + + BfdCrawlMonitor-1.4 + BfdCrawlMonitor-1.4 + 1.0.0 + system + ${project.basedir}/../jarlib/BfdCrawlMonitor-1.5.jar + + + BfdRedisTools-2.0 + BfdRedisTools-2.0 + 1.0.0 + system + ${project.basedir}/../jarlib/BfdRedisTools-2.0.jar + + + BfdSimpleCrypto + BfdSimpleCrypto + 1.0.0 + system + ${project.basedir}/../jarlib/BfdSimpleCrypto.jar + + + CharsetDetector + CharsetDetector + 1.0.0 + system + ${project.basedir}/../jarlib/CharsetDetector.jar + + + crawler-shard-jdbc-v2.0.1.20150311 + crawler-shard-jdbc-v2.0.1.20150311 + 1.0.0 + system + ${project.basedir}/../jarlib/crawler-shard-jdbc-v2.0.1.20150311.jar + + + fastdfs_client + fastdfs_client + 1.0.0 + system + ${project.basedir}/../jarlib/fastdfs_client.jar + + + fastdht_client + fastdht_client + 1.0.0 + system + ${project.basedir}/../jarlib/fastdht_client.jar + + + IceGrid + IceGrid + 1.0.0 + system + 
${project.basedir}/../jarlib/IceGrid.jar + + + Ice + Ice + 1.0.0 + system + ${project.basedir}/../jarlib/Ice.jar + + + im4java-1.3.2 + im4java-1.3.2 + 1.0.0 + system + ${project.basedir}/../jarlib/im4java-1.3.2.jar + + + jackson-all-1.6.4 + jackson-all-1.6.4 + 1.0.0 + system + ${project.basedir}/../jarlib/jackson-all-1.6.4.jar + + + json-jena-1.0 + json-jena-1.0 + 1.0.0 + system + ${project.basedir}/../jarlib/json-jena-1.0.jar + + + jta-1.1 + jta-1.1 + 1.0.0 + system + ${project.basedir}/../jarlib/jta-1.1.jar + + + kafka8-1.0.0 + kafka8-1.0.0 + 1.0.0 + system + ${project.basedir}/../jarlib/kafka-0.10.jar + + + metrics-annotation-2.2.0 + metrics-annotation-2.2.0 + 1.0.0 + system + ${project.basedir}/../jarlib/metrics-annotation-2.2.0.jar + + + metrics-core-2.2.0 + metrics-core-2.2.0 + 1.0.0 + system + ${project.basedir}/../jarlib/metrics-core-2.2.0.jar + + + MlinkSlice-2.0 + MlinkSlice-2.0 + 1.0.0 + system + ${project.basedir}/../jarlib/MlinkSlice-2.0.jar + + + org.apache.sling.commons.html-1.0.0 + org.apache.sling.commons.html-1.0.0 + 1.0.0 + system + ${project.basedir}/../jarlib/org.apache.sling.commons.html-1.0.0.jar + + + utils-0.0.1-crawlhuafen + utils-0.0.1-crawlhuafen + 1.0.0 + system + ${project.basedir}/../jarlib/utils-0.0.1-crawlhuafen.jar + + + utils-0.0.1-SNAPSHOT + utils-0.0.1-SNAPSHOT + 1.0.0 + system + ${project.basedir}/../jarlib/utils-0.0.1-SNAPSHOT.jar + + + utils-3.0.0 + utils-3.0.0 + 1.0.0 + system + ${project.basedir}/../jarlib/utils-3.0.0.jar + + + xml-apis-1.0.b2 + xml-apis-1.0.b2 + 1.0.0 + system + ${project.basedir}/../jarlib/xml-apis-1.0.b2.jar + + + dom4j-2.0.0-ALPHA-2 + dom4j-2.0.0-ALPHA-2 + 1.0.0 + system + ${project.basedir}/../jarlib/dom4j-2.0.0-ALPHA-2.jar + + + crawler-msgqueue-3.2.2 + crawler-msgqueue-3.2.2 + 1.0.0 + system + ${project.basedir}/../jarlib/crawler-msgqueue-3.2.2.jar + + + DownloadUtil + DownloadUtil + 1.0.0 + system + ${project.basedir}/../jarlib/DownloadUtil.jar + + + crawler-download-2.3 + crawler-download-2.3 + 
1.0.0 + system + ${project.basedir}/../jarlib/crawler-download-2.3.jar + + + okhttp-3.9.0 + okhttp-3.9.0 + 1.0.0 + system + ${project.basedir}/../jarlib/okhttp-3.9.0.jar + + + okio-1.11.0 + okio-1.11.0 + 1.0.0 + system + ${project.basedir}/../jarlib/okio-1.11.0.jar + + + DownloadFileUtil + DownloadFileUtil + 1.0.0 + system + ${project.basedir}/../jarlib/DownloadFileUtil.jar + + + scala-library + scala-library + 1.0.0 + system + ${project.basedir}/../jarlib/scala-library.jar + + + itext-2.1.7 + itext-2.1.7 + 1.0.0 + system + ${project.basedir}/../jarlib/itext-2.1.7.jar + + + + + + + + + maven-clean-plugin + 3.1.0 + + + + maven-resources-plugin + 3.0.2 + + + maven-compiler-plugin + 3.8.0 + + + maven-surefire-plugin + 2.22.1 + + + maven-jar-plugin + 3.0.2 + + + maven-install-plugin + 2.5.2 + + + maven-deploy-plugin + 2.8.2 + + + + maven-site-plugin + 3.7.1 + + + maven-project-info-reports-plugin + 3.0.0 + + + + + org.springframework.boot + spring-boot-maven-plugin + + com.bfd.main.Application + ZIP + + + ${project.groupId} + ${project.artifactId} + + + + + + + repackage + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy + package + + copy-dependencies + + + jar + jar + runtime + ${project.build.directory}/libs + + + + + + + + diff --git a/src/com/bfd/parse/AutoRuleParser.java b/src/com/bfd/parse/AutoRuleParser.java new file mode 100644 index 0000000..6cff053 --- /dev/null +++ b/src/com/bfd/parse/AutoRuleParser.java @@ -0,0 +1,271 @@ +package com.bfd.parse; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.log4j.PropertyConfigurator; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; 
+import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import com.bfd.parse.client.URLNormalizerClient; +import com.bfd.parse.config.iid.ParseCfg; +import com.bfd.parse.config.iid.ParseConfigure; +import com.bfd.parse.config.iid.ParseReProcessor; +//import com.bfd.parse.data.ParseRule; +//import com.bfd.parse.data.ParseRuleMap; +//import com.bfd.parse.data.TitleRule; +//import com.bfd.parse.data.TitleRuleMap; +//import com.bfd.parse.data.Website; +//import com.bfd.parse.data.WebsiteMap; +//import com.bfd.parse.facade.parseunit.ParseUnit; +//import com.bfd.parse.service.PageRuleParser; +//import com.bfd.crawler.utils.ConfigUtils; +//import com.bfd.parse.util.JsonUtil; +//import com.bfd.parse.utils.HttpUtil; +//import com.bfd.parse.utils.UrlUtils; +//import com.bfd.parse.utils.XPathUtil; + +public class AutoRuleParser { + +// private static final Log LOG = LogFactory.getLog(AutoRuleParser.class); +// private URLNormalizerClient normalizer = new URLNormalizerClient(); +// private static final Pattern searchLinkPattern = Pattern.compile("search", Pattern.CASE_INSENSITIVE); +// +// private static final Pattern[] urlExcludeFilter = { +// Pattern.compile("redirect\\.php\\?tid=.*?(&goto=).*", Pattern.CASE_INSENSITIVE), +// Pattern.compile( +// ConfigUtils +// .getInstance() +// .getProp( +// "url.binFileRegEx", +// ".*?\\.(jpg|bmp|gif|png|ico|jpeg|docx?|xlsx?|pptx?|iso|mpg|mpeg|rmvb|mp4|mov|swf|mp3|vob|avi|mkv|asf|wmv|wma|rm|wav|mid|flv|3gp|tiff|psd|mka|mpg|mpe|rss|zip|rar|z|txt|jar|tar|wps|pdf|exe|bin|vod|dotx?|chm|rpm)$"), +// Pattern.CASE_INSENSITIVE) }; +// +// private String thdName; +//// private ParserFace parseFace; +// +// public AutoRuleParser(String workname) { +// this.thdName = workname; +//// this.parseFace = parseFace; +// } +// +// public String getThdName() { +// return thdName; +// } +// +// public Map parse(ParseUnit unit) { +// if (unit.isList()) { +// return getUrls(unit.getPageData(), unit.getUrl(), unit.getPageType(), unit.getCid()); +// } else { 
+// return parse(unit.getPageData(), unit.getUrl(), unit.getCid()); +// } +// } +// +// private Map parse(String pageData, String url, String cid) { +// LOG.info(thdName + " begin to auto parse info page, url->" + url); +// Website website = WebsiteMap.INSTANCE.get(cid); +// boolean clean = (website == null) ? false : website.getCleantag() == 1; +// List titleRules = ParseRuleMap.getTitleParseRules(cid); +// List contentRules = ParseRuleMap.getContentParseRules(cid); +// List rule = TitleRuleMap.getRule(cid); +// List breadRules = ParseRuleMap.getBreadParseRules(cid); +// List dateRules = ParseRuleMap.getDateParseRules(cid); +// List nodes = new ArrayList(); +// LOG.info(thdName + " begin to auto parse info page,PageRuleParser parse, url->" + url); +// Map data = PageRuleParser.parse(pageData, cid, url, clean, titleRules, contentRules, +// breadRules, dateRules, rule, nodes); +// if (nodes != null && nodes.size() > 0) { +// Map imageMap = getContentLinks(nodes.get(0), url, cid); +// data.putAll(imageMap); +// } +// LOG.info(thdName + " finished auto parsing info page, url->" + url); +// return data; +// } +// +// private Map getContentLinks(Node contentNode, String url, String cid) { +// List> imgtasks = new ArrayList>(); +// Map result = new HashMap(); +// Map> contentImgs = new HashMap>(); +// int idx = 0;// getVideoImage(contentNode, imgtasks, contentImgs); +// NodeList nodes = XPathUtil.searchNodeList(contentNode, ".//IMG"); +// if (nodes == null || nodes.getLength() == 0) +// return result; +// +// for (int i = 0; i < nodes.getLength(); i++) { +// Node imgNode = nodes.item(i); +// String rawImg = XPathUtil.getImageLink(imgNode); +// if (StringUtils.isEmpty(rawImg)) { +// continue; +// } +// String imgUrl = null; +// imgUrl = this.normalizer.normalize(cid, "img", rawImg, url, true); +// if (StringUtils.isEmpty(imgUrl)) { +// LOG.warn(thdName + " normalized image failed, rawImg=" + rawImg + ", url=" + url); +// continue; +// } +// String imgTag = "img_" + idx; 
+// idx++; +// Map taskImg = new HashMap(); +// taskImg.put("img", imgUrl); +// taskImg.put("rawimg", rawImg); +// taskImg.put("imgtag", imgTag); +// imgtasks.add(taskImg); +// +// Map contentImg = new HashMap(); +// contentImg.put("img", imgUrl); +// contentImg.put("rawimg", rawImg); +// contentImg.put("imgtag", imgTag); +// contentImgs.put(imgTag, contentImg); +// contentImgs.put(rawImg, contentImg); +// } +// if (imgtasks.size() > 0) { +// result.put("imgtasks", imgtasks); +// result.put("contentimgs", contentImgs); +// } +// return result; +// } +// +// private Map getUrls(String pageData, String url, String type, String cid) { +// LOG.info(thdName + " autoparse parse listpage, begin to get urls, url -> " + url); +// Map data = new HashMap(); +// Document doc = Jsoup.parse(pageData); +// data.put("items", getInternalLinks(cid, url, type, doc)); +// return data; +// } +// +// private List> getInternalLinks(String cid, String baseurl, String type, Document doc) { +// Elements linkElements = doc.select("a"); +// List> urlList = new ArrayList>(); +// Set urlDepSet = new HashSet(); +// int cnt = 0; +// Integer iidtype = WebsiteMap.INSTANCE.get(cid).getIidtype(); +// for (; cnt < linkElements.size(); cnt++) { +// Element element = linkElements.get(cnt); +// String rawUrl = element.attr("href"); +// +// if (StringUtils.isEmpty(rawUrl)) // 过滤为空的链接 +// continue; +// rawUrl = rawUrl.trim(); +// +// final String text = element.text(); // 过滤无文字链接 +// if (StringUtils.isEmpty(text)) { +// LOG.debug(getThdName() + " there is no text between tag, will skip it, url=" + rawUrl); +// continue; +// } +// +// if (rawUrl.startsWith("javascript") || rawUrl.startsWith("#")) // 过滤为javascript链接 +// continue; +// +// Elements children = element.children(); // 直接图片的url不做抽取 +// if (children != null && children.size() > 0 && "img".equalsIgnoreCase(children.first().nodeName())) { +// LOG.debug(getThdName() + " url is a img link, will skip it, url=" + rawUrl); +// continue; +// } +// +// 
boolean exclude = false; +// for (Pattern p : urlExcludeFilter) { +// if (p.matcher(rawUrl).find()) { +// LOG.debug(getThdName() + " urlExcludeFilter filte url unexcepted, will skip it, url=" + rawUrl); +// exclude = true; +// break; +// } +// } +// if (exclude) { +// continue; +// } +// LOG.debug(thdName + " begin to normalize, rawUrl=" + rawUrl + ", baseurl=" + baseurl); +// Map resMap = this.normalizer.normalizeExt(cid, type, rawUrl, baseurl, true, +// false); +// Integer code = (Integer) resMap.get("code"); +// String url = (String) resMap.get("url"); +// String itemiid = (String) resMap.get("bfdiid"); +// if (code == null || (code != 0 && code != 3) || StringUtils.isEmpty(url) || StringUtils.isEmpty(itemiid)) { +// LOG.debug(getThdName() + " normalized error code, or link is empty."); +// continue; +// } +// if (urlDepSet.contains(url) || !UrlUtils.isInternalLink(url, baseurl) || baseurl.equalsIgnoreCase(url) +// || isSearchLink(url)) +// continue; +// +// // iidtype为1时,使用iid规则 +// if (iidtype != null && iidtype == 1) { // 0-> md5, 1->iid规则 +// ParseCfg config = ParseConfigure.getInstance().getParseConfig(cid); +// if (config == null) { +// LOG.warn(getThdName() + " iid rule empty, cid -> " + cid); +// continue; +// } +// Pattern pattern = config.getIidPattern(); +// itemiid = ParseReProcessor.parseIid(pattern, url); +// if (StringUtils.isEmpty(itemiid)) { +// LOG.warn(getThdName() + " url match iid rule failed, url -> " + url); +// continue; +// } +// LOG.info(getThdName() + " got iid from url by iidrule, itemiid->" + itemiid + ", url->" + url); +// } +// +// urlDepSet.add(url); +// Map itemMap = new HashMap(); +// Map linkMap = new HashMap(); +// +// linkMap.put("link", url); +// linkMap.put("type", "info"); +// linkMap.put("rawlink", rawUrl); +// itemMap.put("itemiid", itemiid); +// itemMap.put("itemlink", linkMap); +// itemMap.put("itemname", text.trim()); +// urlList.add(itemMap); +// } +// LOG.info(getThdName() + " got internal links size=" + 
urlList.size() + ", all=" + (cnt - 1)); +// return urlList; +// } +// +// /** +// * 链接的path部分是否为search形式 +// */ +// protected boolean isSearchLink(String url) { +// try { +// String path = UrlUtils.getPath(url); +// if (StringUtils.isEmpty(path)) { +// return false; +// } +// return searchLinkPattern.matcher(path).find(); +// } catch (Exception e) { +// LOG.warn("exception while judge search link, url=" + url, e); +// } +// return true; +// } +// +// public static void main(String[] args) { +// ParserFace parseFace = new ParserFace(""); +// String url = "http://www.cnbeta.com/articles/80159.htm"; +// String normalize = parseFace.getNormalizerClient().normalize("Czhongwenyjzx", "img", +// "http://static.cnbetacdn.com/upimg/100510/zhangxiaolu_191809588165534.jpg", url, true); +// System.out.println(normalize); +// } +// +// public static void main2(String[] args) { +// PropertyConfigurator.configure("log4j.properties"); +// String url = "http://www.yseeker.com/archives/3295.html"; +// String pageData = HttpUtil.getHtml(url); +// String cid = "Cpinweiyahu"; +// // System.out.println(Parser.getPageType(pageData, cid, url, false)); +//// AutoRuleParser autoParser = new AutoRuleParser("", new ParserFace("")); +//// // Map data = autoParser.getUrls(pageData, url, +//// // "list", +//// // cid); +//// Map data = autoParser.parse(pageData, url, cid); +//// System.out.println(JsonUtil.toJSONString(data)); +// } +} diff --git a/src/com/bfd/parse/Constants.java b/src/com/bfd/parse/Constants.java new file mode 100644 index 0000000..512a522 --- /dev/null +++ b/src/com/bfd/parse/Constants.java @@ -0,0 +1,154 @@ +package com.bfd.parse; + +public class Constants { + public static final String ENDTIME = "end_time"; + public static final String LOCATION = "location"; + //品牌 + public static final String BRAND_NAME = "brand_name"; + //list页的brand + public static final String LIST_BRAND = "list_brand"; + //品类 + public static final String CATE = "cate"; + //list页的cate + public static 
final String LIST_CATE = "list_cate"; + //描述 + public static final String DESC = "description"; + //型号 + public static final String MODEL_TYPE = "model_type"; + //属性 + public static final String PROPERTIES = "properties"; + //商家信息 + public static final String SHOP_DATA= "ShopData"; + //商家信息转为map类型信息 + public static final String SHOP_PROPERTIES = "shop_properties"; + //简介 + public static final String BRIEF = "brief"; + //本站价格 + public static final String PRICE_LOW = "price_low"; + public static final String PRICE_HIGH = "price_high"; + //市场价格 + public static final String MARKET_PRICE_LOW = "market_price_low"; + public static final String MARKET_PRICE_HIGH = "market_price_high"; + //商家名称 + public static final String SHOP_NAME = "shop_name"; + //好评 + public static final String goodCount = "goodCount"; + //一般评价 + public static final String generalCount = "generalCount"; + //差评 + public static final String poorCount = "poorCount"; + //评论标签 + public static final String IMPRESS_PROPERTIES = "impress"; + //月销量 + public static final String MONTH_SELL_COUNT = "month_sell_count"; + //累计销量 + public static final String CONFIRM_GOODS = "confirmGoods"; + //累计评价 + public static final String FEED_COUNT = "feedcount"; + //库存 + public static final String QUANTITY = "quantity"; + + public static final String promotion = "promotion"; + + public static final String buyer = "buyer"; + public static final String buyer_level = "buyer_level"; + public static final String price = "price"; + public static final String price_str = "price_str"; + public static final String num = "num"; + + public static final String done_time = "done_time"; + public static final String sku_data = "sku_data"; + public static final String buyer_items = "buyer_items"; + public static final String oldest_sold_time = "oldest_sold_time"; + public static final String last_crawl_time= "last_crawl_time"; + public static final String ajax_page_field = "ajax_page_field"; + public static final String total_old = 
"total_old"; + public static final String total_new = "total_new"; + public static final String crawled_pages = "crawled_pages"; + public static final String ajaxext = "ajaxext"; + public static final String task_create_time = "task_create_time"; + public static final String month_sold = "month_sold"; + public static final String last_soldlist_time = "last_soldlist_time"; + public static final String buyer_page_info = "buyer_page_info"; + public static final String noNext = "noNext"; + public static final String dendtime = "dendtime"; + + public static final String pageNumThisTimes = "pageNumThisTimes"; + + public static final String cid = "cid"; + + public static final int status_plugin_ready = 2; + + public static final String tmptasktag = "tmptasktag"; + + public static final String topicSummaries = "topicSummaries"; + + public static final String reply_cnt = "reply_cnt"; + public static final String view_cnt = "view_cnt"; + public static final String topicid = "topicid"; + + public static final String itemiid = "itemiid"; + public static final String replys = "replys"; + public static final String reply_author_essay = "reply_author_essay"; + public static final String reply_author_regtime = "reply_author_regtime"; + public static final String reply_author_postcnt = "reply_author_postcnt"; + public static final String reply_floor = "reply_floor"; + public static final String reply_time = "reply_time"; + public static final String reply_author_replycnt = "reply_author_replycnt"; + + public static final String reply_author_city = "reply_author_city"; + + + public static final String list_replys = "replys"; + public static final String list_views = "views"; + public static final String list_posttime = "posttime"; + public static final String posttime = "posttime"; + + public static final String tablename = "tablename"; + + public static final String userId = "userId"; + + public static final String newstime = "newstime"; + public static final String iceTimeout = 
"Ice.Override.Timeout"; + public static final String parseRsQueueName = "parseQueueTopic"; + public static final String stateQueueName = "statQueueName"; + + public static final String pluginType_preprocess = "0"; + public static final String pluginType_json = "1"; + public static final String pluginType_reprocess = "2"; +// public static final String pluginType_preprocess = "3"; + + public static String host = ""; + public static final int kfk_read_thread_num = 2; + public static final String kafka_consumer_group = "group_parser"; + + public static final String IID = "iid"; + public static final String purl = "purl"; + public static final String pageidx = "pageidx"; + public static final String length = "length"; + public static final String taskid = "taskid"; + + public static final String linktype = "linktype"; + public static final String link = "link"; + public static final String rawlink = "rawlink"; + public static final String errMsg = "errMsg"; +// public static fin +// public static final String kafka_consumer_group = "group_parser"; +// public static final String kafka_consumer_group = "group_parser"; + public static final String parsecode = "parsecode"; + public static final String code = "code"; + public static final String getiidsuccess = "getiidsuccess"; + public static final String getiidfail = "getiidfail"; + public static final String md5Num = "md5Num"; + public static final String iidtag = "iidtag"; + public static final String tasks = "tasks"; + + public static final String category = "category"; + public static final String nextpage = "nextpage"; + public static final String URL = "url"; + + public static final String siteid = "siteid"; + public static final String HTTPHEADER_LOCATION = "location"; + public static final String CHARSET_UTF8 = "UTF8"; + +} diff --git a/src/com/bfd/parse/Constants_TraceTask.java b/src/com/bfd/parse/Constants_TraceTask.java new file mode 100644 index 0000000..6444b39 --- /dev/null +++ 
b/src/com/bfd/parse/Constants_TraceTask.java @@ -0,0 +1,43 @@ +package com.bfd.parse; +/** String constants for parse-task tracing: record field names followed by event/stage names. Each value equals its identifier except parseAjaxData and chkAjaxData, whose values are all lower-case. */ +public class Constants_TraceTask { + /* NOTE(review): presumably the field names of a trace record; the consumer is not visible here, confirm against callers. */ + public static final String siteid = "siteid"; + public static final String pagetypeid = "pagetypeid"; + public static final String parenttraceid = "parenttraceid"; + public static final String host = "host"; + public static final String eventname = "eventname"; + public static final String eventdata = "eventdata"; + + + /* NOTE(review): presumably names of parser pipeline stages reported as trace events; confirm against callers. */ + public static final String commonparser = "commonparser"; + public static final String chkpreplugin = "chkpreplugin"; + public static final String preprocess = "preprocess"; + public static final String chktemplate = "chktemplate"; + public static final String parsehtml = "parsehtml"; + public static final String chkjsonplugin = "chkjsonplugin"; + public static final String jsonparse = "jsonparse"; + public static final String chkproplugin = "chkproplugin"; + public static final String reprocess = "reprocess"; + public static final String chkiidrule = "chkiidrule"; + public static final String iidprocess = "iidprocess"; + public static final String sendresult = "sendresult"; + public static final String parsedone = "parsedone"; + public static final String weiboparser = "weiboparser"; + public static final String chkhtml = "chkhtml"; + + + /* Further event/stage names; the commented-out entries below duplicate constants already declared above. */ + public static final String rcvtask = "rcvtask";// task received + // public static final String chkhtml="chkhtml";//check whether there is html data + public static final String chklogin = "chklogin";// check whether login is needed again + public static final String chkpagetype = "chkpagetype";// verify the page type + public static final String fragmentHtml = "fragmentHtml";// cut out a page fragment + // public static final String parsehtml="parsehtml";//parse the page source + public static final String parseAjaxData = "parseajaxdata";// parse dynamic data + public static final String chkAjaxData = "chkajaxdata"; // whether there is dynamic data + // public static final String sendresult="sendresult";//send the result + // public static final String parsedone="parsedone";//processing finished + +} diff --git a/src/com/bfd/parse/DataSaver.java
b/src/com/bfd/parse/DataSaver.java new file mode 100644 index 0000000..14cfbf8 --- /dev/null +++ b/src/com/bfd/parse/DataSaver.java @@ -0,0 +1,126 @@ +package com.bfd.parse; + +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.kafka7.KfkProducer; +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.facade.parseunit.ParseUnit; +/** Saves parse results: saveData forwards to saveParseData when the unit needs saving, and saveParseData publishes the JSON-serialized result through KfkProducer. The source-page saving path is commented out. */ +public class DataSaver { + + private static final Log LOG = LogFactory.getLog(DataSaver.class); + + private String workName; +// private DataOperatorClient dataOperatorClient; + private static final Boolean saveSource = ConfigUtils.getInstance().getBoolProp("SaveSource", true); /* NOTE(review): only referenced from commented-out code in this file */ + /** @param name worker name used as the prefix of this instance's log messages */ + public DataSaver(String name) { + this.workName = name; +// dataOperatorClient = new DataOperatorClient(); + } + /** Saves the parse result via saveParseData when unit.needSave() is true; otherwise does nothing. */ + public void saveData(ParseUnit unit, ParseResult result) { + if (unit.needSave()) { +// if (result.getParsedata().getParsecode() == ParseResult.SUCCESS +// || result.getParsedata().getParsecode() == ParseResult.OFF) { + // save the parse result + saveParseData(unit.getTaskdata(), result); +// } + + //:TODO the page source no longer needs to be saved +// if (saveSource && !checkResMd5(unit, result)) { +// // save the page source +// saveSourceData(unit.getData(), unit.getAjaxdata(), unit.getTaskdata()); +// } + } + } + + /** + * Returns true when the MD5 values are the same (the method below is commented out). + * + * @param unit + * @param result + */ +// private boolean checkResMd5(ParseUnit unit, ParseResult result) { +// Map data = result.getParsedata().getData(); +// Map tmp = new HashMap(); +// String attrMd5 = null; +// for (Entry entry : data.entrySet()) { +// Object value = entry.getValue(); +// if (entry.getKey().equals("attr")) { +// if (value != null && value instanceof Map) { +// attrMd5 = (String) ((Map) value).get("resmd5"); +// ((Map) value).remove("resmd5"); +// } +// continue; +// } +// tmp.put(entry.getKey(), value); +// } +// // compute the resmd5 of the parse result and compare it with the one in attribute. +// String dataMd5 = DataUtil.calcMD5(JsonUtil.toJSONString(tmp));
+// LOG.info(workName + " got attrmd5=" + attrMd5 + ", datamd5=" + dataMd5 + ", url=" + unit.getUrl()); +// if (dataMd5.equals(attrMd5)) { +// LOG.info(workName + " got datamd5 is same as resmd5, url=" + unit.getUrl()); +// return true; +// } +// data.put("resmd5", dataMd5); +// return false; +// } + + /** + * Saves the source file data (the method below is commented out). + * + * If ajaxdata exists, it is appended after data and a jsonlength field is added. + * + * @param data + * @param ajaxdata + * @param taskdata + */ +// public void saveSourceData(final String data, List> ajaxdata, Map taskdata) { +// try { +// Map reqMap = new HashMap(); +// reqMap.putAll(taskdata); +// String reqData = ""; +// if (StringUtils.isNotEmpty(data)) { +// reqData = data; +// } +// if (ajaxdata != null && ajaxdata.size() > 0) { +// String jsonAjaxdata = JsonUtil.toJSONString(ajaxdata); +// reqMap.put("jsonlength", jsonAjaxdata.getBytes().length); +// reqData += jsonAjaxdata; +// } +// if (StringUtils.isNotEmpty(reqData)) { +// LOG.info(workName + " saving source data..."); +// int status = dataOperatorClient.saveData(JsonUtil.toJSONString(reqMap), reqData); +// LOG.info(workName + " saved source data, status=" + status + ", url=" + taskdata.get("url")); +// } +// } catch (Exception e) { +// LOG.warn(workName + " saving source data exception, url=" + taskdata.get("url"), e); +// } +// } + + /** + * Serializes the parse result to JSON and sends it through KfkProducer to the topic named by the "parsequeuetopic" entry of taskData. + * + * @param taskData task metadata; "parsequeuetopic" is read for the topic name and "url" is only logged + * @param result parse result that is serialized and sent + */ + public void saveParseData(Map taskData, ParseResult result) { + int status = 0; /* NOTE(review): never reassigned (the only assignment is commented out below), so the final log always prints status=0 */ + + LOG.info(workName + " saving result data..."); + + //TODO: write the parse result to kafka; the topic name needs to be changed + KfkProducer.getInstance().send(taskData.get("parsequeuetopic").toString(), JsonUtils.toJSONString(result)); /* NOTE(review): throws NullPointerException when taskData has no "parsequeuetopic" entry */ +// status = dataOperatorClient.saveData(JsonUtil.toJSONString(req), JsonUtil.toJSONString(parseData)); + LOG.info(workName + " saved result data, status=" + status + ", url=" + taskData.get("url")); + } + +// public DataOperatorClient getDataOperatorClient() { +// return dataOperatorClient; +// } +} diff --git
a/src/com/bfd/parse/DomParser.java b/src/com/bfd/parse/DomParser.java new file mode 100644 index 0000000..1285060 --- /dev/null +++ b/src/com/bfd/parse/DomParser.java @@ -0,0 +1,677 @@ +package com.bfd.parse; + +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.beanutils.BeanUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.html.dom.HTMLDocumentImpl; +import org.cyberneko.html.parsers.DOMFragmentParser; +import org.w3c.dom.DocumentFragment; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import com.bfd.crawler.utils.EncodeUtil; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.crawler.utils.StringUtil; +import com.bfd.crawler.utils.crawler.httpclient.MyCrawler; +import com.bfd.parse.ParseResult.ParseData; +import com.bfd.parse.client.DownloadClient; +import com.bfd.parse.client.URLNormalizerClient; +import com.bfd.parse.config.dom.DomCFGTree; +import com.bfd.parse.config.dom.DomConfig; +import com.bfd.parse.config.dom.DomSearch; +import com.bfd.parse.config.dom.DomTemplate; +import com.bfd.parse.config.fldmap.BfdItemFldMapRule; +import com.bfd.parse.config.fldmap.ItemInfoParser2; +import com.bfd.parse.config.shelf.JudgeRule; +import com.bfd.parse.config.shelf.JudgeRuleConfig; +import com.bfd.parse.config.shelf.JudgeStatue; +import com.bfd.parse.facade.parseunit.ParseUnit; +import com.bfd.parse.json.JsonData; +//import com.bfd.parse.json.MTaoBaoJsonParser; +import 
com.bfd.parse.service.proxy.BizConfigureProxy; +import com.bfd.parse.test.weibosinaparser.Task; +import com.bfd.parse.test.weibosinaparser.WeiboParser; +import com.bfd.parse.util.JsonUtil; +import com.bfd.parse.util.MyDateUtil; +import com.bfd.parse.util.TextUtil; + +public class DomParser { + + private static final Log LOG = LogFactory.getLog(DomParser.class); + private static final Log TLOG = LogFactory.getLog("com.bfd.parse.TemplateLog"); + + private DomSearch domSearch; + private URLNormalizerClient normalizerClient; + private String workName; + + private static final Pattern cntFilter = Pattern.compile("\\d+"); + private static final Pattern totalNumReges = Pattern.compile("共(\\d+)页"); + + // 用于打印解析失败的页面源代码,每个cid最多打印200次 + private static String countDate = MyDateUtil.getStr(new Date(), + MyDateUtil.DATE_FORMAT); + private static Map printErrCountCidMap = new ConcurrentHashMap(); + private static int printErrCount = 200; + + public DomParser() { + workName = Thread.currentThread().getName(); + domSearch = new DomSearch(workName, normalizerClient); + } + + public DomParser(String workName, URLNormalizerClient normalizerClient) { + this.workName = workName; + domSearch = new DomSearch(workName, normalizerClient); + this.normalizerClient = normalizerClient; + } + + public ParseResult parse(ParseUnit unit, ParseResult result) { + ParseData parsedata = result.getParsedata(); + parsedata.setParsebegintime(System.currentTimeMillis()); + + // item类型判定上下架状态 +// String itemStatus = null; + + ParseRS templateRS = null; + templateRS = parseByTemplate(unit); + parsedata.setParsecode(templateRS.parseCode); + if (templateRS != null && templateRS.getRs() != null) { + parsedata.getData().putAll(templateRS.getRs()); + // return result; + } + + if (parsedata.getParsecode() == ParseResult.FAILED) { + parsedata.setErrMsg(templateRS.getErrMsg()); + return result; + } + //TODO:这一步已经移到parserface类的addExtraInfo方法里,因为datatype为json的也需要执行这个函数 +// handExtraArgs(unit, unit.getPageEncode(), 
itemStatus, +// parsedata.getData()); + parsedata.setParsecode(ParseResult.SUCCESS); + + return result; + } + + public List transDataToListObjs(T bean, Object data) { + List result = new ArrayList(); + // Object data = getData(); + if (data instanceof List) { + List> list = (List>) data; + for (Map map : list) { + T nBean; + try { + nBean = (T) BeanUtils.cloneBean(bean); + // BeanUtil.setValue(map, nBean); + BeanUtils.populate(nBean, map); + result.add(nBean); + } catch (Exception e) { + LOG.warn("transDataToObj exception. ", e); + } + } + } + return result; + } + + class ParseRS { + private int parseCode = -1; + private Map rs = null; + private String errMsg ; + + + public String getErrMsg() { + return errMsg; + } + + public void setErrMsg(String errMsg) { + this.errMsg = errMsg; + } + + public int getParseCode() { + return parseCode; + } + + public void setParseCode(int parseCode) { + this.parseCode = parseCode; + } + + public Map getRs() { + return rs; + } + + public void setRs(Map rs) { + this.rs = rs; + } + + } + + private ParseRS parseByTemplate(ParseUnit unit) { + ParseRS result = new ParseRS(); + // 获取解析模板通过siteId和pagetypeid得到,通过查parseTemplate表得到;相应的缓存也要改 + DomTemplate tpl = getDomTemplate(unit.getSiteId()+""); + LOG.info(workName+" url:"+unit.getUrl()+",cid:"+unit.getCid()); + + if (!checkDomTemplate(tpl, unit.getPageTypeId()+"")) { + LOG.info(workName + " NO PARSE template cid=" + unit.getCid() + + ", typeId=" + unit.getPageTypeId() + ", tpl=" + JsonUtils.toJSONString(tpl)); + TLOG.info("no template, cid = " + unit.getCid() + ", typeId = " + + unit.getPageTypeId() + ", url = " + unit.getUrl()); + result.setErrMsg("siteId:"+unit.getSiteId()+" pageTypeId:"+unit.getPageTypeId()+" no found template"); + result.setParseCode(ParseResult.NOFOUND_TEMPLATE); + return result; + } + + // 解析DOM数据 + DocumentFragment doc = null; + // 标签补全 + unit.setPageBytes(TextUtil.balanceTag(unit.getPageBytes(), + unit.getPageEncode())); + try { + doc = 
DomParser.parse2Html(unit.getPageBytes(), + unit.getPageEncode()); + } catch (Exception e) { + LOG.warn(workName + " parsing to HTML Exception, Err=" + ",url=" + + unit.getUrl(), e); + } + Map errMsg = new HashMap(); + long templateParseBeginTime = System.currentTimeMillis(); + Map rmap = parseData(unit.getPageTypeId()+"", unit.getCid(), + unit.getUrl(), doc, tpl, unit.getPageEncode(),errMsg); + long templateParseEndTime = System.currentTimeMillis(); + LOG.info("url:"+unit.getUrl()+" template parse consumer time:"+(templateParseEndTime-templateParseBeginTime)); + if (rmap == null) { + result.setParseCode(ParseResult.FAILED); + result.setErrMsg(errMsg.get(Constants.errMsg).toString()); + } else { + result.setParseCode(ParseResult.SUCCESS); + result.setRs(rmap); + } +// LOG.info("url : "+unit.getUrl()+".parseByTemplate:result:"+JsonUtil.toJSONString(result)); + return result; + } + + + + /** + * 添加上下架信息,处理下一页,添加task data中的参数 + * + * @param unit + * @param charset + * @param itemStatus + * @param rmap + */ + public void handExtraArgs(ParseUnit unit, String charset, + String itemStatus, Map rmap) { + if (itemStatus != null) + rmap.put("onshelf", itemStatus); + String nextpage = ""; +// if (rmap.containsKey("nextpage") && unit.getPageidx() < 50) { + //针对华为的小米和花粉抓取,分页不止50页 + if (rmap.containsKey(Constants.nextpage) ) { + Object obj = rmap.get(Constants.nextpage); + nextpage = handleNextPage(unit.getPageidx(), obj, unit.getSiteId()+"", + unit.getPageType(), unit.getUrl()); + if (unit.getUrl().equalsIgnoreCase(nextpage) + || nextpage.equalsIgnoreCase(unit.getUrl0())) + nextpage = ""; + rmap.put(Constants.nextpage, nextpage); + } else { + if (unit.getUrl0() != null + && StringUtils.isNotEmpty(unit.getUrl0().trim())) { + nextpage = ""; + rmap.put(Constants.nextpage, nextpage); + } + } + if (StringUtils.isNotEmpty(nextpage) && unit.getPageidx() > 0) { + rmap.put("pageidx", unit.getPageidx() + 1); + } + + rmap.put("cid", unit.getCid()); +// rmap.put("bfdiid", 
unit.getBfdiid()); + rmap.put(Constants.URL, unit.getUrl().trim()); + rmap.put("type", unit.getPageType()); + rmap.put(Constants.IID, unit.getIid()); + rmap.put("length", unit.getLength()); + rmap.put("charset", charset); + + // 将任务中的category放入结果 + if (unit.getTaskdata().containsKey(Constants.category)) { + Object category = unit.getTaskdata().get(Constants.category); + if (category != null) { + rmap.put(Constants.category, category); + } + } + + // 如果存在imgs,抽取 text(key)和img(value)信息组成Map,放入imgList(保留提供给挖掘的格式) + if (domSearch.hasImgs() && rmap.containsKey("imgs")) { + List imgs = (List) rmap.get("imgs"); + List imgList = new ArrayList(); + for (int i = 0; i < imgs.size(); i++) { + Map valueMap = new HashMap(); + Map imgMap = imgs.get(i); + String text = (String) imgMap.get("imgtag"); + String img = (String) imgMap.get("img"); + if (StringUtils.isEmpty(text)) { + text = "" + (i + 1); + } + valueMap.put(text, StringUtils.isEmpty(img) ? "" : img); + imgList.add(valueMap); + } + if (imgList.size() > 0) { + rmap.put("imglist", imgList); + } + } + } + + /** + * 检查模板,是否包含指定类型 + * + * @param tpl + * @param type + * @return + */ + private boolean checkDomTemplate(DomTemplate tpl, String type) { + if (tpl == null) { + return false; + } else { + ArrayList templates = tpl.getTemplates(); + if (templates.size() == 0) + return false; + for (DomCFGTree tree : templates) { + if (tree.getType().equalsIgnoreCase(type)) + return true; + } + return false; + } + } + + /** + * 处理下一页. 
+ * + * @param pageno + * @param url + * @param cid + * @param obj + * @return + */ + public String handleNextPage(int pageno, Object objs, String cid, + String type, String url) { + String nextpage = ""; + if (objs != null && objs instanceof List) { + List pages = (List) objs; + String sTotalNum = "0"; + for (Object obj : pages) { // 找到link + if (obj instanceof String) { + Matcher matcher = totalNumReges + .matcher(String.valueOf(obj)); + if (matcher.find()) { + sTotalNum = matcher.group(1); + break; + } + } + if (obj instanceof Map) { + Map page = (Map) obj; + String text = (String) page.get("text"); + Matcher matcher = totalNumReges.matcher(text); + if (matcher.find()) { + sTotalNum = matcher.group(1); + break; + } + } + } + if (StringUtils.isNumeric(sTotalNum)) { + int totalNum = Integer.valueOf(sTotalNum); + if (totalNum > 0 && totalNum <= pageno) { + nextpage = ""; + LOG.debug(workName + " current pagenum reached totalNum "); + return nextpage; + } + } + for (Object obj : pages) { // 找到link, 匹配数字下一页 + if (obj instanceof Map) { + Map page = (Map) obj; + String text = (String) page.get("text"); + text = filterCnt(text); + if (text.trim().equals("" + (pageno + 1))) { + nextpage = (String) page.get("link"); + if (LOG.isDebugEnabled()) + LOG.debug(workName + + " guess nextpage by pageno, nextpage=" + + nextpage); + break; + } + } + } + if (StringUtils.isEmpty(nextpage)) { + boolean flag = false; + for (Object obj : pages) { // 找到link, 匹配数字下一页 + if (obj instanceof String) { + String text = filterCnt((String) obj); + if (flag) { + break; + } + if (text.trim().equals("" + pageno)) { + flag = true; + } + } + if (obj instanceof Map) { + Map page = (Map) obj; + if (flag) { + nextpage = (String) page.get("link"); + LOG.debug(workName + + " guess nextpage by next button of current pageno button, nextpage=" + + nextpage); + break; + } + String text = (String) page.get("text"); + text = filterCnt(text); + if (text.trim().equals("" + pageno)) { + nextpage = (String) 
page.get("link"); + flag = true; + } + } + } + } + } else if (objs != null && objs instanceof Map) { + nextpage = (String) ((Map) objs).get("link"); + if (LOG.isDebugEnabled()) + LOG.debug("Got real nextpage=" + nextpage); + } + if (StringUtils.isNotEmpty(nextpage)) { + //TODO 归一化的第一个参数cid可以改为siteId吗? + nextpage = normalizerClient.normalize(cid, type, nextpage, url, + true); + } + if (nextpage == null) { + nextpage = ""; + LOG.info(workName + " got nextpage failed, will use Empty String."); + } + return nextpage; + } + + private String filterCnt(String text) { + if (StringUtils.isNotEmpty(text) && !StringUtils.isNumeric(text)) { + Matcher matcher = cntFilter.matcher(text); + if (matcher.find()) { + text = matcher.group(0); + } + } + return text; + } + + /** + * 解析文件 + * + * @param url + * @param bytes + * @param bid + * @param type + * @param charset + * @return + */ + public Map parseFile(String url, byte[] bytes, String bid, + String type, String charset) { + // 标签补全 + bytes = TextUtil.balanceTag(bytes, charset); + Map result = new HashMap(); + // 获取解析模板 + DomTemplate tpl = getDomTemplate(bid); + if (tpl == null) { + return result; + } + // 获取页面编码 + if (StringUtils.isEmpty(charset)) + charset = EncodeUtil.getHtmlEncode(bytes, charset); + + InputSource input = new InputSource(new ByteArrayInputStream(bytes)); + input.setEncoding(charset); + DocumentFragment doc = null; + try { + doc = DomParser.parse2Html(input, charset); + } catch (Exception e) { + LOG.warn("parse2Html failed, url=" + url, e); + } + + // item类型判定上下架状态 + String itemStatus = null; + itemStatus = judgeSellStatus(bid, url, type, bytes, charset); + if (itemStatus == null) { + LOG.info("bid=" + bid + ", url" + url + ", onshelf is null."); + } + + Map errMsg = new HashMap(); + // 解析DOM数据 + Map rmap = parseData(type, bid, url, doc, tpl, charset,errMsg); + if (rmap == null) { + return result; + } else if (itemStatus != null) { + rmap.put("onshelf", itemStatus); + } + return rmap; + } + + public Map 
parseData(String pageTyepId, String cid, String url, + DocumentFragment doc, DomTemplate tpl, String charset,Map errMsg) { + Map rmap = null; +// boolean bOK = domSearch.executeTemplateParse(doc, tpl, pageTyepId, url,charset); + boolean bOK = true; +// + + if (!bOK) { + String err = domSearch.getTreeString(); + String errMsgStr = " NO PARSE RESULT, cid=" + cid + ", pageTyepId=" + + pageTyepId + ", url=" + url + ", Err:\n" + + (err.length() > 200 ? err.substring(0, 190) : err); + // TODO 放入数据库。 + LOG.info(workName + errMsgStr); + errMsg.put(Constants.errMsg, errMsgStr); + return rmap; + } + rmap = domSearch.getParseResult(); + int gotNum = rmap.keySet().size(); + LOG.info(workName + " parse ok, got " + gotNum + + " items, template idx=" + domSearch.getMatchTmplIDX() + + ", cid=" + cid + ", pageTyepId=" + pageTyepId + ", url=" + url); + return rmap; + } + + /** + * 判定上下架信息 + * + * @param bid + * @param url + * @param bytes + * @param charset + * @return + */ + public String judgeSellStatus(String bid, String url, String type, + byte[] bytes, String charset) { + LOG.info("execute judgeSellStatus!"); + List rules = JudgeRuleConfig.getInstance().getJudgeRules( + bid, type); +// if("Tjd".equals(bid)){ +// LOG.info("cid:"+bid+". 
judgerules is "+JsonUtil.toJSONString(rules)); +// } + String itemStatus = null; + if (rules != null) { + try { + itemStatus = new JudgeStatue().judgeStatus(url, new String( + bytes, charset), rules); + if (StringUtils.isNotEmpty(itemStatus)) + LOG.info(workName + " got onshelf result=" + itemStatus + + ", biz=" + bid + ", url=" + url); + } catch (Exception e) { + LOG.warn("Judge sell status exception, url=" + url + + ", charset=" + charset, e); + } + } + return itemStatus; + } + + public DomSearch getDomSearch() { + return domSearch; + } + + public DomTemplate getDomTemplate(String siteId) { + return DomConfig.getInstance().getBySiteId(siteId); + } + + public List getDomTemplate(String bid, String type) { + return DomConfig.getInstance().get(bid, type); + } + + public static DocumentFragment parse2Html(byte[] data, String charset) + throws Exception { + return parse2Html( + new InputSource(new ByteArrayInputStream(new String(data, + charset).replace((char) 26, (char) 32) + .getBytes(charset))), charset); + } + + public static DocumentFragment parse2Html(InputSource input, String encoding) + throws Exception { + DOMFragmentParser parser = new DOMFragmentParser(); + try { + parser.setFeature( + "http://cyberneko.org/html/features/augmentations", true); + parser.setProperty( + "http://cyberneko.org/html/properties/default-encoding", + encoding); + parser.setFeature( + "http://cyberneko.org/html/features/scanner/ignore-specified-charset", + true); + parser.setFeature( + "http://cyberneko.org/html/features/balance-tags/ignore-outside-content", + false); + parser.setFeature( + "http://cyberneko.org/html/features/balance-tags/document-fragment", + true); + parser.setFeature( + "http://cyberneko.org/html/features/report-errors", false); + } catch (SAXException e) { + e.printStackTrace(); + } + HTMLDocumentImpl doc = new HTMLDocumentImpl(); + doc.setErrorChecking(false); + DocumentFragment res = doc.createDocumentFragment(); + DocumentFragment frag = 
doc.createDocumentFragment(); + try { + parser.parse(input, frag); + } catch (Exception e) { + throw e; + } + res.appendChild(frag); + try { + while (true) { + frag = doc.createDocumentFragment(); + parser.parse(input, frag); + if (!frag.hasChildNodes()) + break; + res.appendChild(frag); + } + } catch (Exception e) { + throw e; + } + return res; + } + + public static DocumentFragment parse2Html(String data, String encoding) + throws Exception { + return parse2Html( + new InputSource(new ByteArrayInputStream(data.getBytes())), + encoding); + } + + public static DocumentFragment parse2Xml(byte[] bytes, String encoding) { + try { + InputSource input = new InputSource(new ByteArrayInputStream(bytes)); + input.setEncoding(encoding); + return parse2Html(input, encoding); + } catch (Exception e) { + LOG.warn("Parsing to XML Exception, Err", e); + } + return null; + } + + public static DocumentFragment parse2Xml(String content, String encoding) { + try { + InputSource input = new InputSource(new ByteArrayInputStream( + content.getBytes())); + input.setEncoding(encoding); + return parse2Html(input, encoding); + } catch (Exception e) { + LOG.warn("Parsing to XML Exception, Err", e); + } + return null; + } + + private static void testTemplateByHtml(String uri, String cid, + String encoding, String dns, boolean isFromNet,String type) { + DomParser domParser = new DomParser("parser-1", + new URLNormalizerClient()); + DomTemplate tpl = domParser.getDomTemplate(cid); +// System.out.println("template:"+JsonUtil.toJSONString(tpl)); + String string = ""; + if (isFromNet) { + string = getHtmlFromNet(uri, encoding); + } else { + string = getHtmlFromFile(uri, encoding); + } + System.out.println(string); + DocumentFragment doc = null; + try { + doc = parse2Html(string, encoding); + } catch (Exception e) { + e.printStackTrace(); + } + Map errMsg = new HashMap(); + Map rs = domParser.parseData(type, cid, dns, doc, + tpl, encoding,errMsg); + System.out.println(JsonUtil.toJSONString(rs)); + 
} + + private static String getHtmlFromFile(String fname, String encoding) { + File file = new File(fname); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream in; + try { + in = new DataInputStream(new FileInputStream(file)); + in.readFully(bytes); + } catch (Exception e) { + e.printStackTrace(); + } + + String string = null; + try { + string = new String(bytes, encoding); + } catch (UnsupportedEncodingException e1) { + e1.printStackTrace(); + } + return string; + } + + public static String getHtmlFromNet(String url, String encoding) { + DownloadClient crawler = new DownloadClient(); +// MyCrawler crawler = new MyCrawler(); + String string = crawler.getPageData(url); +// String string = crawler.get(url)[1]; + System.out.println("html:" + string); + return string; + } + +} diff --git a/src/com/bfd/parse/FileParser.java b/src/com/bfd/parse/FileParser.java new file mode 100644 index 0000000..1c4d325 --- /dev/null +++ b/src/com/bfd/parse/FileParser.java @@ -0,0 +1,284 @@ +package com.bfd.parse; + +import java.io.File; +import java.io.FileFilter; +import java.io.FileWriter; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.utils.DataUtil; +import com.bfd.parse.reprocess.ReProcessor; +import com.bfd.parse.reprocess.ReProcessorFactory; +import com.bfd.parse.task.ParseJob; +import com.fasterxml.jackson.databind.ObjectMapper; + +/** + * key: length bid charset + * + * @author yanhui.ji + * + */ +public class FileParser { + + private static final Log LOG = LogFactory.getLog(FileParser.class); + + public static final String url = "url"; + public static final String charset = "charset"; + public static final String type = "type"; + public static final String cid = 
"cid"; + public static final String length = "length"; + private static Set requireds; + static { + requireds = new HashSet(); + requireds.add(url); + requireds.add(cid); + requireds.add(length); + } + + private FileWriter fileWriter = null; + + private DomParser domParser; + + private Map outputFlds = null; + + public FileParser() { + domParser = new DomParser(); + } + + public void parse(ParseJob.FileParseUnit unit) { + outputFlds = unit.getOutPutFlds(); + if (outputFlds == null) { + outputFlds = defaultOutputs(); + } else if (!outputFlds.values().containsAll(requireds)) { // + LOG.info("Request params don't contain all requirements"); + return; + } + handleFiles(unit.getFilename(), unit.getResfile(), unit.getPrefix(), unit.getBid(), unit.getCharset(), + unit.getType()); + } + + private void handleFiles(String inPath, String outPath, final String prefix, String cid, String charset, String type) { + File inFile = new File(inPath); + if (inFile.exists() && inFile.isFile() && inFile.canRead()) { + handleFile(inFile, outPath, cid, charset, type); + } else if (inFile.exists() && inFile.isDirectory()) { + File[] listFiles = null; + if (StringUtils.isNotEmpty(prefix)) { + listFiles = inFile.listFiles(new FileFilter() { + @Override + public boolean accept(File pathname) { + if (pathname.getName().startsWith(prefix)) + return true; + return false; + } + + }); + } else { + listFiles = inFile.listFiles(); + } + for (File file : listFiles) { + if (file.isFile() && file.canRead()) { + handleFile(file, outPath, cid, charset, type); + } + } + } + } + + private Map defaultOutputs() { + Map map = new HashMap(); + for (String required : requireds) { + map.put(required, required); + } + return map; + } + + private void handleFile(File file, String outPath, String aCid, String aCharset, String aType) { + outPath = createFile(file.getName(), outPath); + if (outPath == null) { + LOG.error("creating file=" + outPath + " failed. 
"); + return; + } + StringBuilder sBuilder = new StringBuilder(); + RandomAccessFile raf = null; + int total = 0; + int success = 0; + try { + raf = new RandomAccessFile(file, "r"); + while (true) { + String _type = null, _cid = null, _url = null, _charset = null; + int len = 0; + Map parseRes = new HashMap(); + String line = null; + while ((line = raf.readLine()) != null) { + if (line.length() == 0) { + continue; + } + int index = line.indexOf(":"); + if (index + 1 > line.length()) { // TODO + LOG.error("有问题的文件" + "|" + "inputPath" + file); + break; + } + String value = line.substring(index + 1); + String _key = line.substring(0, index); + + Collection keys = outputFlds.keySet(); + if (keys.contains(_key)) { + String key = outputFlds.get(_key); + parseRes.put(key, DataUtil.decode(value)); + if (type.equals(key)) { + _type = value; + if (!_type.equalsIgnoreCase("item") || !_type.equalsIgnoreCase("list") + || !_type.equalsIgnoreCase("info")) + continue; + } + if (cid.equals(key)) { + _cid = value; + } + if (url.equals(key)) { + _url = value; + } + if (charset.equals(key)) { + _charset = value; + } + if (length.equals(key)) { + len = Integer.parseInt(value); + break; + } + } + } + total++; + if (len == 0) { + break; + } + if (_charset == null) { + _charset = aCharset; + } + if (_type == null) { + _type = aType; + } + // 强制bid + if (StringUtils.isNotEmpty(aCid)) { + _cid = aCid; + } + + byte[] bs = new byte[len]; + raf.read(bs); + + LOG.info("begin inPath=" + file + ", url=" + _url); + Map map = domParser.parseFile(_url, bs, _cid, _type, _charset); + if (map == null || map.size() < 1) { + LOG.debug("parse file failed, url=" + _url); + continue; + } + success++; + LOG.info("after inPath=" + file + ", url=" + _url); + ReProcessor processor = ReProcessorFactory.getReProcessor(_cid, _type); + parseRes.putAll(map); + String jsonRes = createJson(outPath, parseRes); + if (StringUtils.isNotEmpty(jsonRes) && !"{}".equals(jsonRes)) { + sBuilder.append(jsonRes).append("\r\n"); + 
} + if (sBuilder.length() > 0) { + appendSave(sBuilder.toString(), outPath); + sBuilder = new StringBuilder(); + } + } + LOG.info("parse task finished, inFile=" + file + ", outFile=" + outPath); + } catch (Exception e) { + LOG.error("handleFile exception:", e); + } finally { + if (raf != null) { + try { + raf.close(); + } catch (IOException e) { + LOG.error("handleFile exception:", e); + } + } + if (fileWriter != null) { + try { + fileWriter.close(); + } catch (IOException e) { + LOG.error("handleFile exception:", e); + } finally { + fileWriter = null; + } + } + } + LOG.info("Parse finished, parse item count total=" + total + ", success=" + success + " inPath:" + file); + } + + public String createFile(String inFileName, String outPath) { + File file = new File(outPath); + try { + if (file.exists() && file.isFile()) { // 目标文件已存在 + LOG.info("output file is already exist. file=" + outPath); + return outPath; + } else if (file.exists() && file.isDirectory()) { + String out = outPath + "parse_" + inFileName; + file = new File(out); + if (!file.exists()) { + file.createNewFile(); + } + return out; + } else if (!file.exists() && outPath.endsWith(File.separator)) { + outPath = outPath + "parse_" + inFileName; + file = new File(outPath); + if (file.getParentFile().mkdirs() && file.createNewFile()) { + return outPath; + } + return null; + } else if (!file.exists()) { + if (file.getParentFile().mkdirs() && file.createNewFile()) { + return outPath; + } + return null; + } + } catch (Exception e) { + LOG.error("createFile exception:", e); + } + return null; + } + + public String createJson(String path, Map map) { + ObjectMapper om = new ObjectMapper(); + String jres = null; + try { + jres = om.writeValueAsString(map); + } catch (Exception x) { + LOG.info("json dump exception ", x); + } + return jres; + } + + public boolean appendSave(String strXml, String outPath) { + if (StringUtils.isEmpty(strXml)) { + return false; + } + try { + if (fileWriter == null) { + File file = new 
File(outPath); + if (file.exists() && file.canWrite()) { + fileWriter = new FileWriter(file, true); + } else { + LOG.warn("file doesnt exist or cannt be writed, file is " + outPath); + return false; + } + } + fileWriter.write(strXml); + return true; + } catch (Exception e) { + LOG.warn("IOException while writing json to file:", e); + } + return false; + } + +} diff --git a/src/com/bfd/parse/ParseResult.java b/src/com/bfd/parse/ParseResult.java new file mode 100644 index 0000000..b391271 --- /dev/null +++ b/src/com/bfd/parse/ParseResult.java @@ -0,0 +1,237 @@ +package com.bfd.parse; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.facade.parseunit.ParseUnit; + +public class ParseResult implements Serializable{ + + private static final Log LOG = LogFactory.getLog(ParseResult.class); + + + public static final int SUCCESS = 0; + public static final int NOFOUND_TEMPLATE = 500001; + public static final int NOFOUND_PREPROCESSOR = 500002; + public static final int nofound_reprocessor = 500003; + public static final int nofound_jsonprocessor = 500004; + public static final int needLogin = 500005; + public static final int nohtmldata = 500006; + public static final int notFoundField = 500007; + public static final int GETIID_FAILED = 500008; + public static final int FAILED = 500009; + public static final int preprocessor_fail = 500010; + public static final int REPROCESS_FAILED = 500011; + public static final int jsonprocess_FAILED = 500012; + public static final int OFF_SHELF = 500016; + public static final int PARSECODE_DWONLOADFAILED = 500018; + public static final int cookie_nouse = 500019; + public static final int uncompress_fail = 50020; + public static final int nofound_sitepageconfig = 500021; + 
public static final int jsonparseerror = 500022; + public static final int weiboparse_error = 500023; + + public static final int REPROCESS_NONESAVE = -3;// 不保存,直接返回,以后不再调用 + public static final int NOT_ITEM_TASK = -4; + public static final int AUTO_PARSE_FAILED = -5; + +// public static final int FAILED_NO = "002"; + + + + + + + +// public static final int NOFIND_REPROCEE_PLUGIN = 7; + + private Map taskdata; + private Map spiderdata; + private ParseData parsedata; + + public static class ParseData implements Serializable{ + + private int parsecode; + private long begintime; + private long parsebegintime; + private long endtime; + private String errMsg; + + + + public String getErrMsg() { + return errMsg; + } + + public void setErrMsg(String errMsg) { + this.errMsg = errMsg; + } + + private Map data; + + public ParseData() { + data = new HashMap(); + } + + public int getParsecode() { + return parsecode; + } + +// public void mergeParsecode(String parsecode) { +// this.parsecode = parsecode | this.parsecode; +// } + + public void setParsecode(int parsecode) { + this.parsecode = parsecode; + } + + public long getBegintime() { + return begintime; + } + + public void setBegintime(long begintime) { + this.begintime = begintime; + } + + public long getEndtime() { + return endtime; + } + + public void setEndtime(long endtime) { + this.endtime = endtime; + } + + public Map getData() { + return data; + } + + public void setData(Map data) { + this.data = data; + } + + public long getParsebegintime() { + return parsebegintime; + } + + public void setParsebegintime(long parsebegintime) { + this.parsebegintime = parsebegintime; + } + + public void addData(Map data) { + for (String key : data.keySet()) { + if (this.data.containsKey(key)) { + Object o1 = this.data.get(key); + Object o2 = data.get(key); + if (o1 instanceof List && o2 instanceof List) { + List list = new ArrayList(); + list.addAll((Collection) o1); + list.addAll((Collection) o2); + LOG.debug("parseResut addData 
method merged list1.size=" + ((List) o1).size() + ", list2.size=" + + ((List) o2).size() + ", key=" + key + ",list.size=" + list.size()); + this.data.put(key, list); + } else { + this.data.put(key, data.get(key)); + } + } else { + this.data.put(key, data.get(key)); + } + } + } + + public void setCheckSwitch(Boolean checkSwitch) { + } + public static void main(String[] args) { + ParseData pd = new ParseData(); + Map data2 = pd.getData(); + ArrayList list = new ArrayList(); + list.add("x"); + list.add("y"); + list.add("z"); + data2.put("a", "a"); + data2.put("item", list); + + System.out.println(pd.getData()); + + HashMap map = new HashMap(); + ArrayList list2 = new ArrayList(); + list2.add("o"); + list2.add("p"); + list2.add("q"); + map.put("item", list2); + map.put("a", "b"); + + pd.addData(map); + System.out.println(pd.getData()); + } + + public void putData(String key, Object value) { + this.data.put(key, value); + } + } + + public static ParseResult prepareObj(ParseUnit unit) { + ParseResult result = new ParseResult(); + try { + result.setTaskdata(new HashMap(unit.getTaskdata())); + Map spiderdata = new HashMap(); + spiderdata.putAll(unit.getSpiderdata()); + if (spiderdata.containsKey("ajaxdata")) { + spiderdata.remove("ajaxdata"); + } + if (spiderdata.containsKey("data")) { + spiderdata.remove("data"); + } + result.setSpiderdata(spiderdata); + result.setParsedata(new ParseData()); + if (unit.getStartTime() != null) { + result.getParsedata().getData().put("start_time", unit.getStartTime()); + } + result.getParsedata().setBegintime(unit.getBegintime()); + } catch (Exception e) { + LOG.warn(e); + } + return result; + } + + public void setSpiderdata(Map spiderdata) { + this.spiderdata = spiderdata; + } + + public Map getSpiderdata() { + return spiderdata; + } + + public void setTaskdata(Map taskdata) { + this.taskdata = taskdata; + } + + public Map getTaskdata() { + return taskdata; + } + + public void setParsedata(ParseData parsedata) { + this.parsedata = 
parsedata; + } + + public ParseData getParsedata() { + return parsedata; + } + + public static void main(String[] args) { + ParseResult pr = new ParseResult(); + ParseData data = new ParseData(); + pr.setParsedata(data); + data.putData("type", ""); + data.putData("nextpage", ""); + data.putData("multipage", ""); + System.out.println(JsonUtils.toJSONString(pr)); + } +} diff --git a/src/com/bfd/parse/ParseStat.java b/src/com/bfd/parse/ParseStat.java new file mode 100644 index 0000000..3996336 --- /dev/null +++ b/src/com/bfd/parse/ParseStat.java @@ -0,0 +1,184 @@ +package com.bfd.parse; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import sun.util.logging.resources.logging; + +import com.bfd.crawler.kafka7.KfkProducer; +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.util.JsonUtil; +import com.bfd.parse.util.KfkUtils; + +public class ParseStat { + + private static final Log LOG = LogFactory.getLog(ParseStat.class); + private Map statMap; + private String workName; + private static long reportStatTime; + private static String statTopicName ; + + static { + reportStatTime = ConfigUtils.getInstance().getLongProp("StatReport.time", 120000); + statTopicName = ConfigUtils.getInstance().getProp("crawl.public.statistics.kafka.topic", "statistics"); + } + private long time; + + public ParseStat(String name) { + statMap = new HashMap(); + time = System.currentTimeMillis(); + workName = name; + } + + private boolean timeIsUp() { + return System.currentTimeMillis() - time >= reportStatTime && statMap.size() != 0; + } + + /** + * 上报统计信息到统计服务 + */ + public void report() { +// List list = new ArrayList(); +// boolean success = true; + for (StatItem stat : statMap.keySet()) { + Map map = new HashMap(); + map.put("cid", stat.getCid()); + map.put("pagetype", stat.getType()); + 
map.put("code", String.valueOf(stat.getParseCode())); + map.put("projectname", stat.getProjectName()); + map.put("count", statMap.get(stat)); + map.put("stattime", ""+(System.currentTimeMillis()/1000)); + map.put("host", Constants.host); + + Map statData = new HashMap(); + statData.put("data", map); + statData.put("procname", "parse"); + statData.put("stattype", "parseres"); +// LOG.debug(workName + " report stat info to StatService :" + JsonUtil.toJSONString(map)); + //TODO:提交到保存kakfa里面,如果提交kafka失败,怎么处理? +// if (success == false || !statServiceClient.reportData("parse", "parseres", JsonUtil.toJSONString(map))) { +// success = false; +// list.add(stat); +// } + LOG.info("statTopicName:"+statTopicName); + KfkUtils.sendKfk(statTopicName, JsonUtils.toJSONString(statData)); + } +// if (success) { + LOG.info(workName + " reported stat success, stat size=" + statMap.size()); + LOG.debug("Reported statMap=" + JsonUtil.toJSONString(statMap)); + statMap.clear(); +// } else { +// LOG.warn(workName + " reported stat failed, statMap will remain failed stat key=" +// + JsonUtil.toJSONString(list)); +// Map map = new HashMap(); +// for (StatItem stat : list) { +// map.put(stat, statMap.get(stat)); +// } +// statMap = null; +// statMap = map; +// } + time = System.currentTimeMillis(); + } + + /** + * 统计计数+1 + * + * @param projectname + * @param cid + * @param type + * @param parsecode + */ + public void increment(String projectname, String cid, String type + , int parsecode) { + try { + StatItem statUnit = new StatItem(projectname, cid, type, parsecode); + if (statMap.get(statUnit) == null) { + statMap.put(statUnit, 1L); + return; + } + statMap.put(statUnit, statMap.get(statUnit) + 1); + } catch (Exception e) { + LOG.warn(workName + " increment exception, ", e); + } + if (timeIsUp()) { + report(); + } + } + + + class StatItem { + private String projectName; + private String cid; + private String type; + private int parseCode; // 0->success, 1->no template, 2->parse failed + + 
public StatItem(String projectName, String cid, String type, int parseCode) { + this.projectName = projectName; + this.cid = cid; + this.type = type; + this.parseCode = parseCode; + } + + public String getCid() { + return cid; + } + + public void setCid(String cid) { + this.cid = cid; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public int getParseCode() { + return parseCode; + } + + public void setParseCode(int parsecode) { + this.parseCode = parsecode; + } + + public String getProjectName() { + return projectName; + } + + public void setProjectName(String projectName) { + this.projectName = projectName; + } + + @Override + public int hashCode() { + int result = (getCid() == null) ? 17 : getCid().hashCode(); + result = (getProjectName() == null) ? result * 11 : result * 11 + getProjectName().hashCode(); + result = (getType() == null) ? result * 7 : result * 7 + getType().hashCode(); + result = result * 13 + getParseCode(); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj instanceof StatItem) { + StatItem su = (StatItem) obj; + return getCid().equals(su.getCid()) && getType().equals(su.getType()) + && getParseCode() == su.getParseCode() && getProjectName().equals(su.getProjectName()); + } + return false; + } + + } + +} diff --git a/src/com/bfd/parse/ParseTestForPlugin.java b/src/com/bfd/parse/ParseTestForPlugin.java new file mode 100644 index 0000000..238037f --- /dev/null +++ b/src/com/bfd/parse/ParseTestForPlugin.java @@ -0,0 +1,176 @@ +package com.bfd.parse; + +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.parse.client.URLNormalizerClient; +import com.bfd.parse.facade.parseunit.ParseUnit; +import com.bfd.parse.json.JsonData; +import com.bfd.parse.json.JsonParser; 
+import com.bfd.parse.json.JsonParserResult; +import com.bfd.parse.preprocess.PreProcessor; +import com.bfd.parse.reprocess.ReProcessResult; +import com.bfd.parse.reprocess.ReProcessor; +import com.bfd.parse.util.JsonUtil; +import com.bfd.parse.util.TextUtil; +/** + * 只用来测试三种插件,所有的状态判断和插件动态加载的代码都已经去掉。 + * @author wenchao.fu + * + */ +public class ParseTestForPlugin { + private static final Log logger = LogFactory.getLog(ParseTestForPlugin.class); + public static final String workName = "test"; + private boolean preprocess(ParseUnit unit, ParserFace parserFace + ,PreProcessor preProcessor) { + try { + if (!TextUtil.unzipPageAndGuessEncodeForPluginTest(unit)) { + return false; + } + if (preProcessor != null) { + logger.info(workName + " find preprocess plugin, cid=" + + unit.getCid() + ", type=" + unit.getPageType()); + return preProcessor.process(unit, parserFace); + }else{ + logger.info(workName + " preprocess plugin is null, cid=" + + unit.getCid() + ", type=" + unit.getPageType()); + } + + return true; + } catch (Exception e) { + logger.warn( + workName + " preprocess exception, , cid=" + unit.getCid() + + ", type=" + unit.getPageType(), e); + } + return false; + } + + private JsonParserResult parseAjaxData(ParseUnit unit, + Map taskData, boolean inData,JsonParser jsonParser) { + + List dataList = TextUtil.wrapJsonData(unit, inData); + try { + //归一化服务在测试插件的时候没有用到,就传入null吧 + return jsonParser.parse(taskData, dataList, null, unit); + }catch (Exception e) { + e.printStackTrace(); + } + return new JsonParserResult(ParseResult.nofound_jsonprocessor, null); + } + + //用于本地调试插件 + public ParseResult parseTest(ParseUnit unit,PreProcessor preProcessor + ,JsonParser jsonParser,ReProcessor reprocessor){ + ParserFace parserFace = new ParserFace("test"); + DomParser domParser = new DomParser("test",new URLNormalizerClient()); + // logger.info("url:"+unit.getUrl()+".unit is "+JsonUtil.toJSONString(unit)); + ParseResult result = ParseResult.prepareObj(unit); + + try { + // 
logger.info("cid:"+unit.getCid()+".website : "+JsonUtil.toJSONString(website)); + // 判断网站的插件是否启用 + + // 没有ajax数据,或有ajax数据但needParseData标识为true + if (StringUtils.isNotEmpty(unit.getData())) { + if (unit.dataTypeIsHtml()) { + // 预处理 + if (!preprocess(unit, parserFace,preProcessor)) { + result.getParsedata().setParsecode( + ParseResult.PARSECODE_DWONLOADFAILED); + } else { + try { + domParser.parse(unit, result); + logger.info("url:"+unit.getUrl()+",after template parser rs is " + +JsonUtil.toJSONString(result.getParsedata()) + +" the parse code is "+result.getParsedata().getParsecode()); +// + } catch (Exception e) { + logger.warn(workName + " dom parse exception, cid=" + + unit.getCid() + ", url=" + unit.getUrl(), + e); + result.getParsedata().setParsecode( + ParseResult.FAILED); + } + } + } else { + JsonParserResult jResult = parseAjaxData(unit, + result.getTaskdata(), true,jsonParser); + result.getParsedata() + .setParsecode(jResult.getParsecode()); + if (jResult.getData() != null) { + result.getParsedata().addData(jResult.getData()); + } + } + } + + logger.info("unit.hasAjaxData() :"+(unit.hasAjaxData() )); + if (unit.hasAjaxData()) { + JsonParserResult jres = parseAjaxData(unit, + result.getTaskdata(), false,jsonParser); + // 若解析DATA时没有模板,又要解析AjaxData,则认为解析Data操作为成功。 + if (result.getParsedata().getParsecode() == ParseResult.FAILED) { + result.getParsedata().setParsecode(ParseResult.SUCCESS); + } + result.getParsedata().setParsecode(jres.getParsecode()); + if (jres.getData() != null) { + result.getParsedata().addData(jres.getData()); + } + } + logger.info("url:"+unit.getUrl()+",after json parser rs is "+JsonUtil.toJSONString(result.getParsedata())); + executeReprocessTest(unit, result, parserFace,reprocessor); + logger.info("url:"+unit.getUrl()+" the parse code is "+result.getParsedata().getParsecode() + +" after reprocess rs is "+JsonUtil.toJSONString(result.getParsedata().getData())); + } catch (Exception e) { + logger.warn(workName + " increment exception, ", 
e); + } + return result; + + } + + private void executeReprocessTest(ParseUnit unit,ParseResult result + ,ParserFace parserFace,ReProcessor reprocessor){ + if (result.getParsedata().getParsecode() == ParseResult.SUCCESS) { // FIXME + ReProcessResult reprocess = reprocessor.process(unit, result, parserFace); + logger.info("after execute reprocess"); + if (reprocess != null) { + switch (reprocess.getProcesscode()) { +// case ReProcessResult.PARSE_FAILED: + case ParseResult.REPROCESS_FAILED: + logger.warn(workName + " reprocess code 2, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode( + ParseResult.REPROCESS_FAILED); + break; + case ReProcessResult.NONESAVE:// 若后处理的code为3则,不掉用保存,状态码为-2,xici的情况 + logger.info(workName + " reprocess code 3, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode( + ParseResult.REPROCESS_NONESAVE); + break; + case ReProcessResult.OFF: // 后处理下架 + logger.info(workName + + " reprocess code 4, onshelf off, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode(ParseResult.OFF_SHELF); + break; + case ReProcessResult.SUCCESS: + logger.info(workName + + " reprocess code 0, reprocess success, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode(ParseResult.SUCCESS); + break; + } + } + } + } + + public static void main(String[] args) {} +} diff --git a/src/com/bfd/parse/ParserFace.java b/src/com/bfd/parse/ParserFace.java new file mode 100644 index 0000000..81548ac --- /dev/null +++ b/src/com/bfd/parse/ParserFace.java @@ -0,0 +1,646 @@ +package com.bfd.parse; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.commons.lang.StringUtils; 
+import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.utils.DataUtil; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.crawler.utils.MyStringUtil; +import com.bfd.crawler.utils.ParserException; +import com.bfd.parse.client.ConfigClient; +import com.bfd.parse.client.DataOperatorClient; +import com.bfd.parse.client.DeduplicatorClient; +import com.bfd.parse.client.LoginManagerClient; +import com.bfd.parse.client.TraceTaskClient; +import com.bfd.parse.client.URLNormalizerClient; +import com.bfd.parse.config.FieldDefine.FieldDefineConfig; +import com.bfd.parse.config.PageDefine.PageDefineConfig; +import com.bfd.parse.config.shelf.EcConfigCache; +import com.bfd.parse.config.sitepageconfig.SitePageConfigCache; +//import com.bfd.parse.data.Website; +//import com.bfd.parse.data.WebsiteMap; +import com.bfd.parse.entity.CreateTaskEntity; +import com.bfd.parse.entity.ECConfigEntity; +import com.bfd.parse.entity.FielddefineEntity; +import com.bfd.parse.entity.SitepageconfigEntity; +import com.bfd.parse.facade.parseunit.ParseUnit; +import com.bfd.parse.json.JsonData; +import com.bfd.parse.json.JsonParserFactory; +import com.bfd.parse.json.JsonParserNotFound; +import com.bfd.parse.json.JsonParserResult; +//import com.bfd.parse.learn.LearnUnit; +import com.bfd.parse.preprocess.PreProcessor; +import com.bfd.parse.preprocess.PreProcessorFactory; +import com.bfd.parse.preprocess.PreProcessorNotFound; +import com.bfd.parse.reprocess.ReProcessResult; +import com.bfd.parse.reprocess.ReProcessor; +import com.bfd.parse.reprocess.ReProcessorFactory; +import com.bfd.parse.reprocess.ReProcessorNotFound; +import com.bfd.parse.util.ParseUtils; +import com.bfd.parse.util.TextUtil; + +//import com.bfd.parse.service.Parser; + +/** + * 调用dom解析,插件解析json,插件后处理,保存上报统计信息,保存源数据和解析数据 + * + * @author ian + * + */ +public class ParserFace { + private static final Log LOG = LogFactory.getLog(ParserFace.class); + private static final String 
WORKNAME = "worker-t"; + private static ParserFace instance = new ParserFace(WORKNAME); + private String workName; + private URLNormalizerClient normalizer; + private TraceTaskClient traceTask = new TraceTaskClient(); + // private ParseStat stater; + private DataSaver dataSaver; + + private DomParser domParser; + + public ParserFace(String workname) { + this(workname, new URLNormalizerClient(), false); + } + + public static ParserFace getInstance() { + if (instance == null) { + instance = new ParserFace(WORKNAME); + } + return instance; + } + + private ParserFace(String workName, URLNormalizerClient normalizer, + boolean test) { + this.workName = workName; + if (!test) { + // this.stater = new ParseStat(workName); + } + // this.deduplicator = new DeduplicatorClient(workName); + // this.downloader = new DownloadClient(); + + this.dataSaver = new DataSaver(workName); + this.normalizer = normalizer; + this.domParser = new DomParser(workName, normalizer); + // this.configClient = new ConfigClient(); + } + + public ParseResult parse(ParseUnit unit) { + return parse(unit, false); + } + + private boolean executePreProcessor(ParseUnit unit, ParseResult result, + SitepageconfigEntity sitePage) { + boolean flag = preprocess(unit, sitePage, result); + LOG.debug("url:" + unit.getUrl() + " preprocessor parsecode is " + + result.getParsedata().getParsecode()); + return flag; + // if (!flag) { + // TODO 如果需要登录,现在就不调用登录服务,直接返回需要登录的code就可以 + // if (unit.getPreProcessCode() == ParseResult.needLogin) { + // if (unit.getTaskdata().containsKey(Constants.userId)) { + // logger.info("cid:" + unit.getCid() + " userId:" + // + unit.getTaskdata().get(Constants.userId) + // + " need call login!"); + // // call loginManager + // lmClient.callLogin(unit.getCid().toString(), unit + // .getTaskdata().get(Constants.userId).toString()); + // } + // + // } + // else { + // TODO 这个需要修改为前处理插件失败的code + // result.getParsedata().setParsecode( + // ParseResult.preprocessor_fail); + // } + + // } + // 
return flag; + } + + private boolean executeWebPageParse(ParseUnit unit, ParseResult result + ,SitepageconfigEntity sitePageConfig) { + int templateTag = sitePageConfig.getNeedtemplate(); + Map templateRsReport = new HashMap(); + Map data = new HashMap(); + data.put("templateTag", templateTag); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.chktemplate, JsonUtils.toJSONString(data), 0); + LOG.debug("url:"+unit.getUrl()+" datatypeishtml?"+unit.dataTypeIsHtml()); + if (unit.dataTypeIsHtml()) { + try { + domParser.parse(unit, result); + LOG.debug("url:" + unit.getUrl() + + " template parse code is " + + result.getParsedata().getParsecode()); + // logger.info("url:"+unit.getUrl()+".parsers:"+JsonUtil.toJSONString(result.getParsedata().getData())); + if (result.getParsedata().getParsecode() == ParseResult.FAILED) { + templateRsReport.put(Constants.errMsg, result.getParsedata().getErrMsg()); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.parsehtml, JsonUtils.toJSONString(templateRsReport), ParseResult.FAILED); + return false; + } + } catch (Exception e) { + LOG.warn( + workName + " dom parse exception, cid=" + unit.getCid() + + ", url=" + unit.getUrl(), e); + result.getParsedata().setParsecode(ParseResult.FAILED); + templateRsReport.put(Constants.errMsg, "template parse exception"); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.parsehtml, JsonUtils.toJSONString(templateRsReport), ParseResult.FAILED); + return false; + } + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.parsehtml, "", 0); + return true; + } else { + JsonParserResult jResult = parseAjaxData(unit, + result.getTaskdata(), true); + result.getParsedata().setParsecode(jResult.getParsecode()); + LOG.debug("url:" + unit.getUrl() + " template parse code is " + + result.getParsedata().getParsecode()); + traceTask.reportTraceTask(unit, 
Constants_TraceTask.commonparser, Constants_TraceTask.parsehtml, JsonUtils.toJSONString(jResult.getData()), jResult.getParsecode()); + if (jResult.getData() != null) { + result.getParsedata().addData(jResult.getData()); + return true; + } else { + return false; + } + } + + } + + private boolean executeJsonParse(ParseUnit unit, ParseResult result, + SitepageconfigEntity entity) { + Map data = new HashMap(); + data.put("Jsonprocesstag", entity.getJsonprocesstag()); + data.put("hasajaxdata", unit.hasAjaxData()); + data.put("status", entity.getStatus()); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.chkjsonplugin, JsonUtils.toJSONString(data), 0); + // TODO:通过websiteconfig表得到是否需要json插件 + if ( entity.getStatus() == 1 + && entity.getJsonprocesstag() == 1) { + if(!unit.hasAjaxData()){ + Map traceData = new HashMap(); + traceData.put("desc", "no ajax data"); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.jsonparse, JsonUtils.toJSONString(traceData), 0); + return true; + }else{ + LOG.info("url:" + unit.getUrl() + ":need call jsonplugin!"); + JsonParserResult jres = parseAjaxData(unit, result.getTaskdata(), + false); + LOG.info("after execute parseAjaxData,url+" + unit.getUrl()); + // 若解析DATA时没有模板,又要解析AjaxData,则认为解析Data操作为成功。 + // if (result.getParsedata().getParsecode() == + // ParseResult.FAILED_NO) { + // result.getParsedata().setParsecode(ParseResult.SUCCESS); + // } + result.getParsedata().setParsecode(jres.getParsecode()); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.jsonparse, "", jres.getParsecode()); + LOG.debug("url:" + unit.getUrl() + " json parsecode is " + + result.getParsedata().getParsecode()); + if (jres.getData() != null) { + result.getParsedata().addData(jres.getData()); + } else { + return false; + } + } + + } + return true; + } + /** + * true为在架,false为下架 + * @param unit + * @return + */ + public boolean onOffShelf(ParseUnit 
unit,ParseResult result){ + ECConfigEntity ecConfig = EcConfigCache.getInstance().getECConfig(unit.getCid()); + if(ecConfig==null){ + LOG.debug("url:"+unit.getUrl()+" no ecconfig"); + return true; + } + if(ecConfig.getIscontain()!=null + &&ecConfig.getIscontain().trim().length()>0 + &&MyStringUtil.isRegexMatched(ecConfig.getIscontain(), unit.getPageData())){ + LOG.debug("url:"+unit.getUrl()+" match ecconfig iscontain"); + result.getParsedata().getData().put(Constants.errMsg, "off shelf match iscontain"); + return false; + } + if(unit.getSpiderdata().containsKey(Constants.HTTPHEADER_LOCATION) + &&unit.getSpiderdata().get(Constants.HTTPHEADER_LOCATION).toString().trim().length()>0){ + if(unit.getSpiderdata().get(Constants.HTTPHEADER_LOCATION).toString().trim().equalsIgnoreCase(ecConfig.getRedirecturl())){ + LOG.debug("url:"+unit.getUrl()+" match ecconfig redirect"); + result.getParsedata().getData().put(Constants.errMsg, "off shelf match redirect"); + return false; + } + } + return true; + } + /** + * 解析数据 + * + * @param unit + * 解析任务单元 + * @param isTest + * 是否为测试任务 + * @return + */ + public ParseResult parse(ParseUnit unit, boolean isTest) { + traceTask.reportTraceTask(unit, "commonparser", "rcvtask", "",0); + ParseResult result = ParseResult.prepareObj(unit); + + try { + + // 判断是否有html数据 + if (!StringUtils.isNotEmpty(unit.getData())) { + LOG.debug("url:" + unit.getUrl() + " no html data"); + result.getParsedata().setParsecode(ParseResult.nohtmldata); + traceTask.reportTraceTask(unit, "commonparser", Constants_TraceTask.chkhtml, "",ParseResult.nohtmldata); + return result; + } + LOG.debug("url:"+unit.getUrl()+" has html data"); + if (!TextUtil.unzipPageAndGuessEncode(unit)) { + result.getParsedata().setParsecode(ParseResult.uncompress_fail); + traceTask.reportTraceTask(unit, "commonparser", Constants_TraceTask.chkhtml, "",ParseResult.uncompress_fail); + return result; + } + LOG.debug("url:"+unit.getUrl()+" unzipandguessencode success!"); + + + 
if(!onOffShelf(unit,result)){ + LOG.info("url:"+unit.getUrl()+" off shelf"); + result.getParsedata().setParsecode(ParseResult.OFF_SHELF); + traceTask.reportTraceTask(unit, "commonparser", Constants_TraceTask.chkhtml, "",ParseResult.OFF_SHELF); + return result; + } + traceTask.reportTraceTask(unit, "commonparser", Constants_TraceTask.chkhtml, "",0); + SitepageconfigEntity sitePageConfig = SitePageConfigCache.getInstance() + .getSitePageConfig( + unit.getSiteId() + "|" + unit.getPageTypeId()); + LOG.trace("url:" + unit.getUrl() + " key: " + unit.getSiteId() + + "|" + unit.getPageTypeId() + " sitePageConfig:" + + JsonUtils.toJSONString(sitePageConfig)); + + if (!executePreProcessor(unit, result, sitePageConfig)) { + return result; + } + LOG.debug("url:"+unit.getUrl()+" preprocessor success!"); + if (!executeWebPageParse(unit, result,sitePageConfig)) { + return result; + } + LOG.debug("url:"+unit.getUrl()+" template parse success!"); + LOG.debug("url:"+unit.getUrl()+" unit.hasAjaxData() :" + (unit.hasAjaxData())); + //json plugin + if (!executeJsonParse(unit, result, sitePageConfig)) { + return result; + } + LOG.debug("url:"+unit.getUrl()+" json parse success!"); + Map chkiid = new HashMap(); + chkiid.put("iidtag", sitePageConfig.getIidtag()); + + // 从taskdata获得attr,cate;获得iid,生成tasks + if (!addExtraInfo(unit, result)) { + result.getParsedata().setParsecode(ParseResult.GETIID_FAILED); + return result; + } + LOG.debug("url:"+unit.getUrl()+" addExtraInfo success!"); + executeReprocess(unit, result, sitePageConfig); + + if(0==result.getParsedata().getParsecode()){ + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.sendresult, JsonUtils.toJSONString(result.getParsedata().getData()), 0); + + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.parsedone, "", 0); + } + + LOG.info("url is " + unit.getUrl() + " type is " + + unit.getPageType() + ".getParsecode is " + + result.getParsedata().getParsecode() + 
",cid :" + + unit.getCid()+" task over!"); + + } catch (Exception e) { + e.printStackTrace(); + LOG.warn(workName + " increment exception, ", e); + } + return result; + } + + private void executeReprocess(ParseUnit unit, ParseResult result, + SitepageconfigEntity website) { + LOG.info("url:" + unit.getUrl() + ",parsecode is " + + result.getParsedata().getParsecode() + ",plugintag :" + + website.getReprocesstag()); + Map data = new HashMap(); + data.put("Reprocesstag", website.getReprocesstag()); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.chkproplugin, JsonUtils.toJSONString(data), 0); + // TODO:改为从websiteconfig表里面得到是否需要后处理插件 + if (result.getParsedata().getParsecode() == ParseResult.SUCCESS + && website.getReprocesstag() == 1) { // FIXME + LOG.info("url:" + unit.getUrl() + " need reprocess plugin"); + ReProcessResult reprocess = reprocess(unit, result); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.reprocess, "", result.getParsedata().getParsecode()); + if (reprocess != null) { + switch (reprocess.getProcesscode()) { +// case ReProcessResult.PARSE_FAILED: + case ParseResult.REPROCESS_FAILED: + LOG.warn(workName + " reprocess code 2, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode( + ParseResult.REPROCESS_FAILED); + break; + case ReProcessResult.NONESAVE:// 若后处理的code为3则,不掉用保存,状态码为-2,xici的情况 + LOG.info(workName + " reprocess code 3, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode( + ParseResult.REPROCESS_NONESAVE); + break; + case ReProcessResult.OFF: // 后处理下架 + LOG.info(workName + + " reprocess code 4, onshelf off, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode(ParseResult.OFF_SHELF); + break; + case ReProcessResult.SUCCESS: + LOG.info(workName + + " 
reprocess code 0, reprocess success, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ", url=" + unit.getUrl()); + result.getParsedata().setParsecode(ParseResult.SUCCESS); + break; + } + } + } + } + + private boolean addExtraInfo(ParseUnit unit, ParseResult result) { + // 处理cate + final Map resData = result.getParsedata().getData(); + + // 处理attr + Map attr = (Map) unit.getAttr(); + if (attr != null && attr.size() > 0) { + // 将task带过来的attr放入解析结果 + resData.put("attr", unit.getAttr()); + } + this.domParser.handExtraArgs(unit, unit.getPageEncode() + , null, resData); + // 获取iid + if (!ParseUtils.getIid(unit, result)) { + + LOG.debug("url is " + unit.getUrl() + + ".getiid failed"); + result.getParsedata().setParsecode(ParseResult.GETIID_FAILED); + return false; + } + + return true; + } + + public static boolean urlHasNoPathAndQuery(String location) { + + return false; + } + + // TODO:得到需要生成任务的字段,现在createTask字段已经从数据库表移到了模板里面,此函数弃用 + // private List getCreateTaskField(ParseUnit unit) { + // List rs = new ArrayList(); + // Fielddefine field = + // FieldDefineConfig.getInstance().getFieldDefine(unit.getPageTypeId()); + // Fielddefine tmp = field; + // while(tmp!=null){ + // if(tmp.getCreatetask()==1){ + // rs.add(tmp); + // } + // tmp = (Fielddefine)tmp.getNext(); + // } + // + // + // // List fields = this.configClient + // // .getFieldDefineByPageTypeId(Integer.parseInt(unit.getPageType())); + // // for (Fielddefine field : fields) { + // // if (field.getCreateTask() == 1) { + // // rs.add(field); + // // } + // // } + // return rs; + // } + + // TODO:通过cid和pageTypeId得到iid规则 +// private String getIidRegex(ParseUnit unit,String pageTypeId) { +// String cacheKey = unit.getSiteId() + "|" + pageTypeId; +// Sitepageconfig config = SitePageConfigCache.getInstance() +// .getSitePageConfig(cacheKey); +// if (config == null) { +// logger.debug("url:" + cacheKey + " get config null"); +// return ""; +// } +// if (config.getIidtag() == 1) { +// 
traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.chkiidrule, "", 0); +// return config.getIidregex(); +// } else { +// logger.debug("url:" + cacheKey + " iidtag is 0"); +// return ""; +// } +// +// } + + // TODO:遍历parsedata找到所有的key,得到所有的需要生成任务的字段 + // private void getAllCreateTasks(List createTaskFieldNames, + // List tasks, Map parsedata) { + // Iterator> it = parsedata.entrySet().iterator(); + // while (it.hasNext()) { + // Entry entry = it.next(); + // if (createTaskFieldNames.contains(entry.getKey())) { + // CreateTaskEntity task = new CreateTaskEntity(); + // Map value = (Map) entry + // .getValue(); + // task.setLink(value.get("link").toString()); + // task.setLinkType(value.get("lineType").toString()); + // tasks.add(task); + // continue; + // } else { + // if (entry.getValue() instanceof Map) { + // getAllCreateTasks(createTaskFieldNames, tasks, + // (Map) entry.getValue()); + // } + // } + // } + // } + + + + + + private boolean preprocess(ParseUnit unit, SitepageconfigEntity sitePageConfig, + ParseResult result) { + // SitePageConfigEntity sitePageConfig = + // SitePageConfig.getInstance().getSitePageConfig(unit.getSiteId()+"|"+unit.getPageTypeId()+"|"+Constants.pluginType_preprocess); + Map precessdata = new HashMap(); + try { + // TODO:通过siteId,pageTypeId,pluginType来得到插件 + PreProcessor preProcessor = PreProcessorFactory.getPreProcessor( + unit.getSiteId()+"", unit.getPageTypeId()+""); + if (preProcessor != null) { + LOG.info(workName + " find preprocess plugin, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + " url:" + unit.getUrl()); + if (!preProcessor.process(unit, this)) { + precessdata.put(Constants.errMsg, "process error!"); + + if(unit.getPreProcessCode()!=0){ + result.getParsedata().setParsecode( unit.getPreProcessCode()); + }else { + result.getParsedata().setParsecode(ParseResult.preprocessor_fail); + } + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, + 
Constants_TraceTask.preprocess, JsonUtils.toJSONString(precessdata),ParseResult.preprocessor_fail); + return false; + } else { + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.preprocess, "", + 0); + return true; + } + } + precessdata.put(Constants.errMsg, "siteID or pageTypeId is null"); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.preprocess, JsonUtils.toJSONString(precessdata), + ParseResult.NOFOUND_PREPROCESSOR); + LOG.info(workName + " preprocess plugin not found, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + " url:" + + unit.getUrl()); + result.getParsedata() + .setParsecode(ParseResult.NOFOUND_PREPROCESSOR); + return false; + } + catch (PreProcessorNotFound e) { + + LOG.warn( + workName + " preprocess plugin not found, cid=" + + unit.getCid() + ", type=" + unit.getPageType(), e); + precessdata.put(Constants.errMsg, "throw preprocessorNotFound"); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.preprocess, JsonUtils.toJSONString(precessdata), + ParseResult.NOFOUND_PREPROCESSOR); + return false; + + } + catch (Exception e) { + LOG.warn( + workName + " preprocess exception, , cid=" + unit.getCid() + + ", type=" + unit.getPageType(), e); + } + precessdata.put(Constants.errMsg, "process throw exception"); + result.getParsedata().setParsecode(ParseResult.preprocessor_fail); + traceTask.reportTraceTask(unit, Constants_TraceTask.commonparser, Constants_TraceTask.preprocess, JsonUtils.toJSONString(precessdata), + ParseResult.preprocessor_fail); + return false; + } + + /** + * 调用后处理插件,进行后处理 + */ + private ReProcessResult reprocess(ParseUnit unit, ParseResult result) { + try { + // TODO:改为siteId pagetypeId pluginType得到插件 + ReProcessor reProcessor = ReProcessorFactory.getReProcessor( + unit.getSiteId()+"", unit.getPageTypeId()+""); + if (reProcessor != null) { + LOG.info(workName + " find reprocess plugin, cid=" + + unit.getCid() + ", 
type=" + unit.getPageType()); + return reProcessor.process(unit, result, this); + } + LOG.debug(workName + " reprocess plugin not found, cid=" + + unit.getCid() + ", type=" + unit.getPageType() + + ".url is " + unit.getUrl()); + } catch (ReProcessorNotFound e) { + e.printStackTrace(); + LOG.warn( + workName +" url:"+unit.getUrl()+ " reprocess plugin not found, cid=" + + unit.getCid() + ", type=" + unit.getPageType(), e); + result.getParsedata().setParsecode(ParseResult.nofound_reprocessor); + } catch (Exception e) { + e.printStackTrace(); + LOG.error("url:"+unit.getUrl()+" get reprocess error!"); + result.getParsedata().setParsecode(ParseResult.REPROCESS_FAILED); + }catch (Throwable e) { + LOG.error("url:"+unit.getUrl()+" runtime error"); + e.printStackTrace(); + result.getParsedata().setParsecode(ParseResult.REPROCESS_FAILED); + } + + return null; + } + + /** + * 调用对应插件,解析JSON数据 + */ + // TODO:这个得到json插件的方法也需要修改,siteId pagetypeId plugintype + private JsonParserResult parseAjaxData(ParseUnit unit, + Map taskData, boolean inData) { + + List dataList = TextUtil.wrapJsonData(unit, inData); + try { + return JsonParserFactory.getJsonParser(unit, inData, workName) + .parse(taskData, dataList, this.normalizer, unit); + } catch (JsonParserNotFound e) { + e.printStackTrace(); + LOG.warn( + workName + " JsonParser not found exception " + + unit.getUrl(), e); + return new JsonParserResult(ParseResult.nofound_jsonprocessor, null); + } catch (Exception e) { + e.printStackTrace(); + LOG.warn( + workName + " JsonParser execute exception " + unit.getUrl(), + e); + return new JsonParserResult(ParseResult.jsonprocess_FAILED, null); + } + + } + + public String getWorkName() { + return workName; + } + + public URLNormalizerClient getNormalizerClient() { + return normalizer; + } + +// public DataOperatorClient getDataOperatorClient() { +// return dataSaver.getDataOperatorClient(); +// } + + // public StatServiceClient getStatServiceClient() { + // return stater.getStatServiceClient(); 
+ // } + + // public DownloadClient getDownloadClient() { + // return downloader; + // } + + public DataSaver getDataSaver() { + return dataSaver; + } + + public DomParser getDomParser() { + return domParser; + } + + public static void main(String[] args) { +// boolean res = urlHasNoPathAndQuery("http://www.baidu.com;dd"); +// System.out.println(res); +// try { +// String iid = MyStringUtil.getRegexGroup("http\\://k\\.autohome\\.com\\.cn/spec/\\d+/view_(\\d+)_\\d+\\.html(\\?.+)?", "http://k.autohome.com.cn/spec/18322/view_401054_1.html?st=1&piap=0|3217||0|1|0|0|0|0|0" +// .toString(), 1); +// System.out.println(iid); +// } catch (ParserException e) { +// e.printStackTrace(); +// } + + } +} diff --git a/src/com/bfd/parse/client/AbstractClient.java b/src/com/bfd/parse/client/AbstractClient.java new file mode 100644 index 0000000..2ae9a61 --- /dev/null +++ b/src/com/bfd/parse/client/AbstractClient.java @@ -0,0 +1,172 @@ +package com.bfd.parse.client; + +import java.io.File; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.parse.Constants; + +import Ice.ObjectPrx; + + +public abstract class AbstractClient implements IceClient { + + private static final Log LOG = LogFactory.getLog(AbstractClient.class); + + private static final String cfgLocator = "bfdcloud/Locator:tcp -h 192.168.61.130 -p 62229"; + protected volatile Integer status = 0; + + protected static boolean testMode; + + static { + testMode = ConfigUtils.getInstance().getBoolProp("TestMode", false); + } + + private Ice.Communicator ic = null; + protected Ice.ObjectPrx service; + protected Ice.ObjectPrx base; + + /** + * 两种初始化,默认不设置最大消息 + * + * @param setMaxMsgSize + */ + protected AbstractClient(boolean setMaxMsgSize) { + if (init(setMaxMsgSize)) { + setSatus(STATUS_OK); + } else { + setSatus(STATUS_INITFAILED); + } + } + + /** + * 两种初始化,默认不设置最大消息 + * + * @param setMaxMsgSize + */ + protected 
AbstractClient(boolean setMaxMsgSize, boolean setMethordTimeout) { + if (init(setMaxMsgSize, setMethordTimeout)) { + setSatus(STATUS_OK); + } else { + setSatus(STATUS_INITFAILED); + } + } + + protected AbstractClient() { + if (init(false)) { + setSatus(STATUS_OK); + } else { + setSatus(STATUS_INITFAILED); + } + } + + protected boolean init() { + return init(false); + } + + protected boolean init(boolean setMaxMsgSize) { + return init(setMaxMsgSize, false); + } + + protected synchronized boolean init(boolean setMaxMsgSize, boolean setMethordTimeout) { + String iceTimeout = ConfigUtils.getInstance().getProp(Constants.iceTimeout); + if(iceTimeout==null||iceTimeout.trim().length()==0){ + ConfigUtils.getInstance().readFile(new File("../etc/crawl-config.properties")); + } + try { + if(ic!=null){ + try { + ic.destroy(); + } catch (Exception e) { + LOG.error("ic destroy error!"); + e.printStackTrace(); + } + ic=null; + this.service = null; + } + if (ic == null) { + if (ConfigUtils.getInstance().getBoolProp("Distribute.status", true)) { + Ice.Properties properties = Ice.Util.createProperties(); + properties.setProperty("Ice.Default.Locator", + ConfigUtils.getInstance().getProp("crawl.public.ice.locator", cfgLocator)); + LOG.info("locator:"+properties.getProperty("Ice.Default.Locator")); + if (setMaxMsgSize) { + properties.setProperty("Ice.MessageSizeMax", + ConfigUtils.getInstance().getProp("Ice.MessageSizeMax", "3076")); + LOG.info(name() + " service, set MessageSizeMax to " + + ConfigUtils.getInstance().getProp("Ice.MessageSizeMax", "3076")); + } + // + properties.setProperty("Ice.Override.Timeout", + ConfigUtils.getInstance().getProp(Constants.iceTimeout, "20000")); + LOG.info("Ice.Override.Timeout:"+ConfigUtils.getInstance().getProp(Constants.iceTimeout, "20000")); + if (setMethordTimeout == true) { + properties.setProperty("Ice.Override.ConnectTimeout", + ConfigUtils.getInstance().getProp("Ice.Override.ConnectTimeout", "10000")); + } + Ice.InitializationData initData = 
new Ice.InitializationData(); + initData.properties = properties; + ic = Ice.Util.initialize(initData); + } else { + ic = Ice.Util.initialize(); + } + } + ObjectPrx proxy = ic.stringToProxy(getProxyConfig()); + if (proxy == null) + return false; + base = proxy; + LOG.debug("Seting checked Service to " + name() + " service..."); + setService(checkedCast()); + LOG.info("Connected " + name() + " service. proxy=" + getProxyConfig()); + return true; + } catch (Ice.LocalException e) { + e.printStackTrace(); + } + return false; + } + + abstract protected ObjectPrx checkedCast(); + + abstract protected String getProxyConfig(); + + protected Ice.Communicator getIc() { + return ic; + } + + protected Ice.ObjectPrx getService() { + return service; + } + + public void setService(Ice.ObjectPrx service) { + this.service = service; + } + + protected Ice.ObjectPrx getBase() { + return base; + } + + @Override + public synchronized void release() { + if (ic != null) { + try { + ic.destroy(); + } catch (Exception e) { + LOG.warn(e.getMessage()); + } finally { + setSatus(STATUS_RELEASE); + ic = null; + } + } + } + + @Override + public Integer getStatus() { + return status; + } + + protected void setSatus(Integer status) { + this.status = status; + } +} diff --git a/src/com/bfd/parse/client/ConfigClient.java b/src/com/bfd/parse/client/ConfigClient.java new file mode 100644 index 0000000..b3bf0f0 --- /dev/null +++ b/src/com/bfd/parse/client/ConfigClient.java @@ -0,0 +1,94 @@ +package com.bfd.parse.client; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import Ice.ObjectPrx; + + + +//import com.bfd.crawler.BizConfigurerPrx; +//import com.bfd.crawler.BizConfigurerPrxHelper; +import com.bfd.crawler.ConfigurerPrx; +import com.bfd.crawler.ConfigurerPrxHelper; +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.crawler.utils.JsonUtils; +import 
com.bfd.parse.entity.FielddefineEntity; + +public class ConfigClient extends AbstractClient { + + private static final Log LOG = LogFactory.getLog(ConfigClient.class); + + public ConfigClient() { + super(); + } + + @Override + public String name() { + return "config_client"; + } + + @Override + protected ObjectPrx checkedCast() { +// return BizConfigurerPrxHelper.checkedCast(base); + return ConfigurerPrxHelper.checkedCast(base); + } + + @Override + protected String getProxyConfig() { + String configName = ConfigUtils.getInstance().getProp("crawl.public.ice.service.configure", "BizConfigurerService"); + return configName; + } + + @Override + public ConfigurerPrx getService() { +// ConfigurerPrx + return ((ConfigurerPrx) super.getService()); + } + + public String getConfig(String bizName, String configNames) { + try { + return getService().getConfig(bizName, configNames); + } catch (Exception e) { + LOG.warn(Thread.currentThread().getName() + " exception while calling getconfig method, Err=", e); + e.printStackTrace(); + } + long l1 = 1000, l2 = 1000, lmax = 120000, ltemp; + while (true) { + try { + Thread.sleep(l1); + ltemp = l1; + l1 = l2; + l2 = ltemp + l2; + if (l2 >= lmax) { + l1 = l2 = 1000; + } + if (init()) + return getService().getConfig(bizName, configNames); + } catch (InterruptedException x) { + } catch (Exception x) { + LOG.warn(Thread.currentThread().getName() + " exception while connect to config server: " + l1 / 1000 + + ", " + l2 / 1000 + "", x); + } + LOG.debug(Thread.currentThread().getName() + " try to connect to config server: " + l1 / 1000 + ", " + l2 + / 1000); + } + } + //TODO:通过pageTypeId得到所有的页面字段 +// public List getFieldDefineByPageTypeId(int pageTypeId){ +// List field = null; +// return field; +// } + public static void main(String[] args) { + ConfigClient client = new ConfigClient(); + String bizName = "sitePageConfig"; + Map configName = new HashMap(); + configName.put("type", "all"); + String configJson = 
JsonUtils.toJSONString(configName); + String rs = client.getConfig(bizName, configJson); + } +} diff --git a/src/com/bfd/parse/client/DataOperatorClient.java b/src/com/bfd/parse/client/DataOperatorClient.java new file mode 100644 index 0000000..6bda0d7 --- /dev/null +++ b/src/com/bfd/parse/client/DataOperatorClient.java @@ -0,0 +1,90 @@ +package com.bfd.parse.client; + +import java.io.UnsupportedEncodingException; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import Ice.ObjectPrx; + +import com.bfd.crawler.DataOperatorPrx; +import com.bfd.crawler.DataOperatorPrxHelper; +import com.bfd.crawler.utils.ConfigUtils; + +public class DataOperatorClient extends AbstractClient { + + private static final Log LOG = LogFactory.getLog(DataOperatorClient.class); + + public DataOperatorClient() { + // 设置最大参数 + super(true); + } + + @Override + protected String getProxyConfig() { + return ConfigUtils.getInstance().getProp("DataOperator.Proxy", "DataOperatorService"); + } + + @Override + protected ObjectPrx checkedCast() { + return DataOperatorPrxHelper.checkedCast(base); + } + + @Override + protected DataOperatorPrx getService() { + return ((DataOperatorPrx) super.getService()); + } + + public int saveData(String attr, String data) { + long callTimesNum =1; + if (StringUtils.isEmpty(attr) || StringUtils.isEmpty(data)) { + LOG.warn("Invalid data, attr or data is empty."); + return -1; + } + try { + return getService().saveData(attr, data); + } catch (Ice.MarshalException e) { + LOG.warn(Thread.currentThread().getName() + " ice.MarshalException while calling save data method, Err:", e); + try { + data = new String(data.getBytes("gbk"), "gbk"); + return getService().saveData(attr, data); + } catch (UnsupportedEncodingException e1) { + LOG.warn(Thread.currentThread().getName() + " exception while convert encode, Err:", e1); + return -1; + } + } catch (Exception e) { + 
LOG.warn(Thread.currentThread().getName() + " exception while calling save data method, Err:", e); + } + long l1 = 1000, l2 = 1000, lmax = 120000, ltemp; + while (true) { + LOG.info("datasaver call times is "+callTimesNum); + try { + Thread.sleep(l1); + ltemp = l1; + l1 = l2; + l2 = ltemp + l2; + if (l2 >= lmax) { + l1 = l2 = 1000; + } + if (init(true)) + return getService().saveData(attr, data); + callTimesNum++; + } catch (InterruptedException x) { + callTimesNum++; + } catch (Exception x) { + callTimesNum++; + LOG.warn(Thread.currentThread().getName() + " exception while connect to dataoperator server: " + l1 + / 1000 + ", " + l2 / 1000, x); + LOG.debug(Thread.currentThread().getName() + " save data exception, attr=" + attr); + } + LOG.debug(Thread.currentThread().getName() + " try to connect to dataoperator server: " + l1 / 1000 + ", " + + l2 / 1000); + } + } + + @Override + public String name() { + return "dataoperator"; + } +} diff --git a/src/com/bfd/parse/client/DeduplicatorClient.java b/src/com/bfd/parse/client/DeduplicatorClient.java new file mode 100644 index 0000000..cda535d --- /dev/null +++ b/src/com/bfd/parse/client/DeduplicatorClient.java @@ -0,0 +1,100 @@ +package com.bfd.parse.client; + +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import Ice.ObjectPrx; + +import com.bfd.crawler.DeduplicatorPrx; +import com.bfd.crawler.DeduplicatorPrxHelper; +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.crawler.utils.JsonUtils; + +public class DeduplicatorClient extends AbstractClient { + + private static final Log LOG = LogFactory.getLog(DeduplicatorClient.class); + private String thdName; + + public DeduplicatorClient(String workname) { + this.thdName = workname; + } + + @Override + public String name() { + return "DeduplicatorService"; + } + + @Override + protected DeduplicatorPrx getService() { + return (DeduplicatorPrx) 
super.getService(); + } + + @Override + protected ObjectPrx checkedCast() { + return DeduplicatorPrxHelper.checkedCast(base); + } + + @Override + protected String getProxyConfig() { + return ConfigUtils.getInstance().getProp("Deduplicator.Proxy", "DeduplicatorService"); + } + + public boolean duplicate(String request, boolean bAdd) { + String duplicate = isDuplicate(request, bAdd); + if (StringUtils.isNotEmpty(duplicate)) { + try { + List res = JsonUtils.parseArray(duplicate); + Integer resCode = (Integer) res.get(1); + if (resCode != null && resCode == 1) { + return true; + } + } catch (Exception e) { + LOG.warn(thdName + " exception while calling duplicate, request -> " + request, e); + } + } + return false; + } + + public String isDuplicate(String request, boolean bAdd) { + long callTimesNum=1; + if (StringUtils.isEmpty(request)) { + new NullPointerException(); + } + try { + return getService().isDuplicate(request, bAdd); + } catch (Exception e) { + LOG.warn(thdName + " exception while calling isDuplicate , err:", e); + } + long l1 = 1000, l2 = 1000, lmax = 120000, ltemp; + while (true) { + LOG.info("call deduplicator call times is "+callTimesNum); + try { + Thread.sleep(l1); + ltemp = l1; + l1 = l2; + l2 = ltemp + l2; + if (l2 >= lmax) { + l1 = l2 = 1000; + } + if (init()) + return getService().isDuplicate(request, bAdd); + callTimesNum++; + } catch (InterruptedException x) { + callTimesNum++; + } catch (Exception x) { + callTimesNum++; + LOG.warn(thdName + " exception while connect to deduplicator server: " + l1 / 1000 + ", " + l2 / 1000 + + "", x); + } + LOG.debug(Thread.currentThread().getName() + " try to connect to deduplicator server: " + l1 / 1000 + ", " + + l2 / 1000); + } + } + + public static void main(String[] args) { + String duplicate = new DeduplicatorClient("test").isDuplicate("http://www.cnbeta.com", false); + } +} diff --git a/src/com/bfd/parse/client/DownloadClient.java b/src/com/bfd/parse/client/DownloadClient.java new file mode 100644 index 
0000000..5a31844 --- /dev/null +++ b/src/com/bfd/parse/client/DownloadClient.java @@ -0,0 +1,274 @@ +package com.bfd.parse.client; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.log4j.PropertyConfigurator; + +import Ice.ObjectPrx; + +import com.bfd.crawler.DownloaderPrx; +import com.bfd.crawler.DownloaderPrxHelper; +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.crawler.utils.DataUtil; +import com.bfd.crawler.utils.EncodeUtil; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.test.weibosinaparser.WeiboParser; + +public class DownloadClient extends AbstractClient { + + private static final Log LOG = LogFactory.getLog(DownloadClient.class); + + public DownloadClient() { + super(); + } + + @Override + public String name() { + return "downloadservice"; + } + + @Override + protected ObjectPrx checkedCast() { + return DownloaderPrxHelper.checkedCast(base); + } + + @Override + protected String getProxyConfig() { + return ConfigUtils.getInstance().getProp("Downloader.Proxy", "DownloadService"); + } + + @Override + protected DownloaderPrx getService() { + return ((DownloaderPrx) super.getService()); + } + + public String getPage(String url) { + Map req = new HashMap(); + req.put("url", url); + req.put("cid", "C1905"); + url = JsonUtils.toJSONString(req); + try { + return getService().getOnePage(url); + } catch (Exception e) { + LOG.warn(Thread.currentThread().getName() + " exception while calling getPage , err:", e); + } + long l1 = 1000, l2 = 1000, lmax = 120000, ltemp; + while (true) { + try { + Thread.sleep(l1); + ltemp = l1; + l1 = l2; + l2 = ltemp + l2; + if (l2 >= lmax) { + l1 = l2 = 1000; + } + if (init()) + return getService().getOnePage(url); + } catch (InterruptedException x) { + } catch (Exception x) { + LOG.warn(Thread.currentThread().getName() + " exception while connect to downloader server: " + l1 + / 1000 + ", " + l2 / 1000 
+ "", x); + } + LOG.debug(Thread.currentThread().getName() + " try to connect to downloader server: " + l1 / 1000 + ", " + + l2 / 1000); + } + } + + public String getPageData(String url,String isAjax,String cid,String type + ,String refer,String cookie,String ip) { + try { + LOG.info(Thread.currentThread().getName()+" download client geting page, url=" + url); + + int httpcode=996 ; + int retry = 0 ; + while(httpcode == 996){ + String result = getPage(url,isAjax,cid,type,refer,cookie,ip); + Map resMap = (Map) JsonUtils.parseObject(result); + Map spider = (Map) resMap.get("spiderdata"); + httpcode = (Integer) spider.get("httpcode") ; + LOG.info(new StringBuffer().append(Thread.currentThread().getName()).append(" httcode=").append(httpcode).append(" , retry=").append(retry).append(" , url=").append(url).toString()); + if (httpcode == 200) { + String pagedata = (String) spider.get("data"); + String charset = (String) spider.get("charset"); + byte[] bytes = DataUtil.unzipAndDecode(pagedata); + String encode = EncodeUtil.getHtmlEncode(bytes, charset); + LOG.info(Thread.currentThread().getName()+" download client got page data"); + return new String(bytes, encode); + } + retry++ ; + } + } catch (Exception e) { + LOG.warn("Download page data error, ", e); + } + return null; + } + +// public String getPage(String url,String cid,String refer,String cookie) { +// Map req = new HashMap(); +// req.put("url", url); +// req.put("cid", cid); +// req.put("referer", refer); +// if(cookie!=null&&cookie.trim().length()!=0){ +// req.put("cookie", cookie); +// } +// url = JsonUtil.toJSONString(req); +// try { +// return getService().getPage(url); +// } catch (Exception e) { +// LOG.warn(Thread.currentThread().getName() + " exception while calling getPage , err:", e); +// } +// long l1 = 1000, l2 = 1000, lmax = 120000, ltemp; +// while (true) { +// try { +// Thread.sleep(l1); +// ltemp = l1; +// l1 = l2; +// l2 = ltemp + l2; +// if (l2 >= lmax) { +// l1 = l2 = 1000; +// } +//// 
destroy() ; +// init() ; +// return getService().getPage(url); +// } catch (InterruptedException x) { +// } catch (Exception x) { +// LOG.warn(Thread.currentThread().getName() + " exception while connect to downloader server: " + l1 +// / 1000 + ", " + l2 / 1000 + "", x); +// } +// LOG.debug(Thread.currentThread().getName() + " try to connect to downloader server: " + l1 / 1000 + ", " +// + l2 / 1000); +// } +// } + /** + * 1.url;2.jsajax;3.cid;4.type(item,list);5.refer;6.cookie,7.ip,8.ajax_pagefield,9.ajaxext + * @param URL + * @param isajax + * @param cid + * @param type + * @return + */ + public String getPage(Object... params) { + if(params==null||params.length==0){ + return ""; + } + + String[] paramKey = {"url","needajaxdata","cid","type","refer","cookie","pagetype","siteid","ip","ajax_page_field","ajaxext"}; + + Map req = new HashMap(); + Map accdata = new HashMap(); +// accdata.put("loginip", ""); + accdata.put("userid", "13412022647"); + accdata.put("code", "0"); + accdata.put("siteid", "35"); + for(int i=0;i= lmax) { + l1 = l2 = 1000; + } + if (init()) + return getService().getOnePage(requestJson); + } catch (InterruptedException x) { + } catch (Exception x) { + LOG.warn(Thread.currentThread().getName() + " exception while connect to downloader server: " + l1 + / 1000 + ", " + l2 / 1000 + "", x); + } + LOG.debug(Thread.currentThread().getName() + " try to connect to downloader server: " + l1 / 1000 + ", " + + l2 / 1000); + } + } + + /** + * 压缩编码后的页面内容 + * + * @param testurl + * @return + */ + @SuppressWarnings("unchecked") + public String getPageData(String url) { + try { + LOG.info("download client geting page, url=" + url); + String result = getPage(url); + LOG.info("download client got page data"); + Map resMap = (Map) JsonUtils.parseObject(result); + Map spider = (Map) resMap.get("spiderdata"); + if ((Integer) spider.get("httpcode") == 200) { + String pagedata = (String) spider.get("data"); + String charset = (String) spider.get("charset"); + byte[] 
bytes = DataUtil.unzipAndDecode(pagedata); + String encode = EncodeUtil.getHtmlEncode(bytes, charset); + return new String(bytes, encode); + } + } catch (Exception e) { + LOG.warn("Download page data error, ", e); + } + return null; + } + + public static void main(String[] args) { + PropertyConfigurator.configure("log4j.properties"); + String url0 = "http://weixin.sogou.com/weixin?query=%E5%8D%8E%E4%B8%BA&fr=sgsearch&type=2&ie=utf8&w=01019900&sut=1918&sst0=1441706539107&lkt=0%2C0%2C0"; + String url = "http://weixin.sogou.com/websearch/art.jsp?sg=CBf80b2xkgZdlrlP83m3xgEMeacTVcpEF8KWIvA4_n51cyNNyejYkZHXbnWRKDp6gGRCdYYSc_4rujf9h8Jhg4Y7sr8Rlk4YwfqCEUR7aTQAcHp4KQJgrM0bHQX1RR6d&url=p0OVDH8R4SHyUySb8E88hkJm8GF_McJfBfynRTbN8whQJg_0WPiWa9jGpcKi4811RD7iN2JSGUvIJyoW3Cfu5mQ3JxMQ3374wPGHHcB_Q_aS2cSBxZvpt-257UkqmhpJPRQzELLiAppYy-5x5In7jJFmExjqCxhpkyjFvwP6PuGcQ64lGQ2ZDMuqxplQrsbk"; + DownloadClient client = new DownloadClient(); +// String url = "http://weibo.com/3733532417/CpgJI89RK?type=comment"; + + String stringZip0 = client.getPage(url0, "1", "sina", "test", "", + "", "weibo", "35"); + + String stringZip = client.getPage(url, "1", "sina", "test", "", + "", "weibo", "35"); + + Map resMap = null; + try { + resMap = (Map) JsonUtils.parseObject(stringZip); + } catch (Exception e1) { + e1.printStackTrace(); + } + Map spider = (Map) resMap + .get("spiderdata"); + + String html = WeiboParser.getHtml(spider); + System.out.println(html); +// Pattern p = Pattern.compile("iframeContent = ([\\s\\S]*?)\""); +// Matcher m = p.matcher(page); +// String fragment = ""; +// while(m.find()){ +// fragment = m.group(1); +// } +// try { +// fragment = URLDecoder.decode(fragment, "utf-8"); +// } catch (UnsupportedEncodingException e) { +// e.printStackTrace(); +// } + } +} diff --git a/src/com/bfd/parse/client/DownloadClientTest.java b/src/com/bfd/parse/client/DownloadClientTest.java new file mode 100644 index 0000000..7e12776 --- /dev/null +++ b/src/com/bfd/parse/client/DownloadClientTest.java @@ 
-0,0 +1,40 @@ +package com.bfd.parse.client; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import Ice.ObjectPrx; + +import com.bfd.crawler.DownloaderPrx; +import com.bfd.crawler.DownloaderPrxHelper; +import com.bfd.crawler.utils.JsonUtils; + +public class DownloadClientTest { + private static final Log LOG = LogFactory.getLog(DownloadClientTest.class); + private static final String cfgLocator = "bfdcloud/Locator:tcp -h 192.168.61.129 -p 9099"; + + public static void main(String[] args) { + Ice.Communicator __ic = null; + Ice.Properties properties = Ice.Util.createProperties(); + properties.setProperty("Ice.Default.Locator", cfgLocator); + properties.setProperty("Ice.Override.Timeout", "2000"); + + Ice.InitializationData initData = new Ice.InitializationData(); + initData.properties = properties; + __ic = Ice.Util.initialize(initData); + + ObjectPrx proxy = __ic.stringToProxy("DownonepageService"); + if (proxy == null){ + return ; + } + + DownloaderPrx download = DownloaderPrxHelper.checkedCast(proxy); + Map map = new HashMap(); + map.put("cid", "test"); + map.put("url", "http://blog.csdn.net/moxiaomomo/article/details/6769316"); + String html = download.getOnePage(JsonUtils.toJSONString(map)); + } +} diff --git a/src/com/bfd/parse/client/IceClient.java b/src/com/bfd/parse/client/IceClient.java new file mode 100644 index 0000000..8d11dfd --- /dev/null +++ b/src/com/bfd/parse/client/IceClient.java @@ -0,0 +1,19 @@ +package com.bfd.parse.client; + +public interface IceClient { + + public static final int STATUS_OK = 0; + + public static final int STATUS_INITFAILED = 1; + + public static final int STATUS_REFUSE = 2; + + public static final int STATUS_RELEASE = 3; + + public void release(); + + public Integer getStatus(); + + public String name(); + +} diff --git a/src/com/bfd/parse/client/LoginManagerClient.java b/src/com/bfd/parse/client/LoginManagerClient.java new file 
mode 100644 index 0000000..5edaee1 --- /dev/null +++ b/src/com/bfd/parse/client/LoginManagerClient.java @@ -0,0 +1,47 @@ +package com.bfd.parse.client; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.DownloaderPrx; +import com.bfd.crawler.LoginManagerService; +import com.bfd.crawler.LoginManagerServicePrx; +import com.bfd.crawler.LoginManagerServicePrxHelper; +import com.bfd.parse.ParserFace; + +import Ice.ObjectPrx; + +public class LoginManagerClient extends AbstractClient { + private static final Log logger = LogFactory.getLog(LoginManagerClient.class); + @Override + public String name() { + + return "LoginManagerService"; + } + + @Override + protected ObjectPrx checkedCast() { + + return LoginManagerServicePrxHelper.checkedCast(base); + } + + @Override + protected String getProxyConfig() { + + return "LoginManagerService"; + } + + @Override + protected LoginManagerServicePrx getService() { + return ((LoginManagerServicePrx) super.getService()); + } + + public void callLogin(String cid,String userId){ + getService().releaseLoginInfo(cid, userId, "invalid"); + } + + public static void main(String[] args) { + LoginManagerClient client = new LoginManagerClient(); + String rs = client.getService().getLoginInfo("sina"); + } +} diff --git a/src/com/bfd/parse/client/TraceTaskClient.java b/src/com/bfd/parse/client/TraceTaskClient.java new file mode 100644 index 0000000..c8ae784 --- /dev/null +++ b/src/com/bfd/parse/client/TraceTaskClient.java @@ -0,0 +1,87 @@ +package com.bfd.parse.client; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import Ice.ObjectPrx; + +import com.bfd.crawler.TraceManagePrx; +import com.bfd.crawler.TraceManagePrxHelper; +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.Constants; +import com.bfd.parse.facade.parseunit.ParseUnit; + 
+public class TraceTaskClient { + private static final Log logger = LogFactory.getLog(TraceTaskClient.class); + public static int maxTryCount = 3; + + public void reportData(String taskId,String json){ + int i=0; + while(i data = new HashMap(); + data.put("siteid", unit.getSiteId()); + data.put("pagetypeid", unit.getPageTypeId()); + data.put("parenttaskid", 0); + data.put("host", Constants.host); + data.put("eventname", eventname); + data.put("eventcode", eventcode); + data.put("eventdata", enentdata); + data.put("servicename", serviceName); + logger.debug("url:"+unit.getUrl()+" report data :"+JsonUtils.toJSONString(data)); + reportData(unit.getTaskId(), JsonUtils.toJSONString(data)); + } + + public void reportTraceTask(Map unit, String serviceName, String eventname, String enentdata, int eventcode) { + + if (!unit.containsKey("traceflag") || !unit.containsKey(Constants.taskid)) { + return; + } + int trace = Integer.parseInt(unit.get("traceflag").toString()); + // 如果不是染色任务,就不上报数据。 + if (trace != 1) { + return; + } + Map data = new HashMap(); + data.put("siteid", unit.get("siteid")); + data.put("pagetypeid", 0); + data.put("parenttaskid", 0); + data.put("host", Constants.host); + data.put("eventname", eventname); + data.put("eventcode", eventcode); + data.put("eventdata", enentdata); + data.put("servicename", serviceName); + logger.debug("url:" + unit.get("url") + " report data :" + JsonUtils.toJSONString(data)); + reportData(unit.get(Constants.taskid).toString(), JsonUtils.toJSONString(data)); + } + + + public static void main(String[] args) { + TraceTaskClient client = new TraceTaskClient(); + client.reportData("123", "json"); + + } + +} diff --git a/src/com/bfd/parse/client/URLNormalizerClient.java b/src/com/bfd/parse/client/URLNormalizerClient.java new file mode 100644 index 0000000..2103042 --- /dev/null +++ b/src/com/bfd/parse/client/URLNormalizerClient.java @@ -0,0 +1,163 @@ +package com.bfd.parse.client; + +import java.net.URL; +import java.util.HashMap; 
+import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.log4j.PropertyConfigurator; + +import Ice.ObjectPrx; + +import com.bfd.crawler.UrlHandlePrx; +import com.bfd.crawler.UrlHandlePrxHelper; +import com.bfd.crawler.utils.ConfigUtils; +import com.bfd.crawler.utils.JsonUtils; + +public class URLNormalizerClient { + + private static final Log LOG = LogFactory.getLog(URLNormalizerClient.class); + + private static final Pattern imgFilerRegex = Pattern + .compile( + "^\\s*https?://([-0-9a-z]+\\.)+(com|org|net|[a-z]{2}|firm|store|Web|arts|rec|info|nom|edu)(\\.|:\\d+)?(/[^\\s'\"]*)?\\.(BMP|JPG|JPEG|PNG|GIF)(?![a-z])\\s*$", + Pattern.CASE_INSENSITIVE); + + private static final Pattern urlRegex = Pattern + .compile( + "^\\s*https?://([-0-9a-z]+\\.)+(com|org|net|[a-z]{2}|firm|store|Web|arts|rec|info|nom|edu)(\\.|:\\d+)?(/[^\\s'\"]*)?\\s*$", + Pattern.CASE_INSENSITIVE); + + public URLNormalizerClient() { + super(); + } + + + public String normalize(String bizName, String type, String url, String baseurl, boolean bNormalize) { + return this.normalize(bizName, type, url, baseurl, bNormalize, false); + } + + public String normalize(String bizName, String type, String url, String baseurl, boolean bNormalize, + boolean filteImg) { + // if (testMode) { + // return url; + // } + if (StringUtils.isBlank(url)) { + LOG.warn(Thread.currentThread().getName() + " Invalid url while normalizing, url=" + url + ", cid=" + + bizName + ", baseurl=" + baseurl); + return ""; + } + Map resMap = normalizeExt(bizName, type, url, baseurl, bNormalize, filteImg); + Integer code = (Integer) resMap.get("code"); + if (code == null || (code != 0 && code != 3)) { + return ""; + } + return (String) resMap.get("url"); + } + + public Map normalizeExt(String bizName, String type, String url, String baseurl, + boolean bNormalize, boolean filteImg) { + String 
res = normalizeE(bizName, type, url, baseurl, bNormalize, filteImg); + Map resMap = null; + try { + resMap = (Map) JsonUtils.parseObject(res); + } catch (Exception e) { + e.printStackTrace(); + } + if (resMap == null) { + resMap = new HashMap(); + resMap.put("code", -1); + } + Integer code = (Integer) resMap.get("code"); + if (code == null || (code != 0 && code != 3)) { + return resMap; + } + String resUrl = (String) resMap.get("url"); + boolean isMatch = true; + if (filteImg) + isMatch = imgFilerRegex.matcher(resUrl).find(); + else + isMatch = urlRegex.matcher(resUrl).find(); + resMap.put("url", isMatch ? resUrl.trim() : ""); + return resMap; + } + + public String normalizeE(String bizName, String type, String url, String baseurl, boolean bNormalize, + boolean filteImg) { + try { + String fallUrl = getFallUrl(baseurl,url); + Map map = new HashMap(); + map.put("code", 0); + map.put("url", fallUrl); + return JsonUtils.toJSONString(map); + } catch (Exception e) { + LOG.warn(Thread.currentThread().getName() + " exception while calling normalize ,url=" + url + ", cid=" + + bizName + ", baseurl=" + baseurl, e); + } + return ""; + } + + public static String getFallUrl(String parentURL, String url) { + + if (url == null) { + return ""; + } + + if ("".equals(url)) { + return ""; + } + if (url.toLowerCase().startsWith("javascript")) { + return ""; + } + URL u = null; + try { + + if (url.startsWith("?")) { // 子 HREF 值以 ? 开头 + if (parentURL.indexOf("?") == -1) { + u = new URL(parentURL + url); + } else if (parentURL.indexOf("?") != -1) { + u = new URL(parentURL.substring(0, parentURL.indexOf("?")) + url); + } + } else if (url.startsWith("../")) { // 提取的 URL 值. 
+ if (StringUtils.countMatches(parentURL, "/") == 2) { + u = new URL(new URL(parentURL), "/" + url.replace("../", "")); + } else if (StringUtils.countMatches(parentURL, "/") == 3) { + u = new URL(new URL(parentURL), url.replace("../", "")); + } else { + u = new URL(new URL(parentURL), url); + } + } else { + u = new URL(new URL(parentURL), url); + } + + String link = u.toExternalForm(); + if (link.startsWith("https://")) { // 不支持 https:// + return ""; + } + } catch (Exception e) { + + } + + if (u == null) { + return ""; + } + String fallUrl = u.toString(); + if (fallUrl.contains("../")) { + fallUrl = fallUrl.replace("../", ""); + } + return fallUrl; + } + + public static void main(String[] args) { + PropertyConfigurator.configure("log4j.properties"); + String url = "/dggde/6.jpg"; + String base = "http://www.jiazhao.com/news18156"; + URLNormalizerClient normalizer = new URLNormalizerClient(); + String normalize = normalizer.normalize("Cmaibaobao", "_img", url, base, true,true); +// normalize(cid, type + "_img", srcLink, this.url, true, FilteImg); + System.out.println(normalize); + } +} diff --git a/src/com/bfd/parse/config/AConfig.java b/src/com/bfd/parse/config/AConfig.java new file mode 100644 index 0000000..004c629 --- /dev/null +++ b/src/com/bfd/parse/config/AConfig.java @@ -0,0 +1,109 @@ +package com.bfd.parse.config; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.ParserFace; +import com.bfd.parse.client.ConfigClient; +import com.bfd.parse.entity.BaseEntity; +import com.bfd.parse.util.JsonUtil; + +public abstract class AConfig implements Config { + private static final Log logger = LogFactory.getLog(AConfig.class); + protected ConfigClient configService = new ConfigClient(); 
+ protected Map cache = new HashMap(); + private ReadWriteLock rwLock = new ReentrantReadWriteLock(true); + protected ReadWriteLock rwLockPD = new ReentrantReadWriteLock(true); + protected Map pagedefinenameIdMap = new HashMap(); + public abstract String getTableName(); + public abstract BaseEntity getObjByMap(Map map); + + + public AConfig() { + super(); + requestConfig(); + } + @Override + public boolean requestConfig() { + logger.info("execute requestConfig tablename is "+getTableName()); + rwLock.writeLock().lock(); + String config = configService.getConfig(getTableName(), REQUEST_ALL); + logger.info("requestconfig "+getTableName()+" config is :"+config); + if (config == null) { + rwLock.writeLock().unlock(); + return false; + } + this.cache.clear(); + Map map; + try { + map = (Map) JsonUtils.parseObject(config); + List data = (List) map.get("data"); +// List list = new ArrayList(); + for (Map tempMap : data) { + BaseEntity o = getObjByMap(tempMap); + //TODO这个有效性判断,在getObjByMap实现,如果失效就返回null + if (o!=null) { + //针对一个cachekey有多个记录的情况,这是结果是一个链表。 + //比如模板:一个siteId 一个pageTypeId可能又多个模板 + BaseEntity firstObj = this.cache.get(o.getCacheKey()); + if(firstObj!=null){ +// getLastEntity(this.cache.get(o.getCacheKey())).setNext(o); + //domtemplate 通过template来增加模板 + firstObj.addEntityToLast(o); + }else{ + + cache.put(o.getCacheKey(), o); + + } + + } + } + logger.debug("table:"+getTableName()+",cache size is " + +this.cache.size()+".cache is "+JsonUtils.toJSONString(this.cache)); + } catch (Exception e) { + e.printStackTrace(); +// rwLock.writeLock().unlock(); + return false; + }finally{ + rwLock.writeLock().unlock(); + } + + return true; + } + + public Object searchByCacheKey(String cacheKey){ + Object rs = null; + try { + rwLock.readLock().lock(); + rs = this.cache.get(cacheKey); + } catch (Exception e) { + e.printStackTrace(); + }finally{ + rwLock.readLock().unlock(); + } + if(rs==null){ + logger.info("searchbycachekey return null cache is 
"+JsonUtils.toJSONString(this.cache)); + } + return rs; + + } + + + @Override + public String name() { + return null; + } + + @Override + public Map getData() { + return this.cache; + } + +} diff --git a/src/com/bfd/parse/config/Config.java b/src/com/bfd/parse/config/Config.java new file mode 100644 index 0000000..f559885 --- /dev/null +++ b/src/com/bfd/parse/config/Config.java @@ -0,0 +1,13 @@ +package com.bfd.parse.config; + +import java.util.Map; + +import com.bfd.parse.entity.BaseEntity; + +public interface Config { + public static final String REQUEST_ALL = "{\"type\":\"ALL\"}";; + boolean requestConfig(); + + String name(); + Map getData(); +} diff --git a/src/com/bfd/parse/config/ConfigFactory.java b/src/com/bfd/parse/config/ConfigFactory.java new file mode 100644 index 0000000..f64d181 --- /dev/null +++ b/src/com/bfd/parse/config/ConfigFactory.java @@ -0,0 +1,37 @@ +package com.bfd.parse.config; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.parse.config.PageDefine.PageDefineConfig; +import com.bfd.parse.config.dom.DomConfig; +import com.bfd.parse.config.iid.ParseConfigure; +import com.bfd.parse.config.parseplugin.PluginConfig; +import com.bfd.parse.config.shelf.JudgeRuleConfig; +import com.bfd.parse.config.sitepageconfig.SitePageConfigCache; +import com.bfd.parse.config.website.WebsiteCache; + +public class ConfigFactory { + private static final Log log = LogFactory.getLog(ConfigFactory.class); + public static Config getInstance(String tableName){ + Config config = null ; + if("parsetemplate".equals(tableName)){ + config = DomConfig.getInstance(); + }else if("parseplugin".equals(tableName)){ + config = PluginConfig.getInstance(); + }else if("sitepageconfig".equals(tableName)){ + config = SitePageConfigCache.getInstance(); + }else if("website".equals(tableName)){ + config = WebsiteCache.getInstance(); + + }else if("pagedefine".equals(tableName)){ + config = PageDefineConfig.getInstance(); + } + 
else{ + log.error("tablename:"+tableName+" no get config to reload "); + return null; + } + log.debug("tableName : "+tableName + " return "+config.getClass().getName()); + return config; + } +} diff --git a/src/com/bfd/parse/config/ConfigLoader.java b/src/com/bfd/parse/config/ConfigLoader.java new file mode 100644 index 0000000..7646c5b --- /dev/null +++ b/src/com/bfd/parse/config/ConfigLoader.java @@ -0,0 +1,37 @@ +package com.bfd.parse.config; + +import java.util.HashMap; +import java.util.Map; + +import com.bfd.parse.config.dom.DomConfig; +import com.bfd.parse.config.parseplugin.PluginConfig; +import com.bfd.parse.config.shelf.JudgeRuleConfig; +import com.bfd.parse.config.website.AutoRuleWrap; + +public class ConfigLoader { + + private static Map configMrgDict; + static { + configMrgDict = new HashMap(); + configMrgDict.put("tmpl", DomConfig.getInstance()); + configMrgDict.put("plugin", PluginConfig.getInstance()); + configMrgDict.put("rule", JudgeRuleConfig.getInstance()); + configMrgDict.put("autorule", AutoRuleWrap.getInstance()); + } + + public static boolean load(Map reqMap) { + String configType = (String) reqMap.get("config");// tmpl,plugin,rule + Config config = configMrgDict.get(configType); + if (config instanceof DomConfig && !"all".equals((String) reqMap.get("cid"))) { + String cid = (String) reqMap.get("cid"); // all , Czouxiu, doit + String type = null; + if (reqMap.containsKey("type")) { + type = (String) reqMap.get("type"); // list, info, item + } + DomConfig domConfig = (DomConfig) config; + return domConfig.requestConfig(cid, type); + } else { + return config.requestConfig(); + } + } +} diff --git a/src/com/bfd/parse/config/FieldDefine/FieldDefineConfig.java b/src/com/bfd/parse/config/FieldDefine/FieldDefineConfig.java new file mode 100644 index 0000000..ea5e61a --- /dev/null +++ b/src/com/bfd/parse/config/FieldDefine/FieldDefineConfig.java @@ -0,0 +1,40 @@ +package com.bfd.parse.config.FieldDefine; + +import java.util.Map; + +import 
com.bfd.crawler.utils.JacksonUtils; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.config.AConfig; +import com.bfd.parse.config.PageDefine.PageDefineConfig; +import com.bfd.parse.entity.BaseEntity; +import com.bfd.parse.entity.FielddefineEntity; + +public class FieldDefineConfig extends AConfig { + private static FieldDefineConfig instance; + private FieldDefineConfig(){ + + } + public FielddefineEntity getFieldDefine(String cacheKey){ + return (FielddefineEntity)this.searchByCacheKey(cacheKey); + } + public static FieldDefineConfig getInstance() { + if (instance == null) { + synchronized (FieldDefineConfig.class) { + if (instance == null) { + instance = new FieldDefineConfig(); + } + } + } + return instance; + } + @Override + public String getTableName() { + return "FieldDefine"; + } + + @Override + public BaseEntity getObjByMap(Map map) { + return JacksonUtils.extractObject(JsonUtils.toJSONString(map), FielddefineEntity.class); + } + +} diff --git a/src/com/bfd/parse/config/PageDefine/PageDefineConfig.java b/src/com/bfd/parse/config/PageDefine/PageDefineConfig.java new file mode 100644 index 0000000..53adec8 --- /dev/null +++ b/src/com/bfd/parse/config/PageDefine/PageDefineConfig.java @@ -0,0 +1,93 @@ +package com.bfd.parse.config.PageDefine; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.bfd.crawler.utils.JacksonUtils; +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.config.AConfig; +import com.bfd.parse.entity.BaseEntity; +import com.bfd.parse.entity.PagedefineEntity; + +public class PageDefineConfig extends AConfig { + private static PageDefineConfig instance; +// private ReadWriteLock rwLockPD = new ReentrantReadWriteLock(true); + private static final Log 
logger = LogFactory.getLog(AConfig.class); + + private PageDefineConfig(){ + + } + public static PageDefineConfig getInstance() { + if (instance == null) { + synchronized (PageDefineConfig.class) { + if (instance == null) { + instance = new PageDefineConfig(); + } + } + } + return instance; + } + + @Override + public boolean requestConfig() { + boolean rs = super.requestConfig(); + rwLockPD.writeLock().lock(); + try { + Iterator> it = this.getData().entrySet().iterator(); + while(it.hasNext()){ + Entry entry = it.next(); + PagedefineEntity pd = (PagedefineEntity)entry.getValue(); + this.pagedefinenameIdMap.put(pd.getPagenameen(), pd.getPagetypeid()); + } + } catch (Exception e) { + e.printStackTrace(); + }finally{ + rwLockPD.writeLock().unlock(); + } + logger.info("nameIdMap:"+JsonUtils.toJSONString(pagedefinenameIdMap)); + return rs; + } + public PagedefineEntity getPageDefine(String cacheKey) { + Object o = this.searchByCacheKey(cacheKey); + return (PagedefineEntity)o; + } + + public String getIdByName(String name){ + if(name==null){ + return null; + } +// return this.nameIdMap.get(name).toString(); + + String rs = ""; + try { + rwLockPD.readLock().lock(); + rs = this.pagedefinenameIdMap.get(name.trim()).toString(); + } catch (Exception e) { + e.printStackTrace(); + }finally{ + rwLockPD.readLock().unlock(); + } +// logger.info("getIdByName nameIdMap:"+JsonUtils.toJSONString(pagedefinenameIdMap)+",param:"+name+",rs:"+rs); + return rs; + } + + @Override + public String getTableName() { + + return "PageDefine"; + } + + @Override + public BaseEntity getObjByMap(Map map) { + + return JacksonUtils.extractObject(JsonUtils.toJSONString(map), PagedefineEntity.class); + } + +} diff --git a/src/com/bfd/parse/config/dom/DomCFGBlock.java b/src/com/bfd/parse/config/dom/DomCFGBlock.java new file mode 100644 index 0000000..6424cfd --- /dev/null +++ b/src/com/bfd/parse/config/dom/DomCFGBlock.java @@ -0,0 +1,133 @@ +package com.bfd.parse.config.dom; + +import java.util.ArrayList; 
+import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.w3c.dom.Node; + +public class DomCFGBlock extends DomCFGTree implements DomCFGElement { + private static final Log LOG = LogFactory.getLog(DomCFGBlock.class); + + private String[] path; + private String name; + + private List fields; + private List children; + private List blocks; + + // 是否为imglist(如taobao的imglist) + private boolean isImglist = false; + + public DomCFGBlock(Node node) { + String mName = node.getNodeName(); + if (!BLOCK_NAME.equalsIgnoreCase(mName)) { + return; + } + + fields = new ArrayList(); + children = new ArrayList(); + blocks = new ArrayList(); + name = DomSearch.getNodeAttr(node, BLOCK_ATTR_NAME, ""); + if ("imglist".equalsIgnoreCase(name)) { + isImglist = true; + } + + String stmp = DomSearch.getNodeAttr(node, BLOCK_ATTR_PATH, ""); + if (stmp.length() > 0) { + path = stmp.split("/"); + } + + Node cld = node.getFirstChild(); + while (cld != null) { + String childName = cld.getNodeName(); + if (NODE_NAME.equalsIgnoreCase(childName)) { + children.add(new DomCFGTree(cld)); // 子树 + } else if (FIELD_NAME.equalsIgnoreCase(childName)) { // 字段 + fields.add(new DomCFGField(cld)); + } else if (BLOCK_NAME.equalsIgnoreCase(childName)) { + blocks.add(new DomCFGBlock(cld)); // block + } + cld = cld.getNextSibling(); + } + } + + public String[] getTreePath() { + return path; + } + + public void setTreePath(String[] treePath) { + this.path = treePath; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getFields() { + return fields; + } + + public void setFields(List field) { + this.fields = field; + } + + public List getBlocks() { + return blocks; + } + + public void setBlocks(List blocks) { + this.blocks = blocks; + } + + public List getChildren() { + return children; + } + + public void setChildren(List child) { + this.children = child; + } + + public 
String[] getPath() { + return path; + } + + public void setPath(String[] path) { + this.path = path; + } + + public boolean isImglist() { + return isImglist; + } + + public String toString() { + StringBuilder sbBuilder = new StringBuilder(); + if (path == null) { + sbBuilder.append("No path"); + } else { + for (int i = 0; i < path.length; i++) { + if (i > 0) + sbBuilder.append("/").append(path[i]); + else + sbBuilder.append("path=").append(path[i]); + } + } + for (int i = 0; i < fields.size(); i++) { + sbBuilder.append(",").append("field-").append(i).append(":").append(fields.get(i).toString()); + } + for (int i = 0; i < blocks.size(); i++) { + // TODO + } + for (int i = 0; i < children.size(); i++) { + // TODO + } + sbBuilder.append(",").append("name=").append(name); + return sbBuilder.toString(); + } + +} \ No newline at end of file diff --git a/src/com/bfd/parse/config/dom/DomCFGElement.java b/src/com/bfd/parse/config/dom/DomCFGElement.java new file mode 100644 index 0000000..6696546 --- /dev/null +++ b/src/com/bfd/parse/config/dom/DomCFGElement.java @@ -0,0 +1,45 @@ +package com.bfd.parse.config.dom; + +public interface DomCFGElement { + + public static final String ROOT_NAME = "tmpl"; + + public static final String NODE_NAME = "node"; + + public static final String NODE_ATTR_TYPE = "type"; + + public static final String NODE_ATTR_PATH = "path"; + + public static final String BLOCK_NAME = "block"; + + public static final String BLOCK_ATTR_NAME = "name"; + + public static final String BLOCK_ATTR_PATH = "path"; + + public static final String FIELD_NAME = "fld"; + + public static final String FILED_ATTR_NAME = "name"; + + public static final String FILED_ATTR_EXT = "ext"; + + public static final String FILED_ATTR_LINK = "link"; + + public static final String create_task = "createtask"; + + public static final String FILED_ATTR_NEEDIMG = "needImg"; + + public static final String FILED_ATTR_IMG = "isImg"; + + public static final String FILED_ATTR_HTML = "html"; + + 
public static final String FILED_ATTR_PATH = "path"; + + public static final String FILED_ATTR_LINKTYPE = "linktype"; + + public static final String FILED_ATTR_MULTI = "multi"; + + public static final String FILED_ATTR_MULTILEVEL = "multiLevel"; + + public static final String FILED_ATTR_SEGM = "segflag"; + +} diff --git a/src/com/bfd/parse/config/dom/DomCFGField.java b/src/com/bfd/parse/config/dom/DomCFGField.java new file mode 100644 index 0000000..2e4f584 --- /dev/null +++ b/src/com/bfd/parse/config/dom/DomCFGField.java @@ -0,0 +1,220 @@ +package com.bfd.parse.config.dom; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Node; + +import com.bfd.parse.DomParser; + +public class DomCFGField implements DomCFGElement { + + private static final Log LOG = LogFactory.getLog(DomCFGField.class); + + private String name; // 字段名称 + private String[] path; // 字段路径,多级=[div@@conlist@1] + private String extFilter; // 匹配结果的附加处理,正则表达式 + + boolean isMulti; // 多匹配结果,多匹配结果时,会查询多个同级节点的值 + int multiLevel; // 多匹配结果时,多匹配结果的开始级别 + + boolean isUseIndex; // 是否使用索引进行匹配,缺省是否,即在名称匹配的情况下,首先通过id,class进行匹配,失败时才通过index进行匹配 + + private boolean isLink; // 链接地址 + private String linkType; // 链接类型 + private boolean needImg = false; // 是否收集图片 + private boolean isImg = false; + private String attName; + private boolean html = false; // 是否为获取html + private String linkAtt; + private boolean rmHead = false; + private boolean rmTail = false; + private String segflag = null; + private boolean needSegm = false; + private String fieldType; + private String createTask; + + + public String getCreateTask() { + return createTask; + } + + public void setCreateTask(String createTask) { + this.createTask = createTask; + } + + public String getFieldType() { + return fieldType; + } + + public void setFieldType(String fieldType) { + this.fieldType = fieldType; + } + + public 
DomCFGField(Node node) { + init(node); + } + + public void init(Node node) { + String sName = node.getNodeName(); + if (!FIELD_NAME.equalsIgnoreCase(sName)) { + return; + } + name = DomSearch.getNodeAttr(node, FILED_ATTR_NAME, ""); // 字段名称 + String stmp = DomSearch.getNodeAttr(node, FILED_ATTR_MULTI, ""); // 是否多匹配字段 + if (stmp.equalsIgnoreCase("true")) { + isMulti = true; + stmp = DomSearch.getNodeAttr(node, FILED_ATTR_MULTILEVEL, "0"); + try { + multiLevel = Integer.parseInt(stmp); // 多匹配开始级别 + } catch (NumberFormatException x) { + multiLevel = -1; + } + } else { + isMulti = false; + multiLevel = -1; + } + extFilter = DomSearch.getNodeAttr(node, FILED_ATTR_EXT, ""); // 附加处理 + stmp = DomSearch.getNodeAttr(node, FILED_ATTR_PATH, ""); // 路径 + if (stmp.length() > 0) { + path = stmp.split("/"); + } + + // 是否收集图片 + String needImg = DomSearch.getNodeAttr(node, FILED_ATTR_NEEDIMG, ""); + if ("true".equalsIgnoreCase(needImg) || ("contents".equalsIgnoreCase(name))) { + this.needImg = true; + } + // 是否是img + String isImg = DomSearch.getNodeAttr(node, FILED_ATTR_IMG, ""); + if ("true".equalsIgnoreCase(isImg) || "small_img".equalsIgnoreCase(name) || "large_img".equalsIgnoreCase(name) + || "imgurl".equalsIgnoreCase(name) || "img".equalsIgnoreCase(name)) { + this.isImg = true; + } + + // 指定解析图片的属性键 + this.attName = DomSearch.getNodeAttr(node, "attname", ""); + + // 是否是img + String html = DomSearch.getNodeAttr(node, FILED_ATTR_HTML, ""); + if ("true".equalsIgnoreCase(html)) { + this.html = true; + } + String createTaskStr = DomSearch.getNodeAttr(node, create_task, ""); // 是否生成任务 + this.createTask = createTaskStr; + LOG.info("createTask:"+this.createTask +" name :"+this.name); + String hrefTempName = DomSearch.getNodeAttr(node, FILED_ATTR_LINK, ""); // 链接 + if (hrefTempName.equalsIgnoreCase("true")) { + isLink = true; + this.linkAtt = DomSearch.getNodeAttr(node, "linkatt", "href"); + linkType = DomSearch.getNodeAttr(node, FILED_ATTR_LINKTYPE, null);// 链接的类型 + } else { + isLink = 
false; + } + + // if ("true".equalsIgnoreCase(DomSearch.getNodeAttr(node, + // FILED_ATTR_RMHEAD, null))){ + // rmHead = true; + // } + // if ("true".equalsIgnoreCase(DomSearch.getNodeAttr(node, + // FILED_ATTR_RMTAIL, null))){ + // rmTail = true; + // } + + segflag = DomSearch.getNodeAttr(node, FILED_ATTR_SEGM, null); + if (StringUtils.isNotEmpty(segflag)) { + needSegm = true; + } + } + + public String getName() { + return name; + } + + public boolean isMultiField() { + return isMulti; + } + + public boolean collectImg() { + return needImg; + } + + public boolean isImg() { + return this.isImg; + } + + public boolean isHtml() { + return html; + } + + public boolean useIndex() { + return isUseIndex; + } + + public int getMultiLevel() { + return multiLevel; + } + + public String[] getFieldPath() { + return path; + } + + public boolean isLink() { + return isLink; + } + + public String getLinkType() { + return linkType; + } + + public String getLinkAtt() { + return linkAtt; + } + + public String attName() { + return this.attName; + } + + public String getSegmflag() { + return segflag; + } + + public boolean needSegm() { + return needSegm; + } + + public String toString() { + String res = "name=" + name; + if (path == null) { + res += ", No path"; + } else { + for (int i = 0; i < path.length; i++) { + if (i > 0) + res += "/" + path[i]; + else + res += ", path=" + path[i]; + } + } + if (extFilter != null) { + res += ",ext=" + extFilter; + } + res += ",isMulti=" + isMulti; + res += ",isUseIndex=" + isUseIndex; + res += ",multiLevel=" + multiLevel; + res += ",isLink=" + isLink; + res += ",linkTpye=" + linkType; + + return res; + } + + public static void main(String[] args) { + String str = ""; + DocumentFragment doc = DomParser.parse2Xml(str, "utf8"); + System.out.println(doc.getNodeName()); + System.out.println(doc.getFirstChild().getNodeName()); + System.out.println(DomSearch.getNodeAttr(doc.getFirstChild(), "linkType", "")); + String rs = 
DomSearch.getNodeAttr(doc.getFirstChild(), "createtask", ""); + System.out.println(rs); + } +} diff --git a/src/com/bfd/parse/config/dom/DomCFGTree.java b/src/com/bfd/parse/config/dom/DomCFGTree.java new file mode 100644 index 0000000..ef7be94 --- /dev/null +++ b/src/com/bfd/parse/config/dom/DomCFGTree.java @@ -0,0 +1,245 @@ +package com.bfd.parse.config.dom; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.w3c.dom.Node; + +import com.bfd.parse.entity.ParsetemplateEntity; + +public class DomCFGTree implements Comparable, DomCFGElement { + + private static final Log log = LogFactory.getLog(DomCFGTree.class); + + private String[] required; // 必须字段 + private Map output; // 输出字段 + private int num = 0; // 模板优先级 + private int status = 0; // 是否可用;0:可用,1:禁用 + private int id; + private String type; + + private boolean indexMatch; + private String[] path; + + private ArrayList fields; + private ArrayList children; + private ArrayList blocks; + + public DomCFGTree() { + // TODO Auto-generated constructor stub + } + + public DomCFGTree(Node node) { + String sName = node.getNodeName(); + if (!NODE_NAME.equalsIgnoreCase(sName) && !ROOT_NAME.equalsIgnoreCase(sName) + && !BLOCK_NAME.equalsIgnoreCase(sName)) { + return; + } + type = DomSearch.getNodeAttr(node, NODE_ATTR_TYPE, ""); // 显示类型新添加 + + String stmp = DomSearch.getNodeAttr(node, NODE_ATTR_PATH, ""); // 路径 + if (stmp.length() > 0) + path = stmp.split("/"); + + stmp = DomSearch.getNodeAttr(node, "required", ""); + if (stmp.length() > 0) + required = stmp.split(","); + + output = new HashMap(); // 输出字段 + stmp = DomSearch.getNodeAttr(node, "output", ""); + if (stmp.length() > 0) { + String[] ss = stmp.split(","); + for (int i = 0; i < ss.length; i++) + output.put(ss[i], true); + } + + fields = new 
ArrayList(); + children = new ArrayList(); + blocks = new ArrayList(); + + stmp = DomSearch.getNodeAttr(node, "matchByIndex", ""); + indexMatch = false; + if (stmp.equalsIgnoreCase("true")) + indexMatch = true; + + if (sName.equalsIgnoreCase(BLOCK_NAME)) { + blocks.add(new DomCFGBlock(node)); + } else { + Node cld = node.getFirstChild(); + while (cld != null) { + String childName = cld.getNodeName(); + if (childName.equalsIgnoreCase("node")) { + children.add(new DomCFGTree(cld));// 子树 + } else if (childName.equalsIgnoreCase("fld")) { // 字段 + fields.add(new DomCFGField(cld)); + } else if (childName.equalsIgnoreCase("block")) { + blocks.add(new DomCFGBlock(cld)); + } + cld = cld.getNextSibling(); + } + } + } + + public DomCFGTree(Node node, ParsetemplateEntity template) { + String sName = node.getNodeName(); + if (StringUtils.isEmpty(sName) // + || (!NODE_NAME.equalsIgnoreCase(sName) && !ROOT_NAME.equalsIgnoreCase(sName) // + && !BLOCK_NAME.equalsIgnoreCase(sName))) { + return; + } + type = template.getPagetypeid()+""; + num = template.getNum(); // 解析优先级 + status = template.getStatus(); // 是否可用 + id = template.getTmplid(); + String stmp = template.getRequired(); + if (stmp.length() > 0) + required = stmp.split(","); + + stmp = DomSearch.getNodeAttr(node, NODE_ATTR_PATH, ""); // 路径 + if (stmp.length() > 0) + path = stmp.split("/"); + + output = new HashMap(); // 输出字段 + stmp = template.getOutput(); + if (stmp.length() > 0) { + String[] ss = stmp.split(","); + for (int i = 0; i < ss.length; i++) + output.put(ss[i], true); + } + + fields = new ArrayList(); + children = new ArrayList(); + blocks = new ArrayList(); + + stmp = DomSearch.getNodeAttr(node, "matchByIndex", ""); + indexMatch = false; + if (stmp.equalsIgnoreCase("true")) + indexMatch = true; + + if (sName.equalsIgnoreCase(BLOCK_NAME)) { + blocks.add(new DomCFGBlock(node)); + } else { + Node cld = node.getFirstChild(); + while (cld != null) { + String childName = cld.getNodeName(); + if 
(NODE_NAME.equalsIgnoreCase(childName)) { + children.add(new DomCFGTree(cld)); // 子树 + } else if (FIELD_NAME.equalsIgnoreCase(childName)) { // 字段 + fields.add(new DomCFGField(cld)); + } else if (BLOCK_NAME.equalsIgnoreCase(childName)) { + blocks.add(new DomCFGBlock(cld)); // block + } + cld = cld.getNextSibling(); + } + } + } + + public boolean isMatchByIndex() { + return indexMatch; + } + + public String[] getTreePath() { + return path; + } + + public List getFields() { + return fields; + } + + public List getChildren() { + return children; + } + + public List getBlocks() { + return blocks; + } + + public String getType() { + return type; + } + + public int getId() { + return id; + } + + public Integer getNum() { + return num; + } + + public Integer getStatus() { + return status; + } + + public String[] getRequiredField() { + return required; + } + + public Map getOutputField() { + return output; + } + + public String toString() { + String res = ""; + if (path == null) + res += "No path"; + else + for (int i = 0; i < path.length; i++) + if (i > 0) + res += "/" + path[i]; + else + res += "path=" + path[i]; + for (int i = 0; i < fields.size(); i++) + res += ",field-" + i + ": " + fields.get(i).toString(); + for (int i = 0; i < children.size(); i++) + res += ",child-" + i + ": " + children.get(i).toString(); + if (type == "") + res += ",No type"; + else + res += "," + "type=" + type; + res += "," + "num=" + num; + res += "," + "active=" + status; + for (int i = 0; i < blocks.size(); i++) + res += ",mokes-" + i + ": " + blocks.get(i).toString(); + if (required != null) { + res += ", required:"; + for (int i = 0; i < required.length; i++) + res += " " + required[i]; + } + res += ", output fields:"; + Object[] objs = output.keySet().toArray(); + for (int i = 0; i < objs.length; i++) + res += " " + (String) objs[i]; + return res; + } + + @Override + public int compareTo(DomCFGTree tree) { + if (tree.getNum() > this.getNum()) { + return 1; + } + if (tree.getNum() < 
this.getNum()) { + return -1; + } + return 0; + } + + public static void main(String[] args) { + List trees = new ArrayList(); + DomCFGTree e = new DomCFGTree(); + e.num = 10; + DomCFGTree e1 = new DomCFGTree(); + e1.num = 5; + DomCFGTree e2 = new DomCFGTree(); + e2.num = 8; + trees.add(e); + trees.add(e1); + trees.add(e2); + Collections.sort(trees); + } +} diff --git a/src/com/bfd/parse/config/dom/DomConfig.java b/src/com/bfd/parse/config/dom/DomConfig.java new file mode 100644 index 0000000..c3e90ec --- /dev/null +++ b/src/com/bfd/parse/config/dom/DomConfig.java @@ -0,0 +1,321 @@ +package com.bfd.parse.config.dom; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.log4j.PropertyConfigurator; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Node; + +import com.bfd.crawler.utils.JsonUtils; +import com.bfd.parse.DomParser; +import com.bfd.parse.client.ConfigClient; +import com.bfd.parse.config.Config; +import com.bfd.parse.entity.ParsetemplateEntity; +import com.bfd.parse.util.JsonUtil; +import com.bfd.parse.vo.TwoTuple; +//模板加载 +public class DomConfig implements Config { + + private static final Log log = LogFactory.getLog(DomConfig.class); + + private static final String CONFIG_NAME = "dom_config"; + + private static volatile DomConfig instance; + + private ConfigClient configService; + + private Map siteTmpls = new ConcurrentHashMap(); + + public static DomConfig getInstance() { + if (instance == null) { + synchronized (DomConfig.class) { + if (instance == null) { + instance = new DomConfig(); + } + } + } + return instance; + } + + protected DomConfig() { + configService = new ConfigClient(); + requestConfig(); + } + + public boolean hasTemplate(String sitename) { + return 
siteTmpls.containsKey(sitename); + } + //TODO:从parsetemplate来获取数据 + public boolean requestConfig(String siteId, String pageTypeId) { + HashMap reqMap = new HashMap(); + reqMap.put("type", "one"); + reqMap.put("fieldName", "siteID"); + reqMap.put("fieldValue", siteId); + reqMap.put("fieldType", "Integer"); + if (StringUtils.isNotEmpty(siteId) && !"all".equalsIgnoreCase(siteId)) { +// reqMap.put("type", "ALL"); // list,item,info, ALL + // ALL ---> list + item,list+info + String templates = configService.getConfig("parsePlugin", JsonUtils.toJSONString(reqMap)); + if (templates == null) { + return false; + } +// log.debug("templates:"+templates); + List tempList = JsonUtil.parseTemplates(templates); + DomTemplate tpl = readBuziConfig(tempList); + addDomTemplate(tpl); + log.info("reload tmpl config cid=" + siteId + ", size=" + tempList.size()); + return true; + } else { + return requestConfig(); + } + } + + public static void main3(String[] args) { + PropertyConfigurator.configure("/home/ian/dev/parser/log4j.properties"); + DomTemplate tpl = DomConfig.getInstance().getBySiteId("Cdida"); + log.info(tpl); + // if (!checkDomTemplate(tpl, "item")) { + // log.info("ss"); + // } + log.info("ok"); + } + + @Override + public boolean requestConfig() { + String templates = configService.getConfig("parseTemplate", JsonUtil.REQUEST_COMMON); + + log.info("will reload domconfig"); + log.trace("templates:"+templates); + if (templates == null) { + return false; + } + List list = JsonUtil.parseTemplates(templates); +// log.info("request template config success. templateList size is " + list.size()+".templates:"+templates); + if (list.size() == 0) { + log.error("template is error"); + return false; + } + + siteTmpls.clear(); + Map> tempMap = new HashMap>(); + try { + for (ParsetemplateEntity template : list) { + List tempList = null; + String nodeName = template.getSiteid()+""; + //改为通过站点id和页面类型来查询模板. 
+// String nodeName = template.getSiteid()+"|"+template.getPagetypeid(); +// if(nodeName.equalsIgnoreCase("Mjingdong")){ +// log.info("Mjingdong template :"+JsonUtil.toJSONString(template)); +// } + if (tempMap.containsKey(nodeName)) { + tempList = tempMap.get(nodeName); + } else { + tempList = new ArrayList(); + } + tempList.add(template); + tempMap.put(nodeName, tempList); + } + + if (tempMap.size() > 0) { + for (Entry> entry : tempMap.entrySet()) { + List temList = entry.getValue(); + DomTemplate tpl = readBuziConfig(temList); + addDomTemplate(tpl); + } + return true; + } + log.info("siteTmpls:"+JsonUtils.toJSONString(siteTmpls)); + log.warn("No template data"); + } catch (Exception e) { + log.warn("exception", e); + } + return false; + } + + private void addDomTemplate(DomTemplate tmpl) { + if (tmpl != null) { + String siteId = tmpl.getSiteId(); + + if (siteId != null && siteId.length() > 0) + siteTmpls.put(siteId, tmpl); + siteId = tmpl.getDNS(); + if (siteId != null && siteId.length() > 0) + siteTmpls.put(siteId, tmpl); + String[] alias = tmpl.getAlias(); + if (alias != null && alias.length > 0) { + for (int j = 0; j < alias.length; j++) + siteTmpls.put(alias[j], tmpl); + } + } + } + + public synchronized DomTemplate readBuziConfig(List list) { + if (list != null && list.size() > 0) { + List> listChilds = new ArrayList>(); + String XMLhead = ""; + for (ParsetemplateEntity template : list) { + if (StringUtils.isEmpty(template.getTmpl())) { + log.warn("error template, tmpl is empty, tmpl id=" + template.getTmplid()); + continue; + } + StringBuilder sbBuilder = new StringBuilder(); + sbBuilder.append(XMLhead); + sbBuilder.append(template.getTmpl()); + DocumentFragment doc = DomParser.parse2Xml(sbBuilder.toString(), "utf8"); + listChilds.add(new TwoTuple(doc.getFirstChild(), template)); + } + return new DomTemplate(list.get(0), listChilds); + } + return null; + } + + public static DomCFGTree readBuziConfig(ParsetemplateEntity template) { + String XMLhead = ""; + 
StringBuilder sbBuilder = new StringBuilder(); + String tmpl = template.getTmpl(); + sbBuilder.append(XMLhead); + sbBuilder.append(tmpl); + DocumentFragment doc = DomParser.parse2Xml(sbBuilder.toString(), "utf8"); + return new DomCFGTree(doc.getFirstChild(), template); + } + +// public synchronized DomTemplate readBuziConfig2(List