diff --git a/cl_query_data_job/pom.xml b/cl_query_data_job/pom.xml
index 4f9f20f..d66c17a 100644
--- a/cl_query_data_job/pom.xml
+++ b/cl_query_data_job/pom.xml
@@ -72,15 +72,10 @@
19.0
-
-
-
-
-
com.alibaba
fastjson
- 1.2.60
+ 1.2.68
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
index 6ce5df0..aeafb3a 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
@@ -3,7 +3,10 @@ package com.bfd.mf.job.service.es;
import com.bfd.mf.job.config.ESConstants;
import com.bfd.mf.job.domain.entity.Task;
import com.bfd.mf.job.util.EsUtils;
-import org.elasticsearch.index.query.*;
+import org.elasticsearch.index.query.BoolQueryBuilder;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.index.query.RangeQueryBuilder;
+import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.slf4j.Logger;
@@ -22,21 +25,21 @@ public class EsQueryMiniService {
/**
* 统计 每个专题下,每个渠道 的总量
*/
- public Map getSubjectChannelStatistics(String clusterName,String indexName) {
+ public Map getSubjectChannelStatistics(String clusterName, String indexName) {
- Map resultMap = new HashMap<>();
- try{
+ Map resultMap = new HashMap<>();
+ try {
boolean isExists = EsUtils.indexExists(clusterName, indexName);
if (isExists) {
BoolQueryBuilder qb = QueryBuilders.boolQuery();
AggregationBuilder ab = EsUtils.getSubjectChannelAB(ESConstants.DOC_TYPE);
- String indexNames [] = {indexName};
+ String indexNames[] = {indexName};
Terms result = EsUtils.queryTag(clusterName, indexNames, qb, ab, ESConstants.DOC_TYPE + "Tag");
resultMap = EsUtils.parseTerms(result);
}
- }catch (Exception e){
+ } catch (Exception e) {
e.printStackTrace();
}
return resultMap;
@@ -45,15 +48,15 @@ public class EsQueryMiniService {
/**
* 统计 每个专题下,每个渠道 当天的增量
*/
- public Map getSubjectChannelTodayStatistics(String clusterName,String indexName) {
- Map resultMap = new HashMap<>();
- try{
+ public Map getSubjectChannelTodayStatistics(String clusterName, String indexName) {
+ Map resultMap = new HashMap<>();
+ try {
boolean isExists = EsUtils.indexExists(clusterName, indexName);
if (isExists) {
BoolQueryBuilder qb = QueryBuilders.boolQuery();
- long current=System.currentTimeMillis();
- long zero=current/(1000*3600*24)*(1000*3600*24)-TimeZone.getDefault().getRawOffset();
+ long current = System.currentTimeMillis();
+ long zero = current / (1000 * 3600 * 24) * (1000 * 3600 * 24) - TimeZone.getDefault().getRawOffset();
Long startTime = new Timestamp(zero).getTime();
RangeQueryBuilder rangeQueryBuilder = QueryBuilders
.rangeQuery(ESConstants.CRAWLTIME)
@@ -61,12 +64,12 @@ public class EsQueryMiniService {
.lt(current);
qb.must(rangeQueryBuilder);
AggregationBuilder ab = EsUtils.getSubjectChannelAB(ESConstants.DOC_TYPE);
- String indexNames [] = {indexName};
+ String indexNames[] = {indexName};
Terms result = EsUtils.queryTag(clusterName, indexNames, qb, ab, ESConstants.DOC_TYPE + "Tag");
resultMap = EsUtils.parseTerms(result);
}
- }catch (Exception e){
+ } catch (Exception e) {
e.printStackTrace();
}
return resultMap;
@@ -75,19 +78,19 @@ public class EsQueryMiniService {
/**
* 统计 每个专题下,crawlDataFlag 三种类型当天的总量
*/
- public Map getSubjectCrawlDataFlagStatistics(String clusterName, String indexName) {
- Map resultMap = new HashMap<>();
- try{
+ public Map getSubjectCrawlDataFlagStatistics(String clusterName, String indexName) {
+ Map resultMap = new HashMap<>();
+ try {
boolean isExists = EsUtils.indexExists(clusterName, indexName);
if (isExists) {
BoolQueryBuilder qb = QueryBuilders.boolQuery();
AggregationBuilder ab = EsUtils.getSubjectChannelAB(ESConstants.CRAWLDATAFLAG);
- String indexNames [] = {indexName};
+ String indexNames[] = {indexName};
Terms result = EsUtils.queryTag(clusterName, indexNames, qb, ab, ESConstants.CRAWLDATAFLAG + "Tag");
- Map termsMap = EsUtils.parseTerms(result);
+ Map termsMap = EsUtils.parseTerms(result);
resultMap = EsUtils.getResultMap(termsMap);
}
- }catch (Exception e){
+ } catch (Exception e) {
e.printStackTrace();
}
return resultMap;
@@ -96,15 +99,15 @@ public class EsQueryMiniService {
/**
* 统计 每个专题下,crawlDataFlag 三种类型 的增量
*/
- public Map getSubjectCrawlDataFlagTodayStatistics(String clusterName, String indexName) {
- Map resultMap = new HashMap<>();
- try{
+ public Map getSubjectCrawlDataFlagTodayStatistics(String clusterName, String indexName) {
+ Map resultMap = new HashMap<>();
+ try {
boolean isExists = EsUtils.indexExists(clusterName, indexName);
if (isExists) {
BoolQueryBuilder qb = QueryBuilders.boolQuery();
- long current=System.currentTimeMillis();
- long zero=current/(1000*3600*24)*(1000*3600*24)-TimeZone.getDefault().getRawOffset();
+ long current = System.currentTimeMillis();
+ long zero = current / (1000 * 3600 * 24) * (1000 * 3600 * 24) - TimeZone.getDefault().getRawOffset();
Long startTime = new Timestamp(zero).getTime();
RangeQueryBuilder rangeQueryBuilder = QueryBuilders
.rangeQuery(ESConstants.CRAWLTIME)
@@ -112,13 +115,13 @@ public class EsQueryMiniService {
.lt(current);
qb.must(rangeQueryBuilder);
AggregationBuilder ab = EsUtils.getSubjectChannelAB(ESConstants.CRAWLDATAFLAG);
- String indexNames [] = {indexName};
+ String indexNames[] = {indexName};
Terms result = EsUtils.queryTag(clusterName, indexNames, qb, ab, ESConstants.CRAWLDATAFLAG + "Tag");
- Map termsMap = EsUtils.parseTerms(result);
+ Map termsMap = EsUtils.parseTerms(result);
resultMap = EsUtils.getResultMap(termsMap);
}
- }catch (Exception e){
+ } catch (Exception e) {
e.printStackTrace();
}
return resultMap;
@@ -126,12 +129,12 @@ public class EsQueryMiniService {
/**
- * 查询每个任务 的总量和当天的量 以及 包含图片的量、包含视频的量、包含附件的量
+ * 查询每个任务 的总量和当天的量 以及 包含图片的量、包含视频的量、包含附件的量
*/
- public Map getTaskCount(String clusterName,Long taskId, Task task,String crawlDataFlag,String indexNamePre) {
- Map countMap = new HashMap<>();
- String indexName = indexNamePre + task.getSubjectId();//subject_id
- if(null != task.getCid()) {
+ public Map getTaskCount(String clusterName, Long taskId, Task task, String crawlDataFlag, String indexNamePre) {
+ Map countMap = new HashMap<>();
+ String indexName = indexNamePre + task.getSubjectId();//subject_id
+ if (null != task.getCid()) {
String cid = task.getCid().toLowerCase();
Long crawlStartTime = task.getCrawlStartTime().longValue();
Long crawlEndTime = task.getCrawlEndTime().longValue();
@@ -139,8 +142,9 @@ public class EsQueryMiniService {
if (indexName.contains(indexNamePre)) {
boolean isExists = EsUtils.indexExists(clusterName, indexName);
if (isExists) {
- BoolQueryBuilder qb = getQueryBuilder(cid,crawlDataFlag,crawlStartTime,crawlEndTime);
- logger.info("QB1 查询总量: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
+ BoolQueryBuilder qb = getQueryBuilder(cid, crawlDataFlag, crawlStartTime, crawlEndTime);
+ logger.info("QB1 查询总量: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString()
+ .replace("\n", "").replace("\r", "").replace(" ", ""));
Long count = EsUtils.queryCount(clusterName, indexName, qb);
countMap.put("totalCount", count);
@@ -158,26 +162,26 @@ public class EsQueryMiniService {
// 查询包含图片的数据的量
//videoPath == egc filePath == ugc imagePath == pgc
- TermQueryBuilder pgcTermQueryBuilder = QueryBuilders.termQuery(ESConstants.PGC,1);
- TermQueryBuilder egcTermQueryBuilder = QueryBuilders.termQuery(ESConstants.EGC,1);
- TermQueryBuilder ugcTermQueryBuilder = QueryBuilders.termQuery(ESConstants.UGC,1);
- TermQueryBuilder textTermQueryBuilder = QueryBuilders.termQuery(ESConstants.ISDOWNLOAD,false);
- qb = getQueryBuilder(cid,crawlDataFlag,crawlStartTime,crawlEndTime);
+ TermQueryBuilder pgcTermQueryBuilder = QueryBuilders.termQuery(ESConstants.PGC, 1);
+ TermQueryBuilder egcTermQueryBuilder = QueryBuilders.termQuery(ESConstants.EGC, 1);
+ TermQueryBuilder ugcTermQueryBuilder = QueryBuilders.termQuery(ESConstants.UGC, 1);
+ TermQueryBuilder textTermQueryBuilder = QueryBuilders.termQuery(ESConstants.ISDOWNLOAD, false);
+ qb = getQueryBuilder(cid, crawlDataFlag, crawlStartTime, crawlEndTime);
qb.must(pgcTermQueryBuilder);
logger.info("QB3 查询有图片的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
Long imageCount = EsUtils.queryCount(clusterName, indexName, qb);
countMap.put(ESConstants.IMAGECOUNT, imageCount);
- qb = getQueryBuilder(cid,crawlDataFlag,crawlStartTime,crawlEndTime);
+ qb = getQueryBuilder(cid, crawlDataFlag, crawlStartTime, crawlEndTime);
qb.must(egcTermQueryBuilder);
logger.info("QB4 查询有视频的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
Long videoCount = EsUtils.queryCount(clusterName, indexName, qb);
countMap.put(ESConstants.VIDEOCOUNT, videoCount);
- qb = getQueryBuilder(cid,crawlDataFlag,crawlStartTime,crawlEndTime);
+ qb = getQueryBuilder(cid, crawlDataFlag, crawlStartTime, crawlEndTime);
qb.must(ugcTermQueryBuilder);
logger.info("QB5 查询有文件的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
Long fileCount = EsUtils.queryCount(clusterName, indexName, qb);
countMap.put(ESConstants.FILECOUNT, fileCount);
- qb = getQueryBuilder(cid,crawlDataFlag,crawlStartTime,crawlEndTime);
+ qb = getQueryBuilder(cid, crawlDataFlag, crawlStartTime, crawlEndTime);
qb.must(textTermQueryBuilder);
logger.info("QB6 查询纯文本的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
Long textCount = EsUtils.queryCount(clusterName, indexName, qb);
@@ -200,10 +204,93 @@ public class EsQueryMiniService {
BoolQueryBuilder shouldbq = QueryBuilders.boolQuery();
RangeQueryBuilder rangeQueryBuilder = QueryBuilders
.rangeQuery(ESConstants.PUBTIME).gte(crawlStartTime).lt(crawlEndTime);
- TermQueryBuilder primary2 = QueryBuilders.termQuery(ESConstants.PRIMARY,2);
+ TermQueryBuilder primary2 = QueryBuilders.termQuery(ESConstants.PRIMARY, 2);
shouldbq.must(rangeQueryBuilder).mustNot(primary2);
// 不用统计FB 的这种粉丝的量
- TermQueryBuilder pageTypeQueryBuilder = QueryBuilders.termQuery(ESConstants.PAGETYPR,"socialFans");
+ TermQueryBuilder pageTypeQueryBuilder = QueryBuilders.termQuery(ESConstants.PAGETYPR, "socialFans");
+ qb.mustNot(pageTypeQueryBuilder).should(shouldbq);
+ return qb;
+ }
+
+ /**
+ * Counts the documents of one task in the index {@code indexNamePre + task.getSubjectId()}.
+ * <p>
+ * The returned map is empty when the task has no cid or the index does not exist. Otherwise it
+ * holds {@code "totalCount"}, {@code "todayCount"} and the image / video / file / text-only
+ * counts stored under {@code ESConstants.IMAGECOUNT}, {@code VIDEOCOUNT}, {@code FILECOUNT}
+ * and {@code TEXTCOUNT}.
+ *
+ * @param clusterName name of the ES cluster handed to {@code EsUtils}
+ * @param taskId id of the task; documents are matched on it through {@code getQueryBuilderNew}
+ * @param task task supplying the subject id, the cid and the crawl time window
+ * @param indexNamePre index-name prefix placed in front of the subject id
+ * @return the counts described above; possibly empty, never {@code null}
+ */
+ public Map getTaskCountNew(String clusterName, Long taskId, Task task, String indexNamePre) {
+ Map countMap = new HashMap<>();
+ String indexName = indexNamePre + task.getSubjectId();//subject_id
+ String taskIdString = taskId.toString();
+ // Tasks without a cid are not counted.
+ if (null == task.getCid()) {
+ return countMap;
+ }
+ Long crawlStartTime = task.getCrawlStartTime().longValue();
+ Long crawlEndTime = task.getCrawlEndTime().longValue();
+ // The former indexName.contains(indexNamePre) guard was dropped: indexName is built from that
+ // very prefix, so the check could never be false.
+ if (!EsUtils.indexExists(clusterName, indexName)) {
+ return countMap;
+ }
+
+ // Total number of documents of the task.
+ BoolQueryBuilder qb = getQueryBuilderNew(taskIdString, crawlStartTime, crawlEndTime);
+ countMap.put("totalCount", logAndCountTaskDocs("QB1 查询总量: indexName: {}. taskId : {}.{\"query\": {}}.", clusterName, indexName, taskId, qb));
+
+ // Documents crawled today: crawl time in [local midnight, now). The zone offset must be added
+ // before flooring to whole days; flooring the raw UTC millis first selects the wrong day
+ // whenever the local date and the UTC date differ.
+ long current = System.currentTimeMillis();
+ long dayMillis = 24L * 3600 * 1000;
+ long zoneOffset = TimeZone.getDefault().getOffset(current);
+ long startTime = (current + zoneOffset) / dayMillis * dayMillis - zoneOffset;
+ qb.must(QueryBuilders.rangeQuery(ESConstants.CRAWLTIME).gte(startTime).lt(current));
+ countMap.put("todayCount", logAndCountTaskDocs("QB2 查询今日总量: indexName: {}. taskId : {}.{\"query\": {}}.", clusterName, indexName, taskId, qb));
+
+ // Per-media counts; each one starts again from a fresh base query.
+ // videoPath == egc, filePath == ugc, imagePath == pgc
+ qb = getQueryBuilderNew(taskIdString, crawlStartTime, crawlEndTime);
+ qb.must(QueryBuilders.termQuery(ESConstants.PGC, 1));
+ Long imageCount = logAndCountTaskDocs("QB3 查询有图片的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", clusterName, indexName, taskId, qb);
+ countMap.put(ESConstants.IMAGECOUNT, imageCount);
+
+ qb = getQueryBuilderNew(taskIdString, crawlStartTime, crawlEndTime);
+ qb.must(QueryBuilders.termQuery(ESConstants.EGC, 1));
+ Long videoCount = logAndCountTaskDocs("QB4 查询有视频的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", clusterName, indexName, taskId, qb);
+ countMap.put(ESConstants.VIDEOCOUNT, videoCount);
+
+ qb = getQueryBuilderNew(taskIdString, crawlStartTime, crawlEndTime);
+ qb.must(QueryBuilders.termQuery(ESConstants.UGC, 1));
+ Long fileCount = logAndCountTaskDocs("QB5 查询有文件的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", clusterName, indexName, taskId, qb);
+ countMap.put(ESConstants.FILECOUNT, fileCount);
+
+ qb = getQueryBuilderNew(taskIdString, crawlStartTime, crawlEndTime);
+ qb.must(QueryBuilders.termQuery(ESConstants.ISDOWNLOAD, false));
+ Long textCount = logAndCountTaskDocs("QB6 查询纯文本的任务数: indexName: {}. taskId : {}.{\"query\": {}}.", clusterName, indexName, taskId, qb);
+ countMap.put(ESConstants.TEXTCOUNT, textCount);
+
+ logger.info("含图片的数据量:{} ; 含视频的数据量:{} ; 含文件的数据量:{} ; 纯文本的数据量:{}", imageCount, videoCount, fileCount, textCount);
+ return countMap;
+ }
+
+ /**
+ * Logs {@code qb} on a single line using {@code logFormat} (placeholders, in order: index name,
+ * task id, query text with line breaks and spaces removed) and then returns the number of
+ * documents in {@code indexName} that match it.
+ */
+ private Long logAndCountTaskDocs(String logFormat, String clusterName, String indexName, Long taskId, BoolQueryBuilder qb) {
+ logger.info(logFormat, indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
+ return EsUtils.queryCount(clusterName, indexName, qb);
+ }
+
+
+ /**
+ * Builds the base bool query shared by the per-task count queries.
+ * <ul>
+ * <li>must: {@code ESConstants.TASKID} equals {@code taskId}</li>
+ * <li>must not: {@code ESConstants.PAGETYPR} equals {@code "socialFans"}</li>
+ * <li>should: {@code ESConstants.PUBTIME} in {@code [crawlStartTime, crawlEndTime)} and
+ * {@code ESConstants.PRIMARY} not equal to 2</li>
+ * </ul>
+ *
+ * @param taskId task id, as a string, that documents must carry
+ * @param crawlStartTime inclusive lower bound of the publish-time window
+ * @param crawlEndTime exclusive upper bound of the publish-time window
+ * @return a new query builder; callers may add further clauses to it
+ */
+ private BoolQueryBuilder getQueryBuilderNew(String taskId, Long crawlStartTime, Long crawlEndTime) {
+ // Goes through the class logger instead of stdout so it follows the configured log level.
+ logger.debug("要统计的任务ID: {}", taskId);
+ BoolQueryBuilder qb = QueryBuilders.boolQuery();
+ // Filter by task id.
+ TermQueryBuilder taskIdTermQueryBuilder = QueryBuilders.termQuery(ESConstants.TASKID, taskId);
+ qb.must(taskIdTermQueryBuilder);
+ // Time-range filter: only main posts and comments need the time check; user records need no time range.
+ BoolQueryBuilder shouldbq = QueryBuilders.boolQuery();
+ RangeQueryBuilder rangeQueryBuilder = QueryBuilders
+ .rangeQuery(ESConstants.PUBTIME).gte(crawlStartTime).lt(crawlEndTime);
+ TermQueryBuilder primary2 = QueryBuilders.termQuery(ESConstants.PRIMARY, 2);
+ shouldbq.must(rangeQueryBuilder).mustNot(primary2);
+ // Do not count FB-style fan ("socialFans") records.
+ TermQueryBuilder pageTypeQueryBuilder = QueryBuilders.termQuery(ESConstants.PAGETYPR, "socialFans");
+ // NOTE(review): next to a must clause a should clause is presumably optional (minimum_should_match
+ // defaults to 0), so the time window may not restrict the counts at all - confirm the intent.
qb.mustNot(pageTypeQueryBuilder).should(shouldbq);
return qb;
}
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java
index 25da273..814cb02 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java
@@ -251,6 +251,8 @@ public class StatisticsService {
if(null != task.getCid() && !task.getCid().equals("test")) {
// 获取任务数量
countMap = esQueryMiniService.getTaskCount(miniName, taskId, task, crawlDataFlag, indexNamePre);
+ countMap = esQueryMiniService.getTaskCountNew(miniName, taskId, task, indexNamePre);
+
// 直接更新 cl_task 表中的 data_total 和 today_data_total
long totalCount = 0L;
long todayCount = 0L;
@@ -267,6 +269,7 @@ public class StatisticsService {
fileCount = countMap.get(ESConstants.FILECOUNT);
textCount = countMap.get(ESConstants.TEXTCOUNT);
}
+
// taskRepository.updateTaskCount(taskId,totalCount,todayCount);
taskRepository.updateTaskCountAll(taskId,totalCount,todayCount,imageCount,videoCount,fileCount,textCount);
}
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/taskCount/TaskCountService.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/taskCount/TaskCountService.java
index a59a456..9636c36 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/taskCount/TaskCountService.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/taskCount/TaskCountService.java
@@ -12,7 +12,6 @@ import com.bfd.mf.job.service.es.EsQueryNormalService;
import com.bfd.mf.job.service.statistics.TotalCountService;
import com.bfd.mf.job.util.DateUtil;
import com.bfd.mf.job.util.EsUtils;
-import kafka.utils.Json;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
diff --git a/cl_query_data_job/src/main/resources/application.yml b/cl_query_data_job/src/main/resources/application.yml
index 8bf8063..d99d397 100644
--- a/cl_query_data_job/src/main/resources/application.yml
+++ b/cl_query_data_job/src/main/resources/application.yml
@@ -3,22 +3,14 @@ debug: false
logging:
level:
com.bfd.mf: debug
-#spring:
-# datasource:
-# driver-class-name: com.mysql.jdbc.Driver
-# username: root
-# password: bfd123
-# url: jdbc:mysql://172.26.11.113:3306/intelligent_crawl?useOldAliasMetadataBehavior=true&characterEncoding=UTF-8&zeroDateTimeBehavior=round
-# hikari:
-# maximum-pool-size: 10
-# minimum-idle: 1
+
spring:
datasource:
- driver-class-name: com.mysql.jdbc.Driver
- username: crawl
- password: D5HLOvk553DUNV62qJI=
- url: jdbc:mysql://172.18.1.134:3306/all_task?useOldAliasMetadataBehavior=true&characterEncoding=UTF-8&zeroDateTimeBehavior=round
- hikari:
+ driver-class-name: com.mysql.cj.jdbc.Driver
+ username: crawl666
+ password: lx2a4jN1xFT96kj20LU=
+ url: jdbc:mysql://172.18.1.134:3306/intelligent_crawl?useSSL=true&useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC
+ hikari:
maximum-pool-size: 10
minimum-idle: 1
@@ -29,17 +21,17 @@ worker:
test-task-id: 180
## 数据默认要写的 kafka
broker-list: 172.18.1.113:9092
- send-topic : databasestokafka
+ send-topic: databasestokafka
analysis-topic:
- - sq_topic_cl_query_analysis_1
+ - sq_topic_cl_query_analysis_1
analysis-group: sq_group_cl_analysis_1
## 服务的状态,true 为启动
enable-analysis-producer: false # 查ES写kafka
enable-analysis-consumer: false # 读kafka写ES
- enable-statistics-producer: false # 统计 taskCount 和 subjectCount (采集平台)
+ enable-statistics-producer: true # 统计 taskCount 和 subjectCount (采集平台)
enable-query-producer: false # 离线拉数(采集平台)
- enable-high-frequency-producer: true # 高频离线拉数(采集平台)
+ enable-high-frequency-producer: false # 高频离线拉数(采集平台)
enable-backtrace-producer: false # 欧莱雅查数(采集平台,欧莱雅项目独用)
enable-rw-oly-producer: false # 欧莱雅数据导出,暂时不用
enable-up-load-producer: false # 上传(采集平台)
@@ -63,16 +55,16 @@ worker:
query-data-year-starttime: 1546272000000
rule-rest: http://rule.sq.baifendian.com/data_match/content/
- comment-rest: http://rule.sq.baifendian.com/reputation/addReputationTask
+ comment-rest: http://rule.sq.baifendian.com/reputation/addReputationTask
rule-rest-concurrency: 500
content-limit: 2000
failure-upper: 2000
- goFastPostUrl : http://172.18.1.113:8080/upload
- goFastDomain : http://172.18.1.113:8080
- uploadOLYExcelPath : /opt/nfsdata/excelTask/
- uploadZipPath : /opt/nfsdata/uploadFiles/
- indexNamePre : cl_major_
+ goFastPostUrl: http://172.18.1.113:8080/upload
+ goFastDomain: http://172.18.1.113:8080
+ uploadOLYExcelPath: /opt/nfsdata/excelTask/
+ uploadZipPath: /opt/nfsdata/uploadFiles/
+ indexNamePre: cl_major_
es-normal:
name: SQ_Normal_new
diff --git a/cl_search_api/pom.xml b/cl_search_api/pom.xml
index 5b0e959..8d8893a 100644
--- a/cl_search_api/pom.xml
+++ b/cl_search_api/pom.xml
@@ -5,15 +5,15 @@
4.0.0
- cl_stream_3.2
+ cl_stream_3.3
com.bfd.mf
- 3.2-SNAPSHOT
+ 3.3-SNAPSHOT
cl_search_api
- Search V3.2 API
+ Search V3.3 API
cl_search_api
- 3.2.7-SNAPSHOT
+ 3.3.0-SNAPSHOT
com.bfd.mf.SearchApplication
@@ -260,6 +260,13 @@
2.6
compile
+
+
+
+ org.jsoup
+ jsoup
+ 1.10.2
+
diff --git a/cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java b/cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java
index 13546b5..1836d5a 100644
--- a/cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java
+++ b/cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java
@@ -4,16 +4,15 @@ package com.bfd.mf.common.service.cache;
import com.bfd.mf.common.service.es.EsCommonService;
import com.bfd.mf.common.service.es.ParseSearchScopeService;
import com.bfd.mf.common.util.constants.ESConstant;
-import com.bfd.mf.common.web.entity.mysql.topic.Task;
import com.bfd.mf.common.web.repository.mysql.base.SiteRepository;
import com.bfd.mf.common.web.repository.mysql.topic.TaskRepository;
import com.bfd.mf.common.web.vo.params.QueryRequest;
import com.bfd.nlp.common.util.object.TObjectUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
@@ -25,7 +24,7 @@ import java.util.stream.Collectors;
@Service
public class TopicQueryService {
- private static Logger logger = LoggerFactory.getLogger(TopicQueryService.class);
+ private static Logger logger = LoggerFactory.getLogger(TopicQueryService.class);
@Autowired
private EsCommonService esCommonService;
@Autowired
@@ -70,7 +69,9 @@ public class TopicQueryService {
} else {
List areaList = siteRepository.findCidsByArea(queryRequest.getSearchArea());
List lowCaseAreaList = areaList.stream().map(String::toLowerCase).collect(Collectors.toList());
- // boolQuery.must(QueryBuilders.termsQuery(ESConstant.EN_SOURCE, lowCaseAreaList));
+ if (lowCaseAreaList.size() > 0) {
+ boolQuery.must(QueryBuilders.termsQuery(ESConstant.EN_SOURCE, lowCaseAreaList));
+ }
// String searchArea = getSearchArea(queryRequest.getSearchArea());
// boolQuery.must(QueryBuilders.termQuery(ESConstant.AREA, searchArea));
}
@@ -101,16 +102,16 @@ public class TopicQueryService {
* 2023-04-24
* 采集平台2.0 版本,可以选中多个任务进行查询
*/
- if(null == queryRequest.getTaskIds()){
+ if (null == queryRequest.getTaskIds()) {
logger.info("[TopicQueryService] queryByConditions_v1 没有任务ID,查询专题下全部任务");
- }else {
+ } else {
List taskIds = queryRequest.getTaskIds();
- boolQuery.must(QueryBuilders.termsQuery(ESConstant.TASK_ID, taskIds));
+ if (taskIds.size() > 0) {
+ boolQuery.must(QueryBuilders.termsQuery(ESConstant.TASK_ID, taskIds));
+ }
}
-
-
if (null == cid || ("").equals(cid) || ("test").equals(cid)) {
logger.info("[TopicQueryService] queryByConditions_v1 查询全部站点");
} else {
diff --git a/cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryAuthorService.java b/cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryAuthorService.java
index 1ea0e7a..10130c0 100644
--- a/cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryAuthorService.java
+++ b/cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryAuthorService.java
@@ -8,9 +8,7 @@ import com.bfd.mf.common.web.repository.mysql.base.SiteRepository;
import com.bfd.mf.common.web.vo.params.QueryRequest;
import com.bfd.mf.config.BFDApiConfig;
import com.bfd.mf.service.SearchAuthorService;
-import com.bfd.nlp.common.util.string.TStringUtils;
import org.elasticsearch.index.query.BoolQueryBuilder;
-import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.slf4j.Logger;
@@ -35,58 +33,59 @@ public class EsQueryAuthorService {
@Autowired
private SiteRepository siteRepository;
- private String clusterName ="";
+ private String clusterName = "";
+
@PostConstruct
public void init() {
// 注册数据查询来源
clusterName = bfdApiConfig.esMiniName();
- String sourceAddress [] = bfdApiConfig.esMiniAddress();
+ String sourceAddress[] = bfdApiConfig.esMiniAddress();
EsUtils.registerCluster(clusterName, sourceAddress);// 配置文件中的 es-source
}
public List queryAuthorListByKeyword(String[] indexName, QueryRequest queryRequest) {
- try{
+ try {
BoolQueryBuilder boolQueryBuilder = null;
logger.debug("[EsQueryAuthorService] queryAuthorListByKeyword ...");
Integer limit = queryRequest.getLimit(); //每页的数量
Integer start = (queryRequest.getPage() - 1) * limit; //起始页(0,20,40....)
String orderFlag = "desc";
- if(!queryRequest.getOrder().equals("")) {
+ if (!queryRequest.getOrder().equals("")) {
queryRequest.getOrder(); // 排序方式 asc/desc
}
String sortFlag = "pubTime";
- if(!queryRequest.getSidx().equals("")) {
+ if (!queryRequest.getSidx().equals("")) {
queryRequest.getSidx(); // 排序字段
}
- boolQueryBuilder = getQueryBuilder(queryRequest);
+ boolQueryBuilder = getQueryBuilderNew(queryRequest);
Integer searchType = queryRequest.getSearchType();
logger.info("[EsQueryAuthorService] queryAuthorListByKeyword indexName = " + indexName[0] + "; qb: \n {}.", boolQueryBuilder.toString());
- List result = EsUtils.query(clusterName, indexName, boolQueryBuilder, sortFlag, orderFlag, limit, start,searchType);
- List