Browse Source

2023-09-11

采集平台2.0版本
导出的 location 字段添加了一下
release-1.0
jing.du 2 years ago
parent
commit
8fd98c05d7
  1. 7
      cl_query_data_job/pom.xml
  2. 91
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
  3. 3
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java
  4. 1
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/taskCount/TaskCountService.java
  5. 22
      cl_query_data_job/src/main/resources/application.yml
  6. 15
      cl_search_api/pom.xml
  7. 13
      cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java
  8. 159
      cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryAuthorService.java
  9. 55
      cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryServiceForSQMini.java
  10. 134
      cl_search_api/src/main/java/com/bfd/mf/common/service/es/GetQueryBuilder.java
  11. 5
      cl_search_api/src/main/java/com/bfd/mf/common/util/constants/ESConstant.java
  12. 12
      cl_search_api/src/main/java/com/bfd/mf/common/util/enums/BaseFieldEnum.java
  13. 2
      cl_search_api/src/main/java/com/bfd/mf/common/util/enums/SearchScopeEnum.java
  14. 637
      cl_search_api/src/main/java/com/bfd/mf/common/util/es/EsUtils.java
  15. 33
      cl_search_api/src/main/java/com/bfd/mf/common/web/vo/params/QueryRequest.java
  16. 16
      cl_search_api/src/main/java/com/bfd/mf/common/web/vo/view/monitor/ESMonitorBaseEntity.java
  17. 17
      cl_search_api/src/main/java/com/bfd/mf/controller/SearchDataController.java
  18. 155
      cl_search_api/src/main/java/com/bfd/mf/service/SearchDataService.java
  19. 4
      pom.xml

7
cl_query_data_job/pom.xml

@ -72,15 +72,10 @@
<version>19.0</version>
</dependency>
<!--<dependency>-->
<!--<groupId>com.alibaba</groupId>-->
<!--<artifactId>fastjson</artifactId>-->
<!--<version>1.2.6</version>-->
<!--</dependency>-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.60</version>
<version>1.2.68</version>
</dependency>

91
cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java

@ -3,7 +3,10 @@ package com.bfd.mf.job.service.es;
import com.bfd.mf.job.config.ESConstants;
import com.bfd.mf.job.domain.entity.Task;
import com.bfd.mf.job.util.EsUtils;
import org.elasticsearch.index.query.*;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.slf4j.Logger;
@ -140,7 +143,8 @@ public class EsQueryMiniService {
boolean isExists = EsUtils.indexExists(clusterName, indexName);
if (isExists) {
BoolQueryBuilder qb = getQueryBuilder(cid, crawlDataFlag, crawlStartTime, crawlEndTime);
logger.info("QB1 查询总量: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
logger.info("QB1 查询总量: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString()
.replace("\n", "").replace("\r", "").replace(" ", ""));
Long count = EsUtils.queryCount(clusterName, indexName, qb);
countMap.put("totalCount", count);
@ -207,4 +211,87 @@ public class EsQueryMiniService {
qb.mustNot(pageTypeQueryBuilder).should(shouldbq);
return qb;
}
/**
 * Collects the document counts of one task from the subject index
 * {@code indexNamePre + task.getSubjectId()}.
 *
 * <p>The returned map is empty when the task has no cid or the index does not exist.
 * Otherwise it contains {@code "totalCount"}, {@code "todayCount"} and the entries keyed by
 * {@code ESConstants.IMAGECOUNT}, {@code VIDEOCOUNT}, {@code FILECOUNT} and {@code TEXTCOUNT}.
 *
 * @param clusterName  name of the ES cluster handed to {@code EsUtils}
 * @param taskId       id of the task; used as the filter value and in log output
 * @param task         task entity supplying subject id, cid and the crawl time window
 * @param indexNamePre index name prefix that is concatenated with the subject id
 * @return map of counter name to count, possibly empty
 */
public Map<String, Long> getTaskCountNew(String clusterName, Long taskId, Task task, String indexNamePre) {
    Map<String, Long> countMap = new HashMap<>();
    String indexName = indexNamePre + task.getSubjectId();//subject_id
    String taskIdString = taskId.toString();
    if (null == task.getCid()) {
        return countMap;
    }
    Long crawlStartTime = task.getCrawlStartTime().longValue();
    Long crawlEndTime = task.getCrawlEndTime().longValue();
    // The contains() check is always true for a non-null prefix; it is kept so that the
    // behavior for unexpected input stays exactly as before.
    if (!indexName.contains(indexNamePre) || !EsUtils.indexExists(clusterName, indexName)) {
        return countMap;
    }

    // Total number of documents of this task.
    BoolQueryBuilder qb = getQueryBuilderNew(taskIdString, crawlStartTime, crawlEndTime);
    countMap.put("totalCount", logAndCount("QB1 查询总量", clusterName, indexName, taskId, qb));

    // Number of documents crawled today: [local midnight, now).
    // The offset has to be applied BEFORE truncating to whole days; truncating the UTC
    // timestamp first yields the wrong day whenever the local date differs from the UTC date.
    long current = System.currentTimeMillis();
    long dayMillis = 24L * 3600 * 1000;
    long zoneOffset = TimeZone.getDefault().getOffset(current);
    long startTime = (current + zoneOffset) / dayMillis * dayMillis - zoneOffset;
    RangeQueryBuilder todayRange = QueryBuilders
            .rangeQuery(ESConstants.CRAWLTIME)
            .gte(startTime).lt(current);
    qb.must(todayRange);
    countMap.put("todayCount", logAndCount("QB2 查询今日总量", clusterName, indexName, taskId, qb));

    // Counts per attachment kind.
    // videoPath == egc, filePath == ugc, imagePath == pgc
    Long imageCount = countWithClause("QB3 查询有图片的任务数", clusterName, indexName, taskId,
            taskIdString, crawlStartTime, crawlEndTime, QueryBuilders.termQuery(ESConstants.PGC, 1));
    countMap.put(ESConstants.IMAGECOUNT, imageCount);

    Long videoCount = countWithClause("QB4 查询有视频的任务数", clusterName, indexName, taskId,
            taskIdString, crawlStartTime, crawlEndTime, QueryBuilders.termQuery(ESConstants.EGC, 1));
    countMap.put(ESConstants.VIDEOCOUNT, videoCount);

    Long fileCount = countWithClause("QB5 查询有文件的任务数", clusterName, indexName, taskId,
            taskIdString, crawlStartTime, crawlEndTime, QueryBuilders.termQuery(ESConstants.UGC, 1));
    countMap.put(ESConstants.FILECOUNT, fileCount);

    // Plain-text documents are selected by ISDOWNLOAD == false.
    Long textCount = countWithClause("QB6 查询纯文本的任务数", clusterName, indexName, taskId,
            taskIdString, crawlStartTime, crawlEndTime, QueryBuilders.termQuery(ESConstants.ISDOWNLOAD, false));
    countMap.put(ESConstants.TEXTCOUNT, textCount);

    logger.info("含图片的数据量:" + imageCount + " ; 含视频的数据量:" + videoCount + " ; 含文件的数据量:" + fileCount + " ; 纯文本的数据量:" + textCount);
    return countMap;
}

/** Logs the whitespace-stripped query under the given label and returns its hit count. */
private Long logAndCount(String label, String clusterName, String indexName, Long taskId, BoolQueryBuilder qb) {
    logger.info(label + ": indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString()
            .replace("\n", "").replace("\r", "").replace(" ", ""));
    return EsUtils.queryCount(clusterName, indexName, qb);
}

/** Builds a fresh base query for the task, adds one mandatory term clause and counts the hits. */
private Long countWithClause(String label, String clusterName, String indexName, Long taskId,
                             String taskIdString, Long crawlStartTime, Long crawlEndTime,
                             TermQueryBuilder clause) {
    BoolQueryBuilder qb = getQueryBuilderNew(taskIdString, crawlStartTime, crawlEndTime);
    qb.must(clause);
    return logAndCount(label, clusterName, indexName, taskId, qb);
}
/**
 * Builds the base bool query used for the per-task counters.
 *
 * <p>The query requires {@code ESConstants.TASKID == taskId}, excludes documents whose
 * {@code ESConstants.PAGETYPR} is {@code "socialFans"}, and adds a {@code should} clause for
 * documents whose {@code ESConstants.PUBTIME} lies in {@code [crawlStartTime, crawlEndTime)}
 * and whose {@code ESConstants.PRIMARY} is not 2.
 *
 * @param taskId         task id to filter on (replaces the earlier cid / enSource filter)
 * @param crawlStartTime inclusive lower bound of the publish-time range
 * @param crawlEndTime   exclusive upper bound of the publish-time range
 * @return a new, mutable query builder; callers add further clauses to it
 */
private BoolQueryBuilder getQueryBuilderNew(String taskId, Long crawlStartTime, Long crawlEndTime) {
    // Use the class logger instead of stdout so the message ends up in the service log.
    logger.info("要统计的任务ID: {}", taskId);
    BoolQueryBuilder qb = QueryBuilders.boolQuery();

    // Filter by task id.
    TermQueryBuilder taskIdTermQueryBuilder = QueryBuilders.termQuery(ESConstants.TASKID, taskId);
    qb.must(taskIdTermQueryBuilder);

    // Time-range clause: only main posts and comments are meant to be restricted by time,
    // user documents are not.
    BoolQueryBuilder shouldbq = QueryBuilders.boolQuery();
    RangeQueryBuilder rangeQueryBuilder = QueryBuilders
            .rangeQuery(ESConstants.PUBTIME).gte(crawlStartTime).lt(crawlEndTime);
    TermQueryBuilder primary2 = QueryBuilders.termQuery(ESConstants.PRIMARY, 2);
    shouldbq.must(rangeQueryBuilder).mustNot(primary2);

    // Facebook-style fan records are not counted.
    TermQueryBuilder pageTypeQueryBuilder = QueryBuilders.termQuery(ESConstants.PAGETYPR, "socialFans");
    // NOTE(review): this bool query also has a must clause; with Elasticsearch's default
    // minimum_should_match the should clause is then presumably optional and would not
    // restrict the counted documents — confirm whether the time range is meant to filter.
    qb.mustNot(pageTypeQueryBuilder).should(shouldbq);
    return qb;
}
}

3
cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java

@ -251,6 +251,8 @@ public class StatisticsService {
if(null != task.getCid() && !task.getCid().equals("test")) {
// 获取任务数量
countMap = esQueryMiniService.getTaskCount(miniName, taskId, task, crawlDataFlag, indexNamePre);
countMap = esQueryMiniService.getTaskCountNew(miniName, taskId, task, indexNamePre);
// 直接更新 cl_task 表中的 data_total today_data_total
long totalCount = 0L;
long todayCount = 0L;
@ -267,6 +269,7 @@ public class StatisticsService {
fileCount = countMap.get(ESConstants.FILECOUNT);
textCount = countMap.get(ESConstants.TEXTCOUNT);
}
// taskRepository.updateTaskCount(taskId,totalCount,todayCount);
taskRepository.updateTaskCountAll(taskId,totalCount,todayCount,imageCount,videoCount,fileCount,textCount);
}

1
cl_query_data_job/src/main/java/com/bfd/mf/job/service/taskCount/TaskCountService.java

@ -12,7 +12,6 @@ import com.bfd.mf.job.service.es.EsQueryNormalService;
import com.bfd.mf.job.service.statistics.TotalCountService;
import com.bfd.mf.job.util.DateUtil;
import com.bfd.mf.job.util.EsUtils;
import kafka.utils.Json;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

22
cl_query_data_job/src/main/resources/application.yml

@ -3,21 +3,13 @@ debug: false
logging:
level:
com.bfd.mf: debug
#spring:
# datasource:
# driver-class-name: com.mysql.jdbc.Driver
# username: root
# password: bfd123
# url: jdbc:mysql://172.26.11.113:3306/intelligent_crawl?useOldAliasMetadataBehavior=true&characterEncoding=UTF-8&zeroDateTimeBehavior=round
# hikari:
# maximum-pool-size: 10
# minimum-idle: 1
spring:
datasource:
driver-class-name: com.mysql.jdbc.Driver
username: crawl
password: D5HLOvk553DUNV62qJI=
url: jdbc:mysql://172.18.1.134:3306/all_task?useOldAliasMetadataBehavior=true&characterEncoding=UTF-8&zeroDateTimeBehavior=round
driver-class-name: com.mysql.cj.jdbc.Driver
username: crawl666
password: lx2a4jN1xFT96kj20LU=
url: jdbc:mysql://172.18.1.134:3306/intelligent_crawl?useSSL=true&useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC
hikari:
maximum-pool-size: 10
minimum-idle: 1
@ -37,9 +29,9 @@ worker:
## 服务的状态,true 为启动
enable-analysis-producer: false # 查ES写kafka
enable-analysis-consumer: false # 读kafka写ES
enable-statistics-producer: false # 统计 taskCount 和 subjectCount (采集平台)
enable-statistics-producer: true # 统计 taskCount 和 subjectCount (采集平台)
enable-query-producer: false # 离线拉数(采集平台)
enable-high-frequency-producer: true # 高频离线拉数(采集平台)
enable-high-frequency-producer: false # 高频离线拉数(采集平台)
enable-backtrace-producer: false # 欧莱雅查数(采集平台,欧莱雅项目独用)
enable-rw-oly-producer: false # 欧莱雅数据导出,暂时不用
enable-up-load-producer: false # 上传(采集平台)

15
cl_search_api/pom.xml

@ -5,15 +5,15 @@
<modelVersion>4.0.0</modelVersion>
<parent>
<artifactId>cl_stream_3.2</artifactId>
<artifactId>cl_stream_3.3</artifactId>
<groupId>com.bfd.mf</groupId>
<version>3.2-SNAPSHOT</version>
<version>3.3-SNAPSHOT</version>
</parent>
<name>cl_search_api</name>
<description>Search V3.2 API</description>
<description>Search V3.3 API</description>
<artifactId>cl_search_api</artifactId>
<version>3.2.7-SNAPSHOT</version>
<version>3.3.0-SNAPSHOT</version>
<properties>
<start-class>com.bfd.mf.SearchApplication</start-class>
@ -260,6 +260,13 @@
<version>2.6</version>
<scope>compile</scope>
</dependency>
<!-- jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/it.sauronsoftware/jave -->
<!--<dependency>-->
<!--<groupId>it.sauronsoftware</groupId>-->

13
cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java

@ -4,16 +4,15 @@ package com.bfd.mf.common.service.cache;
import com.bfd.mf.common.service.es.EsCommonService;
import com.bfd.mf.common.service.es.ParseSearchScopeService;
import com.bfd.mf.common.util.constants.ESConstant;
import com.bfd.mf.common.web.entity.mysql.topic.Task;
import com.bfd.mf.common.web.repository.mysql.base.SiteRepository;
import com.bfd.mf.common.web.repository.mysql.topic.TaskRepository;
import com.bfd.mf.common.web.vo.params.QueryRequest;
import com.bfd.nlp.common.util.object.TObjectUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
@ -70,7 +69,9 @@ public class TopicQueryService {
} else {
List<String> areaList = siteRepository.findCidsByArea(queryRequest.getSearchArea());
List lowCaseAreaList = areaList.stream().map(String::toLowerCase).collect(Collectors.toList());
// boolQuery.must(QueryBuilders.termsQuery(ESConstant.EN_SOURCE, lowCaseAreaList));
if (lowCaseAreaList.size() > 0) {
boolQuery.must(QueryBuilders.termsQuery(ESConstant.EN_SOURCE, lowCaseAreaList));
}
// String searchArea = getSearchArea(queryRequest.getSearchArea());
// boolQuery.must(QueryBuilders.termQuery(ESConstant.AREA, searchArea));
}
@ -105,10 +106,10 @@ public class TopicQueryService {
logger.info("[TopicQueryService] queryByConditions_v1 没有任务ID,查询专题下全部任务");
} else {
List<Long> taskIds = queryRequest.getTaskIds();
if (taskIds.size() > 0) {
boolQuery.must(QueryBuilders.termsQuery(ESConstant.TASK_ID, taskIds));
}
}
if (null == cid || ("").equals(cid) || ("test").equals(cid)) {

159
cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryAuthorService.java

@ -8,9 +8,7 @@ import com.bfd.mf.common.web.repository.mysql.base.SiteRepository;
import com.bfd.mf.common.web.vo.params.QueryRequest;
import com.bfd.mf.config.BFDApiConfig;
import com.bfd.mf.service.SearchAuthorService;
import com.bfd.nlp.common.util.string.TStringUtils;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.slf4j.Logger;
@ -36,6 +34,7 @@ public class EsQueryAuthorService {
private SiteRepository siteRepository;
private String clusterName = "";
@PostConstruct
public void init() {
// 注册数据查询来源
@ -59,7 +58,7 @@ public class EsQueryAuthorService {
if (!queryRequest.getSidx().equals("")) {
queryRequest.getSidx(); // 排序字段
}
boolQueryBuilder = getQueryBuilder(queryRequest);
boolQueryBuilder = getQueryBuilderNew(queryRequest);
Integer searchType = queryRequest.getSearchType();
logger.info("[EsQueryAuthorService] queryAuthorListByKeyword indexName = " + indexName[0] + "; qb: \n {}.", boolQueryBuilder.toString());
List<JSONObject> result = EsUtils.query(clusterName, indexName, boolQueryBuilder, sortFlag, orderFlag, limit, start, searchType);
@ -96,41 +95,123 @@ public class EsQueryAuthorService {
/**
* 查询语句组装
*/
private BoolQueryBuilder getQueryBuilder(QueryRequest queryRequest) {
logger.info("[EsQueryAuthorService] getQueryBuilder start ..." );
BoolQueryBuilder bqb = QueryBuilders.boolQuery();
// private BoolQueryBuilder getQueryBuilder(QueryRequest queryRequest) {
// logger.info("[EsQueryAuthorService] getQueryBuilder start ..." );
// BoolQueryBuilder bqb = QueryBuilders.boolQuery();
// // 基础查询根据查询条件组装查询语句
// BoolQueryBuilder boolQueryBuilder = null;
// boolQueryBuilder = topicQueryService.queryByConditions_v1(queryRequest);
// // 二次查询 关键词不为空就添加关键词查询语句 = 0 content 1 title 2 author 3 con+tit 4con+aut 5con+com
// // 单选 0:主贴;1:评论;2:用户
// Integer searchType = queryRequest.getSearchType();
// // String searchScope = queryRequest.getSearchScope(); //复选 0:标题;1:正文;2:作者 多个用,分割 0,1
// String keyword = queryRequest.getKeyword();
//
// BoolQueryBuilder searchTextBuilder = topicQueryService.buildSearchTextBuilder(searchType);
// boolQueryBuilder.filter(searchTextBuilder);
// // Map<String ,Float> fields = new HashedMap();
// if (TStringUtils.isNotEmpty(keyword)) {
// // 主贴的话 标题和内容
// if(searchType == 0){
// MatchPhraseQueryBuilder titleQuery = QueryBuilders.matchPhraseQuery(ESConstant.TITLE, keyword).slop(0);
// MatchPhraseQueryBuilder contentQuery = QueryBuilders.matchPhraseQuery(ESConstant.CONTENT, keyword).slop(0);
// QueryBuilder queryBuilder = QueryBuilders.boolQuery().should(titleQuery).should(contentQuery);
// bqb.must(queryBuilder);
// // 评论的话 评论内容
// }else if (searchType == 1){
//// MatchPhraseQueryBuilder contentQuery = QueryBuilders.matchPhraseQuery(ESConstant.CONTENT, keyword).slop(0);
//// QueryBuilder queryBuilder = QueryBuilders.boolQuery().must(contentQuery);
//// qb.must(queryBuilder);
// boolQueryBuilder.must(QueryBuilders.matchPhraseQuery(ESConstant.CONTENT, keyword).slop(0));
// // 用户 就只查 用户名
// }else if (searchType == 2){
// boolQueryBuilder.must(QueryBuilders.queryStringQuery("*"+keyword+"*").field(ESConstant.AUTHOR));
// //boolQueryBuilder.must(QueryBuilders.queryStringQuery("*"+keyword+"*").field(ESConstant.AUTHOR));
// }
// }
// bqb.must(boolQueryBuilder);
// return bqb;
// }
/**
* 2023-05-24 漏了用户的高级搜索
* @param queryRequest
* @return
*/
private BoolQueryBuilder getQueryBuilderNew(QueryRequest queryRequest) {
logger.info("[EsQueryAuthorService] getQueryBuilderNew start ...");
BoolQueryBuilder qb = QueryBuilders.boolQuery();
// 基础查询根据查询条件组装查询语句
BoolQueryBuilder boolQueryBuilder = null;
boolQueryBuilder = topicQueryService.queryByConditions_v1(queryRequest);
// 二次查询 关键词不为空就添加关键词查询语句 = 0 content 1 title 2 author 3 con+tit 4con+aut 5con+com
// 单选 0:主贴;1:评论;2:用户
Integer searchType = queryRequest.getSearchType();
// String searchScope = queryRequest.getSearchScope(); //复选 0:标题;1:正文;2:作者 多个用,分割 0,1
String keyword = queryRequest.getKeyword();
BoolQueryBuilder boolQueryBuilder = topicQueryService.queryByConditions_v1(queryRequest);
// 如果要根据ID 查询数据 如果查ID 后面的条件就不用查了
if (null != queryRequest.getDataIds() && !("").equals(queryRequest.getDataIds())) {
String dataIds = queryRequest.getDataIds();
List<String> dataIdList = getDataIdList(dataIds);
QueryBuilder queryBuilder = QueryBuilders.termsQuery(ESConstant.DATA_ID, dataIdList);
boolQueryBuilder = QueryBuilders.boolQuery().filter(queryBuilder);
// 如果有 任务ID就有没有就没有啊
if (null != queryRequest.getTaskIds()) {
List<Long> taskIds = queryRequest.getTaskIds();
if (taskIds.size() > 0) {
boolQueryBuilder = boolQueryBuilder.must(QueryBuilders.termsQuery("taskId", taskIds));
}
}
qb.must(boolQueryBuilder);
return qb;
}
Integer searchType = queryRequest.getSearchType(); // 单选 0:主贴;1:评论;2:用户
BoolQueryBuilder searchTextBuilder = topicQueryService.buildSearchTextBuilder(searchType);
boolQueryBuilder.filter(searchTextBuilder);
// Map<String ,Float> fields = new HashedMap();
if (TStringUtils.isNotEmpty(keyword)) {
// 主贴的话 标题和内容
if(searchType == 0){
MatchPhraseQueryBuilder titleQuery = QueryBuilders.matchPhraseQuery(ESConstant.TITLE, keyword).slop(0);
MatchPhraseQueryBuilder contentQuery = QueryBuilders.matchPhraseQuery(ESConstant.CONTENT, keyword).slop(0);
QueryBuilder queryBuilder = QueryBuilders.boolQuery().should(titleQuery).should(contentQuery);
bqb.must(queryBuilder);
// 评论的话 评论内容
}else if (searchType == 1){
// MatchPhraseQueryBuilder contentQuery = QueryBuilders.matchPhraseQuery(ESConstant.CONTENT, keyword).slop(0);
// QueryBuilder queryBuilder = QueryBuilders.boolQuery().must(contentQuery);
// qb.must(queryBuilder);
boolQueryBuilder.must(QueryBuilders.matchPhraseQuery(ESConstant.CONTENT, keyword).slop(0));
// 用户 就只查 用户名
}else if (searchType == 2){
boolQueryBuilder.must(QueryBuilders.queryStringQuery("*"+keyword+"*").field(ESConstant.AUTHOR));
// Advanced search for users: every advanced condition must match the author field.
if (null != queryRequest.getHighLevelQueries()) {
    List<HighLevelQuery> highLevelQueries = queryRequest.getHighLevelQueries();
    // Iterate the conditions of the request itself. The previous code looped over a newly
    // created, empty list, so no advanced condition was ever added to the query.
    for (HighLevelQuery high : highLevelQueries) {
        // A condition without text would otherwise become the literal wildcard "*null*".
        if (null == high || null == high.getText()) {
            continue;
        }
        boolQueryBuilder.must(QueryBuilders.queryStringQuery("*" + high.getText() + "*").field(ESConstant.AUTHOR));
    }
}
bqb.must(boolQueryBuilder);
return bqb;
// if (null != queryRequest.getHighLevelQueries()) {
// List<HighLevelQuery> highLevelQueries = queryRequest.getHighLevelQueries();
//
// // 1找到所有的not进行非处理
// highLevelQueries.stream().filter(e -> SearchExpressionEnum.NOT.is(e.getExpression())).forEach(e -> {
// qb.mustNot(this.getHighLevelQueryBuilder(e, true));
// });
//
// // 2循环处理剩下的不含not的处理逻辑为如果当前是and则将tempHighLevel进行must处理tempHighLevel中如果有多个则内部should处理
// List<HighLevelQuery> tempHighLevel = new ArrayList<>(highLevelQueries.size());
// highLevelQueries.stream().filter(e -> !SearchExpressionEnum.NOT.is(e.getExpression())).forEach(e -> {
// // 如果是and tempHighLevel不为空则处理tempHighLevel(>1个做内部或操作)并清空
// if (SearchExpressionEnum.AND.is(e.getExpression()) && !tempHighLevel.isEmpty()) {
// // 拼接条件
// BoolQueryBuilder tempQueryBuilder = QueryBuilders.boolQuery();
// tempHighLevel.forEach(temp -> tempQueryBuilder.should(this.getHighLevelQueryBuilder(temp, false)));
// qb.must(tempQueryBuilder);
// tempHighLevel.clear();
// }
// // 将当前项加入临时队列
// tempHighLevel.add(e);
// });
//
// // 此处拼接tempHighLevel未处理的内容
// if (!tempHighLevel.isEmpty()) {
// BoolQueryBuilder tempQueryBuilder = QueryBuilders.boolQuery();
// tempHighLevel.forEach(temp -> tempQueryBuilder.should(this.getHighLevelQueryBuilder(temp, false)));
// qb.must(tempQueryBuilder);
// }
//
// }
qb.must(boolQueryBuilder);
return qb;
}
@ -199,4 +280,18 @@ public class EsQueryAuthorService {
return qb;
}
/**
 * Splits a comma-separated id string into a list of ids.
 *
 * <p>An input without a comma yields a one-element list holding the input itself. Trailing
 * empty segments are dropped, following {@link String#split(String)}.
 *
 * @param dataIds comma-separated ids; must not be {@code null}
 * @return a new mutable list of the individual ids, in input order
 */
public List<String> getDataIdList(String dataIds) {
    // split() returns the whole input as its only element when no comma is present,
    // so a single loop covers both the single-id and the multi-id case.
    String[] parts = dataIds.split(",");
    List<String> result = new ArrayList<>();
    for (int i = 0; i < parts.length; i++) {
        result.add(parts[i]);
    }
    return result;
}
}

55
cl_search_api/src/main/java/com/bfd/mf/common/service/es/EsQueryServiceForSQMini.java

@ -6,10 +6,7 @@ import com.bfd.mf.common.util.es.EsUtils;
import com.bfd.mf.common.web.repository.mysql.topic.TaskRepository;
import com.bfd.mf.common.web.vo.params.QueryRequest;
import com.bfd.mf.config.BFDApiConfig;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermsQueryBuilder;
import org.elasticsearch.index.query.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
@ -18,6 +15,7 @@ import org.springframework.stereotype.Service;
import javax.annotation.PostConstruct;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@Service
public class EsQueryServiceForSQMini {
@ -42,6 +40,7 @@ public class EsQueryServiceForSQMini {
/**
* 2023-04-25 查询调用的方法
* 查询
*
* @param indexName
* @param queryRequest
* @return
@ -81,7 +80,8 @@ public class EsQueryServiceForSQMini {
}
/**
* 查询数据量
* 查询数据量 Count
*
* @param indexName
* @param queryRequest
* @return
@ -89,7 +89,8 @@ public class EsQueryServiceForSQMini {
public Long queryDataCountFromFolder(String[] indexName, QueryRequest queryRequest) {
try {
logger.debug("[EsQueryServiceForSQMini - 专题] queryDataCountFromOneSubject ...");
BoolQueryBuilder boolQueryBuilder = getQueryBuilder.getQueryBuilder(queryRequest);
//BoolQueryBuilder boolQueryBuilder = getQueryBuilder.getQueryBuilder(queryRequest);
BoolQueryBuilder boolQueryBuilder = getQueryBuilder.getQueryBuilderNew(queryRequest);
Integer searchType = queryRequest.getSearchType();
Integer size = queryRequest.getLimit();
Long totalCount = EsUtils.queryTotalCountNew(clusterName, indexName, boolQueryBuilder, searchType);
@ -157,6 +158,48 @@ public class EsQueryServiceForSQMini {
/**
 * Re-indexes {@code indexList} into {@code newIndex} through {@code EsUtils.reIndex}.
 *
 * @param indexList source index expression passed on to {@code EsUtils.reIndex}
 * @param newIndex  destination index name
 * @return the value reported by {@code EsUtils.reIndex}, or 0 when the call throws
 */
public long reIndexData(String indexList, String newIndex) {
    try {
        long created = EsUtils.reIndex(clusterName, indexList, newIndex);
        logger.info("[EsQueryServiceForSQMini] reIndexData {} -> {} : created = {}", indexList, newIndex, created);
        // Open question from the author: should we wait a few minutes and then check
        // whether the data has actually arrived in ES?
        return created;
    } catch (Exception e) {
        // Best effort: callers receive 0 on failure; log the cause instead of printing it to stderr.
        logger.error("[EsQueryServiceForSQMini] reIndexData {} -> {} failed", indexList, newIndex, e);
        return 0;
    }
}
/**
* 2023-05-30 新的示例文件夹拉取
* @return
*/
public long reIndexDataNew(QueryRequest queryRequest) {
// 这个是之前准备好的示例文件夹
String originalIndex = "cl_major_9999";
String currentIndex = "cl_special_1.0_" + queryRequest.getSubjectId();
System.out.println(originalIndex + " to " + currentIndex);
List<Map<String, Object>> tasks = queryRequest.getTasks();
try {
long created = 0L;
for (Map<String, Object> task : tasks) {
/**
* "cid":"facebook",
* "crawlKeyword":"account:https://www.facebook.com/joebiden",
* "id":1000882,
* "siteId":182
*/
Long taskId = Long.valueOf(task.get("id").toString());
String crawlDataFlag = task.get("crawlDataFlag").toString();
String enSource = task.get("cid").toString().toLowerCase();
TermQueryBuilder termQueryBuilder1 = QueryBuilders.termQuery(ESConstant.CRAWLDATAFLAG, crawlDataFlag);
TermQueryBuilder termQueryBuilder2 = QueryBuilders.termQuery(ESConstant.EN_SOURCE, enSource);
QueryBuilder queryBuilder = QueryBuilders.boolQuery().must(termQueryBuilder1).must(termQueryBuilder2);
System.out.println(queryBuilder);
created = EsUtils.reIndexByTask(clusterName, originalIndex, currentIndex, queryBuilder);
System.out.println("条数: "+created);
//有条数之后是得执行个update操作吧
EsUtils.updateByQuery(clusterName,currentIndex,queryBuilder,taskId);
}
// 这块是不是得等3分钟后查一下ES中到底有木有数据哇
return created;
} catch (Exception e) {
e.printStackTrace();

134
cl_search_api/src/main/java/com/bfd/mf/common/service/es/GetQueryBuilder.java

@ -21,6 +21,7 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.stream.Stream;
@Service
@ -46,7 +47,7 @@ public class GetQueryBuilder {
* @param queryRequest
* @return
*/
public BoolQueryBuilder getQueryBuilderNew(QueryRequest queryRequest) {
public BoolQueryBuilder getQueryBuilderNew0530(QueryRequest queryRequest) {
logger.info("[GetQueryBuilder] getQueryBuilder...");
BoolQueryBuilder qb = QueryBuilders.boolQuery();
@ -65,7 +66,7 @@ public class GetQueryBuilder {
if (null != queryRequest.getTaskIds()) {
List<Long> taskIds = queryRequest.getTaskIds();
if (taskIds.size() > 0) {
boolQueryBuilder = boolQueryBuilder.must(QueryBuilders.termQuery("taskId", taskIds.get(0)));
boolQueryBuilder = boolQueryBuilder.must(QueryBuilders.termsQuery("taskId", taskIds));
}
}
@ -497,6 +498,8 @@ public class GetQueryBuilder {
*/
protected QueryBuilder getHighLevelQueryBuilder(HighLevelQuery highLevelQuery, boolean isNot) {
BoolQueryBuilder result = QueryBuilders.boolQuery();
//
highLevelQuery.setWordStrategy("2");
// 获取高级查询的字段
//Map<String, Float> fieldMap = SearchScopeEnum.getFieldsByKey(highLevelQuery.getScope());
// 默认就是查全文标题 + 正文
@ -523,14 +526,17 @@ public class GetQueryBuilder {
}
// 原文普通搜索
QueryBuilder rawQueryBuilder = this.getMatchQueryBuilder(path, fieldMap, highLevelQuery.getText(), isNot, SearchWordStrategyEnum.getByKey(highLevelQuery.getWordStrategy()));
QueryBuilder rawQueryBuilder = this.getMatchQueryBuilder(path, fieldMap, highLevelQuery.getText(), isNot,
SearchWordStrategyEnum.getByKey(highLevelQuery.getWordStrategy()));
if (rawQueryBuilder != null) {
result.should(rawQueryBuilder);
}
// 如果译文不为空则进行译文普通搜索
if (highLevelQuery.getTranslateText() != null) {
QueryBuilder transQueryBuilder = this.getMatchQueryBuilder(path, fieldMap, highLevelQuery.getTranslateText(), isNot, SearchWordStrategyEnum.getByKey(highLevelQuery.getWordStrategy()));
QueryBuilder transQueryBuilder = this.getMatchQueryBuilder(path, fieldMap,
highLevelQuery.getTranslateText(), isNot,
SearchWordStrategyEnum.getByKey(highLevelQuery.getWordStrategy()));
if (transQueryBuilder != null) {
result.should(transQueryBuilder);
}
@ -607,6 +613,10 @@ public class GetQueryBuilder {
return null;
}
System.out.println(" 0***** 要检索的词:" + splitText[0] + " --- " + strategyEnum);
// System.out.println(" 1***** 要检索的词:"+splitText[1]);
BoolQueryBuilder result = QueryBuilders.boolQuery();
// 如果是非 指定完整匹配则用短语否则用最佳字段
MultiMatchQueryBuilder.Type multiMatchType = isNot
@ -662,4 +672,120 @@ public class GetQueryBuilder {
public QueryBuilder nestedQuery(String nested, QueryBuilder queryBuilder) {
return QueryBuilders.nestedQuery(nested, queryBuilder, ScoreMode.None);
}
/**
 * Assembles the ES bool query for a search request.
 *
 * <p>Steps, as visible in this method:
 * <ol>
 *   <li>Build the base query with {@code topicQueryService.queryByConditions_v1}.</li>
 *   <li>If the request carries data ids, replace the base query with a terms filter on
 *       {@code ESConstant.DATA_ID} (plus a terms clause on {@code taskId} when task ids are
 *       present) and return immediately.</li>
 *   <li>Otherwise add the filter from {@code topicQueryService.buildSearchTextBuilder} and
 *       process the request's high-level (advanced) queries.</li>
 * </ol>
 *
 * @param queryRequest the search request
 * @return the assembled query
 */
public BoolQueryBuilder getQueryBuilderNew(QueryRequest queryRequest) {
logger.info("[GetQueryBuilder] getQueryBuilder...");
BoolQueryBuilder qb = QueryBuilders.boolQuery();
// Base query: assemble the query from the request's basic conditions
BoolQueryBuilder boolQueryBuilder = topicQueryService.queryByConditions_v1(queryRequest);
// Lookup by data id: when ids are given, none of the later conditions are applied
if (null != queryRequest.getDataIds() && !("").equals(queryRequest.getDataIds())) {
String dataIds = queryRequest.getDataIds();
List<String> dataIdList = getDataIdList(dataIds);
QueryBuilder queryBuilder = QueryBuilders.termsQuery(ESConstant.DATA_ID, dataIdList);
boolQueryBuilder = QueryBuilders.boolQuery().filter(queryBuilder);
// Restrict by task ids only when the request provides some
if (null != queryRequest.getTaskIds()) {
List<Long> taskIds = queryRequest.getTaskIds();
if (taskIds.size() > 0) {
boolQueryBuilder = boolQueryBuilder.must(QueryBuilders.termsQuery("taskId", taskIds));
}
}
qb.must(boolQueryBuilder);
return qb;
}
Integer searchType = queryRequest.getSearchType(); // single choice 0: main post; 1: comment; 2: user
BoolQueryBuilder searchTextBuilder = topicQueryService.buildSearchTextBuilder(searchType);
boolQueryBuilder.filter(searchTextBuilder);
if (null != queryRequest.getHighLevelQueries()) {
//List<HighLevelQuery> highLevelQueries = queryRequest.getHighLevelQueries();
List<HighLevelQuery> highLevelQueries = queryRequest.getHighLevelQueries();
// 1. (disabled) find all NOT conditions and add them as must_not
// highLevelQueries.stream().filter(e -> SearchExpressionEnum.NOT.is(e.getExpression())).forEach(e -> {
// qb.mustNot(this.getHighLevelQueryBuilder(e, true));
// });
// 2. Original plan for the remaining (non-NOT) conditions: when the current one is AND,
// flush tempHighLevel as a must clause; several buffered entries are OR-ed (should) internally
CopyOnWriteArrayList<HighLevelQuery> tempHighLevel = new CopyOnWriteArrayList<>();
//CopyOnWriteArrayList<HighLevelQuery> tempHighLevel = queryRequest.getHighLevelQueries();
// First pass: every condition whose expression is AND becomes its own must clause on qb
for (int i = 0; i < highLevelQueries.size(); i++) {
HighLevelQuery e = highLevelQueries.get(i);
System.out.println("??? 没有这一步? : "+e.getExpression());
BoolQueryBuilder tempQueryBuilder = QueryBuilders.boolQuery();
if(SearchExpressionEnum.AND.is(e.getExpression())){
tempQueryBuilder.must(this.getHighLevelQueryBuilder(e, false));
qb.must(tempQueryBuilder);
}
}
//System.out.println( highLevelQueries.stream());
// Second pass over the same conditions.
// NOTE(review): the tempQueryBuilder built inside this lambda is never added to qb (all
// qb.must/should/mustNot calls below are commented out), so this branch appears to affect
// the result only through whatever getHighLevelQueryBuilder does internally — confirm intent.
highLevelQueries.stream().forEach(e -> {
if( !tempHighLevel.isEmpty()){
// Assemble the condition
BoolQueryBuilder tempQueryBuilder = QueryBuilders.boolQuery();
// tempHighLevel.forEach(temp -> tempQueryBuilder.should(this.getHighLevelQueryBuilder(temp, false)));
System.out.println(tempHighLevel.size());
System.out.println("??? : "+tempHighLevel.get(0).getText());
tempHighLevel.forEach(temp -> {
System.out.println(temp);
// The buffered entry is combined according to the expression of the CURRENT element e
if( SearchExpressionEnum.AND.is(e.getExpression())){
tempQueryBuilder.must(this.getHighLevelQueryBuilder(temp, false));
}else if (SearchExpressionEnum.OR.is(e.getExpression())){
tempQueryBuilder.should(this.getHighLevelQueryBuilder(temp, false));
// qb.should(tempQueryBuilder);
}else{
tempQueryBuilder.mustNot(this.getHighLevelQueryBuilder(temp, false));
// qb.mustNot(tempQueryBuilder);
}
// qb.must(tempQueryBuilder);
// The buffer is emptied while it is being iterated
tempHighLevel.clear();
});
// BoolQueryBuilder tempQueryBuilder = QueryBuilders.boolQuery();
// tempHighLevel.forEach(temp -> tempQueryBuilder.should(this.getHighLevelQueryBuilder(temp, false)));
// qb.must(tempQueryBuilder);
}
// Put the current element into the temporary buffer
tempHighLevel.add(e);
// // (disabled) if the current one is AND and tempHighLevel is not empty, flush tempHighLevel (more than one entry is OR-ed internally) and clear it
// if (SearchExpressionEnum.AND.is(e.getExpression()) && !tempHighLevel.isEmpty()) {
// // Assemble the condition
// BoolQueryBuilder tempQueryBuilder = QueryBuilders.boolQuery();
// tempHighLevel.forEach(temp -> tempQueryBuilder.should(this.getHighLevelQueryBuilder(temp, false)));
// qb.must(tempQueryBuilder);
// tempHighLevel.clear();
// }
// // Put the current element into the temporary buffer
// tempHighLevel.add(e);
});
// Flush whatever is still buffered in tempHighLevel.
// NOTE(review): because every iteration above clears the buffer before adding, this
// appears to hold only the last element of highLevelQueries — confirm this is intended.
if (!tempHighLevel.isEmpty()) {
BoolQueryBuilder tempQueryBuilder = QueryBuilders.boolQuery();
tempHighLevel.forEach(temp -> tempQueryBuilder.should(this.getHighLevelQueryBuilder(temp, false)));
qb.must(tempQueryBuilder);
}
}
qb.must(boolQueryBuilder);
return qb;
}
}

5
cl_search_api/src/main/java/com/bfd/mf/common/util/constants/ESConstant.java

@ -283,6 +283,8 @@ public class ESConstant {
public static final String HAS_IMAGE = "hasImage";
public static final String HAS_VIDEO = "hasVideo";
public static final String HAS_FILE = "hasFile";
public static final String HAS_TRANS = "hasTrans";
/**
* 关键词
*/
@ -773,7 +775,8 @@ public class ESConstant {
ESConstant.VALUELABEL,
ESConstant.CATEGORYLABEL,
ESConstant.TAG
ESConstant.TAG,
ESConstant.HAS_TRANS
);

12
cl_search_api/src/main/java/com/bfd/mf/common/util/enums/BaseFieldEnum.java

@ -170,7 +170,15 @@ public enum BaseFieldEnum {
* 区县
*/
county_code,
/**
* OCR 结果
*/
ocrText,
/**
* ASR 结果
*/
asrText
;
/**
@ -178,9 +186,11 @@ public enum BaseFieldEnum {
* @return
*/
public static Map<String, Float> getMatchFields(){
Map<String, Float> matchMap = new HashMap<>(2);
Map<String, Float> matchMap = new HashMap<>(4);
matchMap.put(BaseFieldEnum.title.name(), 2.0F);
matchMap.put(BaseFieldEnum.content.name(), 1.0F);
matchMap.put(BaseFieldEnum.ocrText.name(), 1.0F);
matchMap.put(BaseFieldEnum.asrText.name(), 1.0F);
return matchMap;
}

2
cl_search_api/src/main/java/com/bfd/mf/common/util/enums/SearchScopeEnum.java

@ -22,6 +22,8 @@ public enum SearchScopeEnum {
return new HashMap() {{
put(ESConstant.TITLE, 1.0F);
put(ESConstant.CONTENT, 1.0F);
put(ESConstant.OCRTEXT, 1.0F);
put(ESConstant.ASRTEXT, 1.0F);
}};
}
},

637
cl_search_api/src/main/java/com/bfd/mf/common/util/es/EsUtils.java

@ -26,9 +26,13 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.*;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.reindex.*;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptType;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.aggregations.AggregationBuilder;
@ -47,8 +51,6 @@ import org.springframework.util.Assert;
import java.net.InetAddress;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public abstract class EsUtils {
@ -84,7 +86,7 @@ public abstract class EsUtils {
return CLIENT_MAP.get(clusterName);
}
public static List<JSONObject> query(String clusterName, String[] index,
public static List<JSONObject> query0530(String clusterName, String[] index,
final QueryBuilder queryBuilder,
String sortFlag, String orderFlag,
Integer size, Integer from,
@ -94,10 +96,13 @@ public abstract class EsUtils {
boolean options = true;
boolean optionsf = false;
// 现在不同任务的同一条数据不做消重因此同一个DOCID 的数据会有多条因此只有查主贴的时候需要用DOCID 消重
CollapseBuilder collapseBuilder = new CollapseBuilder(ESConstant.DATA_ID);
if (searchType == 0) {
collapseBuilder = new CollapseBuilder(ESConstant.DOC_ID);
}
// CollapseBuilder collapseBuilder = new CollapseBuilder(ESConstant.DATA_ID);
// CollapseBuilder collapseBuilder = null;
// if (searchType == 0) {
// collapseBuilder = new CollapseBuilder(ESConstant.DOC_ID);
// }
// Object[] objects= new Object[]{"9999"};
// 查询
// from + size 分页 查询方式
@ -105,9 +110,10 @@ public abstract class EsUtils {
.setIndicesOptions(IndicesOptions.fromOptions(options, options, options, optionsf))
.addSort(sortFlag, orderFlag.equals(ESConstant.ASC) ? SortOrder.ASC : SortOrder.DESC)
.setQuery(queryBuilder)
.setCollapse(collapseBuilder)
// .searchAfter(objects)
//.setCollapse(collapseBuilder)
.setSize(size)
.setFrom(from);
.setFrom(from); // 用search_after 的话这个 from == 0
System.out.println(requestBuilder);
@ -130,7 +136,7 @@ public abstract class EsUtils {
}
public static List<JSONObject> queryWithHighlight(String clusterName, String[] index,
public static List<JSONObject> queryWithHighlight0530(String clusterName, String[] index,
final QueryBuilder queryBuilder,
String sortFlag, String orderFlag,
Integer size, Integer from,
@ -235,22 +241,53 @@ public abstract class EsUtils {
List<JSONObject> dataList = new ArrayList<>();
if (searchResponse.getHits().totalHits > 0) {
for (SearchHit hit : searchResponse.getHits().getHits()) {
SearchHit[] hits = searchResponse.getHits().getHits();
for (int i = 0; i < hits.length; i++) {
JSONObject data = new JSONObject();
data.putAll(hit.getSourceAsMap());
String fieldName[] = {ESConstant.CONTENT, ESConstant.TITLE};
for (int i = 0; i < fieldName.length; i++) {
getHighlightResult(fieldName[i], hit, data);
data.putAll(hits[i].getSourceAsMap());
String fieldName[] = {ESConstant.CONTENT, ESConstant.TITLE, ESConstant.OCRTEXT, ESConstant.ASRTEXT};
for (int j = 0; j < fieldName.length; j++) {
getHighlightResult(fieldName[j], hits[i], data);
}
data.put("subjectId", hits[i].getIndex()
.replace("cl_major_", "")
.replace("cl_subject_", "")
.replace("cl_special_1.0_", ""));
dataList.add(data);
}
}
// if (searchResponse.getHits().totalHits > 0) {
// for (SearchHit hit : searchResponse.getHits().getHits()) {
// JSONObject data = new JSONObject();
// data.putAll(hits[i].getSourceAsMap());
// data.put("subjectId", hits[i].getIndex()
// .replace("cl_major_", "")
// .replace("cl_subject_", "")
// .replace("cl_special_1.0_", ""));
// dataList.add(data);
//
// JSONObject data = new JSONObject();
// data.putAll(hit.getSourceAsMap());
// String fieldName[] = {ESConstant.CONTENT, ESConstant.TITLE};
// for (int i = 0; i < fieldName.length; i++) {
// getHighlightResult(fieldName[i], hit, data);
// }
//
// data.put("subjectId", hit.getSourceAsMap().get()
// .replace("cl_major_", "")
// .replace("cl_subject_", "")
// .replace("cl_special_1.0_", ""));
// dataList.add(data);
// }
// }
return dataList;
}
private static void getHighlightResult(String fieldName, SearchHit hit, JSONObject data) {
if (hit.getHighlightFields().containsKey(fieldName)) {
HighlightField highlightField = hit.getHighlightFields().get(fieldName);
System.out.println("getHighlightResult highlightField : "+highlightField);
Text[] fragments = highlightField.fragments();
String fragmentString = "";
for (Text fragment : fragments) {
@ -385,7 +422,8 @@ public abstract class EsUtils {
return 0L;
}
public static Long queryTotalCountNew(String clusterName, String[] index,
public static Long queryTotalCountNew_0530(String clusterName, String[] index,
QueryBuilder queryBuilder,
Integer searchType) {
@ -395,27 +433,62 @@ public abstract class EsUtils {
// 现在不同任务的同一条数据不做消重因此同一个DOCID 的数据会有多条因此只有查主贴的时候需要用DOCID 消重
String count = "count";
AggregationBuilder aggregation;
// searchType = 0 主贴
if (searchType == 0) {
aggregation = AggregationBuilders.cardinality(count).field(ESConstant.DOC_ID);
} else {
aggregation = AggregationBuilders.cardinality(count).field(ESConstant.DATA_ID);
}
// CollapseBuilder collapseBuilder = new CollapseBuilder(ESConstant.DATA_ID);
CollapseBuilder collapseBuilder = null;
if (searchType == 0) {
collapseBuilder = new CollapseBuilder(ESConstant.DOC_ID);
}
//searchSourceBuilder.aggregation(aggregation);
// from + size 分页 查询方式
SearchRequestBuilder requestBuilder = client.prepareSearch().setIndices(index)
.setIndicesOptions(IndicesOptions.fromOptions(options, options, options, optionsf))
.setQuery(queryBuilder)
//.setCollapse(collapseBuilder);
.addAggregation(aggregation);
// System.out.println(requestBuilder);
/**
* 2023-05-30 先注释掉看看情况
*/
// System.out.println("3333 : " + requestBuilder.get().getHits().totalHits);
Aggregations aggregations = requestBuilder.get().getAggregations();
Cardinality cardinality = aggregations.get(count);
// System.out.println("1111 : " + cardinality.getValue());
// System.out.println("2222 : " + requestBuilder.get().getHits().totalHits);
long resultCount = cardinality.getValue();
if (searchType == 2) {
resultCount = requestBuilder.get().getHits().totalHits;
}
System.out.println("cardinality : " + cardinality.getValue());
System.out.println("totalHits : " + requestBuilder.get().getHits().totalHits);
// long resultCount = cardinality.getValue();
// if (searchType == 2) {
// resultCount = requestBuilder.get().getHits().totalHits;
// }
/**
* 折叠查询的参考代码
*/
// CollapseBuilder collapseBuilder = new CollapseBuilder("duplicate_id");
// InnerHitBuilder innerHitBuilder = new InnerHitBuilder();
// innerHitBuilder.setName("test");
// innerHitBuilder.setSize(0);
// innerHitBuilder.setTrackScores(true);
// innerHitBuilder.setIgnoreUnmapped(true);
// innerHitBuilder.addSort(SortBuilders.fieldSort("level").order(SortOrder.DESC));
// collapseBuilder.setInnerHits(innerHitBuilder);
//
// ......
//
// srb = client.prepareSearch(indexName)
// .setTypes(typeName)
// .setQuery(bqb)
// .setFrom(params.getFrom())
// .setSize(params.getSize())
// .setCollapse(collapseBuilder)
// .setPreference("_primary_first");
long resultCount = requestBuilder.get().getHits().totalHits;
return resultCount;
}
@ -426,6 +499,7 @@ public abstract class EsUtils {
Integer limit,
String scrollId,
Integer searchType) {
Map<String, Object> result = new HashMap<>();
TransportClient client = getClient(clusterName);
SearchResponse searchResponse = null;
@ -590,6 +664,14 @@ public abstract class EsUtils {
}
}
/**
* 复制索引数据
*
* @param clusterName
* @param originalIndex
* @param currentIndex
* @return
*/
public static long reIndex(String clusterName, String originalIndex, String currentIndex) {
// String clusterName, String originalIndex, String currentIndex,
try {
@ -599,6 +681,7 @@ public abstract class EsUtils {
.newRequestBuilder(client)
.source(originalIndex)
.destination(currentIndex);
// 新建别名查询需要用别名查不加别名查不到哦
String newAliex = currentIndex.replace("cl_special_1.0_", "cl_major_");
BulkByScrollResponse response = builder.get();
@ -690,6 +773,46 @@ public abstract class EsUtils {
}
}
/**
 * Copies the documents of {@code originalIndex} that match {@code queryBuilder} into
 * {@code currentIndex}, then adds an alias on the destination index.
 * <p>
 * The alias name is {@code currentIndex} with {@code cl_special_1.0_} replaced by
 * {@code cl_major_}. Any exception is printed and reported to the caller as {@code 0}.
 *
 * @param clusterName   name used to look up the transport client
 * @param originalIndex source index of the reindex request
 * @param currentIndex  destination index of the reindex request
 * @param queryBuilder  filter restricting which source documents are copied
 * @return the "created" count reported by the reindex response, or {@code 0} on failure
 */
public static long reIndexByTask(String clusterName,
                                 String originalIndex,
                                 String currentIndex,
                                 QueryBuilder queryBuilder) {
    try {
        TransportClient esClient = getClient(clusterName);
        System.out.println(originalIndex + " *** " + currentIndex);

        // Build the reindex request step by step: source, destination, filter, refresh.
        ReindexRequestBuilder reindexRequest = ReindexAction.INSTANCE.newRequestBuilder(esClient);
        reindexRequest.source(originalIndex);
        reindexRequest.destination(currentIndex);
        reindexRequest.filter(queryBuilder);
        reindexRequest.refresh(true);
        BulkByScrollResponse reindexResult = reindexRequest.get();

        // Add an alias: the cl_special_1.0_ prefix of the destination becomes cl_major_.
        String aliasName = currentIndex.replace("cl_special_1.0_", "cl_major_");
        IndicesAliasesRequestBuilder aliasRequest = IndicesAliasesAction.INSTANCE.newRequestBuilder(esClient);
        aliasRequest.addAlias(currentIndex, aliasName);
        IndicesAliasesResponse aliasResult = aliasRequest.get();

        System.out.println("******* : " + reindexResult);
        System.out.println("##### : " + aliasResult);
        return reindexResult.getCreated();
    } catch (Exception e) {
        e.printStackTrace();
        return 0;
    }
}
public static void delIndexByTasks(String clusterName, String indexName, String cid, List<String> tasks) {
try {
TransportClient client = getClient(clusterName);
@ -707,6 +830,17 @@ public abstract class EsUtils {
}
}
/**
 * Sets the {@code taskId} field of every document in {@code currentIndex} that matches
 * {@code queryBuilder}.
 * <p>
 * The value is written as a string (the decimal form of {@code taskId}, or {@code "null"}
 * when {@code taskId} is {@code null}), exactly as the previous inline-concatenated script did.
 * The id is passed as a script parameter so the script source stays constant across calls
 * instead of producing a new script to compile for every distinct task id.
 *
 * @param clusterName  name used to look up the transport client
 * @param currentIndex index whose documents are updated
 * @param queryBuilder filter selecting the documents to update
 * @param taskId       task id to store on the matching documents
 */
public static void updateByQuery(String clusterName, String currentIndex, QueryBuilder queryBuilder, Long taskId) {
    TransportClient client = getClient(clusterName);

    Map<String, Object> scriptParams =
            Collections.<String, Object>singletonMap("taskId", String.valueOf(taskId));
    Script setTaskId = new Script(ScriptType.INLINE, "painless",
            "ctx._source['taskId'] = params.taskId", scriptParams);

    // NOTE(review): size(1000) is kept from the original; on this builder it may cap the total
    // number of documents processed rather than the batch size -- confirm this is intended.
    UpdateByQueryAction.INSTANCE.newRequestBuilder(client)
            .source(currentIndex)
            .filter(queryBuilder)
            .size(1000)
            .script(setTaskId)
            .get();
}
/**
* 全文检索查询拼接(非nested属性重载方法)
*
@ -757,4 +891,461 @@ public abstract class EsUtils {
public QueryBuilder nestedQuery(String nested, QueryBuilder queryBuilder) {
return QueryBuilders.nestedQuery(nested, queryBuilder, ScoreMode.None);
}
// public void testAggAndDistinct(){
// //获取注解通过注解可以得到 indexName type
// Document document = Customer.class.getAnnotation(Document.class);
// // dateHistogram Aggregation 是时间柱状图聚合按照天来聚合
// // dataAgg 为聚合结果的名称createTime 为字段名称
// // cardinality 用来去重
// SearchQuery searchQuery = new NativeSearchQueryBuilder()
// .withQuery(matchAllQuery())
// .withSearchType(SearchType.QUERY_THEN_FETCH)
// .withIndices(document.indexName()).withTypes(document.type())
// .addAggregation(AggregationBuilders.dateHistogram("dataAgg").field("createTime")
// .dateHistogramInterval(DateHistogramInterval.DAY)
// .subAggregation(AggregationBuilders.cardinality("nameAgg").field("firstName")))
// .build();
//
// // 聚合的结果
// Aggregations aggregations = elasticsearchTemplate.query(searchQuery, response -> response.getAggregations());
// Map<String, Aggregation> results = aggregations.asMap();
// Histogram histogram = (Histogram) results.get("dataAgg");
// // 将bucket list 转换成 map key -> 名字 value-> 出现次数
// histogram.getBuckets().stream().forEach(t->{
// Histogram.Bucket histogram1 = t;
// System.out.println(histogram1.getKeyAsString());
// Cardinality cardinality = histogram1.getAggregations().get("nameAgg");
// System.out.println(cardinality.getValue());
// });
// }
/**
 * Returns the raw hit count ({@code totalHits}) of {@code queryBuilder} over {@code index},
 * without any de-duplication.
 * <p>
 * The search is executed exactly once; the previous version ran the identical request twice
 * (once only to print the count). The cardinality aggregation it used to build was never
 * attached to the request, so it has been removed together with the experimental
 * commented-out code. As a consequence a {@code null} {@code searchType} no longer makes the
 * method fall into the catch block and return 0.
 *
 * @param clusterName  name used to look up the transport client
 * @param index        indices to search
 * @param queryBuilder query whose hits are counted
 * @param searchType   unused by this variant; kept for signature compatibility
 * @return total number of hits, or {@code 0} if the search fails (the exception is printed)
 */
public static Long queryTotalCountNew0530(String clusterName, String[] index,
                                          QueryBuilder queryBuilder,
                                          Integer searchType) {
    System.out.println("---------------------------");
    long resultCount = 0L;
    try {
        TransportClient client = getClient(clusterName);
        SearchRequestBuilder requestBuilder = client.prepareSearch().setIndices(index)
                .setIndicesOptions(IndicesOptions.fromOptions(true, true, true, false))
                .setQuery(queryBuilder);
        // Single round-trip: reuse the same count for the debug output and the result.
        resultCount = requestBuilder.get().getHits().totalHits;
        System.out.println("totalHits : " + resultCount);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return resultCount;
}
/**
 * Runs a plain (non-highlighted, non-collapsed) from/size search and returns the source of
 * every hit as a {@link JSONObject}.
 * <p>
 * Each result additionally carries a {@code subjectId} entry: the name of the index the hit
 * came from with the prefixes {@code cl_major_}, {@code cl_subject_} and
 * {@code cl_special_1.0_} removed.
 * <p>
 * The previous version built a {@code CollapseBuilder} from {@code searchType} that was never
 * attached to the request; that dead local (and the NullPointerException it caused for a
 * {@code null} {@code searchType}) has been removed.
 *
 * @param clusterName  name used to look up the transport client
 * @param index        indices to search
 * @param queryBuilder query to execute
 * @param sortFlag     field to sort on
 * @param orderFlag    {@code ESConstant.ASC} for ascending order, anything else for descending
 * @param size         page size
 * @param from         offset of the first hit
 * @param searchType   unused by this variant; kept for signature compatibility
 * @return one JSON object per hit, empty when nothing matched
 */
public static List<JSONObject> query05301(String clusterName, String[] index,
                                          final QueryBuilder queryBuilder,
                                          String sortFlag, String orderFlag,
                                          Integer size, Integer from,
                                          Integer searchType) {
    System.out.println("非高亮查询");
    TransportClient client = getClient(clusterName);

    // from + size pagination
    SearchRequestBuilder requestBuilder = client.prepareSearch().setIndices(index)
            .setIndicesOptions(IndicesOptions.fromOptions(true, true, true, false))
            .addSort(sortFlag, orderFlag.equals(ESConstant.ASC) ? SortOrder.ASC : SortOrder.DESC)
            .setQuery(queryBuilder)
            .setSize(size)
            .setFrom(from);
    System.out.println(requestBuilder);

    SearchResponse searchResponse = requestBuilder.execute().actionGet();
    List<JSONObject> dataList = new ArrayList<>();
    if (searchResponse.getHits().totalHits > 0) {
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            JSONObject data = new JSONObject();
            data.putAll(hit.getSourceAsMap());
            data.put("subjectId", hit.getIndex()
                    .replace("cl_major_", "")
                    .replace("cl_subject_", "")
                    .replace("cl_special_1.0_", ""));
            dataList.add(data);
        }
    }
    return dataList;
}
/**
 * First group of queries: counts hits without any aggregation.
 * <p>
 * Returns the raw {@code totalHits} of {@code queryBuilder} over {@code index}. The search is
 * executed exactly once; the previous version issued the identical request twice (once only
 * for the debug print).
 *
 * @param clusterName  name used to look up the transport client
 * @param index        indices to search
 * @param queryBuilder query whose hits are counted
 * @param searchType   not used by this method
 * @return total number of hits, or {@code 0} if the search fails (the exception is printed)
 */
public static Long queryTotalCountNew_0531(String clusterName, String[] index,
                                           QueryBuilder queryBuilder,
                                           Integer searchType) {
    System.out.println("---------------------------");
    long resultCount = 0L;
    try {
        TransportClient client = getClient(clusterName);
        boolean options = true;
        boolean optionsf = false;
        SearchRequestBuilder requestBuilder = client.prepareSearch().setIndices(index)
                .setIndicesOptions(IndicesOptions.fromOptions(options, options, options, optionsf))
                .setQuery(queryBuilder);
        // Single round-trip: reuse the same count for the debug output and the result.
        resultCount = requestBuilder.get().getHits().totalHits;
        System.out.println("totalHits : " + resultCount);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return resultCount;
}
// public static List<JSONObject> query_0531(String clusterName, String[] index,
// final QueryBuilder queryBuilder,
// String sortFlag, String orderFlag,
// Integer size, Integer from,
// Integer searchType) {
// System.out.println("非高亮查询");
// TransportClient client = getClient(clusterName);
// boolean options = true;
// boolean optionsf = false;
// // from + size 分页 查询方式
// SearchRequestBuilder requestBuilder = client.prepareSearch().setIndices(index)
// .setIndicesOptions(IndicesOptions.fromOptions(options, options, options, optionsf))
// .addSort(sortFlag, orderFlag.equals(ESConstant.ASC) ? SortOrder.ASC : SortOrder.DESC)
// .setQuery(queryBuilder)
// .setSize(size)
// .setFrom(from);
//
// System.out.println(requestBuilder);
//
// SearchResponse searchResponse = requestBuilder.execute().actionGet();
// List<JSONObject> dataList = new ArrayList<>();
// if (searchResponse.getHits().totalHits > 0) {
// SearchHit[] hits = searchResponse.getHits().getHits();
// for (int i = 0; i < hits.length; i++) {
// JSONObject data = new JSONObject();
// data.putAll(hits[i].getSourceAsMap());
// data.put("subjectId", hits[i].getIndex()
// .replace("cl_major_", "")
// .replace("cl_subject_", "")
// .replace("cl_special_1.0_", ""));
// dataList.add(data);
// }
// }
// return dataList;
// }
/**
 * Runs a highlighted, collapsed from/size search and returns the source of every hit as a
 * {@link JSONObject}.
 * <p>
 * Results are collapsed on {@code ESConstant.DOC_ID} when {@code searchType} is 0 and on
 * {@code ESConstant.DATA_ID} otherwise. Highlighting is requested for every key of
 * {@code BaseFieldEnum.getMatchFieldsWithPy()}; the highlight results for content, title,
 * OCR text and ASR text are then handed to {@code getHighlightResult} for each hit. Every
 * result also carries a {@code subjectId} entry derived from the name of the hit's index.
 *
 * @param clusterName  name used to look up the transport client
 * @param index        indices to search
 * @param queryBuilder query to execute
 * @param sortFlag     field to sort on
 * @param orderFlag    {@code ESConstant.ASC} for ascending order, anything else for descending
 * @param size         page size
 * @param from         offset of the first hit
 * @param searchType   0 collapses on the document id, any other value on the data id
 * @return one JSON object per hit, empty when nothing matched
 */
public static List<JSONObject> queryWithHighlight(String clusterName, String[] index,
                                                  final QueryBuilder queryBuilder,
                                                  String sortFlag, String orderFlag,
                                                  Integer size, Integer from,
                                                  Integer searchType) {
    System.out.println("高亮查询");
    EsBaseParam highlightParam = new EsBaseParam();
    TransportClient esClient = getClient(clusterName);

    // The same document may exist once per task, so results are collapsed:
    // on the document id for searchType 0, on the data id otherwise.
    CollapseBuilder collapse = new CollapseBuilder(
            searchType == 0 ? ESConstant.DOC_ID : ESConstant.DATA_ID);

    highlightParam.setWithHighlight(true);
    highlightParam.setHighlightFields(new ArrayList<>(BaseFieldEnum.getMatchFieldsWithPy().keySet()));

    HighlightBuilder highlighter = new HighlightBuilder();
    // only highlight fields that the query actually matched
    highlighter.requireFieldMatch(true);
    highlighter.order(HighlightBuilder.Order.SCORE);
    // at most two non-contiguous fragments per field
    highlighter.numOfFragments(2);
    highlighter.preTags(ESConstant.HIGHLIGHTPRETAGS);
    highlighter.postTags(ESConstant.HIGHLIGHTPOSTTAGS);
    BaseFieldEnum.getMatchFieldsWithPy().keySet().forEach(highlighter::field);
    highlightParam.setHighlightBuilder(highlighter);

    // from + size pagination
    SortOrder direction = orderFlag.equals(ESConstant.ASC) ? SortOrder.ASC : SortOrder.DESC;
    SearchRequestBuilder request = esClient.prepareSearch().setIndices(index)
            .setIndicesOptions(IndicesOptions.fromOptions(true, true, true, false))
            .addSort(sortFlag, direction)
            .setQuery(queryBuilder)
            .setCollapse(collapse)
            .setSize(size)
            .setFrom(from)
            .highlighter(highlightParam.getHighlightBuilder());
    System.out.println(request);
    System.out.println("-----");

    SearchResponse response = request.execute().actionGet();
    List<JSONObject> results = new ArrayList<>();
    if (response.getHits().totalHits <= 0) {
        return results;
    }

    String[] highlightedFields = {ESConstant.CONTENT, ESConstant.TITLE, ESConstant.OCRTEXT, ESConstant.ASRTEXT};
    for (SearchHit hit : response.getHits().getHits()) {
        JSONObject row = new JSONObject();
        row.putAll(hit.getSourceAsMap());
        for (String field : highlightedFields) {
            getHighlightResult(field, hit, row);
        }
        String subjectId = hit.getIndex()
                .replace("cl_major_", "")
                .replace("cl_subject_", "")
                .replace("cl_special_1.0_", "");
        row.put("subjectId", subjectId);
        results.add(row);
    }
    return results;
}
/**
 * Counts the results of {@code queryBuilder} over {@code index} with de-duplication.
 * <p>
 * A cardinality aggregation is computed on {@code ESConstant.DOC_ID} when {@code searchType}
 * is 0 and on {@code ESConstant.DATA_ID} otherwise, and its value is returned. When
 * {@code searchType} is 2 the raw {@code totalHits} is returned instead of the aggregation
 * value.
 * <p>
 * The request is executed exactly once; the previous version sent the identical request two
 * or three times (debug print, aggregation read, and the {@code searchType == 2} branch).
 *
 * @param clusterName  name used to look up the transport client
 * @param index        indices to search
 * @param queryBuilder query whose results are counted
 * @param searchType   0 counts distinct document ids, 2 returns the raw hit count, any other
 *                     value counts distinct data ids
 * @return the count, or {@code 0} if anything fails (the exception is printed)
 */
public static Long queryTotalCountNew(String clusterName, String[] index,
                                      QueryBuilder queryBuilder,
                                      Integer searchType) {
    long resultCount = 0L;
    try {
        TransportClient client = getClient(clusterName);
        String aggrCount = "count";
        AggregationBuilder aggregation;
        if (searchType == 0) {
            aggregation = AggregationBuilders.cardinality(aggrCount).field(ESConstant.DOC_ID);
        } else {
            aggregation = AggregationBuilders.cardinality(aggrCount).field(ESConstant.DATA_ID);
        }

        // One round-trip provides both the hit count and the aggregation.
        SearchResponse searchResponse = client.prepareSearch().setIndices(index)
                .setQuery(queryBuilder)
                .addAggregation(aggregation)
                .get();

        long totalHits = searchResponse.getHits().totalHits;
        System.out.println("totalHits : " + totalHits);

        Cardinality cardinality = searchResponse.getAggregations().get(aggrCount);
        System.out.println("cardinality : " + cardinality.getValue());

        // searchType 2 is not de-duplicated by id, so it reports the raw hit count.
        resultCount = (searchType == 2) ? totalHits : cardinality.getValue();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return resultCount;
}
/**
 * Runs a non-highlighted, collapsed from/size search and returns the source of every hit as a
 * {@link JSONObject}.
 * <p>
 * Results are collapsed on {@code ESConstant.DOC_ID} when {@code searchType} is 0 and on
 * {@code ESConstant.DATA_ID} otherwise; a cardinality aggregation on the same field is
 * requested and its value is printed next to the raw hit count. Every result also carries a
 * {@code subjectId} entry: the name of the hit's index with the prefixes {@code cl_major_},
 * {@code cl_subject_} and {@code cl_special_1.0_} removed.
 * <p>
 * The search is executed exactly once; the previous version re-ran the whole request a second
 * time only to print {@code totalHits}.
 *
 * @param clusterName  name used to look up the transport client
 * @param index        indices to search
 * @param queryBuilder query to execute
 * @param sortFlag     field to sort on
 * @param orderFlag    {@code ESConstant.ASC} for ascending order, anything else for descending
 * @param size         page size
 * @param from         offset of the first hit
 * @param searchType   0 collapses on the document id, any other value on the data id
 * @return one JSON object per hit, empty when nothing matched
 */
public static List<JSONObject> query(String clusterName, String[] index,
                                     final QueryBuilder queryBuilder,
                                     String sortFlag, String orderFlag,
                                     Integer size, Integer from,
                                     Integer searchType) {
    System.out.println("非高亮查询");
    TransportClient client = getClient(clusterName);
    boolean options = true;
    boolean optionsf = false;
    String aggrCount = "count";

    CollapseBuilder collapseBuilder = new CollapseBuilder(ESConstant.DATA_ID);
    AggregationBuilder aggregationBuilder = AggregationBuilders.cardinality(aggrCount).field(ESConstant.DATA_ID);
    if (searchType == 0) {
        collapseBuilder = new CollapseBuilder(ESConstant.DOC_ID);
        aggregationBuilder = AggregationBuilders.cardinality(aggrCount).field(ESConstant.DOC_ID);
    }

    // from + size pagination
    SearchRequestBuilder requestBuilder = client.prepareSearch().setIndices(index)
            .setIndicesOptions(IndicesOptions.fromOptions(options, options, options, optionsf))
            .addSort(sortFlag, orderFlag.equals(ESConstant.ASC) ? SortOrder.ASC : SortOrder.DESC)
            .setQuery(queryBuilder)
            .setCollapse(collapseBuilder)
            .addAggregation(aggregationBuilder)
            .setSize(size)
            .setFrom(from);
    System.out.println(requestBuilder);

    SearchResponse searchResponse = requestBuilder.execute().actionGet();
    long totalHits = searchResponse.getHits().totalHits;

    List<JSONObject> dataList = new ArrayList<>();
    if (totalHits > 0) {
        for (SearchHit hit : searchResponse.getHits().getHits()) {
            JSONObject data = new JSONObject();
            data.putAll(hit.getSourceAsMap());
            data.put("subjectId", hit.getIndex()
                    .replace("cl_major_", "")
                    .replace("cl_subject_", "")
                    .replace("cl_special_1.0_", ""));
            dataList.add(data);
        }
    }

    // Distinct count from the aggregation vs. raw hit count, both from the same response.
    Cardinality cardinality = searchResponse.getAggregations().get(aggrCount);
    long value = cardinality.getValue();
    System.out.println("去重总数:" + value);
    System.out.println("不去重的总数:" + totalHits);
    return dataList;
}
// private long getCardinality( QueryBuilder queryBuilder,String indexName,
// Integer size, Integer from) {
// // 获取查询的索引列表String indexName = "sjck_personnel"
// ;// 获取查询的条件列表
//// List<HashMap<String, String>> options = (List<HashMap<String, String>>) bindParams.get("conditions");
//// // 1.构建查询请求
// SearchRequest searchRequest = new SearchRequest(indexName);
//// // 4.构建最外面的
//// boolQueryBoolQueryBuilder query = QueryBuilders.boolQuery();
//// // 5.构建查询请求
//// synQueryPersonnelIndexBuilder(query, options);
// //6.高亮
// HighlightBuilder highlightBuilder = new HighlightBuilder();
// // 所有查询出来的字段全部高亮
// HighlightBuilder.Field highlightTitle = new HighlightBuilder.Field("*").requireFieldMatch(false);
// highlightTitle.highlighterType("unified");
// highlightBuilder.field(highlightTitle);
// //从第几条开始
//
// // 3.构建高亮
// AggregationBuilder aggregation = AggregationBuilders.cardinality("total_size").field("concat_field");
// SearchSourceBuilder sourceBuilder = new SearchSourceBuilder()
// .query(queryBuilder)
// .highlighter(highlightBuilder)
// .from(from)
// .size(size)
// .aggregation(aggregation);
// // 2.将查询构建器放入查询请求中
// searchRequest.source(sourceBuilder);
// SearchResponse searchResponse = null;
// try {
// searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
// } catch (ElasticsearchStatusException e) {
// logger.error("请检查elasticsearchIndex是否存在{},错误信息{}", e, e.getMessage());
// } catch (IOException e) {
// logger.error("搜索出错了{},错误信息{}", e, e.getMessage());
// }
// assert searchResponse != null;
// ParsedCardinality parsedCardinality = (ParsedCardinality) searchResponse.getAggregations().asList().get(0);
// return parsedCardinality.getValue();
// }
}

33
cl_search_api/src/main/java/com/bfd/mf/common/web/vo/params/QueryRequest.java

@ -91,7 +91,7 @@ public class QueryRequest implements Serializable {
private String valueLabel;
private String categoryLabel;
private List<String> tasks;
// private List<String> tasks;
private String originalIndex;
private String currentIndex;
@ -102,6 +102,21 @@ public class QueryRequest implements Serializable {
private String pageType;
private String userType;
private String ocrTest;
private String asrText;
private List<Map<String, Object>> tasks;
public List<Map<String, Object>> getTasks() {
return tasks;
}
public void setTasks(List<Map<String, Object>> tasks) {
this.tasks = tasks;
}
public String getUserType() {
return userType;
}
@ -143,6 +158,7 @@ public class QueryRequest implements Serializable {
}
private List<HighLevelQuery> highLevelQueries;
public List<HighLevelQuery> getHighLevelQueries() {
return highLevelQueries;
}
@ -152,7 +168,6 @@ public class QueryRequest implements Serializable {
}
public String getOriginalIndex() {
return originalIndex;
}
@ -169,13 +184,13 @@ public class QueryRequest implements Serializable {
this.currentIndex = currentIndex;
}
public List<String> getTasks() {
return tasks;
}
public void setTasks(List<String> tasks) {
this.tasks = tasks;
}
// public List<String> getTasks() {
// return tasks;
// }
//
// public void setTasks(List<String> tasks) {
// this.tasks = tasks;
// }
public String getValueLabel() {
return valueLabel;

16
cl_search_api/src/main/java/com/bfd/mf/common/web/vo/view/monitor/ESMonitorBaseEntity.java

@ -790,16 +790,14 @@ public class ESMonitorBaseEntity implements Comparable<ESMonitorBaseEntity>, Ser
}
public String getSysSentimentTag() {
sysSentimentTag = sysSentiment.toString();
if(sysSentiment < 0.5){
sysSentimentTag = "负面";
}else if(sysSentiment == 0.5 || sysSentiment == 0.0){
sysSentimentTag = "中性";
// if(sysSentiment < 0.5){
// sysSentimentTag = "负面";
// }
// if(sysSentiment == 0.5){
// sysSentimentTag = "中性";
// }
// if(sysSentiment > 0.5){
// sysSentimentTag = "正面";
// }
}else if(sysSentiment > 0.5){
sysSentimentTag = "正面";
}
return sysSentimentTag;
}

17
cl_search_api/src/main/java/com/bfd/mf/controller/SearchDataController.java

@ -67,7 +67,8 @@ public class SearchDataController {
if (null != queryRequest.getPage() && !queryRequest.getPage().equals("")) {
if (page > 0 && queryRequest.getPage() > page) {
//return ResponseWrapper.buildResponse(RTCodeEnum.C_SUBJECT_GRAMMAR_ERROR, "总数和分页不匹配");
return ResponseWrapper.buildResponse(RTCodeEnum.C_OK, result); }
return ResponseWrapper.buildResponse(RTCodeEnum.C_OK, result);
}
}
long end = System.currentTimeMillis();
logger.info("接口查询时长:statr:" + start + " ; end:" + end + " ; time = " + (end - start) + " ; count = " + result.get(ESConstant.ALLDOCNUMBER));
@ -152,6 +153,7 @@ public class SearchDataController {
* https://caiji.percent.cn/api/sq/crawl/getCommentsByDocId
* https://caiji.percent.cn/api/sq/crawl/getQuotesByDocId
* https://caiji.percent.cn/api/sq/crawl/getAttitudesByDocId
*
* @param queryRequest
* @return
*/
@ -171,6 +173,7 @@ public class SearchDataController {
}
}
@ResponseBody
@ApiOperation(value = "查询点赞列表")
@RequestMapping(value = "/getAttitudesByDocId", method = RequestMethod.POST, consumes = MediaTypes.JSON_UTF_8, produces = MediaTypes.JSON_UTF_8)
@ -191,6 +194,7 @@ public class SearchDataController {
/**
* 查询 数据的Counts 用户左侧的显示
*
* @param queryRequest
* @return
*/
@ -217,6 +221,7 @@ public class SearchDataController {
/**
* 崔老师版本使修改标签调用的接口其他版本不调用该接口
*
* @param queryRequest
* @return
*/
@ -235,8 +240,6 @@ public class SearchDataController {
}
/**
* 删除专题的接口
*/
@ -285,6 +288,7 @@ public class SearchDataController {
/**
* 获取json 结构数据
*
* @param queryRequest
* @return
*/
@ -334,13 +338,16 @@ public class SearchDataController {
public JSONObject reIndex(@RequestBody QueryRequest queryRequest) {
// Re-index endpoint handler: logs the request serialized as JSON, delegates to
// searchDataService, and returns a C_OK response wrapping an empty JSON object.
// If a service call throws, the error is logged and a C_SERVICE_NOT_AVAILABLE
// response with the message "Query failed" is returned instead.
logger.info("[reIndex] partial / Params: {}", JSONObject.toJSONString(queryRequest));
try {
// NOTE(review): both a live reIndexSubject call and a commented-out copy of it appear
// here, followed by reIndexFolder — confirm which call is actually meant to run.
searchDataService.reIndexSubject(queryRequest);
//searchDataService.reIndexSubject(queryRequest);
searchDataService.reIndexFolder(queryRequest);
} catch (Exception e) {
// NOTE(review): the exception is the only argument after the pattern; assuming an SLF4J
// logger, it is presumably logged as the throwable and the "{}" stays unfilled — confirm.
logger.error("[reIndex] Failed,The error message is :{}", e);
return ResponseWrapper.buildResponse(RTCodeEnum.C_SERVICE_NOT_AVAILABLE, "Query failed");
}
return ResponseWrapper.buildResponse(RTCodeEnum.C_OK, new JSONObject());
}
/**
* 2023-04-14 采集平台2.0新增接口
* 移动任务的数据并将原索引中的数据删除
@ -348,6 +355,7 @@ public class SearchDataController {
* crawl/subject/moveByTasks
* 参数
* {"originalIndex":"302088","moveTasks":["13889"],"currentIndex":"309980"}
*
* @param queryRequest
* @return
*/
@ -378,6 +386,7 @@ public class SearchDataController {
* crawl/subject/deleteByTasks
* 参数
* {"index":"302088","delTasks":["13889"]}
*
* @param queryRequest
* @return
*/

155
cl_search_api/src/main/java/com/bfd/mf/service/SearchDataService.java

@ -8,6 +8,7 @@ import com.bfd.mf.common.service.es.EsQueryServiceForSQNormal;
import com.bfd.mf.common.service.es.SubjectQueryDataService;
import com.bfd.mf.common.util.ESServerUtils;
import com.bfd.mf.common.util.constants.ESConstant;
import com.bfd.mf.common.util.es.EsUtils;
import com.bfd.mf.common.web.entity.mysql.SentimentModify;
import com.bfd.mf.common.web.entity.mysql.cache.Cluster;
import com.bfd.mf.common.web.repository.mysql.SentimentRepository;
@ -22,7 +23,10 @@ import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.collapse.CollapseBuilder;
import org.elasticsearch.search.sort.SortOrder;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
@ -209,7 +213,7 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
* 导出时会用到这个字段解析解析组装返回结果 2
*/
private ESMonitorEntity parseMainMessage(JSONObject jsonObject, Integer searchType,
Map<String, Map<String, Object>> siteMap) throws Exception {
Map<String, Map<String, Object>> siteMap) {
ESMonitorEntity esMonitorEntity = new ESMonitorEntity();
try {
Map<String, Object> sourceAsMap = jsonObject;
@ -351,15 +355,31 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
List filePathSize = new ArrayList();
if (sourceAsMap.containsKey(ESConstant.FILEPATHSIZE)) {
if (!("").equals(sourceAsMap.get(ESConstant.FILEPATHSIZE)) && null != sourceAsMap.get(ESConstant.FILEPATHSIZE)) {
// filePathSize = JSONObject.parseArray(sourceAsMap.get(ESConstant.FILEPATHSIZE).toString());
if (sourceAsMap.get(ESConstant.FILEPATHSIZE).toString().contains("url=")) {
filePathSize = (List) sourceAsMap.get(ESConstant.FILEPATHSIZE);
} else {
filePathSize = JSONObject.parseArray(sourceAsMap.get(ESConstant.FILEPATHSIZE).toString());
}
}
}
List imagePathSize = new ArrayList();
if (sourceAsMap.containsKey(ESConstant.IMAGEPATHSIZE)) {
if (null != sourceAsMap.get(ESConstant.IMAGEPATHSIZE) && !("[]").equals(sourceAsMap.get(ESConstant.IMAGEPATHSIZE))) {
//if(sourceAsMap.get(ESConstant.IMAGEPATHSIZE))
/**
* [{"size":"","videoTime":"","resolution":"","url":"/group13/default/20220928/17/23/6/86b2566a903bbdbfa8e9313e105a2beb_4.png"}, {"size":"","videoTime":"","resolution":"","url":"/group13/default/20220928/17/23/6/86b2566a903bbdbfa8e9313e105a2beb_7.png"}]
* [{"size":"3541.040039KB","videoTime":"70.980000s","resolution":"","url":"http://crawl-files.pontoaplus.com/group13/default/20221010/11/50/6/7b5a86115c242223816d2b9e43acd0b1.mp4"}]
*/
// imagePathSize = JSONObject.parseArray(sourceAsMap.get(ESConstant.IMAGEPATHSIZE).toString());
/**
* [{size=107.41KB, videoTime=, resolution=-1x-1, url=/group16/default/20230308/16/07/6/53e6d72b9fe838529936572730d12441.jpg}]
*/
if (sourceAsMap.get(ESConstant.IMAGEPATHSIZE).toString().contains("url=")) {
imagePathSize = (List) sourceAsMap.get(ESConstant.IMAGEPATHSIZE);
} else {
imagePathSize = JSONObject.parseArray(sourceAsMap.get(ESConstant.IMAGEPATHSIZE).toString());
}
}
}
List videoPathSize = new ArrayList();
@ -368,13 +388,12 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
&& !("[]").equals(sourceAsMap.get(ESConstant.VIDEOPATHSIZE))
&& !("{\"\":null}").equals(sourceAsMap.get(ESConstant.VIDEOPATHSIZE).toString())) {
if (sourceAsMap.get(ESConstant.VIDEOPATHSIZE).toString().contains(ESConstant.URL)) {
if (sourceAsMap.get(ESConstant.VIDEOPATHSIZE) instanceof String) {
videoPathSize = JSONObject.parseArray(sourceAsMap.get(ESConstant.VIDEOPATHSIZE).toString());
} else {
if (sourceAsMap.get(ESConstant.VIDEOPATHSIZE).toString().contains("url=")) {
videoPathSize = (List) sourceAsMap.get(ESConstant.VIDEOPATHSIZE);
} else {
videoPathSize = JSONObject.parseArray(sourceAsMap.get(ESConstant.VIDEOPATHSIZE).toString());
}
//java.lang.String cannot be cast to java.util.List
//
}
}
}
@ -404,31 +423,40 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
//List<String>
// 视频分析结果
String asrText = "";
// String ocrText = "";
List<String> ocrText = new ArrayList<>();
if (sourceAsMap.containsKey(ESConstant.ASRTEXT)) {
asrText = sourceAsMap.get(ESConstant.ASRTEXT).toString();
}
if (sourceAsMap.containsKey(ESConstant.OCRTEXT)) {
ocrText = (List<String>) sourceAsMap.get(ESConstant.OCRTEXT);
if (sourceAsMap.get(ESConstant.OCRTEXT).toString().contains("[]")) {
} else {
ocrText.add(sourceAsMap.get(ESConstant.OCRTEXT).toString());
}
}
// 如果是用户数据需要获取下面四个字段值
String fansCount = "";
String friendsCount = "";
String postCount = "";
String location = "";
if (searchType == 2) {
if (sourceAsMap.containsKey(ESConstant.FANS_COUNT)) {
fansCount = sourceAsMap.get(ESConstant.FANS_COUNT).toString();
}
if (sourceAsMap.containsKey(ESConstant.FRIENDS_COUNT)) {
friendsCount = sourceAsMap.get(ESConstant.FRIENDS_COUNT).toString();
}
if (sourceAsMap.containsKey(ESConstant.POST_COUNT)) {
postCount = sourceAsMap.get(ESConstant.POST_COUNT).toString();
}
if (sourceAsMap.containsKey(ESConstant.WEIBO_LOCATION)) {
location = sourceAsMap.get(ESConstant.WEIBO_LOCATION).toString();
}
// if (searchType == 2) {
// if (sourceAsMap.containsKey(ESConstant.FANS_COUNT)) {
// fansCount = sourceAsMap.get(ESConstant.FANS_COUNT).toString();
// }
// if (sourceAsMap.containsKey(ESConstant.FRIENDS_COUNT)) {
// friendsCount = sourceAsMap.get(ESConstant.FRIENDS_COUNT).toString();
// }
// if (sourceAsMap.containsKey(ESConstant.POST_COUNT)) {
// postCount = sourceAsMap.get(ESConstant.POST_COUNT).toString();
// }
// if (sourceAsMap.containsKey(ESConstant.WEIBO_LOCATION)) {
// location = sourceAsMap.get(ESConstant.WEIBO_LOCATION).toString();
// }
// }
if (sourceAsMap.containsKey("location")) {
location = sourceAsMap.get("location").toString();
}
// 这个项目新增的三个字段
@ -546,8 +574,19 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
System.out.println(JSONObject.toJSONString(highlight));
}
// System.out.println("--------------" + url);
try {
esMonitorEntity.setForwardContent(sourceAsMap.get("forwardContent").toString());
String forwardContent = sourceAsMap.get("forwardContent").toString();
String forContent = forwardContent;
// if(forwardContent.contains("</title>")){
// Document doc = Jsoup.parse(forwardContent);
//String text = Jsoup.clean(forwardContent, Whitelist.basicWithImages());
String text = Jsoup.clean(forwardContent, Whitelist.basic());
forContent = text;
// }
esMonitorEntity.setForwardContent(forContent);
esMonitorEntity.setReadCount(readCount);
esMonitorEntity.setHasFile(hasFile);
esMonitorEntity.setHasVideo(hasVideo);
@ -572,6 +611,7 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
esMonitorEntity.setHlKeyWords(hlKeywords);
// 评论数转发数点赞数收藏数
esMonitorEntity.setCommentsCount(Integer.valueOf(sourceAsMap.getOrDefault(ESConstant.COMMENTS_COUNT, 0).toString()));
if (quoteCount.equals("-1")) {
quoteCount = "-";
}
@ -694,6 +734,7 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
/**
* 2023-04-26
* 根据主贴ID查 评论转发点赞的数据列表
*
* @param queryRequest
* @param currentIndexList
* @return
@ -732,6 +773,18 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
if (TopComment.size() > 0) {
size = 1L;
comments.add(TopComment);
String commentsCount = TopComment.getString(ESConstant.COMMENTS_COUNT);
if (commentsCount.equals("-1")) {
TopComment.put(ESConstant.COMMENTS_COUNT, "-");
}
String quoteCount = TopComment.getString(ESConstant.QUOTE_COUNT);
if (quoteCount.equals("-1")) {
TopComment.put(ESConstant.QUOTE_COUNT, "-");
}
String attitudesCount = TopComment.getString(ESConstant.ATTITUDES_COUNT);
if (attitudesCount.equals("-1")) {
TopComment.put(ESConstant.ATTITUDES_COUNT, "-");
}
}
boolQueryBuilder.mustNot(QueryBuilders.termQuery(ESConstant.DATA_ID, dataId));
}
@ -756,10 +809,12 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
long clusterId = 4;
System.out.println(boolQueryBuilder);
CollapseBuilder collapseBuilder = new CollapseBuilder(ESConstant.DATA_ID);
/**获取信息*/
SearchRequestBuilder builder = esServerUtils
.buildSearchRequestBuilder(clusterId, currentIndexList)
.setQuery(boolQueryBuilder)
.setCollapse(collapseBuilder)
.setFrom(start)
.setSize(limit)
.setFetchSource(ESConstant.COMMENT_FIELD_DATA, null)
@ -768,16 +823,41 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
SearchResponse searchResponse = builder.execute().actionGet();
SearchHit[] response = searchResponse.getHits().getHits();
/**
* 字段替换
* 将所有 = -1 的值替换成 -
*/
for (int i = 0; i < response.length; i++) {
JSONObject jsonObject = new JSONObject();
Map<String, Object> result = response[i].getSourceAsMap();
jsonObject.putAll(result);
jsonObject.put(ESConstant.SITEID, siteId);
String commentsCount = jsonObject.getString(ESConstant.COMMENTS_COUNT);
if (commentsCount.equals("-1")) {
jsonObject.put(ESConstant.COMMENTS_COUNT, "-");
}
String quoteCount = jsonObject.getString(ESConstant.QUOTE_COUNT);
if (quoteCount.equals("-1")) {
jsonObject.put(ESConstant.QUOTE_COUNT, "-");
}
String attitudesCount = jsonObject.getString(ESConstant.ATTITUDES_COUNT);
if (attitudesCount.equals("-1")) {
jsonObject.put(ESConstant.ATTITUDES_COUNT, "-");
}
comments.add(jsonObject);
}
/**
* TODO
* 评论回来的总数不对
*/
json.put(ESConstant.COMMENTLISTS, comments);
size = size + searchResponse.getHits().getTotalHits();
String[] currentList = {currentIndexList.get(0)};
Long totalCount = EsUtils.queryTotalCountNew("CL_Mini_2", currentList, boolQueryBuilder, 1);
System.out.println(size);
System.out.println(totalCount);
json.put(ESConstant.ALLDOCNUMBER, size);
} catch (Exception e) {
e.printStackTrace();
@ -1043,12 +1123,17 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
if (jsonObject.containsKey(ESConstant.DOC_TYPE)) {
if (jsonObject.get(ESConstant.DOC_TYPE).equals(ESConstant.SOCIAL)) {
String author = jsonObject.getString(ESConstant.AUTHOR);
/**
* 2023-05-17 社交媒体类详情的原文译文展示有问题
* 应该 author = author (translateTitle) 社交媒体类的用户不翻译
*/
String enSource = jsonObject.getString(ESConstant.EN_SOURCE);
if (!enSource.equals("weixin")) {
jsonObject.put(ESConstant.TITLE, author);
jsonObject.put(ESConstant.TRANSLATETITLE, author);
// content 的值 放入到 译文Title 中是为了展示一下翻译这个后面还是删掉吧
String content = jsonObject.getString(ESConstant.CONTENT);
jsonObject.put(ESConstant.TRANSLATECONTENT, content);
// String content = jsonObject.getString(ESConstant.CONTENT);
// jsonObject.put(ESConstant.TRANSLATECONTENT, content);
}
if (jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.SINA)) {
jsonObject.put(ESConstant.SOURCE, "微博");
@ -1370,6 +1455,28 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
return jsonObject;
}
/**
 * Delegates to {@code esQueryServiceForSQMini.reIndexDataNew} and stores the {@code long} it
 * returns under the key {@code "created"}.
 *
 * <p>Design note (2023-05-29): when a sample folder is copied, the task IDs also have to be
 * replaced with new ones, so a plain reindex is probably not needed here; pulling the data and
 * swapping the IDs while writing may be enough.
 *
 * <p>NOTE(review): this method is annotated {@code @Async} but returns a plain object rather
 * than {@code void} or a {@code Future}; assuming this is Spring's {@code @Async}, callers
 * going through the proxy presumably do not receive this value — confirm.
 *
 * @param queryRequest request passed unchanged to {@code reIndexDataNew}
 * @return a JSON object containing {@code "created"}; left empty when the delegate call throws
 */
@Async
public JSONObject reIndexFolder(QueryRequest queryRequest) {
    final JSONObject outcome = new JSONObject();
    try {
        // Copy the data of the source subject into the new subject.
        final long createdCount = esQueryServiceForSQMini.reIndexDataNew(queryRequest);
        outcome.put("created", createdCount);
    } catch (Exception e) {
        // Best effort: the failure is only printed and the (empty) result is still returned.
        e.printStackTrace();
    }
    return outcome;
}
// Integer status codes; their usages are outside this view, so the meaning of each value
// should be confirmed against the callers.
// NOTE(review): these are declared as non-final statics; if nothing reassigns them,
// consider making them final.
private static int MOVE_TASKS_STATUS = 6;
// NOTE(review): "FINSH" presumably stands for "FINISH" — confirm before renaming.
private static int FINSH_TASKS_STATUS = 3;
// NOTE(review): "FLORD" presumably stands for "FOLDER" — confirm before renaming.
private static int MOVE_FLORD_STATUS = 1;

4
pom.xml

@ -5,8 +5,8 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.bfd.mf</groupId>
<artifactId>cl_stream_3.2</artifactId>
<version>3.2-SNAPSHOT</version>
<artifactId>cl_stream_3.3</artifactId>
<version>3.3-SNAPSHOT</version>
<packaging>pom</packaging>
<modules>

Loading…
Cancel
Save