Browse Source

release-3.1.3(20210813):修复了新增站点 LinkedIn 和 Instagram 在查询详情时标题不展示 author 的问题;修复了离线统计时会将粉丝数一并统计,从而导致 task 的统计结果与查询结果不相符的问题。

release-1.0
杜静 4 years ago
parent
commit
1da21ed362
  1. 3
      cl_query_data_job/src/main/java/com/bfd/mf/job/domain/repository/TaskRepository.java
  2. 21
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
  3. 12
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java
  4. 2
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java
  5. 10
      cl_query_data_job/src/main/java/com/bfd/mf/job/worker/QueryProducer.java
  6. 3
      cl_search_api/src/main/java/com/bfd/mf/controller/SearchDataController.java
  7. 39
      cl_search_api/src/main/java/com/bfd/mf/service/SearchDataService.java

3
cl_query_data_job/src/main/java/com/bfd/mf/job/domain/repository/TaskRepository.java

@ -17,7 +17,8 @@ public interface TaskRepository extends CrudRepository<Task, Long> {
// Tasks that need statistics: (1) crawl_status is 0 or 1, or (2) crawl_status is 3 and end_time is later than two days before the current date.
@Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type, ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM `cl_task` ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.del = 0 AND ((ct.crawl_status = 1 OR ct.crawl_status = 0) OR (ct.crawl_status = 3 AND ct.end_time > date_sub(curdate(),interval 2 day))); ",nativeQuery = true)
// @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type, ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM `cl_task` ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.del = 0 AND ct.subject_id = 12273 ; ",nativeQuery = true)
// @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type, ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM `cl_task` ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.del = 0 AND ct.subject_id = 12505 ; ",nativeQuery = true)
// @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type, ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM `cl_task` ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.del = 0 ; ",nativeQuery = true)
List<Task> findAllBydel0();
@Query(value = "SELECT sum(data_total) FROM cl_task ct JOIN intelligent_crawl.cl_site cs ON ct.cid=cs.cid WHERE ct.del =0 AND ct.subject_id = ?1 AND cs.site_type = ?2",nativeQuery = true)

21
cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java

@ -139,19 +139,6 @@ public class EsQueryMiniService {
if (indexName.contains(indexNamePre)) {
boolean isExists = EsUtils.indexExists(clusterName, indexName);
if (isExists) {
// // 任务ID 筛选
// TermQueryBuilder cidTermQueryBuilder = QueryBuilders.termQuery(ESConstants.EN_SOURCE, cid);
// TermQueryBuilder taskIdTermQueryBuilder = QueryBuilders.termQuery(ESConstants.CRAWLDATAFLAG, crawlDataFlag);
// qb.must(taskIdTermQueryBuilder).must(cidTermQueryBuilder);
// // 时间范围筛选
// // BoolQueryBuilder shouldbq = QueryBuilders.boolQuery();
// RangeQueryBuilder rangeQueryBuilder = QueryBuilders
// .rangeQuery(ESConstants.PUBTIME)
// .gte(crawlStartTime)
// .lt(crawlEndTime);
// // 不用统计FB 的这种粉丝的量
// TermQueryBuilder pageTypeQueryBuilder = QueryBuilders.termQuery(ESConstants.PAGETYPR,"socialFans");
// qb.mustNot(pageTypeQueryBuilder).must(rangeQueryBuilder);
BoolQueryBuilder qb = getQueryBuilder(cid,crawlDataFlag,crawlStartTime,crawlEndTime);
logger.info("QB1 查询总量: indexName: {}. taskId : {}.{\"query\": {}}.", indexName, taskId, qb.toString().replace("\n", "").replace("\r", "").replace(" ", ""));
Long count = EsUtils.queryCount(clusterName, indexName, qb);
@ -202,15 +189,17 @@ public class EsQueryMiniService {
TermQueryBuilder cidTermQueryBuilder = QueryBuilders.termQuery(ESConstants.EN_SOURCE, cid);
TermQueryBuilder taskIdTermQueryBuilder = QueryBuilders.termQuery(ESConstants.CRAWLDATAFLAG, crawlDataFlag);
qb.must(taskIdTermQueryBuilder).must(cidTermQueryBuilder);
// 时间范围筛选
// BoolQueryBuilder shouldbq = QueryBuilders.boolQuery();
// Time-range filter: only main posts and comments are filtered by publish time; user documents do not need a time range.
BoolQueryBuilder shouldbq = QueryBuilders.boolQuery();
RangeQueryBuilder rangeQueryBuilder = QueryBuilders
.rangeQuery(ESConstants.PUBTIME)
.gte(crawlStartTime)
.lt(crawlEndTime);
TermQueryBuilder primary1 = QueryBuilders.termQuery(ESConstants.PRIMARY,2);
shouldbq.must(rangeQueryBuilder).mustNot(primary1);
// 不用统计FB 的这种粉丝的量
TermQueryBuilder pageTypeQueryBuilder = QueryBuilders.termQuery(ESConstants.PAGETYPR,"socialFans");
qb.mustNot(pageTypeQueryBuilder).must(rangeQueryBuilder);
qb.mustNot(pageTypeQueryBuilder).should(shouldbq);
return qb;
}
}

12
cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java

@ -63,7 +63,7 @@ public class QueryService {
// 注册数据查询来源
// EsUtils.registerCluster(config.esNormalClusterName(), config.esNormalAddress());// 配置文件中的 es-source
EsUtils.registerCluster(config.esMiniClusterName(), config.esMiniAddress()); // 配置文件中的 es-target
// pRateLimiter = RateLimiter.create(1.0D / config.getPeriodS());
pRateLimiter = RateLimiter.create(1.0D / config.getPeriodS());
kafkaProducer = Kafka010Utils.getProducer(config.getBrokerList());
// cRateLimiter = RateLimiter.create(1.0D / config.getPeriodS());
@ -487,6 +487,7 @@ public class QueryService {
String videoTime = "";
try {
if(null != downloadUrl && !downloadUrl.contains("si-te.percent.cn")) {
if (downloadUrl.contains("http")) {
Map<String, String> pathSizeMap = DownLoadFile.downloadAndSaveFile(downloadUrl, goFastAddr);
LOGGER.info("[QueryService] getPathSize goFaskAddr {}. resultMap {}.", goFastAddr, pathSizeMap);
if (pathSizeMap.size() > 0) {
@ -511,8 +512,7 @@ public class QueryService {
srcMap.put(downloadUrl, url);
// 这个值使用来替换 三个 Path imagePath,videoPath,filePath
path.add(url);
}
}
}
} catch (IOException e) {
@ -574,7 +574,7 @@ public class QueryService {
try {
// TermQueryBuilder queryCidBuilders = QueryBuilders.termQuery("enSource",cid);
TermQueryBuilder queryCrawlDataFlagBuilder = QueryBuilders.termQuery(ESConstants.CRAWLDATAFLAG,crawlDataFlag);
if (crawlDataFlag.contains("keyword")) {
if (crawlDataFlag.contains("keyword:")) {
String keyword = crawlDataFlag.split("keyword:")[1];
System.out.println("[buildCrawlDataFlagBuilder] keyword --- " + keyword); // 关键词的话需要去 title content 匹配一下
MatchPhraseQueryBuilder titleQuery = QueryBuilders.matchPhraseQuery(ESConstants.TITLE, keyword).slop(0);
@ -588,14 +588,14 @@ public class QueryService {
queryBuilder = QueryBuilders.boolQuery().should(titleFilterQuery).should(contentFilterQuery).should(queryCrawlDataFlagBuilder);
}
}
if (crawlDataFlag.contains("url")) {
if (crawlDataFlag.contains("url:")) {
String url = crawlDataFlag.split("url:")[1];
System.out.println("[buildCrawlDataFlagBuilder] url --- " + url); // url 的话直接匹配 url 字段
TermQueryBuilder queryUrlBuilders = QueryBuilders.termQuery(ESConstants.URL,url);
// QueryBuilder queryBuilder1 = QueryBuilders.boolQuery().must(queryUrlBuilders);
queryBuilder = QueryBuilders.boolQuery().should(queryCrawlDataFlagBuilder).should(queryUrlBuilders);
}
if (crawlDataFlag.contains("account")) {
if (crawlDataFlag.contains("account:")) {
String account = crawlDataFlag.split("account:")[1];
System.out.println("[buildCrawlDataFlagBuilder] account --- " + account);
TermQueryBuilder queryAccountBuilders = QueryBuilders.termQuery(ESConstants.USER_URL,account);

2
cl_query_data_job/src/main/java/com/bfd/mf/job/service/statistics/StatisticsService.java

@ -242,6 +242,7 @@ public class StatisticsService {
String indexNamePre = config.getIndexNamePre();
Map<String, Long> countMap = new HashMap<>();
if(null != task.getCid() && !task.getCid().equals("test")) {
// 获取任务数量
countMap = esQueryMiniService.getTaskCount(miniName, taskId, task, crawlDataFlag, indexNamePre);
// 直接更新 cl_task 表中的 data_total today_data_total
long totalCount = 0L;
@ -253,6 +254,7 @@ public class StatisticsService {
if(countMap.containsKey(ESConstants.TOTALCOUNT) && countMap.containsKey(ESConstants.TODAYCOUNT)) {
totalCount = countMap.get(ESConstants.TOTALCOUNT);
todayCount = countMap.get(ESConstants.TODAYCOUNT);
System.out.println("******* " + totalCount);
// imageCount = countMap.get(ESConstants.IMAGECOUNT);
// videoCount = countMap.get(ESConstants.VIDEOCOUNT);
// fileCount = countMap.get(ESConstants.FILECOUNT);

10
cl_query_data_job/src/main/java/com/bfd/mf/job/worker/QueryProducer.java

@ -34,10 +34,10 @@ public class QueryProducer extends AbstractWorker {
// LOGGER.info("[QueryProducer] work start ... ");
queryBacktraceService.tryAcquire();
queryBacktraceService.produce();
// try {
// Thread.sleep(config.getIntervalTime());
// } catch (InterruptedException e) {
// e.printStackTrace();
// }
try {
Thread.sleep(6000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}

3
cl_search_api/src/main/java/com/bfd/mf/controller/SearchDataController.java

@ -47,6 +47,7 @@ public class SearchDataController {
logger.info("[queryDataList] partial / Params: {}", JSONObject.toJSONString(queryRequest));
JSONObject result = new JSONObject();
try {
long start = System.currentTimeMillis();
String scorllId = queryRequest.getScrollId();
String subjectId = queryRequest.getSubjectId();
if(null != scorllId ){// 导出数据
@ -85,6 +86,8 @@ public class SearchDataController {
return ResponseWrapper.buildResponse(RTCodeEnum.C_SUBJECT_GRAMMAR_ERROR, "总数和分页不匹配");
}
}
long end = System.currentTimeMillis();
logger.info("接口查询时长:statr:"+ start +" ; end:"+end + " ; time = " + (end - start) + " ; count = "+result.get(ESConstant.ALLDOCNUMBER));
} catch (Exception e) {
logger.error("[queryData] Failed,The error message is :{}", e);
return ResponseWrapper.buildResponse(RTCodeEnum.C_SERVICE_NOT_AVAILABLE, "Query failed");

39
cl_search_api/src/main/java/com/bfd/mf/service/SearchDataService.java

@ -433,6 +433,7 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
esMonitorEntity.setQuoteCount(quoteCount);
esMonitorEntity.setAttitudesCount(attitudeCount);
esMonitorEntity.setCollectCount(collentCount);
// 视频音频
esMonitorEntity.setOcrText(ocrText);
esMonitorEntity.setAsrText(asrText);
// 用户字段
@ -599,6 +600,11 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
return jsonObject;
}
/**
* 根据docId 查询一条数据的详情及评论列表
* @param queryRequest
* @return
*/
public JSONObject queryOneDataByDocId(QueryRequest queryRequest) {
logger.info("[SearchDataService] queryOneDataByDocId start ... ");
// 先确认一下 要查的主贴是属于 专题还是 全部数据因此需要查 subjectId,如果没有 subjectId 这个字段说明要查的是 日期索引的ES
@ -616,14 +622,7 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
String docId = queryRequest.getDocId();
// 根据ID 获取一条详情数据
JSONObject jsonObject = getOneDataByDocId(docId, cluster, currentIndexList);
if(!jsonObject.containsKey(ESConstant.VALUELABEL) || null == jsonObject.get(ESConstant.VALUELABEL)){
jsonObject.put(ESConstant.VALUELABEL,"");
}else{
jsonObject.put(ESConstant.VALUELABEL,jsonObject.get(ESConstant.VALUELABEL).toString());
}
if(!jsonObject.containsKey(ESConstant.CATEGORYLABEL)){
jsonObject.put(ESConstant.CATEGORYLABEL,"");
}
jsonObject = setLable(jsonObject);
// 替换几个 pathSize 中的链接的前缀
if(jsonObject.containsKey(ESConstant.IMAGEPATHSIZE)){ //http://172.18.1.113:8080
List<Map<String,String>>imagePathSize = (List<Map<String, String>>) jsonObject.get(ESConstant.IMAGEPATHSIZE);
@ -663,6 +662,20 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
return jsonObject;
}
/**
 * Ensures the value-label and category-label entries exist on a document.
 *
 * <p>Per the original note, the version built for "Teacher Cui" requires both
 * the category label and the value label to be added to the result.
 *
 * <ul>
 *   <li>Value label: a missing or {@code null} entry becomes an empty string;
 *       any other value is replaced by its {@code toString()} form.</li>
 *   <li>Category label: an empty string is stored only when the key is absent;
 *       an existing entry is left untouched.</li>
 * </ul>
 *
 * @param jsonObject the document to normalize; it is modified in place
 * @return the same {@code jsonObject} instance that was passed in
 */
private JSONObject setLable(JSONObject jsonObject) {
    // A missing key and an explicit null both come back as null from get().
    Object rawValueLabel = jsonObject.get(ESConstant.VALUELABEL);
    String valueLabel = (rawValueLabel == null) ? "" : rawValueLabel.toString();
    jsonObject.put(ESConstant.VALUELABEL, valueLabel);

    boolean hasCategoryLabel = jsonObject.containsKey(ESConstant.CATEGORYLABEL);
    if (!hasCategoryLabel) {
        jsonObject.put(ESConstant.CATEGORYLABEL, "");
    }
    return jsonObject;
}
private JSONObject getSite(JSONObject jsonObject, String enSource) {
List<Map<String, Object>> site = siteRepository.findSiteByEnSource(enSource);
Map<String,Map<String,Object>> siteMap = new HashMap<>();
@ -741,11 +754,11 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
* 社交媒体类数据将作者替换到 标题中做显示
*/
private JSONObject socialDataChangeAuthorAndTitle(JSONObject jsonObject) {
if(jsonObject.containsKey(ESConstant.EN_SOURCE)){
if(jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.TWITTER)
|| jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.FACEBOOK)
|| jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.SINA))
{
if(jsonObject.containsKey(ESConstant.DOC_TYPE)){
// if(jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.TWITTER)
// || jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.FACEBOOK)
// || jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.SINA)) {
if(jsonObject.get(ESConstant.DOC_TYPE).equals(ESConstant.SOCIAL)){
String author = jsonObject.getString(ESConstant.AUTHOR);
jsonObject.put(ESConstant.TITLE,author);
if(jsonObject.get(ESConstant.EN_SOURCE).equals(ESConstant.SINA)){

Loading…
Cancel
Save