Browse Source

release-3.1.7(2021-11-12,新增了B站和youtube 的用户数据的采集,页面类型不同于其他的用户,调整了这两个数据的查询和统计。)

release-1.0
杜静 4 years ago
parent
commit
cff5b2f2d7
  1. 35
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java
  2. 41
      cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/SaveService.java
  3. 21
      cl_query_data_job/src/main/java/com/bfd/mf/job/util/WriteMethod.java
  4. 2
      cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java
  5. 2
      cl_search_api/src/main/java/com/bfd/mf/common/service/es/ParseSearchScopeService.java
  6. 10
      cl_search_api/src/main/java/com/bfd/mf/common/util/constants/ESConstant.java
  7. 38
      cl_search_api/src/main/java/com/bfd/mf/common/web/vo/view/monitor/ESMonitorBaseEntity.java
  8. 59
      cl_search_api/src/main/java/com/bfd/mf/service/SearchDataService.java

35
cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java

@ -12,10 +12,7 @@ import com.bfd.mf.job.domain.entity.Task;
import com.bfd.mf.job.domain.repository.SubjectRepository;
import com.bfd.mf.job.domain.repository.TaskRepository;
import com.bfd.mf.job.download.DownLoadFile;
import com.bfd.mf.job.util.DataCheckUtil;
import com.bfd.mf.job.util.EsUtils;
import com.bfd.mf.job.util.Kafka010Utils;
import com.bfd.mf.job.util.ReadLine;
import com.bfd.mf.job.util.*;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.RateLimiter;
import kafka.utils.Json;
@ -106,8 +103,14 @@ public class QueryService {
long taskId = task.getId().longValue();
String appId = task.getAppId();
int cache_num = 1;
Integer siteType = task.getSiteType();
if(siteType == 5){
cache_num = 2;
taskRepository.updateStatus(cache_num, task.getId().longValue());
cache.put(taskId+"#@#"+appId, Lists.newArrayList(0L, 0L, progressFactor, totalSegment, segment));
} else {
taskRepository.updateStatus(cache_num, task.getId().longValue());
cache.put(taskId + "#@#" + appId, Lists.newArrayList(0L, 0L, progressFactor, totalSegment, segment));
}
try {
P_TASK_CACHE_RANGE.put(cache);
} catch (InterruptedException e) {
@ -125,8 +128,8 @@ public class QueryService {
return;
}
String taskIdAppId = "";
long fromMills =0L;
long toMills = 0L;
long fromMills = 0L; //1604419200000
long toMills = 0L; // 1604505600000
for (Map.Entry<String, List<? extends Number>> entry : range.entrySet()) {
entry.getValue();
taskIdAppId = entry.getKey();
@ -182,13 +185,13 @@ public class QueryService {
fromMills = task.getCrawlStartTime().longValue();
queryBuilder = getQueryBuilder(fromMills, toMills, cid, crawlDataFlag, cacheNum, siteType);
}
// LOGGER.info("Query primary, task:{}, index:{}, from:{}, to:{}, indices:{}, dsl:{}.",
// taskId,
// indexName,
// new LocalDateTime(fromMills).toString(AppConfig.DATE_TIME_FORMAT),
// new LocalDateTime(toMills).toString(AppConfig.DATE_TIME_FORMAT),
// JSONObject.toJSONString(sourceIndices),
// queryBuilder.toString());
LOGGER.info("Query primary, task:{}, index:{}, from:{}, to:{}, indices:{}, dsl:{}.",
taskId,
indexName,
new LocalDateTime(fromMills).toString(AppConfig.DATE_TIME_FORMAT),
new LocalDateTime(toMills).toString(AppConfig.DATE_TIME_FORMAT),
JSONObject.toJSONString(sourceIndices),
queryBuilder.toString());
// 传入的参数 集群名称索引名称索引类型type, 查询Builder,scroll查询页面大小,scroll查询scrollId有效时间
String finalTaskId = taskId + "";
long pubTime = fromMills;
@ -223,6 +226,10 @@ public class QueryService {
if (!data.get("_id_").equals("")) {
saveService.saveToEsWithFilter(config.esMiniClusterName(), finalIndexName1, data);
kafkaProducer.send(config.getSendTopic(),JSONObject.toJSONString(data));
// long crawlTime = data.getLong("crawlTime");
// if(crawlTime < 1633795200000L){
// WriteMethod.writeMethod("../../../error.txt",JSONObject.toJSONString(data));
// }
LOGGER.debug("Send message, indexName :{} , taskId:{} , ID :{}.", finalIndexName, task.getId(), data.getString("_id_"));
// 将要拉评论的ID 添加到list ,电商的数据不用拉评论哦
if (!siteType.equals(ESConstants.DOCTYPEITEM)) {

41
cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/SaveService.java

@ -17,22 +17,35 @@ public class SaveService {
// 初始化自定义字段
data.put(ESConstants.TASKID, taskId);
data.put("where","backtrace");
// data.put("tag","");
// data.put("mentionAccountUrl",new ArrayList<>());
// data.put("mentionAccount",new ArrayList<>());
// data.put("dns","");
if(!data.containsKey("tag")){
data.put("tag","");
}
if(!data.containsKey("dns")){
data.put("dns","");
}
if(!data.containsKey("hasOCR")){
// data.put("asrText","");
// data.put("ocrText",new ArrayList<>());
// data.put("hasOCR",0);
// data.put("hasASR",0);
// data.put("asrLength",0);
// data.put("ocrLength",0);
// data.put("hasTrans",0);
// data.put("translateTitleLength","");
// data.put("translateContentLength","");
// data.put("goodrate",0);
// data.put("generalrate",0);
// data.put("poorrate",0);
data.put("hasOCR",0);
data.put("hasASR",0);
data.put("asrLength",0);
data.put("ocrLength",0);
}
if(!data.containsKey("hasTrans")){
data.put("hasTrans",0);
data.put("translateTitleLength","");
data.put("translateContentLength","");
}
if(!data.containsKey("mentionAccount")){
data.put("mentionAccountUrl",new ArrayList<>());
data.put("mentionAccount",new ArrayList<>());
}
if(!data.containsKey("generalrate")){
data.put("goodrate",0);
data.put("generalrate",0);
data.put("poorrate",0);
}
}

21
cl_query_data_job/src/main/java/com/bfd/mf/job/util/WriteMethod.java

@ -0,0 +1,21 @@
package com.bfd.mf.job.util;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Small file helper that appends text records to a file, one record per line.
 *
 * Created by BFD-229 on 2017/7/6.
 */
public class WriteMethod {

    /**
     * Appends {@code json} followed by a single {@code '\n'} to the file named
     * {@code fileName}. The file is opened in append mode, so existing content
     * is kept.
     *
     * <p>This is a best-effort write: an {@link IOException} is reported on
     * standard error via {@code printStackTrace()} and is not propagated to the
     * caller.
     *
     * <p>NOTE(review): {@link FileWriter} encodes with the platform default
     * charset here; confirm that is acceptable for non-ASCII payloads.
     *
     * @param fileName path of the file to append to
     * @param json     text to write as one line
     */
    public static void writeMethod(String fileName, String json) {
        // try-with-resources guarantees the writer is closed even when write()
        // throws, which the previous explicit close() call did not.
        try (FileWriter writer = new FileWriter(fileName, true)) {
            writer.write(json + "\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

2
cl_search_api/src/main/java/com/bfd/mf/common/service/cache/TopicQueryService.java

@ -190,8 +190,6 @@ public class TopicQueryService {
}
boolQuery.must(boolQueryBuilder);
}
// }
}catch (Exception e){
e.printStackTrace();
}

2
cl_search_api/src/main/java/com/bfd/mf/common/service/es/ParseSearchScopeService.java

@ -64,7 +64,7 @@ public class ParseSearchScopeService {
.must(QueryBuilders.termQuery(ESConstant.DOC_TYPE,ESConstant.ITEM)));
} else if(searchType == 2){
searchScopeQuery = QueryBuilders.boolQuery().must(QueryBuilders.termQuery(ESConstant.PRIMARY, 2))
.must(QueryBuilders.termsQuery(ESConstant.PAGETYPE,"userInfoPage"));
.must(QueryBuilders.termsQuery(ESConstant.PAGETYPE,"userInfoPage","newsuser"));
}
return searchScopeQuery;
}

10
cl_search_api/src/main/java/com/bfd/mf/common/util/constants/ESConstant.java

@ -396,7 +396,7 @@ public class ESConstant {
public static String CHANNEL = "channel";
public static final String CONTENT = "content";
public static final String SYS_SENTIMENT = "sysSentiment";
public static String POST_SOURCE = "postSource";
// public static String POST_SOURCE = "postSource";
public static String TRANSLATETITLE = "translateTitle";
public static String TRANSLATECONTENT = "translateContent";
@ -596,10 +596,10 @@ public class ESConstant {
public static String WEIBO_EXPRESSION_TEXT = "expressionText";// 表情文本
public static String WEIBO_FORWARD_URL = "forwardUrl";// 原文url
public static String FORWARD_USER_URL = "forwardUserUrl";// 原文url
public static String WEIBO_USER_TYPE = "userType";
public static String WEIBO_POST_SOURCE = "postSource";
public static String WEIBO_LEVEL = "level";
public static String WEIBO_REPLY_COMMENT = "replycomment";
public static String USER_TYPE = "userType";
public static String POST_SOURCE = "postSource";
public static String LEVEL = "level";
public static String REPLY_COMMENT = "replycomment";

38
cl_search_api/src/main/java/com/bfd/mf/common/web/vo/view/monitor/ESMonitorBaseEntity.java

@ -122,6 +122,44 @@ public class ESMonitorBaseEntity implements Comparable<ESMonitorBaseEntity>, Ser
private String promotionInfo;
private int readCount;

/** Returns the current value of the {@code readCount} field. */
public int getReadCount() {
    return this.readCount;
}

/** Stores the given value in the {@code readCount} field. */
public void setReadCount(int readCount) {
    this.readCount = readCount;
}

private int ugc;
private int egc;
private int pgc;

/** Returns the current value of the {@code ugc} field. */
public int getUgc() {
    return this.ugc;
}

/** Stores the given value in the {@code ugc} field. */
public void setUgc(int ugc) {
    this.ugc = ugc;
}

/** Returns the current value of the {@code egc} field. */
public int getEgc() {
    return this.egc;
}

/** Stores the given value in the {@code egc} field. */
public void setEgc(int egc) {
    this.egc = egc;
}

/** Returns the current value of the {@code pgc} field. */
public int getPgc() {
    return this.pgc;
}

/** Stores the given value in the {@code pgc} field. */
public void setPgc(int pgc) {
    this.pgc = pgc;
}

/** Returns the current value of the {@code promotionInfo} field. */
public String getPromotionInfo() {
    return this.promotionInfo;
}

59
cl_search_api/src/main/java/com/bfd/mf/service/SearchDataService.java

@ -95,7 +95,7 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
}
}
/**
* 遍历查询结果调用解析组装方法 2
* 遍历查询结果调用解析组装方法 2 用户信息的组装用的是这个方法
*/
private void parseQueryResult(List<JSONObject> dataList, List<ESMonitorEntity> esMonitorListEntity,Integer searchType) {
try {
@ -227,8 +227,8 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
String icon = "";
String siteType = "";
if(enSource.equals(ESConstant.SINA)){
siteId = "183";
}else {
siteId = "183"; // 微博的我忘了为啥这个需要特殊处理
}else if(siteMap.containsKey(enSource)){
Map<String, Object> siteOtherMap = siteMap.get(enSource);
if (siteOtherMap.containsKey("site_id")) {
siteId = siteMap.get(enSource).get("site_id").toString();
@ -261,6 +261,7 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
price = sourceAsMap.get(ESConstant.PRICE).toString();
attitudeCount = sourceAsMap.get(ESConstant.POSTCOUNT).toString();
} else if(searchType == 2){
author = sourceAsMap.get(ESConstant.AUTHOR).toString();
content = productParameter;
}else {
content = sourceAsMap.get(ESConstant.CONTENT).toString();
@ -356,12 +357,22 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
String translateContent = "";
if (sourceAsMap.containsKey(ESConstant.TRANSLATETITLE)) {
translateTitle = (String) sourceAsMap.get(ESConstant.TRANSLATETITLE);
if(translateTitle.equals("")){
translateTitle = "未获取到翻译标题";
}
}
if (sourceAsMap.containsKey(ESConstant.TRANSLATECONTENT)) {
translateContent = (String) sourceAsMap.get(ESConstant.TRANSLATECONTENT);
if(translateContent.equals("")){
translateContent = "未获取到翻译正文";
}
}
// 词云
List<String> hlKeywords = (List<String>) sourceAsMap.get(ESConstant.HL_KEYWORDS);
List<String> hlKeywords = new ArrayList<>();
if (sourceAsMap.get(ESConstant.HL_KEYWORDS) instanceof List){
hlKeywords = (List<String>) sourceAsMap.get(ESConstant.HL_KEYWORDS);
}
//List<String>
// 视频分析结果
String asrText = "";
List<String> ocrText = new ArrayList<>();
@ -403,18 +414,51 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
}
if(sourceAsMap.containsKey(ESConstant.TAG)){
tag = sourceAsMap.get(ESConstant.TAG).toString();
if(primary == 0 ){
String pageType = sourceAsMap.get(ESConstant.PAGETYPE).toString();
if(pageType.equals("socialFollow")){
String userType = sourceAsMap.get(ESConstant.USER_TYPE).toString();
if(userType.equals( "0")){
tag = "分享用户";
}else if (userType.equals("1")){
tag = "点赞用户";
}
}else if (pageType.equals("socialComment")){
tag = "评论用户";
}
}
}
String otherSourceJson = "";
if(sourceAsMap.containsKey(ESConstant.OTHERSOURCEJSON)){
otherSourceJson = sourceAsMap.get(ESConstant.OTHERSOURCEJSON).toString();
}
String promotionInfo = "";
String promotionInfo = ""; // brife 字段
if(sourceAsMap.containsKey(ESConstant.PROMOTIONINFO)){
promotionInfo = sourceAsMap.get(ESConstant.PROMOTIONINFO).toString();
}
int ugc = 0;
int egc = 0;
int pgc = 0;
if(sourceAsMap.containsKey(ESConstant.SEARCH_SCOPE_UGC)){
ugc = (int) sourceAsMap.get(ESConstant.SEARCH_SCOPE_UGC);
}
if(sourceAsMap.containsKey(ESConstant.SEARCH_SCOPE_EGC)){
egc = (int) sourceAsMap.get(ESConstant.SEARCH_SCOPE_EGC);
}
if(sourceAsMap.containsKey(ESConstant.SEARCH_SCOPE_PGC)){
pgc = (int) sourceAsMap.get(ESConstant.SEARCH_SCOPE_PGC);
}
int readCount = 0;
if(sourceAsMap.containsKey("readCount") && sourceAsMap.get("readCount") != ""){
readCount = (int) sourceAsMap.get("readCount");
}
try {
esMonitorEntity.setReadCount(readCount);
esMonitorEntity.setEgc(egc);
esMonitorEntity.setPgc(pgc);
esMonitorEntity.setUgc(ugc);
esMonitorEntity.setDataId(dataId);
esMonitorEntity.setDocId(docId);
esMonitorEntity.setChannel(channel);
@ -933,6 +977,11 @@ public class SearchDataService extends CrudService<SentimentModify, SentimentRep
return jsonObject;
}
/**
* 导出专题下数据调用的方法
* @param queryRequest
* @return
*/
public JSONObject exportDataInSubjectIndex(QueryRequest queryRequest) {
JSONObject jsonObject = new JSONObject();
try {

Loading…
Cancel
Save