diff --git a/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_2.xml b/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_7.xml
similarity index 58%
rename from .idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_2.xml
rename to .idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_7.xml
index a8b0d9c..cdd7959 100644
--- a/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_2.xml
+++ b/.idea/libraries/Maven__ch_qos_logback_logback_classic_1_1_7.xml
@@ -1,13 +1,13 @@
 <component name="libraryTable">
-  <library name="Maven: ch.qos.logback:logback-classic:1.1.2">
+  <library name="Maven: ch.qos.logback:logback-classic:1.1.7">
     <CLASSES>
-      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-classic/1.1.2/logback-classic-1.1.2.jar!/" />
+      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-classic/1.1.7/logback-classic-1.1.7.jar!/" />
     </CLASSES>
     <JAVADOC>
-      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-classic/1.1.2/logback-classic-1.1.2-javadoc.jar!/" />
+      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-classic/1.1.7/logback-classic-1.1.7-javadoc.jar!/" />
     </JAVADOC>
     <SOURCES>
-      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-classic/1.1.2/logback-classic-1.1.2-sources.jar!/" />
+      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-classic/1.1.7/logback-classic-1.1.7-sources.jar!/" />
     </SOURCES>
   </library>
 </component>
\ No newline at end of file
diff --git a/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_3.xml b/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_7.xml
similarity index 60%
rename from .idea/libraries/Maven__ch_qos_logback_logback_core_1_1_3.xml
rename to .idea/libraries/Maven__ch_qos_logback_logback_core_1_1_7.xml
index fd35ca4..6c2a760 100644
--- a/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_3.xml
+++ b/.idea/libraries/Maven__ch_qos_logback_logback_core_1_1_7.xml
@@ -1,13 +1,13 @@
 <component name="libraryTable">
-  <library name="Maven: ch.qos.logback:logback-core:1.1.3">
+  <library name="Maven: ch.qos.logback:logback-core:1.1.7">
     <CLASSES>
-      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-core/1.1.3/logback-core-1.1.3.jar!/" />
+      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-core/1.1.7/logback-core-1.1.7.jar!/" />
     </CLASSES>
     <JAVADOC>
-      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-core/1.1.3/logback-core-1.1.3-javadoc.jar!/" />
+      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-core/1.1.7/logback-core-1.1.7-javadoc.jar!/" />
     </JAVADOC>
     <SOURCES>
-      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-core/1.1.3/logback-core-1.1.3-sources.jar!/" />
+      <root url="jar://$MAVEN_REPOSITORY$/ch/qos/logback/logback-core/1.1.7/logback-core-1.1.7-sources.jar!/" />
     </SOURCES>
   </library>
 </component>
\ No newline at end of file
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/config/ESConstants.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/config/ESConstants.java
index d7bb47e..d2581fb 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/config/ESConstants.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/config/ESConstants.java
@@ -882,6 +882,7 @@ public class ESConstants {
public static String AREA = "area";
+
/**
* _all字段
*/
@@ -1082,6 +1083,8 @@ public class ESConstants {
public static final String GOFASTURL = "gofastUrl";
public static final String ORIGINALURL = "originalUrl";
public static final String PATHSIZELIST = "pathSizeList";
+ public static final String SRCLIST = "srcList";
+ public static final String SRCMAP = "srcMap";
public static final String PATH = "path";
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/entity/Task.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/entity/Task.java
index fd120ba..356f624 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/entity/Task.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/entity/Task.java
@@ -12,7 +12,7 @@ public class Task extends AbstractEntity {
// private long top;
private BigInteger subjectId;
- // private String appId;
+ private String appId;
private String externalId;
// private long crawlId;
private Integer siteType;
@@ -54,6 +54,14 @@ public class Task extends AbstractEntity {
this.subjectId = subjectId;
}
+ public String getAppId() {
+ return appId;
+ }
+
+ public void setAppId(String appId) {
+ this.appId = appId;
+ }
+
public String getExternalId() {
return externalId;
}
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/repository/TaskRepository.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/repository/TaskRepository.java
index 7515626..8bd8328 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/repository/TaskRepository.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/domain/repository/TaskRepository.java
@@ -12,8 +12,8 @@ import java.util.Map;
public interface TaskRepository extends CrudRepository {
- @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type,ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM cl_task ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.task_type <> 3 AND ct.crawl_status = 1 AND ct.cache_num = 3 AND app_id = '61qb' AND subject_id = 12094 AND ct.data_total = 0 AND ct.del = 0 AND ct.subject_id in (SELECT id from cl_subject csu WHERE csu.del =0) ORDER BY ct.id DESC;",nativeQuery = true)
- // @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type,ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM cl_task ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.subject_id = 12094 AND ct.task_type <> 3 AND ct.crawl_status = 1 AND ct.cache_num = 0 AND ct.data_total = 0 AND ct.del = 0 AND ct.subject_id in (SELECT id from cl_subject csu WHERE csu.del =0) ORDER BY ct.id DESC ;",nativeQuery = true)
+ // @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type,ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM all_task.cl_task ct JOIN cl_site cs ON ct.cid = cs.cid WHERE ct.task_type <> 3 AND ct.crawl_status = 1 AND ct.cache_num = 3 AND ct.data_total = 0 AND ct.del = 0 AND ct.subject_id in (SELECT id from cl_subject csu WHERE csu.del =0) ORDER BY ct.id DESC;",nativeQuery = true)
+ @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type,ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM all_task.cl_task ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.task_type <> 3 AND ct.crawl_status = 1 AND ct.cache_num = 0 AND ct.data_total = 0 AND ct.del = 0 AND ct.subject_id in (SELECT id from all_task.cl_subject csu WHERE csu.del =0) ORDER BY ct.id DESC ;",nativeQuery = true)
List findAllNewTask();
// 需要统计的任务的查询条件 1、 状态为 1 OR 0;2、状态为3,且任务完成时间再2天前的。
@@ -63,12 +63,12 @@ public interface TaskRepository extends CrudRepository {
*/
@Modifying
@Transactional(rollbackFor = Exception.class)
- @Query(value = "update cl_task set data_total =?2 , today_data_total =?3 where id =?1 ", nativeQuery = true)
+ @Query(value = "update cl_task set data_total =?2 , today_data_total =?3, update_time = now() where id =?1 ", nativeQuery = true)
void updateTaskCount(Long id, Long totalCount, Long todayCount);
@Modifying
@Transactional(rollbackFor = Exception.class)
- @Query(value = "update cl_task set data_total =?2 , today_data_total =?3 ,has_image_total = ?4,has_video_total = ?5, has_file_total = ?6,has_text_total = ?7 where id =?1 ", nativeQuery = true)
+ @Query(value = "update cl_task set data_total =?2 , today_data_total =?3 , has_image_total = ?4, has_video_total = ?5, has_file_total = ?6, has_text_total = ?7 , update_time = now() where id =?1 ", nativeQuery = true)
void updateTaskCountAll(Long id, Long totalCount, Long todayCount,Long imageCount,Long videoCount,Long fileCount,Long textCount);
@Modifying
@@ -90,11 +90,11 @@ public interface TaskRepository extends CrudRepository {
// @Query(value = "SELECT ct.id,ct.subject_id,ct.external_id,cs.site_type, ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.file_name,ct.file_remark,ct.crawl_content_key FROM `cl_task` ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.del = 0 AND ct.id = ?1",nativeQuery = true)
- @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type, ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM `cl_task` ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.del = 0 AND ct.id = ?1 ;",nativeQuery = true)
- List findOneTaskByIdAndAppId(long taskId);
-
-
+ @Query(value = "SELECT ct.id,ct.app_id,ct.subject_id,ct.external_id,cs.site_type, ct.task_type,ct.cid,ct.crawl_status,ct.crawl_start_time,ct.crawl_end_time,ct.crawl_data_flag,ct.data_total,ct.today_data_total,ct.cache_num,ct.update_time,ct.del,ct.crawl_content_key FROM `cl_task` ct JOIN intelligent_crawl.cl_site cs ON ct.cid = cs.cid WHERE ct.del = 0 AND ct.id = ?1 AND ct.app_id = ?2 ",nativeQuery = true)
List findOneTaskByIdAndAppId(long taskId, String appId);
+// @Query(value = "",nativeQuery = true)
+// List getByTaskId(long taskId);
// @Query(value = "SELECT id,subject_id,external_id,site_type,task_type,cid,crawl_data_flag,cache_num,crawl_start_time,crawl_end_time,data_total,today_data_total,update_time FROM cl_task WHERE NOW() > SUBDATE(update_time,interval -15 minute) AND del = 0 AND subject_id in (SELECT id from cl_subject WHERE `status` = 0 AND del =0)", nativeQuery = true)
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/download/DownLoadFile.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/download/DownLoadFile.java
index 510ebcc..29fce72 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/download/DownLoadFile.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/download/DownLoadFile.java
@@ -1,21 +1,24 @@
package com.bfd.mf.job.download;
import com.alibaba.fastjson.JSONObject;
+import com.bfd.mf.job.config.ESConstants;
import okhttp3.*;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
+import java.awt.image.DataBufferByte;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;
public class DownLoadFile {
public static Map downloadAndSaveFile(String getUrl,String putUrl){
- String realUrl = "";double size;
+ // String path = "";double size;
Map realresult= new HashMap<>();
try{
String files [] = getUrl.split("/");
@@ -23,23 +26,20 @@ public class DownLoadFile {
Map header = new HashMap<>();
header.put("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36");
header.put("Connection","keep-alive");
- try {
- Map downloadresult = OkHttpUtils.doGetBytes(getUrl,header);
- size= (double) downloadresult.get("size");
- if (downloadresult.containsKey("content")&&size>0){
- byte[] content = (byte[]) downloadresult.get("content");
- size= (double) downloadresult.get("size");
- size = Double.valueOf(String.format("%.3f", size));
- Thread.sleep(4000);
- String result = DownLoadFile.upload(putUrl,fileName,content);
- Thread.sleep(4000);
- realUrl = JSONObject.parseObject(result).getString("url");
- realresult.put("realUrl",realUrl);
- realresult.put("size",size+"");
- }
- } catch (IOException e) {
- e.printStackTrace();
+ Map downloadresult = OkHttpUtils.doGetBytes(getUrl,header);
+ double size = downloadresult.containsKey(ESConstants.SIZE) ? (double) downloadresult.get(ESConstants.SIZE) : 0; // "size" is optional since OkHttpUtils only puts it when Content-Length is present
+
+ if (downloadresult.containsKey(ESConstants.CONTENT) && size > 0){
+ byte[] content = (byte[]) downloadresult.get(ESConstants.CONTENT);
+ //size= (double) downloadresult.get("size");
+ size = Double.valueOf(String.format("%.2f", size));
+ Thread.sleep(3000);
+ String result = DownLoadFile.upload(putUrl,fileName,content);
+ Thread.sleep(3000);
+ String path = JSONObject.parseObject(result).getString(ESConstants.PATH);
+ realresult.put(ESConstants.URL , path);
+ realresult.put(ESConstants.SIZE , size+"KB");
}
}catch (Exception e){
e.printStackTrace();
@@ -47,6 +47,14 @@ public class DownLoadFile {
return realresult;
}
+
+// public static void main(String[] args) {
+// String getUrl = "https://wx4.sinaimg.cn/mw690/001NtKpRly1guw9jh90poj60u01hcaqj02.jpg";
+// String putUrl = "http://172.18.1.113:8080/upload";
+// Map realresult = downloadAndSaveFile(getUrl,putUrl);
+// System.out.println(JSONObject.toJSONString(realresult));
+// }
+
public static String upload(String uploadUrl,String fileName,byte[] content) {
String result = "";
try {
@@ -54,8 +62,7 @@ public class DownLoadFile {
MultipartBody multipartBody = new MultipartBody.Builder().
setType(MultipartBody.FORM)
.addFormDataPart("file", fileName,
- RequestBody.create(MediaType.parse("multipart/form-data;charset=utf-8"),
- content))
+ RequestBody.create(MediaType.parse("multipart/form-data;charset=utf-8"), content))
.addFormDataPart("output", "json")
.build();
Request request = new Request.Builder()
@@ -75,18 +82,36 @@ public class DownLoadFile {
return result;
}
- public static String imagesize(String getUrl) throws IOException{
- String realresult="";
+ public static String getImageResolution(String getUrl) throws IOException{
+ String resolution = "" ;
try{
InputStream murl = new URL(getUrl).openStream();
BufferedImage sourceImg = ImageIO.read(murl);
int srcWidth = sourceImg .getWidth(); // 源图宽度
int srcHeight = sourceImg .getHeight(); // 源图高度
- realresult=Integer.toString(srcWidth)+"×"+ Integer.toString(srcHeight);
+ resolution = Integer.toString(srcWidth)+"×"+ Integer.toString(srcHeight);
}catch (Exception e){
+ System.out.println("ERROR URL : " + getUrl);
+ // e.printStackTrace();
+ }
+ return resolution;
+ }
+
+ public static String getFileSize(String getUrl){
+ String realSize = "";
+ // 获取大小
+ try {
+ URL url = new URL(getUrl);
+ URLConnection conn = url.openConnection();
+ long size = conn.getContentLengthLong(); // returns -1 when the server sends no Content-Length
+ double newSize = Double.valueOf(String.format("%.2f", size / 1024.0));
+ conn.getInputStream().close();
+ realSize = size >= 0 ? newSize + "KB" : "";
+ } catch (Exception e) {
e.printStackTrace();
}
- return realresult;
+ return realSize;
+
}
public static Map upload(String uploadUrl,String fileName,File file) {
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/download/OkHttpUtils.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/download/OkHttpUtils.java
index 9b6215e..8dbca66 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/download/OkHttpUtils.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/download/OkHttpUtils.java
@@ -175,8 +175,10 @@ public class OkHttpUtils {
if (body != null) {
byte[] content=response.body().bytes();
result.put("content",content);
- double size=Double.valueOf(response.header("Content-Length"))/1024;
- result.put("size",size);
+ if(response.header("Content-Length") != null) {
+ double size = Double.valueOf(response.header("Content-Length")) / 1024;
+ result.put("size", size);
+ }
}
}
return result;
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
index c1ffe91..6846b7f 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/es/EsQueryMiniService.java
@@ -193,7 +193,7 @@ public class EsQueryMiniService {
private BoolQueryBuilder getQueryBuilder(String cid, String crawlDataFlag, Long crawlStartTime, Long crawlEndTime) {
BoolQueryBuilder qb = QueryBuilders.boolQuery();
// 任务ID 筛选
- TermQueryBuilder cidTermQueryBuilder = QueryBuilders.termQuery(ESConstants.EN_SOURCE, cid);
+ TermQueryBuilder cidTermQueryBuilder = QueryBuilders.termQuery(ESConstants.EN_SOURCE+".keyword", cid);
TermQueryBuilder taskIdTermQueryBuilder = QueryBuilders.termQuery(ESConstants.CRAWLDATAFLAG, crawlDataFlag);
qb.must(taskIdTermQueryBuilder).must(cidTermQueryBuilder);
// 时间范围筛选 只有主贴评论需要查时间,用户不需要设置时间范围
diff --git a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java
index e297816..9127a30 100644
--- a/cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java
+++ b/cl_query_data_job/src/main/java/com/bfd/mf/job/service/query/QueryService.java
@@ -1,6 +1,7 @@
package com.bfd.mf.job.service.query;
import com.alibaba.fastjson.JSON;
+import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.serializer.SerializerFeature;
import com.bfd.crawler.utils.JsonUtils;
@@ -17,6 +18,7 @@ import com.bfd.mf.job.util.Kafka010Utils;
import com.bfd.mf.job.util.ReadLine;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.RateLimiter;
+import kafka.utils.Json;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.assertj.core.util.Lists;
import org.elasticsearch.index.query.*;
@@ -27,8 +29,14 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import javax.annotation.PostConstruct;
import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.math.BigDecimal;
import java.math.BigInteger;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
import java.sql.Timestamp;
import java.util.*;
import java.util.concurrent.BlockingQueue;
@@ -40,12 +48,13 @@ import static org.elasticsearch.index.query.QueryBuilders.rangeQuery;
public class QueryService {
private static final Logger LOGGER = LoggerFactory.getLogger(QueryService.class);
private static final Long ONE_MINUTE = 1000L * 60;
- private static BlockingQueue