/*
 * Decompiled with CFR 0.152.
 */
package com.bfd.crawler.tmptask.newscrawl;

import com.bfd.crawler.download.httclient43.crawler.httpclient.Crawl4HttpClient;
import com.bfd.crawler.kafka7.KfkProducer;
import com.bfd.crawler.tmptask.Task;
import com.bfd.crawler.tmptask.newscrawl.NewsEntity;
import com.bfd.crawler.tmptask.newscrawl.formatTime.FormatTimeCommon;
import com.bfd.crawler.tmptask.newscrawl.formatTime.TimeRegexRule;
import com.bfd.crawler.tmptask.newscrawl.listurlFilter.ListUrlFilter;
import com.bfd.crawler.tmptask.workqueue.IWorkQueue;
import com.bfd.crawler.tmptask.workqueue.WorkQueueRedis;
import com.bfd.crawler.utils.JsonUtils;
import com.bfd.crawler.utils.MyStringUtil;
import com.bfd.crawler.utils.ParserException;
import com.bfd.crawler.utils.ThreadUtils;
import com.bfd.crawler.utils.autoparse.AutoParse;
import com.bfd.crawler.utils.htmlcleaner.HtmlCleanerUtil;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

public abstract class BaseNewsThread
implements Runnable {
    public static String topic = "poc_allnews2";
    private static Logger log = Logger.getLogger(BaseNewsThread.class);
    public static IWorkQueue queue = new WorkQueueRedis();
    private static AtomicInteger urlNum = new AtomicInteger(0);

    public abstract boolean filter(String var1, String var2, Task var3);

    @Override
    public void run() {
        String html = "";
        String url = "";
        long interval = 1000L;
        log.info((Object)(ThreadUtils.getCurrentThreadName() + " interval is " + interval));
        while (true) {
            try {
                while (true) {
                    log.info((Object)(ThreadUtils.getCurrentThreadName() + " workqueue size is " + queue.getQueueSize()));
                    Task task = queue.getTask();
                    url = task.getUrl();
                    log.info((Object)(ThreadUtils.getCurrentThreadName() + " start crawl url\uff1a" + url));
                    if (MyStringUtil.pre_filtered(url)) {
                        log.info((Object)(ThreadUtils.getCurrentThreadName() + " url:" + url + " has been filtered!"));
                        continue;
                    }
                    html = Crawl4HttpClient.getHtmlByDefaultHeader((String)url);
                    if (html == null || html.trim().length() == 0) continue;
                    NewsEntity entity = BaseNewsThread.parseHtml(html, url, task.getCid());
                    if (task.getTimeRule() != null) {
                        String time = BaseNewsThread.getTimeByPath(html, task.getTimeRule());
                        if (time.length() != 0) {
                            entity.setPostTime(time);
                        } else {
                            log.info((Object)(ThreadUtils.getCurrentThreadName() + " url:" + task.getUrl() + " no get time"));
                        }
                    }
                    BaseNewsThread.writeTokfk(entity);
                    List<String> urls = HtmlCleanerUtil.getAllHref(task.getUrl(), html);
                    log.info((Object)("sourceUrl:" + task.getUrl() + ",get subUrl is " + JsonUtils.toJSONString(urls)));
                    BaseNewsThread.addTask(urls, task);
                    log.info((Object)(ThreadUtils.getCurrentThreadName() + " " + urlNum.get() + " task over"));
                    Thread.currentThread();
                    Thread.sleep(interval);
                }
            }
            catch (Exception e) {
                e.printStackTrace();
                log.error((Object)(ThreadUtils.getCurrentThreadName() + " err url is " + url), (Throwable)e);
                continue;
            }
            break;
        }
    }

    public static String formatTime(String time) {
        time = time.replace("\u53d1\u5e03\u4e8e ", "");
        return time;
    }

    public static String getTimeByPath(String html, String path) {
        System.out.println("timepaht:" + path);
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode root = cleaner.clean(html);
        String time = HtmlCleanerUtil.getContentByXpath(root, path, "", "text");
        return time;
    }

    public static void writeTokfk(NewsEntity entity) {
        if (entity.getPostTime() == null) {
            log.info((Object)(ThreadUtils.getCurrentThreadName() + " url:" + entity.getUrl() + " posttime is null! filtered"));
            return;
        }
        entity.setPostTime(FormatTimeCommon.formatTimeFacade(entity.getPostTime().trim()));
        try {
            if (!ListUrlFilter.isFilter(entity.getCid(), entity.getUrl()) && entity.getTitle().indexOf("404") < 0 && entity.getTitle().trim().length() > 0 && entity.getPostTime() != null) {
                log.info((Object)(ThreadUtils.getCurrentThreadName() + " url:" + entity.getUrl() + " writetokfk:" + JsonUtils.toJSONString(entity) + ",topic:" + topic));
                KfkProducer.getInstance().send(topic, JsonUtils.toJSONString(entity));
                urlNum.getAndIncrement();
            } else {
                log.info((Object)(ThreadUtils.getCurrentThreadName() + " url:" + entity.getUrl() + " formater error filtered"));
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static NewsEntity parseHtml(String html, String url, String cid) {
        Map<String, String> rs = AutoParse.getTitleAndContent(html);
        NewsEntity entity = new NewsEntity();
        entity.setContent(rs.get("content"));
        entity.setTitle(rs.get("title"));
        entity.setKeywords(MyStringUtil.getKeywordFromHtml(html, url));
        entity.setUrl(url);
        entity.setCid(cid);
        entity.setCreation_time(new Date().getTime());
        return entity;
    }

    public static void addTask(List<String> urls, Task parentTask) {
        int insertNum = 0;
        for (String url : urls) {
            url = BaseNewsThread.formatUrl(url, parentTask);
            boolean flag = MyStringUtil.isRegexMatched(parentTask.getUrlRule(), url);
            if (!flag) {
                log.info((Object)("url:" + url + " not match " + parentTask.getUrlRule()));
                continue;
            }
            if (!url.startsWith(parentTask.getHost())) {
                log.info((Object)("url:" + url + ",not start host:" + parentTask.getHost()));
                continue;
            }
            log.info((Object)("url:" + url + " match " + parentTask.getUrlRule()));
            Task task = new Task();
            task.setUrl(url);
            task.setCid(parentTask.getCid());
            task.setHost(parentTask.getHost());
            task.setUrlRule(parentTask.getUrlRule());
            if (parentTask.getTimeRule() != null) {
                task.setTimeRule(parentTask.getTimeRule());
            }
            queue.putTask(task);
            ++insertNum;
        }
        log.info((Object)(ThreadUtils.getCurrentThreadName() + " sourceUrl:" + parentTask.getUrl() + " add new task num is " + insertNum));
    }

    public static String formatUrl(String url, Task parentTask) {
        if (!url.startsWith("http")) {
            url = url.startsWith("/") ? parentTask.getHost() + url : parentTask.getHost() + "/" + url;
        }
        return url;
    }

    public static void main(String[] args) {
        String url = "https://opensource.com/business/15/3/using-spark-dataframes-large-scale-data-science";
        String html = Crawl4HttpClient.getHtmlByDefaultHeader((String)url);
        String path = "//span[@class='created-date']";
        String time = BaseNewsThread.getTimeByPath(html, path);
        System.out.println(time);
        try {
            System.out.println(MyStringUtil.getRegexGroup(TimeRegexRule.getTimeregex, "dfd2015-09-09", 1).length());
        }
        catch (ParserException e) {
            e.printStackTrace();
        }
    }
}

