
【学习助手】 Sync the gaw shiju homepage crawler

lzu918 5 months ago
parent
commit
ed39386ac6

+ 23 - 0
ruoyi-zzb/src/main/java/com/ruoyi/zzb/study/domain/StudyDocDatasourceEnum.java

@@ -0,0 +1,23 @@
+package com.ruoyi.zzb.study.domain;
+
+public enum StudyDocDatasourceEnum {
+
+    SHIJU_LLQY("1", "市局首页--理论前沿"),
+    SHIJU_XXYD("2", "市局首页--学习园地");
+
+    private  String code;
+    private String name;
+
+    StudyDocDatasourceEnum(String code, String name){
+        this.code = code;
+        this.name = name;
+    }
+
+    public String getCode() {
+        return code;
+    }
+
+    public String getName() {
+        return name;
+    }
+}
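
Note: the new enum centralizes the data-source code/name pairs ("1"/"2" and the 市局首页 display names) that CrondCrawlerService previously hard-coded as string literals. A minimal, self-contained usage sketch follows; the demo class and its main method are illustrative only and are not part of this commit.

import com.ruoyi.zzb.study.domain.StudyDocDatasourceEnum;

public class StudyDocDatasourceEnumDemo {
    public static void main(String[] args) {
        // Prints "1 -> 市局首页--理论前沿" and "2 -> 市局首页--学习园地",
        // the same values the scheduler methods now read via getCode()/getName().
        for (StudyDocDatasourceEnum ds : StudyDocDatasourceEnum.values()) {
            System.out.println(ds.getCode() + " -> " + ds.getName());
        }
    }
}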

+ 13 - 15
ruoyi-zzb/src/main/java/com/ruoyi/zzb/study/service/CrondCrawlerService.java

@@ -8,6 +8,7 @@ import com.ruoyi.common.utils.DateUtils;
 import com.ruoyi.common.utils.StringUtils;
 import com.ruoyi.zzb.study.common.UserAgentUtil;
 import com.ruoyi.zzb.study.domain.StudyCrawlerStatus;
+import com.ruoyi.zzb.study.domain.StudyDocDatasourceEnum;
 import com.ruoyi.zzb.study.domain.StudyDocInfo;
 import com.ruoyi.zzb.study.mapper.StudyCrawlerStatusMapper;
 import com.ruoyi.zzb.study.mapper.StudyDocInfoMapper;
@@ -48,15 +49,15 @@ public class CrondCrawlerService {
     @Scheduled(cron = "${crawler.xxyd.crondTime}")
     public int crawlerDocGawXXYD() throws IOException, ParseException, InterruptedException {
 
-        log.info("{} : {} ms",  "GAW学习园地调度任务开始!");
+        log.info( "============【学习助手】GAW学习园地调度任务开始!");
         Connection conn = Jsoup.connect(xxydUrl);
         //获取页码,遍历分页请求
         //conn.data("page","2");     //拼接get请求参数 https://www.163.com/?page=2
 
-        String datasourceCode ="2";
-        String  datasourceName = "市局首页--学习园地";
+        String datasourceCode = StudyDocDatasourceEnum.SHIJU_XXYD.getCode();
+        String  datasourceName = StudyDocDatasourceEnum.SHIJU_XXYD.getName();
         int flagNums = processShiJuDocument(conn,datasourceCode,datasourceName);
-        log.info("{} : {} 条",  "GAW学习园地调度任务结束!",flagNums);
+        log.info("{} : {} 条",  "============【学习助手】 GAW学习园地调度任务结束!",flagNums);
         return flagNums;
     }
 
@@ -67,19 +68,19 @@ public class CrondCrawlerService {
     @Scheduled(cron = "${crawler.llqy.crondTime}")
     public int crawlerDocGawLLQY() throws IOException, ParseException, InterruptedException {
 
-        log.info("{} : {} ms",  "GAW理论前沿调度任务开始!");
+        log.info("============【学习助手】 GAW理论前沿调度任务开始!");
         Connection conn = Jsoup.connect(llqyUrl);
 
-        String  datasourceCode ="1";
-        String  datasourceName = "市局首页--理论前沿";
+        String  datasourceCode =StudyDocDatasourceEnum.SHIJU_LLQY.getCode();
+        String  datasourceName = StudyDocDatasourceEnum.SHIJU_LLQY.getName();
         int flagNums = processShiJuDocument(conn,datasourceCode,datasourceName);
-        log.info("{} : {} 条",  "GAW理论前沿调度任务结束!",flagNums);
+        log.info("{} : {} 条",  "============【学习助手】 GAW理论前沿调度任务结束!",flagNums);
         return flagNums;
     }
 
 
     /**
-     * 处理市局首页 Dom
+     * 处理市局首页 Dom 结构获取文章详情
      * @param conn
      * @param datasourceCode
      * @param datasourceName
@@ -98,7 +99,7 @@ public class CrondCrawlerService {
             Elements tags = e.getElementsByTag("li");
             for (Element tag : tags) {      //遍历文章列表
                 String href = tag.getElementsByTag("a").attr("href");
-                Thread.sleep(5000);    //避免频繁访问 放慢请求速度
+                Thread.sleep(7000);    //避免频繁访问 放慢请求速度
                 //开始爬文章详情页
                 if(StringUtils.isNotEmpty(href) && !href.contains("http://www.tj/Page/") ){
                     href = "http://www.tj/Page/"+href;
@@ -121,8 +122,6 @@ public class CrondCrawlerService {
 
                 String title = tmpDocument.getElementsByClass("article_title").text();
                 String srcTimeStr = tmpDocument.getElementById("article-date").text().replace("发布时间:","");
-//                    System.err.println(tmpDocument.getElementById("article-source").text());
-//                    System.err.println( tmpDocument.getElementById("article-cypl-num").text().replace("参与评论",""));
                 String content = tmpDocument.getElementsByClass("article-p").toString();
 
                 //写入数据库
@@ -132,9 +131,8 @@ public class CrondCrawlerService {
                 studyDocInfo.setTitle(title);
                 studyDocInfo.setContent(content);
                 studyDocInfo.setUrl(href);
-                studyDocInfo.setUrlType("0");
-                //公安网	0
-                //互联网	1
+                studyDocInfo.setUrlType("0"); //公安网	0   //互联网	1
+
                 studyDocInfo.setDocSrcTime(DateUtils.parseDate(srcTimeStr,"yyyy-MM-dd HH:mm:ss"));
                 studyDocInfo.setPageViewNum(0);
                 studyDocInfo.setCreateUserId("-1");
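
For context, the crawl flow touched by this commit follows a simple Jsoup pattern: fetch the list page, walk its li/a links, throttle between requests, then fetch and parse each detail page. Below is a stripped-down, self-contained sketch of that pattern; the URL prefix, the selectors (article_title, article-date, article-p) and the 7-second delay are taken from the diff, while the class and method names here are illustrative and the database write (StudyDocInfo + mapper) is intentionally omitted. Assumes jsoup is on the classpath.

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Minimal sketch of the crawl pattern used by CrondCrawlerService:
 * list page -> li/a links -> throttled detail-page requests -> field extraction.
 */
public class ShiJuCrawlSketch {

    // Same intranet page prefix as in the diff.
    private static final String PAGE_PREFIX = "http://www.tj/Page/";

    public static void crawl(String listUrl) throws IOException, InterruptedException {
        Document listPage = Jsoup.connect(listUrl).get();

        // Walk every article entry on the list page.
        for (Element li : listPage.getElementsByTag("li")) {
            String href = li.getElementsByTag("a").attr("href");
            if (href.isEmpty()) {
                continue;
            }
            // Relative links are rooted at the intranet page prefix, as in the service.
            if (!href.contains(PAGE_PREFIX)) {
                href = PAGE_PREFIX + href;
            }

            // Throttle to avoid hammering the source site (7s, matching the new value in this commit).
            Thread.sleep(7000);

            Document detail = Jsoup.connect(href).get();
            String title = detail.getElementsByClass("article_title").text();
            Element dateEl = detail.getElementById("article-date");
            String srcTime = dateEl == null ? "" : dateEl.text().replace("发布时间:", "");
            String contentHtml = detail.getElementsByClass("article-p").toString();

            // In the real service these fields populate a StudyDocInfo row; here we just log them.
            System.out.printf("%s | %s | %d chars%n", title, srcTime, contentHtml.length());
        }
    }
}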