
[Study Assistant] Sync the gaw shiju homepage crawler

lzu918 5 months ago
parent
commit
16110afefc

+ 5 - 5
ruoyi-admin/src/main/resources/application.yml

@@ -176,13 +176,13 @@ module:
   hybz:
     templateFilePath: /Users/fangtasyj/Desktop/HHWY/智慧ZGAPP建设需求/hybz

-# Shiju crawler config
+# Shiju crawler config  (cron fields: second minute hour day month weekday)
 crawler:
   xxyd:
-    crondTime: 0 */2 0 * * ?
-    url: https://www.163.com/
+    crondTime: 0 */5 1 1 1 ?
+    url: http://www.tj/Page/InfoListPage.aspx?NCode=xxyd
   llqy:
-    crondTime: 0 */2 0 * * ?
-    url: https://www.163.com/
+    crondTime: 0 */5 1 1 1 ?
+    url: http://www.tj/Page/InfoListPage.aspx?NCode=llqy
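
Note that the new cron "0 */5 1 1 1 ?" fires every 5 minutes only between 01:00 and 01:55 on January 1st, so both jobs are effectively parked until triggered by hand. A minimal sketch for sanity-checking a Spring cron expression, assuming Spring Framework 5.3+ (which ships CronExpression):

    import org.springframework.scheduling.support.CronExpression;
    import java.time.LocalDateTime;

    public class CronSanityCheck {
        public static void main(String[] args) {
            // Spring cron fields: second  minute  hour  day-of-month  month  day-of-week
            CronExpression expr = CronExpression.parse("0 */5 1 1 1 ?");
            // next() returns the first trigger time strictly after the given instant
            LocalDateTime next = expr.next(LocalDateTime.now());
            System.out.println("next fire time: " + next);   // the upcoming January 1st, 01:00
        }
    }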
 
 
 
 

+ 188 - 0
ruoyi-zzb/src/main/java/com/ruoyi/zzb/study/common/TestJsoupShiJu.java

@@ -0,0 +1,188 @@
+package com.ruoyi.zzb.study.common;
+
+import com.ruoyi.common.utils.DateUtils;
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.List;
+
+public class TestJsoupShiJu {
+
+    private static final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+
+    public static void main(String[] args) throws IOException, ParseException, InterruptedException {
+
+
+//        String url = "
+//        String proxyHost = "127.0.0.1";
+//        int proxyPort = 8888;
+//        Document document = Jsoup.connect(url).proxy(proxyHost, proxyPort).get();
+
+//        String url = "https://news.sina.com.cn/gov/xlxw/2024-11-18/doc-incwmzsr3745291.shtml";
+//        // fetch the DOM tree directly
+//        Document document = Jsoup.connect(url).get();
+//        System.out.println(document.toString());
+
+        String url = "http://www.tj/Page/InfoListPage.aspx?NCode=xxyd";
+
+        Connection conn = Jsoup.connect(url);
+//        conn.data("page","2");     // append GET query params, e.g. https://www.163.com/?page=2
+        Document document = conn.timeout(10000)
+                .maxBodySize(0)
+                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
+                .get();
+
+
+//        System.err.println(document.toString());
+
+
+//     sample list markup from an earlier news.gmw.cn test target:
+//     <ul class="channel-newsGroup">
+//     <li><a href="https://news.gmw.cn/2024-11/23/content_37694371.htm" target="_blank">努力造就高技能专业化劳动大军——天津积极推动职业教育高质量发展</a><span class="channel-newsTime">2024-11-23</span></li>
+//     <li><a href="2024-11/21/content_37691304.htm" target="_blank">2024年世界职业技术教育发展大会在天津开幕</a><span class="channel-newsTime">2024-11-21</span></li>
+//     <li><a href="2024-11/19/content_37686796.htm" target="_blank">老街区里潮流新|“新”在哪?天津这样作答!</a><span class="channel-newsTime">2024-11-19</span></li>
+//     <li><a href="2024-11/19/content_37686559.htm" target="_blank">聚焦高质量发展|打造职教高地 绘就“出彩人生”——天津职教何以领跑?</a><span class="channel-newsTime">2024-11-19</span></li>
+//     <li><a href="2024-11/19/content_37686041.htm" target="_blank">老街区里潮流新|这条古文化街浓缩了津派文化的万千气象</a><span class="channel-newsTime">2024-11-19</span></li>
+//    </ul>
+//    <ul class="channel-newsGroup">
+//     <li><a href="https://news.gmw.cn/2024-11/15/content_37677590.htm" target="_blank">世界职业技术教育发展大会将举办</a><span class="channel-newsTime">2024-11-15</span></li>
+//     <li><a href="https://news.gmw.cn/2024-11/14/content_37675130.htm" target="_blank">国内首座冷热能互换站在天津投产</a><span class="channel-newsTime">2024-11-14</span></li>
+//     <li><a href="2024-11/13/content_37674847.htm" target="_blank">向“新”谋变 以“创”攀高——天津民营经济高质量发展一线观察</a><span class="channel-newsTime">2024-11-13</span></li>
+//     <li><a href="https://news.gmw.cn/2024-11/11/content_37667637.htm" target="_blank">努力当好津甘协作牵线人</a><span class="channel-newsTime">2024-11-11</span></li>
+//     <li><a href="2024-11/05/content_37658288.htm" target="_blank">天津生产消费一线走访见闻:强产业 稳投资 促消费</a><span class="channel-newsTime">2024-11-05</span></li>
+//    </ul>
+
+
+        Elements textsElement = document.getElementsByClass("new-ul-lj2 list-style");       // fetch the article list
+        Document tmpDocument = null;
+
+        long flagTime = DateUtils.parseDate("2024-11-25","yyyy-MM-dd").getTime();    // cutoff: only newer items count as new
+
+        for (Element e : textsElement) {
+            Elements tags = e.getElementsByTag("li");
+            for (Element tag : tags) {
+                String title = tag.getElementsByClass("gayw-title").text();
+                String href = tag.getElementsByTag("a").attr("href");
+
+                // not the official publish time; this is the date shown in the list
+                String newsTime = tag.getElementsByTag("span").text();
+
+//                String title = tag.getElementsByClass("gayw-title").text();
+//                String href = tag.getElementsByTag("a").attr("href");
+//                String newsTime = tag.getElementsByClass("channel-newsTime").text();
+
+                long docTime = DateUtils.parseDate(newsTime, "yyyy-MM-dd").getTime();
+
+                if (docTime > flagTime) {    // newer than the cutoff, treat as new
+                    System.err.println("new article");
+
+                    Thread.sleep(5000L);    // throttle requests to avoid hitting the site too often
+//                    System.err.println(tag);
+                    System.err.println(href);
+//                    System.err.println(title);
+//                    System.err.println(newsTime);
+                    // crawl the article detail page
+
+                    if(!href.contains("http://www.tj/Page/")){
+                        href = "http://www.tj/Page/"+href;
+                    }
+                    conn = Jsoup.connect(href);
+                    tmpDocument = conn.timeout(10000)
+                            .maxBodySize(0)
+                            .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
+                            .get();
+
+
+                   System.err.println(tmpDocument.getElementsByClass("article_title").text());
+//                   System.err.println(tmpDocument.getElementsByClass("article-info").text());
+//                   System.err.println(tmpDocument.getElementsByClass("article-info").text());
+                   System.err.println(tmpDocument.getElementById("article-date").text().replace("发布时间:",""));
+                   System.err.println(tmpDocument.getElementById("article-source").text());
+                   System.err.println( tmpDocument.getElementById("article-cypl-num").text().replace("参与评论",""));
+
+                   System.err.println(tmpDocument.getElementsByClass("article-p").toString());
+
+
+//                    tmpDocument.getElementById("articleBox");
+//                    System.err.println(tmpDocument.getElementsByClass("u-title").text());
+//                    System.err.println(tmpDocument.getElementsByClass("m-con-time").text());
+//                    System.err.println(tmpDocument.getElementsByClass("m-con-source").get(0).getElementsByTag("a").text());
+//                    System.err.println(tmpDocument.getElementById("articleBox"));
+
+                    // write to the database
+                }
+            }
+
+            // update the crawler status flag
+        }
+
+//        // text news
+//        Elements textsElement = document.getElementsByClass("cm_ul_round");
+//        for (Element e : textsElement) {
+//            Elements tags = e.getElementsByTag("a");
+//            for (Element tag : tags) {
+//                // title
+//                String title = tag.getElementsByAttribute("href").text();
+//                // link; can be parsed further to fetch the article details
+//                String href = tag.attributes().get("href");
+//                // category
+//                String classification = null;
+//                if (href.contains("?") && href.contains("clickfrom=w_")) {
+//                    classification = href.substring(href.lastIndexOf("?") + 1).replace("clickfrom=w_", "");
+//                }
+//                System.out.println(title);
+//                System.out.println(href);
+//                System.out.println(classification);
+//            }
+//        }
+//        // image news
+//        Elements imgs = document.getElementsByClass("cm_bigimg");
+//        for (Element img : imgs) {
+//            Elements photos = img.getElementsByClass("photo");
+//            for (Element photo : photos) {
+//                // title
+//                String title = photo.attributes().get("title");
+//                // link; can be parsed further to fetch the article details
+//                String href = photo.attributes().get("href");
+//                // cover image
+//                String imgSrc = null;
+//                List<Node> child = photo.childNodes();
+//                for (Node node : child) {
+//                    if (node.hasAttr("data-original")) {
+//                        imgSrc = node.attributes().get("data-original");
+//                        break;
+//                    }
+//                }
+//                // category
+//                String classification = null;
+//                if (href.contains("?") && href.contains("clickfrom=w_")) {
+//                    classification = href.substring(href.lastIndexOf("?") + 1).replace("clickfrom=w_", "");
+//                }
+//                System.out.println(title);
+//                System.out.println(href);
+//                System.out.println(imgSrc);
+//                System.out.println(classification);
+//            }
+//        }
+//
+    }
+
+
+}
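
For reference, the nested getElementsByClass/getElementsByTag walk above can be collapsed into a single CSS selector. A minimal sketch, assuming the markup matches the class names the test already relies on (new-ul-lj2 list-style, gayw-title) and using the same internal URL:

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    import java.io.IOException;

    public class ShiJuListSketch {
        public static void main(String[] args) throws IOException {
            Document doc = Jsoup.connect("http://www.tj/Page/InfoListPage.aspx?NCode=xxyd")
                    .timeout(10000)
                    .maxBodySize(0)
                    .userAgent("Mozilla/5.0")
                    .get();
            // one selector replaces the nested class/tag lookups
            for (Element li : doc.select("ul.new-ul-lj2.list-style li")) {
                String title = li.select(".gayw-title").text();
                String newsTime = li.select("span").text();        // list display time, not the official publish time
                String href = li.select("a").attr("abs:href");     // abs: resolves relative hrefs against the page URL
                System.out.printf("%s | %s | %s%n", newsTime, title, href);
            }
        }
    }

The abs:href form would also remove the need for the manual "http://www.tj/Page/" prefix check used above.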
+

+ 2 - 1
ruoyi-zzb/src/main/java/com/ruoyi/zzb/study/controller/CrondCrawlerController.java

@@ -9,6 +9,7 @@ import org.springframework.web.bind.annotation.RequestMethod;
 import org.springframework.web.bind.annotation.RestController;

 import java.io.IOException;
+import java.text.ParseException;

 @Slf4j
 @RestController
@@ -18,7 +19,7 @@ public class CrondCrawlerController {
     private CrondCrawlerService crondCrawlerService;

     @RequestMapping(value = "/crawlergaw", method = RequestMethod.GET)
-    public void crawlerGAWSchedule() throws IOException {
+    public void crawlerGAWSchedule() throws IOException, ParseException, InterruptedException {
         log.info("{}", "GAW llqy crawl job started!");
         int nums = crondCrawlerService.crawlerDocGawLLQY();
         log.info("{}: {} new articles", "GAW llqy crawl job finished!", nums);

+ 91 - 42
ruoyi-zzb/src/main/java/com/ruoyi/zzb/study/service/CrondCrawlerService.java

@@ -4,6 +4,8 @@ import com.baomidou.mybatisplus.core.conditions.Wrapper;
 import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
 import com.ruoyi.common.annotation.DataSource;
 import com.ruoyi.common.enums.DataSourceType;
+import com.ruoyi.common.utils.DateUtils;
+import com.ruoyi.common.utils.StringUtils;
 import com.ruoyi.zzb.study.common.UserAgentUtil;
 import com.ruoyi.zzb.study.domain.StudyCrawlerStatus;
 import com.ruoyi.zzb.study.domain.StudyDocInfo;
@@ -13,12 +15,16 @@ import lombok.extern.slf4j.Slf4j;
 import org.jsoup.Connection;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Service;

 import java.io.IOException;
+import java.text.ParseException;
+import java.util.List;

 @Slf4j
 @Service
@@ -40,62 +46,105 @@ public class CrondCrawlerService {
      * xxyd (Learning Corner) crawler
      */
     @Scheduled(cron = "${crawler.xxyd.crondTime}")
-    public int crawlerDocGawXXYD() throws IOException {
+    public int crawlerDocGawXXYD() throws IOException, ParseException, InterruptedException {
 
 
         log.info("{}", "GAW xxyd crawl job started!");
         Connection conn = Jsoup.connect(xxydUrl);
-        Document document = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
-
-        LambdaQueryWrapper<StudyCrawlerStatus> queryWrapper = new LambdaQueryWrapper();
-        queryWrapper.eq(StudyCrawlerStatus::getModuleCode,"study");
-        queryWrapper.eq(StudyCrawlerStatus::getServiceCode,"xxyd");
-        // fetch the last status flag (previous crawl time)
-        Long statusTime = studyCrawlerStatusMapper.selectOne(queryWrapper).getStatusTime();
-        System.err.println("xxyd--status"+statusTime);
-
-        // process the crawled articles
-
-        // filter out new items and write them
-
-//        StudyDocInfo studyDocInfo = new StudyDocInfo();
-//        studyDocInfoMapper.insert();
-
-        // afterwards, update the crawler status flag
-        log.info("{} : {} 条",  "GAW学习园地调度任务结束!");
-
-        return 1;
+        // TODO: read the page count and iterate over paginated requests
+        //conn.data("page","2");     // append GET query params, e.g. https://www.163.com/?page=2
+
+        String datasourceCode = "2";
+        String datasourceName = "市局首页--学习园地";
+        int flagNums = processShiJuDocument(conn, datasourceCode, datasourceName);
+        log.info("{}: {} new articles", "GAW xxyd crawl job finished!", flagNums);
+        return flagNums;
     }

+
     /**
      * llqy (Theory Frontier) crawler
      */
     @Scheduled(cron = "${crawler.llqy.crondTime}")
-    public int crawlerDocGawLLQY() throws IOException {
+    public int crawlerDocGawLLQY() throws IOException, ParseException, InterruptedException {
 
 
         log.info("{}", "GAW llqy crawl job started!");
         Connection conn = Jsoup.connect(llqyUrl);
-        Document document = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
-        // fetch the last status flag (previous crawl time)
-        LambdaQueryWrapper<StudyCrawlerStatus> queryWrapper = new LambdaQueryWrapper();
-        queryWrapper.eq(StudyCrawlerStatus::getModuleCode,"study");
-        queryWrapper.eq(StudyCrawlerStatus::getServiceCode,"llqy");
-        Long statusTime = studyCrawlerStatusMapper.selectOne(queryWrapper).getStatusTime();
-        System.err.println("llqy--status"+statusTime);
-
-        // process the crawled articles
-
-        // filter out new items and write them

-//        StudyDocInfo studyDocInfo = new StudyDocInfo();
-//        studyDocInfoMapper.insert();
+        String datasourceCode = "1";
+        String datasourceName = "市局首页--理论前沿";
+        int flagNums = processShiJuDocument(conn, datasourceCode, datasourceName);
+        log.info("{}: {} new articles", "GAW llqy crawl job finished!", flagNums);
+        return flagNums;
+    }

-        // afterwards, update the crawler status flag
-        log.info("{} : {} 条",  "GAW理论前沿调度任务结束!");
-        return 1;

+    /**
+     * Process the Shiju homepage DOM
+     * @param conn            prepared Jsoup connection for the list page
+     * @param datasourceCode  datasource code stored on each article
+     * @param datasourceName  datasource display name
+     * @return number of newly inserted articles
+     * @throws InterruptedException
+     * @throws IOException
+     * @throws ParseException
+     */
+    private int processShiJuDocument(Connection conn, String datasourceCode, String datasourceName) throws InterruptedException, IOException, ParseException {
+        int flagNums = 0;
+        Document document = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
+        Elements textsElement = document.getElementsByClass("new-ul-lj2 list-style");   // fetch the article list
+        Document tmpDocument = null;
+
+        for (Element e : textsElement) {
+            Elements tags = e.getElementsByTag("li");
+            for (Element tag : tags) {      // iterate over the article list
+                String href = tag.getElementsByTag("a").attr("href");
+                Thread.sleep(5000);    // throttle requests to avoid hitting the site too often
+                // crawl the article detail page
+                if (StringUtils.isEmpty(href)) {
+                    continue;   // no detail page
+                }
+                if (!href.contains("http://www.tj/Page/")) {
+                    href = "http://www.tj/Page/" + href;    // resolve relative links
+                }
+
+                // use the href to check whether this article was already crawled
+                LambdaQueryWrapper<StudyDocInfo> queryWrapper = new LambdaQueryWrapper<>();
+                queryWrapper.eq(StudyDocInfo::getUrl, href);
+                List<StudyDocInfo> studyDocInfos = studyDocInfoMapper.selectList(queryWrapper);
+
+                if (!studyDocInfos.isEmpty()) {      // article already recorded
+                    log.info("Article already recorded, skipping: {}", href);
+                    continue;
+                }
+
+                conn = Jsoup.connect(href);
+                tmpDocument = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
+
+                String title = tmpDocument.getElementsByClass("article_title").text();
+                String srcTimeStr = tmpDocument.getElementById("article-date").text().replace("发布时间:","");    // strip the publish-time label
+//                    System.err.println(tmpDocument.getElementById("article-source").text());
+//                    System.err.println( tmpDocument.getElementById("article-cypl-num").text().replace("参与评论",""));
+                String content = tmpDocument.getElementsByClass("article-p").toString();
+
+                // write to the database
+                StudyDocInfo studyDocInfo = new StudyDocInfo();
+                studyDocInfo.setDatasourceCode(datasourceCode);
+                studyDocInfo.setDatasourceName(datasourceName);
+                studyDocInfo.setTitle(title);
+                studyDocInfo.setContent(content);
+                studyDocInfo.setUrl(href);
+                studyDocInfo.setUrlType("0");    // 0 = 公安网 (police intranet), 1 = 互联网 (internet)
+                studyDocInfo.setDocSrcTime(DateUtils.parseDate(srcTimeStr,"yyyy-MM-dd HH:mm:ss"));
+                studyDocInfo.setPageViewNum(0);
+                studyDocInfo.setCreateUserId("-1");
+                studyDocInfo.setCreateUserName("系统管理员");
+                studyDocInfo.setCreatePoliceNo("-111111");
+
+                studyDocInfoMapper.insert(studyDocInfo);
+                flagNums++;
+            }
+        }
+        return flagNums;
     }
 }
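
One loose end: the removed code read the StudyCrawlerStatus row, and TestJsoupShiJu carries an "update the crawler status flag" placeholder, but processShiJuDocument never writes the flag back. A hedged sketch of that step, assuming studyCrawlerStatusMapper extends the MyBatis-Plus BaseMapper (its selectOne usage in the removed code suggests it does) and that StudyCrawlerStatus has a setStatusTime setter matching the getStatusTime getter used above:

    // Sketch only: persist this crawl's timestamp so the next run can skip older items.
    private void updateCrawlerStatus(String serviceCode) {
        LambdaQueryWrapper<StudyCrawlerStatus> wrapper = new LambdaQueryWrapper<>();
        wrapper.eq(StudyCrawlerStatus::getModuleCode, "study");
        wrapper.eq(StudyCrawlerStatus::getServiceCode, serviceCode);
        StudyCrawlerStatus status = studyCrawlerStatusMapper.selectOne(wrapper);
        if (status != null) {
            status.setStatusTime(System.currentTimeMillis());   // assumed setter for the Long statusTime field
            studyCrawlerStatusMapper.updateById(status);        // BaseMapper#updateById
        }
    }

Called at the end of processShiJuDocument, this would close the loop that the deleted status-read logic started.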