@@ -4,6 +4,8 @@ import com.baomidou.mybatisplus.core.conditions.Wrapper;
 import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
 import com.ruoyi.common.annotation.DataSource;
 import com.ruoyi.common.enums.DataSourceType;
+import com.ruoyi.common.utils.DateUtils;
+import com.ruoyi.common.utils.StringUtils;
 import com.ruoyi.zzb.study.common.UserAgentUtil;
 import com.ruoyi.zzb.study.domain.StudyCrawlerStatus;
 import com.ruoyi.zzb.study.domain.StudyDocInfo;
@@ -13,12 +15,16 @@ import lombok.extern.slf4j.Slf4j;
 import org.jsoup.Connection;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.scheduling.annotation.Scheduled;
 import org.springframework.stereotype.Service;
 
 import java.io.IOException;
+import java.text.ParseException;
+import java.util.List;
 
 @Slf4j
 @Service
@@ -40,62 +46,105 @@ public class CrondCrawlerService {
      * 学习园地 (Study Garden) column crawler
      */
    @Scheduled(cron = "${crawler.xxyd.crondTime}")
-    public int crawlerDocGawXXYD() throws IOException {
+    public int crawlerDocGawXXYD() throws IOException, ParseException, InterruptedException {
        log.info("{} : {} ms", "GAW学习园地调度任务开始!");
        Connection conn = Jsoup.connect(xxydUrl);
-        Document document = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
-
-        LambdaQueryWrapper<StudyCrawlerStatus> queryWrapper = new LambdaQueryWrapper();
-        queryWrapper.eq(StudyCrawlerStatus::getModuleCode,"study");
-        queryWrapper.eq(StudyCrawlerStatus::getServiceCode,"xxyd");
-        // read the previous status flag (last crawl time)
-        Long statusTime = studyCrawlerStatusMapper.selectOne(queryWrapper).getStatusTime();
-        System.err.println("xxyd--status"+statusTime);
-
-
-
-        // process the crawled articles
-
-        // filter out new records and persist them
-
-//        StudyDocInfo studyDocInfo = new StudyDocInfo();
-//        studyDocInfoMapper.insert();
-
-
-        // update the crawler status flag when finished
-        log.info("{} : {} 条", "GAW学习园地调度任务结束!");
-
-        return 1;
+        // TODO: read the page count and iterate the paged requests
+        //conn.data("page", "2"); // appends a GET query parameter, e.g. https://www.163.com/?page=2
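+        // A minimal sketch of that pagination loop, assuming the site accepts a
+        // "page" query parameter and that pageCount can be read from the list page
+        // (both are unverified assumptions, hence left commented out):
+        //   for (int page = 1; page <= pageCount; page++) {
+        //       Document paged = Jsoup.connect(xxydUrl)
+        //               .data("page", String.valueOf(page))
+        //               .timeout(10000).userAgent(UserAgentUtil.getRandomAgent()).get();
+        //       // feed each page's DOM through the same list-processing logic
+        //   }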
+
+        String datasourceCode = "2";
+        String datasourceName = "市局首页--学习园地";
+        int flagNums = processShiJuDocument(conn, datasourceCode, datasourceName);
+        log.info("{} : {} 条", "GAW学习园地调度任务结束!", flagNums);
+        return flagNums;
    }
+
    /**
      * 理论前沿 (Theory Frontier) column crawler
      */
    @Scheduled(cron = "${crawler.llqy.crondTime}")
-    public int crawlerDocGawLLQY() throws IOException {
+    public int crawlerDocGawLLQY() throws IOException, ParseException, InterruptedException {
        log.info("{} : {} ms", "GAW理论前沿调度任务开始!");
        Connection conn = Jsoup.connect(llqyUrl);
-        Document document = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
-        // read the previous status flag (last crawl time)
-        LambdaQueryWrapper<StudyCrawlerStatus> queryWrapper = new LambdaQueryWrapper();
-        queryWrapper.eq(StudyCrawlerStatus::getModuleCode,"study");
-        queryWrapper.eq(StudyCrawlerStatus::getServiceCode,"llqy");
-        // read the previous status flag (last crawl time)
-        Long statusTime = studyCrawlerStatusMapper.selectOne(queryWrapper).getStatusTime();
-        System.err.println("llqy--status"+statusTime);
-
-        // process the crawled articles
-
-        // filter out new records and persist them
 
-//        StudyDocInfo studyDocInfo = new StudyDocInfo();
-//        studyDocInfoMapper.insert();
 
+        String datasourceCode = "1";
+        String datasourceName = "市局首页--理论前沿";
+        int flagNums = processShiJuDocument(conn, datasourceCode, datasourceName);
+        log.info("{} : {} 条", "GAW理论前沿调度任务结束!", flagNums);
+        return flagNums;
+    }
 
-        // update the crawler status flag when finished
-        log.info("{} : {} 条", "GAW理论前沿调度任务结束!");
-        return 1;
+    /**
+     * Process the DOM of a 市局 (municipal bureau) homepage column
+     *
+     * @param conn jsoup connection to the column's article-list page
+     * @param datasourceCode code identifying the data source
+     * @param datasourceName display name of the data source
+     * @return number of newly persisted articles
+     * @throws InterruptedException
+     * @throws IOException
+     * @throws ParseException
+     */
+    private int processShiJuDocument(Connection conn, String datasourceCode, String datasourceName) throws InterruptedException, IOException, ParseException {
+        int flagNums = 0;
+        // 10s timeout; maxBodySize(0) lifts jsoup's body-size cap; random UA per request
+        Document document = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
+        Elements textsElement = document.getElementsByClass("new-ul-lj2 list-style"); // article list containers
+
+        for (Element e : textsElement) {
+            Elements tags = e.getElementsByTag("li");
+            for (Element tag : tags) { // iterate over the article list
+                String href = tag.getElementsByTag("a").attr("href");
+                // build the article detail URL
+                if (StringUtils.isNotEmpty(href) && !href.contains("http://www.tj/Page/")) {
+                    href = "http://www.tj/Page/" + href;
+                } else {
+                    continue; // empty or already-absolute href, treated as having no detail page
+                }
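+                // e.g. a relative href such as "Detail.aspx?id=123" (hypothetical)
+                // resolves to "http://www.tj/Page/Detail.aspx?id=123"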
+
+                // check by URL whether this article was already crawled
+                LambdaQueryWrapper<StudyDocInfo> queryWrapper = new LambdaQueryWrapper<>();
+                queryWrapper.eq(StudyDocInfo::getUrl, href);
+                List<StudyDocInfo> studyDocInfos = studyDocInfoMapper.selectList(queryWrapper);
+                if (!studyDocInfos.isEmpty()) { // article already recorded, skip it
+                    log.info("文章已记录,跳过: {}", href);
+                    continue;
+                }
+
+                Thread.sleep(5000); // throttle before each detail request to avoid hammering the server
+                // crawl the article detail page
+                conn = Jsoup.connect(href);
+                Document tmpDocument = conn.timeout(10000).maxBodySize(0).userAgent(UserAgentUtil.getRandomAgent()).get();
+
+                String title = tmpDocument.getElementsByClass("article_title").text();
+                String srcTimeStr = tmpDocument.getElementById("article-date").text().replace("发布时间:", "");
+//                System.err.println(tmpDocument.getElementById("article-source").text());
+//                System.err.println(tmpDocument.getElementById("article-cypl-num").text().replace("参与评论", ""));
+                String content = tmpDocument.getElementsByClass("article-p").toString();
+
+                // persist to the database
+                StudyDocInfo studyDocInfo = new StudyDocInfo();
+                studyDocInfo.setDatasourceCode(datasourceCode);
+                studyDocInfo.setDatasourceName(datasourceName);
+                studyDocInfo.setTitle(title);
+                studyDocInfo.setContent(content);
+                studyDocInfo.setUrl(href);
+                studyDocInfo.setUrlType("0"); // 0 = 公安网 (police intranet), 1 = 互联网 (internet)
+                studyDocInfo.setDocSrcTime(DateUtils.parseDate(srcTimeStr, "yyyy-MM-dd HH:mm:ss")); // assumes the page renders timestamps in this format
+                studyDocInfo.setPageViewNum(0);
+                studyDocInfo.setCreateUserId("-1"); // sentinel values: record created by the system, not a user
+                studyDocInfo.setCreateUserName("系统管理员");
+                studyDocInfo.setCreatePoliceNo("-111111");
+
+                studyDocInfoMapper.insert(studyDocInfo);
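+                // NOTE: dedup relies on the selectList check above; a unique index
+                // on the url column would also guard against concurrent runs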
+                flagNums++;
+            }
+        }
+        return flagNums;
    }
 }