|
@@ -8,6 +8,7 @@ import com.ruoyi.common.utils.DateUtils;
|
|
|
import com.ruoyi.common.utils.StringUtils;
|
|
|
import com.ruoyi.zzb.study.common.UserAgentUtil;
|
|
|
import com.ruoyi.zzb.study.domain.StudyCrawlerStatus;
|
|
|
+import com.ruoyi.zzb.study.domain.StudyDocDatasourceEnum;
|
|
|
import com.ruoyi.zzb.study.domain.StudyDocInfo;
|
|
|
import com.ruoyi.zzb.study.mapper.StudyCrawlerStatusMapper;
|
|
|
import com.ruoyi.zzb.study.mapper.StudyDocInfoMapper;
|
|
@@ -48,15 +49,15 @@ public class CrondCrawlerService {
|
|
|
/**
 * Scheduled crawl of the source identified by StudyDocDatasourceEnum.SHIJU_XXYD.
 *
 * NOTE: this span is a diff hunk. Rows starting with "-" show the code before
 * the change, rows starting with "+" show the code after it.
 *
 * Triggered by the cron expression read from the "crawler.xxyd.crondTime" property.
 * Opens a Jsoup connection to xxydUrl and hands it, together with the datasource
 * code and name, to processShiJuDocument.
 *
 * @return the value returned by processShiJuDocument (logged with the unit "条";
 *         presumably the number of documents handled — confirm against that method)
 * @throws IOException declared checked exception
 * @throws ParseException declared checked exception
 * @throws InterruptedException declared checked exception
 */
@Scheduled(cron = "${crawler.xxyd.crondTime}")
|
|
|
public int crawlerDocGawXXYD() throws IOException, ParseException, InterruptedException {
|
|
|
|
|
|
// The removed call passed a single argument for two "{}" placeholders; the added call logs a plain message.
- log.info("{} : {} ms", "GAW学习园地调度任务开始!");
|
|
|
+ log.info( "============【学习助手】GAW学习园地调度任务开始!");
|
|
|
Connection conn = Jsoup.connect(xxydUrl);
|
|
|
// Get the page number and iterate over the paginated requests
|
|
|
//conn.data("page","2"); // append a GET request parameter, e.g. https://www.163.com/?page=2
|
|
|
|
|
|
// The hard-coded datasource code/name literals are replaced by the enum constant's getCode()/getName().
- String datasourceCode ="2";
|
|
|
- String datasourceName = "市局首页--学习园地";
|
|
|
+ String datasourceCode = StudyDocDatasourceEnum.SHIJU_XXYD.getCode();
|
|
|
+ String datasourceName = StudyDocDatasourceEnum.SHIJU_XXYD.getName();
|
|
|
int flagNums = processShiJuDocument(conn,datasourceCode,datasourceName);
|
|
|
// Only the message prefix of the completion log changes; flagNums is still logged.
- log.info("{} : {} 条", "GAW学习园地调度任务结束!",flagNums);
|
|
|
+ log.info("{} : {} 条", "============【学习助手】 GAW学习园地调度任务结束!",flagNums);
|
|
|
return flagNums;
|
|
|
}
|
|
|
|
|
@@ -67,19 +68,19 @@ public class CrondCrawlerService {
|
|
|
/**
 * Scheduled crawl of the source identified by StudyDocDatasourceEnum.SHIJU_LLQY.
 *
 * NOTE: this span is a diff hunk. Rows starting with "-" show the code before
 * the change, rows starting with "+" show the code after it.
 *
 * Triggered by the cron expression read from the "crawler.llqy.crondTime" property.
 * Opens a Jsoup connection to llqyUrl and hands it, together with the datasource
 * code and name, to processShiJuDocument.
 *
 * @return the value returned by processShiJuDocument (logged with the unit "条";
 *         presumably the number of documents handled — confirm against that method)
 * @throws IOException declared checked exception
 * @throws ParseException declared checked exception
 * @throws InterruptedException declared checked exception
 */
@Scheduled(cron = "${crawler.llqy.crondTime}")
|
|
|
public int crawlerDocGawLLQY() throws IOException, ParseException, InterruptedException {
|
|
|
|
|
|
// The removed call passed a single argument for two "{}" placeholders; the added call logs a plain message.
- log.info("{} : {} ms", "GAW理论前沿调度任务开始!");
|
|
|
+ log.info("============【学习助手】 GAW理论前沿调度任务开始!");
|
|
|
Connection conn = Jsoup.connect(llqyUrl);
|
|
|
|
|
|
// The hard-coded datasource code/name literals are replaced by the enum constant's getCode()/getName().
- String datasourceCode ="1";
|
|
|
- String datasourceName = "市局首页--理论前沿";
|
|
|
+ String datasourceCode =StudyDocDatasourceEnum.SHIJU_LLQY.getCode();
|
|
|
+ String datasourceName = StudyDocDatasourceEnum.SHIJU_LLQY.getName();
|
|
|
int flagNums = processShiJuDocument(conn,datasourceCode,datasourceName);
|
|
|
// Only the message prefix of the completion log changes; flagNums is still logged.
- log.info("{} : {} 条", "GAW理论前沿调度任务结束!",flagNums);
|
|
|
+ log.info("{} : {} 条", "============【学习助手】 GAW理论前沿调度任务结束!",flagNums);
|
|
|
return flagNums;
|
|
|
}
|
|
|
|
|
|
|
|
|
/**
|
|
|
- * 处理市局首页 Dom
|
|
|
+ * 处理市局首页 Dom 结构获取文章详情
|
|
|
* @param conn
|
|
|
* @param datasourceCode
|
|
|
* @param datasourceName
|
|
@@ -98,7 +99,7 @@ public class CrondCrawlerService {
|
|
|
Elements tags = e.getElementsByTag("li");
|
|
|
for (Element tag : tags) { //遍历文章列表
|
|
|
String href = tag.getElementsByTag("a").attr("href");
|
|
|
- Thread.sleep(5000); //避免频繁访问 放慢请求速度
|
|
|
+ Thread.sleep(7000); //避免频繁访问 放慢请求速度
|
|
|
//开始爬文章详情页
|
|
|
if(StringUtils.isNotEmpty(href) && !href.contains("http://www.tj/Page/") ){
|
|
|
href = "http://www.tj/Page/"+href;
|
|
@@ -121,8 +122,6 @@ public class CrondCrawlerService {
|
|
|
|
|
|
String title = tmpDocument.getElementsByClass("article_title").text();
|
|
|
String srcTimeStr = tmpDocument.getElementById("article-date").text().replace("发布时间:","");
|
|
|
-// System.err.println(tmpDocument.getElementById("article-source").text());
|
|
|
-// System.err.println( tmpDocument.getElementById("article-cypl-num").text().replace("参与评论",""));
|
|
|
String content = tmpDocument.getElementsByClass("article-p").toString();
|
|
|
|
|
|
//写入数据库
|
|
@@ -132,9 +131,8 @@ public class CrondCrawlerService {
|
|
|
studyDocInfo.setTitle(title);
|
|
|
studyDocInfo.setContent(content);
|
|
|
studyDocInfo.setUrl(href);
|
|
|
- studyDocInfo.setUrlType("0");
|
|
|
- //公安网 0
|
|
|
- //互联网 1
|
|
|
+ studyDocInfo.setUrlType("0"); //公安网 0 //互联网 1
|
|
|
+
|
|
|
studyDocInfo.setDocSrcTime(DateUtils.parseDate(srcTimeStr,"yyyy-MM-dd HH:mm:ss"));
|
|
|
studyDocInfo.setPageViewNum(0);
|
|
|
studyDocInfo.setCreateUserId("-1");
|