|
@@ -0,0 +1,171 @@
|
|
|
|
+package com.ruoyi.zzb.study.common;
|
|
|
|
+
|
|
|
|
+import com.ruoyi.common.utils.DateUtils;
|
|
|
|
+import org.jsoup.Connection;
|
|
|
|
+import org.jsoup.Jsoup;
|
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
|
+import org.jsoup.nodes.Element;
|
|
|
|
+import org.jsoup.nodes.Node;
|
|
|
|
+import org.jsoup.select.Elements;
|
|
|
|
+
|
|
|
|
+import java.io.IOException;
|
|
|
|
+import java.text.ParseException;
|
|
|
|
+
|
|
|
|
+import java.text.SimpleDateFormat;
|
|
|
|
+import java.time.LocalDate;
|
|
|
|
+import java.time.format.DateTimeFormatter;
|
|
|
|
+import java.time.temporal.ChronoUnit;
|
|
|
|
+import java.util.Calendar;
|
|
|
|
+import java.util.Date;
|
|
|
|
+import java.util.List;
|
|
|
|
+
|
|
|
|
+public class TestJsoup2 {
|
|
|
|
+
|
|
|
|
+ private static final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
|
|
|
+
|
|
|
|
+ public static void main(String[] args) throws IOException, ParseException, InterruptedException {
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// String url = "
|
|
|
|
+// String proxyHost = "127.0.0.1";
|
|
|
|
+// int proxyPort = 8888;
|
|
|
|
+// Document document = Jsoup.connect(url).proxy(proxyHost, proxyPort).get();
|
|
|
|
+
|
|
|
|
+// String url = "https://news.sina.com.cn/gov/xlxw/2024-11-18/doc-incwmzsr3745291.shtml";
|
|
|
|
+// //直接获取DOM树
|
|
|
|
+// Document document = Jsoup.connect(url).get();
|
|
|
|
+// System.out.println(document.toString());
|
|
|
|
+
|
|
|
|
+ String url = "https://difang.gmw.cn/tj/node_12889.htm";
|
|
|
|
+
|
|
|
|
+ Connection conn = Jsoup.connect(url);
|
|
|
|
+// conn.data("page","2"); //拼接get请求参数 https://www.163.com/?page=2
|
|
|
|
+ Document document = conn.timeout(10000)
|
|
|
|
+ .maxBodySize(0)
|
|
|
|
+ .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
|
|
|
|
+ .get();
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// System.err.println(document.toString());
|
|
|
|
+
|
|
|
|
+ Elements textsElement = document.getElementsByClass("channel-newsGroup"); //获取文章列表
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// <ul class="channel-newsGroup">
|
|
|
|
+// <li><a href="https://news.gmw.cn/2024-11/23/content_37694371.htm" target="_blank">努力造就高技能专业化劳动大军——天津积极推动职业教育高质量发展</a><span class="channel-newsTime">2024-11-23</span></li>
|
|
|
|
+// <li><a href="2024-11/21/content_37691304.htm" target="_blank">2024年世界职业技术教育发展大会在天津开幕</a><span class="channel-newsTime">2024-11-21</span></li>
|
|
|
|
+// <li><a href="2024-11/19/content_37686796.htm" target="_blank">老街区里潮流新|“新”在哪?天津这样作答!</a><span class="channel-newsTime">2024-11-19</span></li>
|
|
|
|
+// <li><a href="2024-11/19/content_37686559.htm" target="_blank">聚焦高质量发展|打造职教高地 绘就“出彩人生”——天津职教何以领跑?</a><span class="channel-newsTime">2024-11-19</span></li>
|
|
|
|
+// <li><a href="2024-11/19/content_37686041.htm" target="_blank">老街区里潮流新|这条古文化街浓缩了津派文化的万千气象</a><span class="channel-newsTime">2024-11-19</span></li>
|
|
|
|
+// </ul>
|
|
|
|
+// <ul class="channel-newsGroup">
|
|
|
|
+// <li><a href="https://news.gmw.cn/2024-11/15/content_37677590.htm" target="_blank">世界职业技术教育发展大会将举办</a><span class="channel-newsTime">2024-11-15</span></li>
|
|
|
|
+// <li><a href="https://news.gmw.cn/2024-11/14/content_37675130.htm" target="_blank">国内首座冷热能互换站在天津投产</a><span class="channel-newsTime">2024-11-14</span></li>
|
|
|
|
+// <li><a href="2024-11/13/content_37674847.htm" target="_blank">向“新”谋变 以“创”攀高——天津民营经济高质量发展一线观察</a><span class="channel-newsTime">2024-11-13</span></li>
|
|
|
|
+// <li><a href="https://news.gmw.cn/2024-11/11/content_37667637.htm" target="_blank">努力当好津甘协作牵线人</a><span class="channel-newsTime">2024-11-11</span></li>
|
|
|
|
+// <li><a href="2024-11/05/content_37658288.htm" target="_blank">天津生产消费一线走访见闻:强产业 稳投资 促消费</a><span class="channel-newsTime">2024-11-05</span></li>
|
|
|
|
+// </ul>
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ Document tmpDocument =null;
|
|
|
|
+ long flagTime = DateUtils.parseDate("2024-11-18","yyyy-MM-dd").getTime();
|
|
|
|
+
|
|
|
|
+ for(Element e : textsElement){
|
|
|
|
+
|
|
|
|
+ Elements tags = e.getElementsByTag("li");
|
|
|
|
+ for (Element tag : tags) {
|
|
|
|
+
|
|
|
|
+ String title = tag.getElementsByTag("a").text();
|
|
|
|
+ String href = tag.getElementsByTag("a").attr("href");
|
|
|
|
+ String newsTime = tag.getElementsByClass("channel-newsTime").text();
|
|
|
|
+
|
|
|
|
+ long docTims =DateUtils.parseDate(newsTime,"yyyy-MM-dd").getTime();
|
|
|
|
+
|
|
|
|
+ if (docTims >flagTime) { //新数据
|
|
|
|
+ Thread.sleep(5000L); //避免频繁访问 放慢请求速度
|
|
|
|
+ System.err.println(tag);
|
|
|
|
+ System.err.println(href);
|
|
|
|
+ System.err.println(title);
|
|
|
|
+ System.err.println(newsTime);
|
|
|
|
+ //开始爬文章详情页
|
|
|
|
+
|
|
|
|
+ if(!href.contains("https://news.gmw.cn/")){
|
|
|
|
+ href = "https://difang.gmw.cn/tj/"+href;
|
|
|
|
+ }
|
|
|
|
+ conn = Jsoup.connect(href);
|
|
|
|
+ tmpDocument = conn.timeout(10000)
|
|
|
|
+ .maxBodySize(0)
|
|
|
|
+ .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
|
|
|
|
+ .get();
|
|
|
|
+
|
|
|
|
+ tmpDocument.getElementById("articleBox");
|
|
|
|
+ System.err.println(tmpDocument.getElementsByClass("u-title").text());
|
|
|
|
+ System.err.println(tmpDocument.getElementsByClass("m-con-time").text());
|
|
|
|
+ System.err.println(tmpDocument.getElementsByClass("m-con-source").get(0).getElementsByTag("a").text());
|
|
|
|
+ System.err.println(tmpDocument.getElementById("articleBox"));
|
|
|
|
+
|
|
|
|
+ // 写入数据库
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ //更新 爬虫 状态位
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// // 文字新闻
|
|
|
|
+// Elements textsElement = document.getElementsByClass("cm_ul_round");
|
|
|
|
+// for (Element e : textsElement) {
|
|
|
|
+// Elements tags = e.getElementsByTag("a");
|
|
|
|
+// for (Element tag : tags) {
|
|
|
|
+// // 标题
|
|
|
|
+// String title = tag.getElementsByAttribute("href").text();
|
|
|
|
+// // 链接地址,可以根据需求继续解析网址,获取新闻详细信息
|
|
|
|
+// String href = tag.attributes().get("href");
|
|
|
|
+// // 所属分类
|
|
|
|
+// String classification = null;
|
|
|
|
+// if (href.contains("?") && href.contains("clickfrom=w_")) {
|
|
|
|
+// classification = href.substring(href.lastIndexOf("?") + 1).replace("clickfrom=w_", "");
|
|
|
|
+// }
|
|
|
|
+// System.out.println(title);
|
|
|
|
+// System.out.println(href);
|
|
|
|
+// System.out.println(classification);
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+// // 图片新闻
|
|
|
|
+// Elements imgs = document.getElementsByClass("cm_bigimg");
|
|
|
|
+// for (Element img : imgs) {
|
|
|
|
+// Elements photos = img.getElementsByClass("photo");
|
|
|
|
+// for (Element photo : photos) {
|
|
|
|
+// // 标题
|
|
|
|
+// String title = photo.attributes().get("title");
|
|
|
|
+// // 链接地址,可以根据需求继续解析网址,获取新闻详细信息
|
|
|
|
+// String href = photo.attributes().get("href");
|
|
|
|
+// // 封面图
|
|
|
|
+// String imgSrc = null;
|
|
|
|
+// List<Node> child = photo.childNodes();
|
|
|
|
+// for(Node node : child) {
|
|
|
|
+// if (node.hasAttr("data-original")) {
|
|
|
|
+// imgSrc = node.attributes().get("data-original");
|
|
|
|
+// break;
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+// // 所属分类
|
|
|
|
+// String classification = null;
|
|
|
|
+// if (href.contains("?") && href.contains("clickfrom=w_")) {
|
|
|
|
+// classification = href.substring(href.lastIndexOf("?") + 1).replace("clickfrom=w_", "");
|
|
|
|
+// }
|
|
|
|
+// System.out.println(title);
|
|
|
|
+// System.out.println(href);
|
|
|
|
+// System.out.println(imgSrc);
|
|
|
|
+// System.out.println(classification);
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+//
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+}
|
|
|
|
+
|