|
@@ -0,0 +1,84 @@
|
|
|
+package com.ruoyi.zzb.study.common;
|
|
|
+
|
|
|
+import org.jsoup.Connection;
|
|
|
+import org.jsoup.Jsoup;
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
+import org.jsoup.nodes.Element;
|
|
|
+import org.jsoup.nodes.Node;
|
|
|
+import org.jsoup.select.Elements;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.List;
|
|
|
+
|
|
|
+public class TestJsoup {
|
|
|
+
|
|
|
+ public static void main(String[] args) throws IOException {
|
|
|
+// String url = "https://news.sina.com.cn/gov/xlxw/2024-11-18/doc-incwmzsr3745291.shtml";
|
|
|
+// //直接获取DOM树
|
|
|
+// Document document = Jsoup.connect(url).get();
|
|
|
+// System.out.println(document.toString());
|
|
|
+
|
|
|
+ String url = "https://www.163.com/";
|
|
|
+
|
|
|
+ Connection conn = Jsoup.connect(url);
|
|
|
+ conn.data("page","2"); //拼接get请求参数 https://www.163.com/?page=2
|
|
|
+ conn.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36");
|
|
|
+ Document document = conn.timeout(4000).userAgent("Mozilla").get();
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ System.err.println(document.toString());
|
|
|
+ // 文字新闻
|
|
|
+ Elements textsElement = document.getElementsByClass("cm_ul_round");
|
|
|
+ for (Element e : textsElement) {
|
|
|
+ Elements tags = e.getElementsByTag("a");
|
|
|
+ for (Element tag : tags) {
|
|
|
+ // 标题
|
|
|
+ String title = tag.getElementsByAttribute("href").text();
|
|
|
+ // 链接地址,可以根据需求继续解析网址,获取新闻详细信息
|
|
|
+ String href = tag.attributes().get("href");
|
|
|
+ // 所属分类
|
|
|
+ String classification = null;
|
|
|
+ if (href.contains("?") && href.contains("clickfrom=w_")) {
|
|
|
+ classification = href.substring(href.lastIndexOf("?") + 1).replace("clickfrom=w_", "");
|
|
|
+ }
|
|
|
+ System.out.println(title);
|
|
|
+ System.out.println(href);
|
|
|
+ System.out.println(classification);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // 图片新闻
|
|
|
+ Elements imgs = document.getElementsByClass("cm_bigimg");
|
|
|
+ for (Element img : imgs) {
|
|
|
+ Elements photos = img.getElementsByClass("photo");
|
|
|
+ for (Element photo : photos) {
|
|
|
+ // 标题
|
|
|
+ String title = photo.attributes().get("title");
|
|
|
+ // 链接地址,可以根据需求继续解析网址,获取新闻详细信息
|
|
|
+ String href = photo.attributes().get("href");
|
|
|
+ // 封面图
|
|
|
+ String imgSrc = null;
|
|
|
+ List<Node> child = photo.childNodes();
|
|
|
+ for(Node node : child) {
|
|
|
+ if (node.hasAttr("data-original")) {
|
|
|
+ imgSrc = node.attributes().get("data-original");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // 所属分类
|
|
|
+ String classification = null;
|
|
|
+ if (href.contains("?") && href.contains("clickfrom=w_")) {
|
|
|
+ classification = href.substring(href.lastIndexOf("?") + 1).replace("clickfrom=w_", "");
|
|
|
+ }
|
|
|
+ System.out.println(title);
|
|
|
+ System.out.println(href);
|
|
|
+ System.out.println(imgSrc);
|
|
|
+ System.out.println(classification);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+}
|
|
|
+
|