Java Crawler Libraries Explained: 9 Topics for Easy Data Scraping
Java, as a widely used programming language, holds an irreplaceable position in data collection. Its built-in networking classes, together with a few mature third-party crawling libraries, let developers fetch large amounts of data with ease. This article breaks the topic down into the following 9 parts:
1. Overview of Java's crawler class libraries
2. Using the HttpURLConnection class
3. Using the Jsoup library
4. Using the HttpClient library
5. Using the Selenium WebDriver library
6. Hands-on case: scraping a Baidu Baike entry
7. Hands-on case: scraping the Sina Weibo trending search list
8. Hands-on case: scraping Taobao product listings
9. Hands-on case: scraping Zhihu questions and answers
When people speak of Java's crawler class libraries, they usually mean the networking classes that ship with the JDK plus a handful of third-party libraries for HTML parsing and browser automation. Strictly speaking, only the former are built into Java itself, but together they let developers complete web-crawling tasks quickly and efficiently.
The most commonly used built-in class is HttpURLConnection, which can open an HTTP or HTTPS connection to a server, send a request, and read the response. Here is a simple example:
import java.net.HttpURLConnection;
import java.net.URL;
import java.io.BufferedReader;
import java.io.InputStreamReader;

public class HttpUrlConnectionDemo {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://www.ucaiyun.com");
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestMethod("GET");
        // Read the response body line by line
        BufferedReader in = new BufferedReader(
                new InputStreamReader(con.getInputStream()));
        String inputLine;
        StringBuilder content = new StringBuilder();
        while ((inputLine = in.readLine()) != null) {
            content.append(inputLine);
        }
        in.close();
        con.disconnect();
        System.out.println(content.toString());
    }
}
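As written, the example sends no User-Agent header and will wait indefinitely on a stalled server. HttpURLConnection can set both request headers and timeouts before connecting; here is a minimal sketch (the User-Agent string and 5-second timeouts are illustrative choices, not required values):

import java.net.HttpURLConnection;
import java.net.URL;

public class HttpUrlConnectionConfigDemo {
    public static void main(String[] args) throws Exception {
        URL url = new URL("https://www.ucaiyun.com");
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setRequestMethod("GET");
        con.setConnectTimeout(5000); // give up if the TCP connection takes over 5s
        con.setReadTimeout(5000);    // give up if the server stops sending data
        // Illustrative User-Agent; many sites reject requests without one
        con.setRequestProperty("User-Agent", "Mozilla/5.0 (compatible; demo-crawler)");
        System.out.println("HTTP status: " + con.getResponseCode());
        con.disconnect();
    }
}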
Beyond the built-in HttpURLConnection, the third-party libraries Jsoup, HttpClient, and Selenium WebDriver are also very common and practical crawling tools.
Jsoup is a Java HTML parser: it parses HTML documents directly and provides a convenient API for extracting the data you need. A simple example:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupDemo {
    public static void main(String[] args) throws Exception {
        String url = "https://www.ucaiyun.com";
        Document doc = Jsoup.connect(url).get();
        // Get the content of the <title> tag
        String title = doc.title();
        System.out.println("title: " + title);
        // Get the target URL and text of every <a> tag with an href attribute
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            System.out.println("link: " + link.attr("href"));
            System.out.println("text: " + link.text());
        }
    }
}
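The same configuration matters with Jsoup: its Connection API can set a User-Agent, a timeout, and extra request headers before fetching. A minimal sketch (the header values here are illustrative):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class JsoupConfigDemo {
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect("https://www.ucaiyun.com")
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)") // illustrative UA
                .timeout(10000) // in milliseconds
                .header("Accept-Language", "zh-CN,zh;q=0.9")
                .get();
        System.out.println(doc.title());
    }
}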
Apache HttpClient is a powerful and flexible HTTP client for sending HTTP requests and processing responses from Java applications. A simple example:
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpClientDemo {
    public static void main(String[] args) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet("https://www.ucaiyun.com");
        // Execute the GET request and read the whole response body as a string
        String html = EntityUtils.toString(httpClient.execute(httpGet).getEntity());
        System.out.println(html);
        httpClient.close();
    }
}
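One caution about the demo above: CloseableHttpClient and CloseableHttpResponse both implement AutoCloseable, and in anything beyond a throwaway script it is safer to let try-with-resources release the connection even when parsing fails. A sketch of the same request in that style:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpClientTryWithResourcesDemo {
    public static void main(String[] args) throws Exception {
        HttpGet httpGet = new HttpGet("https://www.ucaiyun.com");
        // Both resources are closed automatically, even if an exception is thrown
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            System.out.println(response.getStatusLine().getStatusCode());
            System.out.println(EntityUtils.toString(response.getEntity()));
        }
    }
}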
Selenium WebDriver is a browser-automation tool that drives a real browser the way a user would, which also makes it handy for scraping pages whose content is rendered by JavaScript. A simple example:
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

public class SeleniumDemo {
    public static void main(String[] args) throws Exception {
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
        WebDriver driver = new ChromeDriver();
        driver.get("https://www.ucaiyun.com");
        // Grab the rendered text of the whole page body
        WebElement element = driver.findElement(By.tagName("body"));
        System.out.println(element.getText());
        driver.quit();
    }
}
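When a crawl runs on a server without a display, Chrome can be started in headless mode through ChromeOptions. A minimal sketch (the --headless=new flag targets recent Chrome releases; older versions use plain --headless):

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

public class HeadlessChromeDemo {
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless=new"); // no visible browser window
        WebDriver driver = new ChromeDriver(options);
        driver.get("https://www.ucaiyun.com");
        System.out.println(driver.getTitle());
        driver.quit();
    }
}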
The tools above are just the tip of the iceberg of Java's crawling ecosystem. Next, let's see how to use them in a few hands-on cases.
First, a case that scrapes a Baidu Baike entry: we want the title, summary, and body text of the "Java" entry.
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class BaiduBaikeDemo {
    public static void main(String[] args) throws IOException {
        String url = "https://baike.baidu.com/item/Java";
        Document doc = Jsoup.connect(url).get();
        // Entry title
        String title = doc.select("h1[class=lemma-title]").text().trim();
        System.out.println(title);
        // Entry summary (guard against layout changes removing the element)
        Element summaryElem = doc.selectFirst(".lemma-summary");
        if (summaryElem != null) {
            System.out.println(summaryElem.text().trim());
        }
        // Entry body
        Element contentElem = doc.selectFirst("#content-wrapper");
        if (contentElem != null) {
            contentElem.select(".edit-icon").remove();      // strip edit buttons
            contentElem.select(".lock-lemma-vip").remove(); // strip VIP lock notices
            // Write UTF-8 explicitly; FileWriter would use the platform default charset
            try (Writer writer = new OutputStreamWriter(
                    new FileOutputStream("./java.html"), StandardCharsets.UTF_8)) {
                writer.write(contentElem.html());
            }
        }
    }
}
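Because the body was saved to disk, it can be re-parsed offline later without hitting Baidu Baike again; Jsoup's file-parsing overload handles this. A short sketch:

import java.io.File;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class OfflineParseDemo {
    public static void main(String[] args) throws Exception {
        // Re-parse the fragment saved by BaiduBaikeDemo, declaring its charset
        Document doc = Jsoup.parse(new File("./java.html"), "UTF-8");
        String text = doc.text();
        // Print the first 200 characters as a sanity check
        System.out.println(text.substring(0, Math.min(200, text.length())));
    }
}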
Next, a case that scrapes the Sina Weibo trending search list: we want the keyword and search volume of the current top 10 entries.
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

public class WeiboHotSearchDemo {
    public static void main(String[] args) throws Exception {
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
        WebDriver driver = new ChromeDriver();
        driver.get("https://s.weibo.com/top/summary?cate=realtimehot");
        List<WebElement> items = driver.findElements(By.cssSelector(".td-02 a"));
        // Don't assume the page always returns at least 10 rows
        int limit = Math.min(10, items.size());
        for (int i = 0; i < limit; i++) {
            WebElement item = items.get(i);
            String keyword = item.getText();
            // The search volume lives in the sibling .td-03 cell of the same row
            WebElement countElem = item.findElement(
                    By.xpath("../../td[@class='td-03']/span"));
            String countStr = countElem.getText().replaceAll("[^\\d]", "");
            // Pinned entries may carry no number; skip the parse in that case
            int count = countStr.isEmpty() ? 0 : Integer.parseInt(countStr);
            System.out.printf("%d. %s - %d%n", i + 1, keyword, count);
        }
        driver.quit();
    }
}
Now a case that scrapes Taobao product listings: we search for "手机" (mobile phones) and collect the name, price, and sales volume of every item on the first 5 result pages.
import java.net.URLEncoder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class TaobaoDemo {
    public static void main(String[] args) throws Exception {
        String keywordEncoded = URLEncoder.encode("手机", "UTF-8");
        // s is the result offset: Taobao shows 44 items per page
        String urlTemplate =
                "https://s.taobao.com/search?q=%s&bcoffset=0&p4ppushleft=%%2C44&s=%d";
        for (int page = 1; page <= 5; page++) { // first 5 result pages
            Document doc = Jsoup
                    .connect(String.format(urlTemplate, keywordEncoded, (page - 1) * 44))
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) "
                            + "Gecko/20100101 Firefox/103.0")
                    .get();
            for (Element item : doc.select("#mainsrp-itemlist .item")) {
                Element titleElem = item.selectFirst(".title a");
                Element priceElem = item.selectFirst(".price strong");
                // Only treat .deal-cnt as a sales figure when it really reads "人付款",
                // and null-check it first to avoid a NullPointerException
                Element dealCntElem = item.selectFirst(".deal-cnt");
                Element salesVolumeElem =
                        (dealCntElem != null && dealCntElem.ownText().contains("人付款"))
                                ? dealCntElem : null;
                if (titleElem != null && priceElem != null && salesVolumeElem != null) {
                    String title = titleElem.text().replaceAll("[\n\r]", "").trim();
                    double price = Double.parseDouble(
                            priceElem.text().replaceAll("[^\\d\\.]", ""));
                    int salesVolume = Integer.parseInt(
                            salesVolumeElem.text().replaceAll("[^\\d]", ""));
                    System.out.printf("%s\t%.2f\t%d%n", title, price, salesVolume);
                }
            }
        }
    }
}
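A caveat: Taobao's search pages are rendered largely by JavaScript and normally require a logged-in session, so a bare Jsoup request like the one above may come back with no items at all. Jsoup's Connection can attach cookies copied from a logged-in browser, which sometimes helps; the sketch below only illustrates the API, and the cookie name and value are placeholders rather than real credentials:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class TaobaoWithCookieDemo {
    public static void main(String[] args) throws Exception {
        Document doc = Jsoup.connect("https://s.taobao.com/search?q=%E6%89%8B%E6%9C%BA")
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                // Placeholder cookie; copy the real name/value from your browser's dev tools
                .cookie("cookie_name", "cookie_value")
                .get();
        System.out.println(doc.title());
    }
}

Pausing between page requests (for example, a Thread.sleep inside the page loop) is also basic politeness toward the target site.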
Finally, a case that scrapes Zhihu questions and answers: we search for "机器学习" (machine learning) and collect each matching question and all of its answers from the first 3 result pages.
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;

public class ZhihuDemo {
    public static void main(String[] args) throws Exception {
        System.setProperty("webdriver.chrome.driver", "/path/to/chromedriver");
        WebDriver driver = new ChromeDriver();
        for (int page = 1; page <= 3; page++) { // first 3 result pages
            driver.get("https://www.zhihu.com/search?type=content"
                    + "&q=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0&page=" + page);
            int resultCount =
                    driver.findElements(By.cssSelector(".SearchResultCard")).size();
            for (int i = 0; i < resultCount; i++) {
                // Re-locate the result cards on every pass: navigating away and back
                // invalidates previously found elements (StaleElementReferenceException)
                List<WebElement> items =
                        driver.findElements(By.cssSelector(".SearchResultCard"));
                WebElement questionTitleElem =
                        items.get(i).findElement(By.cssSelector(".ContentItem-title a"));
                // Capture the title text before clicking; the element goes stale afterwards
                String questionTitle = questionTitleElem.getText();
                if (!questionTitle.contains("机器学习")) { continue; }
                questionTitleElem.click();
                Thread.sleep(1000); // wait for the question page to load
                // Not every question has a detail section, so avoid findElement here
                List<WebElement> detailElems =
                        driver.findElements(By.cssSelector(".QuestionHeader-detail span"));
                List<WebElement> answerElems =
                        driver.findElements(By.cssSelector(".List-item"));
                System.out.println(questionTitle);
                if (!detailElems.isEmpty()) {
                    System.out.println(detailElems.get(0).getText());
                }
                for (WebElement answer : answerElems) {
                    WebElement authorNameElem =
                            answer.findElement(By.cssSelector(".AuthorInfo-name"));
                    WebElement answerContentElem =
                            answer.findElement(By.cssSelector(".RichContent-inner"));
                    System.out.printf("%s:%n%s%n%n",
                            authorNameElem.getText(),
                            answerContentElem.getText());
                }
                driver.navigate().back();
                Thread.sleep(1000); // wait for the result list to reload
            }
        }
        driver.quit();
    }
}
That concludes this article's walkthrough of Java's crawler class libraries, the related tools, and the hands-on cases. You should now be able to collect web data with Java and apply these techniques flexibly in real projects. To learn more, visit 优采云 (www.ucaiyun.com), where we offer professional SEO optimization services and a range of technical training courses.