4.HttpClient
- 网络爬虫就是用程序帮助我们访问网络上的资源,我们一直以来都是使用HTTP协议访问互联网的网页,网络爬虫需要编写程序,在这里使用同样的HTTP协议访问网页。
- 这里我们使用Java的HTTP协议客户端 HttpClient这个技术,来实现抓取网页数据。
4.1.GET请求
访问传智官网,请求url地址:
http://www.itcast.cn/
package cn.itcast.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HttpClient example: sends a plain GET request to a fixed URL and prints the
 * length of the response body.
 */
public class HttpGetTest {

    /**
     * Issues a GET request to http://www.itcast.cn and, when the status code is 200,
     * prints the number of characters in the body decoded with the "utf8" charset.
     * I/O failures are reported via {@code printStackTrace()} and do not propagate.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the HttpClient instance.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the GET request and set the target URL.
        HttpGet httpGet = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            // Send the request and obtain the response.
            response = httpClient.execute(httpGet);
            // Only parse the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may throw before response is assigned, so guard against null;
            // otherwise a NullPointerException here would also skip closing the client.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
请求参数
4.2.带参数的GET请求
在传智中搜索学习视频,地址为:http://yun.itheima.com/search?keys=Java
package cn.itcast.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;
/**
 * HttpClient example: sends a GET request whose query string is assembled with
 * {@link URIBuilder}, then prints the length of the response body.
 */
public class HttpGetParamTest {

    /**
     * Builds the URI http://yun.itheima.com/search with the query parameter
     * {@code keys=Java}, issues a GET request, and prints the body length when the
     * status code is 200. I/O failures while executing or closing are reported via
     * {@code printStackTrace()}.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the base URI cannot be parsed or built
     */
    public static void main(String[] args) throws Exception {
        // Create the HttpClient instance.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Start from the base address and add the query parameter through the builder.
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "Java");
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // Per the original author's note this prints the request line, e.g.
        // "GET http://yun.itheima.com/search?keys=Java HTTP/1.1", after the label.
        System.out.println("發(fā)起請(qǐng)求的信息"+httpGet);
        CloseableHttpResponse response = null;
        try {
            // Send the request and obtain the response.
            response = httpClient.execute(httpGet);
            // Only parse the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may throw before response is assigned, so guard against null;
            // otherwise a NullPointerException here would also skip closing the client.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
4.3.POST请求
package cn.itcast.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal HttpClient example: sends a POST request without a body to a fixed URL and
 * prints the length of the response body.
 */
public class HttpPostTest {

    /**
     * Issues a POST request to http://www.itcast.cn and, when the status code is 200,
     * prints the number of characters in the body decoded with the "utf8" charset.
     * I/O failures are reported via {@code printStackTrace()} and do not propagate.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the HttpClient instance.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the POST request and set the target URL.
        HttpPost httpPost = new HttpPost("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            // Send the request and obtain the response.
            response = httpClient.execute(httpPost);
            // Only parse the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may throw before response is assigned, so guard against null;
            // otherwise a NullPointerException here would also skip closing the client.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
4.4.带参数的POST请求
在传智中搜索学习视频,使用POST请求,url地址为:
http://yun.itheima.com/search
url地址没有参数,参数keys=java放到表单中进行提交
package cn.itcast.crawler.test;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
/**
 * HttpClient example: sends a POST request carrying URL-encoded form parameters and
 * prints the length of the response body.
 */
public class HttpPostParamTest {

    /**
     * Posts the form field {@code sfrom=baidu-top} to https://haokan.baidu.com and
     * prints the body length when the status code is 200. I/O failures while executing
     * or closing are reported via {@code printStackTrace()}.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the form entity cannot be created with the given encoding
     */
    public static void main(String[] args) throws Exception {
        // Create the HttpClient instance.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the POST request and set the target URL.
        HttpPost httpPost = new HttpPost("https://haokan.baidu.com");
        // Collect the form fields; this mirrors https://haokan.baidu.com/?sfrom=baidu-top
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("sfrom", "baidu-top"));
        // Wrap the fields in a form entity: first argument is the field list,
        // second argument is the character encoding.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8");
        // Attach the form entity to the POST request.
        httpPost.setEntity(formEntity);
        CloseableHttpResponse response = null;
        try {
            // Send the request and obtain the response.
            response = httpClient.execute(httpPost);
            // Only parse the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may throw before response is assigned, so guard against null;
            // otherwise a NullPointerException here would also skip closing the client.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
连接池
如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并断点查看每次获取的HttpClient都是不一样的。(文章来源:http://www.zghlxwxcb.cn/news/detail-406737.html)
package cn.itcast.crawler.test;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * HttpClient example: executes several GET requests through clients that share one
 * {@link PoolingHttpClientConnectionManager}.
 */
public class HttpClientPoolTest {

    /**
     * Creates a pooling connection manager (100 connections in total, 10 per route),
     * runs two GET requests through it, and shuts the manager down afterwards.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the connection pool manager.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the whole pool.
        cm.setMaxTotal(100);
        // Maximum number of connections per route (target host).
        cm.setDefaultMaxPerRoute(10);
        try {
            // Issue the requests through the shared connection manager.
            doGet(cm);
            doGet(cm);
        } finally {
            // The individual clients are intentionally left open (see doGet), so the
            // manager itself must be shut down here to release the pooled connections.
            cm.shutdown();
        }
    }

    /**
     * Builds a client backed by the given connection manager, sends a GET request to
     * https://www.baidu.com, and prints the body length when the status code is 200.
     * I/O failures are reported via {@code printStackTrace()}.
     *
     * @param cm the shared connection manager that supplies the connections
     */
    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // A new client object is built on every call; what is shared between calls is
        // the connection manager passed in, not the client.
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet httpGet = new HttpGet("https://www.baidu.com");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Do not close httpClient here: the connection manager is shared with
            // later calls and is shut down by main() once all requests are done.
        }
    }

    /**
     * Placeholder for a POST counterpart of {@link #doGet}; currently does nothing.
     *
     * @param cm the shared connection manager (unused)
     */
    private static void doPost(PoolingHttpClientConnectionManager cm) {
    }
}
请求参数
有时候因为网络,或者目标服务器的原因,请求需要更长的时间才能完成,我们需要自定义相关时间。(文章来源地址:http://www.zghlxwxcb.cn/news/detail-406737.html)
package cn.itcast.crawler.test;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * HttpClient example: sends a GET request with custom timeout settings supplied through
 * {@link RequestConfig} and prints the length of the response body.
 */
public class HttpConfigTest {

    /**
     * Issues a GET request to https://cn.bing.com with explicit connect,
     * connection-request and socket timeouts, and prints the body length when the
     * status code is 200. I/O failures are reported via {@code printStackTrace()}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the HttpClient instance.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Create the GET request and set the target URL.
        HttpGet httpGet = new HttpGet("https://cn.bing.com");
        // Per-request settings; all timeouts are in milliseconds.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)            // max time to establish the connection
                .setConnectionRequestTimeout(500)   // max time to obtain a connection
                .setSocketTimeout(10*1000)          // max time for data transfer
                .build();
        // Apply the settings to this request.
        httpGet.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            // Send the request and obtain the response.
            response = httpClient.execute(httpGet);
            // Only parse the body of a successful response.
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // execute() may throw (for example on a timeout) before response is
            // assigned, so guard against null; otherwise a NullPointerException here
            // would also skip closing the client.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
到了这里,关于HttpClient-爬虫的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!