Java代理實現爬取代理IP的示例
更新時間:2018年05月07日 11:13:35 作者:sdfiiiiii
今天小編就為大家分享一篇Java代理實現爬取代理IP的示例,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧。
僅僅使用了一個java文件,運行main方法即可,需要依賴的jar包是com.alibaba.fastjson(版本1.2.28)和Jsoup(版本1.10.2)
如果用了pom,那麼就是以下兩個依賴:
<dependency> <groupId>com.alibaba</groupId> <artifactId>fastjson</artifactId> <version>1.2.28</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
完整的代碼如下:
package com.tuniu.fcm.facade.IPProxy;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls free proxy-list pages and collects proxies that pass a connectivity check.
 * Requires com.alibaba.fastjson.JSONObject and Jsoup on the classpath.
 */
public class ProxyCralwerUnusedVPN {
    /** Matches an "ip port" pair in the text returned by {@code Document.text()}. */
    private static final Pattern IP_PORT_PATTERN =
            Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}");

    /** Number of proxies requested by the current thread's crawl. */
    ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>();
    /** Proxies collected so far by the current thread's crawl. */
    ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>();

    public static void main(String[] args) {
        ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN();
        // Number of proxy IPs to fetch, chosen by the caller.
        // (Asking for too many makes the call return slowly.)
        proxyCrawler.startCrawler(1);
    }

    /**
     * Entry point exposed to other modules.
     *
     * <p>Crawls the configured proxy-list sites until {@code wantedNumber} working proxies
     * have been found or every page has been visited, prints the JSON result to standard
     * output and returns it.
     *
     * @param wantedNumber number of proxy IPs the caller would like to obtain
     * @return JSON string with {@code success} and {@code data}
     *         ({@code numFound}, {@code pageNum}, {@code proxy})
     */
    public String startCrawler(int wantedNumber) {
        localWantedNumber.set(wantedNumber);
        // Start every run with an empty list: without this, a second call on the same
        // thread would see the previous run's proxies and return them unchecked.
        localProxyInfos.set(new ArrayList<ProxyInfo>());

        kuaidailiCom("http://www.xicidaili.com/nn/", 15);
        kuaidailiCom("http://www.xicidaili.com/nt/", 15);
        kuaidailiCom("http://www.xicidaili.com/wt/", 15);
        kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15);
        kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15);
        kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15);

        // Build the response payload.
        List<ProxyInfo> proxies = currentProxies();
        ProxyResponse response = new ProxyResponse();
        response.setSuccess("true");
        Map<String, Object> dataInfoMap = new HashMap<String, Object>();
        dataInfoMap.put("numFound", proxies.size());
        dataInfoMap.put("pageNum", 1);
        dataInfoMap.put("proxy", proxies);
        response.setData(dataInfoMap);

        String responseString = JSONObject.toJSON(response).toString();
        System.out.println(responseString);
        return responseString;
    }

    /**
     * Fetches pages {@code baseUrl + 1 + "/"} through {@code baseUrl + totalPage + "/"},
     * extracts "ip port" pairs from each page's text and keeps those that pass
     * {@link #checkProxy(String, Integer)}. Stops as soon as the wanted number of
     * proxies has been reached. A page that fails to load is reported and skipped.
     *
     * @param baseUrl   listing URL prefix, ending with "/"
     * @param totalPage number of listing pages to visit (inclusive)
     */
    private void kuaidailiCom(String baseUrl, int totalPage) {
        // "<=" so that the last of the totalPage pages is visited as well.
        for (int page = 1; page <= totalPage; page++) {
            if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                return;
            }
            String pageUrl = baseUrl + page + "/";
            try {
                // No explicit Host header: the HTTP client derives it from the URL, and a
                // fixed "www.kuaidaili.com" value is wrong for the xicidaili.com requests.
                // The Referer likewise points at the site actually being crawled.
                Document doc = Jsoup.connect(pageUrl)
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("Accept-Encoding", "gzip, deflate, sdch")
                        .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
                        .header("Cache-Control", "max-age=0")
                        .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                        .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
                        .header("Referer", baseUrl)
                        .timeout(30 * 1000)
                        .get();
                Matcher m = IP_PORT_PATTERN.matcher(doc.text());
                while (m.find()) {
                    if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                        break;
                    }
                    // The pattern guarantees exactly one space between ip and port.
                    String[] strs = m.group().split(" ");
                    if (checkProxy(strs[0], Integer.parseInt(strs[1]))) {
                        System.out.println("獲取到可用代理IP\t" + strs[0] + "\t" + strs[1]);
                        addProxy(strs[0], strs[1], "http");
                    }
                }
            } catch (Exception e) {
                // Best effort: report which page failed and continue with the next one.
                System.err.println("Failed to crawl " + pageUrl + ": " + e);
            }
        }
    }

    /**
     * Checks a proxy by requesting a test page through it with a 2-second timeout.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the request completed, {@code false} on any failure
     */
    private static boolean checkProxy(String ip, Integer port) {
        try {
            // http://1212.ip138.com/ic.asp can be replaced with any page that loads quickly.
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip, port)
                    .get();
            return true;
        } catch (Exception ignored) {
            // Any failure (timeout, refused connection, invalid port) means "not usable".
            return false;
        }
    }

    /**
     * Returns the current thread's proxy list, creating and registering it on first use.
     */
    private List<ProxyInfo> currentProxies() {
        List<ProxyInfo> proxyInfos = localProxyInfos.get();
        if (proxyInfos == null) {
            proxyInfos = new ArrayList<ProxyInfo>();
            localProxyInfos.set(proxyInfos);
        }
        return proxyInfos;
    }

    /**
     * @return number of proxies collected so far by the current thread
     */
    private int getCurrentProxyNumber() {
        return currentProxies().size();
    }

    /**
     * Appends a proxy to the current thread's list. The list is registered in the
     * thread-local if it did not exist yet, so the entry is never dropped.
     */
    private void addProxy(String ip, String port, String protocol) {
        currentProxies().add(new ProxyInfo(ip, port, protocol));
    }
}
/**
 * Plain data holder describing one proxy endpoint. Exposed through bean-style
 * getters and setters; credentials default to empty strings and
 * {@code is_internet} defaults to 1.
 */
class ProxyInfo {
    // Endpoint description supplied by the creator.
    private String ip;
    private String port;
    private String type;

    // Optional attributes, given their defaults in the constructor.
    private String userName;
    private String password;
    private int is_internet;

    /**
     * @param ip   proxy host address
     * @param port proxy port, kept as text
     * @param type proxy protocol label
     */
    public ProxyInfo(String ip, String port, String type) {
        this.ip = ip;
        this.port = port;
        this.type = type;
        this.userName = "";
        this.password = "";
        this.is_internet = 1;
    }

    public String getIp() {
        return this.ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getPort() {
        return this.port;
    }

    public void setPort(String port) {
        this.port = port;
    }

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getUserName() {
        return this.userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getPassword() {
        return this.password;
    }

    public void setPassword(String password) {
        this.password = password;
    }

    public int getIs_internet() {
        return this.is_internet;
    }

    public void setIs_internet(int is_internet) {
        this.is_internet = is_internet;
    }
}
/**
 * Response envelope: a textual success flag plus a free-form data map.
 * Both properties are unset ({@code null}) until assigned through the setters.
 */
class ProxyResponse {
    private Map<String, Object> data;
    private String success;

    /** @return the success flag as text, or {@code null} if never set */
    public String getSuccess() {
        return this.success;
    }

    /** @param success the success flag to store */
    public void setSuccess(String success) {
        this.success = success;
    }

    /** @return the payload map exactly as stored, or {@code null} if never set */
    public Map<String, Object> getData() {
        return this.data;
    }

    /** @param data the payload map to store (kept by reference) */
    public void setData(Map<String, Object> data) {
        this.data = data;
    }
}
以上這篇Java代理實現爬取代理IP的示例就是小編分享給大家的全部內容了,希望能給大家一個參考,也希望大家多多支持腳本之家。
相關(guān)文章
如何使用@AllArgsConstructor和final 代替 @Autowired
這篇文章主要介紹了使用@AllArgsConstructor和final代替@Autowired的方式,具有很好的參考價值,希望對大家有所幫助。如有錯誤或未考慮完全的地方,望不吝賜教。 2021-09-09
Mybatis-plus如何在xml中傳入自定義的SQL語句
這篇文章主要介紹了Mybatis-plus如何在xml中傳入自定義的SQL語句問題,具有很好的參考價值,希望對大家有所幫助。如有錯誤或未考慮完全的地方,望不吝賜教。 2024-05-05
Spring Boot項目實戰之實現與數據庫的連接
在我們日常的開發過程中,肯定不可避免地會使用到數據庫以及SQL語句,下面這篇文章主要給大家介紹了關於Spring Boot項目實戰之實現與數據庫連接的相關資料,文中通過實例代碼介紹得非常詳細,需要的朋友可以參考下。 2023-05-05
Mybatis-plus實現主鍵自增和自動注入時間的示例代碼
這篇文章主要介紹了Mybatis-plus實現主鍵自增和自動注入時間的示例代碼,文中通過示例代碼介紹得非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧。 2020-07-07

