Python使用代理抓取網站圖片(多線程)
一、功能說明:
1. 多線程方式抓取代理服務器,并多線程驗證代理服務器
ps 代理服務器是從 http://www.cnproxy.com/ 抓取(測試只選擇了8個頁面)
2. 抓取一個網站的圖片地址,多線程隨機取一個代理服務器下載圖片
二、實現代碼
#!/usr/bin/env python
#coding:utf-8
import urllib2
import re
import threading
import time
import random

# Shared result lists, filled in by the worker threads defined below.
rawProxyList = []      # proxies scraped from cnproxy.com, not yet verified
checkedProxyList = []  # proxies that passed the liveness check
imgurl_list = []       # image URLs collected from the target site

# cnproxy.com obfuscates the port with javascript; this table maps each
# letter token used inside document.write() back to a digit.
portdicts = {'v': "3", 'm': "4", 'a': "2", 'l': "9", 'q': "0",
             'b': "5", 'i': "7", 'w': "6", 'r': "8", 'c': "1"}

# Proxy listing pages 1..8 to scrape.
targets = [r"http://www.cnproxy.com/proxy%d.html" % n for n in xrange(1, 9)]

# Pulls (ip, encoded-port, proxy-type, location) out of each table row.
p = re.compile(r'''<tr><td>(.+?)<SCRIPT type=text/javascript>document.write\(":"\+(.+?)\)</SCRIPT></td><td>(.+?)</td><td>.+?</td><td>(.+?)</td></tr>''')
#獲取代理的類
class ProxyGet(threading.Thread):
def __init__(self,target):
threading.Thread.__init__(self)
self.target = target
def getProxy(self):
print "代理服務(wù)器目標(biāo)網(wǎng)站: " + self.target
req = urllib2.urlopen(self.target)
result = req.read()
#print chardet.detect(result)
matchs = p.findall(result)
for row in matchs:
ip=row[0]
port =row[1]
port = map(lambda x:portdicts[x],port.split('+'))
port = ''.join(port)
agent = row[2]
addr = row[3].decode("cp936").encode("utf-8")
proxy = [ip,port,addr]
#print proxy
rawProxyList.append(proxy)
def run(self):
self.getProxy()
# Worker that verifies a slice of the scraped proxies.
class ProxyCheck(threading.Thread):
    """Thread that probes each proxy in its slice of the raw list.

    A proxy is accepted when it can fetch self.testUrl within self.timeout
    seconds and the response body contains self.testStr.  Working proxies
    are appended to the global checkedProxyList as
    (ip, port, location, seconds_taken) tuples.
    """

    def __init__(self, proxyList):
        threading.Thread.__init__(self)
        self.proxyList = proxyList            # slice of rawProxyList to verify
        self.timeout = 5                      # per-request timeout, seconds
        self.testUrl = "http://www.baidu.com/"
        self.testStr = "030173"               # marker expected on baidu's page

    def checkProxy(self):
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            t1 = time.time()
            try:
                req = opener.open(self.testUrl, timeout=self.timeout)
                result = req.read()
                timeused = time.time() - t1
                # BUGFIX: str.find() returns -1 when absent; the original
                # tested `pos > 1`, wrongly rejecting matches at offsets 0 and 1.
                if result.find(self.testStr) != -1:
                    checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
            except Exception:
                # Dead or slow proxies are expected; just skip them.
                continue

    def run(self):
        self.checkProxy()
# Collect image URLs from the target site.
def imgurlList(url_home):
    """Crawl url_home's front page, follow each article link found there,
    and append every image URL on those pages to the global imgurl_list.

    url_home -- site root, e.g. 'http://www.ivsky.com'.  Article links on
    the front page are site-relative, so they are joined onto url_home.
    """
    global imgurl_list
    home_page = urllib2.urlopen(url_home)
    url_re = re.compile(r'<li><a href="(.+?)" target="_blank" rel="nofollow">')
    pic_re = re.compile(r'<img src="(.*?\.\w{3,4})"')
    # BUGFIX: the original loop variable was named `imgurlList`, shadowing
    # this function's own name; renamed and switched to list.extend().
    for url in url_re.findall(home_page.read()):
        url_page = urllib2.urlopen(url_home + url)
        imgurl_list.extend(pic_re.findall(url_page.read()))
# Worker that downloads a slice of the image URLs.
class getPic(threading.Thread):
    """Thread that downloads its slice of image URLs, each through a
    randomly chosen proxy from the global checkedProxyList, saving every
    image into the current directory under a random numeric file name."""

    def __init__(self, imgurl_list):
        threading.Thread.__init__(self)
        self.imgurl_list = imgurl_list  # slice of image URLs to fetch
        self.timeout = 5                # per-download timeout, seconds

    def downloadimg(self):
        for imgurl in self.imgurl_list:
            pic_suffix = imgurl.split('.')[-1]  # file extension from the URL
            pic_name = str(random.randint(0, 10000000000)) + '.' + pic_suffix
            cookies = urllib2.HTTPCookieProcessor()
            # Pick a random verified proxy for this download.
            randomCheckedProxy = random.choice(checkedProxyList)
            proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (randomCheckedProxy[0], randomCheckedProxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            # BUGFIX: dropped urllib2.install_opener(opener) -- it mutated the
            # process-wide default opener from 20 threads at once and was never
            # used (downloads go through opener.open directly).
            try:
                data_img = opener.open(imgurl, timeout=self.timeout)
                # BUGFIX: `with` closes the file even when read()/write() fails
                # (the original leaked the handle on error); also narrowed the
                # bare `except:` so KeyboardInterrupt etc. are not swallowed.
                with open(pic_name, 'wb') as f:
                    f.write(data_img.read())
            except Exception:
                # A failed download through a flaky proxy is expected; skip it.
                continue

    def run(self):
        self.downloadimg()
if __name__ == "__main__":
getThreads = []
checkThreads = []
imgurlList('http://www.ivsky.com')
getPicThreads = []
#對每個目標(biāo)網(wǎng)站開啟一個線程負(fù)責(zé)抓取代理
for i in range(len(targets)):
t = ProxyGet(targets[i])
getThreads.append(t)
for i in range(len(getThreads)):
getThreads[i].start()
for i in range(len(getThreads)):
getThreads[i].join()
print '.'*10+"總共抓取了%s個代理" %len(rawProxyList) +'.'*10
#開啟20個線程負(fù)責(zé)校驗,將抓取到的代理分成20份,每個線程校驗一份
for i in range(20):
t = ProxyCheck(rawProxyList[((len(rawProxyList)+19)/20) * i:((len(rawProxyList)+19)/20) * (i+1)])
checkThreads.append(t)
for i in range(len(checkThreads)):
checkThreads[i].start()
for i in range(len(checkThreads)):
checkThreads[i].join()
print '.'*10+"總共有%s個代理通過校驗" %len(checkedProxyList) +'.'*10
#開啟20個線程隨機(jī)取一個代理下載圖片
for i in range(20):
t = getPic(imgurl_list[((len(imgurl_list)+19)/20) * i:((len(imgurl_list)+19)/20) * (i+1)])
getPicThreads.append(t)
for i in range(len(getPicThreads)):
getPicThreads[i].start()
for i in range(len(getPicThreads)):
getPicThreads[i].join()
print '.'*10+"總共有%s個圖片下載" %len(imgurl_list) +'.'*10
#代理排序持久化
f= open("proxy_list.txt",'w+')
for proxy in sorted(checkedProxyList,cmp=lambda x,y:cmp(x[3],y[3])):
#print "checked proxy is: %s:%s\t%s\t%s" %(proxy[0],proxy[1],proxy[2],proxy[3])
f.write("%s:%s\t%s\t%s\n"%(proxy[0],proxy[1],proxy[2],proxy[3]))
f.close()
三、測試結果:
# ls
proxy_getpic.py
# python proxy_getpic.py
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy1.html
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy2.html
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy3.html
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy4.html
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy5.html
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy6.html
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy7.html
代理服務(wù)器目標(biāo)網(wǎng)站: http://www.cnproxy.com/proxy8.html
..........總共抓取了800個代理..........
..........總共有458個代理通過校驗..........
..........總共有154個圖片下載..........
# cat proxy_list.txt | more
173.213.113.111:3128 United States 0.432188987732
173.213.113.111:8089 United States 0.441318035126
173.213.113.111:7808 United States 0.444597005844
110.4.24.170:80 香港 香港移動通訊有限公司 0.489440202713
211.142.236.135:8080 湖南省株洲市 移動 0.490673780441
211.142.236.135:8081 湖南省株洲市 移動 0.518096923828
211.142.236.135:8000 湖南省株洲市 移動 0.51860499382
211.142.236.135:8082 湖南省株洲市 移動 0.520448207855
# ls
1001117689.jpg 3097883176.jpg 5234319709.jpg 7012274766.jpg 8504924248.jpg
1076458640.jpg 3144369522.jpg 5387877704.jpg 7106183143.jpg 867723868.jpg
1198548712.jpg 3161307031.jpg 5572092752.jpg 7361254661.jpg 8746315373.jpg
165738192.jpg 3228008315.jpg 5575388077.jpg 7389537793.jpg 8848973192.jpg
1704512138.jpg 3306931164.jpg 5610740708.jpg 7407358698.jpg 8973834958.jpg
1742167711.jpg 3320152673.jpg 5717429022.jpg 7561176207.jpg 8976862152.jpg
...............
相關(guān)文章
Python的pdfplumber庫將pdf轉(zhuǎn)為圖片的實(shí)現(xiàn)
本文主要介紹了Python的pdfplumber庫將pdf轉(zhuǎn)為圖片的實(shí)現(xiàn),文中通過示例代碼介紹的非常詳細(xì),對大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價值,需要的朋友們下面隨著小編來一起學(xué)習(xí)學(xué)習(xí)吧2023-06-06
Python判斷字符串是否為字母或者數(shù)字(浮點(diǎn)數(shù))的多種方法
本文給大家?guī)砣N方法基于Python判斷字符串是否為字母或者數(shù)字(浮點(diǎn)數(shù)),非常不錯,具有一定的參考借鑒價值,需要的朋友可以參考下2018-08-08
使用python字典統(tǒng)計CSV數(shù)據(jù)的步驟和示例代碼
為了使用Python字典來統(tǒng)計CSV數(shù)據(jù),我們可以使用內(nèi)置的csv模塊來讀取CSV文件,并使用字典來存儲統(tǒng)計信息,以下是一個詳細(xì)的步驟和完整的代碼示例,需要的朋友可以參考下2024-12-12
Nginx搭建HTTPS服務(wù)器和強(qiáng)制使用HTTPS訪問的方法
這篇文章主要介紹了Nginx搭建HTTPS服務(wù)器和強(qiáng)制使用HTTPS訪問的方法,即從HTTP跳轉(zhuǎn)到HTTPS,需要的朋友可以參考下2015-08-08
解析Python 偏函數(shù)用法全方位實(shí)現(xiàn)
這篇文章主要介紹了解析Python 偏函數(shù)用法全方位實(shí)現(xiàn),文中通過示例代碼介紹的非常詳細(xì),對大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價值,需要的朋友們下面來一起學(xué)習(xí)學(xué)習(xí)吧2020-06-06

