Python實現多線程抓取妹子圖
心血來潮寫了個多線程抓妹子圖,雖然代碼還是有一些瑕疵,但是還是記錄下來,分享給大家。
Pic_downloader.py
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015
@author: Dreace
"""
import urllib2
import sys
import time
import os
import random
from multiprocessing.dummy import Pool as ThreadPool
# Filesystem encoding; the UTF-8 status messages below are re-encoded to it
# before being printed.
type_ = sys.getfilesystemencoding()
def rename():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``.

    The 14-digit string is used as the name of the per-run download directory.
    """
    return time.strftime("%Y%m%d%H%M%S")
def rename_2(name):
    """Return *name* zero-padded on the left to three characters, plus ``.jpg``.

    ``'7'`` becomes ``'007.jpg'`` and ``'42'`` becomes ``'042.jpg'``; names
    that already have three or more characters only get the extension.
    An empty name is not padded and yields ``'.jpg'``.
    """
    if name:
        name = name.rjust(3, '0')
    return name + '.jpg'
import threading  # needed only for the lock that guards the shared counter

# download_pic runs on many pool threads at once; ``count += 1`` is a
# read-modify-write on a global and loses updates without a lock.
_count_lock = threading.Lock()


def download_pic(i):
    """Download the image at URL *i* into the current working directory.

    URLs rejected by ``Filter`` are skipped. The file name is a random number,
    an underscore and the zero-padded value of the global ``count``; ``count``
    is incremented only after the file has been written, so it reflects the
    number of images actually saved.

    Any exception raised while fetching or saving is reported on stdout and
    the URL is skipped (best effort, one bad URL must not stop the pool).
    """
    global count
    if not Filter(i):
        return
    try:
        response = urllib2.urlopen(i, timeout=time_out)
        try:
            url_content = response.read()
        finally:
            response.close()
        file_name = (repr(random.randint(10000, 999999999)) + "_"
                     + rename_2(repr(count)))
        with open(file_name, "wb") as f:
            f.write(url_content)
        with _count_lock:
            count += 1
    except Exception:
        print(i + "下載超時(shí),跳過!".decode("utf-8").encode(type_))
def Filter(content, filter_list=None):
    """Return True when *content* contains none of the blocked substrings.

    Parameters:
        content: the string (an image URL) to check.
        filter_list: iterable of blocked substrings; a trailing newline on an
            entry is ignored. Defaults to the module-level ``Filter_list``.

    Returns:
        False as soon as one blocked substring is found in *content*,
        True otherwise (including when the list is empty).
    """
    if filter_list is None:
        filter_list = Filter_list
    for line in filter_list:
        if line.strip('\n') in content:
            return False
    return True
def get_pic(url_address):
    """Fetch one listing page and collect its ``.jpg`` links into ``pic_list``.

    The page body is split on double quotes and every resulting piece that
    contains ``.jpg`` is appended to the module-level ``pic_list``.

    Any exception raised while fetching is reported on stdout and the page is
    skipped (best effort, one bad page must not stop the pool).
    """
    try:
        response = urllib2.urlopen(url_address, timeout=time_out)
        try:
            str_ = response.read()
        finally:
            # The original never closed the response.
            response.close()
    except Exception:
        print("獲取圖片超時(shí),跳過!".decode("utf-8").encode(type_))
        return
    # Build the list first so the shared list is extended in a single call.
    found = [piece for piece in str_.split("\"") if piece.find(".jpg") != -1]
    pic_list.extend(found)
# ---- Configuration and shared state ----
MAX = 2            # listing pages 1..MAX are scanned
count = 0          # number of images saved; incremented by download_pic
time_out = 60      # timeout passed to urllib2.urlopen, in seconds
thread_num = 30    # worker threads per pool
pic_list = []      # image URLs collected by get_pic
page_list = []     # listing-page URLs to scan
Filter_list = ["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"]

# Every run saves into a fresh, timestamped directory. The backslashes are
# escaped explicitly; the original relied on "\P" not being an escape.
dir_name = "C:\\Photos\\" + rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()

url_address = "http://sexy.faceks.com/?page="
for i in range(1, MAX + 1):
    page_list.append(url_address + repr(i))

# Phase 1: scan the listing pages for image URLs.
page_pool = ThreadPool(thread_num)
page_pool.map(get_pic, page_list)
# Release the scanning threads before starting the download pool.
page_pool.close()
page_pool.join()
print("%s %s %s" % (
    "獲取到".decode("utf-8").encode(type_),
    len(pic_list),
    "張圖片,開始下載!".decode("utf-8").encode(type_),
))

# Phase 2: download every collected image.
pool = ThreadPool(thread_num)
pool.map(download_pic, pic_list)
pool.close()
pool.join()
print("%s %s" % (count, "張圖片保存在".decode("utf-8").encode(type_) + dir_name))
print("%s %s %s" % (
    "共耗時(shí)".decode("utf-8").encode(type_),
    time.time() - start_time,
    "s",
))
我們來看下一個網友的作品
#coding: utf-8 #############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python
import re,urllib2,HTMLParser,threading,Queue,time
# Entry links of the individual galleries
htmlDoorList = []
# Links of the HTML pages that contain images
htmlUrlList = []
# Queue of image URLs
imageUrlList = Queue.Queue(0)
# Number of image URLs captured so far
imageGetCount = 0
# Number of images downloaded so far
imageDownloadCount = 0
# Link of the next page of the gallery being crawled (set by ImageHtmlParser);
# compared with the gallery's entry link to detect the end of the gallery
nextHtmlUrl = ''
# Local directory prefix the images are saved under
localSavePath = '/data/1920x1080/'
# To download a different resolution, change replace_str. The available resolutions are: 1920x1200,1980x1920,1680x1050,1600x900,1440x900,1366x768,1280x1024,1024x768,1280x800
replace_str = '1920x1080'
replaced_str = '960x600'
# Parser for the inner (gallery) pages
class ImageHtmlParser(HTMLParser.HTMLParser):
    """Collect the big-image URL and the next-page link of a gallery page.

    Results are published through module-level state: image URLs are put on
    ``imageUrlList`` (and counted in ``imageGetCount``), the next-page link is
    stored in ``nextHtmlUrl``.
    """

    def __init__(self):
        self.nextUrl = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; attributes are matched by position."""
        global imageGetCount, nextHtmlUrl

        if tag == 'img':
            # Only <img id="bigImg" ...> with more than two attributes counts.
            if len(attrs) <= 2 or attrs[0] != ('id', 'bigImg'):
                return
            # Second attribute value is taken as the image URL
            # (presumably the src attribute; confirm against the page markup).
            url = attrs[1][1].replace(replaced_str, replace_str)
            imageUrlList.put(url)
            imageGetCount = imageGetCount + 1
            print(url)
            return

        if tag != 'a' or len(attrs) != 4:
            return
        is_next_link = (attrs[0] == ('id', 'pageNext')
                        and attrs[1] == ('class', 'next'))
        if is_next_link:
            # Third attribute value is taken as the next-page link.
            nextHtmlUrl = attrs[2][1]
# Parser for the index (listing) pages
# NOTE(review): the indentation of this class was lost in this copy of the
# source. The nesting described in the comments below is inferred from the
# statement order; in particular it is unclear whether the first `break`
# belongs to the `else` branch or to the enclosing `if`. Confirm against a
# properly indented copy before restructuring.
class IndexHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
# Values collected by handle_starttag, one per matched second-level tag
self.urlList = []
# Position in tagList/classList of the tag currently being looked for
self.index = 0
# Value recorded from the tag carrying id 'pageNext'
self.nextUrl = ''
# The two levels are searched in order: tagList[0] with an attribute value
# equal to classList[0], then tagList[1] with classList[1]
self.tagList = ['li','a']
self.classList = ['photo-list-padding','pic']
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
# Start tag of the kind currently being looked for
if(tag == self.tagList[self.index]):
for attr in attrs:
# Any attribute whose value equals the expected class name matches
if (attr[1] == self.classList[self.index]):
if(self.index == 0):
# First level found
self.index = 1
else:
# Second level found
self.index = 0
# The value of the tag's second attribute is recorded
print attrs[1][1]
self.urlList.append(attrs[1][1])
break
elif(tag == 'a'):
for attr in attrs:
# An <a> carrying id="pageNext": record the value of its second attribute
if (attr[0] == 'id' and attr[1] == 'pageNext'):
self.nextUrl = attrs[1][1]
print 'nextUrl:',self.nextUrl
break
# Parser for the index pages
indexParser = IndexHtmlParser()
# Parser for the inner (gallery) pages
imageParser = ImageHtmlParser()
# Walk the index pages to collect every gallery entry link
print('開始掃描首頁...')
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while (indexUrl != ''):
    print('%s %s' % ('正在抓取網(wǎng)頁:', host + indexUrl))
    request = urllib2.Request(host + indexUrl)
    try:
        m = urllib2.urlopen(request)
        try:
            con = m.read()
        finally:
            m.close()
    except urllib2.URLError as e:
        print(e.reason)
        # Stop scanning: the original retried the same URL forever, with no
        # delay, whenever a request failed.
        break
    indexParser.feed(con)
    # The parser keeps its previous nextUrl when a page has no new "next"
    # link, so an unchanged value means the last index page was reached.
    if indexUrl == indexParser.nextUrl:
        break
    indexUrl = indexParser.nextUrl
print('首頁掃描完成,所有圖集鏈接已獲得:')
htmlDoorList = indexParser.urlList
# Collect the URLs of all images, starting from the gallery entry links
class getImageUrl(threading.Thread):
    """Thread that walks every gallery and feeds its pages to ``imageParser``.

    ``imageParser`` puts the image URLs it finds on ``imageUrlList`` and
    stores the next-page link in the global ``nextHtmlUrl``.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Crawl each gallery in ``htmlDoorList`` page by page.

        A gallery is finished when the next-page link points back to its
        entry page, when no new next-page link was found, or when a request
        fails. The original looped forever in the last two cases.
        """
        global nextHtmlUrl
        for door in htmlDoorList:
            print('%s %s' % ('開始獲取圖片地址,入口地址為:', door))
            nextHtmlUrl = ''
            while door != '':
                # Fetch the entry page first, then follow the next-page links.
                current = nextHtmlUrl if nextHtmlUrl != '' else door
                # Report the page actually being fetched (the original always
                # printed the entry URL).
                print('開始從網(wǎng)頁%s獲取圖片...' % (host + current))
                request = urllib2.Request(host + current)
                try:
                    m = urllib2.urlopen(request)
                    try:
                        con = m.read()
                    finally:
                        m.close()
                except urllib2.URLError as e:
                    print(e.reason)
                    break
                imageParser.feed(con)
                print('%s %s' % ('下一個(gè)頁面地址為:', nextHtmlUrl))
                # '' or an unchanged link means the parser found no new
                # next-page link; door means the gallery wrapped around.
                if nextHtmlUrl in ('', door, current):
                    break
        print('%s %s' % ('所有圖片地址均已獲得:', imageUrlList))
class getImage(threading.Thread):
    """Thread that downloads every URL taken from ``imageUrlList``."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Download queued images until the queue is found empty.

        Each URL is fetched and saved under ``localSavePath`` using the
        ``<digits>.jpg`` part of the URL as file name; URLs without such a
        part are reported as ``no match`` and not saved.

        NOTE(review): the loop stops as soon as the queue is empty after an
        item was handled, so it can finish early if the producer thread is
        still running but momentarily behind. Confirm whether that matters.
        """
        # Declared up front: the original declared it after first use.
        global imageDownloadCount
        name_pattern = re.compile(r'[0-9]*\.jpg')
        print('開始下載圖片...')
        while True:
            print('%s %s' % ('目前捕獲圖片數(shù)量:', imageGetCount))
            print('%s %s' % ('已下載圖片數(shù)量:', imageDownloadCount))
            image = imageUrlList.get()
            print('%s %s' % ('下載文件路徑:', image))
            try:
                response = urllib2.urlopen(image)
                try:
                    cont = response.read()
                finally:
                    response.close()
            except urllib2.URLError as e:
                print(e.reason)
            else:
                match = name_pattern.search(image)
                if match:
                    print('%s %s' % ('正在下載文件:', match.group()))
                    filename = localSavePath + match.group()
                    with open(filename, 'wb') as f:
                        f.write(cont)
                    imageDownloadCount = imageDownloadCount + 1
                else:
                    print('no match')
            # Checked after failures too: the original skipped this check on
            # URLError and could then block forever on the next get().
            if imageUrlList.empty():
                break
        print('文件全部下載完成...')
# Start the thread that collects the image URLs.
get = getImageUrl()
get.start()
print '獲取圖片鏈接線程啟動(dòng):'
# Wait two seconds before starting the download thread
# (presumably so the URL queue already has entries by then; confirm).
time.sleep(2)
# Start the thread that downloads the queued images.
download = getImage()
download.start()
print '下載圖片鏈接線程啟動(dòng):'
批量抓取指定網頁上的所有圖片
# -*- coding:utf-8 -*-
# coding=UTF-8
import os,urllib,urllib2,re
# Page whose HTML is fetched and scanned for image links.
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
# Prefix prepended to the name of every saved file.
outpath = "t:\\"
def getHtml(url):
    """Fetch *url* and return the response body.

    The body is also printed to stdout. The connection is closed even when
    reading fails (the original never closed it).
    """
    webfile = urllib.urlopen(url)
    try:
        outhtml = webfile.read()
    finally:
        webfile.close()
    print(outhtml)
    return outhtml
def getImageList(html):
    """Return every image URL found in *html*, in order of appearance.

    A URL starts with ``http://`` or ``https://``, contains no whitespace,
    comma or double quote, and ends in ``.jpg``, ``.jpeg``, ``.png``,
    ``.gif`` or ``.bmp``. The resulting list is also printed to stdout.
    """
    extensions = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
    # One alternative per scheme/extension pair, in the original order. The
    # original listed ".jpeg" twice for https and so never matched https
    # ".jpg" links.
    alternatives = [
        scheme + r'://[^\s,"]*\.' + extension
        for scheme in ('http', 'https')
        for extension in extensions
    ]
    htmlurl = re.compile('(' + '|'.join(alternatives) + ')')
    imgList = htmlurl.findall(html)
    print(imgList)
    return imgList
def download(imgList, page):
    """Save every URL in *imgList* as ``pic_<page>_<index><ext>``.

    *page* and the 1-based index of the URL are zero-padded to 9 and 10
    digits; the extension is taken from the last path segment of the unquoted
    URL. The whole path (``outpath`` included) is lower-cased.
    """
    for x, imgurl in enumerate(imgList, 1):
        last_segment = urllib2.unquote(imgurl).decode('utf8').split('/')[-1]
        extension = os.path.splitext(last_segment)[1]
        base_name = outpath + 'pic_%09d_%010d' % (page, x)
        filepathname = str(base_name + str(extension)).lower()
        print('[Debug] Download file :' + imgurl + ' >> ' + filepathname)
        urllib.urlretrieve(imgurl, filepathname)
def downImageNum(pagenum):
    """Run the fetch / extract / download cycle for pages 1..*pagenum*.

    NOTE(review): the same module-level ``url`` is fetched on every pass; the
    page number only affects the saved file names. Confirm whether the URL was
    meant to vary per page.
    """
    for page in range(1, pagenum + 1):
        html = getHtml(url)              # HTML the url points to
        imageList = getImageList(html)   # all image addresses, as a list
        download(imageList, page)        # download every image


if __name__ == '__main__':
    downImageNum(1)
以上就是給大家匯總的3款Python實現的批量抓取妹紙圖片的代碼了,希望對大家學習Python爬蟲能夠有所幫助。
- Python使用代理抓取網站圖片(多線程)
- Python代理抓取并驗證使用多線程實現
- python多線程抓取天涯帖子內容示例
- python實現多線程抓取知乎用戶
- Python實現多線程抓取網頁功能實例詳解
- Python之多線程爬蟲抓取網頁圖片的示例代碼
- python Selenium爬取內容并存儲至MySQL數據庫的實現代碼
- Python實現批量讀取圖片并存入mongodb數據庫的方法示例
- Python3實現的爬蟲爬取數據并存入mysql數據庫操作示例
- Python基于多線程實現抓取數據存入數據庫的方法
相關文章
Python實現更改圖片尺寸大小的方法(基于Pillow包)
這篇文章主要介紹了Python實現更改圖片尺寸大小的方法,結合實例形式分析了Python基于Pillow包更改圖片屬性的相關技巧,需要的朋友可以參考下2016-09-09
使用python+pygame實現中秋節動畫效果
馬上就要中秋節了,使用python可以實現中秋節動畫效果,包括月亮、兔子和煙花嗎?當然是可以的,那該如何實現呢?這篇文章我們主要使用pygame來實現,文中有詳細的代碼示例供大家參考,需要的朋友可以參考下2023-09-09
python機器學習之決策樹分類詳解
這篇文章主要介紹了python機器學習之決策樹分類,具有一定的參考價值,感興趣的小伙伴們可以參考一下2017-12-12
Python+uiautomator2實現手機鎖屏解鎖功能
python-uiautomator2封裝了谷歌自帶的uiautomator2測試框架,提供便利的python接口,這篇文章給大家介紹使用Python+uiautomator2實現手機鎖屏解鎖(期望輸入的鎖屏密碼,基于滑動解鎖),感興趣的朋友一起看看吧2021-04-04
tensorflow學習教程之文本分類詳析
初學tensorflow,借鑒了很多別人的經驗,參考博客對評論分類(感謝博主的一系列好文),本人也嘗試著實現了對文本數據的分類,下面這篇文章主要給大家介紹了關于tensorflow學習教程之文本分類的相關資料,需要的朋友可以參考下2018-08-08
django之從html頁面表單獲取輸入的數據實例
這篇文章主要介紹了django之從html頁面表單獲取輸入的數據實例,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧2020-03-03
詳解Python Flask API 示例演示(附cookies和session)
這篇文章主要為大家介紹了Python Flask API 示例演示(附cookies和session)詳解,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進步,早日升職加薪2023-03-03

