Python實(shí)現(xiàn)多線程抓取妹子圖

更新時(shí)間：2015年08月08日 09:54:36 投稿：hebedich

本文給大家匯總了3款由Python制作的多線程批量抓取美圖的代碼，主要是將獲取圖片鏈接任務(wù)和下載圖片任務(wù)用線程分開來處理了，而且這次的爬蟲不僅僅可以爬第一頁的圖片鏈接的，有類似需求的小伙伴可以參考下。

心血來潮寫了個(gè)多線程抓妹子圖，雖然代碼還是有一些瑕疵，但是還是記錄下來，分享給大家。

Pic_downloader.py

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015
 
@author: Dreace
"""
import urllib2
import sys
import time
import os
import random
from multiprocessing.dummy import Pool as ThreadPool 
type_ = sys.getfilesystemencoding()
def rename():
  return time.strftime("%Y%m%d%H%M%S")
def rename_2(name): 
  if len(name) == 2: 
    name = '0' + name + '.jpg' 
  elif len(name) == 1: 
    name = '00' + name + '.jpg' 
  else: 
    name = name + '.jpg' 
  return name
def download_pic(i):
  global count
  global time_out
  if Filter(i):
    try: 
      content = urllib2.urlopen(i,timeout = time_out)
      url_content = content.read()
      f = open(repr(random.randint(10000,999999999)) + "_" + rename_2(repr(count)),"wb")
      f.write(url_content)
      f.close()
      count += 1
    except Exception, e:
      print i + "下載超時(shí)，跳過！".decode("utf-8").encode(type_)
def Filter(content):
  for line in Filter_list:
    line=line.strip('\n')
    if content.find(line) == -1:
      return True
def get_pic(url_address):
  global pic_list
  try:
    str_ = urllib2.urlopen(url_address, timeout = time_out).read()
    url_content = str_.split("\"")
    for i in url_content:
      if i.find(".jpg") != -1:
        pic_list.append(i)  
  except Exception, e:
    print "獲取圖片超時(shí)，跳過！".decode("utf-8").encode(type_)
MAX = 2
count = 0
time_out = 60
thread_num = 30
pic_list = []
page_list = []
Filter_list = ["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"]
dir_name = "C:\Photos\\"+rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()
url_address = "http://sexy.faceks.com/?page="
for i in range(1,MAX + 1): 
  page_list.append(url_address + repr(i))
page_pool = ThreadPool(thread_num)
page_pool.map(get_pic,page_list)
print "獲取到".decode("utf-8").encode(type_),len(pic_list),"張圖片，開始下載！".decode("utf-8").encode(type_)
pool = ThreadPool(thread_num) 
pool.map(download_pic,pic_list)
pool.close() 
pool.join()
print count,"張圖片保存在".decode("utf-8").encode(type_) + dir_name
print "共耗時(shí)".decode("utf-8").encode(type_),time.time() - start_time,"s"

我們來看下一個(gè)網(wǎng)友的作品

#coding: utf-8 #############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python

import re,urllib2,HTMLParser,threading,Queue,time

#各圖集入口鏈接
htmlDoorList = []
#包含圖片的Hmtl鏈接
htmlUrlList = []
#圖片Url鏈接Queue
imageUrlList = Queue.Queue(0)
#捕獲圖片數(shù)量
imageGetCount = 0
#已下載圖片數(shù)量
imageDownloadCount = 0
#每個(gè)圖集的起始地址，用于判斷終止
nextHtmlUrl = ''
#本地保存路徑
localSavePath = '/data/1920x1080/'

#如果你想下你需要的分辨率的，請(qǐng)修改replace_str,有如下分辨率可供選擇1920x1200，1980x1920,1680x1050,1600x900,1440x900,1366x768,1280x1024,1024x768,1280x800
replace_str = '1920x1080'

replaced_str = '960x600'

#內(nèi)頁分析處理類
class ImageHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.nextUrl = ''
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
global imageUrlList
if(tag == 'img' and len(attrs) > 2 ):
if(attrs[0] == ('id','bigImg')):
url = attrs[1][1]
url = url.replace(replaced_str,replace_str)
imageUrlList.put(url)
global imageGetCount
imageGetCount = imageGetCount + 1
print url
elif(tag == 'a' and len(attrs) == 4):
if(attrs[0] == ('id','pageNext') and attrs[1] == ('class','next')):
global nextHtmlUrl
nextHtmlUrl = attrs[2][1];

#首頁分析類
class IndexHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.urlList = []
self.index = 0
self.nextUrl = ''
self.tagList = ['li','a']
self.classList = ['photo-list-padding','pic']
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
if(tag == self.tagList[self.index]):
for attr in attrs:
if (attr[1] == self.classList[self.index]):
if(self.index == 0):
#第一層找到了
self.index = 1
else:
#第二層找到了
self.index = 0
print attrs[1][1]
self.urlList.append(attrs[1][1])
break
elif(tag == 'a'):
for attr in attrs:
if (attr[0] == 'id' and attr[1] == 'pageNext'):
self.nextUrl = attrs[1][1]
print 'nextUrl:',self.nextUrl
break

#首頁Hmtl解析器
indexParser = IndexHtmlParser()
#內(nèi)頁Html解析器
imageParser = ImageHtmlParser()

#根據(jù)首頁得到所有入口鏈接
print '開始掃描首頁...'
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while (indexUrl != ''):
print '正在抓取網(wǎng)頁:',host+indexUrl
request = urllib2.Request(host+indexUrl)
try:
m = urllib2.urlopen(request)
con = m.read()
indexParser.feed(con)
if (indexUrl == indexParser.nextUrl):
break
else:
indexUrl = indexParser.nextUrl
except urllib2.URLError,e:
print e.reason

print '首頁掃描完成，所有圖集鏈接已獲得：'
htmlDoorList = indexParser.urlList

#根據(jù)入口鏈接得到所有圖片的url
class getImageUrl(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
for door in htmlDoorList:
print '開始獲取圖片地址,入口地址為:',door
global nextHtmlUrl
nextHtmlUrl = ''
while(door != ''):
print '開始從網(wǎng)頁%s獲取圖片...'% (host+door)
if(nextHtmlUrl != ''):
request = urllib2.Request(host+nextHtmlUrl)
else:
request = urllib2.Request(host+door)
try:
m = urllib2.urlopen(request)
con = m.read()
imageParser.feed(con)
print '下一個(gè)頁面地址為:',nextHtmlUrl
if(door == nextHtmlUrl):
break
except urllib2.URLError,e:
print e.reason
print '所有圖片地址均已獲得:',imageUrlList

class getImage(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
global imageUrlList
print '開始下載圖片...'
while(True):
print '目前捕獲圖片數(shù)量:',imageGetCount
print '已下載圖片數(shù)量:',imageDownloadCount
image = imageUrlList.get()
print '下載文件路徑:',image
try:
cont = urllib2.urlopen(image).read()
patter = '[0-9]*\.jpg';
match = re.search(patter,image);
if match:
print '正在下載文件：',match.group()
filename = localSavePath+match.group()
f = open(filename,'wb')
f.write(cont)
f.close()
global imageDownloadCount
imageDownloadCount = imageDownloadCount + 1
else:
print 'no match'
if(imageUrlList.empty()):
break
except urllib2.URLError,e:
print e.reason
print '文件全部下載完成...'

get = getImageUrl()
get.start()
print '獲取圖片鏈接線程啟動(dòng):'

time.sleep(2)

download = getImage()
download.start()
print '下載圖片鏈接線程啟動(dòng):'

批量抓取指定網(wǎng)頁上的所有圖片

# -*- coding:utf-8 -*-
# coding=UTF-8
 
import os,urllib,urllib2,re
 
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\\"
 
def getHtml(url):
  webfile = urllib.urlopen(url)
  outhtml = webfile.read()
  print outhtml
  return outhtml
 
def getImageList(html):
  restr=ur'('
  restr+=ur'http:\/\/[^\s,"]*\.jpg'
  restr+=ur'|http:\/\/[^\s,"]*\.jpeg'
  restr+=ur'|http:\/\/[^\s,"]*\.png'
  restr+=ur'|http:\/\/[^\s,"]*\.gif'
  restr+=ur'|http:\/\/[^\s,"]*\.bmp'
  restr+=ur'|https:\/\/[^\s,"]*\.jpeg'  
  restr+=ur'|https:\/\/[^\s,"]*\.jpeg'
  restr+=ur'|https:\/\/[^\s,"]*\.png'
  restr+=ur'|https:\/\/[^\s,"]*\.gif'
  restr+=ur'|https:\/\/[^\s,"]*\.bmp'
  restr+=ur')'
  htmlurl = re.compile(restr)
  imgList = re.findall(htmlurl,html)
  print imgList
  return imgList
 
def download(imgList, page):
  x = 1
  for imgurl in imgList:
    filepathname=str(outpath+'pic_%09d_%010d'%(page,x)+str(os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1])).lower()
    print '[Debug] Download file :'+ imgurl+' >> '+filepathname
    urllib.urlretrieve(imgurl,filepathname)
    x+=1
 
def downImageNum(pagenum):
  page = 1
  pageNumber = pagenum
  while(page <= pageNumber):
    html = getHtml(url)#獲得url指向的html內(nèi)容
    imageList = getImageList(html)#獲得所有圖片的地址，返回列表
    download(imageList,page)#下載所有的圖片
    page = page+1
 
if __name__ == '__main__':
  downImageNum(1)

以上就是給大家匯總的3款Python實(shí)現(xiàn)的批量抓取妹紙圖片的代碼了，希望對(duì)大家學(xué)習(xí)Python爬蟲能夠有所幫助。

您可能感興趣的文章: