對Python3 解析html的幾種操作方式小結
解析html是爬蟲之後處理數據的一個重要環節。以下記錄解析html的幾種方式。
先介紹基礎的輔助函數,主要用於獲取html並輸出解析後的結果。
# The parser is passed in as an argument so it can be swapped easily below.
def get_html(url, paraser=None):
    """Download *url* and return the result of running *paraser* on the body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the (decompressed) response body and
            returns the parsed value. When omitted, ``bs4_paraser`` is looked
            up at call time.

    Returns:
        Whatever *paraser* returns when the response status is 200,
        otherwise ``None``.
    """
    import contextlib
    import gzip
    import io
    import zlib

    try:
        import urllib2 as url_lib  # Python 2
    except ImportError:
        import urllib.request as url_lib  # Python 3

    if paraser is None:
        # Resolved here instead of in the signature: a default argument is
        # evaluated when ``def`` runs, which is before bs4_paraser is defined.
        paraser = bs4_paraser

    # No explicit 'Host' entry: the URL library derives it from *url*, so the
    # header stays correct for other hosts and across redirects.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = url_lib.Request(url, headers=headers)
    # closing() releases the connection even if reading raises.
    with contextlib.closing(url_lib.urlopen(request)) as response:
        if response.code != 200:
            return None
        body = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''

    # Decompress only what the server says it compressed; an uncompressed
    # reply is handed to the parser untouched.
    content_encoding = content_encoding.lower()
    if 'gzip' in content_encoding:
        body = gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    elif 'deflate' in content_encoding:
        try:
            body = zlib.decompress(body)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            body = zlib.decompress(body, -zlib.MAX_WBITS)
    return paraser(body)
# Fetch the sample review page, parse it with lxml_parser and print each entry.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response, which cannot be iterated.
for row in value or []:
    # The call form of print behaves the same on Python 2 and Python 3.
    print(row)
1,使用lxml.html的方式進行解析。
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官網](http://lxml.de/)
def lxml_parser(page):
    """Extract the review entries from *page* with lxml XPath queries.

    Args:
        page: HTML source of the page.

    Returns:
        A list with one dict per ``div.item`` found under a
        ``div.yingping-list-wrap``. A dict carries the keys 'title',
        'title_href', 'score', 'time' and 'people' when the item contains a
        ``div`` with class "g-clear title-wrap", and is empty otherwise
        (the same result bs4_paraser gives for such an item).
    """
    data = []
    doc = etree.HTML(page)
    for wrap in doc.xpath('//div[@class="yingping-list-wrap"]'):
        # Every review is one div.item inside the wrapper.
        for item in wrap.xpath('.//div[@class="item"]'):
            value = {}
            title = item.xpath('.//div[@class="g-clear title-wrap"][1]')
            if title:
                head = title[0]
                value['title'] = head.xpath('./a/text()')[0]
                value['title_href'] = head.xpath('./a/@href')[0]
                # The first run of digits in the style attribute is divided
                # by 20; floor division keeps the score an int on Python 3.
                score_text = head.xpath('./div/span/span/@style')[0]
                score_text = re.search(r'\d+', score_text).group()
                value['score'] = int(score_text) // 20
                # Review time.
                value['time'] = head.xpath('./div/span[@class="time"]/text()')[0]
                # Number of people who liked the review.
                people_text = head.xpath('./div[@class="num"]/span/text()')[0]
                value['people'] = int(re.search(r'\d+', people_text).group())
            data.append(value)
    return data
2,使用BeautifulSoup,不多說了,大家可以上網找資料看看。
def bs4_paraser(html):
    """Extract the review entries from *html* with BeautifulSoup.

    Args:
        html: HTML source of the page.

    Returns:
        A list with one dict per ``div.item`` found inside the first
        ``div.yingping-list-wrap``. A dict carries the keys 'title',
        'title_href', 'score', 'time' and 'people' when the item contains a
        ``div`` with class "g-clear title-wrap", and is empty otherwise.
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # limit=1: only the first review container is inspected.
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Every review is one div.item inside the container.
        for r in row.find_all('div', attrs={'class': 'item'}):
            # A fresh dict per item, so entries never share state.
            value = {}
            title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if title:
                head = title[0]
                value['title'] = head.a.string
                value['title_href'] = head.a['href']
                # The first run of digits in the style attribute is divided
                # by 20; floor division keeps the score an int on Python 3.
                score_text = re.search(r'\d+', head.div.span.span['style']).group()
                value['score'] = int(score_text) // 20
                # Review time.
                value['time'] = head.div.find_all('span', attrs={'class': 'time'})[0].string
                # Number of people who liked the review.
                people_text = head.find_all('div', attrs={'class': 'num'})[0].span.string
                value['people'] = int(re.search(r'\d+', people_text).group())
            all_value.append(value)
    return all_value
3,使用SGMLParser,主要是通過start、end tag的方式進行解析,解析過程比較明朗,但是有點麻煩,而且該案例的場景不太適合該方法(哈哈)。
class CommentParaser(SGMLParser):
    """Collect review entries from SGMLParser tag callbacks.

    The ``start_*`` / ``end_*`` callbacks toggle private flags recording
    which of the tracked ``div`` containers is open. Attribute values and
    text seen while the matching flags are set are copied into a dict, and
    the dict is appended to ``data`` when the ``item`` div closes.

    Each entry may carry the keys 'href', 'title', 'score', 'time' and
    'people'. NOTE: the link is stored under 'href', whereas lxml_parser and
    bs4_paraser use 'title_href'.

    NOTE: a div with an untracked class nested inside the tracked containers
    sets no flag, so its closing tag is handled as if it closed the innermost
    tracked div.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Which of the tracked div containers are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside an <a> of the title block.
        self.__start_a = False
        # span state: 0 = none, 1 = span.rating opened, 2 = span.time opened,
        # 3 = span following span.rating handled, 4 = span inside div.num.
        self.__span_state = 0
        # Entry being assembled, and the list of finished entries.
        self.__value = {}
        self.data = []

    def _in_title_block(self):
        """Return True while wrapper, item and title-wrap divs are all open."""
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def start_div(self, attrs):
        """Flag the tracked container whose class attribute matches."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Close the innermost open container; an item close emits the entry."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
        elif not self.__start_div_gclear:
            # The item div is closing: the entry is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
        elif self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def start_a(self, attrs):
        """Record the href of a link inside the title block."""
        if self._in_title_block():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        if self._in_title_block() and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        """Advance the span state and read the score from a style attribute."""
        if not self._in_title_block():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span opened right after span.rating: its style holds the
                # number the score is derived from.
                for k, v in attrs:
                    if k == 'style' and v:
                        match = re.search(r'\d+', v)
                        # A style without digits is skipped instead of
                        # raising on a failed match.
                        if match:
                            # Floor division keeps the score an int on
                            # Python 3 as well.
                            self.__value['score'] = int(match.group()) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like-count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text may arrive in several chunks; chunks without digits
            # (e.g. whitespace) are ignored instead of crashing.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def sgl_parser(html):
    """Feed *html* to a new CommentParaser and return its ``data`` list."""
    comment_parser = CommentParaser()
    comment_parser.feed(html)
    return comment_parser.data
4,HTMLParser,與3原理相似,只是調用的方法不太一樣,基本上可以共用。
class CommentHTMLParser(HTMLParser.HTMLParser):
    """Collect review entries from HTMLParser tag callbacks.

    ``handle_starttag`` / ``handle_endtag`` toggle private flags recording
    which of the tracked ``div`` containers is open. Attribute values and
    text seen while the matching flags are set are copied into a dict, and
    the dict is appended to ``data`` when the ``item`` div closes.

    Each entry may carry the keys 'href', 'title', 'score', 'time' and
    'people'. NOTE: the link is stored under 'href', whereas lxml_parser and
    bs4_paraser use 'title_href'.

    NOTE: a div with an untracked class nested inside the tracked containers
    sets no flag, so its closing tag is handled as if it closed the innermost
    tracked div.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Which of the tracked div containers are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside an <a> of the title block.
        self.__start_a = False
        # span state: 0 = none, 1 = span.rating opened, 2 = span.time opened,
        # 3 = span following span.rating handled, 4 = span inside div.num.
        self.__span_state = 0
        # Entry being assembled, and the list of finished entries.
        self.__value = {}
        self.data = []

    def _in_title_block(self):
        """Return True while wrapper, item and title-wrap divs are all open."""
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def _start_div(self, attrs):
        """Flag the tracked container whose class attribute matches."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def _start_a(self, attrs):
        """Record the href of a link inside the title block."""
        if self._in_title_block():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def _start_span(self, attrs):
        """Advance the span state and read the score from a style attribute."""
        if not self._in_title_block():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span opened right after span.rating: its style holds the
                # number the score is derived from.
                for k, v in attrs:
                    # v is None for an attribute written without a value.
                    if k == 'style' and v:
                        match = re.search(r'\d+', v)
                        # A style without digits is skipped instead of
                        # raising on a failed match.
                        if match:
                            # Floor division keeps the score an int on
                            # Python 3 as well.
                            self.__value['score'] = int(match.group()) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def _end_div(self):
        """Close the innermost open container; an item close emits the entry."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
        elif not self.__start_div_gclear:
            # The item div is closing: the entry is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
        elif self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def handle_starttag(self, tag, attrs):
        """Dispatch div, a and span start tags to their handlers."""
        if tag == 'div':
            self._start_div(attrs)
        elif tag == 'a':
            self._start_a(attrs)
        elif tag == 'span':
            self._start_span(attrs)

    def handle_endtag(self, tag):
        """Undo the state set by the matching div, a or span start tag."""
        if tag == 'div':
            self._end_div()
        elif tag == 'a':
            if self._in_title_block() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like-count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text may arrive in several chunks; chunks without digits
            # (e.g. a label before the number) are ignored instead of
            # crashing.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())
def html_parser(html):
    """Feed *html* to a new CommentHTMLParser and return its ``data`` list."""
    comment_parser = CommentHTMLParser()
    comment_parser.feed(html)
    return comment_parser.data
3、4對於該案例來說確實不太適合,趁現在有空記錄下來,供學習使用!
以上這篇對Python3 解析html的幾種操作方式小結就是小編分享給大家的全部內容了,希望能給大家一個參考,也希望大家多多支持腳本之家。
相關(guān)文章
python switch 實現多分支選擇功能
這篇文章主要介紹了python switch 實現多分支選擇功能,本文給大家介紹的非常詳細,對大家的學習或工作具有一定的參考借鑒價值,需要的朋友可以參考下 2020-12-12
50行Python代碼實現視頻中物體顏色識別和跟蹤(必須以紅色為例)
本文通過50行Python代碼實現視頻中物體顏色識別和跟蹤效果,通過實例截圖和實例代碼給大家講解的非常詳細,需要的朋友可以參考下 2019-11-11
Linux CentOS Python開發環境搭建教程
這篇文章主要介紹了Linux CentOS Python開發環境搭建方法,非常不錯,具有一定的參考借鑒價值,需要的朋友可以參考下 2018-11-11
python使用threading獲取線程函數返回值的實現方法
這篇文章主要介紹了python使用threading獲取線程函數返回值的實現方法,需要的朋友可以參考下 2017-11-11
python lambda函數及三個常用的高階函數
這篇文章主要介紹了python lambda函數及三個常用的高階函數,本文給大家介紹的非常詳細,具有一定的參考借鑒價值,需要的朋友可以參考下 2020-02-02
python人工智能算法之人工神經網絡
這篇文章主要為大家介紹了python人工智能算法之人工神經網絡示例詳解,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進步,早日升職加薪 2023-03-03
python使用beautifulsoup4爬取酷狗音樂代碼實例
這篇文章主要介紹了python使用beautifulsoup4爬取酷狗音樂代碼實例,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友可以參考下 2019-12-12

