A Detailed Guide to Implementing a Vertical Crawler System in Python
html_downloader
from urllib import request

def download(url):
    # Fetch the page and return its raw bytes, or None on failure.
    if url is None:
        return None
    response = request.urlopen(url)
    if response.getcode() != 200:
        return None
    return response.read()
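In practice the bare urlopen call above can hang indefinitely or be rejected by servers that filter urllib's default User-Agent. A minimal hardened sketch, assuming a 10-second timeout and a generic browser header (both arbitrary choices, not part of the original module):

from urllib import request, error

def download_with_headers(url, timeout=10):
    # Same contract as download(): page bytes on success, None otherwise.
    if url is None:
        return None
    req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = request.urlopen(req, timeout=timeout)
    except error.URLError:
        return None
    if response.getcode() != 200:
        return None
    return response.read()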
html_outputer
data_list = []

def collect_data(data):
    data_list.append(data)

def output_html():
    # Render every collected record as one table row.
    fout = open('output.html', 'w', encoding='utf-8')
    fout.write('<html>')
    fout.write('<body>')
    fout.write('<table>')
    for dataitem in data_list:
        fout.write('<tr>')
        fout.write('<td>%s</td>' % dataitem['url'])
        fout.write('<td>%s</td>' % dataitem['title'])
        fout.write('<td>%s</td>' % dataitem['datetime'])
        fout.write('<td>%s</td>' % dataitem['visitcount'])
        fout.write('</tr>')
    fout.write('</table>')
    fout.write('</body>')
    fout.write('</html>')
    fout.close()
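Note that the field values are written into the HTML verbatim, so a title containing < or & would corrupt the table. If that matters, each cell can be passed through html.escape from the standard library; render_row below is a hypothetical helper, not part of the original module:

import html

def render_row(dataitem):
    # Escape each field so characters like < or & cannot break the markup.
    cells = (dataitem['url'], dataitem['title'],
             dataitem['datetime'], dataitem['visitcount'])
    return '<tr>' + ''.join('<td>%s</td>' % html.escape(str(c))
                            for c in cells) + '</tr>'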
html_parser
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_new_urls(page_url, soup):
    # Collect in-site article links matching the news site's URL pattern.
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r"/\d+/\d+/\w+/page\.htm"))
    for link in links:
        new_url = link['href']
        new_full_url = urljoin(page_url, new_url)
        new_urls.add(new_full_url)
    return new_urls

def get_new_data(page_url, soup):
    # Extract the title, update time and visit count from an article page.
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text()
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text()
    res_data['url'] = page_url
    return res_data

def parse(page_url, html_cont):
    # Return (None, None) rather than a bare None so callers can always unpack.
    if page_url is None or html_cont is None:
        return None, None
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)
    return new_urls, new_data
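The parser can be exercised in isolation by feeding parse a synthetic page that mimics the structure it expects. The HTML below is made up for illustration; only the class names and the link pattern mirror what the functions look for:

sample_page = b'''
<html><body>
<h1 class="arti-title">Sample headline</h1>
<span class="arti-update">2021-11-01</span>
<span class="WP_VisitCount">42</span>
<a href="/2021/1101/c123a456/page.htm">next article</a>
</body></html>
'''
urls, data = parse('http://news.zzuli.edu.cn/index.htm', sample_page)
print(urls)  # {'http://news.zzuli.edu.cn/2021/1101/c123a456/page.htm'}
print(data['title'], data['datetime'], data['visitcount'])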
spider_main
import urls_manager, html_downloader, \
    html_parser, html_outputer

def craw(root_url):
    count = 1
    urls_manager.add_new_url(root_url)
    # start the crawl loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = html_downloader.download(new_url)
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()

if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)
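As written, a single malformed page aborts the whole crawl, because any exception raised while downloading or parsing propagates out of the loop. A hedged variant of the same loop, assuming the module imports above; craw_tolerant is a hypothetical name, not part of the original code:

def craw_tolerant(root_url, limit=10):
    # Like craw(), but logs per-page failures and keeps crawling.
    count = 1
    urls_manager.add_new_url(root_url)
    while urls_manager.has_new_url() and count <= limit:
        new_url = urls_manager.get_new_url()
        try:
            html_cont = html_downloader.download(new_url)
            new_urls, new_data = html_parser.parse(new_url, html_cont)
        except Exception as exc:
            print('craw %d failed: %s (%s)' % (count, new_url, exc))
            count += 1
            continue
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        count += 1
    html_outputer.output_html()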
test_64
from bs4 import BeautifulSoup
import re

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')

print('Get all links')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

print('Get the Lacie link')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('Regex match')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())

print('Get the title paragraph text')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
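bs4 also accepts CSS selectors through select and select_one. As a small addendum (not part of the original script), the same elements can be located like this:

print('CSS selectors')
for link in soup.select('p.story a.sister'):
    print(link['id'], link.get_text())
print(soup.select_one('#link2').get_text())  # Lacie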
urls_manager
new_urls = set()   # URLs waiting to be crawled
old_urls = set()   # URLs already crawled

def add_new_url(url):
    if url is None:
        return
    if url not in new_urls and url not in old_urls:
        new_urls.add(url)

def add_new_urls(urls):
    if urls is None or len(urls) == 0:
        return
    for url in urls:
        add_new_url(url)

def get_new_url():
    # Move one URL from the pending set to the crawled set.
    new_url = new_urls.pop()
    old_urls.add(new_url)
    return new_url

def has_new_url():
    return len(new_urls) != 0
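A quick interactive check of the deduplication behaviour (the URL is illustrative; note that the calls mutate the module-level sets):

add_new_url('http://news.zzuli.edu.cn/a/page.htm')
add_new_url('http://news.zzuli.edu.cn/a/page.htm')  # ignored: already pending
print(has_new_url())   # True
url = get_new_url()
add_new_url(url)       # ignored: already crawled
print(has_new_url())   # False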
Summary
That's all for this article. I hope it has been helpful, and please keep following 腳本之家 for more great content!