Python RuntimeError errors and how to fix them
Here is the error message in question:
RuntimeError:
An attempt has been made to start a new process before the
current process has finished its bootstrapping phase.
This probably means that you are not using fork to start your
child processes and you have forgotten to use the proper idiom
in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program
is not going to be frozen to produce an executable.
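The error comes from the multiprocessing module on platforms that start child processes with "spawn" rather than "fork" (Windows always; macOS by default since Python 3.8). Each spawned child re-imports the main module, so any process-creating code at module top level runs again inside the child before its bootstrapping has finished. A minimal sketch (a hypothetical example, not from the original post) that reproduces the error:

import multiprocessing as mp

def square(x):
    return x * x

# The Pool is created at import time. Under "spawn", every child
# re-imports this module, tries to create its own Pool while still
# bootstrapping, and the RuntimeError above is raised.
pool = mp.Pool(2)
print(pool.map(square, range(4)))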
Here is the original code that triggered the error in my case:
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://morvanzhou.github.io/"

# crawl: fetch a page
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()

# parse: extract the title, internal links and canonical url from a page
def parse(html):
    soup = BeautifulSoup(html, 'html.parser')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

unseen = set([base_url])
seen = set()
restricted_crawl = True

pool = mp.Pool(4)                       # created at module top level: this is the bug
count, t1 = 1, time.time()
while len(unseen) != 0:                 # still get some url to visit
    if restricted_crawl and len(seen) > 20:
        break
    print('\nDistributed Crawling...')
    crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
    htmls = [j.get() for j in crawl_jobs]       # request connection

    print('\nDistributed Parsing...')
    parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
    results = [j.get() for j in parse_jobs]     # parse html

    print('\nAnalysing...')
    seen.update(unseen)                 # seen the crawled
    unseen.clear()                      # nothing unseen

    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)         # get new url to crawl
print('Total time: %.1f s' % (time.time()-t1))  # 16 s !!!

Here is the corrected code:
import multiprocessing as mp
import time
from urllib.request import urlopen, urljoin
from bs4 import BeautifulSoup
import re

base_url = "https://morvanzhou.github.io/"

# crawl: fetch a page
def crawl(url):
    response = urlopen(url)
    time.sleep(0.1)
    return response.read().decode()

# parse: extract the title, internal links and canonical url from a page
def parse(html):
    soup = BeautifulSoup(html, 'html.parser')
    urls = soup.find_all('a', {"href": re.compile('^/.+?/$')})
    title = soup.find('h1').get_text().strip()
    page_urls = set([urljoin(base_url, url['href']) for url in urls])
    url = soup.find('meta', {'property': "og:url"})['content']
    return title, page_urls, url

def main():
    unseen = set([base_url])
    seen = set()
    restricted_crawl = True

    pool = mp.Pool(4)                   # now created only inside main()
    count, t1 = 1, time.time()
    while len(unseen) != 0:             # still get some url to visit
        if restricted_crawl and len(seen) > 20:
            break
        print('\nDistributed Crawling...')
        crawl_jobs = [pool.apply_async(crawl, args=(url,)) for url in unseen]
        htmls = [j.get() for j in crawl_jobs]       # request connection

        print('\nDistributed Parsing...')
        parse_jobs = [pool.apply_async(parse, args=(html,)) for html in htmls]
        results = [j.get() for j in parse_jobs]     # parse html

        print('\nAnalysing...')
        seen.update(unseen)             # seen the crawled
        unseen.clear()                  # nothing unseen

        for title, page_urls, url in results:
            print(count, title, url)
            count += 1
            unseen.update(page_urls - seen)         # get new url to crawl
    print('Total time: %.1f s' % (time.time()-t1))  # 16 s !!!


if __name__ == '__main__':
    main()

In summary: wrap the code you want to run in a function (here, main()) and add

if __name__ == '__main__':
    main()

at the bottom of the module; that alone resolves the error.
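The same guard applied to the minimal sketch from earlier:

import multiprocessing as mp

def square(x):
    return x * x

def main():
    # The Pool is now created only in the parent process, after its
    # bootstrapping phase has finished; spawned children merely import
    # the module (picking up square) without creating another Pool.
    with mp.Pool(2) as pool:
        print(pool.map(square, range(4)))

if __name__ == '__main__':
    main()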
Python error: RuntimeError
This section covers errors of the form "RuntimeError: ... fails to pass a sanity check due to a bug in the windows runtime".
Causes of this error
1. An incompatibility between your Python and numpy versions; for example, the combination I was using, Python 3.9 with numpy 1.19.4, triggers this error.
2. numpy 1.19.4 is problematic with many current Python versions.
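To confirm which combination you are running, you can print both versions. Note that importing numpy directly would re-trigger the sanity-check error, since the check runs at import time; this sketch (assuming Python 3.8+ for importlib.metadata) reads the installed version from package metadata instead:

import sys
from importlib.metadata import version

# Read numpy's version from its package metadata without importing it,
# because the failing sanity check runs inside "import numpy".
print(sys.version)
print(version("numpy"))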
Solution
In PyCharm, downgrade numpy under File->Settings->Project:pycharmProjects->Project Interpreter:
1. Open the Project Interpreter page.
2. Double-click numpy to edit its version.
3. Tick the version checkbox (the version cannot be changed otherwise) and select the lower version you need.
Once that is done, rerun the program.
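If you are not using PyCharm, the same downgrade can be done from the command line; numpy 1.19.3 is the release commonly suggested for this particular bug, but any version compatible with your Python should work:

pip install numpy==1.19.3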
The above is based on my personal experience; I hope it gives everyone a useful reference, and I hope you will continue to support 腳本之家.