How to Crawl a Foreign Weather Forecast Website with Python
Updated: July 10, 2015 11:32:18  Author: speedmancs
This article describes how to crawl a foreign weather forecast website with Python, covering practical techniques for scraping overseas weather forecast data. It may serve as a useful reference for readers who need it.
This article presents a working example of crawling a foreign weather forecast website (AccuWeather) with Python. It is shared here for your reference; the details are as follows:
crawl_weather.py is as follows:
#encoding=utf-8
import urllib2
import time
from threading import Thread
import threading
from Queue import Queue
import re

lang = "fr"
count = 0

class Location:
    # A small record type for a location (defined here but not used further in this script).
    # e.g. Location(False, "中國", "北京", "zh")   # country "China", city "Beijing"
    #      Location(True, "", "亞洲", "zh")        # continent "Asia"
    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country

prn_lock = threading.RLock()

def GetLocationURLs(url, recursive):
    """Walk AccuWeather's browse-locations tree and collect every
    weather-forecast page URL beneath the given url."""
    global count
    # Leaf page: a concrete forecast URL, so stop recursing.
    if url.find("weather-forecast") != -1:
        count = count + 1
        if count % 500 == 0:
            prn_lock.acquire()
            print "count:%d" % (count)
            prn_lock.release()
        return [url]
    page = urllib2.urlopen(url).read()
    time.sleep(0.01)
    # Matches entries like:
    # <h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>
    pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
    locs = re.findall(pattern, page)
    locs = [(url, name) for url, name in locs
            if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
    if not recursive:
        urls = [url for url, name in locs]
        return urls
    urls = []
    for _url, _name in locs:
        lst = GetLocationURLs(_url, True)
        urls.extend(lst)
    return urls
#entry_url = "http://www.accuweather.com/zh/browse-locations"
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
#regions = ["eur"]
#region_urls = ["%s/%s" % (entry_url, reg) for reg in regions]
#region_urls = ["http://www.accuweather.com/zh/browse-locations/eur/fr"]

# First level only: the administrative regions under France.
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls

q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()

for url in sub_urls:
    q.put(url)

def working():
    # Each worker takes one region URL and crawls it down to the leaves.
    while True:
        url = q.get()
        lst = GetLocationURLs(url, True)
        print "%s %d urls " % (url, len(lst))
        lock.acquire()
        location_urls.extend(lst)
        lock.release()
        q.task_done()

for i in range(ThreadNum):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
q.join()

fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()
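The whole crawl hinges on a single regular expression: every sub-location on an AccuWeather browse page is listed as an <h6><a ...><em>...</em></a></h6> entry, so one re.findall call per page yields (URL, name) pairs. A quick interpreter session on the sample line quoted in the comment above shows what the two groups capture:

>>> import re
>>> line = '<h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>'
>>> re.findall("<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>", line)
[('http://www.accuweather.com/zh/browse-locations/afr', 'Africa')]

The greedy (.*) groups only behave because each entry sits on its own line of the page source; if several anchors ever shared a line, the non-greedy (.*?) form would be the safer choice.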
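Both scripts reuse the same producer-consumer skeleton: jobs are pushed into a Queue, a fixed pool of daemon threads pulls them off, and q.join() blocks the main thread until every job has been acknowledged with task_done(). Stripped of the crawling details, the pattern looks like the following minimal sketch, where squaring a number stands in for the real per-URL work:

from Queue import Queue
from threading import Thread
import threading

q = Queue()
results = []
lock = threading.RLock()

def worker():
    while True:
        job = q.get()
        out = job * job        # stand-in for the real per-URL work
        lock.acquire()         # the results list is shared, so guard it
        results.append(out)
        lock.release()
        q.task_done()

for job in range(10):
    q.put(job)
for i in range(4):             # a small pool, like ThreadNum = 5 above
    t = Thread(target=worker)
    t.setDaemon(True)          # daemon threads exit with the main thread
    t.start()
q.join()                       # returns once every job is task_done()
print results

Because the threads are daemons, no explicit shutdown is needed: once q.join() returns and the script ends, the workers die with it.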
FetchLocation.py is as follows:
#encoding=utf-8
import sys
import urllib2
import threading
from threading import Thread
from Queue import Queue
import re
from xml.dom import minidom
import HTMLParser

q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}

def FindCountryBreadCrumbs(page):
    """Cut the <ul id="country-breadcrumbs"> ... </ul> block
    out of the raw HTML of a forecast page."""
    lines = page.splitlines()
    count = 0
    start = -1
    end = -1
    opened = False
    for line in lines:
        if line.find("<ul id=\"country-breadcrumbs\">") != -1:
            start = count
            opened = True
        if opened and line.find("</ul>") != -1:
            end = count
            opened = False
        count = count + 1
    return "\n".join(lines[start: (end + 1)])

def GetText(nodelist):
    # Concatenate the text children of a DOM node, unescaping HTML entities.
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(HTMLParser.HTMLParser().unescape(node.data))
    return ''.join(rc)

def FindCondition(page):
    # Weather conditions are rendered as <span class="cond">...</span>.
    pat = "<span class=\"cond\">(.*?)</span>"
    cds = re.findall(pat, page)
    cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
    return cds

def ExtractInfo(url):
    """Return (location names, weather conditions) scraped from one forecast page."""
    try:
        page = urllib2.urlopen(url).read()
    except Exception, e:
        return [], []  # keep the two-value shape so callers can still unpack
    text = FindCountryBreadCrumbs(page)
    text = HTMLParser.HTMLParser().unescape(text)
    dom = minidom.parseString(text.encode("utf-8"))
    locs = []
    lis = dom.getElementsByTagName("li")
    for li in lis:
        adr_list = li.getElementsByTagName("a")
        if adr_list:
            locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
        strs = li.getElementsByTagName("strong")
        if strs:
            locs.append(GetText(strs[0].childNodes).encode("utf-8"))
    cds = FindCondition(page)
    return locs, cds

def AddMap(lst, m):
    # Use the dict as a set: record each string once.
    for x in lst:
        if m.get(x) is None:
            m[x] = 1

def working():
    while True:
        urls = q.get()
        m = {}
        m2 = {}
        count = 0
        for url in urls:
            count = count + 1
            locs, cds = ExtractInfo(url)
            AddMap(locs, m)
            AddMap(cds, m2)
        # Merge this batch's results into the global maps under a lock.
        locks[1].acquire()
        AddMap(m.keys(), locations)
        AddMap(m2.keys(), conds)
        locks[1].release()
        q.task_done()

def main():
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    fp = open(loc_path, "r")
    urls = [line.strip() for line in fp]
    fp.close()
    # Split the URL list into one batch per worker thread.
    blocks = len(urls) / ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        end = start + blocks
        if end > len(urls):
            end = len(urls)
        q.put(urls[start:end])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        t.setDaemon(True)
        t.start()
    q.join()
    fp = open("location_name.fr", "w")
    fp.write("\n".join(locations.keys()))
    fp.close()
    fp = open("conditions.fr", "w")
    fp.write("\n".join(conds.keys()))
    fp.close()

if __name__ == '__main__':
    main()
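ExtractInfo relies on minidom to pull the location names out of the breadcrumb block: every <li> contributes either its <a> text (an ancestor location) or its <strong> text (the current location). The following self-contained sketch illustrates the same traversal; note the markup here is a simplified, hypothetical stand-in for AccuWeather's real breadcrumb HTML, which also carries href attributes and escaped entities:

from xml.dom import minidom

# Hypothetical, simplified breadcrumb markup, for illustration only.
snippet = ('<ul id="country-breadcrumbs">'
           '<li><a>Europe</a></li>'
           '<li><strong>France</strong></li></ul>')
dom = minidom.parseString(snippet)
for li in dom.getElementsByTagName("li"):
    for tag in ("a", "strong"):
        nodes = li.getElementsByTagName(tag)
        if nodes:
            print nodes[0].childNodes[0].data   # prints Europe, then France

With both scripts in place, the pipeline runs in two steps: crawl_weather.py writes the collected forecast-page URLs to locations.txt, and FetchLocation.py takes that file as its only argument, producing location_name.fr and conditions.fr:

python crawl_weather.py
python FetchLocation.py locations.txt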
Hopefully this article is helpful to readers working on Python programming.