python爬虫抓取天气

· 2016-09-24 · # Python # 爬虫

已失效

一个小小小爬虫
由于天气 API 需要付费，就自己写了个爬虫把天气数据抓取下来，
然后挂在 OpenShift 上。
http://i.zxc.science/weatherapi?citycode=101190102&type=today

爬虫代码如下：

使用的是简单的正则匹配，效率应该不高；基于 Python 3。

#coding:utf-8
import urllib.request
import re

# Module-level author metadata.
__author__ = 'Taosky'

def now(citycode, *, html=None):
    """Scrape the current-day weather page of m.weathercn.com.

    Args:
        citycode: City identifier string placed in the ``id`` query
            parameter of the request URL (e.g. ``'101190102'``).
        html: Optional, keyword-only. Page source that was already
            downloaded. When given, no network request is made and
            ``citycode`` is not used.

    Returns:
        A dict of strings with the keys ``today_date``, ``today_weekday``,
        ``today_solarTerm``, ``today_curtime``, ``today_curtemp``,
        ``today_description``, ``today_sunrise``, ``today_sunset``,
        ``today_wind-direction``, ``today_wind_level``,
        ``today_air_pressure``, ``today_humidity``, ``today_wind_speed``
        and ``today_visibility``.

    Raises:
        AttributeError: If one of the single-value patterns is not found
            in the page (``re.search`` returned ``None``).
        IndexError: If the page holds fewer than two ``textdesc`` or fewer
            than four ``num`` paragraphs.
    """
    if html is None:
        url = 'http://m.weathercn.com/todayweather.do?id=' + citycode + '&partner=m'
        # The context manager closes the HTTP response even when reading
        # or decoding fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')

    def first(pattern):
        # Text captured by group 1 of the first match of `pattern`.
        return re.search(pattern, html).group(1)

    today = {}
    today['today_date'] = first(r'<span class="date">(.*?)</span>')
    today['today_weekday'] = first(r'<span class="weekday">(.*?)</span>')
    # Group 1 (the text only); a bare group() would return the whole
    # <span ...>...</span> match including the tags.
    today['today_solarTerm'] = first(r'<span class="solarTerm">(.*?)</span>')
    # \s*? skips optional whitespace between the <div> and its <span>.
    today['today_curtime'] = first(r'<div class="curtime">\s*?<span>(.*?)</span>')
    # The capture runs up to the closing </sup>, so it still contains the
    # opening <sup> tag of the unit marker; strip that tag.
    today['today_curtemp'] = first(
        r'<span class="cur-temp">(.*?)</sup>').replace('<sup>', '')
    today['today_description'] = first(r'<span class="description">(.*?)</span>')
    today['today_sunrise'] = first(
        r'<div class="sunrise"><i></i><span>.*?</span><span>(.*?)</span></div>')
    today['today_sunset'] = first(
        r'<div class="sunset"><i></i><span>.*?</span><span>(.*?)</span></div>')

    # Wind values are the first two "textdesc" paragraphs, in page order.
    wind = re.findall(r'<p\s*class="textdesc">(.*?)</p>', html)
    today['today_wind-direction'] = wind[0]
    today['today_wind_level'] = wind[1]

    # The remaining measurements are the first four "num" paragraphs, in
    # page order.
    readings = re.findall(r'<p class="num">(.*?)</p>', html)
    today['today_air_pressure'] = readings[0]
    today['today_humidity'] = readings[1]
    today['today_wind_speed'] = readings[2]
    today['today_visibility'] = readings[3]
    return today


def days(citycode, *, html=None):
    """Scrape the multi-day forecast table of m.weathercn.com.

    Args:
        citycode: City identifier string placed in the ``id`` query
            parameter of the request URL.
        html: Optional, keyword-only. Page source that was already
            downloaded. When given, no network request is made and
            ``citycode`` is not used.

    Returns:
        A dict of strings. For the N-th forecast row (counting from 1) it
        holds ``days_dateN``, ``days_temN`` and ``days_descN``. Tabs,
        newlines, carriage returns and spaces are removed from the
        temperature and description values. If the page yields different
        numbers of date, temp and desc cells, only the rows for which all
        three exist are returned. An empty dict is returned when nothing
        matches.
    """
    if html is None:
        url = 'http://m.weathercn.com/index.do?id=' + citycode + '&partner=m'
        # The context manager closes the HTTP response even when reading
        # or decoding fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')

    dates = re.findall(r'<td class="date">(.*?)</td>', html)
    # \s*? lets the match step over line breaks around the cell text;
    # "." itself does not match a newline.
    temps = re.findall(r'<td class="temp">\s*?(.*?)\s*?</td>', html)
    descs = re.findall(r'<td class="desc">\s*?(.*?)\s*?</td>', html)

    # One-pass removal of the layout whitespace left inside a capture.
    drop_whitespace = str.maketrans('', '', '\t\n\r ')

    forecast = {}
    # zip() stops at the shortest list, so uneven cell counts cannot
    # cause an out-of-range index.
    for position, (date, temp, desc) in enumerate(zip(dates, temps, descs), start=1):
        suffix = str(position)
        forecast['days_date' + suffix] = date
        forecast['days_tem' + suffix] = temp.translate(drop_whitespace)
        forecast['days_desc' + suffix] = desc.translate(drop_whitespace)
    return forecast

def hours(citycode, *, html=None):
    """Scrape the hour-by-hour forecast page of m.weathercn.com.

    Args:
        citycode: City identifier string placed in the ``id`` query
            parameter of the request URL.
        html: Optional, keyword-only. Page source that was already
            downloaded. When given, no network request is made and
            ``citycode`` is not used.

    Returns:
        A dict of strings. For the N-th hourly entry (counting from 1) it
        holds ``hours_timeN``, ``hours_temN`` and ``hours_descN``. If the
        page yields different numbers of time, temp and desc blocks, only
        the entries for which all three exist are returned. An empty dict
        is returned when nothing matches.
    """
    if html is None:
        # NOTE(review): "partner" is sent empty here, whereas the other
        # requests to this host send "partner=m" -- confirm which value
        # the site expects.
        url = 'http://m.weathercn.com/eachhours.do?id=' + citycode + '&partner='
        # The context manager closes the HTTP response even when reading
        # or decoding fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')

    times = re.findall(r'<div class="time">(.*?)</div>', html)
    temps = re.findall(r'<div class="temp">(.*?)</div>', html)
    descs = re.findall(r'<div class="desc">(.*?)</div>', html)

    hourly = {}
    # zip() stops at the shortest list, so uneven block counts cannot
    # cause an out-of-range index.
    for position, (time_text, temp, desc) in enumerate(zip(times, temps, descs), start=1):
        suffix = str(position)
        hourly['hours_time' + suffix] = time_text
        hourly['hours_tem' + suffix] = temp
        hourly['hours_desc' + suffix] = desc
    return hourly