python 编码问题

__author__ = 'dell'
# -*- coding: utf-8 -*-

from lxml import etree
import urllib2
import time


def loadCategory():
    """Load the category-name -> category-id table from ``catetory.txt``.

    The file (name spelled as in the original data set) is decoded as GBK,
    one record per line, with tab-separated fields: the id comes first and
    the name second. Lines with fewer than two fields are skipped. Every
    name is printed as it is loaded.

    Returns:
        dict: category name -> category id, both as decoded text. If a
        name occurs more than once, the last occurrence wins.
    """
    res = {}
    # Binary mode keeps each line as raw bytes, so the explicit GBK decode
    # below is the only decoding step; ``with`` guarantees the handle is
    # closed even if a line fails to decode.
    with open('catetory.txt', 'rb') as f_txt:
        for raw in f_txt:
            line = raw.strip().decode('gbk')
            tokens = line.split('\t')
            if len(tokens) < 2:
                continue
            key = tokens[1].strip()
            print(key)
            res[key] = tokens[0].strip()
    return res


def loadCity():
    """Load the city-name -> city-id table from ``city.txt``.

    The file is decoded as GBK, one record per line, in the form
    ``name:id``. Lines without a ``:`` separator are skipped. When a city
    name appears more than once, the first entry is kept and a
    ``repeated city:`` message is printed for each later occurrence.

    Returns:
        dict: city name -> city id, both as decoded text.
    """
    res = {}
    # Binary mode keeps each line as raw bytes, so the explicit GBK decode
    # below is the only decoding step; ``with`` guarantees the handle is
    # closed even if a line fails to decode.
    with open('city.txt', 'rb') as f_txt:
        for raw in f_txt:
            tokens = raw.strip().decode('gbk').split(':')
            if len(tokens) < 2:
                continue
            key = tokens[0].strip()
            if key in res:
                # Single formatted argument so the output is the same
                # whether print is a statement or a function.
                print('repeated city: %s' % key)
            else:
                res[key] = tokens[1].strip()
    return res


# Load both lookup tables once, at module level; gen() reads them as globals.
cats = loadCategory()
# Debug dump of the category table (disabled):
# for key in cats.keys():
#     print key, cats[key]

citys = loadCity()
# Debug dump of the city table (disabled):
# for key in citys.keys():
#     print key, citys[key]

print 'length of category:', len(cats)
print 'length of citys:', len(citys)

print 'generating urls ... ...'

# URL template; gen() fills the two slots with (city id, category id), in
# that order.
standard = 'http://www.dianping.com/search/category/%s/%s'


def gen(cateName):
    """Build the search URL of one category for every known city.

    Args:
        cateName: category name, looked up in the module-level ``cats``
            table.

    Returns:
        list: ``(url, cityName)`` tuples, one per entry in the module-level
        ``citys`` table, where ``url`` is ``standard`` filled with the city
        id and the category id. Empty when ``cateName`` is not in ``cats``.
    """
    # Test membership on the dict itself; ``cats.keys()`` would build a
    # throwaway list and scan it linearly on Python 2.
    if cateName not in cats:
        return []
    catId = cats[cateName]
    return [(standard % (cityId, catId), cityName)
            for cityName, cityId in citys.items()]


def getHtml(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    The request carries a desktop Firefox ``User-Agent`` header and uses a
    45-second timeout.

    Args:
        url: address of the page to fetch.

    Returns:
        The response body decoded with the ``utf8`` codec.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8.
        Errors raised by ``urllib2.urlopen`` propagate unchanged.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    response = urllib2.urlopen(request, timeout=45)
    # Close the response explicitly so the underlying socket is released
    # even when read() or decode() raises; try/finally is used rather than
    # ``with`` because urllib2 responses are not context managers.
    try:
        return response.read().decode('utf8')
    finally:
        response.close()


def getFetchHour(count):
    """Return the estimated fetch time for ``count`` items, in hours.

    Multiplies ``count`` by 5.0 and divides by 3600, i.e. it presumably
    budgets 5 seconds per item and converts seconds to hours.
    """
    seconds = count * 5.0
    return seconds / 3600


def getFetchDay(count):
    """Return the estimated fetch time for ``count`` items, in days.

    Computes ``count * 5.0 / 3600`` (presumably 5 seconds per item,
    converted to hours) and divides the result by 24.
    """
    hours = count * 5.0 / 3600
    return hours / 24


# Driver: fetch the category listing page of every city, add up the
# parenthesised counts found in <span class="Color7"> elements, and report
# how long fetching that many items would take.
urllist = gen(u'购物')
print(len(urllist))
# Named ``total`` rather than ``sum`` so the builtin sum() is not shadowed.
total = 0
for url, cityName in urllist:
    tree = etree.HTML(getHtml(url))
    for hn in tree.xpath("//span[@class='Color7']"):
        # lxml reports ``text`` as None for an element with no leading
        # text; skip those instead of crashing on ``None.replace``.
        if hn.text is None:
            continue
        strnum = hn.text.replace('(', '').replace(')', '')
        print('%s %s' % (cityName, strnum))
        total += int(strnum)
    # NOTE: requests are issued back to back; the per-request delay below
    # is disabled.
    # time.sleep(5)

print(total)
print('fetch time (hour) :' + str(getFetchHour(total)))
print('fetch time (day) :' + str(getFetchDay(total)))

 

posted on 2013-11-13 15:19  雨渐渐  阅读(188)  评论(0)  编辑  收藏  举报

导航