Python 百度排名查询(半成品)

因为编码问题卡在这里了,还是不知道咋解决。先记录下代码,明天去研究。

#coding=utf-8

import requests
import BeautifulSoup
import re

def createURL(checkWord):
    """Build the Baidu search URL (rn=100) for a keyword phrase.

    Args:
        checkWord: Search phrase whose words are separated by spaces.

    Returns:
        The search URL, with surrounding whitespace removed from the phrase
        and each remaining space replaced by '+'.
    """
    # Strip first: replacing first would turn leading/trailing spaces into
    # '+' characters that strip() can no longer remove.
    query = checkWord.strip().replace(' ', '+')
    return 'http://www.baidu.com/s?wd=%s&rn=100' % query


def getLastURL(rawurl, timeout=10):
    """Fetch ``rawurl`` and return the URL of the final response.

    Args:
        rawurl: URL to request (here, a search-result link).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled server.

    Returns:
        The ``url`` attribute of the response object.

    Raises:
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    r = requests.get(rawurl, timeout=timeout)
    return r.url

def getAtext(atext):
    """Return the inner text of the first ``<a ...>...</a>`` in ``atext``.

    ``<em>`` and ``</em>`` markers inside the anchor are removed.
    Returns None when ``atext`` contains no matching anchor.
    """
    anchor = re.search(r'<a .*?>(.*?)</a>', atext)
    if anchor is None:
        return None

    inner = anchor.group(1)
    for marker in ('<em>', '</em>'):
        inner = inner.replace(marker, '')
    return inner

def getCacheDate(t):
    """Extract the YYYY-M-D date that ends a ``<span class="g">`` element.

    Args:
        t: Markup text of the span.

    Returns:
        The date string (e.g. '2013-7-15'), or None when no date is found.
    """
    # Accept any amount of trailing padding (spaces, &nbsp; entities or raw
    # non-breaking spaces) before </span> instead of exactly two spaces, so
    # small changes in the page markup do not silently drop the date.
    pat = re.compile(
        r'<span class="g">.*?(\d{4}-\d{1,2}-\d{1,2})(?:\s|&nbsp;|\xa0)*</span>')
    found = pat.search(t)
    if found is None:
        return None
    return found.group(1)


def getrank(checkWord):
    """Report where the module-level ``domain`` ranks on Baidu for a keyword.

    Requests the URL built by ``createURL`` and scans every
    ``<table class="result">``. For the first one whose ``<span class="g">``
    text contains ``domain``, it builds the line
    ``keyword , rank , title , cache date , final URL``; if none matches, the
    line is ``>100``. The line is printed and also returned.

    Args:
        checkWord: Search phrase, either a UTF-8 byte string or unicode.

    Returns:
        The reported line as a UTF-8 encoded byte string, so it can be
        written to a plain file without an implicit ASCII encode.
    """
    import sys

    def _text(value):
        # Byte strings are decoded explicitly; everything else (unicode,
        # None, tags) goes through unicode() as a print statement would.
        if isinstance(value, str):
            return value.decode('utf-8', 'replace')
        return unicode(value)

    # unicode(u'...', 'utf-8') raises TypeError, so only decode byte strings.
    if not isinstance(checkWord, unicode):
        checkWord = unicode(checkWord, 'utf-8')

    r = requests.get(createURL(checkWord), allow_redirects=False)
    soup = BeautifulSoup.BeautifulSoup(r.content)

    line = u'>100'
    for result in soup.findAll('table', {'class': 'result'}):
        rescache = result.find('span', {'class': 'g'})
        cacheText = unicode(rescache)
        if domain not in cacheText:  # TODO: switch to a regex match
            continue

        nowRank = result['id']
        resLink = result.find('h3').a
        domainURL = getLastURL(resLink['href'])
        resTitle = getAtext(unicode(resLink))
        cacheDate = getCacheDate(cacheText)

        fields = [checkWord, nowRank, resTitle, cacheDate, domainURL]
        line = u' , '.join(_text(field) for field in fields)
        break

    # Encode explicitly: printing unicode raises UnicodeEncodeError when
    # stdout has no encoding (e.g. when piped) and the text is not ASCII.
    outEncoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    print(line.encode(outEncoding, 'replace'))
    return line.encode('utf-8')




# Domain fragment that getrank() looks for inside each search result.
domain = 'www.douban.com/'


# 'with' closes the file even if the lookup raises part-way through.
with open('r.txt', 'w') as f:
    rankLine = getrank('梦天空 解梦')
    # f.write(None) raises TypeError, so only write an actual string.
    if rankLine is not None:
        # A unicode result holding non-ASCII text cannot be written through
        # the implicit ASCII codec; encode it explicitly.
        if isinstance(rankLine, unicode):
            rankLine = rankLine.encode('utf-8')
        f.write(rankLine)

 

最后放上2个链接供学习:

http://www.crifan.com/python_unicodedecodeerror_codec_can_not_decode_byte_in_position_ordinal_not_in_range/

http://blog.wahahajk.com/2009/08/unicodedecodeerror-ascii-codec-cant.html

posted on 2013-07-16 00:11  alexkh  阅读(290)  评论(0)  编辑  收藏  举报