简单的大众点评爬虫

一个很简单的爬虫，爬取中山大学周边地点的点评信息。

# -*- coding: utf-8 -*-
import requests
import re
import time

def placeSplider(name, star, url):
    time.sleep(5)
    res = requests.get('http://www.dianping.com'+url)
    text = res.text
    longInfo = "<p class=\"desc J-desc\">(.*?)</p>"
    longInfo_re = re.compile(longInfo, re.DOTALL)
    longInfos = longInfo_re.findall(text)
    
    info = "sml-rank-stars sml-str(.*?)\".*?<p class=\"desc\">(.*?)</p>"
    info_re = re.compile(info, re.DOTALL)
    results = info_re.findall(text)
    #print result
    #print '%d results' %len(results)
    if len(results) == 0 or len(results[0]) < 2 or results[0][1].count(u'人点评') > 0:
        print u'没有点评\n'
        return
    fOut = open('D:\\%s.txt' %name, 'w')
    fOut.write('place star %s\n' %star)
    for result in results:
        star = result[0]
        info = result[1]
        if info.count('<span') > 0 or info.count(u'仅售')>0:#去广告
            print ''
            break
        else:
            if info[-6:] == u"......":#替换短评论为相应的长评论
                info = info[:-6]
                for i in longInfos:
                    if i.count(info) > 0:
                        info = i
                        break
            info = info.replace("<br/>", '')
            info = info.replace("<br>", '')
            info = info.replace("&nbsp;", '')
            print star, info
            fOut.write('%s\n' %star)
            fOut.write('%s\n' %info.encode('u8'))
    fOut.close()

for page in range(1, 6):
    res = requests.get('http://www.dianping.com/search/keyword/206/0_%E4%B8%AD%E5%B1%B1%E5%A4%A7%E5%AD%A6/p'+str(page))
    text = res.text
    href = "data-hippo-type=\"shop\" title=\"(.*?)\" target=\"_blank\" href=\"(.*?)\".*?sml-rank-stars sml-str(.*?)\""
    href_re = re.compile(href, re.DOTALL)
    result =  href_re.findall(text)
    for place in result:
        name = place[0]
        url = place[1]
        star = place[2]
        print name, star, url
        placeSplider(name, star, url)
    time.sleep(5)

 

posted @ 2014-12-12 20:53  KevinHwang  阅读(1064)  评论(1)  编辑  收藏  举报