#!/usr/bin/env python
#coding:utf-8
import urllib2, sys, re, os

reload(sys)
sys.setdefaultencoding('utf8')

#url="http://www.dianping.com/search/category/1/20/g122"

def httpCrawler(url):
    #first page
    content = httpRequest(url)
    #other page
    #for pageNo in range(2,50):
    #    content = httpRequest(url)
    shops=parseHtml(content)
    getAllPages(shops)
    unpackOneShop()
    #saveData(shops)
   

def httpRequest(url):
    html = None
    resp = None
    try:
        req_header = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0'
            #'Accept':'text/html;q=0.9,*/*;q=0.8',
            #'Accept-Language':'en-US,en;q=0.5',
            #'Accept-Encoding':'gzip',
            #'Host':'j3.s2.dpfile.com',
            #'Connection':'keep-alive',
            #'Referer':'http://www.baidu.com'
        }
        req_timeout = 5
        req = urllib2.Request(url,None,req_header)
        resp = urllib2.urlopen(req,None,req_timeout)
        html = resp.read().decode('utf-8')
        #print html
    finally:
        if resp:
            resp.close()
    return html

def parseHtml(html):
    content = None
    #shops = re.findall(r'<li class="" >(.+?)</li>',html,re.S)
    shops = re.findall(r'<div class="shop-list J_shop-list shop-all-list" id="shop-all-list">\n<ul>.*?</ul>\n</div>',html,re.S)
    return shops

def getAllPages(shops):
    #total 50 pages
    getEachShop(shops)
    print "################one page done."

t_OneShop = [''] * 15 * 50

def getEachShop(shops):
    global t_OneShop
    t_OneShop=['']*15*50
    t_start=0
    shops_string=''.join(shops)

    i = 0
    t_start = shops_string.find(r'<li class="" >')
    all_end = shops_string.rfind('</li>')
    # each result page lists at most 15 shops; str.find() returns -1 when nothing is left
    while i < 15 and t_start != -1 and all_end != -1:
        t_start = shops_string.find(r'<li class="" >', t_start)
        if t_start == -1:
            break
        t_end = shops_string.find('</li>', t_start, all_end)
        if t_end == -1:
            break
        t_OneShop[i] = shops_string[t_start:t_end]

        t_start = t_end
        i = i + 1


def unpackOneShop():
    global t_OneShop
    
    # make sure the output directory exists before writing the CSV header
    if not os.path.exists('./zhubao'):
        os.mkdir('./zhubao')
    f = open('./zhubao/shops.csv', 'w')
    f.write('\xEF\xBB\xBF')  # UTF-8 BOM so spreadsheet tools read the file as UTF-8
    f.write('名称,地址,人均消费,,,')  # header row: name, address, average spend per person
    f.write('\r\n')
    f.close()
    for i in range(0,15):
        #print t_OneShop[i]
        
        f = open('./zhubao/shops.csv', 'ab+')
       
        ShopName = re.findall(r'<h4>(.*?)</h4>',t_OneShop[i])
    #ShopDistrict = 
        address = re.findall(r'<span class="addr">(.*?)</span>',t_OneShop[i])
        mean_price = re.findall(r'mean-price" target="_blank" >(.*?)</span>',t_OneShop[i],re.S)
        averageComsumption = re.findall(r'<b>(.*?)</b>',''.join(mean_price),re.S)

        print 'mean_price:',mean_price 
        print 'average::',averageComsumption
        ShopName.extend(address)
        ShopName.extend(averageComsumption)

        print (','.join(ShopName)).replace('\n',''),'\r\n'
        f.write((','.join(ShopName)).replace('\n',''))
        f.write('\r\n')
        f.close()
    
    #iprovince = 
    #city =
    #adminDistrict =


def saveData(data):
    if not os.path.exists('./zhubao'):
        os.mkdir(r'./zhubao')
    f = open('./zhubao/zhubao_shops.csv', 'wb')
    f.write(data)
    f.close()


if __name__ == '__main__':
    url="http://www.dianping.com/search/category/1/20/g122"
    httpCrawler(url)


'''
Notes:
- Python 2.6 has no urllib.request.
- For concurrency: multithreading, or gevent.

Basic structure of a crawler system:
1. Network requests.
   The simplest tools are urllib and urllib2, which cover basic downloading. For asynchronous
   fetching you can move to multiple threads; for higher throughput, non-blocking downloads can
   be built on tornado or curl. (A gevent-based sketch follows these notes.)
2. Extracting structured data.
   Finding new links in a page means parsing the page and de-duplicating URLs. Both regexes and
   a DOM parser can do this; use whichever you know better. Regex tends to be faster; DOM is
   slower and a bit more involved, but more convenient when you also want other structure or
   content from the page beyond the URLs. For URL de-duplication, memcache or redis is enough at
   small scale; at large scale you need a Bloom filter. (Sketches of both follow these notes.)
3. Data storage.
   If you crawl little, any storage works. If you crawl a lot and need convenient reads, design
   it properly: hash-partitioned storage on an RDBMS, or HBase directly, depending on data volume
   and the concrete requirements.
'''
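The notes above mention multithreading and gevent as ways to speed up point 1. Below is a minimal sketch of non-blocking downloads with gevent; it assumes gevent is installed, the fetch() helper is made up for illustration, and the page-number URL pattern is only a guess, not the real dianping pagination scheme.

# non-blocking downloads with gevent (sketch, not the script's actual code path)
from gevent import monkey
monkey.patch_all()          # patch sockets so urllib2 yields to other greenlets instead of blocking
import gevent
import urllib2

def fetch(url):
    req = urllib2.Request(url, None, {'User-Agent': 'Mozilla/5.0'})
    resp = urllib2.urlopen(req, None, 5)
    try:
        return resp.read()
    finally:
        resp.close()

# illustrative URLs only; the pagination suffix is an assumption
urls = ['http://www.dianping.com/search/category/1/20/g122p%d' % n for n in range(1, 4)]
jobs = [gevent.spawn(fetch, u) for u in urls]
gevent.joinall(jobs, timeout=30)
pages = [job.value for job in jobs if job.value is not None]
print '%d pages downloaded' % len(pages)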
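For point 2, here is a small sketch contrasting the two link-extraction options mentioned above: a regular expression pass and a DOM-style pass with the standard-library HTMLParser. The sample HTML and the LinkParser class are made up for illustration and are not tied to dianping.com markup.

# regex vs DOM-style link extraction (sketch)
import re
from HTMLParser import HTMLParser

html = '<ul><li><a href="/shop/1">A</a></li><li><a href="/shop/2">B</a></li></ul>'

# regex: fast, but brittle if the markup changes
links_re = re.findall(r'<a\s+href="(.*?)"', html)

# DOM-style: slower, but tolerates attribute reordering and extra whitespace
class LinkParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

parser = LinkParser()
parser.feed(html)
print links_re      # ['/shop/1', '/shop/2']
print parser.links  # ['/shop/1', '/shop/2']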
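When the crawl is too large for memcache or redis to hold every seen URL, a Bloom filter keeps memory bounded at the cost of a small false-positive rate. A minimal sketch follows; the bit-array size and the number of hash functions are illustrative, not tuned for any particular crawl volume.

# Bloom filter for URL de-duplication (sketch)
import hashlib

class BloomFilter(object):
    def __init__(self, size=1 << 20, hashes=4):
        self.size = size
        self.hashes = hashes
        self.bits = bytearray(size // 8 + 1)

    def _positions(self, url):
        # derive several bit positions from md5 of a seeded copy of the URL
        for seed in range(self.hashes):
            digest = hashlib.md5('%d:%s' % (seed, url)).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, url):
        for pos in self._positions(url):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, url):
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(url))

seen = BloomFilter()
seen.add('http://www.dianping.com/shop/1')
print 'http://www.dianping.com/shop/1' in seen  # True
print 'http://www.dianping.com/shop/2' in seen  # almost certainly False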

 
