使用Python抓取美团数据存于Excel中
0.程序是针对美团中的美食部分数据按好评排序采集。
要抓取保存的数据为:
商家名 类型 地理位置 评论人数 均价 最低价格
1.首先编写网页数据采集函数,使用标准库 urllib.request 采集网页源码,具体实现如下:
def getHtml(url, encoding='utf-8'):
    """Download *url* and return its body decoded as text.

    The request is made through a urllib opener configured with a
    desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        encoding: Codec used to decode the response bytes. Defaults to
            ``'utf-8'``, which is what the original code always used.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    headers = (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    )
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Close the response deterministically instead of leaking the
    # underlying connection until garbage collection.
    with opener.open(url) as response:
        htmldata = response.read()
    return htmldata.decode(encoding)
2.根据网页源码解析获取已上线城市的url
class GetCityUrl(HTMLParser):
    """Collect city links from the change-city page.

    Every ``<a>`` tag carrying the attribute pair stored in ``part`` is
    treated as a city link. Its ``href`` becomes a key of ``urldic`` and
    the value is that href with the food-listing path (sorted by rating)
    appended.
    """

    # Attribute (name, value) pair that marks an <a> tag as a city link.
    part = ('gaevent', 'changecity/build')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance mapping; a class-level dict would be shared by
        # every parser and keep results from earlier parses.
        self.urldic = {}

    def handle_starttag(self, tag, attrs):
        """Record the href of each city link."""
        if tag != 'a' or self.part not in attrs:
            return
        for att, value in attrs:
            # HTMLParser reports value-less attributes as None.
            if att == 'href' and value is not None:
                self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        """Return the mapping of city URL to food-listing URL."""
        return self.urldic
3.获取分页url
class GetPages(HTMLParser):
    """Collect pagination links from a listing page.

    Every ``<a>`` whose ``href`` contains ``'page'`` is prefixed with the
    URL given to :meth:`setInitUrl` and stored once, in document order.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state; a class-level list would be shared by every
        # parser, so links found for one city would leak into the next.
        self.pagelist = []
        self.temphref = ''
        self.flg = 0  # unused; kept so existing attribute access still works
        self.initurl = ''

    def setInitUrl(self, url):
        """Set the prefix prepended to every collected href."""
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        """Store each new pagination link found on an <a> tag."""
        if tag != 'a':
            return
        for attr, value in attrs:
            # HTMLParser reports value-less attributes as None.
            if attr == 'href' and value is not None and 'page' in value:
                self.temphref = self.initurl + value
                if self.temphref not in self.pagelist:
                    self.pagelist.append(self.temphref)

    def getList(self):
        """Return the collected page URLs."""
        return self.pagelist
4.解析网页源码 获取有效信息
class MyHTMLParser(HTMLParser):
    """Extract one tab-separated record per merchant tile.

    Text found inside a tile is accumulated in ``tempstr`` as
    tab-separated fields. When the tile is considered closed, a record
    with six fields is appended to the module-level list ``arraystr``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tempstr = ''  # fields of the tile currently being read
        self.divsum = 0    # closing </div> tags seen since the tile began

    def handle_starttag(self, tag, attrs):
        """Start a fresh record when a merchant tile <div> opens."""
        if tag != 'div':
            return
        for attr, value in attrs:
            # HTMLParser reports value-less attributes as None.
            if attr == 'class' and value and 'poi-tile-nodeal' in value:
                self.tempstr = ''
                self.divsum = 0

    def handle_data(self, data):
        """Append one text node to the record being built."""
        # No manual entity replacement is needed: HTMLParser decodes
        # character references itself (convert_charrefs defaults to True).
        if not data or data.isspace():
            return
        if data == '¥':
            # No '¥' so far means there was no average price; insert a
            # placeholder field so the columns stay aligned.
            if '¥' not in self.tempstr:
                self.tempstr += '无' + '\t'
            self.tempstr += data
        elif data == '人评价':
            # Glue the suffix onto the preceding number (drop its tab).
            self.tempstr = self.tempstr[:-1] + data + '\t'
        elif data == '人均 ':
            self.tempstr += '人均'
        elif data.startswith('起'):
            # Glue the suffix onto the preceding price (drop its tab).
            self.tempstr = self.tempstr[:-1] + '起'
        else:
            self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        """Store the record once the sixth closing </div> is reached."""
        if tag != 'div':
            return
        self.divsum += 1
        # NOTE(review): six closing divs presumably mark the end of one
        # tile in the page layout this was written for; confirm.
        if self.divsum != 6:
            return
        if '¥' in self.tempstr:
            fields = self.tempstr.split('\t')
            if len(fields) == 5:
                # Location is missing: insert a placeholder as the third
                # field. Joining without a trailing tab keeps the record
                # at six fields so it passes the check below.
                fields.insert(2, '无位置信息')
            if len(fields) == 6:
                arraystr.append('\t'.join(fields))
        self.divsum = 0
        self.tempstr = ''
5.将信息存放于Excel中
def SaveExcel(listdata, filename='test.xls'):
    """Write the collected records to an Excel workbook.

    Row 0 holds the column titles. Each later row holds one record, split
    on tab characters into consecutive cells.

    Args:
        listdata: Iterable of tab-separated record strings.
        filename: Path of the workbook to write. Defaults to
            ``'test.xls'``, the path the original code always used.
    """
    head = ['商家名', '类型', '地理位置', '评论人数', '均价', '最低价格']
    wbk = xlwt.Workbook()
    sheet1 = wbk.add_sheet("sheet1")
    for col, title in enumerate(head):
        sheet1.write(0, col, title)
    # Data rows start at 1 because row 0 is the header.
    for row, record in enumerate(listdata, start=1):
        for col, cell in enumerate(record.split('\t')):
            sheet1.write(row, col, cell)
    wbk.save(filename)
# Article note (originally "以下是Excel中的数据:"): the resulting Excel data is shown below.
附录完整代码:
# encoding: utf-8
'''
Created on 2016-07-22
python version 3.5
@author: baalhuo
'''
from html.parser import HTMLParser
import itertools
import re
import time
import urllib.request

# Page listing every city the site is available in.
CITY_INDEX_URL = 'http://www.meituan.com/index/changecity/initiative'

# Collected merchant records, one tab-separated string per merchant.
arraystr = []


# Parse page source and extract the useful fields.
class MyHTMLParser(HTMLParser):
    """Extract one tab-separated record per merchant tile.

    Text found inside a tile is accumulated in ``tempstr`` as
    tab-separated fields. When the tile is considered closed, a record
    with six fields is appended to the module-level list ``arraystr``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tempstr = ''  # fields of the tile currently being read
        self.divsum = 0    # closing </div> tags seen since the tile began

    def handle_starttag(self, tag, attrs):
        """Start a fresh record when a merchant tile <div> opens."""
        if tag != 'div':
            return
        for attr, value in attrs:
            # HTMLParser reports value-less attributes as None.
            if attr == 'class' and value and 'poi-tile-nodeal' in value:
                self.tempstr = ''
                self.divsum = 0

    def handle_data(self, data):
        """Append one text node to the record being built."""
        # No manual entity replacement is needed: HTMLParser decodes
        # character references itself (convert_charrefs defaults to True).
        if not data or data.isspace():
            return
        if data == '¥':
            # No '¥' so far means there was no average price; insert a
            # placeholder field so the columns stay aligned.
            if '¥' not in self.tempstr:
                self.tempstr += '无' + '\t'
            self.tempstr += data
        elif data == '人评价':
            # Glue the suffix onto the preceding number (drop its tab).
            self.tempstr = self.tempstr[:-1] + data + '\t'
        elif data == '人均 ':
            self.tempstr += '人均'
        elif data.startswith('起'):
            # Glue the suffix onto the preceding price (drop its tab).
            self.tempstr = self.tempstr[:-1] + '起'
        else:
            self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        """Store the record once the sixth closing </div> is reached."""
        if tag != 'div':
            return
        self.divsum += 1
        # NOTE(review): six closing divs presumably mark the end of one
        # tile in the page layout this was written for; confirm.
        if self.divsum != 6:
            return
        if '¥' in self.tempstr:
            fields = self.tempstr.split('\t')
            if len(fields) == 5:
                # Location is missing: insert a placeholder as the third
                # field. Joining without a trailing tab keeps the record
                # at six fields so it passes the check below.
                fields.insert(2, '无位置信息')
            if len(fields) == 6:
                arraystr.append('\t'.join(fields))
        self.divsum = 0
        self.tempstr = ''


# Collect the URLs of the cities the site is available in
# (844 cities/regions when this was written).
class GetCityUrl(HTMLParser):
    """Collect city links from the change-city page.

    Every ``<a>`` tag carrying the attribute pair stored in ``part`` is
    treated as a city link. Its ``href`` becomes a key of ``urldic`` and
    the value is that href with the food-listing path (sorted by rating)
    appended.
    """

    # Attribute (name, value) pair that marks an <a> tag as a city link.
    part = ('gaevent', 'changecity/build')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance mapping; a class-level dict would be shared by
        # every parser and keep results from earlier parses.
        self.urldic = {}

    def handle_starttag(self, tag, attrs):
        """Record the href of each city link."""
        if tag != 'a' or self.part not in attrs:
            return
        for att, value in attrs:
            # HTMLParser reports value-less attributes as None.
            if att == 'href' and value is not None:
                self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        """Return the mapping of city URL to food-listing URL."""
        return self.urldic


# Collect pagination URLs.
class GetPages(HTMLParser):
    """Collect pagination links from a listing page.

    Every ``<a>`` whose ``href`` contains ``'page'`` is prefixed with the
    URL given to :meth:`setInitUrl` and stored once, in document order.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state; a class-level list would be shared by every
        # parser, so links found for one city would be fetched again for
        # each following city.
        self.pagelist = []
        self.temphref = ''
        self.flg = 0  # unused; kept so existing attribute access still works
        self.initurl = ''

    def setInitUrl(self, url):
        """Set the prefix prepended to every collected href."""
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        """Store each new pagination link found on an <a> tag."""
        if tag != 'a':
            return
        for attr, value in attrs:
            # HTMLParser reports value-less attributes as None.
            if attr == 'href' and value is not None and 'page' in value:
                self.temphref = self.initurl + value
                if self.temphref not in self.pagelist:
                    self.pagelist.append(self.temphref)

    def getList(self):
        """Return the collected page URLs."""
        return self.pagelist


# Fetch page source.
def getHtml(url, encoding='utf-8'):
    """Download *url* and return its body decoded as text.

    The request is made through a urllib opener configured with a
    desktop-browser ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        encoding: Codec used to decode the response bytes.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    headers = (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    )
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Close the response deterministically instead of leaking the
    # underlying connection until garbage collection.
    with opener.open(url) as response:
        htmldata = response.read()
    return htmldata.decode(encoding)


# Save the records to Excel.
def SaveExcel(listdata, filename='e:/test3.xls'):
    """Write the collected records to an Excel workbook.

    Row 0 holds the column titles. Each later row holds one record, split
    on tab characters into consecutive cells.

    Args:
        listdata: Iterable of tab-separated record strings.
        filename: Path of the workbook to write. Defaults to
            ``'e:/test3.xls'``, the path the original code always used.
    """
    # Imported here so the parsers remain importable when the third-party
    # xlwt package is not installed; it is only needed for the export.
    import xlwt

    head = ['商家名', '类型', '地理位置', '评论人数', '均价', '最低价格']
    wbk = xlwt.Workbook()
    sheet1 = wbk.add_sheet("sheet1")
    for col, title in enumerate(head):
        sheet1.write(0, col, title)
    # Data rows start at 1 because row 0 is the header.
    for row, record in enumerate(listdata, start=1):
        for col, cell in enumerate(record.split('\t')):
            sheet1.write(row, col, cell)
    wbk.save(filename)


def main(max_cities=4):
    """Scrape the food listings of the first *max_cities* cities.

    For each city, the first listing page and every pagination link found
    on it are parsed. A ``'切换地区 '`` marker row is appended after each
    city, and everything is written out with :func:`SaveExcel`.

    Args:
        max_cities: Number of cities to process; ``None`` means all.
            The original script stopped after four.
    """
    city_parser = GetCityUrl()
    city_parser.feed(getHtml(CITY_INDEX_URL))
    urldic = city_parser.getUrl()

    par = MyHTMLParser()
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    for url, listing_url in itertools.islice(urldic.items(), max_cities):
        data = getHtml(listing_url)
        getpage = GetPages()
        getpage.setInitUrl(url)
        getpage.feed(data)
        par.feed(data)
        for urltemp in getpage.getList():
            par.feed(getHtml(urltemp))
        # Marker row separating one city's records from the next.
        arraystr.append('切换地区 ')
    SaveExcel(arraystr)
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    print('Done')


# Run the scrape only when executed as a script, so importing this module
# does not trigger network access or file writes.
if __name__ == '__main__':
    main()
学之,以记之。