python爬虫实例
这里有两个爬虫实例,是刚开始学 Python 时用的:一个爬取京东茅台酒的商品评论,另一个爬取新浪网的国内新闻。两个例子都来自网上的教程,代码与原教程略有不同,供参考学习。
两个脚本都可以在 Anaconda 环境里运行。
# Sina domestic-news scraper.
#
# Walks the rolling-news list API page by page, fetches every linked article
# together with its comment count, and saves all collected records to
# news.xlsx.
import json
import re
from datetime import datetime

import pandas
import requests
from bs4 import BeautifulSoup

# One dict per scraped article, accumulated across all list pages.
news_total = []
# Comment endpoint; "{}" is replaced with the article id taken from its URL.
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
# Rolling-news list endpoint; "{}" is replaced with the page number.
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1509373193047'
# Seconds to wait for a single HTTP response; without a timeout a stalled
# connection would block the whole crawl forever.
REQUEST_TIMEOUT = 10
# Label that precedes the editor name inside the ".article-editor" element.
EDITOR_LABEL = '责任编辑'


def _extract_json_object(text):
    """Return the outermost ``{...}`` span of *text*.

    The endpoints wrap their JSON payload (``newsloadercallback(...);`` and
    ``var data=...``).  ``str.strip``/``lstrip``/``rstrip`` treat their
    argument as a *set of characters*, not as a prefix or suffix, so they are
    the wrong tool for removing such wrappers; slicing between the first
    ``{`` and the last ``}`` is explicit instead.

    If no such span exists the text is returned unchanged so that
    ``json.loads`` reports the malformed payload.
    """
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end < start:
        return text
    return text[start:end + 1]


def _strip_editor_label(text):
    """Remove the leading editor label from *text* and trim whitespace.

    Only the label prefix and the colon that follows it are removed, so a
    name that itself begins or ends with one of the label's characters is
    left intact.
    """
    text = text.strip()
    if text.startswith(EDITOR_LABEL):
        # Accept both the ASCII and the full-width colon after the label.
        text = text[len(EDITOR_LABEL):].lstrip(':\uff1a')
    return text.strip()


def parseListLinks(url):
    """Fetch one page of the list API and scrape every article it links to.

    Args:
        url: Fully formatted list-API URL for one page.

    Returns:
        A list with one detail dict (see ``getNewsDetail``) per entry found
        under ``result.data`` in the response.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    jd = json.loads(_extract_json_object(res.text))
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]


def getNewsDetail(newsurl):
    """Download one article page and extract its fields.

    Args:
        newsurl: URL of the article page.

    Returns:
        A dict with the keys ``title``, ``newssource``, ``dt``
        (``YYYY-MM-DD-HH:MM``), ``article``, ``editor`` and ``comments``.

    Raises:
        IndexError: If an expected element is missing from the page.
        ValueError: If the timestamp does not match the expected format or
            the URL carries no recognisable article id.
    """
    result = {}
    res = requests.get(newsurl, timeout=REQUEST_TIMEOUT)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text

    # The first child of ".time-source" is the raw timestamp text.
    timesource = soup.select('.time-source')[0].contents[0].strip()
    dt1 = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['dt'] = dt1.strftime('%Y-%m-%d-%H:%M')

    # The last <p> is skipped (presumably the editor line, which is read
    # separately below).
    paragraphs = soup.select('#artibody p')[:-1]
    result['article'] = ' '.join(p.text.strip() for p in paragraphs)

    result['editor'] = _strip_editor_label(
        soup.select('.article-editor')[0].text)
    result['comments'] = getCommentCounts(newsurl)
    print('获得一条新闻')
    return result


def getCommentCounts(newsurl):
    """Return the total comment count reported for the article at *newsurl*.

    The article id is the part of the URL between ``doc-i`` and ``.shtml``.

    Raises:
        ValueError: If the URL does not contain such an id.
    """
    m = re.search(r'doc-i(.+)\.shtml', newsurl)
    if m is None:
        raise ValueError(
            'cannot find a news id in URL: {!r}'.format(newsurl))
    newsid = m.group(1)
    comments = requests.get(commentURL.format(newsid),
                            timeout=REQUEST_TIMEOUT)
    jd = json.loads(_extract_json_object(comments.text))
    return jd['result']['count']['total']


# Crawl list pages 1..7 and collect every article found on them.
for i in range(1, 8):
    print('正在爬取第' + str(i) + '页......')
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
print('抓取结束')

# One spreadsheet row per article.
df = pandas.DataFrame(news_total)
df.to_excel('news.xlsx')
# JD product-comment scraper.
#
# Downloads pages of comments for one product and stores selected fields of
# every comment in result.xls.
import requests
import re
import json
import time
import xlwt

# Cell style using the SimSun font.
# NOTE(review): this style is built but never passed to ws.write(), so it has
# no effect on the saved file as the code stands; Chinese text is handled by
# the workbook's utf-8 encoding below.
style = xlwt.XFStyle()
font = xlwt.Font()
font.name = 'SimSun'
style.font = font

# Create the workbook and its single sheet.
w = xlwt.Workbook(encoding='utf-8')
ws = w.add_sheet('sheet 1', cell_overwrite_ok=True)
# Index of the next sheet row to be written (row 0 holds the header).
row = 1

# Comment fields to export, in column order.  The same names are used as the
# header row and as the keys looked up in each comment.
COLUMNS = (
    'content',
    'userClientShow',
    'creationTime',
    'userLevelName',
    'productColor',
    'userLevelId',
    'score',
    'referenceName',
    'referenceTime',
    'isMobile',
    'nickname',
)
# Comment-list endpoint; "%d" is replaced with the page number.
PAGE_URL = 'https://club.jd.com/comment/productPageComments.action?productId=1475512465&score=0&sortType=5&page=%d&pageSize=100&isShadowSku=0&fold='
# Seconds to wait for a single HTTP response; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

# Write the header row.
for _col, _name in enumerate(COLUMNS):
    ws.write(0, _col, _name)


def write_json_to_xls(dat):
    """Append every comment of one response page to the sheet.

    Args:
        dat: Decoded JSON object of one page; its ``'comments'`` entry is
            iterated and each item is written as one sheet row.

    A field that is absent from a comment is written as an empty cell, so a
    single incomplete comment no longer aborts the rest of the page and
    leaves a half-written row behind.

    Side effects:
        Advances the module-level ``row`` counter by one per comment.
    """
    global row
    for comment in dat['comments']:
        for col, field in enumerate(COLUMNS):
            ws.write(row, col, comment.get(field, ''))
        row += 1


# Fetch pages 1..10.  A failing page is reported and skipped so the
# remaining pages are still attempted.
for i in range(1, 10 + 1):
    url = PAGE_URL % i
    try:
        json_req = requests.get(url, timeout=REQUEST_TIMEOUT)
        dat = json_req.json()
        write_json_to_xls(dat)
        print(u'写入一页数据')
    except Exception as e:
        print(u'获取数据失败数据', e)
    # Short pause between requests.
    time.sleep(0.5)

# Persist everything collected so far.
w.save('result.xls')