Crawler Examples: 每日一文 and Douban Movies
This post walks through two crawlers: one scrapes text and images from a static site, and the other fetches movies and related information from a dynamically loaded one.
1. 每日一文 (http://voice.meiriyiwen.com/)
The archive pages at /voice/past are plain server-rendered HTML, so `requests` plus an lxml XPath pass is enough to list the title and author of every article on the first 10 pages:

```python
# coding=utf-8
# Crawl the first 10 archive pages of 每日一文 and list each article's title and author.
import time

import requests
from lxml import etree

tmpt_url = 'http://voice.meiriyiwen.com/voice/past?page=%d'
urllist = [tmpt_url % i for i in range(1, 11)]

HEADERS = {
    'Host': 'voice.meiriyiwen.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/54.0.2840.71 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
# proxies = {"http": "dev-proxy.oa.com:8080", "https": "dev-proxy.oa.com:8080"}


def get_url():
    for url in urllist:
        try:
            time.sleep(0.5)  # throttle: half a second between pages
            response = requests.get(url, headers=HEADERS, timeout=10)
            print(response.status_code)
            get_info(response)
        except requests.RequestException as e:
            print(e)


def get_info(response):
    global count
    tree = etree.HTML(response.content)
    for block in tree.xpath('//*[@class="img_list"]'):
        # ".//" keeps each search relative to the current block
        titles = block.xpath('.//*[@class="list_author"]/a/text()')
        authors = block.xpath('.//*[@class="author_name"]/text()')
        for title, author in zip(titles, authors):
            count += 1
            print(count, '|', title.replace(u'\xa0', '').strip(),
                  '|', author.replace(u'\xa0', '').strip())


if __name__ == '__main__':
    count = 0
    get_url()
```
Run output: (console screenshot omitted)
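The intro mentioned images as well, but the listing above only prints titles and authors. If you want the pictures too, something like the sketch below could be dropped in. Note that `download_images` is a hypothetical helper, and the catch-all `//img/@src` XPath plus the `meiriyiwen_imgs` directory name are assumptions rather than selectors taken from the real page markup:

```python
import os
from urllib.parse import urljoin

import requests
from lxml import etree


def download_images(response, out_dir='meiriyiwen_imgs'):
    """Save every <img> found on an already-fetched page to out_dir.

    ASSUMPTION: grabbing all <img src> attributes is good enough here;
    the real page may need a narrower XPath.
    """
    os.makedirs(out_dir, exist_ok=True)
    tree = etree.HTML(response.content)
    for idx, src in enumerate(tree.xpath('//img/@src')):
        img_url = urljoin(response.url, src)  # resolve relative URLs
        try:
            img = requests.get(img_url, timeout=10)
            img.raise_for_status()
        except requests.RequestException:
            continue  # skip images that fail to download
        name = os.path.basename(img_url).split('?')[0]
        with open(os.path.join(out_dir, '%03d_%s' % (idx, name)), 'wb') as f:
            f.write(img.content)
```

Calling `download_images(response)` right after `get_info(response)` inside the loop would save each page's images alongside the printed listing.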
2. Douban Movies (https://movie.douban.com)
The Douban movie page fills itself in with JavaScript, so scraping its HTML directly gets you little; instead, the crawler hits the JSON endpoint behind it (`/j/search_subjects`) once per tag and decodes the response:

```python
# coding=utf-8
# Query Douban's movie-suggestion JSON API for each tag and print the results.
import time

import requests

API_URL = 'https://movie.douban.com/j/search_subjects'
TAGS = ('热门', '最新', '经典', '豆瓣高分', '冷门佳片', '华语', '欧美', '韩国',
        '日本', '动作', '喜剧', '爱情', '科幻', '悬疑', '恐怖', '文艺')

HEADERS = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/54.0.2840.71 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}


def get_url():
    for tag in TAGS:
        params = {
            'type': 'movie',
            'tag': tag,  # requests URL-encodes the Chinese tag for us
            'sort': 'recommend',
            'page_limit': 500,
            'page_start': 0,
        }
        try:
            time.sleep(0.5)  # throttle so we don't hammer the API
            response = requests.get(API_URL, headers=HEADERS,
                                    params=params, timeout=10)
            print('Movie tag:', tag)
            get_info(response)
        except requests.RequestException as e:
            print(e)


def get_info(response):
    global count
    for item in response.json()['subjects']:
        count += 1
        print(count, '| url:', item['url'],
              '| title:', item['title'], '| rating:', item['rate'])


if __name__ == '__main__':
    count = 0
    get_url()
```
Run output: (console screenshot omitted)
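The original script imports `pymongo` without ever using it, so presumably the plan was to store the movie records rather than just print them. Below is a minimal sketch of that step, assuming a local MongoDB on the default port; the `douban`/`movies` database and collection names are made up for this example, and it relies on the payload's `id` field as a natural primary key:

```python
import pymongo
import requests

# ASSUMPTION: MongoDB is listening on localhost:27017; the database and
# collection names below are invented for this sketch.
client = pymongo.MongoClient('mongodb://localhost:27017/')
movies = client['douban']['movies']


def save_info(response, tag):
    for item in response.json()['subjects']:
        # Upsert keyed on the Douban id so re-running the crawler
        # updates existing rows instead of duplicating them.
        movies.update_one(
            {'_id': item['id']},
            {'$set': {'title': item['title'],
                      'rate': item['rate'],
                      'url': item['url'],
                      'tag': tag}},
            upsert=True,
        )
```

Swapping `get_info(response)` for `save_info(response, tag)` in the loop persists everything; the records can then be queried back with, e.g., `movies.find({'tag': '科幻'})`.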