Scraping Douban movies with Scrapy
0. Overview
This example crawls the Douban Top 250 movies and saves selected attributes of each movie to MySQL. The starting URL is https://movie.douban.com/top250.
1. Environment
- Python 3.4
- MySQL
- Scrapy installed for your Python
2. Working directory
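Assuming the project was created with `scrapy startproject douban_movie` (the module name that the imports and settings below reference), the working directory looks roughly like this:

```
douban_movie/
├── scrapy.cfg
└── douban_movie/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── doubanspider.py
```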
3. Code
items.py
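This file defines the item: one `scrapy.Field()` per movie attribute that the spider extracts and the pipeline later writes to MySQL.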
```python
# -*- coding: utf-8 -*-
import scrapy


class DoubanMovieItem(scrapy.Item):
    no = scrapy.Field()             # ranking on the Top 250 list
    movie_name = scrapy.Field()
    director = scrapy.Field()
    writer = scrapy.Field()
    actor = scrapy.Field()
    type = scrapy.Field()           # genre
    region = scrapy.Field()
    language = scrapy.Field()
    date = scrapy.Field()           # release date
    length = scrapy.Field()         # runtime
    another_name = scrapy.Field()   # alternative title
    introduction = scrapy.Field()
    grade = scrapy.Field()          # rating
    comment_times = scrapy.Field()  # number of ratings
```
doubanspider.py
This file is the spider itself. It is where you set the start URL, the crawl rules (`rules`), and the page-parsing logic.
```python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from scrapy.cmdline import execute
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from douban_movie.items import DoubanMovieItem


class DouBanSpider(CrawlSpider):
    name = 'douban_movie'

    # start URL
    start_urls = [
        "https://movie.douban.com/top250"
    ]

    # crawl rules: parse movie detail pages, follow the paginated list pages
    rules = [
        Rule(LinkExtractor(allow=(r"https://movie.douban.com/subject/\d+/?$")), callback="parse_page"),
        Rule(LinkExtractor(allow=(r"https://movie.douban.com/top250")), follow=True)
    ]

    # parse a crawled detail page
    def parse_page(self, response):
        soup = BeautifulSoup(response.body, 'html.parser', from_encoding='utf-8')
        movie_name_tag = soup.find('div', id='content').findChild('h1')
        no = soup.find('span', 'top250-no').get_text()
        movie_name = (movie_name_tag.findChildren()[0].get_text() +
                      movie_name_tag.findChildren()[1].get_text())
        director = soup.find('a', rel='v:directedBy').get_text()
        writer = soup.find('span', text='编剧').next_sibling.next_sibling.text
        actor = '/'.join(star.text for star in soup.findAll('a', rel='v:starring'))
        type = '/'.join(genre.text for genre in soup.findAll('span', property='v:genre'))
        region = soup.find('span', text='制片国家/地区:').next_sibling
        language = soup.find('span', text='语言:').next_sibling
        date = soup.find('span', property='v:initialReleaseDate').text
        length_tag = soup.find('span', property='v:runtime')
        if str(length_tag.next_sibling) != '<br/>':
            length = length_tag.text + str(length_tag.next_sibling)
        else:
            length = length_tag.text
        another_name = soup.find('span', text='又名:').next_sibling
        introduction = soup.find('span', property='v:summary').text
        grade = soup.find('strong', property='v:average').text
        comment_times = soup.find('span', property='v:votes').text

        item = DoubanMovieItem()
        item['no'] = no
        item['movie_name'] = movie_name
        item['director'] = director
        item['writer'] = writer
        item['actor'] = actor
        item['type'] = type
        item['region'] = region
        item['language'] = language
        item['date'] = date
        item['length'] = length
        item['another_name'] = another_name
        item['introduction'] = introduction
        item['grade'] = grade
        item['comment_times'] = comment_times
        return item


if __name__ == '__main__':
    execute('scrapy crawl douban_movie'.split(' '))
```
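If you want to sanity-check the BeautifulSoup lookups used in `parse_page` without hitting Douban, here is a minimal sketch that runs a few of them against a small fabricated HTML fragment. The fragment only mimics the attributes the spider relies on; it is not a real Douban page.

```python
# Minimal sketch: exercise a few of parse_page's BeautifulSoup lookups
# against a fabricated HTML fragment (not a real Douban page).
from bs4 import BeautifulSoup

html = """
<div id="content">
  <h1><span property="v:itemreviewed">Example Movie</span> <span class="year">(1994)</span></h1>
</div>
<span class="top250-no">No.1</span>
<strong property="v:average">9.7</strong>
<span property="v:votes">123456</span>
"""

soup = BeautifulSoup(html, 'html.parser')

# a string as the second positional argument is treated as a CSS class
print(soup.find('span', 'top250-no').get_text())        # No.1

# keyword arguments match tag attributes such as property="..."
print(soup.find('strong', property='v:average').text)   # 9.7
print(soup.find('span', property='v:votes').text)       # 123456

# findChild / findChildren walk into a tag, as parse_page does for the title
h1 = soup.find('div', id='content').findChild('h1')
print(''.join(child.get_text() for child in h1.findChildren()))  # Example Movie(1994)
```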
pipelines.py
This file handles the database work: it saves the items into MySQL. (You must first create the table and its columns in the database; a sketch of the table creation follows the pipeline code below.)
```python
# -*- coding: utf-8 -*-
import pymysql


class DoubanMoviePipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(
            user='root',        # replace with your database user
            password='mysql',   # replace with your password
            host='127.0.0.1',
            db='test',
            charset='utf8'
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            self.cursor.execute(
                """insert into douban_movie(no,movie_name,director,writer,actor,type,region,language,date,length,another_name,introduction,grade,comment_times)
                   values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (
                    item['no'],
                    item['movie_name'],
                    item['director'],
                    item['writer'],
                    item['actor'],
                    item['type'],
                    item['region'],
                    item['language'],
                    item['date'],
                    item['length'],
                    item['another_name'],
                    item['introduction'],
                    item['grade'],
                    item['comment_times']
                )
            )
            self.conn.commit()
        except Exception as e:
            print(e)
        return item
```
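For reference, here is a minimal sketch of the one-off table creation the pipeline expects. The column names match the INSERT above, but the column types are assumptions; adjust them to your own needs.

```python
# One-off helper: create the douban_movie table the pipeline writes to.
# Column types are assumptions, not taken from the original post.
import pymysql

conn = pymysql.connect(user='root', password='mysql', host='127.0.0.1',
                       db='test', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS douban_movie (
            no            VARCHAR(16),
            movie_name    VARCHAR(255),
            director      VARCHAR(255),
            writer        VARCHAR(255),
            actor         TEXT,
            type          VARCHAR(255),
            region        VARCHAR(255),
            language      VARCHAR(255),
            date          VARCHAR(255),
            length        VARCHAR(64),
            another_name  VARCHAR(255),
            introduction  TEXT,
            grade         VARCHAR(16),
            comment_times VARCHAR(32)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()
```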
settings.py
This file is Scrapy's configuration file. It exposes many configurable settings; when you create a Scrapy project, a settings.py full of commented-out key = value pairs is generated automatically. When you need to configure a setting, uncomment its key and fill in the value. The code below shows the key = value pairs this project uses.
```python
# -*- coding: utf-8 -*-

BOT_NAME = 'douban_movie'

SPIDER_MODULES = ['douban_movie.spiders']
NEWSPIDER_MODULE = 'douban_movie.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'Referer': 'https://movie.douban.com/'
}

# Enable the MySQL pipeline (the number is its order, 0-1000, lower runs first)
ITEM_PIPELINES = {
    'douban_movie.pipelines.DoubanMoviePipeline': 300,
}
```