Scraping Douban movies with Scrapy

0. Overview

  This example crawls the Douban Top 250 movies and saves selected attributes of each movie to MySQL. The listing URL is: https://movie.douban.com/top250

1. Environment

  • Python 3.4
  • MySQL
  • Scrapy installed for Python (the code also needs two more packages; see the install command after this list)
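
  The spider and pipeline in section 3 also import BeautifulSoup and pymysql, which the environment list above omits. On a fresh machine the dependencies could be installed with something like (the usual PyPI package names, an assumption since the original does not list them):

pip install scrapy beautifulsoup4 pymysql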

2. Project layout

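  The project uses the standard layout generated by scrapy startproject douban_movie, with the spider file added under spiders/ (a reconstruction; the file names are inferred from the imports and settings shown below):

douban_movie/
├── scrapy.cfg
└── douban_movie/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── doubanspider.py
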
3. Code

  items.py

# -*- coding: utf-8 -*-
import scrapy
class DoubanMovieItem(scrapy.Item):
    no = scrapy.Field()
    movie_name = scrapy.Field()
    director = scrapy.Field()
    writer = scrapy.Field()
    actor = scrapy.Field()
    type = scrapy.Field()
    region = scrapy.Field()
    language = scrapy.Field()
    date = scrapy.Field()
    length = scrapy.Field()
    another_name = scrapy.Field()
    introduction = scrapy.Field()
    grade = scrapy.Field()
    comment_times = scrapy.Field()


  doubanspider.py 

  This file is the spider itself: it defines the start URL, the crawl rules (rules), and the page-parsing logic.

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from scrapy.cmdline import execute
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from douban_movie.items import DoubanMovieItem
class DouBanSpider(CrawlSpider):
    name = 'douban_movie'
    # start URL
    start_urls = [
                  "https://movie.douban.com/top250"
                  ]
    # crawl rules
    rules = [
             # movie detail pages, e.g. https://movie.douban.com/subject/1292052/ -> parse_page
             Rule(LinkExtractor(allow=(r"https://movie.douban.com/subject/\d+/?$")), callback="parse_page"),
             # Top 250 listing pages (including paginated ones); follow them to discover more links
             Rule(LinkExtractor(allow=(r"https://movie.douban.com/top250")), follow=True)
             ]
    # parse one crawled movie detail page
    def parse_page(self, response):
        soup = BeautifulSoup(response.body, 'html.parser', from_encoding='utf-8')
        movie_name_tag = soup.find('div', id='content').findChild('h1')
        no = soup.find('span', 'top250-no').get_text()   # ranking badge, e.g. "No.1"
        movie_name = movie_name_tag.findChildren()[0].get_text() + movie_name_tag.findChildren()[1].get_text()
        director = soup.find('a', rel='v:directedBy').get_text()
        # the labels below ('编剧', '制片国家/地区:', '语言:', '又名:') are the literal
        # Chinese field names on the Douban page, so they must stay in Chinese
        writer = soup.find('span', text='编剧').next_sibling.next_sibling.text
        actor = '/'.join(star.text for star in soup.findAll('a', rel='v:starring'))
        movie_type = '/'.join(genre.text for genre in soup.findAll('span', property='v:genre'))
        region = soup.find('span', text='制片国家/地区:').next_sibling
        language = soup.find('span', text='语言:').next_sibling
        date = soup.find('span',property = 'v:initialReleaseDate').text
        length_tag = soup.find('span', property='v:runtime')
        # some entries list extra runtimes after the <span>; append them if present
        if str(length_tag.next_sibling) != '<br/>':
            length = length_tag.text+str(length_tag.next_sibling)
        else:
            length = length_tag.text
        another_name = soup.find('span',text='又名:').next_sibling
        introduction = soup.find('span',property='v:summary').text
        grade= soup.find('strong',property='v:average').text
        comment_times=soup.find('span',property='v:votes').text
        
        item = DoubanMovieItem()
        item['no']=no
        item['movie_name']=movie_name
        item['director']=director
        item['writer']=writer
        item['actor']=actor
        item['type'] = movie_type
        item['region']=region
        item['language']=language
        item['date']=date
        item['length']=length
        item['another_name']=another_name
        item['introduction']=introduction
        item['grade']=grade
        item['comment_times']=comment_times
        
        return item
    
if __name__ == '__main__':
    # run the spider directly, equivalent to "scrapy crawl douban_movie" on the command line
    execute('scrapy crawl douban_movie'.split(' '))
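
  The two rules split the work: detail pages matched by the first pattern are sent to parse_page, while listing pages matched by the second are only followed to discover more links. A quick sanity check of the two patterns (the URLs are illustrative examples, assumed typical of the site):

import re

subject_re = r"https://movie.douban.com/subject/\d+/?$"
listing_re = r"https://movie.douban.com/top250"

# a movie detail page -> handled by parse_page
assert re.match(subject_re, "https://movie.douban.com/subject/1292052/")
# a paginated listing page -> followed for more links
assert re.search(listing_re, "https://movie.douban.com/top250?start=25&filter=")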

  pipelines.py

  This file does the database work: it saves each item to MySQL. (You must first create the table and its columns in the database; a possible schema is sketched below.)
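
  A possible schema, matching the column names used in the INSERT statement below (the column types are an assumption; everything the spider extracts is a string):

CREATE TABLE douban_movie (
    `no`            VARCHAR(32),
    `movie_name`    VARCHAR(255),
    `director`      VARCHAR(255),
    `writer`        VARCHAR(255),
    `actor`         TEXT,
    `type`          VARCHAR(255),
    `region`        VARCHAR(255),
    `language`      VARCHAR(255),
    `date`          VARCHAR(255),
    `length`        VARCHAR(64),
    `another_name`  TEXT,
    `introduction`  TEXT,
    `grade`         VARCHAR(16),
    `comment_times` VARCHAR(32)
) DEFAULT CHARSET=utf8;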

# -*- coding: utf-8 -*-
import pymysql

class DoubanMoviePipeline(object):

    def __init__(self):
        # open the MySQL connection once when the pipeline is created
        self.conn = pymysql.connect(
                                  user='root',          # replace with your database user
                                  password='mysql',     # replace with your password
                                  host='127.0.0.1',
                                  db='test',
                                  charset='utf8'
                                  )
        self.cursor = self.conn.cursor()
    def process_item(self, item, spider):
        # insert one row per movie; log and skip the item on failure
        try:
            self.cursor.execute(
                                    """insert into douban_movie(no,movie_name,director,writer,actor,type,region,language,date,length,another_name,introduction,grade,comment_times) 
                                    values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                                    (
                                          item['no'],
                                          item['movie_name'],
                                          item['director'],
                                          item['writer'],
                                          item['actor'],
                                          item['type'],
                                          item['region'],
                                          item['language'],
                                          item['date'],
                                          item['length'],
                                          item['another_name'],
                                          item['introduction'],
                                          item['grade'],
                                          item['comment_times']
                                     )
            )
            self.conn.commit()       
        except Exception as e:
            print(e)
        return item
            
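  The pipeline opens a connection but never closes it. Scrapy calls close_spider() on pipeline components when the spider finishes, so a method along these lines (a sketch, not in the original code) could be appended to the class:

    def close_spider(self, spider):
        # called once when the spider closes; release the MySQL resources
        self.cursor.close()
        self.conn.close()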

  settings.py

  This is Scrapy's configuration file. It supports many settings; when you create a Scrapy project, a settings.py full of commented-out key = value pairs is generated automatically. To configure a setting, uncomment its key and fill in the value you need. The pairs below are the ones this project uses.

# -*- coding: utf-8 -*-

BOT_NAME = 'douban_movie'

SPIDER_MODULES = ['douban_movie.spiders']
NEWSPIDER_MODULE = 'douban_movie.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko'

# Do not obey robots.txt rules
ROBOTSTXT_OBEY = False

# Default request headers (sent with every request)
DEFAULT_REQUEST_HEADERS = {
                           'Referer': 'https://movie.douban.com/'
}

ITEM_PIPELINES = {
   'douban_movie.pipelines.DoubanMoviePipeline': 300,
}
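
  One optional addition (not in the original settings): Douban tends to block clients that request pages too quickly, and Scrapy's built-in DOWNLOAD_DELAY setting spaces requests out:

# wait (roughly) this many seconds between requests to the same site
DOWNLOAD_DELAY = 2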
