scrapy入门

1.安装环境

http://pan.baidu.com/s/1bnAKBSz

修改环境变量 Path，添加 c:\python27;c:\python27\Scripts;

2.如需使用mysql等数据库请自行安装

3.安装好环境后，进入命令行，进入到工作目录，使用以下命令创建工程(工程名以doubanmoive为例)

scrapy startproject doubanmoive

4.按照下文 demo 对项目文件做一系列修改后，使用以下命令运行项目

scrapy crawl doubanmoive

5.scrapy的一些注意事项

(1)步骤3完成并在 spiders 目录下添加爬虫文件 moive_spider.py 以后，项目的目录应该是这样的（根目录各人不同）

D:\WEB\Python\doubanmoive>tree /f
Folder PATH listing for volume Data
Volume serial number is 00000200 34EC:9CB9
D:.
│  scrapy.cfg
│
└─doubanmoive
    │  items.py
    │  pipelines.py
    │  settings.py
    │  __init__.py
    │
    └─spiders
            __init__.py
            moive_spider.py
            moive_spider.pyc

(2)这些文件主要功能为：

doubanmoive/items.py：定义需要获取的内容字段，类似于实体类。
doubanmoive/pipelines.py：项目管道文件，用来处理Spider抓取的数据。
doubanmoive/settings.py：项目配置文件
doubanmoive/spiders/：放置spider的目录，本例的爬虫代码位于其中的 moive_spider.py

(3)demo

doubanmoive/items.py

from scrapy.item import Item, Field

class DoubanmoiveItem(Item):
    """Container for the fields scraped from one Douban movie page."""

    name = Field()            # movie title
    year = Field()            # release year
    score = Field()           # Douban rating
    director = Field()        # director
    classification = Field()  # genre(s)
    actor = Field()           # cast
    img = Field()             # still / poster image URL(s)

doubanmoive/pipelines.py

# -*- coding: utf-8 -*-
from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
import urllib

import MySQLdb
import MySQLdb.cursors


class DoubanmoivePipeline(object):
    """Store scraped movie items in MySQL and save each poster to disk.

    Database access goes through a Twisted ``adbapi`` connection pool:
    ``process_item`` only schedules the insert and hands the item on, and
    any failure of the scheduled work is reported by ``handle_error``.
    """

    def __init__(self, db='python', user='root', passwd='root'):
        """Create the MySQL connection pool.

        Args:
            db: MySQL database name.
            user: MySQL user name.
            passwd: MySQL password.

        The defaults are the values that used to be hard-coded here, so
        constructing the pipeline without arguments behaves as before.
        """
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            db=db,
            user=user,
            passwd=passwd,
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=False,
        )

    def process_item(self, item, spider):
        """Schedule the insert of ``item`` and return the item unchanged.

        Args:
            item: the scraped item; its fields are lists of extracted strings.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines can process it.
        """
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    @staticmethod
    def _first(values, default=None):
        """Return the first element of ``values``, or ``default`` if empty."""
        return values[0] if values else default

    @staticmethod
    def _join_values(values):
        """Join the extracted strings with '/' ('' when there are none)."""
        return '/'.join(values or [])

    @staticmethod
    def _local_filename(url):
        """Return the last path segment of ``url`` to use as a file name.

        A query string or fragment is dropped first because characters such
        as '?' are not valid in file names on every platform. The result is
        '' when the URL path ends with '/'.
        """
        without_suffix = url.split('#', 1)[0].split('?', 1)[0]
        return without_suffix.split('/')[-1]

    @staticmethod
    def _download_poster(url, path):
        """Fetch ``url`` and write the received bytes to ``path``."""
        response = urllib.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        with open(path, "wb") as poster:
            poster.write(data)

    def _conditional_insert(self, tx, item):
        """Insert ``item`` into ``doubanmoive`` unless its name is stored.

        Args:
            tx: cursor-like object supplied by ``runInteraction``; only its
                ``execute`` and ``fetchone`` methods are used.
            item: the scraped item; every field is a list of strings.

        Missing single-value fields are stored as NULL instead of raising
        ``IndexError``. An item without a name is skipped, and the poster
        download is skipped when no usable image URL was scraped. A failed
        download still raises, so the row is not inserted and the error
        reaches ``handle_error``.
        """
        name = self._first(item['name'])
        if name is None:
            log.msg("Item has no name, skipped:%s" % item, level=log.WARNING)
            return

        # Check whether a movie with this name is already in the database.
        tx.execute("select * from doubanmoive where m_name= %s", (name,))
        result = tx.fetchone()
        print(result)
        if result:
            log.msg("Item already stored in db:%s" % item, level=log.DEBUG)
            return

        # Collapse the multi-valued fields into '/'-separated strings.
        classification = self._join_values(item['classification'])
        actor = self._join_values(item['actor'])

        # Poster URL and the local file name derived from its last segment.
        site = self._first(item['img'])
        path = self._local_filename(site) if site else None
        if path:
            print('local img path %s' % (path))
            print('--------------------download img %s' % (site))
            self._download_poster(site, path)
        else:
            # Nothing was downloaded, so do not record a local file.
            path = None

        # Insert the collected values into the database.
        tx.execute(
            "insert into doubanmoive (m_name,m_year,m_score,m_director,m_classification,m_actor,m_img,m_local_img) values (%s,%s,%s,%s,%s,%s,%s,%s)",
            (name, self._first(item['year']), self._first(item['score']),
             self._first(item['director']), classification, actor, site, path))

    def handle_error(self, e):
        """Log a failure raised by the scheduled database interaction."""
        log.err(e)

doubanmoive/spiders/moive_spider.py

# -*- coding: utf-8 -*-
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from doubanmoive.items import DoubanmoiveItem

class MoiveSpider(CrawlSpider):
    """Crawl the Douban Top 250 list and scrape each movie detail page."""

    name = "doubanmoive"
    # Domains the spider is allowed to visit.
    allowed_domains = ["movie.douban.com"]
    # URL the crawl starts from.
    start_urls = ["http://movie.douban.com/top250"]
    # Crawl rules: URL patterns that may be visited, plus the callback for
    # detail pages. The first rule (list pagination) has no callback; the
    # second sends movie detail pages to parse_item. Dots are escaped so
    # they match a literal '.', and each pattern is passed as a real
    # one-element tuple.
    rules = [
        Rule(SgmlLinkExtractor(
            allow=(r'http://movie\.douban\.com/top250\?start=\d+.*',))),
        Rule(SgmlLinkExtractor(
            allow=(r'http://movie\.douban\.com/subject/\d+',)),
            callback="parse_item"),
    ]

    def parse_item(self, response):
        """Build a DoubanmoiveItem from a movie detail page.

        Args:
            response: the downloaded detail page.

        Returns:
            A DoubanmoiveItem in which every field is the list of strings
            extracted by its XPath expression (possibly empty).
        """
        sel = HtmlXPathSelector(response)
        item = DoubanmoiveItem()
        # One XPath expression per item field.
        item['name'] = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        # Keep only the digits inside the parentheses, e.g. "(1994)" -> "1994".
        item['year'] = sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score'] = sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
        item['classification'] = sel.xpath('//span[@property="v:genre"]/text()').extract()
        # NOTE(review): a[1] selects only the first link under the third
        # span, so at most one actor per matching span is collected.
        item['actor'] = sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()
        # NOTE(review): this matches every linked image on the page; confirm
        # that the first match is the poster the pipeline should download.
        item['img'] = sel.xpath('//a/img/@src').extract()
        return item

doubanmoive/settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for doubanmoive project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

# --- Project identity -------------------------------------------------------
BOT_NAME = 'doubanmoive'

# --- Spider discovery -------------------------------------------------------
NEWSPIDER_MODULE = 'doubanmoive.spiders'
SPIDER_MODULES = ['doubanmoive.spiders']

# --- Item pipelines ---------------------------------------------------------
# Maps the pipeline class path to its order value.
ITEM_PIPELINES = {'doubanmoive.pipelines.DoubanmoivePipeline': 400}

# --- Logging ----------------------------------------------------------------
LOG_LEVEL = 'DEBUG'

# --- Request behaviour ------------------------------------------------------
# Wait between requests, with the delay randomized.
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
COOKIES_ENABLED = True
# A browser User-Agent string is sent instead of the template default
# 'doubanmoive (+http://www.yourdomain.com)'.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) '
    'AppleWebKit/536.5 (KHTML, like Gecko) '
    'Chrome/19.0.1084.54 Safari/536.5'
)

运行完成以后，可以在数据库的 doubanmoive 表中查看抓取结果。

posted on 2014-07-02 18:28 QDa 阅读(614) 评论(4) 收藏举报

刷新页面返回顶部

QDa

公告