The Crawley Framework

Posted on 2019-10-14 by 大白不白

Create a new project and move into its directory:

crawley startproject crawley_spider

cd crawley_spider
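
startproject generates the project skeleton. The layout below is an assumption inferred from the files edited in the rest of this post, so check the generated directory for the actual contents:

crawley_spider/
    models.py      #data models
    crawlers.py    #scraper and crawler classes
    settings.py    #project and database configuration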

# models.py: a Model is Crawley's equivalent of a Scrapy Item
from crawley.persistance import Entity, UrlEntity, Field, Unicode

class Package(Entity):

    #add your table fields here
    updated = Field(Unicode(255))
    package = Field(Unicode(255))
    description = Field(Unicode(255))
# crawlers.py: the scraper and crawler classes
from crawley.crawlers import BaseCrawler
from crawley.scrapers import BaseScraper
from crawley.extractors import XPathExtractor
from models import *

class pypiScraper(BaseScraper):

    #specify the URL patterns this scraper handles; "%" is a wildcard matching any URL
    matching_urls = ["%"]

    def scrape(self, response):

        #getting the html table
        table = response.html.xpath("/html/body/div[5]/div/div/div[3]/table")[0]

        #iterate over the data rows, skipping the header row and the last row
        for tr in table[1:-1]:

            #pull the cells we need out of the row
            td_updated = tr[0]
            td_package = tr[1]
            package_link = td_package[0]
            td_description = tr[2]

            #storing data in Packages table
            Package(updated=td_updated.text, package=package_link.text, description=td_description.text)


class pypiCrawler(BaseCrawler):

    #add your starting urls here
    start_urls = ["http://pypi.python.org/pypi"]

    #add your scraper classes here
    scrapers = [pypiScraper]

    #specify your maximum crawling depth
    max_depth = 0

    #select your favourite HTML parsing tool
    extractor = XPathExtractor

The scrape method defined inside the pypiScraper class uses XPath to query the parsed HTML, then stores the extracted data in the Packages table.
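
The cell indexing in scrape is easier to follow against a concrete tree, so here is a standalone sketch of the same logic using lxml (which XPathExtractor appears to wrap, judging by the element API above); the sample HTML is invented for illustration:

from lxml import html

#invented stand-in for the table the scraper expects
sample = """
<table>
  <tr><th>Updated</th><th>Package</th><th>Description</th></tr>
  <tr><td>2019-10-14</td><td><a href="/pypi/foo">foo</a></td><td>A demo package</td></tr>
  <tr><td>footer row</td><td></td><td></td></tr>
</table>
"""

table = html.fromstring(sample)

#same slicing as scrape: skip the header row and the last row
for tr in table[1:-1]:
    td_updated, td_package, td_description = tr[0], tr[1], tr[2]
    package_link = td_package[0]  #the <a> element inside the package cell
    print(td_updated.text, package_link.text, td_description.text)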

# settings.py
import os
PATH = os.path.dirname(os.path.abspath(__file__))

#Don't change this unless you have renamed the project
PROJECT_NAME = "pypi"
PROJECT_ROOT = os.path.join(PATH, PROJECT_NAME)

DATABASE_ENGINE = 'sqlite'
DATABASE_NAME = 'pypi'
DATABASE_USER = ''
DATABASE_PASSWORD = ''
DATABASE_HOST = ''
DATABASE_PORT = ''

SHOW_DEBUG_INFO = True

Run the crawler:

crawley run
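
When the run finishes, the scraped rows land in the SQLite database configured in settings.py. The sketch below is one way to inspect them; the database file name (pypi.sqlite) and table name (package) are assumptions about Crawley's ORM defaults, so check the project directory for the actual file:

import sqlite3

#pypi.sqlite and the package table are assumed names; adjust to what Crawley created
conn = sqlite3.connect("pypi.sqlite")
for updated, package, description in conn.execute(
        "SELECT updated, package, description FROM package LIMIT 5"):
    print(updated, package, description)
conn.close()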

 
