scrapy简单爬虫

# -*- coding: utf-8 -*-
#这里只是爬虫文件（spider）的内容；在 PyCharm 的 Terminal 中用命令行运行，命令里要使用爬虫的名字（name 属性），例如：scrapy crawl insists

import scrapy
from insist.items import InsistItem

class InsistsSpider(scrapy.Spider):
    """Spider that collects name/title/info entries from the itcast.cn teacher page.

    It is started by its spider name, e.g. ``scrapy crawl insists``.
    """

    name = 'insists'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Build one ``InsistItem`` for every ``div.li_txt`` node in *response*.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Returns:
            A list of ``InsistItem`` objects with the ``name``, ``title`` and
            ``info`` fields filled from the first ``h3``, ``h4`` and ``p``
            text node of each entry. A field is an empty string when the
            corresponding text node is absent.
        """
        items = []
        for node in response.xpath("//div[@class='li_txt']"):
            # Item object (declared in the project's items module) that
            # stores the scraped fields.
            item = InsistItem()
            # extract_first() returns the first matched text as a string and
            # falls back to the default when nothing matches, so one
            # incomplete entry no longer raises IndexError and discards the
            # whole page.
            item['name'] = node.xpath("./h3/text()").extract_first(default='')
            item['title'] = node.xpath("./h4/text()").extract_first(default='')
            item['info'] = node.xpath("./p/text()").extract_first(default='')
            items.append(item)
        return items

posted @ 2019-09-15 22:17 晨曦yd 阅读(156) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

公告

昵称：晨曦yd
园龄： 5年8个月
粉丝： 3
关注： 1

2025年1月

日

一

二

三

四

五

六