# Scrapy 简单爬虫示例
# -*- coding: utf-8 -*-  # 这里只是爬虫文件(spider)的内容;可在 PyCharm 的 Terminal 中用命令行运行,命令里要使用爬虫的名字,例如:scrapy crawl insists
import scrapy
from insist.items import InsistItem


class InsistsSpider(scrapy.Spider):
    """Spider that scrapes the ``li_txt`` blocks of the itcast.cn teacher page.

    Run it by its spider name, e.g. ``scrapy crawl insists``.
    """

    name = 'insists'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Build one ``InsistItem`` for every ``div.li_txt`` node in the page.

        Args:
            response: The response Scrapy downloaded for a start URL.

        Returns:
            list: One ``InsistItem`` per matched node, with the ``name``,
            ``title`` and ``info`` fields filled from the first text node of
            the node's ``<h3>``, ``<h4>`` and ``<p>`` children respectively.
            A field whose element is missing is set to an empty string.
        """
        items = []
        for node in response.xpath("//div[@class='li_txt']"):
            # Item object (declared in the project's items module) that
            # stores the scraped fields.
            item = InsistItem()
            # extract_first() returns the first matched text, or the default
            # when nothing matched -- indexing extract()[0] would raise
            # IndexError and abort the whole callback on one incomplete node.
            item['name'] = node.xpath("./h3/text()").extract_first(default='')
            item['title'] = node.xpath("./h4/text()").extract_first(default='')
            item['info'] = node.xpath("./p/text()").extract_first(default='')
            items.append(item)
        return items