Python框架Scrapy中CrawlSpider的使用——爬取内容写进MySQL
一、先在MySQL中创建test数据库,以及相应的site数据表(包含name和link两个字段)
二、创建Scrapy工程
# Usage: scrapy startproject <project-name>
scrapy startproject demo4
三、进入工程目录,根据爬虫模板生成爬虫文件
# List the available spider templates:
#   scrapy genspider -l
# Usage: scrapy genspider -t <template> <spider-file-name> <allowed-domain>
scrapy genspider -t crawl test sohu.com
四、设置IP池或用户代理(middlewares.py文件)
# -*- coding: utf-8 -*-
"""Downloader middlewares that pick a random proxy / User-Agent per request."""
# Used to pick a random entry from the pools below
import random

# Built-in proxy middleware, used as the base class of the IP-pool middleware
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
# Built-in User-Agent middleware, used as the base class of the UA-pool middleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# Proxy pool: each entry is a dict whose "ipaddr" value is "host:port"
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    # NOTE(review): the original listing had "1222.94.128.49", which is not a
    # valid IPv4 address (octet > 255); assumed to be a typo for 122 -- confirm.
    {"ipaddr": "122.94.128.49:8118"},
]

# User-Agent pool: plain User-Agent header strings
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
]


class HTTPPROXY(HttpProxyMiddleware):
    """Downloader middleware that sends each request through a random proxy."""

    # The constructor must accept exactly ``ip=''`` (author's note).
    # Presumably Scrapy instantiates the class with one positional argument;
    # verify against the installed Scrapy version.
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        """Set ``request.meta["proxy"]`` to a randomly chosen IPPOOL entry."""
        item = random.choice(IPPOOL)
        try:
            print("当前的IP是:"+item["ipaddr"])
            request.meta["proxy"] = "http://"+item["ipaddr"]
        except Exception as e:
            # Best effort: a bad pool entry must not abort the request, so the
            # error is only reported and the request continues without a proxy.
            print(e)


class USERAGENT(UserAgentMiddleware):
    """Downloader middleware that gives each request a random User-Agent."""

    # The constructor must accept exactly ``user_agent=''`` (author's note).
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Set the User-Agent header from UPPOOL unless one is already set."""
        item = random.choice(UPPOOL)
        try:
            print("当前的User-Agent是:"+item)
            # setdefault keeps a User-Agent that was set explicitly on the request
            request.headers.setdefault('User-Agent', item)
        except Exception as e:
            # Best effort: report the problem and let the request go on.
            print(e)
五、settings.py配置
# Do not send cookies with requests
COOKIES_ENABLED = False

# Downloader middlewares and their order values.
DOWNLOADER_MIDDLEWARES = {
    # Uncomment the next two entries to enable the random-proxy middleware:
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    # 'demo4.middlewares.HTTPPROXY': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    # Custom random User-Agent middleware, given the lower order value (1)
    'demo4.middlewares.USERAGENT': 1,
}

# Item pipelines: Demo4Pipeline receives every scraped item
ITEM_PIPELINES = {
    'demo4.pipelines.Demo4Pipeline': 300,
}
六、定义爬取关注的数据(items.py文件)
# -*- coding: utf-8 -*-
import scrapy

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html


class Demo4Item(scrapy.Item):
    """Container for the data scraped from one page."""

    # Extracted link texts
    name = scrapy.Field()
    # Extracted link targets (href values)
    link = scrapy.Field()
七、爬虫文件编写(test.py)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from demo4.items import Demo4Item


class TestSpider(CrawlSpider):
    """Crawl the sohu.com front page and scrape the linked news pages."""

    name = 'test'
    allowed_domains = ['sohu.com']
    start_urls = ['http://www.sohu.com/']

    rules = (
        # Hand every link that matches the news host to parse_item and do not
        # follow links found on those pages.  The pattern is a regular
        # expression, so the dots are escaped to match literally, and both
        # arguments are real one-element tuples.
        Rule(
            LinkExtractor(
                allow=(r'http://news\.sohu\.com',),
                allow_domains=('sohu.com',),
            ),
            callback='parse_item',
            follow=False,
        ),
        # Alternative rule matching article-style URLs:
        # Rule(LinkExtractor(allow=('.*?/n.*?shtml'),allow_domains=('sohu.com')), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract headline texts and their links from a news page.

        Returns a Demo4Item whose ``name`` and ``link`` fields are the lists
        produced by the two XPath queries (possibly empty).
        """
        item = Demo4Item()
        item['name'] = response.xpath('//div[@class="news"]/h1/a/text()').extract()
        item['link'] = response.xpath('//div[@class="news"]/h1/a/@href').extract()
        # A 'description' extraction was sketched in the original listing but
        # left disabled:
        # item['description'] = response.xpath('//div[@id="description"]').extract()
        return item
八、管道文件编写(pipelines.py)
# -*- coding: utf-8 -*-
import json

import pymysql

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class Demo4Pipeline(object):
    """Item pipeline that stores scraped (name, link) pairs in MySQL."""

    def __init__(self):
        """Open the database connection and a cursor for the inserts."""
        # The database name matches the "test" database created in step 1 of
        # the tutorial (the original listing used 'chapter17').
        # NOTE(review): credentials are hard-coded for the demo; adjust them
        # to the local MySQL setup.
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='test',
            charset='utf8',
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert every non-empty (name, link) pair of *item* into ``site``.

        Returns the item unchanged so that later pipelines can process it.
        Re-raises ``pymysql.MySQLError`` after rolling back if the insert fails.
        """
        # zip() pairs names with links and stops at the shorter list, so a
        # length mismatch cannot raise IndexError; empty values are skipped.
        rows = [
            (name, link)
            for name, link in zip(item["name"], item["link"])
            if name and link
        ]
        if rows:
            # Parameterised statement: the driver fills in the %s placeholders
            sql = "insert into site(name,link) values(%s,%s)"
            try:
                self.cur.executemany(sql, rows)
                self.conn.commit()
            except pymysql.MySQLError:
                # Drop the partial batch before re-raising
                self.conn.rollback()
                raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cur.close()
        self.conn.close()
九、总结
1. 注意先测试数据库连接是否正常,确认无误后再开始写入数据;另外,在对SQL语句做参数化处理时要注意格式(占位符统一使用%s,参数以元组形式传入),千万不要弄错。