基础
1 安装scrapy
2 pip install wheel
3 下载Twisted https://www.lfd.uci.edu/~gohlke/pythonlibs/
4 pip install Twisted-xxxxxx
5 pip install scrapy
创建工程
scrapy startproject xxxpro
scrapy genspider spidername www.xxx.com
运行
scrapy crawl qiubai (qiubai 为爬虫文件中 name 属性的值)
解析数据:extract() (spider 文件中 parse 方法示例)
def parse(self, response):
    """Pull title and view-count out of every post node, yield one item each.

    Yields:
        TestproItem with 'title_name' and 'title_eyes' populated.
    """
    posts = response.xpath("/html/body/main/div/div/div//posts")
    for post in posts:
        # [0].extract() takes the first match only — assumes every post
        # node carries both fields (IndexError otherwise).
        name = post.xpath(".//div[2]/h2/a/text()")[0].extract()
        eyes = post.xpath(".//item[@class='meta-view']/text()")[0].extract()

        # TestproItem must be imported from the project's items module.
        item = TestproItem()
        item['title_name'] = name
        item['title_eyes'] = eyes
        yield item
pipelines.py 管道文件 一个管道类存一种方式
TXT管道
class TestproPipeline:
    """Item pipeline that appends each item as one line to ./qiu.txt."""

    fp = None  # output file handle, owned for the lifetime of the crawl

    def open_spider(self, spider):
        # Runs once when the spider starts: open the output file.
        print("开始爬虫")
        self.fp = open('./qiu.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One record per line, formatted "<title>:<views>".
        name = item['title_name']
        eyes = item['title_eyes']
        self.fp.write(name + ":" + eyes + "\n")
        # Returning the item is mandatory — the next pipeline class in
        # ITEM_PIPELINES receives whatever is returned here.
        return item

    def close_spider(self, spider):
        # Runs once when the spider finishes: release the file handle.
        print("爬虫结束")
        self.fp.close()
mysql 管道
class mysqlPipeline:
    """Item pipeline that inserts each item into the MySQL table `water`.

    NOTE(review): class name kept lowercase as-is — it is referenced by this
    exact name in the project's ITEM_PIPELINES setting; renaming would break it.
    """

    conn = None    # pymysql connection, created in open_spider
    cursor = None  # single reusable cursor (previously re-created per item)

    def open_spider(self, spider):
        # Open the connection and one cursor once per crawl; re-creating a
        # cursor inside process_item leaked all but the last one.
        print("开始爬虫")
        self.conn = pymysql.Connect(host="127.0.0.1", user="root", password="root",
                                    database="xijing", port=3306)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        title_name = item['title_name']
        title_eyes = item['title_eyes']
        try:
            # Parameterized query (PEP 249 %s placeholders): the driver does
            # the quoting/escaping. The old '"%s"' % (...) string formatting
            # broke on values containing quotes and was SQL-injection prone.
            self.cursor.execute(
                'insert into water values (%s,%s)',
                (title_name, title_eyes),
            )
            self.conn.commit()
        except Exception as e:
            # Best-effort: log and roll back this row, keep crawling.
            print(e)
            self.conn.rollback()
        # Returning the item is mandatory — the next pipeline class in
        # ITEM_PIPELINES receives whatever is returned here.
        return item

    def close_spider(self, spider):
        # Release DB resources once when the spider finishes.
        self.cursor.close()
        self.conn.close()
        print("爬虫结束")