协程 + requests + MySQL 爬取盗墓笔记小说
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl the novels listed on seputu.com and store the chapters in MySQL.

The index page is split into sections; the chapter pages of each section are
downloaded concurrently with a gevent pool and written to the ``Grave`` table
in one batch per section.
"""
# gevent has to patch the standard library *before* requests (and with it
# socket/ssl) is imported; patching afterwards leaves already-imported modules
# blocking and can trigger MonkeyPatchWarning / ssl recursion errors.
from gevent import monkey
monkey.patch_all()

import MySQLdb  # noqa: E402
import requests  # noqa: E402
from fake_useragent import UserAgent  # noqa: E402
from gevent.pool import Pool  # noqa: E402
from lxml import etree  # noqa: E402

__author__ = 'Fade Zhao'


class Grave(object):
    """Spider that downloads every chapter and upserts it into MySQL."""

    # Seconds to wait for the site before giving up on a request. Without a
    # timeout a stalled connection would occupy a pool slot forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Open the database connection and prepare the HTTP session."""
        # NOTE(review): database credentials are hard-coded here; consider
        # loading them from configuration or the environment.
        self.conn = MySQLdb.connect(
            host='localhost',
            user='root',
            password='zhaoyinghan',
            db='GraveNote',
            charset='utf8',
        )
        self.cursor = self.conn.cursor()
        self.session = requests.session()
        self.url = 'http://seputu.com/'
        # One random User-Agent is picked per spider instance.
        self.agent = UserAgent().random
        self.headers = {
            'User-Agent': self.agent
        }

    def get_bags(self):
        """Fetch the index page and crawl each section found on it."""
        response = self.session.get(
            self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.content)
        article_list = html.xpath('.//div[@class="mulu"]')
        # The first "mulu" block is skipped on purpose by the original code;
        # presumably it is not a chapter list -- TODO confirm against the site.
        for item in article_list[1:]:
            self.get_Aictcleurl(item)

    def get_Aictcleurl(self, bag):
        """Collect the chapter URLs of one section and crawl them concurrently.

        :param bag: lxml element of one section block from the index page.
        """
        # The leading "." keeps the search inside this section element.
        urls = bag.xpath('.//li/a/@href')
        print(urls)
        pool = Pool(20)
        # map_cb waits for all downloads, then hands the result list to save().
        pool.map_cb(self.get_detail, urls, callback=self.save)

    def get_detail(self, url):
        """Download one chapter page and extract its data.

        :param url: URL of the chapter page.
        :return: ``(article_dir, title, content)`` tuple, or ``None`` when the
            page could not be fetched or parsed.
        """
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            html = etree.HTML(response.content)
            arcitle_dir = html.xpath('.//div[@class="mark"]/a/text()')[0]
            title = html.xpath('.//div[@class="bg"]/h1/text()')[0]
            # Chapters of these books wrap their paragraphs in an extra
            # <div class="content">, so they need a different path.
            if '藏海花2' in title or '盗墓笔记9' in title or '巫山妖棺' in title:
                content = html.xpath(
                    './/div[@class="content-body"]/div[@class="content"]/p/text()')
            else:
                content = html.xpath('.//div[@class="content-body"]/p/text()')
            content_str = '\n'.join(content)
            return (arcitle_dir, title, content_str)
        except Exception as e:
            # Best effort: one broken chapter must not abort the whole crawl.
            print('发生错误:', e)
            return None

    def save(self, args):
        """Insert or update a batch of chapters in the ``Grave`` table.

        :param args: list of results from :meth:`get_detail`; ``None`` entries
            (failed downloads) are skipped.
        """
        print(args)
        # A None entry would make executemany fail and the whole batch would
        # be rolled back, so drop failed chapters before inserting.
        rows = [row for row in args if row is not None]
        if not rows:
            return
        sql_str = ('INSERT INTO Grave(article_dir,title,content) '
                   'VALUES(%s,%s,%s) '
                   'on duplicate key update content=values(content)')
        try:
            self.cursor.executemany(sql_str, rows)
            self.conn.commit()
            print('插完了')
        except Exception as e:
            print('插入错误', e)
            self.conn.rollback()

    def close(self):
        """Release the database cursor and connection."""
        self.cursor.close()
        self.conn.close()


if __name__ == '__main__':
    spider = Grave()
    try:
        spider.get_bags()
    finally:
        spider.close()
遇到的坑:
1、在爬取盗墓笔记-藏海花的时候,碰到数据爬取不到的现象,发现部分【藏海花】章节的网页结构和其他的不同,导致获取不到数据,改正后重新填充。
2、在xpath中,即使已经通过xpath筛选出了某个节点对象,在该对象上继续筛选时,表达式也必须以 `.` 开头(例如 `.//li/a/@href`),表示从当前节点开始查找;否则以 `//` 开头的表达式仍然会从整个文档的根节点开始匹配,结果会包含当前节点之外的所有符合条件的节点。
3、出现问题后,应该先沿着程序的执行流程自上而下排查,再逐步细分问题(每一个问题都应该独立分析,不能凭惯性思维),定位清楚之后再修改。