XPath and Multithreaded Crawlers
XPath is a language for querying information in XML documents.
Installing and Using XPath
1. Install the lxml library
Windows: pip install lxml
Linux: sudo pip install lxml
If installation is slow from within China, go to:
http://www.lfd.uci.edu/~gohlke/pythonlibs/
search for lxml and download it.
Rename the .whl extension to .zip, extract it, and copy the lxml folder into Python's lib directory.
2. Import it with from lxml import etree
Commonly used syntax:
// searches from the root node down through all levels (matches nodes anywhere in the document)
/ steps down one level to a child node
Extract text content: /text()
Extract attribute content: /@xxx
selector = etree.HTML(html)  # parse the page source into an element tree that XPath can query
Examples of fetching content with XPath
Extracting text:
content = selector.xpath('//ul[@id="useful"]/li/text()')  # a list of the text of all matching <li> elements
content = selector.xpath('//ul[@id="useful"]/li[1]/text()')  # the first <li> only; note XPath indices start at 1, not 0
Extracting an attribute (here, the href attribute of <a> tags):
content = selector.xpath('//a/@href')
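Putting those pieces together, here is a minimal self-contained sketch; the HTML fragment and variable names are invented for illustration:

# -*- coding: utf-8 -*-
from lxml import etree

# a made-up HTML fragment for demonstration
html = '''
<ul id="useful">
    <li>first useful item</li>
    <li>second useful item</li>
</ul>
<a href="http://example.com/page1">link one</a>
<a href="http://example.com/page2">link two</a>
'''

selector = etree.HTML(html)

# text of every <li> under the <ul> with id="useful"
print(selector.xpath('//ul[@id="useful"]/li/text()'))
# ['first useful item', 'second useful item']

# text of the first <li> only (XPath indexing starts at 1)
print(selector.xpath('//ul[@id="useful"]/li[1]/text()'))
# ['first useful item']

# href attribute of every <a> tag
print(selector.xpath('//a/@href'))
# ['http://example.com/page1', 'http://example.com/page2']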
3. Special XPath Usage
Matching attributes that begin with the same characters:
starts-with(@attribute_name, shared_prefix)
selector = etree.HTML(html)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
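A minimal runnable sketch of starts-with, again with an invented HTML fragment:

from lxml import etree

# two divs whose id values share the prefix "test", plus one that does not
html = '''
<div id="test-1">content one</div>
<div id="test-2">content two</div>
<div id="other">not matched</div>
'''

selector = etree.HTML(html)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
print(content)  # ['content one', 'content two']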
Tags nested inside tags:
string(.)
data = selector.xpath('//div[@id="class3"]')[0]
info = data.xpath('string(.)')
content2 = info.replace('\n','').replace(' ','')
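A minimal sketch showing why string(.) is needed; the HTML fragment is invented, but the id matches the snippet above:

from lxml import etree

# the text is split across nested tags, so a plain /text() would miss pieces
html = '''
<div id="class3">
    I like
    <font color="red">Python</font>
    very much
</div>
'''

selector = etree.HTML(html)
data = selector.xpath('//div[@id="class3"]')[0]
info = data.xpath('string(.)')  # concatenates all descendant text into one string
content2 = info.replace('\n','').replace(' ','')
print(content2)  # IlikePythonverymuch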
4. Parallelization Demo
Multiple threads process tasks at the same time.
Using map:
from multiprocessing.dummy import Pool
pool = Pool(4)  # use 4 on a quad-core machine, 8 on an eight-core one, for more speed
results = pool.map(crawl_function, url_list)
Note that despite its name, multiprocessing.dummy provides a Pool backed by threads rather than processes, which is why it suits I/O-bound jobs like crawling.
Parallelization speeds things up noticeably, as the following example shows:
# -*- coding: utf-8 -*-
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time

def getsourse(url):
    html = requests.get(url)  # fetch the page

urls = []
for i in range(1, 21):
    newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
    urls.append(newpage)

# sequential version: fetch each page one after another
time1 = time.time()
for i in urls:
    print(i)
    getsourse(i)
time2 = time.time()
print('Single-threaded time: ' + str(time2 - time1))

# parallel version: four threads fetch pages concurrently
pool = ThreadPool(4)
time3 = time.time()
results = pool.map(getsourse, urls)  # map handles iteration, argument passing, and collecting results
pool.close()
pool.join()
time4 = time.time()
print('Parallel time: ' + str(time4 - time3))
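Since each request spends most of its time waiting on the network, the four threads overlap that waiting; in practice you can expect the parallel run to take roughly a quarter of the single-threaded time, network variance aside.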
---------------------------------------------------------------------------------
Here is a Baidu Tieba crawler program as a fuller example:
# -*- coding: utf-8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json

def towrite(contentdict):
    f.writelines('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
    f.writelines('Reply content: ' + str(contentdict['topic_reply_content']) + '\n')
    f.writelines('Replier: ' + str(contentdict['user_name']) + '\n\n')

def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    # each post on the page sits in a div with this class
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    item = {}
    for each in content_field:
        # the data-field attribute carries the post's metadata as JSON;
        # strip HTML-escaped quotes in case any survive lxml's entity decoding
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot;', ''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content clearfix"]/text()')[0]
        reply_time = reply_info['content']['date']
        print(author)
        print(content)
        print(reply_time)
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)

if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt', 'a')
    page = []
    for i in range(1, 21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    f.close()
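One caveat about the program above: spider() runs in four threads at once and every thread writes to the shared file f through towrite(), so lines from different replies can interleave in content.txt. A minimal sketch of a safer variant, using a threading.Lock (the lock is an addition of mine, not part of the original program):

import threading

write_lock = threading.Lock()  # added guard; not in the original code

def towrite(contentdict):
    # hold the lock so one reply's three lines stay together in the file
    with write_lock:
        f.writelines('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.writelines('Reply content: ' + str(contentdict['topic_reply_content']) + '\n')
        f.writelines('Replier: ' + str(contentdict['user_name']) + '\n\n')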