XPath and Multithreaded Crawlers

XPath is a language for querying information in XML (and HTML) documents.
Installing and using XPath

1. Install the lxml library

Windows: pip install lxml
Linux:   sudo pip install lxml
If installation is slow (e.g. from within China), download a prebuilt package instead from
http://www.lfd.uci.edu/~gohlke/pythonlibs/
Search the page for lxml and download the matching .whl file.
Either install the .whl directly with pip install <filename>.whl, or rename its extension to .zip, unzip it, and copy the lxml folder into Python's lib directory.

 

2. Import it in your script: from lxml import etree

Basic syntax:
//       selects matching nodes anywhere in the document (not only at the root)
/        steps down one level to a child node
text()   extracts text content, e.g. /text()
@xxx     extracts an attribute value, e.g. /@href

selector = etree.HTML(html)  # parse the page source into an element tree that XPath can query

 

XPath extraction examples
Extracting text

content = selector.xpath('//ul[@id="useful"]/li/text()')    # a list of the text of every li
content = selector.xpath('//ul[@id="useful"]/li[1]/text()') # first li only; XPath indexing starts at 1, not 0

Extracting an attribute (here, the href attribute of every a tag)

content = selector.xpath('//a/@href')
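
A minimal end-to-end sketch tying the pieces above together (the HTML snippet and the example.com URL are made up for illustration):

from lxml import etree

html = '''
<ul id="useful">
    <li>first item</li>
    <li>second item</li>
</ul>
<a href="http://example.com">link</a>
'''

selector = etree.HTML(html)
print(selector.xpath('//ul[@id="useful"]/li/text()'))    # ['first item', 'second item']
print(selector.xpath('//ul[@id="useful"]/li[1]/text()')) # ['first item']
print(selector.xpath('//a/@href'))                       # ['http://example.com']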

3. Special XPath usage
Matching elements whose attribute value starts with the same characters:
starts-with(@attribute, shared_prefix)

selector = etree.HTML(html)
content = selector.xpath('//div[starts-with(@id,"test")]/text()')  # matches every div whose id begins with "test"
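
For instance, with some hypothetical divs whose ids share the prefix "test":

from lxml import etree

html = '''
<div id="test-1">need</div>
<div id="test-2">this</div>
<div id="other">not this</div>
'''
selector = etree.HTML(html)
print(selector.xpath('//div[starts-with(@id,"test")]/text()'))  # ['need', 'this']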

Extracting text from nested tags
string(.)

data = selector.xpath('//div[@id="class3"]')[0]    # grab the outer div element
info = data.xpath('string(.)')                     # concatenate all text inside it, however deeply nested
content2 = info.replace('\n','').replace(' ','')   # strip newlines and spaces
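
A minimal sketch with made-up nested HTML, showing why string(.) is needed (a plain text() on the div would miss the text inside the inner tags):

from lxml import etree

html = '''
<div id="class3">
    hello
    <b>nested <i>tags</i></b>
    world
</div>
'''
selector = etree.HTML(html)
data = selector.xpath('//div[@id="class3"]')[0]
info = data.xpath('string(.)')
print(info.replace('\n','').replace(' ',''))  # hellonestedtagsworld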

4. Parallelization demo
Multiple threads process tasks at the same time.

Using map

from multiprocessing.dummy import Pool

pool = Pool(4)  # 4 on a four-core machine, 8 on an eight-core machine, for more speed
results = pool.map(crawl_func, url_list)  # crawl_func and url_list stand in for your crawler function and list of URLs

Parallelization is noticeably faster, as the following example shows:

# -*- coding: utf-8 -*-

from multiprocessing.dummy import Pool as ThreadPool
import requests
import time

def getsourse(url):
    html = requests.get(url)  # fetch the page; the body is discarded since we only time the requests

urls = []
for i in range(1, 21):
    newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
    urls.append(newpage)

time1 = time.time()
for i in urls:
    print(i)
    getsourse(i)
time2 = time.time()
print('single-thread time: ' + str(time2 - time1))


pool = ThreadPool(4)
time3 = time.time()
results = pool.map(getsourse, urls)  # map handles iterating the sequence, passing arguments, and collecting results in one call
pool.close()
pool.join()
time4 = time.time()
print('parallel time: ' + str(time4 - time3))
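
Note that multiprocessing.dummy exposes the same Pool API as multiprocessing but backs it with threads instead of processes. That is a good fit here: the work is I/O-bound (mostly waiting on HTTP responses), so threads deliver the speedup without the cost of spawning processes.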

 

 

---------------------------------------------------------------------------------

Here is a fuller example, a Baidu Tieba crawler:

# -*- coding: utf-8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json

def towrite(contentdict):
    f.write('reply time: ' + str(contentdict['topic_reply_time']) + '\n')
    f.write('reply content: ' + str(contentdict['topic_reply_content']) + '\n')
    f.write('replier: ' + str(contentdict['user_name']) + '\n\n')

def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
    item = {}
    for each in content_field:
        # each post's metadata (author, reply date, ...) arrives as JSON in its data-field attribute
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot',''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content  clearfix"]/text()')[0]
        reply_time = reply_info['content']['date']
        print(author)
        print(content)
        print(reply_time)
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)


if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt', 'a', encoding='utf-8')  # utf-8 so the Chinese post content writes cleanly
    page = []
    for i in range(1, 21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)

    results = pool.map(spider, page)
    pool.close()
    f.close()
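
One caveat: spider() runs on four threads at once, and they all write to the shared file f through towrite(), so output from different posts can interleave. A minimal sketch of one way to serialize the writes with a lock (the threading.Lock here is an addition for illustration, not part of the original script):

import threading

write_lock = threading.Lock()  # added for illustration; not in the original script

def towrite(contentdict):
    # hold the lock so each post's three lines land in the file together
    with write_lock:
        f.write('reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.write('reply content: ' + str(contentdict['topic_reply_content']) + '\n')
        f.write('replier: ' + str(contentdict['user_name']) + '\n\n')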

 
