Scraping Baidu Tieba with scrapy + lxml.etree
Analysis: extracting the thread list with scrapy's built-in XPath selector comes back empty, so that approach is out. (Most likely the list markup is served inside an HTML comment, which scrapy's selector does not parse as elements; the raw-text regex approach below sidesteps this.)
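A quick sanity check in scrapy shell illustrates this (the HTML-comment explanation is my assumption about why the selector is empty; the page markup may change over time):

$ scrapy shell 'https://tieba.baidu.com/f?kw=%E9%83%91%E5%AE%B8&ie=utf-8'
>>> response.xpath('//li[contains(@class,"j_thread_list")]')
[]      # the selector sees no thread entries...
>>> 'j_thread_list' in response.text
True    # ...but the markup is present in the raw response body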
Instead, use a regular expression (re) to match out the block of HTML containing all the <li> tags, i.e. everything we need to extract.
Then turn that fragment into an lxml.etree._Element via resultTree = lxml.etree.HTML(articleBody).
Finally, extract the fields with resultTree.xpath().
Note that lxml's xpath() is not the same as scrapy's: it returns plain strings and elements directly, with no .extract() step (see the sketch below).
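To make the difference concrete, here is a minimal, self-contained sketch; the HTML fragment is invented for illustration, shaped like one Tieba thread entry. Where scrapy's xpath() returns Selector objects that still need .extract()/.get(), lxml's returns plain str and _Element objects:

import json
import lxml.etree

# Invented fragment shaped like one Tieba thread entry
articleBody = '''
<li class=" j_thread_list clearfix" data-field='{"id": 123, "author_name": "someone"}'>
  <div class="t_con cleafix"><a href="/p/123" title="hello">hello</a></div>
</li>
'''

resultTree = lxml.etree.HTML(articleBody)   # an lxml.etree._Element
li = resultTree.xpath('//li[contains(@class,"j_thread_list")]')[0]

# Attribute lookups come back as plain strings -- no .extract()/.get()
print(json.loads(li.xpath('@data-field')[0])['id'])             # 123
print(li.xpath(".//div[@class='t_con cleafix']//a/@title")[0])  # hello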
# -*- coding: utf-8 -*-
import json
import re

import lxml.etree
import scrapy

from ..items import TiebaBaiduItem
from ..settings import MAX_PAGE


class TiebaSpider(scrapy.Spider):
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    # 50 threads per list page; pn is the offset
    start_urls = ['https://tieba.baidu.com/f?kw=%E9%83%91%E5%AE%B8&ie=utf-8&pn={}'.format(page * 50)
                  for page in range(MAX_PAGE + 1)]

    def parse(self, response):
        # Key step: regex the useful chunk of HTML out of the raw text,
        # i.e. the section holding all the <li> thread entries
        articleBodyRe = re.search(
            '<ul id="thread_list" class="threadlist_bright j_threadlist_bright">(.*?)'
            '<div class="thread_list_bottom clearfix">',
            response.text, re.DOTALL)
        articleBody = ''
        if articleBodyRe:
            articleBody = articleBodyRe.group(1)
        # Turn the fragment into an element tree via lxml.etree.HTML(articleBody),
        # then extract with xpath(). Note that lxml's xpath() differs slightly
        # from scrapy's: it returns strings/elements directly, no .extract().
        resultTree = lxml.etree.HTML(articleBody)
        articleList = resultTree.xpath('//li[contains(@class,"j_thread_list")]')
        for articleElem in articleList:
            articleInfo = {}
            # data-field is a JSON blob carrying the thread id and author
            data_field = articleElem.xpath("@data-field")[0]
            dataFieldJson = json.loads(data_field)
            articleInfo['id'] = dataFieldJson['id']
            articleInfo['author'] = dataFieldJson['author_name']
            # "cleafix" is how the class is actually spelled in Tieba's markup
            articleInfo['title'] = articleElem.xpath(".//div[@class='t_con cleafix']//a/@title")[0]
            articleInfo['href'] = articleElem.xpath(".//div[@class='t_con cleafix']//a/@href")[0]
            # see_lz=1 restricts the thread page to posts by the original poster
            yield response.follow(
                url=articleInfo['href'] + "?see_lz=1",
                meta={'dont_redirect': True, 'articleInfo': articleInfo},
                callback=self.parseArticleDetail,
                errback=self.errorHandle
            )

    def parseArticleDetail(self, response):
        print(f"parseArticleDetail: statusCode = {response.status}, url = {response.url}")
        contentLst = response.xpath(
            "//div[contains(@id, 'post_content')]//text()").extract()
        imgHrefLst = response.xpath(
            "//div[contains(@id, 'post_content')]//img/@src").extract()
        dateLst = response.xpath(
            "//div[contains(@class, 'post_content_firstfloor')]//span[@class='tail-info']/text()").extract()
        content = ''
        for contentElem in contentLst:
            content += contentElem.replace('\n', ',').replace(" ", '').strip()
            content += ', '
        print(f"content = {content}")
        print(f"imgHrefLst = {imgHrefLst}")
        articleInfo = response.meta['articleInfo']
        articleItem = TiebaBaiduItem()
        articleItem['item_type'] = 'articleDetail'
        articleItem['_id'] = articleInfo['id']
        articleItem['title'] = articleInfo['title']
        articleItem['author'] = articleInfo['author']
        articleItem['content'] = content
        articleItem['fromUrl'] = response.url
        articleItem['picHrefLst'] = imgHrefLst
        # the second tail-info span on the first floor holds the post date
        articleItem['date'] = dateLst[1]
        yield articleItem

    # Request error handling: print it, write it to a file, or store it in a database
    def errorHandle(self, failure):
        print(f"request error: {failure.value.response}")
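The spider imports MAX_PAGE from settings.py and TiebaBaiduItem from items.py, neither of which is shown above. A minimal sketch of what they might look like, with the field set inferred from the assignments in parseArticleDetail (anything beyond that is an assumption):

# settings.py (excerpt) -- assumed: how many 50-thread list pages to crawl
MAX_PAGE = 10

# items.py
import scrapy

class TiebaBaiduItem(scrapy.Item):
    # fields inferred from the assignments in parseArticleDetail
    item_type = scrapy.Field()
    _id = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
    fromUrl = scrapy.Field()
    picHrefLst = scrapy.Field()
    date = scrapy.Field()

With those in place, run the spider from the project root with scrapy crawl tieba.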