爬取糗事百科列表页案例
import re
from urllib.parse import urljoin

import requests
from lxml import etree

# Seconds to wait on each HTTP request; without a timeout a stalled server
# would block the scraper forever.
REQUEST_TIMEOUT = 10

# Site-root-relative path of a list page; the placeholder is the 1-based page number.
LIST_PAGE_PATH = '/8hr/page/{}/'

# Captures everything in front of the first "?" of a URL. Must be a raw string:
# "\?" is not a valid escape sequence in a normal string literal.
_BEFORE_QUERY = re.compile(r'(.*?)\?')


def _fetch_html(url):
    """Download *url* and return the lxml element tree parsed from its body."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return etree.HTML(response.text)


def _strip_query(url):
    """Return *url* without its query string; URLs without "?" are returned as-is."""
    found = _BEFORE_QUERY.match(url)
    return found.group(1) if found else url


class Qiushi():
    """Scrape every list page of qiushibaike.com and print the items found.

    Constructing an instance performs the whole job: it reads the number of
    list pages from the page at ``base_url`` and then fetches, parses and
    prints each list page in turn.
    """

    def __init__(self, base_url):
        """Store *base_url*, determine the page count and run the scrape.

        Args:
            base_url: URL of the site's front page; it is also the base
                against which list-page and image URLs are resolved.
        """
        self.base_url = base_url
        self.max_page = self.get_max_page()
        self.get_data()

    def get_max_page(self):
        """Return the highest page number shown in the pagination bar.

        Raises:
            IndexError: if the pagination element is not found in the page.
            ValueError: if the pagination label is not an integer.
        """
        html = _fetch_html(self.base_url)
        # The second-to-last <li> of the pagination bar carries the last page number.
        page_labels = html.xpath('//ul[@class="pagination"]/li[last()-1]/a/span/text()')
        if not page_labels:
            # Same exception type the bare index lookup used to raise, so
            # existing handlers still match; only the message is clearer.
            raise IndexError(
                'pagination element not found at {}'.format(self.base_url)
            )
        return int(page_labels[0].strip())

    def _parse_item(self, site):
        """Extract one list entry from its ``<li>`` element.

        Returns:
            A dict with the keys ``funny_number``, ``comment_number``,
            ``content``, ``pic`` and ``username``, or ``None`` when any of
            those fields is missing (this is how advertisements are skipped).
        """
        funny_number = site.xpath('.//div[@class="recmd-num"]/span[1]/text()')    # funny votes
        comment_number = site.xpath('.//div[@class="recmd-num"]/span[4]/text()')  # comment count
        content = site.xpath('.//a[@class="recmd-content"]/text()')               # text content
        pic = site.xpath('.//a[contains(@class, "recmd-left")]/img/@src')         # thumbnail
        username = site.xpath('.//span[@class="recmd-name"]/text()')              # author nickname

        # all() is true only when every field was found; entries lacking any
        # of them are advertisements and are dropped.
        if not all([funny_number, comment_number, content, pic, username]):
            return None

        # Resolve the image address against the site URL. This turns a
        # protocol-relative "//host/x.jpg" into "https://host/x.jpg" and
        # leaves an already absolute URL untouched.
        pic_url = urljoin(self.base_url, pic[0])

        return {
            'funny_number': funny_number[0],
            'comment_number': comment_number[0],
            'content': content[0],
            # Dropping the query string yields the original-size image; URLs
            # without "?" already point at the original.
            'pic': _strip_query(pic_url),
            'username': username[0],
        }

    def get_data(self):
        """Fetch every list page from 1 to ``self.max_page`` and print its items."""
        for page in range(1, self.max_page + 1):
            page_url = urljoin(self.base_url, LIST_PAGE_PATH.format(page))
            html = _fetch_html(page_url)

            all_list = []
            for site in html.xpath('//div[@class="recommend-article"]/ul/li'):
                qiushi_info = self._parse_item(site)
                if qiushi_info is not None:
                    all_list.append(qiushi_info)

            # Print a header for the page followed by one line per item.
            print('-------------------第{}页------------------------'.format(page))
            for info in all_list:
                print(info)


if __name__ == "__main__":
    base_url = 'https://www.qiushibaike.com/'
    Qiushi(base_url)