# Scrape jokes from duanziwang.com using XPath
from lxml import etree
import json
import urllib.error
import urllib.request

# Accumulates one {'标题': ..., '内容': ...} dict per scraped article,
# across every page fetched by main().
item_list = []


def handler_request(url, page):
    """Build a urllib Request for page *page* of *url* with a browser User-Agent.

    Args:
        url: Base URL ending in the page path, e.g. "http://duanziwang.com/page/".
        page: Page number appended to the URL.

    Returns:
        urllib.request.Request ready to be opened.
    """
    headers = {
        # Spoof a desktop Chrome UA so the site serves the normal page.
        # NOTE: the original string contained a stray "\ " from a broken
        # line continuation inside the literal ("Apple\ WebKit"); fixed.
        "user-agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/75.0.3770.100 Safari/537.36"
        ),
    }
    get_url = url + str(page)
    return urllib.request.Request(url=get_url, headers=headers)


def parse_content(content):
    """Parse one listing page and append each article's title/body to item_list.

    Args:
        content: Decoded HTML text of a listing page.
    """
    tree = etree.HTML(content)
    article_list = tree.xpath('//main[@class="col-md-8 main-content"]/article')
    for article in article_list:
        titles = article.xpath('.//div[@class="post-head"]/h1/a/text()')
        if not titles:
            # Skip malformed articles instead of raising IndexError on [0].
            continue
        # Paragraph fragments joined with newlines to rebuild the body text.
        text = '\n'.join(article.xpath('.//div[@class="post-content"]/p/text()'))
        item_list.append({
            '标题': titles[0],
            '内容': text,
        })


def main():
    """Prompt for a page range, scrape each page, and dump results to duanzi.txt."""
    start_page = int(input("请输入查询起始页面:"))
    end_page = int(input("查询结束页面:"))
    url = "http://duanziwang.com/page/"
    for page in range(start_page, end_page + 1):
        request = handler_request(url, page)
        try:
            content = urllib.request.urlopen(request).read().decode()
        except (urllib.error.URLError, UnicodeDecodeError):
            # Narrowed from a bare except: only network/decoding failures
            # are expected here; a failed page is reported and skipped.
            print("第%d页面爬取失败" % page)
            continue
        parse_content(content)
    with open('duanzi.txt', "w", encoding='utf-8') as f:
        json.dump(item_list, f, ensure_ascii=False)


if __name__ == '__main__':
    main()