【爬虫】XPath实例
题目要求我们用XPATH去爬某个网站并且保存为CSV文件
代码如下,仅供参考
# -*- coding: UTF-8 -*-
# 开发人员:萌狼蓝天
# 博客:Https://mllt.cc
# 笔记:Https://cnblogs.com/mllt
# 哔哩哔哩/微信公众号:萌狼蓝天
# 开发时间:2022/10/5
import pandas as pd
import requests
import lxml.html
csv_data = pd.DataFrame(columns=["序号", "标题", "链接", "作者", "点击", "回复", "更新时间"])
# 获取页面源码
headers = {
"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; Tablet PC 2.0; wbx 1.0.0; wbxapp 1.0.0; Zoom 3.6.0)",
"X-Amzn-Trace-Id": "Root=1-628b672d-4d6de7f34d15a77960784504"
}
code = requests.get("http://bbs.tianya.cn/list-no02-1.shtml", headers=headers).content.decode("utf-8")
print("-------------------------------------------------获取源码-----------------------------------")
# print(code)
selector = lxml.html.fromstring(code)
print("-------------------------------------------------获取关键部分-----------------------------------")
lists = selector.xpath('//div[@class="mt5"]/table')
print("-------------------------------------------------获取单独部分-----------------------------------")
print(len(lists))
for i in lists:
x = 0
for j in range(2, 9):
for c in range(1, 11):
x += 1
title = i.xpath('//tbody[' + str(j) + ']/tr[' + str(c) + ']/td[1]/a/text()')[0].replace("\t", "").replace("\r", "").replace("\n", "")
link = i.xpath('//tbody[' + str(j) + ']/tr[' + str(c) + ']/td[1]/a')[0].attrib['href'].replace("\t", "")
author = i.xpath('//tbody[' + str(j) + ']/tr[' + str(c) + ']/td[2]/a/text()')[0].replace("\t", "")
click = i.xpath('//tbody[' + str(j) + ']/tr[' + str(c) + ']/td[3]/text()')[0].replace("\t", "")
reply = i.xpath('//tbody[' + str(j) + ']/tr[' + str(c) + ']/td[4]/text()')[0].replace("\t", "")
reply_time = i.xpath('//tbody[' + str(j) + ']/tr[' + str(c) + ']/td[5]/text()')[0].replace("\t", "")
csv_data=csv_data.append({"序号": x, "标题": title, "链接": 'http://bbs.tianya.cn/'+link, "作者": author, "点击": click, "回复": reply,
"更新时间": reply_time}, ignore_index=True)
print(title, link, author)
print(csv_data)
csv_data.to_csv("result.csv")
往期文章
版 权 声 明