使用python爬虫爬取数据集保存到csv或者excel中
准备
下载库
在编写代码时需要使用的Python库要提前下载(第5步和完整代码还会用到pandas,需要另外执行 pip install pandas)
pip install beautifulsoup4
pip install openpyxl
pip install requests
相关库的文档
openpyxl - 读/写 Excel 2010 xlsx/xlsm 文件的 Python 库
Beautiful Soup 4.4.0 文档
代码
1. 引入
# BeautifulSoup可以解析html与xml格式的文件
from bs4 import BeautifulSoup
# requests用来获取与发送网络请求
import requests
# openpyxl用来读写excel(xlsx/xlsm)格式文件;csv文件在后面由pandas导出
from openpyxl import Workbook
2. 通过requests库将网页爬取
# Base URL of the pages to scrape; the page number is appended to it
url = "https://ssr1.scrape.center/page/"
# Request headers: send a browser User-Agent string with the request
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
# Fetch page `pageInd` (the page counter set up in the complete code).
# The timeout keeps an unresponsive server from blocking the script forever.
response = requests.get(url + str(pageInd), headers=header, timeout=10)
# Fail loudly on an HTTP error status instead of parsing an error page
response.raise_for_status()
# Parse the returned HTML with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
3. 通过bs4获取所需内容
# Names are read from <h2 class="m-b-sm"> tags, scores from
# <p class="score m-t-md m-b-n-sm"> tags
lists = soup.find_all("h2", class_="m-b-sm")
sorce_lists = soup.find_all("p", class_='score m-t-md m-b-n-sm')
# Append the whitespace-stripped text of every match to the result lists.
# get_text(strip=True) also works when a tag contains nested tags, a case
# in which .string is None and .string.strip() would raise AttributeError.
nameList.extend(tag.get_text(strip=True) for tag in lists)
sorceList.extend(tag.get_text(strip=True) for tag in sorce_lists)
4. 通过openpyxl保存文件信息
from openpyxl import Workbook
# Create a workbook and take its active worksheet
wb = Workbook()
ws = wb.active
# Write the collected lists as two rows: names first, then scores
for row in (nameList, sorceList):
    ws.append(row)
# Write the workbook to disk
wb.save("temp.xlsx")
5. 使用pandas转为列格式
import pandas as pd
# temp.xlsx holds two rows (names, then scores). read_excel has no `index`
# keyword, so none is passed. header=None keeps the first row as ordinary
# data, which stops pandas from renaming duplicate names ("X" -> "X.1") as
# it does for duplicate column labels.
df = pd.read_excel('temp.xlsx', header=None)
df = df.T  # transpose: one (name, score) pair per row
# To save as xlsx instead: df.to_excel('abc.xlsx', header=False, index=False)
# Write only the data: no column labels and no integer row index
df.to_csv('abc.csv', header=False, index=False)
完整代码
import pandas as pd
from bs4 import BeautifulSoup
import requests
from openpyxl import Workbook
# Complete script: scrape names and scores from pages 1-10, store them as two
# rows in temp.xlsx, then transpose into a two-column abc.csv.

# Base URL of the pages to scrape; the page number is appended to it
url = "https://ssr1.scrape.center/page/"
# Request headers: send a browser User-Agent string with the request
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
wb = Workbook()
ws = wb.active
# Each list starts with its column title, followed by the scraped values
nameList = ['Name']
sorceList = ['Sorce']
for pageInd in range(1, 11):  # pages 1 through 10
    # The timeout keeps an unresponsive server from blocking the script forever
    response = requests.get(url + str(pageInd), headers=header, timeout=10)
    # Fail loudly on an HTTP error status instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    lists = soup.find_all("h2", class_="m-b-sm")
    sorce_lists = soup.find_all("p", class_='score m-t-md m-b-n-sm')
    # get_text(strip=True) also works when a tag contains nested tags, a case
    # in which .string is None and .string.strip() would raise AttributeError
    nameList.extend(tag.get_text(strip=True) for tag in lists)
    sorceList.extend(tag.get_text(strip=True) for tag in sorce_lists)
# Two rows in the sheet: all names, then all scores
ws.append(nameList)
ws.append(sorceList)
wb.save("temp.xlsx")
# read_excel has no `index` keyword, so none is passed. header=None keeps the
# name row as ordinary data, which stops pandas from renaming duplicate names
# ("X" -> "X.1") as it does for duplicate column labels.
df = pd.read_excel('temp.xlsx', header=None)
df = df.T  # transpose: one (name, score) pair per row
# To save as xlsx instead: df.to_excel('abc.xlsx', header=False, index=False)
# Write only the data: no column labels and no integer row index
df.to_csv('abc.csv', header=False, index=False)