使用python爬虫爬取数据集保存到csv或者excel中

准备

下载库

在编写代码时需要使用的python库要提前下载

pip install beautifulsoup4
pip install openpyxl
pip install requests
pip install pandas

相关库的文档

openpyxl - 读/写 Excel 2010 xlsx/xlsm 文件的 Python 库
Beautiful Soup 4.4.0 文档

代码

1. 引入

# BeautifulSoup可以解析html与xml格式的文件
from bs4 import BeautifulSoup
# requests用来获取与发送网络请求
import requests
# openpyxl用来读写excel(xlsx/xlsm)格式文件
from openpyxl import Workbook

2. 通过requests库将网页爬取

# Base URL of the paginated listing; the page number is appended to it.
url = "https://ssr1.scrape.center/page/"
# Request headers: send a browser-like User-Agent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
# Page number to fetch. The complete listing loops this value from 1 to 10;
# it is set here so that this fragment runs on its own.
pageInd = 1
# Fetch the page. A timeout is passed because requests otherwise waits
# indefinitely on a stalled connection.
response = requests.get(url + str(pageInd), headers=header, timeout=10)
# Fail fast on a 4xx/5xx answer instead of parsing an error page.
response.raise_for_status()
# Parse the returned HTML with the standard-library "html.parser" backend.
soup = BeautifulSoup(response.text, 'html.parser')

3. 通过bs4获取所需内容

# Collect the title tags (<h2 class="m-b-sm">) and the score tags
# (<p class="score m-t-md m-b-n-sm">) of the parsed page.
lists = soup.find_all("h2", class_="m-b-sm")
sorce_lists = soup.find_all("p", class_='score m-t-md m-b-n-sm')
# nameList / sorceList are the accumulator lists created before the page
# loop in the complete listing.
# get_text(strip=True) returns the tag's text with surrounding whitespace
# removed; unlike `.string` it does not yield None when the tag contains
# nested elements, so `.strip()` can no longer fail with AttributeError.
# Generator expressions are used so that no loop variable leaks into the
# module namespace (the previous loop variable shadowed the builtin `list`).
nameList.extend(tag.get_text(strip=True) for tag in lists)
sorceList.extend(tag.get_text(strip=True) for tag in sorce_lists)

4. 通过openpyxl保存文件信息

from openpyxl import Workbook

# Create a new workbook and take the worksheet it marks as active.
wb = Workbook()
ws = wb.active
# Each append() call writes one worksheet row: the names first, then the
# scores directly below them.
for row in (nameList, sorceList):
    ws.append(row)
# Write the workbook to disk.
wb.save("temp.xlsx")

5. 使用pandas转为列格式

import pandas as pd

# Read the two-row sheet written in the previous step. `read_excel` has no
# `index` keyword (passing it raises TypeError on current pandas), so it is
# dropped. header=None keeps row 1 as ordinary data, which stops pandas from
# renaming duplicate titles when it would otherwise use them as column labels.
df = pd.read_excel('temp.xlsx', header=None)
# Transpose: the two rows (names, scores) become two columns.
df = df.T
# Alternative: save as xlsx instead of csv.
# df.to_excel('abc.xlsx', header=False, index=False)
# Write only the cell values; the positional row/column labels are omitted.
df.to_csv('abc.csv', header=False, index=False)

完整代码


import pandas as pd
from bs4 import BeautifulSoup
import requests
from openpyxl import Workbook

# Base URL of the paginated listing; the page number is appended to it.
url = "https://ssr1.scrape.center/page/"

# Request headers: send a browser-like User-Agent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}

# Workbook that receives the scraped rows, and its active worksheet.
wb = Workbook()
ws = wb.active

# The first element of each list is the column title; scraped values follow.
nameList = ['Name']
sorceList = ['Sorce']
# Fetch pages 1 through 10.
for pageInd in range(1, 11):
    # A timeout is passed because requests otherwise waits indefinitely on a
    # stalled connection.
    response = requests.get(url + str(pageInd), headers=header, timeout=10)
    # Fail fast on a 4xx/5xx answer instead of parsing an error page, which
    # would silently leave this page's rows out of the output.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    lists = soup.find_all("h2", class_="m-b-sm")
    sorce_lists = soup.find_all("p", class_='score m-t-md m-b-n-sm')
    # get_text(strip=True) returns the tag's text with surrounding whitespace
    # removed; unlike `.string` it does not yield None for tags with nested
    # elements. Generator expressions avoid a loop variable that shadows the
    # builtin `list`.
    nameList.extend(tag.get_text(strip=True) for tag in lists)
    sorceList.extend(tag.get_text(strip=True) for tag in sorce_lists)

# Row 1 holds the names, row 2 the scores.
ws.append(nameList)
ws.append(sorceList)
wb.save("temp.xlsx")

# Re-read the sheet and transpose it so names and scores become two columns.
# `read_excel` has no `index` keyword (passing it raises TypeError on current
# pandas), so it is dropped. header=None keeps row 1 as ordinary data, which
# stops pandas from renaming duplicate titles.
df = pd.read_excel('temp.xlsx', header=None)
df = df.T
# Alternative: save as xlsx instead of csv.
# df.to_excel('abc.xlsx', header=False, index=False)
# Write only the cell values; the positional row/column labels are omitted.
df.to_csv('abc.csv', header=False, index=False)
posted @ 2022-10-15 16:45  又一岁荣枯  阅读(213)  评论(0)  编辑  收藏  举报