爬虫入门-写一个小爬虫的思路

注释里面比较详细。

点击查看代码
# -*- coding: utf-8 -*-
"""Beginner crawler walkthrough: fetch a page, parse it with BeautifulSoup,
collect one dict of fields per card into a list, then write and re-read a CSV.

NOTE(review): the selectors below ("td-title", "face", "author") look like
they were copied from a forum-table layout, not from ssr1.scrape.center's
movie-card markup — confirm against the live page before trusting the
extracted fields.
"""
from bs4 import BeautifulSoup
import requests
import csv

# Page to crawl.
url = "https://ssr1.scrape.center/"
# Fetch the raw HTML and decode it explicitly as UTF-8. The timeout keeps the
# script from hanging forever on a dead connection.
code_request = requests.get(url, timeout=10).content.decode("utf-8")
# Parse into a soup object using the stdlib html.parser backend.
soup = BeautifulSoup(code_request, "html.parser")
# Every card container on the page.
code_list = soup.find_all("div", attrs={"class": "el-card item m-t is-hover-shadow"})
# One dict per card, accumulated here.
result_list = []
for card in code_list:
    # .replace("\n", "") drops embedded newlines; .strip() trims both ends
    # (equivalent to the chained .lstrip().rstrip()).
    result_title = card.find("td", attrs={"class": "td-title"}).find("a").text.replace("\n", "").strip()
    result_type = card.find("span", attrs={"class": "face"}).get("title").replace("\n", "").strip()
    result_author = card.find("a", attrs={"class": "author"}).text.replace("\n", "").strip()
    result_time = card.find_all("td")[-1].text.replace("\n", "").strip()
    result_list.append({
        "标题": result_title,
        "发帖种类": result_type,
        "作者": result_author,
        "回复时间": result_time,
    })
# Print once, after the loop — printing inside the loop re-emitted the whole
# accumulated list on every iteration.
print(result_list)  # a list of dicts, one per card

# Write the CSV. Mode "w" (not "a+") so re-running the script does not append
# a second header and duplicate rows; newline="" is required by the csv module
# to avoid blank lines between rows on Windows.
with open("resultb.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["标题", "发帖种类", "作者", "回复时间"])
    writer.writeheader()
    writer.writerows(result_list)

# Read the CSV back and show each row as a dict.
with open("resultb.csv", encoding="utf-8", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row)
可以直接跑,不懂的地方可以私信我!
posted @ 2022-10-24 09:50  吕洞玄  阅读(16)  评论(0编辑  收藏  举报