Storing Scraped Data in CSV

1. Simple CSV reading and writing

import csv

# CSV means comma-separated values; fields are usually separated by commas, but spaces or tabs also work
# newline="" keeps the csv module from emitting blank rows between lines on Windows
csv_file = open("file/test.csv", "w", newline="")
# Write
try:
    csv_write = csv.writer(csv_file)
    csv_write.writerow(("col1", "col2", "col3"))
    for i in range(4):
        csv_write.writerow((i*1, i*10, i*100))
finally:
    csv_file.close()

# Read it back (the write above could of course also use the with-as syntax)
with open("file/test.csv", "r") as f:
    reader = csv.reader(f)
    for i in reader:
        print(i)
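
As the first comment notes, the delimiter is configurable. A minimal sketch of writing a tab-separated file and reading it back with csv.DictReader, which maps each row onto the header row (the file name here is just an example):

import csv

# Any single character can serve as the delimiter; "," is only the default
with open("file/test_tab.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(("col1", "col2", "col3"))
    writer.writerow((1, 10, 100))

# DictReader uses the first row as keys, so cells can be accessed by column name
with open("file/test_tab.csv", "r", newline="") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        print(row["col1"], row["col3"])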

2. Scraper in practice: crawl the world university rankings and save them to CSV

import requests
from bs4 import BeautifulSoup
import csv
"""
https://www.dxsbb.com/news/16131.html
Fetch the top 1000 world universities listed on this site. Since the data is an HTML table, save it to a CSV file.
"""


def write_csv(data_list):
    # The source page is gb2312-encoded and some characters could not be written as-is, so the CSV is saved as utf-8
    csv_file = open("file/university.csv", "w", encoding="utf-8", newline="")
    try:
        csv_writer = csv.writer(csv_file)
        for i in data_list:
            # The cells were joined with \t in get_university, so split on '\t' here;
            # the trailing tab yields an empty last element, which [:-1] drops
            csv_writer.writerow(i.split('\t')[:-1])
    finally:
        csv_file.close()
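
A side note on the encoding comment above: a plain utf-8 CSV often shows garbled Chinese when opened directly in Excel, because Excel looks for a byte-order mark. If that matters, a single line changes (utf-8-sig is a standard Python codec that writes the BOM):

# "utf-8-sig" prepends a BOM so Excel detects the encoding correctly
csv_file = open("file/university.csv", "w", encoding="utf-8-sig", newline="")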


def get_soup(url, encode):
    # Build a BeautifulSoup object from the fetched page
    header = {
        'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'
    }
    resp = requests.get(url=url, headers=header)
    resp.encoding = encode
    return BeautifulSoup(resp.text, "lxml")
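
Hardcoding the encoding works for this site, but requests can also guess it from the response body. A sketch of that variant (get_soup_auto is a hypothetical name, not part of the script above):

def get_soup_auto(url):
    # apparent_encoding is requests' statistical guess at the page charset
    resp = requests.get(url, headers={'User-Agent': 'python-requests/2.22.0'})
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, "lxml")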


def get_university(soup):
    # Grab every row of the first table on the page
    un_list = soup.find("table").find_all("tr")
    td_list = []
    for un_row in un_list:
        td_row = ""
        for un_td in un_row.find_all("td"):
            td_row = td_row + un_td.text + "\t"
        td_list.append(td_row)
    return td_list
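
Joining the cells with \t here only to split them again in write_csv is a round trip that csv.writer does not need: writerow() accepts a list of cells directly. A sketch of that alternative (get_university_rows is a hypothetical variant, not used by the main block below):

def get_university_rows(soup):
    # Return each <tr> as a list of cell texts; csv_writer.writerow() can take these as-is
    rows = []
    for tr in soup.find("table").find_all("tr"):
        rows.append([td.text.strip() for td in tr.find_all("td")])
    return rows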

if __name__ == '__main__':
    url = r"https://www.dxsbb.com/news/16131.html"
    # Get the BeautifulSoup object; this site uses gb2312 encoding
    soup = get_soup(url, "gb2312")
    # Named rows rather than list, to avoid shadowing the built-in
    rows = get_university(soup)
    write_csv(rows)

Result: the file university.csv is generated; open it to see the scraped data.

posted @ 2020-02-11 21:41  没尾巴的刺刺鱼