python实战项目 — 爬取中国票房网年度电影信息并保存在csv

 

 

 

 

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time


def spider(url, headers):
    print("正在抓取url:  " + url)
    datas = requests.get(url=url, headers=headers).text
    # 解析url
    soup = BeautifulSoup(datas, 'lxml')
    # 获取数据集合,find_all 返回的是集合类型,所以取[0], 找table标签下 的 属性是 id:tbContent
    moives_tables = soup.find_all('table', {'id': 'tbContent'})[0]
    # 获取每一个子节点 tr标签
    moives = moives_tables.findAll('tr')
    # 获取电影名字,电影名字在每个tr标签里面的第一个td标签里面,由于是有多个td所以要用for遍历
    names = [tr.find_all('td')[0].a.get('title') for tr in moives[1:]]
    # 获取电影的详情页url地址,而且下面提供给获取导演使用,因为导演信息不在主页面上
    hrefs = [tr.find_all('td')[0].a.get('href') for tr in moives[1:]]
    # 获取电影类型
    types = [tr.find_all('td')[1].string for tr in moives[1:]]
    # 获取票房数据
    box_offices = [int(tr.find_all('td')[2].string) for tr in moives[1:]]
    # 获取平均票价
    Average_fare = [tr.find_all('td')[3].string for tr in moives[1:]]
    # 获取上映日期
    show_time = [tr.find_all('td')[6].string for tr in moives[1:]]
    # print(names, hrefs, types, box_offices, Average_fare, show_time)
    # print(len(hrefs))
    daoyans = []
    for href in hrefs:
        try:
            daoyan_datas = requests.get(href)
            # 出现错误的原因是因为这里的daoyan_datas是requests对象,无法用BeautifulSoup解析,可以在daoyan_datas后面加上content
            soup = BeautifulSoup(daoyan_datas.content, 'lxml')
            # 获取导演,由于数据是带换行的,所以要用replace("\n","") 取消换行
            daoyan = soup.select('dl.dltext dd')[0].get_text().replace("\n", "")
            #print(daoyan)
            daoyans.append(daoyan)
            #print(len(daoyans))
            time.sleep(0.5)
        except:
            daoyans.append("获取失败")
    # 数据拼接,得到的数据类型是  <class 'pandas.core.frame.DataFrame'> ,所以要用 DataFrame() 函数来写入excel
    df = pd.DataFrame({
        'name': names,
        'href': hrefs,
        'type': types,
        'box_office': box_offices,
        'Average_fare': Average_fare,
        'show_time': show_time,
        'directors': daoyans
    })
    download(df)


'''
问题是不能连续存储,都是重新创建文件csv, os文件操作 mode='a'
'''
def download(df):
    df.to_csv('D://box_office.csv', mode='a', index=False, header=False)
    print("done")


if __name__ == "__main__":
    start_time = time.time()
    headers = {
        'Cookie': 'Hm_lvt_daabace29afa1e8193c0e3000d391562=1570691612; Hm_lpvt_daabace29afa1e8193c0e3000d391562=1570691612',
        'Host': 'www.cbooo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    base_url = "http://www.cbooo.cn/year?year="
    for i in range(2008, 2020):
        url = base_url + str(i)
        spider(url, headers)
        time.sleep(2)
print(round((time.time() - start_time), 3))

  

posted @ 2019-08-06 16:56  FishMan552  阅读(1319)  评论(0编辑  收藏  举报