用Python爬猫眼影院信息

废话不多说直接打开我的 Sublime Text 一顿操作

#-*-coding:utf-8-*-

import requests

import os

from bs4 import BeautifulSoup

import csv

 

page=1    # number of listing pages to crawl -- set to however many the site has

file_name="xx影院.csv" # output CSV file name

# Session cookie copied from a logged-in browser session; Maoyan rejects
# anonymous requests, so this is required for the crawl to work.

cookie='_lxsdk_cuid=16ba3042655c8-021bb756ea3cee-e343166-1fa400-16ba3042655c8; uuid_n_v=v1; uuid=EAECE6509A6111E997236B31A64DDA6592C2B1FF33DE4DAE83060027C779F435; _csrf=a49697e895ed83cdc2865a4d32146d83244a096c83f4200ff4cc6a1013b231a5; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk=EAECE6509A6111E997236B31A64DDA6592C2B1FF33DE4DAE83060027C779F435; ci=52; __mta=20362631.1561808096229.1562714645807.1562714663483.176; _lxsdk_s=16bd90b3f70-aa4-bd3-409%7C%7C26'



# Maoyan has anti-crawler measures; sending browser-like headers (Origin,
# Referer, User-Agent) together with the cookie above gets past them.

headers = {

    'Content-Type': 'text/plain; charset=UTF-8',

    'Origin':'https://maoyan.com',

    'Referer':'https://maoyan.com/',

    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',

    'cookie':cookie

    }

 

#爬取网页源代码

def get_one_page(url, headers):
    """Fetch a page and return its HTML text, or None on any failure.

    Args:
        url: absolute URL to fetch.
        headers: dict of HTTP headers (browser UA + cookie) forwarded to
            requests.get to get past Maoyan's anti-crawler checks.

    Returns:
        The response body as text when the server answers HTTP 200,
        otherwise None.
    """
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    # Bug fix: the original caught an undefined name `RequestsException`,
    # which would raise NameError the moment a request actually failed.
    # `requests.RequestException` is the base class of all requests errors.
    except requests.RequestException:
        return None

 

#提取影院url

def parse_one_page(html):
    """Walk the cinema listing page: pull every cinema's detail-page link
    and crawl each detail page in turn."""
    soup = BeautifulSoup(html, 'lxml')
    anchors = soup.find_all('a', attrs={"class": "cinema-name"})
    for anchor in anchors:
        detail_url = "https://maoyan.com" + anchor.get('href')
        detail_html = get_one_page(detail_url, headers)
        parse_one_pageinfo(detail_html)



# 影院详细信息

def parse_one_pageinfo(html):
    """Scrape one cinema detail page (name, address, phone), echo the
    fields to stdout, and append them as one CSV row."""
    soup = BeautifulSoup(html, 'lxml')
    names = soup.find_all('h3', attrs={"class": "name text-ellipsis"})
    addresses = soup.find_all('div', attrs={"class": "address text-ellipsis"})
    phones = soup.find_all('div', attrs={"class": "telphone"})
    cinema_info = [names[0].string, addresses[0].string, phones[0].string]
    for field in cinema_info:
        print(field)
    write_to_file_csv(cinema_info)



def write_to_file_csv(item, path=None):
    """Append one row of cinema info to a CSV file.

    Args:
        item: sequence of values forming one CSV row.
        path: output file path. Defaults to the module-level `file_name`,
            so existing callers keep their original behavior.
    """
    if path is None:
        path = file_name
    # 'a'        -> append mode, so successive rows accumulate across calls
    # utf_8_sig  -> writes a BOM so Excel opens the Chinese text correctly
    # newline='' -> required by the csv module to avoid blank lines on Windows
    with open(path, 'a', encoding='utf_8_sig', newline='') as f:
        csv.writer(f).writerow(item)



def main(offset):
    """Crawl one page of the cinema index at the given listing offset."""
    url = "https://maoyan.com/cinemas?offset=" + str(offset)
    print(url)
    listing_html = get_one_page(url, headers)
    # Leftover from the movie-poster template this script was adapted from;
    # nothing is ever written into 'covers', but the directory creation is
    # kept so observable behavior stays identical.
    if not os.path.exists('covers'):
        os.mkdir('covers')
    parse_one_page(listing_html)

    # for item in parse_one_page(html):

    #     print(item)

        # write_to_file_csv(item)

        # save_image_file(item['image'],'covers/'+item['title']+'.jpg')

 

if __name__ == '__main__':
    # Crawl every listing page. Maoyan shows 12 cinemas per page, so the
    # offset advances in steps of 12 (the original wrote this as 10+2).
    for page_index in range(page):
        main(page_index * 12)

因为历史票房需要 app 查看,所以爬不了,但还是省了很多时间,几个小时的事情一分钟解决
转自:https://www.admin17.cn/2803.html

posted @ 2022-04-20 16:01  流口水的鱼  阅读(288)  评论(0编辑  收藏  举报