进击的爬虫-001-猫眼电影爬取

猫眼电影 Top 100 爬取

import requests
import re


def get_html(url, data, timeout=10):
    """Fetch *url* with *data* as query parameters and return the body text.

    Args:
        url: Address to request.
        data: Mapping handed to ``requests.get`` as ``params``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    ret = requests.get(url, params=data, timeout=timeout)
    return ret.text

# re_before = re.compile('<dd>\s*.*\s*.*?title="(.*?)"')


# movies = re.finditer(movie_re, ret)  #找出当前页面十个电影,得到一个可迭代对象

# movie_obj = next(movies).group()  #通过next方法拿到第一个电影
# print(movie_obj)
#
# movie_name = re.findall('s="name".*?title="(.*?)"', movie_obj)  #拿到第一个电影的电影名
# print(movie_name[0])
#
# movie_star = re.findall('s="star">\s*(.*?)\s*<', movie_obj)   #拿到电影主演
# print(movie_star[0])
#
# movie_releasetime = re.search('s="releasetime">上映时间:(?P<time>.*?)<', movie_obj) #拿到电影时间
# print(movie_releasetime.group('time'))

# Patterns are compiled once and written as raw strings so that ``\d`` and
# ``\s`` reach the regex engine instead of being parsed as string escapes.
_MOVIE_RE = re.compile(r'<dd>[\d\D]*?</dd>')
_NAME_RE = re.compile(r's="name".*?title="(.*?)"')
_STAR_RE = re.compile(r's="star">\s*(.*?)\s*<')
_RELEASETIME_RE = re.compile(r's="releasetime">上映时间:(?P<time>.*?)<')


def get_info(html_res):
    """Print one summary line for each ``<dd>...</dd>`` entry in *html_res*.

    For every entry the title, the text of the ``star`` element and the
    release date are extracted with regular expressions and printed as
    ``电影名:<name>,  <star>,    上映时间:<date>``.

    Args:
        html_res: HTML text of one board page.

    Returns:
        None. The result is written to standard output.
    """
    for entry in _MOVIE_RE.finditer(html_res):
        movie_obj = entry.group()

        name_match = _NAME_RE.search(movie_obj)
        if name_match is None:
            # A <dd> without a title is not a movie entry; skip it rather
            # than crash the whole page.
            continue
        movie_name = name_match.group(1)

        # Star and release date may be absent for an entry; fall back to an
        # empty field so the remaining entries are still reported.
        star_match = _STAR_RE.search(movie_obj)
        movie_star = star_match.group(1) if star_match else ''

        time_match = _RELEASETIME_RE.search(movie_obj)
        movie_releasetime = time_match.group('time') if time_match else ''

        movieinfo = f'电影名:{movie_name},  {movie_star},    上映时间:{movie_releasetime}'
        print(movieinfo)


# Board page to crawl; the listing is paginated through the ``offset``
# query parameter.
url = 'https://maoyan.com/board/4'

# Query parameters reused for every request; only ``offset`` changes.
data = {'offset': 0}

# Request offsets 0, 10, ..., 90 and print the entries found on each page.
for page_offset in range(0, 100, 10):
    data['offset'] = page_offset
    get_info(get_html(url, data))

 

posted @ 2019-07-18 09:44  眼镜儿  阅读(136)  评论(0)  编辑  收藏  举报