爬取猫眼电影top100信息

爬取猫眼电影top100信息

 基于requests模块和 pyquery 模块

使用 requests 模块爬取网页内容,再用 pyquery 模块过滤出想要的内容信息

因为没有涉及到登录验证,所以写的比较简单,写的不好请见谅,有什么不明白的地方可以私信我,或者加我QQ:743876685

欢迎大家提供更好的建议。

import requests
from pyquery import PyQuery
from concurrent.futures import ThreadPoolExecutor


# Proxy mapping passed to requests by maoyan_request.
# NOTE(review): only an 'https' entry is defined while every URL in this file
# starts with http:// -- confirm the proxy is actually applied to these requests.
proxies = dict(https="https://27.46.74.32:9999")
# Site root; maoyan_top prepends it to the href scraped for each movie.
root = "http://maoyan.com"

def maoyan_request(num, timeout=10):
    """Fetch one page of the Maoyan board.

    Args:
        num: Page index. It is formatted into the URL as ``offset=<num>0``,
            so ``3`` requests ``offset=30``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        A ``(status_code, text)`` tuple for the response, or ``(None, None)``
        when the request could not be completed.
    """
    url = "http://maoyan.com/board/4?offset=%s0" % num
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"}
    try:
        response = requests.get(
            url=url,
            headers=headers,
            proxies=proxies,
            timeout=timeout,
        )
    except requests.exceptions.RequestException:
        # RequestException is the base class of ConnectionError, Timeout and
        # the other request failures, so every failed request maps to the
        # same "no result" tuple.
        return None, None
    return response.status_code, response.text

def maoyan_top(num):
    """Scrape one board page and append its movies to ``top-100.txt``.

    Each ``.board-wrapper dd`` element yields one comma-separated line:
    rank, score, title, actors, release date, movie URL.

    Args:
        num: Page index, forwarded to ``maoyan_request``.

    Returns:
        None. Prints ``num`` after the page has been written, or a failure
        message when the page could not be fetched with status 200.
    """
    code, html = maoyan_request(num)
    if code != 200:
        print('访问失败')
        return

    rows = []
    for entry in PyQuery(html).find('.board-wrapper dd').items():
        # Text of the entry's <i> elements, split on spaces: the first token
        # is used as the rank and the next two are concatenated into the score.
        parts = entry.find('i').text().split(' ')
        href = entry.find('.name a').attr('href')
        if len(parts) < 3 or href is None:
            # A malformed entry would otherwise raise inside the worker thread
            # and silently drop the rest of the page.
            print('skipping malformed entry on page %s' % num)
            continue
        serial_number = parts[0]
        score = parts[1] + parts[2]
        # Movie title
        name = entry.find('.image-link').attr('title')
        # Actor names
        actor = entry.find('.star').text()
        # Release date
        move_date = entry.find('.releasetime').text()
        # Movie URL
        move_url = root + href
        rows.append('%s,%s,%s,%s,%s,%s\n' % (serial_number, score, name, actor, move_date, move_url))

    with open('top-100.txt', 'a', encoding='utf-8') as f:
        f.writelines(rows)
    print(num)

def number():
    """Return the page count read from the first board page.

    Downloads ``offset=00`` and converts the text of the element with class
    ``page_10`` to an integer.
    NOTE(review): presumably that element is the pager link to the last
    page -- confirm against the live markup.

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: If the ``.page_10`` text is missing or not an integer.
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    url = "http://maoyan.com/board/4?offset=00"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"}
    # A timeout keeps the script from hanging forever on a stalled connection.
    html = requests.get(url=url, headers=headers, timeout=10).text
    used = PyQuery(html).find('.page_10').text()
    try:
        return int(used)
    except ValueError:
        raise ValueError(
            "could not read the page count from '.page_10': %r" % used
        ) from None


if __name__ == '__main__':
    # Store the page count under its own name; assigning it to ``number``
    # would rebind the function of the same name.
    page_count = int(number())
    # The context manager shuts the pool down and waits for every task.
    with ThreadPoolExecutor(6) as pool:
        futures = [pool.submit(maoyan_top, page) for page in range(page_count)]
    # Exceptions raised inside a worker are stored on its future; report them
    # instead of letting them vanish.
    for future in futures:
        error = future.exception()
        if error is not None:
            print('page task failed: %r' % error)

爬取的数据如下:

 

posted @ 2018-09-03 17:22  换你一世迷离  阅读(189)  评论(0)  编辑  收藏  举报