爬取猫眼电影top100信息
爬取猫眼电影top100信息
基于requests模块和 pyquery 模块
request爬取网页内容,pyquery 模块过滤想要的内容信息
因为没有涉及到登陆验证,所以写的比较简单,写的不好请见谅,有什么不明白的地方可以私信我,或者加我QQ:743876685
欢迎大家提供更好的建议。
import requests from pyquery import PyQuery from concurrent.futures import ThreadPoolExecutor #代理ip proxies={'https':"https://27.46.74.32:9999"} root = "http://maoyan.com" def maoyan_request(num): try: url = "http://maoyan.com/board/4?offset=%s0"%num headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"} html = requests.get(url=url,headers=headers,proxies=proxies) return html.status_code,html.text except requests.exceptions.ConnectionError: return None,None def maoyan_top(num): code,html = maoyan_request(num) if str(code) == '200': doc = PyQuery(html) data = doc.find('.board-wrapper dd') with open('top-100.txt','a',encoding='utf-8') as f: for i in data.items(): #获取榜单序号 serial_number = i.find('i').text().split(' ')[0] #获取评分 score = i.find('i').text().split(' ')[1] + i.find('i').text().split(' ')[2] #获取电影名 name = i.find('.image-link').attr('title') #获取演员名 actor = i.find('.star').text() #获取上映日期 move_date = i.find('.releasetime').text() #电影url move_url = root + i.find('.name a').attr('href') f.write('%s,%s,%s,%s,%s,%s\n'%(serial_number,score,name,actor,move_date,move_url)) print(num) else: print('访问失败') def number(): url = "http://maoyan.com/board/4?offset=00" headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"} html = requests.get(url=url,headers=headers).text doc = PyQuery(html) used = doc.find('.page_10').text() num = int(used) return num if __name__ == '__main__': p = ThreadPoolExecutor(6) number =int(number()) for num in range(0,number): p.submit(maoyan_top,num) p.shutdown()
爬取的数据如下: