爬取猫眼电影100榜单 代码
1 import requests 2 import re, json 3 4 5 # 请求网页,返回响应体 6 def get_one_page(url): 7 headers = { 8 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36', 9 } 10 response = requests.get(url, headers=headers) 11 if response.status_code == 200: 12 return response.text 13 return None 14 15 16 # 使用正则将需要的数据剥离 17 def parse_one_page(html): 18 pattern = re.compile( 19 '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)"' 20 + '.*?name.*?a.*?>(.*?)</a>.*?star.*?>(.*?)</p>' 21 + '.*?releasetime.*?>(.*?)</p>.*?integer.*?>(.*?)</i>' 22 + '.*?fraction.*?>(.*?)</i>.*?</dd>', 23 re.S) 24 res = re.findall(pattern, html) 25 for i in res: 26 yield { 27 '排名': i[0], 28 '封面': i[1], 29 '影片名称': i[2], 30 '主演': i[3].strip()[3:], 31 '上映时间': i[4].strip()[5:], 32 '评分': i[5].strip() + i[6].strip(), 33 } 34 35 36 # 将结果写入到文件 37 def write_to_file(content): 38 with open('猫眼电影排行TOP100.txt', 'a', encoding='utf-8') as f: 39 f.write(json.dumps(content, ensure_ascii=False) + '\n') 40 41 42 def main(offset): 43 url = 'https://maoyan.com/board/4?offset=' + str(offset) 44 html = get_one_page(url) 45 for item in parse_one_page(html): 46 print(item) 47 write_to_file(item) 48 49 50 if __name__ == '__main__': 51 for i in range(10): 52 main(offset=i * 10)
问题:
转载地址 https://www.jianshu.com/p/86d66257de41