Crawling Douban

The spider below pulls Douban's list of TV tags from its mobile JSON API, then pages through the subjects for each tag 20 at a time and appends every record to douban.txt as one JSON object per line.
import json

import requests


class DoubanSpider:
    def __init__(self):
        self.start_url = "https://movie.douban.com/j/search_subjects?type=tv&tag={}&sort=recommend&page_limit=20&page_start={}"
        self.tv_type_url = "https://movie.douban.com/j/search_tags?type=tv&source="
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
        }

    def parse_url(self, url):
        """Send the request and return the decoded response body."""
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str, json_key):
        """Extract the list stored under json_key from the JSON response."""
        dict_ret = json.loads(json_str)
        content_list = dict_ret[json_key]
        return content_list

    def save_content_list(self, content_list, data_type):
        """Append each item to douban.txt, one JSON object per line."""
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["data_type"] = data_type
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # newline so each record sits on its own line
        print("Saved successfully")

    def run(self):
        """Main crawl logic."""
        # 1. Fetch the list of TV tags
        tv_type_json_str = self.parse_url(self.tv_type_url)
        tv_type_list = self.get_content_list(tv_type_json_str, "tags")
        for tv_type in tv_type_list:
            num = 0
            while True:
                # 2. Build the start_url for the current tag and offset
                url = self.start_url.format(tv_type, num)
                # 3. Send the request and get the response
                json_str = self.parse_url(url)
                # 4. Extract the TV data
                content_list = self.get_content_list(json_str, "subjects")
                # 5. Save it, tagged with the current type
                self.save_content_list(content_list, tv_type)
                # Fewer than 20 results means this was the last page
                if len(content_list) < 20:
                    break
                # 6. Move to the next page: page_start is an item offset,
                #    so advance by the page size of 20
                num += 20


if __name__ == '__main__':
    douban_spider = DoubanSpider()
    douban_spider.run()
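Because save_content_list writes one JSON object per line, the output can be loaded back with a short loop. A minimal sketch, assuming only the douban.txt format produced by the spider above (the data_type field is the tag the spider attached to each record):

import json
from collections import Counter

# Load the JSON-lines file written by DoubanSpider.save_content_list
records = []
with open("douban.txt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            records.append(json.loads(line))

print(f"loaded {len(records)} records")
# Count how many records were saved under each TV tag
print(Counter(record["data_type"] for record in records))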