# Crawl Douban movies and save each movie's title and detail-page URL to a JSON file.

# -*-coding:utf-8-*-
import requests
import json

class Douban(object):
    """Download the Douban ``movie_showing`` collection and save titles and URLs.

    The work is split into three steps -- ``get_data`` (HTTP request),
    ``parse_data`` (JSON text -> list of dicts) and ``save_data`` (write the
    list to ``douban_movie.json``) -- which ``run`` chains together.
    """

    # Seconds to wait for the server. Without a timeout ``requests.get`` can
    # block forever on a stalled connection.
    timeout = 10

    def __init__(self):
        """Set the API endpoint and the request headers."""
        self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?&start=0&count=100"
        # Browser-like User-Agent sent with every request.
        # NOTE(review): presumably required because the API rejects the
        # default client User-Agent -- not verified here.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'
        }

    def get_data(self):
        """Request ``self.url`` and return the response body as text.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            requests.RequestException: on a connection failure, a timeout, or
                a 4xx/5xx HTTP status.
        """
        response = requests.get(
            self.url, headers=self.headers, timeout=self.timeout
        )
        # Fail here on an HTTP error instead of passing an error page on to
        # json.loads in parse_data.
        response.raise_for_status()
        return response.content.decode('UTF-8')

    def parse_data(self, data):
        """Extract the title and URL of every movie in the API response.

        Each title is printed as it is extracted.

        Args:
            data: JSON text whose top-level object has a
                ``subject_collection_items`` list; each item must have
                ``title`` and ``url`` keys.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts in response order.

        Raises:
            ValueError: if ``data`` is not valid JSON.
            KeyError: if one of the expected keys is missing.
        """
        # The value under this key is a list of per-movie dicts.
        movie_list = json.loads(data)['subject_collection_items']

        data_list = []
        for movie_info in movie_list:
            entry = {'title': movie_info['title'], 'url': movie_info['url']}
            data_list.append(entry)
            print(entry['title'])
        return data_list

    def save_data(self, data_list):
        """Write ``data_list`` to ``douban_movie.json`` as a JSON array.

        The file is UTF-8 encoded and holds one record per line between the
        opening and closing brackets, so it is both valid JSON and easy to
        read or diff line by line.

        Args:
            data_list: list of JSON-serializable dicts, e.g. the return value
                of ``parse_data``.
        """
        # Serialize before opening the file so that a serialization error
        # does not leave a truncated file behind.
        records = [
            json.dumps(data_info, ensure_ascii=False) for data_info in data_list
        ]
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be explicit rather than the platform default.
        with open('douban_movie.json', 'w', encoding='utf-8') as f:
            f.write('[\n' + ',\n'.join(records) + '\n]\n')

    def run(self):
        """Fetch, parse and save the movie list."""
        # Send the request.
        data = self.get_data()
        # Parse the response.
        data_list = self.parse_data(data)
        # Save the result.
        self.save_data(data_list)


if __name__ == '__main__':
    # Script entry point: build the crawler and run the full
    # fetch -> parse -> save pipeline.
    douban = Douban()
    douban.run()
# Posted @ 2017-10-07 20:11 by 建小国 | Views (2081) | Comments (1) | Edit | Bookmark | Report