# 爬取猫眼数据
# 源码
#
# 导包
#
import pyximport
import requests
from fake_useragent import UserAgent
import json
import os
import pandas as pd
import csv
import datetime
#
#
#
# 代码
# http://maoyan.com/films/42964
#
#
# Spoofed request headers, sent with every call to the mobile comment API.
pyximport.install()
ua = UserAgent()
headers = {}
# ua.random supplies the User-agent value; it is read once, here.
headers["User-agent"] = ua.random
headers["Host"] = "m.maoyan.com"
# The Referer points at the comment page of movie 42964.
headers["Referer"] = "http://m.maoyan.com/movie/42964/comments?_v_=yes"
# Request parameters: 13 pages at offsets 0, 15, ..., 180.
offsets = list(range(0, 181, 15))
# NOTE(review): startTime and randomTime are not read by the request loop
# below; they are kept only in case code outside this section refers to them.
startTime = "0"
randomTime = ""
# One [nickName, gender, cityName, userLevel, score, content] row per comment.
list_info = []

# Take the timestamp once so that every page is requested with the same
# startTime value instead of one that drifts between requests.
request_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

for offset in offsets:
    comment_api = 'http://m.maoyan.com/mmdb/comments/movie/42964.json?_v_=yes&offset={0}&startTime={1}'.format(offset, request_time)
    try:
        # The timeout keeps a stalled connection from hanging the script.
        response_comment = requests.get(comment_api, headers=headers, timeout=10)
        json_comment = json.loads(response_comment.text)
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a body that is not JSON: report it and move on,
        # so the rows collected from earlier pages are not lost.
        print("请求失败,跳过 offset={0}: {1}".format(offset, exc))
        continue

    # A missing or null 'cmts' entry is treated as an empty page.
    json_response = json_comment.get('cmts') if isinstance(json_comment, dict) else None
    for data in json_response or []:
        # Every field is read with .get so that one incomplete comment does
        # not abort the run; gender keeps its original default of 0.
        list_info.append([
            data.get('nickName', ''),
            data.get('gender', 0),
            data.get('cityName', ''),
            data.get('userLevel'),
            data.get('score'),
            data.get('content', ''),
        ])
# Persist the collected rows (list_info) to the CSV file.
print("正在存储数据:")
csv_path = r'D:\B_Hakkelujah\python\maoyan.csv'
# A file that does not exist yet is handled like an empty one, so the script
# no longer requires the CSV to be created by hand before the first run.
file_size = os.path.getsize(csv_path) if os.path.exists(csv_path) else 0
prStr = "文件大小:{0}".format(file_size)
print(prStr)

# Column headers, in the same order as the values of each row.
name = ['评论者昵称', '性别', '所在城市', '猫眼等级', '评分', '评论内容']
file_test = pd.DataFrame(columns=name, data=list_info)

if file_size == 0:
    print("空文件添加数据")
    # New or empty file: write the header row; utf_8_sig adds a BOM.
    file_test.to_csv(csv_path, encoding='utf_8_sig', index=False)
else:
    # The file already has content: append the rows without repeating the
    # header (previously the scraped rows were dropped in this case).
    # Plain utf-8 is used so that no second BOM lands in the middle of the file.
    file_test.to_csv(csv_path, mode='a', header=False, encoding='utf-8', index=False)
print("数据添加完毕")
# 原文:
# https://mp.weixin.qq.com/s?__biz=MjM5MjAwODM4MA==&mid=2650706418&idx=1&sn=20e57b7b1c8caa4c0b06d6dbd2b94aaa&chksm=bea6e02189d16937c8c3d934264f24b599576b14b76361018b55cca76fb73a127d4f6681af98&mpshare=1&scene=1&srcid=101045ENCgxgoTId8LKXrIaE&pass_ticket=Cgz9TOK3J64evSI%2B9Ev7kLigZCJHUOKf8eJe9%2FagJaUdYdhyn53lL%2FeRC4NnDrUq#rd
# 注:
# 数据爬取记录
# 1. 分析接口(包括接口参数的变化)
# 2. 分析 JSON 数据(数据解析)
# 3. 数据存储(文件、数据库)