大数据加工平台py文件内容
#coding:utf-8
# coding:utf-8
import matplotlib as matplotlib
import numpy as np
import pandas as pd
import openpyxl
from pandas import DataFrame, Series
# 可视化显示在界面
# matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来显示中文
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
import json
import warnings
warnings.filterwarnings('ignore')
# 设置显示的最大列、宽等参数,消掉打印不完全中间的省略号
# pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000) # 加了这一行那表格的一行就不会分段出现了
# pd.set_option('display.max_colwidth', 1000)
# pd.set_option('display.height', 1000)
# 显示所有列
pd.set_option('display.max_columns', None)
# 显示所有行
pd.set_option('display.max_rows', None)
movies = pd.read_csv('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\tmdb_5000_movies.csv',
encoding='utf_8')
credits = pd.read_csv('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\tmdb_5000_credits.csv',
encoding='utf_8')
movies.info() # 查看信息
credits.info()
# 两个数据框都有title列,以及movies.riginal_title
# 以上三个数据列重复,删除两个
del credits['title']
del movies['original_title']
# 连接两个csv文件
merged = pd.merge(movies, credits, left_on='id', right_on='movie_id', how='left')
# 删除不需要分析的列
df = merged.drop(['homepage', 'overview', 'spoken_languages', 'status', 'tagline', 'movie_id'], axis=1)
df.info()
# 查找缺失值记录-release_date
df[df.release_date.isnull()]
# 查找缺失值记录-runtime
df[df.runtime.isnull()]
len(df.id.unique())
df['release_year'] = pd.to_datetime(df.release_date, format='%Y-%m-%d', errors='coerce').dt.year
df['release_month'] = pd.to_datetime(df.release_date).apply(lambda x: x.month)
df['release_day'] = pd.to_datetime(df.release_date).apply(lambda x: x.day)
df.info()
df.describe()
df = df[(df.vote_count >= 50) & (df.budget * df.revenue * df.popularity * df.vote_average != 0)].reset_index(
drop='True')
df
json_column = ['genres', 'keywords', 'production_companies', 'production_countries', 'cast', 'crew']
# 1-json本身为字符串类型,先转换为字典列表
for i in json_column:
df[i] = df[i].apply(json.loads)
# 提取name
# 2-将字典列表转换为以','分割的字符串
def get_name(x):
return ','.join([i['name'] for i in x])
df['cast'] = df['cast'].apply(get_name)
# 提取derector
def get_director(x):
for i in x:
if i['job'] == 'Director':
return i['name']
df['crew'] = df['crew'].apply(get_director)
for j in json_column[0:4]:
df[j] = df[j].apply(get_name)
# 重命名
rename_dict = {'cast': 'actor', 'crew': 'director'}
df.rename(columns=rename_dict, inplace=True)
df.info()
df.head(5)
# 备份原始数据框original_df
org_df = df.copy()
df.to_excel('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\result.xlsx', columns=["budget","genres","id","keywords","original_language","popularity","production_companies","production_countries","release_date","revenue","runtime","title","vote_average","vote_count","actor","director","release_year","release_month","release_day"], index=False,encoding='utf-8',sheet_name='Sheet1')
#df.to_excel('C:\\Users\\杜子轩\\Desktop\\王建民作业\\大数据竞赛练习题\\MathorCup大数据竞赛练习题1\\data\\result.xlsx')