# 观影数据集之大数据分析
import json as stdlib_json

import pandas as pd
# Kept so the module-level name `json` stays bound exactly as before for any
# other code in this file. The JSON parsing below uses the standard library
# instead, because `pandas.io.json.loads` is not exposed by recent pandas
# releases.
from pandas.io import json

# Runtime values (as used by the original script) for the two rows whose
# `runtime` is missing, keyed by row label in the merged frame.
MISSING_RUNTIMES = {2656: 94, 4140: 240}


def get_name(x):
    """Return the 'name' of every dict in *x*, joined with ','.

    *x* is a list of dicts that each carry a 'name' key (one decoded JSON
    cell). An empty list yields an empty string.
    """
    return ','.join(item['name'] for item in x)


def get_director(x):
    """Return the 'name' of the first entry in *x* whose 'job' is 'Director'.

    Returns None when no entry has that job.
    """
    for member in x:
        if member['job'] == 'Director':
            return member['name']
    return None


def fill_missing_runtimes(frame, runtime_by_label):
    """Fill missing `runtime` cells in place and return *frame*.

    Only the `runtime` cell of each listed row is touched, and only when it
    is currently missing. Assigning a number to the single cell keeps the
    column numeric; filling the whole row would also overwrite unrelated
    missing fields in that row.

    Raises KeyError if a label in *runtime_by_label* is not in the index.
    """
    for label, minutes in runtime_by_label.items():
        # Reading first raises KeyError for an unknown label instead of
        # silently appending a new row on assignment.
        if pd.isna(frame.at[label, 'runtime']):
            frame.at[label, 'runtime'] = minutes
    return frame


def add_release_date_parts(frame):
    """Add release_year / release_month / release_day columns in place.

    `release_date` is parsed once as '%Y-%m-%d'. Values that do not parse
    become missing in all three new columns rather than raising for some
    columns and not others. Returns *frame*.
    """
    parsed = pd.to_datetime(
        frame['release_date'], format='%Y-%m-%d', errors='coerce'
    )
    frame['release_year'] = parsed.dt.year
    frame['release_month'] = parsed.dt.month
    frame['release_day'] = parsed.dt.day
    return frame


# The pipeline runs when the file is executed as a script (or pasted into an
# interactive session); importing the module only defines the helpers above.
if __name__ == "__main__":
    movies = pd.read_csv('tmdb_5000_movies.csv', encoding='utf_8')
    credits = pd.read_csv('tmdb_5000_credits.csv', encoding='utf_8')

    # Inspect both tables.
    movies.info()
    credits.info()

    # Drop columns that would be duplicated after the merge.
    del credits['title']
    del movies['original_title']

    # Merge the two tables on the movie id.
    merged = pd.merge(
        movies, credits, left_on='id', right_on='movie_id', how='left'
    )

    # Drop fields that are not needed.
    df = merged.drop(
        ['homepage', 'overview', 'spoken_languages', 'status', 'tagline',
         'movie_id'],
        axis=1,
    )
    df.info()

    # Show the records with a missing release_date.
    var = df[df.release_date.isnull()]
    print(var.title)

    # Show the records with a missing runtime.
    var = df[df.runtime.isnull()]
    print(var.title)

    # Fill in the missing values.
    df['release_date'] = df['release_date'].fillna('2014-06-01')
    fill_missing_runtimes(df, MISSING_RUNTIMES)
    df.info()

    # Duplicate check: number of distinct movie ids.
    print(len(df.id.unique()))

    # Split the release date into year / month / day.
    add_release_date_parts(df)
    df.info()
    print(df['release_year'], df['release_month'], df['release_day'])

    # Keep movies with at least 50 votes and non-zero budget, revenue,
    # popularity and vote_average. Each column is checked on its own rather
    # than through a product, so large integers cannot overflow.
    nonzero_columns = ['budget', 'revenue', 'popularity', 'vote_average']
    keep = (df['vote_count'] >= 50) & (df[nonzero_columns] != 0).all(axis=1)
    df = df[keep].reset_index(drop=True)
    df.info()

    # JSON columns: stored as strings, decoded to lists of dicts first.
    json_column = ['genres', 'keywords', 'production_companies',
                   'production_countries', 'cast', 'crew']
    for column in json_column:
        df[column] = df[column].apply(stdlib_json.loads)

    # Reduce each decoded cell to plain text: all names for most columns,
    # only the director's name for `crew`.
    df['cast'] = df['cast'].apply(get_name)
    df['crew'] = df['crew'].apply(get_director)
    for column in json_column[0:4]:
        df[column] = df[column].apply(get_name)

    # Rename the reduced columns to say what they now hold.
    rename_dict = {'cast': 'actor', 'crew': 'director'}
    df.rename(columns=rename_dict, inplace=True)
    df.info()

    print(df.head(5).genres)
    print(df.head(5).keywords)
    print(df.head(5).production_companies)
    print(df.head(5).production_countries)
    print(df.head(5).actor)
    print(df.head(5).director)

    # Back up the cleaned data in memory and on disk.
    org_df = df.copy()
    # NOTE: reset_index() adds an "index" column and to_csv also writes the
    # row index, so the file carries the row number twice. Left unchanged so
    # existing readers of the CSV see the same columns.
    df.reset_index().to_csv("TMDB_5000_Movie_Dataset_Cleaned.csv")
# 以上代码一直执行到“数据备份”这一步为止。