三、: python数据分析处理库-Pandas
本节内容:
- Pandas数据读取
- Pandas索引与计算
- Pandas数据预处理实例
- Pandas常用预处理方法
- Pandas自定义函数
- Series结构
1、Pandas数据读取
import pandas food_info = pandas.read_csv("food_info.csv")#读取一个以逗号分隔开的文件 #print(type(food_info)) #<class 'pandas.core.frame.DataFrame'> print(food_info.dtypes)
print(help(pandas.read_csv))
first_rows = food_info.head()#不加参数默认显示前5条数据,指定参数后可根据参数进行显示
print(first_rows )
print(food_info.head(3))#显示前3条数据
print(food_info.tail(3))#显示后3条数据
print(food_info.columns)#显示列名
print(food_info.shape)#多少行多少列
2、Pandas索引与计算
索引:
#要执行下列程序必先执行上面的程序
#pandas uses zero-indexing #Series object representing the row at index 0. print(food_info.loc[0])#通过调用loc这个函数来进行索引,取出第一行的元素 # Series object representing the seventh row. print(food_info.loc[6])#通过调用loc这个函数来进行索引,取出第7行的元素 # Will throw an error: "KeyError: 'the label [8620] is not in the [index]'" print(food_info.loc[8620])#当行数超出时会报错(8618, 36)
#The object dtype is equivalent to a string in Python
#object - For string values #int - For integer values #float - For float values #datetime - For time values #bool - For Boolean values print(food_info.dtypes)#看每个元素的数据类型
# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6. food_info.loc[3:6] # Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work. # Method 1 two_five_ten = [2,5,10] food_info.loc[two_five_ten] # Method 2 food_info.loc[[2,5,10]]#返回一个包含了2,5,10行的数据框
# Series object representing the "NDB_No" column. # ndb_col = food_info["NDB_No"]#根据"NDB_No"这个列名来打印此列 # print(ndb_col) # Alternatively, you can access a column by passing in a string variable. col_name = "NDB_No" ndb_col = food_info[col_name] print(ndb_col)
import pandas food_info = pandas.read_csv("food_info.csv") # columns = ["Zinc_(mg)", "Copper_(mg)"]#指定要拿的两个列,存在一个列表中 # zinc_copper = food_info[columns] # print(zinc_copper) zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]] print(zinc_copper)
#print(food_info.columns) #print(food_info.head(2)) col_names = food_info.columns.tolist() #把所有的列名取出来并放在一个列表中 print(col_names) gram_columns = [] #构造一个空列表 for c in col_names: #对存放了列名的列表进行遍历 if c.endswith("(g)"): #对每一个col_names中的列名进行查看,如果是以"(g)"结尾的列名,则把它加到gram_columns列表里 gram_columns.append(c) print(gram_columns) gram_df = food_info[gram_columns] #根据列名,输出对应的每一列 print(gram_df.head(3))#只输出前三行
加减乘除计算:
print(food_info["Iron_(mg)"])#打印列名为"Iron_(mg)"的这一列 div_1000 = food_info["Iron_(mg)"] / 1000 #将这一列的值都除以1000 print(div_1000) # Adds 100 to each value in the column and returns a Series object. add_100 = food_info["Iron_(mg)"] + 100 print(add_100 ) # Subtracts 100 from each value in the column and returns a Series object. sub_100 = food_info["Iron_(mg)"] - 100 print(sub_100 ) # Multiplies each value in the column by 2 and returns a Series object. mult_2 = food_info["Iron_(mg)"]*2 print(mult_2 )
#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]#对应位置上的两个数相乘
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
print(weighted_protein,weighted_fat,initial_rating)
# the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79 #For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result, #due to the scale of the values # The largest value in the "Energ_Kcal" column. max_calories = food_info["Energ_Kcal"].max() #取列名为"Energ_Kcal"这一列最大的数 # Divide the values in "Energ_Kcal" by the largest value. normalized_calories = food_info["Energ_Kcal"] / max_calories normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max() normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max() food_info["Normalized_Protein"] = normalized_protein #在food_info中加入新的列,列名为"Normalized_Protein" food_info["Normalized_Fat"] = normalized_fat
print(food_info.shape) #打印food_info.csv的行列数,(8618, 36) iron_grams = food_info["Iron_(mg)"] / 1000 food_info["Iron_(g)"] = iron_grams #在food_info中加入新的列,列名为Iron_(g) print(food_info.shape) #(8618, 37)
#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame # Sorts the DataFrame in-place, rather than returning a new DataFrame.对DataFrame进行就地排序,而不是返回新的DataFrame。 print(food_info["Sodium_(mg)"]) food_info.sort_values("Sodium_(mg)", inplace=True)#默认对"Sodium_(mg)"这一列从小到大进行排序 print(food_info["Sodium_(mg)"]) #Sorts by descending order, rather than ascending.按降序排序,而不是升序排序。 food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False) print(food_info["Sodium_(mg)"])
3、Pandas数据预处理实例
泰坦尼克号:
把数据集读取进来
import pandas as pd import numpy as np titanic_survival = pd.read_csv("titanic_train.csv") titanic_survival.head()
一些数据的处理:
#The Pandas library uses NaN(缺失值), which stands for "not a number", to indicate a missing value. #we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values age = titanic_survival["Age"] #把"Age"这一列的值放在age这个列表中 print(age.loc[0:10]) #看一下age这一列的前11个值 age_is_null = pd.isnull(age) #判断age中那些样本的值是空的(缺失) print(age_is_null) #返回一系列真值和假值 age_null_true = age[age_is_null] #把返回ture的这个位置拿出来做索引,找出这些空缺值 print(age_null_true) age_null_count = len(age_null_true) #看空缺值的个数 print(age_null_count)
#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"]) print(mean_age) #nan 因为空缺值的存在导致无法计算均值
把空缺值去掉:(并不需要把所有的空缺值去掉,有时候会用中位数,均值来替换空缺值以保证数据的完整)
#we have to filter out the missing values before we calculate the mean.在计算平均值之前,我们必须过滤掉遗漏的值。 good_ages = titanic_survival["Age"][age_is_null == False] #把"Age"中不是空缺的值拿出来放在新的列表“good_ages”中 #print good_ages correct_mean_age = sum(good_ages) / len(good_ages) #计算新列表的均值 print(correct_mean_age) #29.6991176471
# missing data is so common that many pandas methods automatically filter for it丢失的数据是如此常见,以至于许多熊猫方法会自动为其过滤 correct_mean_age = titanic_survival["Age"].mean() #通过调用.mean()自动过滤空缺值,再进行求均值 print(correct_mean_age) #29.69911764705882
#mean fare for each class 每个船舱等级(1,2,3)的平均票价 passenger_classes = [1, 2, 3] fares_by_class = {} for this_class in passenger_classes: #对船舱等级进行遍历 pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class] #先取出1等级的数据,再取2等级的数据,最后取三等级的数据 pclass_fares = pclass_rows["Fare"] #把1等级的船费票价取出来放pclass_fares,然后再是2...,3 fare_for_class = pclass_fares.mean() #求1等级的船费票价的均值 fares_by_class[this_class] = fare_for_class #求1,2,3等级的平均票价放在fares_by_class这个字典里面。 print(fares_by_class) #{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
.pivot_table()
#index tells the method which column to group by index告诉方法按哪个列分组 #values is the column that we want to apply the calculation to 值是我们要应用计算的列 #aggfunc specifies the calculation we want to perform aggfunc指定我们要执行的计算 passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean) #.pivot_table()统计一个量与其他量之间的关系 print(passenger_survival) ### 看船舱等级与获救乘客之间有什么联系,1等舱平均获救多少人,2等舱平均获救多少人,3等舱平均获救多少人 Pclass 1 0.629630 2 0.472826 3 0.242363 Name: Survived, dtype: float64 船舱等级越高与获救率越高 ###
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age") #未指定 aggfunc,默认按照求均值的方法去计算 print(passenger_age) ### 看船舱等级与年龄之间的一个联系 Pclass 1 38.233441 2 29.877630 3 25.140620 Name: Age, dtype: float64
船舱等级越高坐着年龄越大的人 ###
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum) #看一个量与另外两个量之间的关系 print(port_stats) ### 看登船地点C、Q、S与票价收费总额和获救总人数的关系 Fare Survived Embarked C 10072.2962 93 Q 1022.2543 30 S 17439.3988 217 ###
.dropna()函数
#specifying axis=1 or axis='columns' will drop any columns that have null values指定axis=1或axis='columns'将删除任何具有null值的列 drop_na_columns = titanic_survival.dropna(axis=1) #删除任何具有null值的列 new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])#将"Age","Sex"这两列的中有null的行删除 print(new_titanic_survival)
具体定位到一个值:
row_index_83_age = titanic_survival.loc[83,"Age"] #定位带编号为83的样本,特征量为"Age"的这一个值 row_index_1000_pclass = titanic_survival.loc[766,"Pclass"] print(row_index_83_age) #28.0 print(row_index_1000_pclass) #1
.reset_index(drop=True)
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)#对"Age"这一列进行降序排列 print(new_titanic_survival[0:10]) titanic_reindexed = new_titanic_survival.reset_index(drop=True) #对索引值进行重新排序 drop=True:原来的不要了,形成新的值 print(titanic_reindexed.iloc[0:10])
4、自定义函数并应用:.apply()
# This function returns the hundredth item from a series 这个函数返回系列中的第一百项 def hundredth_row(column): # Extract the hundredth item 提取第一百项 hundredth_item = column.iloc[99] return hundredth_item # Return the hundredth item from each column hundredth_row = titanic_survival.apply(hundredth_row) print(hundredth_row)
def not_null_count(column): column_null = pd.isnull(column)#查看列值是否空缺,将真假值返回 null = column[column_null]#用真值做索引,取出空缺值放到null中 return len(null) column_null_count = titanic_survival.apply(not_null_count)#通过.apply()调用自定义得函数 print(column_null_count )
#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns. #通过传入axis=1参数,我们可以使用DataFrame.apply()方法遍历行而不是列。 def which_class(row): #通过自定义的which_class函数对船舱等级进行数据类型转换 pclass = row['Pclass'] if pd.isnull(pclass): return "Unknown" elif pclass == 1: return "First Class" elif pclass == 2: return "Second Class" elif pclass == 3: return "Third Class" classes = titanic_survival.apply(which_class, axis=1) print(classes)
def is_minor(row): if row["Age"] < 18: return True else: return False minors = titanic_survival.apply(is_minor, axis=1) #print(minors) def generate_age_label(row): age = row["Age"] if pd.isnull(age): return "unknown" elif age < 18: return "minor" else: return "adult" age_labels = titanic_survival.apply(generate_age_label, axis=1) print(age_labels) #把连续值转换为离散值
titanic_survival['age_labels'] = age_labels age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived") print(age_group_survival ) ### age_labels adult 0.381032 minor 0.539823 unknown 0.293785 Name: Survived, dtype: float64 ###
5、Series结构
#Series (collection of values) 一行或一列 #DataFrame (collection of Series objects) #Panel (collection of DataFrame objects)
#A Series object can hold many data types, including #float - for representing float values #int - for representing integer values #bool - for representing Boolean values #datetime64[ns] - for representing date & time, without time-zone #datetime64[ns, tz] - for representing date & time, with time-zone #timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.) #category - for representing categorical values #object - for representing String values #FILM - film name #RottenTomatoes - Rotten Tomatoes critics average score #RottenTomatoes_User - Rotten Tomatoes user average score #RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system) #RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system) #Metacritic - Metacritic critics average score #Metacritic_User - Metacritic user average score
import pandas as pd fandango = pd.read_csv('fandango_score_comparison.csv') series_film = fandango['FILM'] print(series_film[0:5]) series_rt = fandango['RottenTomatoes'] print (series_rt[0:5]) ### 0 Avengers: Age of Ultron (2015) 1 Cinderella (2015) 2 Ant-Man (2015) 3 Do You Believe? (2015) 4 Hot Tub Time Machine 2 (2015) Name: FILM, dtype: object 0 74 1 85 2 80 3 18 4 14 Name: RottenTomatoes, dtype: int64 ###
# Import the Series object from pandas from pandas import Series film_names = series_film.values #通过.values 取film这一列的值 print(type(film_names)) #<class 'numpy.ndarray'>,可以证明pandas是在numpy上进行封装 # print(film_names) print(len(film_names)) #146 rt_scores = series_rt.values # print(rt_scores) print(len(rt_scores)) #146 series_custom = Series(rt_scores , index=film_names) #可用‘str’型来做索引,给出对应评分 series_custom[['Minions (2015)', 'Leviathan (2014)']] ### Minions (2015) 54 Leviathan (2014) 99 dtype: int64 ###
# int index is also aviable series_custom = Series(rt_scores , index=film_names) series_custom[['Minions (2015)', 'Leviathan (2014)']] fiveten = series_custom[5:10] print(fiveten) ### The Water Diviner (2015) 63 Irrational Man (2015) 42 Top Five (2014) 86 Shaun the Sheep Movie (2015) 99 Love & Mercy (2015) 89 dtype: int64 ###
original_index = series_custom.index.tolist() print(type(original_index)) #<class 'list'> print(original_index) sorted_index = sorted(original_index) sorted_by_index = series_custom.reindex(sorted_index) print(sorted_by_index)#按字母进行排序
sc2 = series_custom.sort_index() sc3 = series_custom.sort_values() print(sc2[0:10]) print(sc3[0:10])
#The values in a Series object are treated as an ndarray, the core data type in NumPy import numpy as np # Add each value with each other print(np.add(series_custom, series_custom) ) # Apply sine function to each value np.sin(series_custom) # Return the highest value (will return a single value not a Series) np.max(series_custom)
#will actually return a Series object with a boolean value for each film series_custom > 50 series_greater_than_50 = series_custom[series_custom > 50] criteria_one = series_custom > 50 criteria_two = series_custom < 75 both_criteria = series_custom[criteria_one & criteria_two] print(both_criteria)
#data alignment same index rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM']) rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM']) rt_mean = (rt_critics + rt_users)/2#求两个媒体评分的均值 print(rt_mean)
import pandas as pd #will return a new DataFrame that is indexed by the values in the specified column #and will drop that column from the DataFrame #without the FILM column dropped fandango = pd.read_csv('fandango_score_comparison.csv') print type(fandango) fandango_films = fandango.set_index('FILM', drop=False) #print(fandango_films.index)
# Slice using either bracket notation or loc[] fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"] fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"] # Specific movie fandango_films.loc['Kumiko, The Treasure Hunter (2015)'] # Selecting list of movies movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)'] fandango_films.loc[movies] #When selecting multiple rows, a DataFrame is returned, #but when selecting an individual row, a Series object is returned instead
#The apply() method in Pandas allows us to specify Python logic #The apply() method requires you to pass in a vectorized operation #that can be applied over each Series object. import numpy as np # returns the data types as a Series types = fandango_films.dtypes #print types # filter data types to just floats, index attributes returns just column names float_columns = types[types.values == 'float64'].index # use bracket notation to filter columns to just float columns float_df = fandango_films[float_columns] #print float_df # `x` is a Series object representing a column deviations = float_df.apply(lambda x: np.std(x)) print(deviations)
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']] rt_mt_user.apply(lambda x: np.std(x), axis=1)
浙公网安备 33010602011771号