条件过滤
----------------基于numpy
world_alcohol是numpy的array类型。输入:matrix; 输出:matrix
# Boolean vector corresponding to Canada and 1986.
canada_1986_boolean = (world_alcohol[:,2] == "Canada") & (world_alcohol[:,0] == "1986")
# We can then use canada_1986_boolean to subset a matrix -- it's just a normal boolean vector
print(world_alcohol[canada_1986_boolean,:])
---------------基于pandas
- pd.isnull() 输入:dataframe;输出:vector
# The dataframe is titanic_survival; its "age" column has null values (NaN) which need to be excluded from the calculation.
# age_null is a boolean vector that is True where age is NaN and False where it isn't.
age_null = pd.isnull(titanic_survival["age"])
# Then use this boolean vector to filter the age column.
survival_with_valid_age = titanic_survival["age"][age_null==False]
# Do the calculation.
correct_sum = sum(survival_with_valid_age)
correct_mean_age=correct_sum/len(survival_with_valid_age)
输入:dataframe;输出:vector
pclass_survival = titanic_survival["fare"][titanic_survival["pclass"]==2]
fare_for_class = pclass_survival.mean()
输入:dataframe;输出:dataframe
selected_table=table[table['Major_category']==major]
根据列空值过滤行 dropna()
输入:dataframe;输出:dataframe
# drop rows if certain columns have missing values
new_titanic_survival = titanic_survival.dropna(subset=["age", "body","home.dest"]).reset_index(drop=True)
输入:series;输出:series
#series
series_film=fandango['FILM']
series_rt=fandango['RottenTomatoes']
#use film names as index
series_custom = Series(series_rt.values,index= series_film.values)
#filter
criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]