1. Getting Started
import pandas as pd # import the pandas library under its conventional alias
pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]}) # create a table: each dict key becomes a column
# assign the row labels through the index argument
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'],
'Sue': ['Pretty good.', 'Bland.']},
index=['Product A', 'Product B'])
wine_reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv") # read a csv file into a DataFrame
wine_reviews.shape # (rows, columns) tuple: check how large the resulting DataFrame is
wine_reviews.head() # grab the first five rows
# make pandas use the first csv column as the index (row labels) instead of generating a new one
wine_reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv", index_col=0)
wine_reviews.to_csv("wine_reviews.csv") # save this DataFrame as a csv file
2. Indexing, Selecting & Assigning
# Two equivalent ways to access a column. They are separate statements
# because `a or b` between two Series raises ValueError (the truth value
# of a Series is ambiguous).
reviews['country'] # dictionary-style indexing; works for any column name
reviews.country # attribute access; only for names that are valid identifiers
reviews['country'][0] # index the resulting Series like a dictionary
# Index-based selection (by position)
reviews.iloc[0] # select the first row
reviews.iloc[:, 0] # select the first column
# Two equivalent ways to select the second and third entries of the first column
reviews.iloc[1:3, 0] # slice: the end position 3 is excluded
reviews.iloc[[1, 2], 0] # explicit list of positions
# Label-based selection (unlike iloc, loc slices include the end label)
reviews.loc[0, 'country'] # value at row label 0, column 'country'
reviews.loc[:, ['taster_name', 'taster_twitter_handle', 'points']] # select multiple columns
reviews.set_index("title")# use the title column as the row labels (returns a new DataFrame)
# Conditional selection
reviews.loc[reviews.country == 'Italy']
reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)] # both conditions; each one needs its own parentheses
reviews.loc[reviews.country.isin(['Italy', 'France'])] # country is one of the listed values
reviews.loc[reviews.price.notnull()] # rows whose price is not missing
# Assigning data
reviews['critic'] = 'everyone' # assign the same value to every row
reviews['index_backwards'] = range(len(reviews), 0, -1) # assign an iterable of values, counting down from len(reviews) to 1
# create a Series with custom labels
# NOTE(review): n_trop is computed further down in these notes; n_fruity is not defined in the visible part — confirm
counts = pd.Series([n_trop, n_fruity], index=['tropical', 'fruity'])
3. Summary Functions and Maps
# Summary functions
reviews.points.describe() # generate a high-level summary of the column
reviews.points.mean()
reviews.taster_name.unique() # list the distinct values
reviews.taster_name.value_counts() # count how often each distinct value occurs
# Maps (they return new objects and don't modify the original data)
review_points_mean = reviews.points.mean() # mean score, reused by the examples below
reviews.points.map(lambda p: p - review_points_mean) # subtract the mean from every score; returns a new Series
n_trop = reviews.description.map(lambda desc: "tropical" in desc).sum() # number of descriptions that contain "tropical"
reviews.points - reviews.points.mean() # same re-centering with a plain operator (was `review`, an undefined name)
def remean_points(row):
    """Return `row` with its points value shifted by the mean score.

    Intended to be passed to `DataFrame.apply(..., axis='columns')`, which
    calls it once per row. Reads the module-level `review_points_mean`.
    """
    row.points = row.points - review_points_mean
    return row


# Other method to do the same thing as the map, one row at a time.
# The call comes after the definition so the name exists when it runs.
reviews.apply(remean_points, axis='columns')
reviews.country + " - " + reviews.region_1 # combine country and region element-wise into one string per row
4. Grouping and Sorting
# Group
reviews.groupby('points').points.count() # group by score and count the entries in each group
reviews.groupby('points').price.min() # get the cheapest wine in each point value category
reviews.groupby('winery').apply(lambda df: df.title.iloc[0]) # group by winery and take the first title of each group
# run several aggregation functions at once; the result has one column per function
# NOTE(review): recent pandas versions may warn about the builtins min/max here; the strings 'min'/'max' are the usual alternative — confirm
reviews.groupby(['country']).price.agg([len, min, max])
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len]) # grouping by two columns yields a multi-index
countries_reviewed.reset_index() # turn the index levels back into ordinary columns (returns a new DataFrame)
# Sort
countries_reviewed.sort_values(by=['len','country'], ascending=False) # sort descending by count, then by country
5. Data Types and Missing Values
reviews[pd.isnull(reviews.country)] # select the rows whose country is NaN
reviews.loc[reviews.price.notnull()] # select the rows whose price is not null
reviews.region_2.fillna("Unknown") # replace missing values; returns a new Series
reviews.taster_twitter_handle.replace("@kerinokeefe", "@kerino") # replace matching values; returns a new Series, the original is unchanged
6. Renaming and Combining
reviews.rename(columns={'points': 'score'})# change the points column to score
reviews.rename(index={0: 'firstEntry', 1: 'secondEntry'}) # change the row labels 0 and 1
reviews.rename(columns=dict(region_1='region', region_2='locale')) # rename several columns at once
reindexed = reviews.rename_axis('wines', axis='rows') # give the row axis (index) the name 'wines'
# Combine
pd.concat([canadian_youtube, british_youtube]) # stack DataFrames that have the same columns
left.join(right, lsuffix='_CAN', rsuffix='_UK') # join on the index; the suffixes disambiguate overlapping column names