Pandas

1. Getting Started

import pandas as pd #import pandas under its conventional alias
pd.DataFrame({'Yes': [50, 21], 'No': [131, 2]}) #create a table from a dict of columns
#assign the row labels with the index argument
pd.DataFrame({'Bob': ['I liked it.', 'It was awful.'], 
              'Sue': ['Pretty good.', 'Bland.']},
             index=['Product A', 'Product B'])

wine_reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv") #read csv into a DataFrame
wine_reviews.shape # check how large the resulting DataFrame is
wine_reviews.head() # grabs the first five rows
#use the first CSV column as the index (row labels)
wine_reviews = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv", index_col=0)
wine_reviews.to_csv("wine_reviews.csv") #save this DataFrame as a csv file

2. Indexing, Selecting & Assigning

reviews['country']  or reviews.country #access a column by key or as an attribute
reviews['country'][0] #index the column like a dictionary to get a single value

#Index-based selection
reviews.iloc[0] #select the first row
reviews.iloc[:, 0] #select the first column
reviews.iloc[1:3, 0] or reviews.iloc[[1, 2], 0] #select just the second and third entries of the first column

#Label-based selection (loc slices include the end label)
reviews.loc[0, 'country']
reviews.loc[:, ['taster_name', 'taster_twitter_handle', 'points']] #select multiple columns
reviews.set_index("title") #use the title column as the row labels

#Conditional selection
reviews.loc[reviews.country == 'Italy'] 
reviews.loc[(reviews.country == 'Italy') & (reviews.points >= 90)] #combine conditions with &
reviews.loc[reviews.country.isin(['Italy', 'France'])] #match any value in a list
reviews.loc[reviews.price.notnull()] #keep rows whose price is not null

#Assigning data
reviews['critic'] = 'everyone' #assign the same value to every row
reviews['index_backwards'] = range(len(reviews), 0, -1) #assign from an iterable of values
counts = pd.Series([n_trop, n_fruity], index=['tropical', 'fruity']) #create a Series with explicit index labels

3. Summary Functions and Maps

#Summary Functions
reviews.points.describe() # generates a high-level summary of the column
reviews.points.mean()
reviews.taster_name.unique() #list the distinct values
reviews.taster_name.value_counts() #count how often each value occurs

#Maps (return new objects; the original data is not modified)
review_points_mean = reviews.points.mean() #get the mean value
reviews.points.map(lambda p: p - review_points_mean) #subtract the mean from every points value
n_trop = reviews.description.map(lambda desc: "tropical" in desc).sum() #count descriptions containing "tropical"
reviews.points - reviews.points.mean() #same re-centering written with a plain arithmetic operator

#Row-wise alternative to map(); remean_points must be defined before it is passed to apply()
def remean_points(row):
    """Subtract review_points_mean from row.points and return the row."""
    row.points = row.points - review_points_mean
    return row

reviews.apply(remean_points, axis='columns') #other method to do the same thing

reviews.country + " - " + reviews.region_1 #combining country and region information

4. Grouping and Sorting

#Group
reviews.groupby('points').points.count() #count the reviews at each points value
reviews.groupby('points').price.min()  #get the cheapest wine in each point value category
reviews.groupby('winery').apply(lambda df: df.title.iloc[0]) #first title within each winery group
reviews.groupby(['country']).price.agg([len, min, max]) #run a bunch of different functions simultaneously
countries_reviewed = reviews.groupby(['country', 'province']).description.agg([len]) #grouping by two columns gives a multi-index
countries_reviewed.reset_index() #convert back to a regular index

#Sort
countries_reviewed.sort_values(by=['len','country'], ascending=False) #sort by len, then country, in descending order

5. Data Types and Missing Values

reviews[pd.isnull(reviews.country)] #select rows where country is NaN
reviews.loc[reviews.price.notnull()] #select rows where price is not null
reviews.region_2.fillna("Unknown") #replace missing values with "Unknown"
reviews.taster_twitter_handle.replace("@kerinokeefe", "@kerino") #replace one value with another

6. Renaming and Combining

reviews.rename(columns={'points': 'score'}) #change the points column to score
reviews.rename(index={0: 'firstEntry', 1: 'secondEntry'}) #change the row labels
reviews.rename(columns=dict(region_1='region', region_2='locale')) #rename several columns at once
reindexed = reviews.rename_axis('wines', axis='rows') #name the row axis 'wines'

#combine
pd.concat([canadian_youtube, british_youtube]) #stack DataFrames that have the same columns
left.join(right, lsuffix='_CAN', rsuffix='_UK') #suffixes tell apart overlapping column names
posted @ 2022-06-04 21:58  失控D大白兔  阅读(35)  评论(0编辑  收藏  举报