Python pandas
1.得到指定行的索引值。
In [7]: dframe = pd.DataFrame({"A":list("abcde"), "B":list("fghij")}, index=[10,11,12,13,14]) #dframe """ Out[7]: A B 10 a f 11 b g 12 c h 13 d i 14 e j """ # 得到指定行的索引值 dframe.index[2] #output: 12 #删除指定行 dframe.drop(dframe.index[2]) """ Out[99]: A B 10 a f 11 b g 13 d i 14 e j """
2.一些作业记录
Question 1:
import pandas as pd import string import re energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...']) energy1 = energy.drop([0]) for col in energy1.columns: if col[:7] == 'Unnamed': energy1.rename(columns={col:'Country'}, inplace=True) if col[-6:] == 'capita': energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True) if col[-10:] == 'Production': energy1.rename(columns={col:'% ' + col[:9]}, inplace=True) #nergy1.reset_index()gy1.set_index('Country') energy1 = energy1.set_index('Country') #nergy1 #GDP = pd.read_csv('world_bank.csv', skiprows=4) #GDP for row in energy1.index: if row[:17] == "Republic of Korea": energy1.rename(index = {row : "South Korea"}, inplace=True) if row[:24] == "United States of America": energy1.rename(index = {row : "United States"}, inplace=True) if row[:14] == "United Kingdom": energy1.rename(index = {row : "United Kingdom"}, inplace=True) if row[:16] == "China, Hong Kong": energy1.rename(index = {row : "Hong Kong"}, inplace=True) for row in energy1.index: energy1.rename(index = {row : re.sub(u"\\(.*?\\)","",row)}, inplace=True) for row in energy1.index: energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True) for row in energy1.index: energy1.rename(index = {row : row.rstrip()}, inplace=True) #====read_csv============ GDP = pd.read_csv('world_bank.csv', skiprows=4) """ "Korea, Rep.": "South Korea", "Iran, Islamic Rep.": "Iran", "Hong Kong SAR, China": "Hong Kong" """ for col in GDP.columns: if col == 'Country Name': GDP.rename(columns = {col : "Country"},inplace=True) GDP = GDP.set_index('Country') for row in GDP.index: if row[:11]== "Korea, Rep." : GDP.rename(index = {row : "South Korea"}, inplace=True) if row[:18]=="Iran, Islamic Rep.": GDP.rename(index = {row : "Iran"}, inplace=True) if row[:9] == 'Hong kong': GDP.rename(index = {row : "Hong Kong"}, inplace=True) #===========read_excel====== ScimEn = pd.read_excel('scimagojr-3.xlsx') ScimEn = ScimEn.set_index('Country') #===========merge========== df_merged = pd.merge(energy1, GDP, how='inner', left_index=True, right_index =True) df_merged = pd.merge(ScimEn, df_merged, how = 'inner', left_index=True, right_index=True) df_merged = df_merged.sort(['Rank'], ascending=True) df_merged = df_merged[df_merged['Rank'] <= 15]
Question2:
def answer_two(): energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...']) energy1 = energy.drop([0]) for col in energy1.columns: if col[:7] == 'Unnamed': energy1.rename(columns={col:'Country'}, inplace=True) if col[-6:] == 'capita': energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True) if col[-10:] == 'Production': energy1.rename(columns={col:'% ' + col[:9]}, inplace=True) #nergy1.reset_index()gy1.set_index('Country') energy1 = energy1.set_index('Country') #nergy1 #GDP = pd.read_csv('world_bank.csv', skiprows=4) #GDP for row in energy1.index: if row[:17] == "Republic of Korea": energy1.rename(index = {row : "South Korea"}, inplace=True) if row[:24] == "United States of America": energy1.rename(index = {row : "United States"}, inplace=True) if row[:14] == "United Kingdom": energy1.rename(index = {row : "United Kingdom"}, inplace=True) if row[:16] == "China, Hong Kong": energy1.rename(index = {row : "Hong Kong"}, inplace=True) for row in energy1.index: energy1.rename(index = {row : re.sub(u"\\(.*?\\)","",row)}, inplace=True) for row in energy1.index: energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True) for row in energy1.index: energy1.rename(index = {row : row.rstrip()}, inplace=True) #====read_csv============ GDP = pd.read_csv('world_bank.csv', skiprows=4) for col in GDP.columns: if col == 'Country Name': GDP.rename(columns = {col : "Country"},inplace=True) GDP = GDP.set_index('Country') for row in GDP.index: if row[:11]== "Korea, Rep." : GDP.rename(index = {row : "South Korea"}, inplace=True) if row[:18]=="Iran, Islamic Rep.": GDP.rename(index = {row : "Iran"}, inplace=True) if row[:9] == 'Hong kong': GDP.rename(index = {row : "Hong Kong"}, inplace=True) #===========read_excel====== ScimEn = pd.read_excel('scimagojr-3.xlsx') ScimEn = ScimEn.set_index('Country') #===========merge========== df_merged = pd.merge(energy1, GDP, how='outer', left_index=True, right_index =True) df_merged = pd.merge(ScimEn, df_merged, how = 'outer', left_index=True, right_index=True) return len(df_merged.index.unique()) -15
Question 3:
def answer_three(): Top15 = answer_one() #print(Top15.columns) cols = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations', 'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita', '% Renewable', 'Country Code', 'Indicator Name', 'Indicator Code'] Top15_new = Top15.drop(cols, axis =1) #print(Top15_new.columns) Top15_new['avgGDP'] = Top15_new.mean(axis=1) Top15_new = Top15_new.sort(['avgGDP'], ascending=False) #Top15_new = Top15_new['avgGDP'] result = pd.Series(Top15_new['avgGDP']) return result
Question 4:
def answer_four(): Top15 = answer_one() result = (Top15.loc[['United Kingdom']]['2015'] -Top15.loc[['United Kingdom']]['2006'])/(Top15.loc[['United Kingdom']]['2006']) return result
Question 5:
def answer_five(): Top15 = answer_one() mean = Top15['Energy Supply per Capita'].mean() return mean print(answer_five())
Question 6:
def answer_six(): """ What country has the maximum % Renewable and what is the percentage? This function should return a tuple with the name of the country and the percentage """ Top15 = answer_one() Top15 = Top15.reset_index() Top = Top15.loc[:, ['Country', '% Renewable']] Top = Top.set_index('Country') return (Top.idxmax(), Top.max()) answer_six()
Question 7:
def answer_seven(): Top15 = answer_one() Top = Top15['Self-citations'] Top2 = Top15['Citations'] Top15['ratio'] = Top.div(Top2) Top15.reset_index() Top_new = Top15.loc[:,['Country', 'ratio']] Top_new.set_index('Country') return (Top_new.idxmax(), Top_new.max()) answer_seven()
Question 8:
def answer_eight(): Top15 = answer_one() Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']) Top15 = Top15.sort(['populations'], ascending=False) Top15 = Top15.loc[:, ['populations']] print(Top15) return Top15.index[2] answer_eight()
Question 9: 出现错误: AttributeError: 'float' object has no attribute 'sqrt'
#出现错误 """ AttributeError: 'float' object has no attribute 'sqrt' """ def answer_nine(): Top15 = answer_one() Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']) Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations']) Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']] print(Top) corr = Top15['Docs per Capita'].corr(Top15['Energy Supply per Capita']) return corr print(answer_nine())
修改后:
def answer_nine(): Top15 = answer_one() Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']) Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations']) Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']] print(Top) #修改后的方法 correlation=Top15['Docs per Capita'].astype('float64').corr(Top15['Energy Supply per Capita'].astype('float64')) return correlation print(answer_nine())
Question 10:
def answer_ten(): Top15 = answer_one() Top15 = Top15.sort(['% Renewable'], ascending = False) print(Top15) median = Top15['% Renewable'][7] Top15['HighRenew'] = Top15['% Renewable'] Top15['HighRenew'] = Top15['HighRenew'].apply(lambda x : 1 if x >= median else 0) Top15 = Top15.sort(['Rank'], ascending=True) print(Top15) return Top15['HighRenew'] answer_ten()
Question 11:
import numpy as np def answer_eleven(): Top15 = answer_one() ContinentDict = {'China':'Asia', 'United States':'North America', 'Japan':'Asia', 'United Kingdom':'Europe', 'Russian Federation':'Europe', 'Canada':'North America', 'Germany':'Europe', 'India':'Asia', 'France':'Europe', 'South Korea':'Asia', 'Italy':'Europe', 'Spain':'Europe', 'Iran':'Asia', 'Australia':'Australia', 'Brazil':'South America'} Top15['Continent'] = Top15.index.to_series().map(ContinentDict) Top15 = Top15.reset_index() # Top15.set_index('Continent') Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']).astype(float) #需要astype,不然用到mean出现numerical的错误 df = Top15.loc[:, ['Continent', 'populations']] #df = df.set_index('Continent') #print(df.groupby(level=0)['populations'].agg({'size' : np.size, 'sum': np.sum, 'mean':np.mean, 'std':np.std})) df_size = (df.set_index('Continent').groupby(level=0)['populations'].agg({'size' : np.size})) df_sum = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'sum' : np.sum}))) df_mean = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'mean' : np.mean}))) df_std = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'std' : np.std}))) df.set_index('Continent') df_1 = pd.merge(df_size, df_sum, how='inner', left_index=True, right_index=True) df_2 = pd.merge(df_mean, df_std, how='inner', left_index=True, right_index=True) df = pd.merge(df_1, df_2, how='inner', left_index=True, right_index=True) return(df) answer_eleven()
Question 12:
有agg和cut()的函数。
def answer_twelve(): Top15 = answer_one() ContinentDict = {'China':'Asia', 'United States':'North America', 'Japan':'Asia', 'United Kingdom':'Europe', 'Russian Federation':'Europe', 'Canada':'North America', 'Germany':'Europe', 'India':'Asia', 'France':'Europe', 'South Korea':'Asia', 'Italy':'Europe', 'Spain':'Europe', 'Iran':'Asia', 'Australia':'Australia', 'Brazil':'South America'} Top15['Continent'] = Top15.index.to_series().map(ContinentDict) Top15 = Top15.reset_index() Top15['bins'] = pd.cut(Top15['% Renewable'],5) df = Top15.loc[:, ['Country', 'Continent','bins' ]] print(df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size})) result = df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size}) return result answer_twelve()
Question 13:
为数字添加千位符号:e.g. 317615384.61538464 -> 317,615,384.61538464
#数字添加千位符号 def answer_thirteen(): Top15 = answer_one() Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']) Top15['populations'] = Top15['populations'].map(lambda x : format(x, ',')) Top15 = Top15.reset_index() result = Top15.loc[:,['Country', 'populations']] result = result.set_index('Country') return result answer_thirteen()
The Safest Way to Get what you Want is to Try and Deserve What you Want.