Python pandas

1.得到指定行的索引值。

In [7]:  
dframe = pd.DataFrame({"A":list("abcde"), "B":list("fghij")}, index=[10,11,12,13,14])
#dframe
"""
Out[7]:
    A   B
10  a   f
11  b   g
12  c   h
13  d   i
14  e   j
"""
# 得到指定行的索引值
dframe.index[2]
#output: 12
#删除指定行
dframe.drop(dframe.index[2])
"""
Out[99]:
    A  B
10  a  f
11  b  g
13  d  i
14  e  j
"""

2.一些作业记录

Question 1:

import pandas as pd
import string
import re
energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...'])
energy1 = energy.drop([0])

for col in energy1.columns:
    if col[:7] == 'Unnamed':
        energy1.rename(columns={col:'Country'}, inplace=True)
    if col[-6:] == 'capita':
        energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True)
    if col[-10:] == 'Production':
        energy1.rename(columns={col:'% ' + col[:9]}, inplace=True)

#nergy1.reset_index()gy1.set_index('Country')
energy1 = energy1.set_index('Country')
#nergy1
#GDP = pd.read_csv('world_bank.csv', skiprows=4)
#GDP 

for row in energy1.index:
    if row[:17] == "Republic of Korea":
        energy1.rename(index = {row : "South Korea"}, inplace=True)
    if row[:24] == "United States of America":
        energy1.rename(index = {row : "United States"}, inplace=True)
    if row[:14] == "United Kingdom":
        energy1.rename(index = {row : "United Kingdom"}, inplace=True)
    if row[:16] ==  "China, Hong Kong":
        energy1.rename(index = {row : "Hong Kong"}, inplace=True)
for row in energy1.index:
    energy1.rename(index = {row : re.sub(u"\\(.*?\\)","",row)}, inplace=True)
for row in energy1.index:
    energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True)
for row in energy1.index:
    energy1.rename(index = {row : row.rstrip()}, inplace=True)

#====read_csv============
GDP = pd.read_csv('world_bank.csv', skiprows=4)
"""
"Korea, Rep.": "South Korea", 
"Iran, Islamic Rep.": "Iran",
"Hong Kong SAR, China": "Hong Kong"
"""
for col in GDP.columns:
    if col == 'Country Name':
        GDP.rename(columns = {col : "Country"},inplace=True)
GDP = GDP.set_index('Country')

for row in GDP.index:
    if row[:11]== "Korea, Rep." :         
        GDP.rename(index = {row : "South Korea"}, inplace=True)
    if row[:18]=="Iran, Islamic Rep.":
        GDP.rename(index = {row : "Iran"}, inplace=True)
    if row[:9] == 'Hong kong':
        GDP.rename(index = {row : "Hong Kong"}, inplace=True)
#===========read_excel======
ScimEn = pd.read_excel('scimagojr-3.xlsx')
ScimEn = ScimEn.set_index('Country')
#===========merge==========
df_merged = pd.merge(energy1, GDP, how='inner', left_index=True, right_index =True)
df_merged = pd.merge(ScimEn, df_merged, how = 'inner', left_index=True, right_index=True)
df_merged = df_merged.sort(['Rank'], ascending=True)

df_merged = df_merged[df_merged['Rank'] <= 15]

Question2：

def answer_two():
    energy = pd.read_excel('Energy Indicators.xls', usecols=[2,3,4,5], skiprows=16, skipfooter=38, na_values=['...'])
    energy1 = energy.drop([0])

    for col in energy1.columns:
        if col[:7] == 'Unnamed':
            energy1.rename(columns={col:'Country'}, inplace=True)
        if col[-6:] == 'capita':
            energy1.rename(columns={col:col[:-6] + 'Capita'}, inplace=True)
        if col[-10:] == 'Production':
            energy1.rename(columns={col:'% ' + col[:9]}, inplace=True)

#nergy1.reset_index()gy1.set_index('Country')
    energy1 = energy1.set_index('Country')
#nergy1
#GDP = pd.read_csv('world_bank.csv', skiprows=4)
#GDP 
    for row in energy1.index:
        if row[:17] == "Republic of Korea":
            energy1.rename(index = {row : "South Korea"}, inplace=True)
        if row[:24] == "United States of America":
            energy1.rename(index = {row : "United States"}, inplace=True)
        if row[:14] == "United Kingdom":
            energy1.rename(index = {row : "United Kingdom"}, inplace=True)
        if row[:16] ==  "China, Hong Kong":
            energy1.rename(index = {row : "Hong Kong"}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : re.sub(u"\\(.*?\\)","",row)}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : row.rstrip(string.digits)}, inplace=True)
    for row in energy1.index:
        energy1.rename(index = {row : row.rstrip()}, inplace=True)

#====read_csv============
    GDP = pd.read_csv('world_bank.csv', skiprows=4)

    for col in GDP.columns:
        if col == 'Country Name':
            GDP.rename(columns = {col : "Country"},inplace=True)
    GDP = GDP.set_index('Country')

    for row in GDP.index:
        if row[:11]== "Korea, Rep." :         
            GDP.rename(index = {row : "South Korea"}, inplace=True)
        if row[:18]=="Iran, Islamic Rep.":
            GDP.rename(index = {row : "Iran"}, inplace=True)
        if row[:9] == 'Hong kong':
            GDP.rename(index = {row : "Hong Kong"}, inplace=True)
#===========read_excel======
    ScimEn = pd.read_excel('scimagojr-3.xlsx')
    ScimEn = ScimEn.set_index('Country')
#===========merge==========
    df_merged = pd.merge(energy1, GDP, how='outer', left_index=True, right_index =True)
    df_merged = pd.merge(ScimEn, df_merged, how = 'outer', left_index=True, right_index=True)
    
    return len(df_merged.index.unique()) -15

View Code

Question 3:

def answer_three():
    Top15 = answer_one()
    #print(Top15.columns)
    cols = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',
       'Citations per document', 'H index', 'Energy Supply',
       'Energy Supply per Capita', '% Renewable', 'Country Code',
       'Indicator Name', 'Indicator Code']
    Top15_new = Top15.drop(cols, axis =1)
    #print(Top15_new.columns)
    Top15_new['avgGDP'] = Top15_new.mean(axis=1)
    Top15_new = Top15_new.sort(['avgGDP'], ascending=False)
    #Top15_new = Top15_new['avgGDP']
    result = pd.Series(Top15_new['avgGDP'])
    return result

Question 4:

def answer_four():
    Top15 = answer_one()
    result = (Top15.loc[['United Kingdom']]['2015'] -Top15.loc[['United Kingdom']]['2006'])/(Top15.loc[['United Kingdom']]['2006'])
    return result

Question 5:

def answer_five():
    Top15 = answer_one()
    mean = Top15['Energy Supply per Capita'].mean()
    
    return mean
print(answer_five())

Question 6:

def answer_six():
    """
    What country has the maximum % Renewable and what is the     percentage?

This function should return a tuple with the name of the country and the percentage
    """
    Top15 = answer_one()
    Top15 = Top15.reset_index()
    Top = Top15.loc[:, ['Country', '% Renewable']]
    Top = Top.set_index('Country') 
    return (Top.idxmax(), Top.max())

answer_six()

View Code

Question 7:

def answer_seven():
    Top15 = answer_one()
    Top = Top15['Self-citations']
    Top2 = Top15['Citations']
    Top15['ratio'] = Top.div(Top2)
    Top15.reset_index()
    Top_new = Top15.loc[:,['Country', 'ratio']]
    Top_new.set_index('Country')
    return (Top_new.idxmax(), Top_new.max())
answer_seven()

Question 8:

def answer_eight():
    Top15 = answer_one()
    Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
    Top15 = Top15.sort(['populations'], ascending=False)
    Top15 = Top15.loc[:, ['populations']]
    print(Top15)
    return Top15.index[2]
answer_eight()

Question 9: 出现错误： AttributeError: 'float' object has no attribute 'sqrt'

#出现错误
"""
AttributeError: 'float' object has no attribute 'sqrt'
"""

def answer_nine():
    Top15 = answer_one()
    Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
    Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations'])
    Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']]
    print(Top)
    corr = Top15['Docs per Capita'].corr(Top15['Energy Supply per Capita'])
    return corr
print(answer_nine())

修改后：

def answer_nine():
    Top15 = answer_one()
    Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
    Top15['Docs per Capita'] = Top15['Documents'].div(Top15['populations'])
    Top = Top15.loc[:, ['Docs per Capita', 'Energy Supply per Capita']]
    print(Top)
#修改后的方法    
    correlation=Top15['Docs per Capita'].astype('float64').corr(Top15['Energy Supply per Capita'].astype('float64'))
    return correlation
print(answer_nine())

Question 10:

def answer_ten():
    Top15 = answer_one()
    Top15 = Top15.sort(['% Renewable'], ascending = False)
    print(Top15)
    median = Top15['% Renewable'][7]
    Top15['HighRenew'] = Top15['% Renewable']
    Top15['HighRenew'] = Top15['HighRenew'].apply(lambda x : 1 if x >= median else 0)
    Top15 = Top15.sort(['Rank'], ascending=True) 
    print(Top15)
    return Top15['HighRenew']
answer_ten()

Question 11：

import numpy as np
def answer_eleven():
    Top15 = answer_one()
    ContinentDict  = {'China':'Asia', 
                  'United States':'North America', 
                  'Japan':'Asia', 
                  'United Kingdom':'Europe', 
                  'Russian Federation':'Europe', 
                  'Canada':'North America', 
                  'Germany':'Europe', 
                  'India':'Asia',
                  'France':'Europe', 
                  'South Korea':'Asia', 
                  'Italy':'Europe', 
                  'Spain':'Europe', 
                  'Iran':'Asia',
                  'Australia':'Australia', 
                  'Brazil':'South America'}
    Top15['Continent'] = Top15.index.to_series().map(ContinentDict)
    Top15 = Top15.reset_index()
    # Top15.set_index('Continent')
    Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita']).astype(float)
   #需要astype,不然用到mean出现numerical的错误
    df = Top15.loc[:, ['Continent', 'populations']]
    #df = df.set_index('Continent')
    #print(df.groupby(level=0)['populations'].agg({'size' : np.size, 'sum': np.sum, 'mean':np.mean, 'std':np.std}))
    df_size = (df.set_index('Continent').groupby(level=0)['populations'].agg({'size' : np.size}))
    df_sum = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'sum' : np.sum})))
    df_mean = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'mean' : np.mean})))
    df_std = ((df.set_index('Continent').groupby(level=0)['populations'].agg({'std' : np.std})))
    df.set_index('Continent')
    df_1 = pd.merge(df_size, df_sum, how='inner', left_index=True, right_index=True)
    df_2 = pd.merge(df_mean, df_std, how='inner', left_index=True, right_index=True)
    df = pd.merge(df_1, df_2, how='inner', left_index=True, right_index=True)
 
    return(df)
answer_eleven()

Question 12:

有agg和cut()的函数。

def answer_twelve():
    Top15 = answer_one()
    ContinentDict  = {'China':'Asia', 
                  'United States':'North America', 
                  'Japan':'Asia', 
                  'United Kingdom':'Europe', 
                  'Russian Federation':'Europe', 
                  'Canada':'North America', 
                  'Germany':'Europe', 
                  'India':'Asia',
                  'France':'Europe', 
                  'South Korea':'Asia', 
                  'Italy':'Europe', 
                  'Spain':'Europe', 
                  'Iran':'Asia',
                  'Australia':'Australia', 
                  'Brazil':'South America'}
    Top15['Continent'] = Top15.index.to_series().map(ContinentDict)
    Top15 = Top15.reset_index()
    Top15['bins'] = pd.cut(Top15['% Renewable'],5)
   
    df = Top15.loc[:, ['Country', 'Continent','bins' ]]
    print(df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size}))
    result = df.groupby(['Continent','bins'])['Country'].agg({'size' : np.size})
    
    return result
answer_twelve()

Question 13:

为数字添加千位符号：e.g. 317615384.61538464 -> 317,615,384.61538464

#数字添加千位符号

def answer_thirteen():
    Top15 = answer_one()
    Top15['populations'] = Top15['Energy Supply'].div(Top15['Energy Supply per Capita'])
    Top15['populations'] = Top15['populations'].map(lambda x : format(x, ','))
    Top15 = Top15.reset_index()
    result = Top15.loc[:,['Country', 'populations']]
    result = result.set_index('Country')
    return result
answer_thirteen()

posted @ 2018-06-28 23:03 卷积阅读(510) 评论(0) 收藏举报

刷新页面返回顶部

Python pandas

公告