Learning notes | Data Analysis: 1.2 data wrangling


| Data Wrangling |

# Sort all the data into one file 

files = ['BeijingPM20100101_20151231.csv','ChengduPM20100101_20151231.csv','GuangzhouPM20100101_20151231.csv','ShanghaiPM20100101_20151231.csv','ShenyangPM20100101_20151231.csv']
out_columns = ['No', 'year', 'month', 'day', 'hour', 'season', 'PM_US Post']

 

# Create a void dataframe

df_all_cities = pd.DataFrame()

 

# Iterate to write diffrent files

for inx, val in enumerate(files):
    df = pd.read_csv(val)
    df = df[out_columns]
    # create a city column
    df['city'] = val.split('P')[0]
    # map season
    df['season'] = df['season'].map({1:'Spring', 2:'Summer', 3:'Autumn', 4: 'Winter'})
    # append each file and merge all files into one
    df_all_cities = df_all_cities.append(df)

 

# replace the space in variable names with '_'

df_all_cities.columns = [c.replace(' ', '_') for c in df_all_cities.columns]

 

# Assignment: 

# print the length of data
print("The number of row in this dataset is ",len(Beijing_data.index))
# calculating the number of records in column "PM_Dongsi" print("There number of missing data records in PM_Dongsi is: ",len(Beijing_data.index) - len(Beijing_data['PM_Dongsi'].dropna())) print("There number of missing data records in PM_Dongsihuan is: ",len(Beijing_data.index) - len(Beijing_data['PM_Dongsihuan'].dropna())) print("There number of missing data records in PM_Nongzhanguan is: ",len(Beijing_data.index) - len(Beijing_data['PM_Nongzhanguan'].dropna())) print("There number of missing data records in DEWP is: ",len(Beijing_data.index) - len(Beijing_data['DEWP'].dropna())) print("There number of missing data records in HUMI is: ",len(Beijing_data.index) - len(Beijing_data['HUMI'].dropna())) print("There number of missing data records in PRES is: ",len(Beijing_data.index) - len(Beijing_data['PRES'].dropna())) print("There number of missing data records in TEMP is: ",len(Beijing_data.index) - len(Beijing_data['TEMP'].dropna())) print("There number of missing data records in cbwd is: ",len(Beijing_data.index) - len(Beijing_data['cbwd'].dropna())) print("There number of missing data records in Iws is: ",len(Beijing_data.index) - len(Beijing_data['Iws'].dropna())) print("There number of missing data records in precipitation is: ",len(Beijing_data.index) - len(Beijing_data['precipitation'].dropna())) print("There number of missing data records in Iprec is: ",len(Beijing_data.index) - len(Beijing_data['Iprec'].dropna()))

 

posted @ 2018-10-29 10:56  Jinze_Yu  阅读(366)  评论(0编辑  收藏  举报