Learning notes | Data Analysis: 1.2 data wrangling
| Data Wrangling |
# Sort all the data into one file
files = ['BeijingPM20100101_20151231.csv','ChengduPM20100101_20151231.csv','GuangzhouPM20100101_20151231.csv','ShanghaiPM20100101_20151231.csv','ShenyangPM20100101_20151231.csv'] out_columns = ['No', 'year', 'month', 'day', 'hour', 'season', 'PM_US Post']
# Create a void dataframe
df_all_cities = pd.DataFrame()
# Iterate to write diffrent files
for inx, val in enumerate(files): df = pd.read_csv(val) df = df[out_columns] # create a city column df['city'] = val.split('P')[0] # map season df['season'] = df['season'].map({1:'Spring', 2:'Summer', 3:'Autumn', 4: 'Winter'}) # append each file and merge all files into one df_all_cities = df_all_cities.append(df)
# replace the space in variable names with '_'
df_all_cities.columns = [c.replace(' ', '_') for c in df_all_cities.columns]
# Assignment:
# print the length of data print("The number of row in this dataset is ",len(Beijing_data.index))
# calculating the number of records in column "PM_Dongsi" print("There number of missing data records in PM_Dongsi is: ",len(Beijing_data.index) - len(Beijing_data['PM_Dongsi'].dropna())) print("There number of missing data records in PM_Dongsihuan is: ",len(Beijing_data.index) - len(Beijing_data['PM_Dongsihuan'].dropna())) print("There number of missing data records in PM_Nongzhanguan is: ",len(Beijing_data.index) - len(Beijing_data['PM_Nongzhanguan'].dropna())) print("There number of missing data records in DEWP is: ",len(Beijing_data.index) - len(Beijing_data['DEWP'].dropna())) print("There number of missing data records in HUMI is: ",len(Beijing_data.index) - len(Beijing_data['HUMI'].dropna())) print("There number of missing data records in PRES is: ",len(Beijing_data.index) - len(Beijing_data['PRES'].dropna())) print("There number of missing data records in TEMP is: ",len(Beijing_data.index) - len(Beijing_data['TEMP'].dropna())) print("There number of missing data records in cbwd is: ",len(Beijing_data.index) - len(Beijing_data['cbwd'].dropna())) print("There number of missing data records in Iws is: ",len(Beijing_data.index) - len(Beijing_data['Iws'].dropna())) print("There number of missing data records in precipitation is: ",len(Beijing_data.index) - len(Beijing_data['precipitation'].dropna())) print("There number of missing data records in Iprec is: ",len(Beijing_data.index) - len(Beijing_data['Iprec'].dropna()))