pandas 编码知识
一,pandas功能
1,基于numpy , 分析结构化数据。
二,常用基础知识编码练习
包括数据类型和数据操作,比如索引、切片、分组聚合、排序、过滤等数据分析常见操作的代码。
# coding=utf-8
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib as mp
'''
data analysis of structured data, based on numpy
data files: csv, json, sql, MS Excel
manipulation: map, reduce, group, aggregation, sorting, query, filter,
join, selection, ETL, feature engineering
comparable to spark, flink, Hsql
data structures:
Series: one column, any type
DataFrame: table, multiple columns
'''
# NOTE: the triple-quoted string below holds earlier exercises that are
# disabled; it is a bare string expression, so nothing inside it runs.
# Several lines in it use Python 2 print statements (e.g. `print a`).
'''
print pd.__version__
mydataset = {
'sites' : ["Google", "Runoob", "Wiki"],
'number': [1, 2, 3]
}
list = [1, 2, 3]
myvar = pd.DataFrame(mydataset)
series = pd.Series(list)
print series[1]
series1 = pd.Series(list, index = ["x", "y", "z"])
print series1
data = [['Google', 10],['Runoob', 12],['Wiki', 13]] # nested list
df = pd.DataFrame(data, columns = ['Site', 'Age'], dtype= float)
print (df)
data_1 = {'Site':['Google', 'Runoob', 'Wiki'],'Age':[10,12,13]}
df_1 = pd.DataFrame(data_1)
# pd.DataFrame(data_1, index=["a","b"])
print (df_1.loc[1]) #row
# df_1.loc[0,1]
# read file
df = pd.read_csv("D:\\vbox-share\\better-training-for-safer-foods-april-2014-to-march-2015.csv").head(10)
print (df)
print (df.info())
print np.zeros((2,3))
print np.random.rand(10)
a = np.arange(6).reshape(3, 2)
print a
'''
# ---------------------------------------------------------------------------
# pandas practice script: Series/DataFrame indexing, slicing, binary
# operations, alignment, function application, sorting and missing values.
# Every section prints its result so the script can be read top to bottom.
# ---------------------------------------------------------------------------

# --- Series with explicit index labels ---
data = np.array(['a', 'b', 'c', 'd', 'e'])
ser = pd.Series(data, index=[3, 2, 6, 8, 9])  # custom integer index labels
print(ser[9])  # the index holds integers, so 9 is looked up as a label

# --- Slicing a column read from a CSV file ---
# The path is machine-specific. When the file is absent this demo is skipped
# (instead of aborting) so the remaining exercises still run.
nba_csv_path = "C:\\Users\\86187\\Downloads\\nba.csv"
try:
    df = pd.read_csv(nba_csv_path)  # read file
except FileNotFoundError:
    print("file not found, skipping the slicing demo: %s" % nba_csv_path)
else:
    ser = pd.Series(df['Name'])
    data = ser.head(10)  # slicing: keep the first ten rows
    # print(data[3:6])   # slice with [] for comparison
    print(data.loc[3:6])  # label slice: the end label 6 is included

# --- Binary operations such as add / sub ---
sub_d1 = pd.Series([5, 2, 4, 6], index=['a', 'b', 'c', 'd'])
sub_d2 = pd.Series([1, 2, 6, 7], index=['a', 'b', 'd', 'e'])
# fill_value=0 stands in for a label that exists on only one side
print(sub_d1.sub(sub_d2, fill_value=0))
# related methods: add sub mul div sum prod

# --- DataFrame indexing: extracting data (selection and projection) ---
df_obj = pd.DataFrame(np.random.rand(5, 4), columns=['a', 'b', 'c', 'd'])
print(df_obj.head())
print(df_obj['a'])  # several columns need a list: df_obj[['a', 'c']]
# .loc slices by label and includes the end label, so this yields rows 0..2;
# the positional equivalent is df_obj.iloc[0:3, 0]
print(df_obj.loc[0:2, 'a'])

# --- Alignment ---
# Two DataFrames are combined element by element after aligning on row and
# column labels; cells present in only one operand become NaN.
align_df1 = pd.DataFrame(np.ones((2, 2)), columns=['a', 'b'])
align_df2 = pd.DataFrame(np.ones((3, 3)), columns=['a', 'b', 'c'])
print(align_df1 + align_df2)
# print(align_df1.add(align_df2, fill_value=-1))  # fill value for unaligned cells

# --- Function application on a single DataFrame (element / row / column) ---
func_df = pd.DataFrame(np.random.rand(5, 4) - 1)
print(np.abs(func_df))  # numpy functions work on a DataFrame directly
print(func_df.apply(lambda x: x.max()))  # default axis=0: one call per column


def f2(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x


# Element-wise application. DataFrame.applymap is deprecated since pandas 2.1
# (renamed to DataFrame.map); mapping each column works on old and new versions.
print(func_df.apply(lambda col: col.map(f2)))

# --- Sorting ---
sort_s1 = pd.Series(range(10, 15), index=np.random.randint(5, size=5))
print(sort_s1)
print(sort_s1.sort_index())  # ascending by default, ordered by index labels
# df.sort_index(axis=1, ascending=False): for a DataFrame pick the axis;
# axis=1 sorts the column labels.
print(func_df.sort_values(by=0, ascending=False))  # order rows by column 0

# --- Dealing with missing values ---
# Four rows of three values each; np.nan marks the missing cells.
missing_value_df = pd.DataFrame([
    np.random.rand(3),
    [1., 2., np.nan],
    [np.nan, 4., np.nan],
    [1., 2., 3.],
])
以上执行后结果:
以上为本人练习pandas功能,并做了笔记。
---一------步-----一 ------个-----脚--------印----------