# NumPy tutorial / pandas tutorial: an introduction to scientific computing with Python (Morvan 莫烦: video links + code notes)
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# NumPy: creating arrays
# ---------------------------------------------------------------------------
array = np.array([[1, 2, 3], [2, 3, 4]])  # store a matrix as an ndarray
print(array)
print('number of dim,几行', array.ndim)
print('shape,行数*列数', array.shape)
print('size,有多少元素', array.size)

# dtype selects the element type, e.g. dtype=np.float64
a = np.array([2, 23, 4], dtype=np.int64)
print(a)  # [ 2 23  4] -- printed without commas, unlike a list
print(a.dtype)  # prints: int64

# Define a matrix (2-D array)
a = np.array([[2, 23, 4], [2, 23, 4]])
print(a)

# Matrix filled with zeros: 3 rows, 4 columns
a = np.zeros((3, 4))
print('3行4列的全部为0的矩阵')
print(a)

# Matrix filled with ones: 1 row, 2 columns
a = np.ones((1, 2), dtype=np.int16)
print('1行2列的全部为1的矩阵')
print(a)

# np.empty allocates without initializing, so the contents are arbitrary
a = np.empty((3, 4))
print('3行4列的全部什么都没有(几乎接近于0的数字)的矩阵')
print(a)

a = np.arange(10, 20, 2)
print(a)  # [10 12 14 16 18]

a = np.arange(12).reshape((3, 4))
print(a)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

a = np.linspace(1, 10, 5)
print('生成1-10分成5段的序列')
print(a)  # [ 1.    3.25  5.5   7.75 10.  ]

# ---------------------------------------------------------------------------
# NumPy: element-wise arithmetic (+, -, *, /)
# ---------------------------------------------------------------------------
a = np.array([10, 20, 30, 40])
b = np.arange(4)
print(a, b)
c = a - b
print('a-b', c)
c = a + b
print('a+b', c)
c = a * b
print('a*b', c)
c = a**2
print('a*a', c)
c = 10 * np.sin(a)  # np.cos can be used the same way: 10 * np.cos(a)
print('10*sin(a)', c)
print(b < 3)  # [ True  True  True False]

# ---------------------------------------------------------------------------
# NumPy: matrix operations
# ---------------------------------------------------------------------------
a = np.array([[1, 1], [0, 1]])
b = np.arange(4).reshape((2, 2))
c = a * b  # element-wise product of matching positions
print(c)
c_dot = np.dot(a, b)  # matrix product
c_dot_2 = a.dot(b)  # same matrix product, method form
print('矩阵乘法', c_dot)
print('矩阵乘法方法2', c_dot_2)

a = np.random.random((2, 4))
print(a)  # 2 rows x 4 columns of values between 0 and 1
print('求和', np.sum(a))
print('最小值', np.min(a))
print('最大值', np.max(a))
# axis=1 reduces across each row, axis=0 reduces down each column
print('每一行求和', np.sum(a, axis=1))  # e.g. [1.31972875 1.51855042]
print('每一列求最小值', np.min(a, axis=0))  # e.g. [0.01769909 0.35831739 0.27856868 0.40177896]
print('对行求平均值', np.mean(a, axis=1))

# 6. NumPy basic operations, part 2
a = np.arange(2, 14).reshape((3, 4))
print(a)
print('最小值的索引', np.argmin(a))  # np.argmax(a) gives the index of the maximum
print('平均值')
print(np.mean(a))
print(a.mean())
print(np.average(a))
print('中位数')
print(np.median(a))
print('逐项累加', np.cumsum(a))  # running (cumulative) sum
print('每邻近的两项相减', np.diff(a))
print('输出非0的位置(行数和列数)', np.nonzero(a))

a = np.arange(14, 2, -1).reshape((3, 4))
print(a)
print('逐行排序', np.sort(a))
# Transpose of the matrix
print('矩阵转置', np.transpose(a))
print('a^T*a', (a.T).dot(a))
print('array中所有<5的变成5,>9的变成9,中间的不变', np.clip(a, 5, 9))

# 7. NumPy indexing
a = np.arange(3, 15).reshape((3, 4))
print(a)
print(a[2][1])
print(a[1, 1:3])
print(a[1, :])
print(a[:, 1])
for row in a:  # iterate over the rows of the matrix
    print(row)
for column in a.T:  # iterate over the columns (rows of the transpose)
    print(column)
print("输出矩阵a里面的元素:", a.flatten())  # [ 3  4  5  6  7  8  9 10 11 12 13 14]
for item in a.flat:
    print(item)

# 8. Merging NumPy arrays
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
print("上下合并:", np.vstack((a, b)))

# 11. pandas basics
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)
dates = pd.date_range('20160101', periods=6)
print('设置索引', dates)

# Defining a DataFrame
# Method 1: build it from a NumPy array (index = row labels, columns = column labels)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
# Example output (the values are random):
#                    a         b         c         d
# 2016-01-01 -0.852069  0.454103  0.720401 -1.379524
# 2016-01-02 -0.695040  0.045785  0.721502 -0.462416
# 2016-01-03 -0.501414  0.215428  1.421680 -2.380329
# 2016-01-04  0.750305  0.012037  0.774156 -0.889714
# 2016-01-05  0.120922  1.640206 -0.058068  1.104911
# 2016-01-06 -0.059252 -0.252355 -0.192977 -1.294317

# Method 2: build it from a dict, one column per key
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='float32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df2)
print('每个维度的类型:', df2.dtypes)
# The two labels below were swapped: .index holds the row labels and
# .columns holds the column labels.
print('输出所有行的标序:', df2.index)
print('输出所有列的标序:', df2.columns)
print('输出每一行的值:', df2.values)
print('描述(数值型的方差均值等):', df2.describe())
print('矩阵转置:', df2.T)
print('列项倒排(行项axis=0):', df2.sort_index(axis=1, ascending=False))
print('按值排序:', df2.sort_values(by='E'))
# 12. pandas: selecting data
datas = pd.date_range('20130101', periods=6)
df3 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=datas, columns=['A', 'B', 'C', 'D'])
print(df3)
print('输出一列,两种方法:', df3['A'], df3.A)
# Slicing with [] selects rows, either by position or by date label
print(df3[0:3], df3['20130102':'20130104'])
print('根据标签来选择(.loc):', df3.loc['20130102'])
print(df3.loc[:, ['A', 'B']])
print(df3.loc['20130101', ['A', 'B']])
print('筛选出3到5行,1到3列:', df3.iloc[3:5, 1:3])
# The old ``.ix`` indexer (positional rows mixed with labelled columns) was
# removed from pandas; translate the row positions into labels and use
# ``.loc`` to get the same selection.
print('将iloc和loc结合起来筛选,同时用数字和标签混合筛选(.ix):',
      df3.loc[df3.index[:3], ['A', 'C']])
print('df3[df3.A<8]', df3[df3.A < 8])

# 13. pandas: setting values
df3.iloc[2, 2] = 111
print(df3)
df3.loc['20130101', 'B'] = 222
print(df3)
df3[df3.A > 4] = 0  # zero out every row whose A value is greater than 4
print(df3)
df3['E'] = np.nan
print('添加一行新的空序列E:', df3)
# The Series is aligned to df3 through its matching date index
df3['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130101', periods=6))
print('添加一列F:', df3)

# 14. pandas: handling missing values
# Columns B and C hold integers; cast them to float first so that storing
# NaN does not depend on implicit dtype upcasting during assignment.
df3 = df3.astype({'B': 'float64', 'C': 'float64'})
df3.iloc[0, 1] = np.nan
df3.iloc[1, 2] = np.nan
print(df3)
print('只要列上有nan就丢掉整列:', df3.dropna(axis=1, how='any'))  # drop a column if it has any NaN
print('只有整列都是nan的时候才丢掉整列:', df3.dropna(axis=1, how='all'))
print(df3.fillna(value=0))
print('检测是否有nan', df3.isnull())
print('数据中至少有一个nan:', df3.isnull().values.any())

# 15. pandas: import / export
path = '../data/'
# Example usage (needs the data file, so it is left disabled):
# data = pd.read_csv(path+'101_wang_feat.csv')
# print('读取文件', data)
# data.to_pickle(path+'sivetest.pickle')
# 16. pandas: combining frames with concat
# Plain concatenation of frames that share the same columns
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print(df3)
res = pd.concat([df1, df2, df3], axis=0)
print(res)

# join modes: 'inner' or 'outer'
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1)
print(df2)
res = pd.concat([df1, df2])
print('合并df1,df2所有的项,没有的值补为NAN:', res)
res = pd.concat([df1, df2], join='outer')
print("join='outer'(默认形式同上)合并df1,df2所有的项,没有的值补为NAN:", res)
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(" join='inner'合并df1,df2都有的项, ignore_index=True,序项排序", res)

# ``join_axes`` was removed from pd.concat; reindexing the side-by-side
# result on df1's index keeps exactly the rows df1 has.
res = pd.concat([df1, df2], axis=1).reindex(df1.index)
print("join_axes=[df1.index],以df1的index为准", res)

# ``DataFrame.append`` was removed from pandas; pd.concat does the same job.
res = pd.concat([df1, df2, df3], ignore_index=True)
print("在df1后面追加df2:", res)
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
print(s1)
# Turn the Series into a one-row frame so it is appended as a row
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
# 17. pandas: combining frames with merge
# Merge on a single key column
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
res = pd.merge(left, right, on='key')
print('基于key合并left和right:', res)

# Merge on two key columns
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
# how can be one of: 'left', 'right', 'inner', 'outer'
res = pd.merge(left, right, on=['key1', 'key2'])
print('合并多列时输出满足多列值同时相同的部分(默认how=inner):')
print(res)
res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator=True)
print("合并多列时输出所以已有的多列值的组合,没有的补NAN(how=outer),indicator=True显示左右组合的情况。默认该列的名字是_merge,改名字: indicator='indicator_column':")
print(res)

# Merge on the index instead of on columns
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C1', 'C2'],
                      'D': ['D0', 'D1', 'D2']},
                     index=['K0', 'K2', 'K3'])
print("left")
print(left)
print("right")
print(right)
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print("merged by index")
print(res)

# Both frames have an 'age' column; suffixes tell the two apart in the result
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
print(girls)
res = pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='outer')
print(res)

# 18. pandas: plotting
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
print(data.head())
data = data.cumsum()
print(data.head())
data.plot()
plt.show()

# Available plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
# Passing ax=ax draws the second scatter on the same axes as the first
data.plot.scatter(x='A', y='C', color='green', label='Class 2', ax=ax)
plt.show()