莫烦Python教程之Pandas教程

'''   Day 2   '''


'''1.错误处理'''
# try:
#     file = open('eeee.txt','r+')                                #以读写模式('r+')打开文件'eeee.txt';后面要write,所以不能只用只读的'r';文件不存在时会抛出异常
# except Exception as e:                                          #将该错误用变量e接收
#     print('There is no file named as eeee')                     #输出错误warning
#     response = input('Do you want to creat a new file? :')    #向用户发送请求
#     if response == 'y':                                         #请求yes
#         file = open('eeee.txt','w')                                 #重新以写的形式打开文件
#     else:
#         pass
# else:
#     file.write('ssss')                                          #如果try的命令可以执行则写入ssss到文件eeee中
# file.close()                                                    #关闭文件


'''zip,lambda,map的功能'''
# map:把一个函数依次作用到一个或多个可迭代对象的每个元素上,返回一个迭代器
# lambda:匿名函数,用一行表达式定义的简单函数,常与map搭配使用
# zip:拉链,把多个可迭代对象按位置配对成元组,返回一个迭代器

# a = [1,2,3]
# b = [4,5,6]
# #print(zip(a,b))                                                    #可见zip返回的是一个zip对象(迭代器),直接print看不到内容
# #print(list(zip(a,b)))                                              #可视化
# for i,j,c in zip(a,b,a):                                                #迭代器功能
#     print(i/2,j*2,c*2)

# def fun1(a,b):
#     print(a+b)
# fun1(2,5)
'''上面用def定义的fun1是在函数体内直接print结果(返回值是None);lambda版本返回x+y的值,所以需要用print函数来输出得到的值'''
# fun1 = lambda x,y:x+y
# print(fun1(2,3))

# def fun1(x,y):
#     print(x+y)
# print(list(map(fun1,[1,2],[2,1])))          #None是print的一个返回值


'''copy  shallowcopy   deepcopy'''

import copy
# a = [1,2,3]
# b = a
# print(id(a)==id(b))
# a[1] = 22
# print(b)
# c = copy.copy(a)
# print(id(a)==id(c))
# print(c)
'''就是只是把a的value赋给c,地址是重新分配的'''
'''copy.copy是浅拷贝:只新建最外层列表,列表中嵌套的列表仍与原对象共用同一个地址;copy.deepcopy是深拷贝:递归地把所有层完完全全复制到另一个地址'''
'''b=a时是共用一个空间,copy是打包带走第一层数据(第二层数据还是共用),deepcopy是把所有的数据打包带走'''
# import copy
# a = [1,2,[1,2]]
# b = copy.deepcopy(a)
# print(id(a)==id(b))
# print(id(a)==id(b))
# print(id(a[1])==id(b[1]))

'''多线程(multi-threading)---多个核去运算一大堆数据>>一个核去运算一大堆数据'''
'''多核运算介绍'''
'''tkinter,Python自带的GUI库---图像窗口,例如开发一个计算器,给用户一个可以直接操作的窗口'''

'''pickle存放数据'''
'''保存数据第二天继续处理'''

# import pickle
# a_dict = {'da':111,2:[1,2,3],'23':{1:2,'2':3}}
#
# file = open('pickle_example.pickle','wb')       #以二进制的形式写一个文件
# pickle.dump(a_dict,file)                        #调用pickle里面的dump将a_dict里面的内容倒入file中---写内容
# file.close()
# '''1'''
# print('File has been built.')
# file  = open('pickle_example.pickle','rb')      #以读二进制文件的形式读取一个file
# a_dict = pickle.load(file)                      #调用pickle包里面的load函数,将文件pickle_example里面的内容载入变量a_dict---
# file.close()                                    #关闭文件
# print('The content of this file is :',a_dict)   #输出
# '''2'''
# '''为了防止少了file.close(),可以采用以下写法'''
# # with open('pickle_example.pickle','rb') as file:
# #     a_dict = pickle.load(file)
# # print(a_dict)

'''set功能:找不同/不重合'''
'''集合的元素具有唯一性'''
# a = [1,1,2,3,4,5,5,6,7]
# print(a)
# print(type(a))
# print(set(a))
# print(type(set(a)))

# sentence = 'Welcome back to this tutorial.'
# print(set(sentence))
# char_list = ['a', 'a', 'b', 'b', 'b', 'c', 'c', 'd']
# print(set(char_list))
# unique_char = set(char_list)
# unique_char.add('e')
# print(unique_char)
# # unique_char.clear()
# unique_char.remove('x')
# '''discard 和 remove的区别是,remove一个不存在的元素就会报错(KeyError),但是discard不会报错,集合保持不变(返回值是None)'''
# unique_char.discard('x')
# print(unique_char)
# unique_char = set(char_list)
# unique_char.add('x')
#
# set1 = unique_char
# set2 = {'a', 'b', 'c','e','i'}
# print(set1.difference(set2))        #set1有什么set2没有的
# print(set1.intersection(set2))      #set1和set2的交集

'''正则表达式RegEx'''
'''
正则表达式 (Regular Expression) 又称 RegEx, 是用来匹配字符的一种工具. 在一大串字符中寻找你需要的内容. 
它常被用在很多方面, 比如网页爬虫, 文稿整理, 数据筛选等等. 最简单的一个例子, 比如我需要爬取网页中每一页的标题. 
而网页中的标题常常是这种形式.
而且每个网页的标题各不相同, 我就能使用正则表达式, 用一种简单的匹配方法, 一次性选取出成千上万网页的标题信息. 
正则表达式绝对不是一天就能学会和记住的, 因为表达式里面的内容非常多, 
强烈建议, 现在这个阶段, 你只需要了解正则里都有些什么, 不用记住, 等到你真正需要用到它的时候, 再反过头来, 好好琢磨琢磨, 
那个时候才是你需要训练自己记住这些表达式的时候.
'''

# import re
# pattern1 = "dog"
# pattern2 = "bird"
# string = "dog runs to cat"
# # print(re.search(pattern1, string))
# # print(re.search(pattern2, string))
# # #multiple patterns ("run"  or  "ran")
# ptn = r"r[au]n"                                     #前缀r表示原始字符串(raw string),反斜杠不会被当作转义字符;[au]表示这一位匹配a或u
# print(re.search(ptn,"dog rans to cat"))
'''匹配多种可能'''
'''r"r[A-Z]n"---r"r[a-z]n"---r"r[0-9]n"---  '''

'''numpy的基本属性'''

# import numpy as np
#
# array = np.array([[1,2,3],                                  #定义一个数组
#                   [4,5,6]])
# print(array)
# print('number of dim: ',array.ndim)                         #数组的维度
# print('shape: ',array.shape)                                #数组的形状
# print('size:',array.size)                                   #数组的尺寸

'''numpy创建array'''

import numpy as np

# a = np.array([2, 23, 4], dtype=np.float32)                      #位数越小所占空间越小;位数越大所占空间越大,越精确
# print(a, a.dtype)
# b = np.zeros((3, 4))
# print(b)
# c = np.ones((3, 4), dtype=np.int)                               #注意:np.int 已在 NumPy 1.20 弃用、1.24 移除,新版本请写 dtype=int
# print(c)

# a = np.arange(12).reshape((3,4))                                  #reshape重新生成形状
# print(a)
# b = np.linspace(1, 10, 6).reshape((2,3))                          #生成线段linspace
# print(b)

'''numpy基础运算'''
# import numpy as np
# a = np.array([10, 20, 30, 40]).reshape((2, 2))
# b = np.arange(4).reshape((2, 2))
#
# c = b**2                                                            #平方
# d = 10*np.sin(a)
# print(c, '\n', d)
# e = a*b
# e_dot = np.dot(a, b)
# print(e, '\n', e_dot)                                               # '*' 表示逐元素相乘,'np.dot(a,b)' 表示矩阵相乘

# a = np.random.random((2,4))
# print(a)
# print(np.max(a,axis=0))                                               # 'axis = 0', 表示在列中寻找
# print(np.min(a,axis=1))                                               # 'axis = 1', 表示在行中寻找
# print(np.sum(a,axis=1))

''' numpy 基础运算2'''
# import numpy as np
#
# A = np.arange(2, 14).reshape((3, 4))                                    #生成2-13,放入3*4的数组中
# print(A)
# print(np.argmax(A))                                                     #'arg'索引标志,索引到A的最大值
# print(np.argmin(A))
# print(np.mean(A))
# print(A.mean())
# print(np.average(A))                                                    # ‘average’ 和 ‘mean’ 都是平均数
# print(np.median(A))                                                     # ‘median’ 中位数
# print(np.cumsum(A))                                                     # 累加(前缀和):第i项是前i个元素之和,与斐波那契数列无关
# print(np.diff(A))                                                       # 累差
# print(np.nonzero(A))
'''
print(np.nonzero(A))输出结果为:(array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], dtype=int64), array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3], dtype=int64))
意思是,(0,0)为非零,(0,1)为非零,(0,2)为非零,(0,3)为非零,(1,0)为非零,(1,1)为非零,(1,2)为非零,(1,3)为非零......
'''
# B = np.arange(14, 2, -1).reshape((3, 4))
# print(np.sort(B))                                                       # 逐行排序
# print(np.transpose(B))                                                  # 矩阵的转置
# print(B.T)                                                              # 矩阵的转置
# print((B.T).dot(B))                                                     # B的转置乘以B
# print(np.clip(B, 4, 10))                                                # 类似滤波功能,小于4的数都是4,大于10的数都是10,保留中间的数
# print(np.mean(B, axis=1))                                               # 1---行的,0---列的

''' numpy的索引'''
# import numpy as np
#
# A = np.arange(2, 14).reshape((3,4))
# print(A, '\n', A[-2][-2])
# print(A[1][2])
# print(A[1, 2])
# print(A[:, 1])                                                             # 第1列的数
# print(A[1, 1:3])
# print(A)
# for row in A:
#     print(row)                                                             # 迭代输出每一行
# for column in np.transpose(A):                                             # 迭代输出每一列,numpy中没有这样的功能可以先转置A再逐一输出每一行
#     print(column)
#
# print(A)
# print(A.flatten())                                                         # 将矩阵平铺,转化成一个维度返回
# for item in A.flat:                                                        # flat返回的是一个迭代器,可以由for循环遍历
#     print(item)

'''numpy 的 array合并'''
# A = np.array([1,2,3])[:, np.newaxis]                                          # 在列加了一个新的维度
# B = np.array([4,5,6])[:, np.newaxis]                                          # 在列加了一个新的维度
# C = np.vstack((A,B))                                                         # 'vstack' == 'vertical stack'上下合并
# D = np.hstack((A,B))                                                         # 'hstack' == 'horizontal stack'左右合并
# print(C)
# print(D)
# print(A.shape, B.shape, C.shape, D.shape)
# print(A[np.newaxis,:])                                                       # 在行加了一个新的维度
# print(A[np.newaxis,:].shape)
# print(A[:,np.newaxis])                                                       # 在列加了一个新的维度
# print(A[:,np.newaxis].shape)
# print(np.hstack((A, A, B)))
# D = np.concatenate((A, B, B), axis=1 )
# print(D)

'''numpy 的 array分割'''
# import numpy as np
# A = np.arange(12).reshape((3,4))
# print(A)
# print(np.split(A, 2, axis=1))                                       # 沿axis=1(列方向)把A等分成2块,每块3行2列;axis=0则是沿行方向分割
# print(np.split(A, 3, axis=1))                                      #ValueError: array split does not result in an equal division,不能进行不等的分割,4列只能按照1,2,4分
'''要想实现不等的分割,可以用array_split'''
# print(np.array_split(A, 3, axis=1))                                 # 它会把第1,2列分到第一块,3,4列各一列,实现不等分割
# print(A)
# print(np.vsplit(A, 3))                                              # 横向分割
# print(np.hsplit(A, 2))                                              # 纵向分割

'''numpy 的 copy & deepcopy'''
# import numpy as np
#
# a = [1,2,3]
# b = a
# c = a
# d = b
# print(a is b)
# print(a is d)
# e = np.copy(a)                                                       # np.copy返回一个独立的新数组(这里a是列表,e是ndarray),没有和a关联在一起;注意对object数组它只是浅拷贝
# print(e)
# print(a is e)

'''pandas基本介绍'''
'''
如果把numpy比作列表的话,pandas更像是字典形式的numpy
'''
'''
定义DataFrame的两种方式:
1.直接导入字典
2.导入自动生成的
'''
# import numpy as np
# import pandas as pd
#
# s = pd.Series([1, 2, 3, 33, np.nan, 44, 1])                            #序列他比列表多了一个index
# print(s)
# datas = pd.date_range('20190101', periods=7)
# print(datas)
# df = pd.DataFrame(np.random.randn(7, 4), index=datas, columns=['a', 'b', 'c', 'd'])         # index默认行索引,columns默认列索引
# print(df)
'''
           |       a         b         c         d
--------------------------------------------------       
datas      |
           |       a         b         c         d
2019-01-01  1.390443  0.943523 -0.250015 -0.937293
2019-01-02  0.478537  0.891622 -0.009471  0.772718
2019-01-03 -0.620127 -1.297744  0.352404 -1.449165
2019-01-04 -0.002273  1.373092 -0.290291 -0.153854
2019-01-05 -1.335555  0.107416 -0.166542  0.913456
2019-01-06 -0.648300 -2.056751 -1.879867  0.850983
2019-01-07  0.295561  0.214722  0.222594  1.474603
'''

'''如果没有给行列的索引,就会默认0,1,2.....作为行列索引'''
# df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
# print(df1)
#
# df2 = pd.DataFrame({'A': 1.,
#                     'B': pd.date_range('20190101', periods=4),                   # pd.Timestamp('20130102')
#                     'C': pd.Series(1,index=list(range(4)),dtype='float32'),
#                     'D': np.array([3]*4,dtype='int32'),
#                     'E': pd.Categorical(["test", "train", "test", "train"]),
#                     'F': 'foo'})
# print(df2)
# '''
#      A          B    C  D      E    F
# 0  1.0 2019-01-01  1.0  3   test  foo
# 1  1.0 2019-01-02  1.0  3  train  foo
# 2  1.0 2019-01-03  1.0  3   test  foo
# 3  1.0 2019-01-04  1.0  3  train  foo
# '''
# print(df2.dtypes)
# print(df2.columns)
# print(df2.values)
# print(df2.describe())                   # 数据帧描述
# print(df1.T)
# print(df2.sort_index(axis=1, ascending=False))          # ascending = False 表示倒序输出,axis = 1表示按照列名排序
# print(df2.sort_index(axis=0, ascending=False))          # ascending = False 表示倒序输出,axis = 0表示按照行索引排序
# print(df2.sort_values(by = 'E'))                        # 'sort_values(by = '排序规则')'按照E中的值进行排序

'''pandas 选择数据'''
# import numpy as np
# import pandas as pd
#
# dates = pd.date_range('20190101', periods=6)
# df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# print(df)
# print(df['A'], '\n', df.A)                                # 效果是一样的
# print(df[0:3], '\n', df['20190101':'20190103'])           # 效果是一样的,普通的选取规则
#
# '''select by label:loc---通过标签选择数据'''
# print(df.loc['20190102'])
# print(df.loc[:, ['A', 'B']])
#
# '''select by position:iloc---通过位置选择数据'''
# print(df.iloc[3:5, 1:3])                                  # 选取行和列的范围
# '''
#              B   C
# 2019-01-04  13  14
# 2019-01-05  17  18
# '''
#
# '''mixed selection: ix---混合选择数据(注意:.ix 已在 pandas 1.0 中移除,新版本请改用 .loc 或 .iloc)'''
# print(df.ix[:3, ['A', 'C']])
#
# '''Boolean indexing'''
# print(df)
# print(df[df.A > 8])                             #根据给定条件筛选,A>8的才输出

'''Pandas 处理丢失数据'''

# import numpy as np
# import pandas as pd

# dates = pd.date_range('20190101', periods=6)                                                # 首先构建一个数据的纵索引
# df = pd.DataFrame(np.arange(24).reshape((6,4)), index=dates, columns=['A', 'B', 'C', 'D'])  # 建立一个6*4的DataFrame,横索引为columns
# print(df)                                                                                   # 构建初始数据表
# df.iloc[0,1] = np.nan
# df.iloc[1,3] = np.nan
# print(df)                                                                                   # 填入缺失值后的数据表
'''处理数据的方法'''
# print(df.dropna(axis=0, how='any'))                                                         # how=['any','all'],any--某一行或某一列中任何一行或一列存在Nan时都可以去掉,all--只有一行或一列都为nan时才处理
# print(df.dropna(axis=1, how='any'))
# print(df.dropna(axis=0, how='all'))                                                         # 没有整行都是NaN的行,所以不会删除任何行,原样输出df
'''填充缺失数据'''
# print(df.fillna(value=0))
'''检查数据是否有缺失值'''
# print(df.isnull())
'''如果数据太大看不到是否有没有缺失值,可以这么检查'''
# print('\n')
# print(np.any(df.isnull()) == True)                                                          # 如果说至少有一个True他就会返回True

'''pandas 导入导出数据'''

# import pandas as pd
# import numpy as np
#
# data = pd.read_excel('student.xls')                                 # 读取表格‘read_...’
# date = pd.read_pickle('pickle_example.pickle')                      # 读取pickle文件
# print(date)
# data.to_pickle('student.pickle')                                    # 导出为pickle文件,保存文件:‘to_...’

'''pandas 如何合并多个DataFrame,concat'''

'''方式一'''
# import pandas as pd
# import numpy as np

# df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
# df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
# df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
# print(df1)
# print(df2)
# print(df3)
# res = pd.concat([df1, df2, df3], axis=0)
# print(res)                                                           # 这个打印出来的索引是之前创建的索引合并
# res1 = pd.concat([df1, df2, df3], axis=0, ignore_index=True)         # 忽略索引,重新排序
# print(res1)

'''方式二'''
# join['inner','outer'], inner 表示交集,outer 表示并集
# import numpy as np
# import pandas as pd
#
# df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
# df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# print(df1)
# print(df2)
# df3 = pd.concat([df1, df2], join='outer')                                                         # 默认就是‘outer’的模式,求并集
# print(df3)
# df4 = pd.concat([df1, df2], join='inner')                                                         # 相当于求交集
# print(df4)
# df5 = pd.concat([df1, df2], join='outer', ignore_index=True)
# print(df5)

'''方式三  join_axes(注意:join_axes 参数已在 pandas 1.0 中移除,新版本请写 pd.concat([df1, df2], axis=1).reindex(df1.index))'''
# import numpy as np
# import pandas as pd
#
# df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=['1', '2', '3'])
# df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=['2', '3', '4'])
# res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])          # 将df1与df2按列合并,按照df1的进行,因为df2没有第一行,所以用Nan填充,且df1没有第4行所以删除
# res1 = pd.concat([df1, df2], axis=1)
# print(df1)
# print(df2)
# print(res)
# print(res1)

'''方式四  append,在后面加数据(注意:DataFrame.append 已在 pandas 2.0 中移除,新版本请改用 pd.concat)'''
# import numpy as np
# import pandas as pd
#
# df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
# df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
# df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'], index=['2', '3', '4'])
# res = df1.append(df2, ignore_index=True)
# res1 = df1.append([df2, df3])
# print(res)
# print(res1)
# s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# res2 = df1.append(s1, ignore_index=True)                                # 加一行直接,数据是[1,2,3,4]
# print(res2)


'''pandas 合并,merge'''

'''merge two df by key/keys.(may be used in database)'''
# import pandas as pd
# import numpy as np
#
# left = pd.DataFrame({'key': ['K1', 'K2', 'K3', 'K4'],
#                      'A': ['A1', 'A2', 'A3', 'A4'],
#                      'B': ['B1', 'B2', 'B3', 'B4']})
# right = pd.DataFrame({'key': ['K1', 'K2', 'K3', 'K4'],
#                      'C': ['C1', 'C2', 'C3', 'C4'],
#                      'D': ['D1', 'D2', 'D3', 'D4']})
# print(left)
# print(right)
# res = pd.merge(left, right, on='key')
# print(res)

'''consider two keys'''
# import pandas as pd
# import numpy as np
#
# left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
#                      'key2': ['K1', 'K0', 'K1', 'K0'],
#                      'A': ['A1', 'A2', 'A3', 'A4'],
#                      'B': ['B1', 'B2', 'B3', 'B4']})
# right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
#                       'key2': ['K0', 'K0', 'K0', 'K0'],
#                      'C': ['C1', 'C2', 'C3', 'C4'],
#                      'D': ['D1', 'D2', 'D3', 'D4']})
# print(left)
# print(right)
'''
how = ['inner','outer','right','left']
'''
# res = pd.merge(left, right, on=['key1', 'key2'], how='inner')                # 默认 'inner'
# res1 = pd.merge(left, right, on=['key1', 'key2'], how='outer')               # 没有的就补Nan
# res2 = pd.merge(left,right,on=['key1', 'key2'], how='right')                 # 以key1,key2右边的为标准
# res3 = pd.merge(left,right,on=['key1', 'key2'], how='left')                  # 以key1,key2左边的为标准
# print(res)
# print(res1)
# print(left)
# print(right)
# print(res2)
# print(res3)
'''indicator=True时,可以显示怎么合并的'''

'''left_index,right_index以左右两边的索引为标准'''

# import numpy as np
# import pandas as pd
#
# boys = pd.DataFrame({'k': ['k0', 'k1', 'k2'],
#                      'age': [1, 2, 3]})
# girls = pd.DataFrame({'k': ['k0', 'k0', 'k3'],
#                       'age': [4, 5, 6]})
# print(boys)
# print(girls)
# res = pd.merge(boys,girls,on='k',suffixes=['_boy', '_girl'], how='inner')
# print(res)
'''
    k  age
0  k0    1
1  k1    2
2  k2    3
    k  age
0  k0    4
1  k0    5
2  k3    6
    k  age_boy  age_girl
0  k0        1         4
1  k0        1         5
'''

'''pandas plot画图'''

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plot data

# Series
# data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# data = data.cumsum()                                                            # 累加
# data.plot()
# plt.show()                                                                      # 显示

'''DataFrame'''
# data = pd.DataFrame(np.random.randn(1000, 4),
#                     index=np.arange(1000),
#                     columns=list("ABCD"))
# # print(data)                                                                       # 生成了个1000行,4列的矩阵
# data = data.cumsum()
# data.plot()
# plt.show()

'''plot methods'''
'''---'bar','hist','kde','area','scatter(只有xy两个属性,描述点)','hexbin','pie'---'''

# Build a 1000-row, 4-column frame of standard-normal draws labelled A-D and
# accumulate down each column so every column becomes a running sum.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
).cumsum()

# The first scatter call hands back the plot object it drew on; passing that
# object to the second call through the ax= keyword overlays both series on the
# same figure. Without ax=ax the two calls would produce two separate figures.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class 2', ax=ax)
plt.show()

posted @ 2019-12-11 10:01  旅人_Eric  阅读(199)  评论(0编辑  收藏  举报