Pandas基本使用(一)

读取数据源

安装库

pip install pandas

# 读取.xls
pip install xlrd

# 读取.xlsx
pip install openpyxl

读取excel

import pandas as pd
path = r"C:\work\test.xlsx"
data = pd.read_excel(path)
print(data)

读取.csv文件

import pandas as pd
data = pd.read_csv(r"C:\\work\\test.csv")
print(data)
print(type(data))

注意excel文件另存为csv后缀的文件后，需要再用记事本打开，查看编码格式是否为utf8，否则会报错

解决方案

在使用记事本另存为utf8的格式

Traceback (most recent call last):
  File "C:\work\PythonProject\demo\day15\pandas学习01.py", line 27, in <module>
    data = pd.read_csv(r"C:\\work\\test.csv")
  File "C:\Users\ychen\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py", line 948, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "C:\Users\ychen\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py", line 611, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "C:\Users\ychen\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py", line 1448, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "C:\Users\ychen\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py", line 1723, in _make_engine
    return mapping[engine](f, **self.options)
  File "C:\Users\ychen\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 93, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "parsers.pyx", line 579, in pandas._libs.parsers.TextReader.__cinit__
  File "parsers.pyx", line 668, in pandas._libs.parsers.TextReader._get_header
  File "parsers.pyx", line 879, in pandas._libs.parsers.TextReader._tokenize_rows
  File "parsers.pyx", line 890, in pandas._libs.parsers.TextReader._check_tokenize_status
  File "parsers.pyx", line 2050, in pandas._libs.parsers.raise_parser_error
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb2 in position 0: invalid start byte

读取csv文件并指定参数

'''
header=None 没有标题行
names=['transaction_id','member_id']  创建标题
index_col 指定索引列
skiprows
    skiprows=[2,4] 跳过2行和4行，从0开始数
    skiprows=3  跳过前面3行
encoding="utf-8"  指定编码格式
nrows=3 只取前面3行
'''

import pandas as pd
path = r"C:\\work\\sales.csv"
data = pd.read_csv(path,
                   header=None,
                   names=['transaction_id',"member_id",'product_id','city'],
                   index_col="transaction_id"
                   ,skiprows=[2,4]
                   ,encoding="utf-8"
                   ,nrows=3)
print(data)
# 另存为
data.to_excel("a.xlsx")

读取mysql

'''
# 安装库
pip install pymysql
# host=主机名,user=用户名,password=密码,database=数据库名
# id就是表的主键列
# index_col="id"
'''

import pymysql as psql
conn = psql.connect(host="localhost",user="root",password="local123",database="mydb")
sql = "select * from shop"
res = pd.read_sql(sql,conn,index_col="id") 
print(res)

pandas数据结构

DataFrame

# DataFrame 是一个表格型的数据结构
# Series 表示DataFrame的一行或者一列

import pandas as pd
dic={
    "序号":[1,2,3,4],
    "姓名":['张三','李四','王五',"赵六"]
}
# 不指定索引列
data = pd.DataFrame(dic)
print(data)
# 指定索引列
data = data.set_index("序号") 
print(data)
# 导出
data.to_excel("pa.xlsx")

输出打印

   序号  姓名
0   1  张三
1   2  李四
2   3  王五
3   4  赵六

    姓名
序号    
1   张三
2   李四
3   王五
4   赵六

Series

s1 = ["张三",30,'男']
s2 = ['姓名','年龄','性别']
se = pd.Series(s1,s2)
print(se)

# 字典作为参数传入
dic = {
    "姓名":"张三",
    "性别":'男',
    "年龄":20
}
se = pd.Series(dic)
print(se)

输出打印

姓名    张三
年龄    30
性别     男
dtype: object

姓名    张三
性别     男
年龄    20
dtype: object

传入列表，切片

d = [
    [1,11,12,13,14],
    [2,16,17,18,19],
    [3,21,22,23,24],
    [4,21,22,23,24],
    [5,21,22,23,24],
]
data = pd.DataFrame(d,columns=['a','b','d','c','e'])  # 指定标题行
data = data.set_index('a')  # 指定索引列
print(data)

"""
    b   d   c   e
a                
1  11  12  13  14
2  16  17  18  19
3  21  22  23  24
4  21  22  23  24
5  21  22  23  24
"""

# 指定行列的值
b = data.loc[1,'b']
print(b)    # 11

# 显示b列
b = data['b']
print(b)

"""
a
1    11
2    16
3    21
4    21
5    21
Name: b, dtype: int64
"""

# 显示b,c两列
b = data[['b','c']]
print(b)

"""
    b   c
a        
1  11  13
2  16  18
3  21  23
4  21  23
5  21  23
"""

# 先取行后取列
b = data.loc[[2,4],'b':'e']
print(b)

"""
    b   d   c   e
a                
2  16  17  18  19
4  21  22  23  24
"""

传入字典，切片

dic ={
    "序号":[1,2,3,4],
    "姓名":['张三','李四','王五','赵六'],
    "性别":['男','男','女','男'],
    "年龄":[28,30,18,29],
    "特长":['吃饭','睡觉','打豆豆','跑步']
}
data = pd.DataFrame(dic)
data = data.set_index("序号")
# 打印所有
print(data)

"""
    姓名 性别  年龄   特长
序号                
1   张三  男  28   吃饭
2   李四  男  30   睡觉
3   王五  女  18  打豆豆
4   赵六  男  29   跑步
"""

# 打印标题行
print(data.columns)

"""
Index(['姓名', '性别', '年龄', '特长'], dtype='object')
"""

# 查看指定列
print(data['姓名'])  

"""
序号
1    张三
2    李四
3    王五
4    赵六
Name: 姓名, dtype: object
"""

# 查看2列
print(data[['姓名','性别']]) 

"""
    姓名 性别
序号       
1   张三  男
2   李四  男
3   王五  女
4   赵六  男
"""

# 查看1-3行的记录
print(data.loc[1:3])  

"""
    姓名 性别  年龄   特长
序号                
1   张三  男  28   吃饭
2   李四  男  30   睡觉
3   王五  女  18  打豆豆
"""

多个Series组成DataFrame

ser1 = pd.Series(["张三",'李四','王五'],index=[1,2,5],name="姓名")
ser2 = pd.Series(["男",'女','男'],index=[1,5,3],name="性别")
ser3 = pd.Series([18,20,22],index=[1,2,3],name="年龄")
# 直接作为参数传入
data = pd.DataFrame([ser1,ser2,ser3])
print(data)

"""
       1     2    5     3
姓名    张三    李四   王五   NaN
性别     男   NaN    女     男
年龄  18.0  20.0  NaN  22.0
"""

# 先放入字典
dic={
    ser1.name:ser1,
    ser2.name:ser2,
    ser3.name:ser3
}
data = pd.DataFrame(dic)
print(data)

"""
    姓名   性别    年龄
1   张三    男  18.0
2   李四  NaN  20.0
3  NaN    男  22.0
5   王五    女   NaN
"""

列太多时，显示全部

pd.set_option('max_colwidth',200)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

# 显示全部字段
home.head()

posted @ 2023-11-10 12:47 DogLeftover 阅读(282) 评论(0) 编辑收藏举报

刷新页面返回顶部