python-pandas DataFrame,Series笔记1

Python pandas 模块，Series, DataFrame 学习笔记

官方文档网址：

https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#basics-dataframe

我的笔记分享网址：

https://note.youdao.com/s/LFip7Cc5

python-pandas DataFrame,Series笔记1

包含头文件

#!/usr/bin/env python

import numpy as np
import pandas as pd

Series

"""
Series

Series is a one-dimensional labeled array capable of holding any data type
(integers, strings, floating point numbers, Python objects, etc.).

The axis labels are collectively referred to as the index.
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""


# Build a Series from a NumPy ndarray; the index supplies one label per value.
s1 = pd.Series(
    data=np.random.randn(5),
    index=list("abcde"),
)
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)

pd s1:
 a   -0.261995
b    0.119171
c   -0.129191
d   -1.385260
e   -0.087495
dtype: float64
pd s1.index: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
pd s1.values: [-0.26199524  0.11917108 -0.12919125 -1.38525982 -0.08749467]

# Build a Series from a dict: the keys become the index labels.
d = dict(b=1, a=0, c=2)
s2 = pd.Series(data=d)
print("pd s2:\n", s2)

# With an explicit index the values are looked up by label in that order;
# a label that is missing from the dict ("d") shows up as NaN.
s3 = pd.Series(data=d, index=list("bcda"))
print("pd s3:\n", s3)

pd s2:
 b    1
a    0
c    2
dtype: int64
pd s3:
 b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

# from scalar value
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)

pd s4:
 a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64


# Series is ndarray-like，可以像数组一样访问 Series 里面的数据
print("pd s3.array:\n", s3.array[1])
# Series is dict-like. 可以像字典一样通过索引标签访问 Series 里面的数据
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)

pd s3.array:
 2.0
pd s3.['b']:
 1.0
test s3 key: True
test s3 key: False


# Using the Series.get() method, a missing label will return None or the specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))

Series.get() method: None
Series.get() method: nan
Series.get() method: henry

# Series also has a name attribute:
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))

pd s5:
 0   -0.476002
1    0.248520
2    1.094846
3    0.505171
4   -0.176442
Name: henry, dtype: float64
pd s6:
 0   -0.476002
1    0.248520
2    1.094846
3    0.505171
4   -0.176442
Name: henry2, dtype: float64
pd s6.head():
 0   -0.476002
1    0.248520
2    1.094846
3    0.505171
4   -0.176442
Name: henry2, dtype: float64
pd s6.head(2):
 0   -0.476002
1    0.248520
Name: henry2, dtype: float64

DataFrame

"""
DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
You can think of it like a spreadsheet or SQL table,
or a dict of Series objects.

It is generally the most commonly used pandas object.
Like Series, DataFrame accepts many different kinds of input:

Dict of 1D ndarrays, lists, dicts, or Series
2-D numpy.ndarray
Structured or record ndarray
A Series
Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments.
If you pass an index and / or columns,
you are guaranteeing the index and / or columns of the resulting DataFrame.
Thus, a dict of Series plus a specific index will discard all data
not matching up to the passed index.

If axis labels are not passed,
they will be constructed from the input data based on common sense rules.

"""


# Build a DataFrame from a dict of Series; the row index is the union of
# the individual Series indexes.
d2 = {
    "one": pd.Series(data=[1.0, 2.0, 3.0], index=list("abc")),
    "two": pd.Series(data=[10.0, 20.0, 30.0, 40.0], index=list("abcd")),
}
df1 = pd.DataFrame(data=d2)
print("DataFrame df1:", df1)

# An explicit index keeps only the listed row labels, in the given order.
df2 = pd.DataFrame(data=d2, index=list("dba"))
print("DataFrame df2:", df2)
# Rename column "two" to "symbol".
df2 = df2.rename(columns={"two": "symbol"})
print("DataFrame df2.rename:", df2)

# A requested column that is absent from the dict ("three") is filled with NaN.
df3 = pd.DataFrame(data=d2, index=list("dba"), columns=["two", "three"])
print("DataFrame df3:", df3)

DataFrame df1:    one   two
a  1.0  10.0
b  2.0  20.0
c  3.0  30.0
d  NaN  40.0
DataFrame df2:    one   two
d  NaN  40.0
b  2.0  20.0
a  1.0  10.0
DataFrame df2.rename:    one  symbol
d  NaN    40.0
b  2.0    20.0
a  1.0    10.0
DataFrame df3:     two three
d  40.0   NaN
b  20.0   NaN
a  10.0   NaN


# The row and column labels can be accessed respectively by accessing the index and columns attributes:
#  索引名称
print("df3.index:", df3.index)
# 列 名称
print("df3.columns:", df3.columns)

df3.index: Index(['d', 'b', 'a'], dtype='object')
df3.columns: Index(['two', 'three'], dtype='object')


# from dict of ndarrays / lists
d = {
    "one":[1.0, 2.0, 3.0, 4.0],
    "two":[4.0, 3.0, 2.0, 1.0]
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)

df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)

DataFrame df4:    one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0
DataFrame df5:    one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0


# Build a DataFrame from a structured (record) ndarray.
# Fields: "A" is a 4-byte int, "B" a 4-byte float, "C" a 10-byte bytes string.
# "S10" is used instead of the legacy "a10" spelling: the "a" type code is
# deprecated in NumPy 2.0, and both spellings describe the same dtype.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# The field names become the column names.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

# Same records, with custom row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

# `columns` selects and reorders the fields.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)

DataFrame data1: [(0, 0., b'') (0, 0., b'')]
DataFrame data2: [(1, 2., b'Hello') (2, 3., b'World')]
DataFrame df6:    A    B         C
0  1  2.0  b'Hello'
1  2  3.0  b'World'
DataFrame df7:         A    B         C
first   1  2.0  b'Hello'
second  2  3.0  b'World'
DataFrame df8:           C  A    B
0  b'Hello'  1  2.0
1  b'World'  2  3.0


# from a list of dicts
data2 = [
    {"a":1, "b":2},
    {"a":5,"b":10,"c":20}
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)


print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))

# 只获取columns 列出的那几列数据
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))

DataFrame df9:    a   b     c
0  1   2   NaN
1  5  10  20.0
DataFrame df10:         a   b     c
first   1   2   NaN
second  5  10  20.0
DataFrame df11:    a   b
0  1   2
1  5  10


# from a dict of tuples
df12 = pd.DataFrame(
    {
        ("a","b"):{("A", "B"):1, ("A", "C"):2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}
    })
print("DataFrame df12:", df12)

DataFrame df12:        a              b      
       b    a    c    a     b
A B  1.0  4.0  5.0  7.0  10.0
  C  2.0  3.0  6.0  7.0   NaN
  D  NaN  NaN  NaN  NaN   9.0


# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# Series 里面定义的name，就是DataFrame里面的列 名称
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))

ser: a    0
b    1
c    2
Name: ser, dtype: int64
DataFrame df13:    ser
a    0
b    1
c    2
DataFrame df14:    ser
a    0
b    1
c    2
DataFrame df15:    ser name2
a    0   NaN
b    1   NaN
c    2   NaN


# Build a DataFrame from a list of namedtuples.
from collections import namedtuple

# The printed frame uses the namedtuple field names (x, y) as column labels;
# a plain tuple can be mixed into the same list.
Point = namedtuple("Point", ["x", "y"])
print("DataFrame df16:", pd.DataFrame([Point(x=0, y=0), Point(x=0, y=3), (2, 3)]))
# A row that is shorter than the others (a 2-field Point among Point3D rows)
# gets NaN in the missing position.
Point3D = namedtuple("Point3D", ["x", "y", "z"])
print("DataFrame df17:", pd.DataFrame([Point3D(x=0, y=0, z=0), Point3D(x=0, y=3, z=5), Point(x=2, y=3)]))

DataFrame df16:    x  y
0  0  0
1  0  3
2  2  3
DataFrame df17:    x  y    z
0  0  0  0.0
1  0  3  5.0
2  2  3  NaN


# from a list of dataclasses
from dataclasses import make_dataclass
Point = make_dataclass("Point", [("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame([Point(0,0), Point(0,3), Point(2,3)]))

DataFrame df18:    x  y
0  0  0
1  0  3
2  2  3

Alternate constructors

"""
DataFrame.from_dict

DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.
It operates like the DataFrame constructor except for the orient parameter
which is 'columns' by default,
but which can be set to 'index' in order to use the dict keys as row labels.
"""


# Default orient="columns": each dict key becomes a column label.
print("df19:", pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]}))

# orient="index": the dict keys become the row labels instead, so the result
# is the transpose of the default orientation. In this case the desired
# column names can also be passed via `columns`.
print("df20:", pd.DataFrame.from_dict(
    {"A": [1, 2, 3], "B": [4, 5, 6]},
    orient="index",
    columns=["one", "two", "three"],
))

# Without `columns`, the columns are labelled 0, 1, 2.
print("df21:", pd.DataFrame.from_dict(
    {"A": [1, 2, 3], "B": [4, 5, 6]},
    orient="index",
))

df19:    A  B
0  1  4
1  2  5
2  3  6
df20:    one  two  three
A    1    2      3
B    4    5      6
df21:    0  1  2
A  1  2  3
B  4  5  6

DataFrame.from_records

DataFrame.from_records() takes a list of tuples or an ndarray with structured dtype.
It works analogously to the normal DataFrame constructor, except that the resulting DataFrame index may be a specific field of the structured dtype.

【暂时不理解】

Column selection, addition, deletion

"""
You can treat a DataFrame semantically like a dict of like-indexed Series objects.
Getting, setting, and deleting columns works with the same syntax as the analogous dict operations:

"""


# 访问df 的某列，df的某列就是一个 Series
print("df1", df1)
print("df22", df1["one"])

df1["three"] = df1["one"] * df1["two"]
# 判断df1["one"]里面每个元素是否 大于2，结果是 一个Bool类型变量
df1["flag"] = df1["one"]>2
print("df23:", df1)

df1    one   two
a  1.0  10.0
b  2.0  20.0
c  3.0  30.0
d  NaN  40.0
df22 a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
df23:    one   two  three   flag
a  1.0  10.0   10.0  False
b  2.0  20.0   40.0  False
c  3.0  30.0   90.0   True
d  NaN  40.0    NaN  False

# Columns can be deleted or popped like with a dict:
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)

df three: a    10.0
b    40.0
c    90.0
d     NaN
Name: three, dtype: float64
df24:    one   flag
a  1.0  False
b  2.0  False
c  3.0   True
d  NaN  False

# when inserting a scalar value, it will naturally be propagated to fill the column.
df1["foo"] = "bar"
print("df25:", df1)

df25:    one   flag  foo
a  1.0  False  bar
b  2.0  False  bar
c  3.0   True  bar
d  NaN  False  bar


# when inserting a Series that does not have the same index as the DataFrame,it will be conformed to the DataFrame's index
## slicing 切片知识补充
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# [) 切片是一个 半闭半开 的区间
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])

print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)

tag[:] henry, hello slicing!
tag[1:2] e
tag[0:2] he
tag[:2] he
Series [:] a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
Series [:2] a    1.0
b    2.0
Name: one, dtype: float64
DataFrame df25:    one   flag  foo  one_trunc
a  1.0  False  bar        1.0
b  2.0  False  bar        2.0
c  3.0   True  bar        NaN
d  NaN  False  bar        NaN

# 指定位置插入一列
# You can insert raw ndarrays but their length must match the length of the DataFrame’s index.
# By default, columns get inserted at the end. DataFrame.insert() inserts at a particular location in the column
df1.insert(1,"insert_bar", df1["one"])
print("DataFrame df26:",df1)

DataFrame df26:    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN


# 通过 assign() 方法，从已有的列中 创造一个新的列
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
## assign  方法创造了新的列，但是不会改变之前的df数据，新的列是在 返回的数据里面
df2 = df1.assign(new_col=df1["one"]/df1["one_trunc"])
print("DataFrame df27:",df1) # df1 还是之前的结构，没有改变
print("DataFrame df28:",df2) # df2 才是改变后的结构

df1.head()    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN
df1['one'].head() a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
DataFrame df27:    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN
DataFrame df28:    one  insert_bar   flag  foo  one_trunc  new_col
a  1.0         1.0  False  bar        1.0      1.0
b  2.0         2.0  False  bar        2.0      1.0
c  3.0         3.0   True  bar        NaN      NaN
d  NaN         NaN  False  bar        NaN      NaN


#  通过函数的方式来创建新的列
# In the example above, we inserted a precomputed value.
# We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda x:(x["one"]+10))
print("DataFrame df29:", df3)

DataFrame df29:    one  insert_bar   flag  foo  one_trunc  func_col
a  1.0         1.0  False  bar        1.0      11.0
b  2.0         2.0  False  bar        2.0      12.0
c  3.0         3.0   True  bar        NaN      13.0
d  NaN         NaN  False  bar        NaN       NaN

20230310.csv

SepalLength,SepalWidth,PetalLength,PetalWidth,Name
 5.1 ,  3.5 ,         1.4 ,        0.2  ,Iris-setosa
  4.9 , 3.0  ,        1.4  ,       0.2 , Iris-setosa
4.7,3.2,1.3 ,0.2,Iris-setosa
4.6 ,        3.1  ,        1.5 ,        0.2  ,Iris-setosa
 5.0  ,       3.6  ,        1.4  ,       0.2,  Iris-setosa


# assign() always returns a copy of the data, leaving the original DataFrame untouched.
# assign 通常不会改变 原来的DataFrame数据，而是返回数据的拷贝
iris = pd.read_csv("20230310.csv")
print("csv data:", iris)
print("iris.assign:",iris.assign(sepal_ratio=iris["SepalWidth"] + 10).head() )
print("iris cloumns:", iris.columns)
## 注意，从csv读取的属性列 有空格
# iris cloumns: Index(['SepalLength', 'SepalWidth', 'PetalLength', '  PetalWidth', 'Name'], dtype='object')
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])


print("PetalRatio:",iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())

csv data:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name
0          5.1         3.5          1.4         0.2    Iris-setosa
1          4.9         3.0          1.4         0.2    Iris-setosa
2          4.7         3.2          1.3         0.2    Iris-setosa
3          4.6         3.1          1.5         0.2    Iris-setosa
4          5.0         3.6          1.4         0.2    Iris-setosa
iris.assign:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name  sepal_ratio
0          5.1         3.5          1.4         0.2    Iris-setosa         13.5
1          4.9         3.0          1.4         0.2    Iris-setosa         13.0
2          4.7         3.2          1.3         0.2    Iris-setosa         13.2
3          4.6         3.1          1.5         0.2    Iris-setosa         13.1
4          5.0         3.6          1.4         0.2    Iris-setosa         13.6
iris cloumns: Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Name'], dtype='object')
csv data:[PetalWidth] 0    0.2
1    0.2
2    0.2
3    0.2
4    0.2
Name: PetalWidth, dtype: float64
csv data:[PetalLength] 0    1.4
1    1.4
2    1.3
3    1.5
4    1.4
Name: PetalLength, dtype: float64
PetalRatio:    SepalLength  SepalWidth  PetalLength  PetalWidth           Name  PetalRatio
0          5.1         3.5          1.4         0.2    Iris-setosa    0.142857
1          4.9         3.0          1.4         0.2    Iris-setosa    0.142857
2          4.7         3.2          1.3         0.2    Iris-setosa    0.153846
3          4.6         3.1          1.5         0.2    Iris-setosa    0.133333
4          5.0         3.6          1.4         0.2    Iris-setosa    0.142857

"""
(
iris.query("SepalLength > 5")
.assign(
SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
).plot(kind="scatter", x="SepalRatio", y="PetalRatio")
)
"""

"""
The function signature for assign() is simply **kwargs.
The keys are the column names for the new fields,
and the values are either a value to be inserted (for example, a Series or NumPy array),
or a function of one argument to be called on the DataFrame.
A copy of the original DataFrame is returned, with the new values inserted.

The order of **kwargs is preserved. This allows for dependent assignment,
where an expression later in **kwargs can refer to a column created earlier in the same assign().
"""

# assign() evaluates its keyword arguments in order, so the lambda for D can
# read column C, which was created just before it in the same call (C = A + B).
dfa = pd.DataFrame(data={"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(
    C=lambda frame: frame["A"] + frame["B"],
    D=lambda frame: frame["A"] + frame["C"],
)
print("dfa:", dfa)
print("dfb:", dfb)

dfa:    A  B
0  1  4
1  2  5
2  3  6
dfb:    A  B  C   D
0  1  4  5   6
1  2  5  7   9
2  3  6  9  12

Indexing / selection

Operation                           Syntax                  Result
select column :                     df[col]                   Series
select row by label :               df.loc[label]             Series
Select row by integer location:     df.iloc[loc]              Series
Slice rows:                         df[5:10]                  DataFrame
Select rows by boolean vector:      df[bool_vec]              DataFrame

# Row selection, for example, returns a Series whose index is the columns of the DataFrame:
print("df30:", df1)
## 选出 某一行（b 是一个索引值，选出这个索引的行）Select row by label
print("df31:", df1.loc["b"])

df30:    one  insert_bar   flag  foo  one_trunc
a  1.0         1.0  False  bar        1.0
b  2.0         2.0  False  bar        2.0
c  3.0         3.0   True  bar        NaN
d  NaN         NaN  False  bar        NaN
df31: one               2
insert_bar        2
flag          False
foo             bar
one_trunc         2
Name: b, dtype: object

# Select row by integer location
print("df32:", df1.iloc[2])

df32: one              3
insert_bar       3
flag          True
foo            bar
one_trunc      NaN
Name: c, dtype: object

Data alignment and arithmetic

"""
Data alignment between DataFrame objects automatically align on both the columns and the index (row labels).
Again, the resulting object will have the union of the column and row labels.
"""


# Two frames of random values with different shapes: 10 rows x 4 columns
# and 7 rows x 3 columns.
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])
print("df40:", df)
print("df41:", df2)
# Addition aligns on both row and column labels; labels that exist in only
# one frame (rows 7-9 and column D here) come out as NaN.
print("df42:", df + df2)
# Subtracting a single row (a Series) matches its index against the columns
# and subtracts it from every row, so row 0 becomes all zeros.
print("df43:", df - df.iloc[0])

# Arithmetic with a scalar is applied element-wise.
print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)

df40:           A         B         C         D
0 -1.084084  0.183785 -1.153985  0.055283
1  0.253552  0.077291 -0.303460  0.701300
2  0.821357 -1.116865  0.610512 -1.327411
3 -0.251630 -0.341660  0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341  0.278375
5 -0.009694 -0.963907  1.593190 -0.991862
6  0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8  1.469154  0.924453 -0.305886 -0.527754
9 -0.416995  1.469462 -1.107226  0.941600
df41:           A         B         C
0 -0.184298  1.094119 -0.623001
1 -0.531990 -0.025734 -0.948708
2  0.877716 -1.547748 -0.753285
3 -0.248297 -1.370722  1.646786
4  0.958594 -0.373161  1.166930
5 -0.626382  1.731893  0.521530
6 -0.008678  0.955742  0.463842
df42:           A         B         C   D
0 -1.268382  1.277904 -1.776987 NaN
1 -0.278439  0.051558 -1.252168 NaN
2  1.699073 -2.664612 -0.142773 NaN
3 -0.499928 -1.712383  2.377560 NaN
4 -0.256934 -1.064431  0.614589 NaN
5 -0.636076  0.767986  2.114720 NaN
6  0.368556 -0.133869 -0.051676 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN
df43:           A         B         C         D
0  0.000000  0.000000  0.000000  0.000000
1  1.337636 -0.106494  0.850525  0.646017
2  1.905441 -1.300650  1.764497 -1.382694
3  0.832454 -0.525445  1.884759 -0.639659
4 -0.131444 -0.875055  0.601645  0.223092
5  1.074390 -1.147693  2.747175 -1.047145
6  1.461319 -1.273396  0.638467 -0.308072
7 -0.074698 -1.589367  0.964795 -1.842484
8  2.553238  0.740668  0.848100 -0.583037
9  0.667089  1.285677  0.046759  0.886317
df44:           A         B         C         D
0 -3.420421  2.918925 -3.769926  2.276415
1  3.267758  2.386457  0.482700  5.506499
2  6.106784 -3.584324  5.052559 -4.637055
3  0.741848  0.291698  5.653871 -0.921882
4 -4.077641 -1.456350 -0.761703  3.391873
5  1.951529 -2.819537  9.965949 -2.959312
6  3.886173 -3.448053 -0.577592  0.736056
7 -3.793910 -5.027910  1.054048 -6.936004
8  9.345768  6.622265  0.470572 -0.638769
9 -0.084974  9.347311 -3.536131  6.708002
df45:             A          B         C          D
0   -0.922438   5.441138 -0.866562  18.088755
1    3.943970  12.938049 -3.295328   1.425924
2    1.217498  -0.895364  1.637970  -0.753346
3   -3.974082  -2.926883  1.368412  -1.711226
4   -0.822688  -1.446613 -1.810477   3.592281
5 -103.155091  -1.037444  0.627672  -1.008204
6    2.650870  -0.917759 -1.939795  -3.955872
7   -0.862975  -0.711449 -5.285678  -0.559534
8    0.680664   1.081721 -3.269195  -1.894823
9   -2.398111   0.680521 -0.903158   1.062022
df46:               A         B         C          D
0  1.381186e+00  0.001141  1.773377   0.000009
1  4.133002e-03  0.000036  0.008480   0.241888
2  4.551214e-01  1.555974  0.138924   3.104715
3  4.009152e-03  0.013626  0.285189   0.116619
4  2.183033e+00  0.228345  0.093074   0.006005
5  8.831558e-09  0.863260  6.442732   0.967845
6  2.025099e-02  1.409565  0.070628   0.004083
7  1.803047e+00  3.903235  0.001281  10.202189
8  4.658744e+00  0.730364  0.008755   0.077576
9  3.023589e-02  4.662659  1.502954   0.786080


# Boolean operators (&, |, ^) work element-wise on frames of bool dtype.
df1 = pd.DataFrame(
    {"a": [True, False, True], "b": [False, True, True]},
    dtype=bool,
)
df2 = pd.DataFrame(
    {"a": [False, True, True], "b": [True, True, False]},
    dtype=bool,
)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)
print("df48:", df1 | df2)
print("df49:", df1 ^ df2)
# Unary minus on a bool frame prints the element-wise negation of df1.
print("df50:", -df1)

df1:        a      b
0   True  False
1  False   True
2   True   True
df2:        a      b
0  False   True
1   True   True
2   True  False
df47:        a      b
0  False  False
1  False   True
2   True  False
df48:       a     b
0  True  True
1  True  True
2  True  True
df49:        a      b
0   True   True
1   True  False
2  False   True
df50:        a      b
0  False   True
1   True  False
2  False  False

## 旋转 DataFrame 里面的数据
print("df", df)
print("df[:5]", df[:5].T)


df           A         B         C         D
0 -1.084084  0.183785 -1.153985  0.055283
1  0.253552  0.077291 -0.303460  0.701300
2  0.821357 -1.116865  0.610512 -1.327411
3 -0.251630 -0.341660  0.730774 -0.584376
4 -1.215528 -0.691270 -0.552341  0.278375
5 -0.009694 -0.963907  1.593190 -0.991862
6  0.377235 -1.089611 -0.515518 -0.252789
7 -1.158782 -1.405582 -0.189190 -1.787201
8  1.469154  0.924453 -0.305886 -0.527754
9 -0.416995  1.469462 -1.107226  0.941600
df[:5]           0         1         2         3         4
A -1.084084  0.253552  0.821357 -0.251630 -1.215528
B  0.183785  0.077291 -1.116865 -0.341660 -0.691270
C -1.153985 -0.303460  0.610512  0.730774 -0.552341
D  0.055283  0.701300 -1.327411 -0.584376  0.278375



### https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html#pandas.DataFrame.filter
### https://www.cnblogs.com/mehome/p/9513492.html

## pandas.DataFrame.filter
# A 2 x 3 integer frame with labelled rows and columns.
df = pd.DataFrame(
    data=np.array([[1, 2, 3], [4, 5, 6]]),
    index=["mouse", "rabbit"],
    columns=["one", "two", "three"],
)
print("df is :", df)

# items= keeps exactly the named columns.
print("df.filter select columns:", df.filter(items=["one", "three"]))

df is :         one  two  three
mouse     1    2      3
rabbit    4    5      6

# 使用正则表达式来匹配 行或者列 
# select columns by regular expression.  $:匹配字符串末尾
print("df.filter regex columns:", df.filter(regex="e$", axis=1))
print("df.filter regex row2:", df.filter(regex="ous", axis=0))

df.filter select columns:         one  three
mouse     1      3
rabbit    4      6
df.filter regex columns:         one  three
mouse     1      3
rabbit    4      6
df.filter regex row2:        one  two  three
mouse    1    2      3

## 用like 来筛选， axis=0表示行，=1表示列 
print("df.filter like 1(column): ", df.filter(like="e", axis=1))
print("df.filter like 0(index): ", df.filter(like="bbi", axis=0))

df.filter like 1(column):          one  three
mouse     1      3
rabbit    4      6
df.filter like 0(index):          one  two  three
rabbit    4    5      6

完整的脚本

#!/usr/bin/env python

import numpy as np
import pandas as pd

# ------------------------ Series
from pandas import Series

"""
Series

Series is a one-dimensional labeled array capable of holding any data type 
(integers, strings, floating point numbers, Python objects, etc.). 

The axis labels are collectively referred to as the index. 
The basic method to create a Series is to call:
s = pd.Series(data, index=index)
"""

# from ndarray
s1: Series = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
print("pd s1:\n", s1)
print("pd s1.index:", s1.index)
print("pd s1.values:", s1.values)

# from dict
d = {"b": 1, "a": 0, "c": 2}
s2 = pd.Series(d)
print("pd s2:\n", s2)
s3 = pd.Series(d, index=["b", "c", "d", "a"])
print("pd s3:\n", s3)

# from scalar value
s4 = pd.Series(5.0, index=["a", "b", "c", "d", "e"])
print("pd s4:\n", s4)

# Series is ndarray-like，可以像数组一样访问 Series 里面的数据
print("pd s3.array:\n", s3.array[1])
# Series is dict-like. 可以像字典一样通过索引标签访问 Series 里面的数据
print("pd s3.['b']:\n", s3['b'])
print("test s3 key:", "b" in s3)
print("test s3 key:", "f" in s3)

# Using the Series.get() method, a missing label will return None or the specified default:
print("Series.get() method:", s3.get("name"))
print("Series.get() method:", s3.get("name", np.nan))
print("Series.get() method:", s3.get("name", "henry"))

# Series also has a name attribute:
s5 = pd.Series(np.random.randn(5), name="henry")
print("pd s5:\n", s5)

s6 = s5.rename("henry2")
print("pd s6:\n", s6)
print("pd s6.head():\n", s6.head())
print("pd s6.head(2):\n", s6.head(2))

# ------------------------ DataFrame
"""
DataFrame

DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. 
You can think of it like a spreadsheet or SQL table, 
or a dict of Series objects. 

It is generally the most commonly used pandas object. 
Like Series, DataFrame accepts many different kinds of input:

  Dict of 1D ndarrays, lists, dicts, or Series
  2-D numpy.ndarray
  Structured or record ndarray
  A Series
  Another DataFrame

Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments. 
If you pass an index and / or columns, 
you are guaranteeing the index and / or columns of the resulting DataFrame. 
Thus, a dict of Series plus a specific index will discard all data 
not matching up to the passed index.

If axis labels are not passed, 
they will be constructed from the input data based on common sense rules.

"""

# from dict of Series or dicts
d2 = {
    "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
    "two": pd.Series([10.0, 20.0, 30.0, 40.0], index=["a", "b", "c", "d"])
}
df1 = pd.DataFrame(d2)
print("DataFrame df1:", df1)

df2 = pd.DataFrame(d2, index=["d", "b", "a"])
print("DataFrame df2:", df2)
df2.rename(columns={'two': 'symbol'}, inplace=True)
print("DataFrame df2.rename:", df2)

df3 = pd.DataFrame(d2, index=["d", "b", "a"], columns=["two", "three"])
print("DataFrame df3:", df3)

# The row and column labels can be accessed respectively by accessing the index and columns attributes:
#  索引名称
print("df3.index:", df3.index)
# 列 名称
print("df3.columns:", df3.columns)

# from dict of ndarrays / lists
d = {
    "one": [1.0, 2.0, 3.0, 4.0],
    "two": [4.0, 3.0, 2.0, 1.0]
}
df4 = pd.DataFrame(d)
print("DataFrame df4:", df4)

df5 = pd.DataFrame(d, index=["a", "b", "c", "d"])
print("DataFrame df5:", df5)

# Build a DataFrame from a structured (record) ndarray.
# Fields: "A" is a 4-byte int, "B" a 4-byte float, "C" a 10-byte bytes string.
# "S10" is used instead of the legacy "a10" spelling: the "a" type code is
# deprecated in NumPy 2.0, and both spellings describe the same dtype.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
print("DataFrame data1:", data)
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print("DataFrame data2:", data)
# The field names become the column names.
df6 = pd.DataFrame(data)
print("DataFrame df6:", df6)

# Same records, with custom row labels.
df7 = pd.DataFrame(data, index=["first", "second"])
print("DataFrame df7:", df7)

# `columns` selects and reorders the fields.
df8 = pd.DataFrame(data, columns=["C", "A", "B"])
print("DataFrame df8:", df8)

# from a list of dicts
data2 = [
    {"a": 1, "b": 2},
    {"a": 5, "b": 10, "c": 20}
]
df9 = pd.DataFrame(data2)
print("DataFrame df9:", df9)

print("DataFrame df10:", pd.DataFrame(data2, index=["first", "second"]))

# 只获取columns 列出的那几列数据
print("DataFrame df11:", pd.DataFrame(data2, columns=["a", "b"]))

# from a dict of tuples
df12 = pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 7},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}
    })
print("DataFrame df12:", df12)

# from a Series
ser = pd.Series(range(3), index=list("abc"), name="ser")
print("ser:", ser)
print("DataFrame df13:", pd.DataFrame(ser))
# Series 里面定义的name，就是DataFrame里面的列 名称
print("DataFrame df14:", pd.DataFrame(ser, columns=["ser"]))
print("DataFrame df15:", pd.DataFrame(ser, columns=["ser", "name2"]))

# from a list of namedtuples
from collections import namedtuple

Point = namedtuple("Point", "x y")
print("DataFrame df16:", pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]))
Point3D = namedtuple("Point3D", "x y z")
print("DataFrame df17:", pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]))

# Construct from a list of dataclass instances.
from dataclasses import make_dataclass

# Point is bound here to a dataclass with two int fields, x and y.
Point = make_dataclass("Point", fields=[("x", int), ("y", int)])
print("DataFrame df18:", pd.DataFrame(data=[Point(0, 0), Point(0, 3), Point(2, 3)]))

# --------------------------------------- Alternate constructors
# DataFrame.from_dict
#
# DataFrame.from_dict() takes a dict of dicts or a dict of array-like
# sequences and returns a DataFrame. It operates like the DataFrame
# constructor except for the orient parameter, which is 'columns' by default
# but can be set to 'index' in order to use the dict keys as row labels.
print("df19:", pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]}))

# With orient="index" the dict keys become the row labels instead of the
# column labels (the table is effectively rotated), and the desired column
# names can be passed as well.
print(
    "df20:",
    pd.DataFrame.from_dict(
        {"A": [1, 2, 3], "B": [4, 5, 6]},
        orient="index",
        columns=["one", "two", "three"],
    ),
)

# Same orientation without explicit column names.
print(
    "df21:",
    pd.DataFrame.from_dict({"A": [1, 2, 3], "B": [4, 5, 6]}, orient="index"),
)

# DataFrame.from_records
#
# DataFrame.from_records() takes a list of tuples or an ndarray with
# structured dtype. It works analogously to the normal DataFrame constructor,
# except that the resulting DataFrame index may be a specific field of the
# structured dtype.

# ----------------- Column selection, addition, deletion
# A DataFrame can be treated semantically like a dict of like-indexed Series
# objects: getting, setting, and deleting columns works with the same syntax
# as the analogous dict operations.

# Selecting one column of the DataFrame gives back a Series.
print("df1", df1)
print("df22", df1["one"])

# New column computed element-wise from two existing columns.
df1["three"] = df1["one"] * df1["two"]
# Element-wise comparison of df1["one"] against 2; the outcome is stored as a
# boolean column.
df1["flag"] = df1["one"] > 2
print("df23:", df1)

# Columns can be deleted or popped, exactly as with a dict.
del df1["two"]
three = df1.pop("three")
print("df three:", three)
print("df24:", df1)

# Assigning a scalar fills the whole column with that value.
df1["foo"] = "bar"
print("df25:", df1)

# A Series whose index differs from the DataFrame's is conformed to the
# DataFrame's index when it is inserted.
# Side note: a quick refresher on slicing.
tag = "henry, hello slicing!"
print("tag[:]", tag[:])
# A slice is a half-open interval: the start is included, the stop is not.
print("tag[1:2]", tag[1:2])
print("tag[0:2]", tag[0:2])
print("tag[:2]", tag[:2])

print("Series [:]", df1["one"][:])
print("Series [:2]", df1["one"][:2])
# Only the first two entries are supplied for the new column.
df1["one_trunc"] = df1["one"][:2]
print("DataFrame df25:", df1)

# Insert a column at a chosen position.
# Raw ndarrays can be inserted too, but their length must match the length of
# the DataFrame's index. Columns are appended at the end by default;
# DataFrame.insert() places the column at a particular location instead.
df1.insert(loc=1, column="insert_bar", value=df1["one"])
print("DataFrame df26:", df1)

# Derive a new column from existing ones with assign().
print("df1.head()", df1.head())
print("df1['one'].head()", df1["one"].head())
# assign() adds the column to the frame it returns; the frame it was called
# on is left as it was.
df2 = df1.assign(new_col=df1["one"] / df1["one_trunc"])
print("DataFrame df27:", df1)  # df1 still has its previous structure
print("DataFrame df28:", df2)  # df2 is the frame carrying the new column

# The new column can also be produced by a function.
# Above, a precomputed value was inserted. assign() also accepts a function of
# one argument, which is evaluated on the DataFrame being assigned to.
df3 = df1.assign(func_col=lambda frame: frame["one"] + 10)
print("DataFrame df29:", df3)

# assign() always returns a copy of the data, leaving the original DataFrame
# untouched.
iris = pd.read_csv("20230310.csv")
# The header of this CSV was observed to contain padded names (for example
# '  PetalWidth'). Strip the labels so that iris["PetalWidth"] and the
# attribute access x.PetalWidth used below resolve instead of raising.
iris.columns = iris.columns.str.strip()
print("csv data:", iris)
# sepal_ratio is the width-to-length ratio of the sepal.
print("iris.assign:", iris.assign(sepal_ratio=iris["SepalWidth"] / iris["SepalLength"]).head())
print("iris cloumns:", iris.columns)
print("csv data:[PetalWidth]", iris["PetalWidth"])
print("csv data:[PetalLength]", iris["PetalLength"])

# The new column is computed by a function evaluated on the frame itself.
print("PetalRatio:", iris.assign(PetalRatio=lambda x: x.PetalWidth / x.PetalLength).head())

# Method-chaining example, kept disabled (NOTE(review): presumably because
# .plot() needs a plotting backend -- confirm before enabling):
# (
#     iris.query("SepalLength > 5")
#     .assign(
#         SepalRatio=lambda x: x.SepalWidth / x.SepalLength,
#         PetalRatio=lambda x: x.PetalWidth / x.PetalLength,
#     )
#     .plot(kind="scatter", x="SepalRatio", y="PetalRatio")
# )

# The function signature for assign() is simply **kwargs. The keys are the
# column names for the new fields, and the values are either a value to be
# inserted (for example, a Series or NumPy array), or a function of one
# argument to be called on the DataFrame. A copy of the original DataFrame is
# returned, with the new values inserted.
#
# The order of **kwargs is preserved. This allows for dependent assignment,
# where an expression later in **kwargs can refer to a column created earlier
# in the same assign().

# In the second expression, frame["C"] refers to the column created by the
# first one, i.e. dfa["A"] + dfa["B"].
dfa = pd.DataFrame(data={"A": [1, 2, 3], "B": [4, 5, 6]})
dfb = dfa.assign(
    C=lambda frame: frame["A"] + frame["B"],
    D=lambda frame: frame["A"] + frame["C"],
)
print("dfa:", dfa)
print("dfb:", dfb)

# ---------------------------- Indexing / selection
# Operation                           Syntax                  Result
# select column :                     df[col]                   Series
# select row by label :               df.loc[label]             Series
# Select row by integer location:     df.iloc[loc]              Series
# Slice rows:                         df[5:10]                  DataFrame
# Select rows by boolean vector:      df[bool_vec]              DataFrame

# Selecting a row returns a Series whose index is the DataFrame's columns.
print("df30:", df1)
# Select a row by label: "b" is an index value, and the row carrying that
# index value is returned.
print("df31:", df1.loc["b"])

# Select a row by integer position.
print("df32:", df1.iloc[2])

# --------------------------------- Data alignment and arithmetic

# Arithmetic between DataFrame objects aligns automatically on both the
# columns and the index (row labels); the result carries the union of the
# column and row labels.
df = pd.DataFrame(np.random.randn(10, 4), columns=list("ABCD"))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list("ABC"))
print("df40:", df)
print("df41:", df2)
print("df42:", df + df2)
# Subtract the first row from every row.
print("df43:", df - df.iloc[0])

# Arithmetic with scalars is applied element-wise.
print("df44:", df * 5 + 2)
print("df45:", 1 / df)
print("df46:", df ** 4)

# Boolean frames support the element-wise logical operators.
df1 = pd.DataFrame(data={"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame(data={"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool)
print("df1:", df1)
print("df2:", df2)
print("df47:", df1 & df2)
print("df48:", df1 | df2)
print("df49:", df1 ^ df2)
print("df50:", -df1)

# Transpose: swap the rows and columns of the first five rows.
print("df", df)
print("df[:5]", df.iloc[:5].T)


# References:
#   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.filter.html#pandas.DataFrame.filter
#   https://www.cnblogs.com/mehome/p/9513492.html
# pandas.DataFrame.filter
df = pd.DataFrame(
    np.array([[1, 2, 3], [4, 5, 6]]),
    index=["mouse", "rabbit"],
    columns=["one", "two", "three"],
)
print("df is :",df)

# Pick columns by their exact names.
print("df.filter select columns:",df.filter(items=["one", "three"]))

# Pick labels with a regular expression ("$" anchors the end of the string):
# axis=1 matches against column labels, axis=0 against row labels.
print("df.filter regex columns:", df.filter(regex="e$", axis=1))
print("df.filter regex row2:", df.filter(regex="ous", axis=0))

# Pick labels that contain a given substring.
print("df.filter like 1(column): ", df.filter(like="e", axis=1))
print("df.filter like 0(index): ", df.filter(like="bbi", axis=0))

pandas DataFrame,Series

posted @ 2023-03-09 21:33 He_LiangLiang 阅读(83) 评论(0) 编辑收藏举报

刷新页面返回顶部

小乌龟的笔记本

业精于勤荒于嬉，行成于思毁于随。记录学习、工作中的点点滴滴，沉淀自己。

python-pandas DataFrame,Series笔记1

python-pandas DataFrame,Series笔记1

Series

DataFrame

Alternate constructors

DataFrame.from_records

Column selection, addition, deletion

Indexing / selection

Data alignment and arithmetic

公告