import numpy as np
import pandas as pd

# Demo data: a Series and a DataFrame of strings, each holding one NaN.
# NOTE: the outputs quoted in the comments below were recorded from a run of
# the original script; dtype labels and spacing can differ between pandas
# versions.
s = pd.Series(['A', 'b', 'c', 'bbhello', '123', np.nan, 'hj'])
df = pd.DataFrame({'key1': list('abcdef'),
                   'key2': ['hee', 'fv', 'w', 'hija', '123', np.nan]})
print(s)
print('-' * 8)
print(df)
print('-' * 8)
# Recorded output:
# 0          A
# 1          b
# 2          c
# 3    bbhello
# 4        123
# 5        NaN
# 6         hj
# dtype: object
# --------
#   key1  key2
# 0    a   hee
# 1    b    fv
# 2    c     w
# 3    d  hija
# 4    e   123
# 5    f   NaN
# --------

# String methods are reached through the .str accessor. It is used here on a
# Series (s, df['key2']) and on an Index (df.columns), i.e. on one column or
# on the column labels rather than on the DataFrame as a whole. NaN entries
# are skipped and come back as NaN.
print(s.str.count('b'))
# Recorded output:
# 0    0.0
# 1    1.0
# 2    0.0
# 3    2.0
# 4    0.0
# 5    NaN
# 6    0.0
# dtype: float64

print(df['key2'].str.upper())
# Recorded output:
# 0     HEE
# 1      FV
# 2       W
# 3    HIJA
# 4     123
# 5     NaN
# Name: key2, dtype: object

# Upper-case every column name.
df.columns = df.columns.str.upper()
print(df)
# Recorded output:
#   KEY1  KEY2
# 0    a   hee
# 1    b    fv
# 2    c     w
# 3    d  hija
# 4    e   123
# 5    f   NaN

# Common string methods -- lower, upper, len, startswith, endswith.
# Each print emits its label followed by the Series, so the first row of the
# Series shares a line with the label.
print('小写,lower()', s.str.lower())
print('大写,upper()', s.str.upper())
print('长度,len()', s.str.len())
print('判断起始是否为b,startswith()', s.str.startswith('b'))
print('判断结束是否为"o",endswith()', s.str.endswith('o'))
# Recorded output:
# 小写,lower() 0          a
# 1          b
# 2          c
# 3    bbhello
# 4        123
# 5        NaN
# 6         hj
# dtype: object
# 大写,upper() 0          A
# 1          B
# 2          C
# 3    BBHELLO
# 4        123
# 5        NaN
# 6         HJ
# dtype: object
# 长度,len() 0    1.0
# 1    1.0
# 2    1.0
# 3    7.0
# 4    3.0
# 5    NaN
# 6    2.0
# dtype: float64
# 判断起始是否为b,startswith() 0    False
# 1     True
# 2    False
# 3     True
# 4    False
# 5      NaN
# 6    False
# dtype: object
# 判断结束是否为"o",endswith() 0    False
# 1    False
# 2    False
# 3     True
# 4    False
# 5      NaN
# 6    False
# dtype: object
# Common string methods -- strip, lstrip, rstrip.
# Whitespace is the whole point of this section and is invisible in a printed
# table, so the results are described below as list reprs instead.

s2 = pd.Series([' jack', 'jill ', ' jesse '])
# The cell values are random (unseeded), so they differ on every run; only
# the column labels matter for this demo.
df2 = pd.DataFrame(np.random.randn(3, 2), columns=[' A ', ' B'], index=range(3))
print(s2)
print('-' * 8)
print(df2)
print('-' * 8)
# s2 values   -> [' jack', 'jill ', ' jesse ']
# df2 columns -> [' A ', ' B']

print(s2.str.strip())
print('-' * 8)
print(s2.str.lstrip())
print('-' * 8)
print(s2.str.rstrip())
# strip()  -> ['jack', 'jill', 'jesse']      (both ends trimmed)
# lstrip() -> ['jack', 'jill ', 'jesse ']    (leading whitespace only)
# rstrip() -> [' jack', 'jill', ' jesse']    (trailing whitespace only)

# The same accessor works on the column Index: trim the column labels.
df2.columns = df2.columns.str.strip()
print(df2)
# df2 columns -> ['A', 'B']; the cell values are unchanged by the rename.

# Common string methods -- replace().
# n caps the number of replacements made in each string. regex=False treats
# the pattern as a literal string, so the behavior does not depend on the
# pandas version's default for `regex`.
# The cell values are random (unseeded) and differ on every run.
df3 = pd.DataFrame(np.random.randn(3, 2), columns=[' A a', ' B b'], index=range(3))
df3.columns = df3.columns.str.replace(' ', '-', n=2, regex=False)
print(df3)
# df3 columns -> ['-A-a', '-B-b']
# Each label contains exactly two spaces, so n=2 replaces all of them; a
# label with a third space would keep it.
# Common string methods -- split, rsplit.
# NOTE: the outputs quoted in the comments below were recorded from a run of
# the original script; dtype labels and spacing can differ between pandas
# versions.
# The third element is a list rather than a string, so every .str method
# yields NaN for it, just as for the real NaN in the last row.
s4 = pd.Series(['a,b,c', '1,2,3', ['a,,,c'], np.nan])
# Split once and reuse the result for the prints below.
s4_parts = s4.str.split(',')
print(s4)
print(s4_parts)
# Recorded output:
# 0      a,b,c
# 1      1,2,3
# 2    [a,,,c]
# 3        NaN
# dtype: object
# 0    [a, b, c]
# 1    [1, 2, 3]
# 2          NaN
# 3          NaN
# dtype: object

# Each element of the split result is a list.
# Its items can be picked out with .str[] or .str.get().
print(s4_parts.str[0])
print(s4_parts.str.get(0))
# Recorded output:
# 0      a
# 1      1
# 2    NaN
# 3    NaN
# dtype: object
# 0      a
# 1      1
# 2    NaN
# 3    NaN
# dtype: object

# expand=True spreads the pieces over the columns of a DataFrame.
# The n parameter limits the number of splits.
print(s4_parts)
print('-' * 8)
print(s4.str.split(',', expand=True))
# Recorded output:
# 0    [a, b, c]
# 1    [1, 2, 3]
# 2          NaN
# 3          NaN
# dtype: object
# --------
#      0    1    2
# 0    a    b    c
# 1    1    2    3
# 2  NaN  NaN  NaN
# 3  NaN  NaN  NaN

# Computed once; printed here and again below next to the rsplit result.
s4_split_once = s4.str.split(',', expand=True, n=1)
print(s4_split_once)
# Recorded output:
#      0    1
# 0    a  b,c
# 1    1  2,3
# 2  NaN  NaN
# 3  NaN  NaN

# rsplit works like split but in the opposite direction, from the end of the
# string towards its start.
print(s4_split_once)
print('-' * 8)
print(s4.str.rsplit(',', expand=True, n=1))
# Recorded output:
#      0    1
# 0    a  b,c
# 1    1  2,3
# 2  NaN  NaN
# 3  NaN  NaN
# --------
#      0    1
# 0  a,b    c
# 1  1,2    3
# 2  NaN  NaN
# 3  NaN  NaN

# Splitting a DataFrame column into several new columns.
# NOTE: the outputs quoted in the comments below were recorded from a run of
# the original script; dtype labels and spacing can differ between pandas
# versions.
# The last row of each column holds a list, not a string, so .str methods
# return NaN for it.
df4 = pd.DataFrame({'key1': ['a,b,c', '1,2,3', [':,,, ']],
                    'key2': ['a-b-c', '1-2-3', [':-.- ']]})
# Split key2 once and reuse the pieces for the print and all three columns.
key2_parts = df4['key2'].str.split('-')
print(df4)
print('-' * 8)
print(key2_parts)
# Recorded output:
#       key1     key2
# 0    a,b,c    a-b-c
# 1    1,2,3    1-2-3
# 2  [:,,, ]  [:-.- ]
# --------
# 0    [a, b, c]
# 1    [1, 2, 3]
# 2          NaN
# Name: key2, dtype: object

# Pick each piece out by position and store it in its own column.
for position, column in enumerate(('k201', 'k202', 'k203')):
    df4[column] = key2_parts.str[position]
print(df4)
# Recorded output:
#       key1     key2 k201 k202 k203
# 0    a,b,c    a-b-c    a    b    c
# 1    1,2,3    1-2-3    1    2    3
# 2  [:,,, ]  [:-.- ]  NaN  NaN  NaN