[PY3]——字符串的分割、匹配、搜索方法总结

？分割、匹配、搜索时可以用到什么样的解决方法？

分割方法总结

1. str.split( )

* 分割字符串

* 返回列表

s1='I  love  python'
# 默认以任意空白字符（空格、制表符、换行等）为界定符，且连续多个空白都当做一个处理
print(s1.split())
['I', 'love', 'python']

# (s1中单词之间有两个空格)如果这时显式指定空格为界定符，则每个空格都会分割一次，相邻两个空格之间会产生一个空字符串''
print(s1.split(' '))
['I', '', 'love', '', 'python']

# 可指定任意字符/字符串作为界定符
print(s1.split('o'))
['I  l', 've  pyth', 'n']

# maxsplit=n，指定分割n次
print(s1.split(maxsplit=1))
['I', 'love  python']

2. re.split()

* 可定义多个界定符

import re
line = 'asdf fjdk; afed, fjek,asdf, foo'

# 可指定多个字符作为界定符
print(re.split(r'[;,\s]\s*',line))
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

# 加一个括号表示捕获分组
print(re.split(r'(;|,|\s)\s*',line))
['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

# (?:)强调为非捕获分组
print(re.split(r'(?:,|;|\s)\s*',line))
['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

搜索和匹配方法总结

1. str.startswith() | str.endswith()

* 开头/结尾匹配
* 返回True/False
* 常用于“判断文件夹中是否存在指定文件类型”、“URL”

url="http://www.python.org"
# startswith('string')判断是否以string开头
print(url.startswith('http'))
True

# endswith('string')判断是否以string结尾
print(url.endswith('com'))
False

# endswith('string',n,m) 可指定索引范围[n,m)（含n不含m），startswith同理；这里url[11:17]为'python'
print(url.endswith('n',11,17))
True

# 要注意一个特性，如果要一次检查多个候选项，传递给startswith/endswith的必须是tuple，不能是list
choices=['http:','ftp:']
print(url.startswith(choices))
TypeError: startswith first arg must be str or a tuple of str, not list
print(url.startswith(tuple(choices)))
True

# endswith()，应用在检索/判断，一个目录中是否有某一类型结尾的文件
import os
filenames=os.listdir('/test')

#Example-1
print(filenames)
['aa', 'zhuabao', '.python-version', 'test.sh', 'hh.c', '.test.py.swp', 'zhuabao2', 'abc', 'linshi.sh']
print([candsh for candsh in filenames if candsh.endswith(('.sh','.c'))])
['test.sh', 'hh.c', 'linshi.sh']

#Example-2
if any(name.endswith(('.sh','.c')) for name in os.listdir('/test')):
    print('have')
have

2. fnmatch() | fnmatchcase()

* 使用Shell通配符匹配

3. str.find()

* 返回索引

4. re.match(r'')

* 使用正则表达式匹配

* 只检查字符串开始位置

5. re.findall(r'')

* 从任意位置开始匹配
* 以列表方式返回

6. re.finditer(r'')

* 以迭代方式返回

7. r' $'——>正则表达式以$结尾

* 确保精确

8. re.compile(r'')——>先编译正则表达式

* 做多次/大量的匹配和搜索操作时

import re
text1='2017/07/26'
text2='Nov 27,2012'
text3='Today is 11/27/2012. PyCon starts 3/13/2013.'
text5='26/07/2017 is today,PyCon starts 3/13/2013.'

# 编译一个匹配 m/d/y 这类“数字/数字/数字”日期格式的正则表达式
datepat=re.compile(r'\d+/\d+/\d+')

# re.match('string')实现在string中搜索
print(datepat.match(text1))
<_sre.SRE_Match object; span=(0, 10), match='2017/07/26'>
print(datepat.match(text2))
None

# 我们发现re.match() 只能实现从开始位置搜索，也只能搜索出开头的第一个匹配项
print(datepat.match(text3))
None
print(datepat.match(text5))
<_sre.SRE_Match object; span=(0, 10), match='26/07/2017'>

# 这种情况有时可能得不到我们想要的结果，一种情况是可以在末尾加$，实现精确匹配
text6='26/07/2017abcdef'
datepat1=re.compile(r'\d+/\d+/\d+')
print(datepat1.match(text6))
<_sre.SRE_Match object; span=(0, 10), match='26/07/2017'>
datepat2=re.compile(r'\d+/\d+/\d+$')
print(datepat2.match(text6))
None

# 另一种情况是可以考虑使用re.findall('string')，它可在string中的全部位置进行搜索
print(datepat.findall(text3))
['11/27/2012', '3/13/2013']

# re.findall返回列表，re.finditer()返回迭代对象
for m in datepat.finditer(text5):
    print(m.groups())

# # 捕获分组 # #
datepat=re.compile(r'(\d+)/(\d+)/(\d+)')
m=datepat.match(text1)
print(m.group(0))
2017/07/26
print(m.group(1))
2017
print(m.group(2))
07
print(m.group(3))
26
print(m.groups())
('2017', '07', '26')

for month,day,year in datepat.findall(text3):
    print('{}-{}-{}'.format(year,month,day))
2012-11-27
2013-3-13

9. ?修饰符

* 将贪婪匹配变为非贪婪匹配

* 从而实现最短匹配模式

text6 = 'Computer says "no." Phone says "yes."'
pat1=re.compile(r'\"(.*)\"')  #匹配引号包含的文本
print(pat1.findall(text6))
['no." Phone says "yes.']

pat2=re.compile(r'\"(.*?)\"') #增加 ?修饰符
print(pat2.findall(text6))
['no.', 'yes.']

10. (?:.|\n) | re.DOTALL

* 使得（.）能够匹配包括换行符在内的所有字符

* 从而实现多行匹配模式

text7=''' /*this is a
multiline comment*/
'''

pat1=re.compile(r'/\*(.*?)\*/')
print(pat1.findall(text7))
[]                                      #为什么没匹配出来，因为(.)并不能匹配换行符

pat2=re.compile(r'/\*((?:.|\n)*?)\*/')  #把(.) ——> (?:.|\n)
print(pat2.findall(text7))
['this is a\nmultiline comment']

# re.DOTALL可以让正则表达式中的点(.)匹配包括换行符在内的任意字符
pat3=re.compile(r'/\*(.*?)\*/',re.DOTALL)
print(pat3.findall(text7))
['this is a\nmultiline comment']

搜索和替换方法总结

1. str.replace()

# S.replace(old, new[, count]) -> str

text5="a b c d e e e"
print(text5.replace("e","a"))
# a b c d a a a
print(text5.replace("e","a",2))
# a b c d a a e

2. re.sub() | re.sub(flags=re.IGNORECASE)

* 匹配并替换 | 忽略大小写匹配

# sub(pattern, repl, string, count=0, flags=0)
# 第1个参数：匹配什么
# 第2个参数：替换成什么（可以是字符串，也可以是回调函数）
# 第3个参数：处理的文本
# 第4个参数：替换次数
text1="l o v e"
print(re.sub(r'\s','-',text1))
# l-o-v-e
print(re.sub(r'\s','-',text1,count=1))
# l-o v e

# flags=re.IGNORECASE 忽略大小写
text3 = 'UPPER PYTHON, lower python, Mixed Python'
print(re.sub('python','snake',text3,flags=re.IGNORECASE))
# UPPER snake, lower snake, Mixed snake

# 如果想替换字符跟匹配字符的大小写保持一致，我们需要一个辅助函数
def matchcase(word):
    """Build a ``re.sub`` replacement callback that mirrors the match's case.

    Args:
        word: Replacement text to substitute for each match.

    Returns:
        A function that takes a match object and returns ``word`` upper-cased
        if the matched text is all upper case, lower-cased if it is all lower
        case, capitalized if only its first character is upper case, and
        unchanged otherwise.
    """
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        if text.islower():
            return word.lower()
        # Slice instead of indexing: a pattern that can match the empty
        # string would otherwise raise IndexError on text[0].
        if text[:1].isupper():
            return word.capitalize()
        return word
    return replace
print(re.sub('python',matchcase('snake'),text3,flags=re.IGNORECASE))
# UPPER SNAKE, lower snake, Mixed Snake

3. re.compile()

* 同理，多次替换时可先进行编译

# 同样可以先编译、可以捕获分组
text2='Today is 11/27/2012. PyCon starts 3/13/2013.'
datepat=re.compile(r'(\d+)/(\d+)/(\d+)')
print(datepat.sub(r'\3-\1-\2',text2))
# Today is 2012-11-27. PyCon starts 2013-3-13.

4. re.subn()

* 获取替换的次数

# re.subn()可以统计替换发生次数
newtext,n=datepat.subn(r'\3-\1-\2',text2)
print(newtext)
# Today is 2012-11-27. PyCon starts 2013-3-13.
print(n)
# 2

posted @ 2017-07-31 20:48 Jelly_lyj 阅读(533) 评论(0) 编辑收藏举报

刷新页面返回顶部

Jelly_lyj

Thoughts, Stories and Ideas.