# Text length
text1 = "Ethics are built right into the ideals and objectives of the United Nations "

len(text1)  # Number of characters in text1 (the trailing space is counted)
76
# Number of words in the text
# Split text1 on single spaces. Because text1 ends with a space, the
# resulting list has an empty string as its last element.
text2 = text1.split(' ')

len(text2)
14
text2  # Show the list of tokens produced by the split
['Ethics', 'are', 'built', 'right', 'into', 'the', 'ideals', 'and', 'objectives', 'of', 'the', 'United', 'Nations', '']
通过列表查找单词
[w for w in text2 if len(w) > 3]  # Words in text2 with more than 3 characters
['Ethics', 'built', 'right', 'into', 'ideals', 'objectives', 'United', 'Nations']
查找首字母大写的单词
# Title-cased words in text2: first letter uppercase, remaining letters lowercase
[w for w in text2 if w.istitle()]
['Ethics', 'United', 'Nations']
查找以s结尾的单词
[w for w in text2 if w.endswith('s')]  # Words in text2 that end in 's'
['Ethics', 'ideals', 'objectives', 'Nations']
使用set去重
text3 = 'To be or not to be'
text4 = text3.split(' ')  # Tokenize on single spaces

len(text4)  # Total number of tokens, duplicates included
6
len(set(text4))  # Number of distinct tokens (case-sensitive: 'To' != 'to')
5
set(text4)  # The distinct tokens themselves (case-sensitive)
{'To', 'be', 'not', 'or', 'to'}
# Lower-case every token before de-duplicating so 'To' and 'to' count once
len({w.lower() for w in text4})
4
{w.lower() for w in text4}  # Distinct tokens after lower-casing
{'be', 'not', 'or', 'to'}
处理自由文本
# A tweet-like string. Adjacent literals are concatenated, so the value is
# one line with a single space between the closing quote and '#UNSG'.
text5 = (
    '"Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text6 = text5.split(' ')  # Tokenize on single spaces; punctuation stays attached

text6
['"Ethics', 'are', 'built', 'right', 'into', 'the', 'ideals', 'and', 'objectives', 'of', 'the', 'United', 'Nations"', '#UNSG', '@', 'NY', 'Society', 'for', 'Ethical', 'Culture', 'bit.ly/2guVelr']
# Hashtags: tokens that begin with '#'. For text6 this yields ['#UNSG'].
[w for w in text6 if w.startswith('#')]
# Tokens that begin with '@'. A bare '@' also qualifies, so this simple
# prefix test cannot tell real callouts from a stand-alone '@'.
[w for w in text6 if w.startswith('@')]
['@']
# Same tweet-like string, now prefixed with two callouts (@UN, @UN_Women).
# Adjacent literals are concatenated into a single one-line string.
text7 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text8 = text7.split(' ')  # Tokenize on single spaces
使用复杂的正则表达式
We can use regular expressions to help us with more complex parsing.
For example, '@[A-Za-z0-9_]+' will return all words that:
- start with '@'
- and are followed by at least one:
  - capital letter ('A-Z')
  - lowercase letter ('a-z')
  - number ('0-9')
  - or underscore ('_')
import re  # Standard-library module providing regular-expression support

# Callouts: '@' followed by one or more letters, digits or underscores.
# re.match anchors at the start of each token, so only tokens that *begin*
# with a callout are kept (re.search would also accept e.g. 'name@host').
# The bare '@' token is rejected because '+' requires at least one character.
[w for w in text8 if re.match(r'@[A-Za-z0-9_]+', w)]
['@UN', '@UN_Women']