代码改变世界

正则表达式-汉字的匹配方法

2013-07-27 23:26  江湖么名  阅读(2921)  评论(0)  编辑  收藏  举报

unicode :   ([\u4e00-\u9fa5]+)

unicode :  ([\u2E80-\u9FFF]+)  （范围更广：除汉字外，还包含部首、中日韩标点符号、假名等字符）

utf-8  :  ([\x80-\xff]+)  （针对 UTF-8 编码的字节串按字节匹配；会匹配所有非 ASCII 字节，并不仅限于汉字）

 1 #encoding:utf-8
 2 import re 
 3 
 4 
 5 
 6 def main():
 7     
 8     # ([\u4e00-\u9fa5]+)
 9     TEST_STR_1 = u'ab123kk123'
10     pattern_str = u'[0-9]+([\u4e00-\u9fa5]+)[0-9]+'
11     pattern = re.compile (pattern_str)
12     m = pattern.search(TEST_STR_1)
13     print m.group() if m is not None else None
14     print m.group(1) if m is not None else None
15     print '\n'
16     
17     TEST_STR_2 = u'ab123汉字123'
18     m = pattern.search(TEST_STR_2)
19     print m.group() if m is not None else None
20     print m.group(1) if m is not None else None
21     print '\n'
22     
23     # ([\x80-\xff]+)
24     TEST_STR_3 = 'ab123汉字123'
25     pattern_str = '[0-9]+([\x80-\xff]+)[0-9]+'
26     pattern = re.compile (pattern_str)
27     m = pattern.search(TEST_STR_3)
28     print m.group().decode('utf-8') if m is not None else None
29     print m.group(1).decode('utf-8') if m is not None else None
30     print '\n'    
31     
32     # ([\u2E80-\u9FFF]+)
33     TEST_STR_2 = u'ab123汉字123'
34     pattern_str = u'[0-9]+([\u2E80-\u9FFF]+)[0-9]+'
35     pattern = re.compile (pattern_str)
36     m = pattern.search(TEST_STR_2)
37     print m.group() if m is not None else None
38     print m.group(1) if m is not None else None
39     print '\n'
40     
41     
42     
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()