#!/usr/bin/python
# -*- coding: cp936 -*-
"""Keep only the Chinese characters of each line of a GBK file and de-duplicate.

Usage: <script> INPUT PURE ASSIST

    INPUT   original GBK-encoded text file (read)
    PURE    de-duplicated output: each distinct extracted line once,
            in the order it was first seen (written, GBK)
    ASSIST  full output: the extracted line for every input line,
            duplicates included (written, GBK)

An input line that contains no matching character yields an empty line in
ASSIST and contributes a single empty line to PURE.
"""
import io
import re
import sys

# Matches one character in the range U+4E00..U+9FA5.
# Deliberately not a raw string: the \u escapes are resolved by the string
# literal itself, which works the same on Python 2 and Python 3.
HANZI_PATTERN = re.compile(u'[\u4e00-\u9fa5]')


def extract_hanzi(text):
    """Return the characters of *text* in U+4E00..U+9FA5, in original order.

    Everything else (ASCII, punctuation, quotes, parentheses, whitespace,
    line terminators) is dropped, so no further stripping or filtering of
    the result is needed.
    """
    return u''.join(HANZI_PATTERN.findall(text))


def dedup_hanzi_lines(input_path, pure_path, assist_path, encoding='gbk'):
    """Extract the Chinese text of every input line and write both outputs.

    Args:
        input_path: path of the text file to read.
        pure_path: path that receives each distinct extracted line once.
        assist_path: path that receives the extracted line of every input
            line, in input order, duplicates included.
        encoding: codec used for all three files (default ``'gbk'``).

    Raises:
        IOError/OSError: if a file cannot be opened.
        UnicodeDecodeError: if the input is not valid in *encoding*.
    """
    seen = set()
    unique_lines = []
    # All three files are opened up front (both outputs are truncated even
    # for an empty input) and closed even if decoding fails part-way.
    with io.open(input_path, 'r', encoding=encoding) as fid_input, \
            io.open(pure_path, 'w', encoding=encoding) as fid_pure, \
            io.open(assist_path, 'w', encoding=encoding) as fid_assist:
        for line in fid_input:
            newline = extract_hanzi(line)
            fid_assist.write(newline + u'\n')
            if newline not in seen:
                seen.add(newline)
                unique_lines.append(newline)
        for unique_line in unique_lines:
            fid_pure.write(unique_line + u'\n')


def main(argv):
    """Run the tool with *argv* (``sys.argv`` layout); return an exit status."""
    if len(argv) < 4:
        sys.stderr.write('usage: %s INPUT PURE ASSIST\n' % argv[0])
        return 2
    dedup_hanzi_lines(argv[1], argv[2], argv[3])
    # The trailing '\n' inside the message is intentional: the message is
    # followed by an empty line on stdout.
    print('procedure %s finish!\n' % str(argv[0]))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))