Python 修改文件编码方式

 1 import chardet
 2 import os
 3 
 4 def strJudgeCode(str):
 5     return chardet.detect(str)
 6 
 7 def readFile(path):
 8     try:
 9         f = open(path, 'r')
10         filecontent = f.read()
11     finally:
12         if f:
13             f.close()
14 
15     return filecontent
16 
17 def WriteFile(str, path):
18     try:
19         f = open(path, 'w')
20         f.write(str)
21     finally:
22         if f:
23             f.close()
24 
def converCode(path):
    """Re-encode the file at ``path`` from UTF-8 to GBK, in place.

    The file is read and its encoding is guessed with ``strJudgeCode``.
    Only when the reported encoding is exactly ``'utf-8'`` is the content
    decoded, re-encoded as GBK and written back over the same path; any
    other detection result leaves the file untouched.

    Args:
        path: File to inspect and, if needed, rewrite.
    """
    raw = readFile(path)
    detected = strJudgeCode(raw)
    if detected['encoding'] != 'utf-8':
        return
    text = raw.decode('utf-8')
    WriteFile(text.encode('gbk'), path)
34 
def listDirFile(dir):
    """Recursively convert every file below directory ``dir``.

    Each name returned by ``os.listdir`` is joined onto ``dir``.
    Sub-directories are descended into; every other entry has its name
    printed and its full path passed to ``converCode``.

    Args:
        dir: Directory to walk.  The name shadows the builtin ``dir``; it
            is kept because it is part of the function's signature.
    """
    # Named ``entries`` rather than ``list`` so the builtin is not shadowed.
    entries = os.listdir(dir)
    for name in entries:
        filepath = os.path.join(dir, name)
        if os.path.isdir(filepath):
            listDirFile(filepath)
        else:
            print(name)
            converCode(filepath)
44 
if __name__ == '__main__':
    # The backslash is escaped explicitly.  '\T' is not a recognised escape
    # sequence, so the unescaped form only worked by accident and triggers
    # an invalid-escape warning on newer Python 3 releases.  The resulting
    # path value is unchanged.
    listDirFile(u'.\\TRMD')

详细解释：







 1 import chardet
 2 import os
 3 
 4 def strJudgeCode(str):
 5     return chardet.detect(str)
 6     '''
 7 chardet.detect()返回字典，其中confidence是检测精确度，encoding是编码形式
 8 {'confidence': 0.98999999999999999, 'encoding': 'GB2312'}
 9 （1）网页编码判断：
10 
11 >>> import urllib
12 >>> rawdata = urllib.urlopen('http://www.google.cn/').read()
13 >>> import chardet
14 >>> chardet.detect(rawdata)
15 {'confidence': 0.98999999999999999, 'encoding': 'GB2312'}
16 （2）文件编码判断
17 
18 复制代码
19 import chardet
20 tt=open('c:\\111.txt','rb')
21 ff=tt.readline()
22 #这里试着换成read(5)也可以，但是换成readlines()后报错
23 enc=chardet.detect(ff)
24 print enc['encoding']
25 tt.close()
26     '''
27 
def readFile(path):
    """Return the complete contents of ``path`` as raw bytes.

    The file is opened in binary mode so the data reaches the caller
    untouched: no newline translation and no implicit decoding.  Encoding
    detection and a later ``.decode('utf-8')`` both need undecoded bytes.

    Args:
        path: Location of the file to read.

    Returns:
        The file's bytes (a plain ``str`` on Python 2).

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    # ``with`` closes the handle on every path.  An open() failure now
    # surfaces as itself rather than as an UnboundLocalError raised by a
    # ``finally`` clause that inspected a never-assigned handle.
    with open(path, 'rb') as f:
        return f.read()


def WriteFile(str, path):
    """Write ``str`` to ``path``, replacing any existing content.

    Args:
        str: Data to store.  Byte strings are written verbatim in binary
            mode; any other value is written through a text-mode handle.
            The name shadows the builtin ``str``; it is kept because it
            is part of the function's signature.
        path: Destination file.

    Raises:
        IOError / OSError: If the file cannot be opened or written.
    """
    # Already-encoded data must go out in binary mode: a text-mode handle
    # rewrites line endings on Windows, and on Python 3 it rejects bytes.
    mode = 'wb' if isinstance(str, bytes) else 'w'
    with open(path, mode) as f:
        f.write(str)
45 
def converCode(path):
    """Re-encode the file at ``path`` from UTF-8 to GBK, in place.

    The file is read and its encoding is guessed with ``strJudgeCode``.
    Only when the reported encoding is exactly ``'utf-8'`` is the content
    decoded, re-encoded as GBK and written back over the same path; any
    other detection result leaves the file untouched.

    Args:
        path: File to inspect and, if needed, rewrite.
    """
    file_con = readFile(path)
    result = strJudgeCode(file_con)
    if result['encoding'] == 'utf-8':
        # decode() and encode() convert between text and encoded bytes:
        #     u = '中文'                  # a string object u
        #     str = u.encode('gb2312')    # encode u as GB2312 -> bytes object str
        #     u1 = str.decode('gb2312')   # decode str as GB2312 -> string object u1
        #     u2 = str.decode('utf-8')    # decoding str as UTF-8 instead cannot
        #                                 # restore the original string content
        # These notes are kept as comments: a bare string literal placed
        # here at a shallower indent would split the ``if`` body and make
        # the function a syntax error.
        a_unicode = file_con.decode('utf-8')
        gbk_bytes = a_unicode.encode('gbk')
        WriteFile(gbk_bytes, path)
62 
def listDirFile(dir):
    """Recursively convert every file below directory ``dir``.

    Each name returned by ``os.listdir`` is joined onto ``dir``.
    Sub-directories are descended into; every other entry has its name
    printed and its full path passed to ``converCode``.

    Args:
        dir: Directory to walk.  The name shadows the builtin ``dir``; it
            is kept because it is part of the function's signature.
    """
    # os.listdir() returns the names of the files and folders directly
    # under the given path.  The result is named ``entries`` rather than
    # ``list`` so the builtin is not shadowed.
    entries = os.listdir(dir)
    for name in entries:
        # os.path.join() is used to build paths.  For example,
        #     os.path.join("home", "me", "mywork")
        # returns "home/me/mywork" on Linux and "home\me\mywork" on
        # Windows.  The benefit is that the correct path separator, "/"
        # or "\", is chosen automatically for the running system.
        filepath = os.path.join(dir, name)
        # os.path.isdir() tests whether a path is a directory.
        if os.path.isdir(filepath):
            listDirFile(filepath)
        else:
            print(name)
            converCode(filepath)
81 
if __name__ == '__main__':
    # u'string' marks the literal as a unicode string.  Putting ``u`` in
    # front of Chinese text tells Python that what follows is unicode and
    # should be stored in unicode form.
    # A ``# -*- coding: UTF-8 -*-`` line tells Python that the text of the
    # program is UTF-8 encoded, so that the source is read as UTF-8.
    #
    # The backslash is escaped explicitly.  '\T' is not a recognised escape
    # sequence, so the unescaped form only worked by accident and triggers
    # an invalid-escape warning on newer Python 3 releases.  The resulting
    # path value is unchanged.
    listDirFile(u'.\\TRMD')

posted on 2017-08-22 09:52 懵懂的菜鸟阅读(16371) 评论(0) 编辑收藏举报

刷新页面返回顶部

懵懂的菜鸟

导航

公告

python 修改文件编码方式