使用python对文件进行批量处理

代码有点长,已经封装为类了,主要用于对文件进行批量处理
1、批量移动文件,符合某种后缀的
2、批量查找两个文件夹重复的文件
3、批量同步两个文件夹的文件
2和3,我现在用duplicate这个软件,同步的话使用File Synchronizer这个软件,代码的话不怎么用了
4、批量移动和复制文件的时候会遇到一个问题,就是若存在相同文件名的情况,这个时候可以在文件名后加“_1”,“_2”这种方式来解决,比之前用随机时间的方式要好,那种方式产生的文件名太长了
5、批量提取docx文档中的图片(如何批量提取doc中的呢?去excelhome论坛搜索vba转doc为docx的代码,先批量转换为docx即可,代码如下)
 
Sub doc2docx()    ' Convert the selected .doc files to .docx

    ' Lets the user pick one or more files, then saves each ".doc" file next to
    ' the original with a ".docx" extension.
    ' Only the trailing extension is rewritten; the folder part of the path is
    ' left untouched (a plain Replace would also change e.g. "Documents").

    Const DOCX_FORMAT As Long = 16    ' value passed as FileFormat, as in the original macro

    Dim myDialog As FileDialog
    Dim oFile As Object
    Dim oFilePath As Variant
    Dim srcPath As String
    Dim newPath As String

    Set myDialog = Application.FileDialog(msoFileDialogFilePicker)

    With myDialog
        .Filters.Clear    ' remove every existing file filter
        .Filters.Add "所有 WORD2007 文件", "*.doc", 1    ' offer *.doc files only
        .AllowMultiSelect = True    ' allow picking several files
        If .Show = -1 Then    ' user confirmed the dialog
            For Each oFilePath In .SelectedItems    ' loop over every picked file
                srcPath = CStr(oFilePath)
                ' Skip anything that is not a plain .doc (e.g. a file that is already .docx)
                If LCase$(Right$(srcPath, 4)) = ".doc" Then
                    newPath = Left$(srcPath, Len(srcPath) - 4) & ".docx"
                    Set oFile = Documents.Open(srcPath)
                    oFile.SaveAs FileName:=newPath, FileFormat:=DOCX_FORMAT
                    oFile.Close
                End If
            Next
        End If
    End With

End Sub

  

  1 from genericpath import exists
  2 import os
  3 import shutil
  4 from os import path
  5 from pathlib import Path
  6 from hashlib import md5
  7 from PIL import Image
  8 import zipfile
  9 from send2trash import send2trash
 10 
 11 # revised by Stephen Shen @zju
 12 # 2021-4-8 10:14:22
 13 # https://rednafi.github.io/digressions/python/2020/04/13/python-pathlib.html#pathrenametarget
 14 # https://pypi.org/project/Send2Trash/
 15 # revised by Stephen Shen @zju
 16 # 2021年3月15日09:24:10
 17 # zipfile 模块使用说明
 18 # https://www.cnblogs.com/ManyQian/p/9193199.html
 19 #
 20 # shutil.copyfile("oldfile","newfile") #oldfile和newfile都只能是文件
 21 # shutil.copy("oldfile","newfile") #oldfile只能是文件,newfile可以是文件,也可以是目标目录
 22 # #复制文件夹:
 23 # shutil.copytree("olddir","newdir") #olddir和newdir都只能是目录,且newdir必须不存在
 24 # #重命名文件(目录)
 25 # os.rename("oldname","newname") #文件或目录都是使用这条命令
 26 # #移动文件(目录)
 27 # shutil.move("oldpos","newpos")
 28 
 29 
class PathBox():
    """Collection of static helpers for batch file operations on directory trees."""

    def __init__(self):
        # Nothing to initialise: the helpers below are static methods.
        pass
 33 
 34     @staticmethod
 35     def batchExtractPicsFromDocs(srcDir, dstDir, zipDir):
 36         dst_dir = Path(dstDir)
 37         src_dir = Path(srcDir)
 38         zip_dir = Path(zipDir)
 39 
 40         if not dst_dir.exists():
 41             dst_dir.mkdir()
 42         if not src_dir.exists():
 43             src_dir.mkdir()
 44         if not zip_dir.exists():
 45             zip_dir.mkdir()
 46 
 47         for root, dirs, files in os.walk(src_dir):
 48             for f in files:
 49                 src_path = Path(root).joinpath(f)
 50                 if src_path.suffix in ['.docx']:
 51                     # dst_sub_dir = dst_dir.joinpath(src_path.stem)
 52                     # if not dst_sub_dir.exists():
 53                     #     dst_sub_dir.mkdir()
 54 
 55                     zip_path = zip_dir.joinpath(src_path.stem+'.zip')
 56                     if not zip_path.exists():
 57                         PathBox.copyAsZip(src_path, zip_path)
 58                         print('{} is copied as zip file'.format(zip_path))
 59                     else:
 60                         print('{} is existed'.format(str(zip_path)))
 61                     # pics_dir = Path(PathBox.extractPics(
 62                     #     dst_dir, zip_path, zip_dir))
 63 
 64                     # if not pics_dir.exists():
 65                     #     PathBox.batchMoveFilesToOneFolder(
 66                     #         pics_dir, dst_dir, ['.jpeg', '.png'])
 67                 else:
 68                     print('{} is not docx'.format(str(src_path)))
 69 
 70         PathBox.extractZipFile(zip_dir, dst_dir)
 71 
 72     @staticmethod
 73     def copyAsZip(srcpath, dstpath):
 74         shutil.copyfile(srcpath, dstpath)
 75 
 76     @staticmethod
 77     def extractPics(dstDir, zippath, zipDir):
 78         # first clear the zipdir directory
 79         # 将docx文档复制为*.zip格式
 80 
 81         # 解压缩文件
 82         try:
 83             with zipfile.ZipFile(zippath, 'r') as f:
 84                 print('{zippath} is extracted'.format(zippath=zippath))
 85                 f.extractall(zipDir)
 86         except:
 87             print('{zippath} cannot be extracted'.format(zippath=zippath))
 88         else:
 89             picsDir = Path(zipDir).joinpath('word/media')
 90             return picsDir
 91 
 92             # if os.path.exists(picsDir):
 93             #     for pic in os.listdir(picsDir):
 94             #         oldpic=os.path.join(picsDir,pic)
 95             #         newpic=os.path.join(out_dir,pic)
 96             #         try:
 97             #             shutil.move(oldpic,newpic)
 98             #         except:
 99             #             print(inDir+' is skipped')
100 
101             # filelist=os.listdir(zipDir)
102             # for f in filelist:
103             #     filepath = os.path.join(zipDir, f )
104             #     if os.path.isfile(filepath):
105             #         os.remove(filepath)
106             #     elif os.path.isdir(filepath):
107             #         shutil.rmtree(filepath,True)
108 
109     @staticmethod
110     def getFileMd5(file_name):
111         """
112         计算文件的md5
113         :param file_name:
114         :return:
115         """
116         m = md5()  # 创建md5对象
117         with open(file_name, 'rb') as fobj:
118             while True:
119                 data = fobj.read(4096)
120                 if not data:
121                     break
122                 m.update(data)  # 更新md5对象
123 
124         return m.hexdigest()  # 返回md5对象
125 
126     @staticmethod
127     def syncFiles(srcDir, dstDir):
128         src_dir = Path(srcDir)
129         dst_dir = Path(dstDir)
130         for root, dirs, files in os.walk(src_dir):
131             for f in files:
132                 src_path = Path(root).joinpath(f)
133                 rel_path = src_path.relative_to(src_dir)
134                 dst_path = dst_dir.joinpath(rel_path)
135                 if dst_path.exists():
136                     if os.path.getsize(src_path) == os.path.getsize(dst_path):
137                         print('{} is existed'.format(str(src_path)))
138                     else:
139                         PathBox.copyFile(src_path, dst_path)
140                         print('{} is copied'.format(str(src_path)))
141                     pass
142                 else:
143                     if not dst_path.parent.exists():
144                         dst_path.parent.mkdir(parents=True, exist_ok=True)
145                     PathBox.copyFile(src_path, dst_path)
146                     print('{} is copied'.format(str(src_path)))
147                 pass
148 
149     @staticmethod
150     def extractZipFile(srcDir, dstDir):
151         src_dir = srcDir
152         dst_dir = dstDir
153         for root, dirs, files in os.walk(src_dir):
154             for f in files:
155                 file_path = Path(root).joinpath(f)
156                 if file_path.suffix in ['.zip', '.rar']:
157                     try:
158                         with zipfile.ZipFile(str(file_path), 'r') as f:
159                             zip_dir = Path(src_dir).joinpath(file_path.stem)
160                             if not zip_dir.exists():
161                                 zip_dir.mkdir()
162                             print('{} is extracted'.format(str(file_path)))
163                             f.extractall(zip_dir)
164                     except:
165                         print('{} cannot be extracted'.format(str(file_path)))
166 
167     @staticmethod
168     def getImageMd5(img_path):
169         try:
170             hash = md5()
171             img = open(img_path, 'rb')
172             hash.update(img.read())
173             img.close()
174             img_md5 = hash.hexdigest()
175             return img_md5
176         except:
177             return None
178 
179     @staticmethod
180     def batchRenameFileSuffix(srcDir):
181         # 批量修改目录下指定类型的后缀
182         for root, dirs, files in os.walk(srcDir):
183             for f in files:
184                 srcpath = Path(os.path.join(root, f))
185                 if srcpath.suffix in ['.jpeg', '.jpg']:
186                     newfilename = srcpath.stem+'.JPG'
187                     srcpath.rename(srcpath.parent / newfilename)
188                     print('{} is renamed'.format(srcpath))
189 
190     @staticmethod
191     def compareTwoDirsByCount(srcDir, dstDir):
192         rootdirs = os.listdir(srcDir)
193 
194         for rootdir in rootdirs:
195             srcpath = os.path.join(srcDir, rootdir)
196             dstpath = os.path.join(dstDir, rootdir)
197             src_count = 0
198             dst_count = 0
199             for root, dirs, files in os.walk(srcpath):
200                 for f in files:
201                     src_count += 1
202             for root, dirs, files in os.walk(dstpath):
203                 for f in files:
204                     dst_count += 1
205             if src_count == dst_count:
206                 shutil.rmtree(srcpath)
207                 print('{} is removed'.format(srcpath))
208 
209     @staticmethod
210     def batchRenameFileName(srcDir):
211         # 批量修改目录下的文件名
212         index = 1
213         for root, dirs, files in os.walk(srcDir):
214             root_path = Path(root)
215             for f in files:
216                 file_path = root_path.joinpath(f)
217                 new_file_path = file_path
218                 index = 1
219                 while True:
220                     new_file_name = str(index)+file_path.suffix
221                     new_file_path = new_file_path.with_name(new_file_name)
222                     if not new_file_path.exists():
223                         break
224                     else:
225                         index += 1
226                 file_path.rename(new_file_path)
227                 print('{} is renamed'.format(str(file_path)))
228 
229     @staticmethod
230     def tongji(srcDir):
231         for root, dirs, files in os.walk(srcDir):
232             if root == srcDir:
233                 pass
234             else:
235                 count = len(os.listdir(root))
236                 print('{0} have total of {1} files'.format(root, count))
237 
238     @staticmethod
239     def rmEmptyDirs(srcDir):
240         for root, dirs, files in os.walk(srcDir):
241             if root == srcDir:
242                 pass
243             else:
244                 count = len(sorted(Path(root).rglob('**/*.*')))
245                 if count == 0:
246                     try:
247                         Path(root).rmdir()
248                         # shutil.rmtree(root)
249                     except:
250                         pass
251 
252     @staticmethod
253     def batchResizePics(srcDir):
254 
255         dstDir = srcDir+"-resize"
256 
257         size = (800, 600)
258 
259         # print("picture resizing is processing,pleae wait...")
260         for root, dirs, files in os.walk(srcDir):
261             newroot = Path(root.replace(srcDir, dstDir))
262             if not newroot.exists():
263                 newroot.mkdir(parents=True, exist_ok=True)
264                 # os.mkdir(newroot)
265 
266             for file in files:
267                 (filename, extension) = os.path.splitext(file)
268                 if extension in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
269                     newfile = newroot.joinpath(file)
270                     oldfile = Path(root).joinpath(file)
271                     try:
272                         # print('processing')
273                         im = Image.open(oldfile)
274                         if not im.size == size:
275                             im.thumbnail(size)
276                             im.save(newfile, "jpeg")
277                             print('{} is thumbnailed'.format(oldfile))
278                         else:
279                             try:
280                                 PathBox.moveFile(oldfile, newfile)
281                                 # shutil.move(oldfile, newfile)
282                                 # print('{} is moved'.format(oldfile))
283                             except:
284                                 print(Exception)
285                     except IOError:
286                         pass
287         return dstDir
288 
289     @staticmethod
290     def batchMoveNonePicsToOneFolder(srcDir, dstDir='parent'):
291         fexts = ['.jpg', '.png', '.jpeg', '.JPG', '.PNG', '.JPEG']
292         for root, dirs, files in os.walk(srcDir):
293             for f in files:
294                 src_path = Path(root).joinpath(f)
295                 if not src_path.suffix in fexts:
296                     dst_path = Path(srcDir).joinpath(f)
297                     shutil.move(src_path, dst_path)
298                     print('{} is moved'.format(src_path))
299 
300     @staticmethod
301     # 查询文件夹里是否有非IMG开头的文件
302     def excludeFilesByName(srcDir, dstDir='parent'):
303         src_dir = Path(srcDir)
304         if dstDir == 'parent':
305             dst_dir = src_dir
306         else:
307             dst_dir = Path(dstDir)
308         for root, dirs, files in os.walk(src_dir):
309             if dstDir == 'parent' and root == srcDir:
310                 continue
311             else:
312                 for f in files:
313                     if f.startswith("IMG"):
314                         pass
315                     else:
316                         src_path = Path(root).joinpath(f)
317                         dst_path = dst_dir.joinpath(f)
318                         PathBox.moveFile(src_path, dst_path)
319 
320     @staticmethod
321     def batchMoveFilesToOneFolder(srcDir, dstDir='parent', fexts='all'):
322         src_dir = Path(srcDir)
323         if dstDir == 'parent':
324             dstDir = srcDir
325 
326         for root, dirs, files in os.walk(srcDir):
327             if dstDir == 'parent' and root == srcDir:
328                 pass
329             else:
330                 for f in files:
331                     src_path = Path(root).joinpath(f)
332                     if not dstDir:
333                         dst_path = src_dir.joinpath(f)
334                         if fexts == 'all':
335                             PathBox.moveFile(src_path, dst_path)
336                             print('{} is moved'.format(src_path))
337                         else:
338                             if src_path.suffix in fexts:
339                                 PathBox.moveFile(src_path, dst_path)
340                                 print('{} is moved'.format(src_path))
341                     else:
342                         dst_path = Path(dstDir).joinpath(f)
343                         if fexts == 'all':
344                             PathBox.moveFile(src_path, dst_path)
345                             print('{} is moved'.format(src_path))
346                         else:
347                             if src_path.suffix in fexts:
348                                 PathBox.moveFile(src_path, dst_path)
349                                 print('{} is moved'.format(src_path))
350 
351     @staticmethod
352     def moveFile(src_path, dst_path):
353         index = 1
354         new_dst_path = dst_path
355         while True:
356             if new_dst_path.exists():
357                 new_dst_path = dst_path.with_name(
358                     dst_path.stem+'_'+str(index)+dst_path.suffix)
359                 index += 1
360             else:
361                 break
362         shutil.move(src_path, dst_path)
363         print('{} is moved'.format(src_path))
364 
365     @staticmethod
366     def copyFile(src_path, dst_path):
367         index = 1
368         while True:
369             if dst_path.exists():
370                 dst_path = dst_path.with_name(
371                     dst_path.stem+'_'+str(index)+dst_path.suffix)
372                 index += 1
373             else:
374                 break
375         try:
376             shutil.copyfile(src_path, dst_path)
377         except:
378             print(Exception)
379 
380     @staticmethod
381     def compareDirsDeleteTheSameFile(srcDir, dstDir, mode='keep'):
382         # compare two dirs and delete the same file in the srcDir
383         for root, dirs, files in os.walk(srcDir):
384             for f in files:
385                 src_path = Path(os.path.join(root, f))
386                 rel_path = src_path.relative_to(Path(srcDir))
387                 dst_path = Path(dstDir).joinpath(rel_path)
388                 if dst_path.exists():
389                     if mode == 'keep':
390                         pass
391                     if mode == 'delete':
392                         try:
393                             send2trash(str(src_path))
394                             # os.remove(src_path)
395                             print('{} is removed'.format(src_path))
396                         except:
397                             print('{} cannot be removed'.format(src_path))
398                 pass
399 
400     @staticmethod
401     def batchRemoveTheSameFileByMD5(srcKeepDir, srcCompareDirs=[]):
402         zd = {}
403         src_keep_dir = Path(srcKeepDir)
404         for root, dirs, files in os.walk(src_keep_dir):
405             for f in files:
406                 f_path = Path(root).joinpath(f)
407                 img_md5 = PathBox.getFileMd5(f_path)
408         #         img_md5 = PathBox.getImageMd5(f_path)
409                 if img_md5:
410                     if not img_md5 in zd.keys():
411                         zd[img_md5] = f_path
412                     else:
413                         send2trash(str(f_path))
414                         print('{} is removed'.format(f_path))
415 
416         if srcCompareDirs:
417             for folder in srcCompareDirs:
418                 src_compare_dir = Path(folder)
419                 for root, dirs, files in os.walk(src_compare_dir):
420                     for f in files:
421                         f_path = Path(root).joinpath(f)
422                         img_md5 = PathBox.getFileMd5(f_path)
423                         if img_md5:
424                             if not img_md5 in zd.keys():
425                                 zd[img_md5] = f_path
426                             else:
427                                 src_path = f_path
428                                 # dst_path=os.path.join(dstDir,src_path.name)
429                                 send2trash(str(src_path))
430                                 # os.remove(src_path)
431                                 # shutil.move(src_path,dst_path)
432                                 print('{} is removed'.format(src_path))
433 
434 
if __name__ == '__main__':
    # Usage examples. Only the "remove duplicate files" section is active;
    # un-comment another section (and adjust the paths) to run it instead.

    # ----------------extract pics from docx--------------------
    # srcDir = r'D:\Civil\32109012\_工具包\_参考资料\_桥梁检测报告'
    # dstDir = r'D:\Civil\extract'
    # zipDir = r'D:\Civil\zip'
    # PathBox.batchExtractPicsFromDocs(srcDir, dstDir, zipDir)
    # PathBox.batchMoveFilesToOneFolder(zipDir, dstDir, fexts=['.jpg', '.png', '.emf', '.jpeg'])

    # ----------------batch move files via fexts--------------------
    # PathBox.batchMoveFilesToOneFolder(srcDir, fexts=['.doc'])
    # srcDir = r'D:\Civil\32109012\_工具包\_softSmall'
    # PathBox.batchMoveFilesToOneFolder(srcDir)

    # ----------------batch remove duplicate files in dirs--------------------
    # NOTE: this section is live; duplicates found under srcKeepDir are sent
    # to the recycle bin by batchRemoveTheSameFileByMD5.
    srcKeepDir = r'D:\_soft'
    # srcCompareDirs = [r'D:\test']
    PathBox.batchRemoveTheSameFileByMD5(srcKeepDir)
    # PathBox.batchRemoveTheSameFileByMD5(srcKeepDir, srcCompareDirs)

    # ----------------batch resize the images in dirs--------------------
    # srcDir = r'D:\BaiduNetdiskDownload\温州东瓯DAQIAO'
    # # dstDir = r'D:\衢州报告-resize'
    # dstDir = PathBox.batchResizePics(srcDir)
    # PathBox.compareDirsDeleteTheSameFile(srcDir, dstDir, mode='delete')
    # PathBox.rmEmptyDirs(srcDir)

    # ----------------batch sync the files between two dirs----------------
    # srcDir = r'C:\Users\Administrator\Documents\debug\wolf'
    # dstDir = r'C:\Users\Administrator\Documents\debug\_待整理'
    # PathBox.syncFiles(srcDir, dstDir)
    # PathBox.compareDirsDeleteTheSameFile(srcDir, dstDir, mode='delete')
    # PathBox.rmEmptyDirs(srcDir)

    # ----------------batch move files not named IMG* to the top folder----------------
    # srcDir = r'D:\衢州报告'
    # PathBox.excludeFilesByName(srcDir)
View Code

 

posted on 2021-07-22 09:10  风中狂笑  阅读(892)  评论(0编辑  收藏  举报

导航