Python 小工具之大文件去重

python针对大文件去重:

import sys
import hashlib
def gen_md5(data):
    """
        生成md5
    :param data: 字符串数据
    :return:
    """
    md5 = hashlib.md5()
    md5.update(data.encode('utf-8'))
    return md5.hexdigest()
    
def big_file_remove_same(input_file, output_file):
    """
        针对大文件文件去重(将文件文件写在一行的,没有办法去重)
    :param input_file:
    :param output_file:
    :return:
    """
    finger_print_set = set()
    with open(input_file, 'r', encoding='utf-8') as f, open(output_file, 'w', encoding='utf-8') as ff,open('dumple.txt', 'w', encoding='utf-8') as fff:
        for line in f:
            line_string = line.strip()
            finger_print = gen_md5(line_string)
            if finger_print not in finger_print_set:
                finger_print_set.add(finger_print)
                ff.write(line)
            else:
                fff.write(line)
if __name__ == "__main__":
    big_file_remove_same(sys.argv[1],sys.argv[2])

 

posted @ 2020-09-25 17:31  不能说的秘密  阅读(650)  评论(0编辑  收藏  举报