huffman编解码英文文本[Python]

对英文文本的字母进行huffman编码,heapq优先队列构建huffman树

python huffman.py source.txt result.txt

 

 1 import sys
 2 import heapq
 3 import collections
 4 
 5 class Node(object):
 6     def __init__(self,value = None,count = 1,left = None,right = None, code = ''):
 7         self.value = value
 8         self.count = count
 9         self.left = left
10         self.right = right
11         self.code = code
12 
13     def isleaf(self):
14         if self.left != None:
15             return False
16         elif self.right != None:
17             return False
18         else:
19             return True
20 
21     def __repr__(self):
22         return "Node(%r,%r)"%(self.value,self.count)
23     #for sort or priority queue
24     def __lt__(self,other):
25         return self.count < other.count
26     #for operator +
27     def __add__(self,other):
28         self.code = 0
29         other.code = 1
30         # only leaf node value is need
31         return Node(self.value+other.value,self.count+other.count,self,other)
32 
33 def getTreeRoot(text):
34     Counter = collections.Counter(text)
35     head = [Node(k,v) for (k,v) in Counter.items()]
36     heapq.heapify(head)
37 
38     while len(head) >= 2:
39         heapq.heappush(head, heapq.heappop(head) + heapq.heappop(head))
40 
41     root = head[0]
42     return root
43 
44 def huffman(root, prefix = []):
45     code = {}
46     if root is None:
47         return code
48     prefix = prefix + [root.code]
49     if root.isleaf():
50         code[root.value] = prefix
51     else:
52         code.update(huffman(root.left,prefix))
53         code.update(huffman(root.right,prefix))
54     return code
55 
56 def gethuffmantext(text):
57     root = getTreeRoot(text)
58     codebook = huffman(root)
59     for k,v in codebook.items():
60         newv = "".join(str(char) for char in v)
61         codebook[k] = newv
62     print (codebook)
63     print("The original text size is {}".format(len(text) * 8))
64     huffmantext = []
65     lenhuffman = 0
66     for char in text:
67         lenhuffman += len(codebook[char])
68         huffmantext.append(codebook[char])
69     print ("The huffman code text size is {}".format(lenhuffman))
70 
71     return huffmantext, codebook, lenhuffman
72 
73 def gettextfromhuffmancode(huffmantext,codebook):
74 
75     reversecodebook = {value:key for (key,value) in codebook.items()}
76 
77     text = []
78     for huffmancode in huffmantext:
79         text.append(reversecodebook[huffmancode])
80 
81     return text
82 
83 
84 if __name__ == "__main__":
85     text = open(sys.argv[1],"rb").read()
86     newfile = sys.argv[2]
87     #huffmantext, codebook, lenhuffman = gethuffmantext("hello world")
88     huffmantext, codebook, lenhuffman = gethuffmantext(text)
89     text1 = gettextfromhuffmancode(huffmantext,codebook)
90     text1 = "".join(str(v) for v in text1)
91 
92     lenoftext = len(text)*8.0
93     lenofhuffman = lenhuffman
94 
95     print ("The compression ratio is %lf  \n" % (1.0 * lenhuffman / lenoftext ))
96 
97     fp = open(newfile,"wb")
98     fp.write(text1)
99     fp.close()

 

posted @ 2017-11-23 14:31  demianzhang  阅读(1043)  评论(0编辑  收藏  举报