基于哈夫曼编码完成的文件压缩及解压
这几天在较为认真的研究基于哈夫曼编码的文件压缩及解压,费了点时间,在这分享一下:
这里用链式结构,非顺序表结构;
文件压缩:
1.获取文件信息(这里采用TXT格式文本);
2.压缩文件;
3.写配置文件(便于解压时用,无非就是存放原文件的索引之类的,比如说,文件中某个字符出现的个数,记录下来)
4.解压缩,使用压缩后的文件和配置文件解压文件;
5.用比对软件,比对解压后的文件和源文件是否相同;
下面慢慢解析:
先看一个文件信息类:
typedef long long LongType;
struct FileInfo
{
unsigned char _ch; //字符
LongType _count; //字符出现次数
string _code; //字符对应的哈夫曼编码
FileInfo(unsigned char ch = 0)
:_ch(ch)
,_count(0)
{}
FileInfo operator+(const FileInfo& x)
{
FileInfo tmp;
tmp._count = this->_count + x._count;
return tmp;
}
bool operator !=(const FileInfo& x) const
{
return this->_count != x._count;
}
};
bool operator<(const FileInfo info1,const FileInfo info2)
{
return info1._count < info2._count;
}
此为一个文件信息的类结构,包含字符,字符对应出现的次数,以及这个字符对应的哈夫曼编码(能看到这篇博客的星弟,对哈夫曼编码不会陌生,这里不再强调)
除了统计字符出现的次数及哈夫曼编码,还完成了几个运算符的重载
要获取哈夫曼编码,就得建立哈夫曼树,建立哈夫曼树用最小堆取操作,以下是最小堆建立过程
// 小堆
template<class T>
struct Less
{
bool operator() (const T& l, const T& r)
{
return l < r; // operator<
}
};
template<class T>
struct Greater
{
bool operator() (const T& l, const T& r)
{
return l > r; // operator<
}
};
template<class T, class Compare = Less<T>>
class Heap
{
public:
Heap()
{}
Heap(const T* a, size_t size)
{
for (size_t i = 0; i < size; ++i)
{
_arrays.push_back(a[i]);
}
// 建堆
for(int i = (_arrays.size()-2)/2; i >= 0; --i)
{
AdjustDown(i);
}
}
void Push(const T& x)
{
_arrays.push_back(x);
AdjustUp(_arrays.size()-1);
}
void Pop()
{
assert(_arrays.size() > 0);
swap(_arrays[0], _arrays[_arrays.size() - 1]);
_arrays.pop_back();
AdjustDown(0);
}
T& Top()
{
assert(_arrays.size() > 0);
return _arrays[0];
}
bool Empty()
{
return _arrays.empty();
}
int Size()
{
return _arrays.size();
}
void AdjustDown(int root)
{
int child = root*2 + 1;
//
Compare com;
while (child < _arrays.size())
{
// 比较出左右孩子中小的那个
if (child+1<_arrays.size() &&
*_arrays[child+1] < _arrays[child])
//if(child+1<_arrays.size() &&
// com(_arrays[child+1],_arrays[child]))
{
++child;
}
if(*_arrays[child] < _arrays[root])
//if(com(_arrays[child],_arrays[root]))
{
swap(_arrays[child], _arrays[root]);
root = child;
child = 2*root+1;
}
else
{
break;
}
}
}
void AdjustUp(int child)
{
int parent = (child-1)/2;
//while (parent >= 0)
while (child > 0)
{
if (*_arrays[child] < _arrays[parent])
{
swap(_arrays[parent], _arrays[child]);
child = parent;
parent = (child-1)/2;
}
else
{
break;
}
}
}
public:
vector<T> _arrays;
};
最小堆里也完成了很多接口,包括push pop等
然后就是几个压缩和解压的函数接口
1.根据哈夫曼树获取哈夫曼变慢:
void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
{
if (root == nullptr)
{
return;
}
_GenerateHuffmanCode(root->_left);
_GenerateHuffmanCode(root->_right);
//当前节点为叶子节点为空 才生成哈夫曼编码
if (root->_left == nullptr && root->_right == nullptr)
{
HuffmanTreeNode<FileInfo>* cur = root;
HuffmanTreeNode<FileInfo>* parent = cur->_parent;
string& code = _infos[cur->_weight._ch]._code;
while (parent)
{
if (parent->_left == cur)
{
code += '1';
}
else if (parent->_right == cur)
{
code += '0';
}
cur = parent;
parent = cur->_parent;
}
reverse(code.begin(), code.end());
}
}
void CreateTree(T *a, size_t size, const T& invalid)
{
assert(a);
Heap<HuffmanTreeNode<T>*> s1; //草 终于发现问题 在这里 (堆里放的是指针,类型一定要对)
//找两个最小的元素
for (size_t i = 0; i < size; ++i)
{
if (a[i] != invalid)
{
HuffmanTreeNode<T>* node = new HuffmanTreeNode<T>(a[i]);
s1.Push(node);
}
}
while (s1.Size() > 1)
{
HuffmanTreeNode<T>* left = s1.Top();
s1.Pop();
HuffmanTreeNode<T>* right = s1.Top();
s1.Pop();
HuffmanTreeNode<T>* parent = new HuffmanTreeNode<T>(left->_weight + right->_weight);
parent->_left = left;
parent->_right = right;
left->_parent = parent;
right->_parent = parent;
s1.Push(parent);
}
_root = s1.Top();
s1.Pop();
}
bool _ReadLine(FILE *fOutLogFile, string& line)
{
char ch = fgetc(fOutLogFile);
if (feof(fOutLogFile))
return false;
else
{
if (ch == '\n')
{
line += ch;
ch = fgetc(fOutLogFile);
}
while (ch != '\n')
{
line += ch;
ch = fgetc(fOutLogFile);
}
return true;
}
}
4.文件压缩
//文件压缩
bool Compress(const char* filename)
{
//1.打开一个文件,统计文件字符出现的次数
//2.生成对应的哈弗曼编码
//3.压缩文件
//4.写配置文件,方便解压缩
assert(filename);
FILE *fOut = fopen(filename, "rb");
assert(fOut);
//统计文件字符出现的次数
unsigned char ch = fgetc(fOut);
while (!feof(fOut)) //文件结束
{
_infos[ch]._count++;
ch = fgetc(fOut);
}
HuffmanTree<FileInfo> ht;
FileInfo invalid;
ht.CreateTree(_infos, 256, invalid);
//哈夫曼编码
_GenerateHuffmanCode(ht.GetRoot());
string compressFile = filename;
compressFile += ".huf";
//压缩后的文件名 后缀为《输入文件名+.huf》
FILE *finCompress = fopen(compressFile.c_str(), "wb"); //获取string中的C字符串
assert(finCompress);
fseek(fOut, 0, SEEK_SET);//将文件指针移到开头
char cha = fgetc(fOut);
unsigned char inch = 0;
int index = 0; //一个字节的八位
while (!feof(fOut))
{
string& code = _infos[(unsigned char)cha]._code;
for (size_t i = 0; i < code.size(); ++i)
{
inch <<= 1; //低位向高位进
if (code[i] == '1')
{
inch |= 1;
}
if (++index == 8)
{
fputc(inch, finCompress); //够8位,装进文件
index = 0; //重新一轮开始
inch = 0;
}
}
cha = fgetc(fOut);
}
fclose(fOut);
//如果index = 0 说明 上边8位刚好存满 不等 下一个自己又出来了
if (index != 0) //处理最后一个字符不够的问题
{
inch <<= (8 - index); //最高位必须装上 后边的浪费掉
fputc(inch, finCompress);
}
fclose(finCompress);
}
5.写配置文件:
string logFile = filename;
logFile += ".log";
FILE *Log = fopen(logFile.c_str(), "wb");
assert(Log);
string chInfo;
char str[128] = {0}; //没空间 不可以
for (size_t i = 1; i < 256; ++i)
{
if (_infos[i]._count > 0)
{
chInfo += _infos[i]._ch;
chInfo += ',';
chInfo += _itoa(_infos[i]._count,str,10);
chInfo += '\n';
fputs(chInfo.c_str(), Log);
chInfo.clear();
}
}
fclose(Log);
6.最后的文件解压:
//重构文件
void _RestoreFiles(HuffmanTreeNode<FileInfo> *root, const char* Fileneme,long long size)
{
assert(root);
//原压缩文件
string name = Fileneme;
name += ".huf";
FILE* Out = fopen(name.c_str(),"rb");
assert(Out);
string restorefilename = Fileneme;
restorefilename += ".over";
FILE *over = fopen(restorefilename.c_str(),"wb");
assert(over);
int pos = 8;
long long poss = size;
unsigned char chz = fgetc(Out);
while (poss>0)
{
HuffmanTreeNode<FileInfo>* cur = nullptr;
cur = root;
while (cur->_left != nullptr || cur->_right != nullptr)
{
pos--;
unsigned char temp = chz >> pos;
int ch = 1 & temp;
if (ch == 0)
{
cur = cur->_right;
}
else if (ch == 1)
{
cur = cur->_left;
}
if (pos == 0)
{
chz = fgetc(Out);
pos = 8;
}
}
fputc(cur->_weight._ch, over);
poss--;
}
fclose(Out);
fclose(over);
}
void UnCompress(const char* Fileneme)//解压缩
{
//1.打开日志文件
//2.根据信息还原哈夫曼树
//3.还原信息;
string UnCompressneme = Fileneme;
UnCompressneme += ".log";
FILE *fOutLogFile = fopen(UnCompressneme.c_str(), "rb");
assert(fOutLogFile);
string line;
while (_ReadLine(fOutLogFile, line))
{
unsigned char ch = line[0];
_infos[ch]._count = atoi(line.substr(2).c_str());
line.clear();
}
HuffmanTree<FileInfo> f;
FileInfo invalid;
f.CreateTree(_infos, 256, invalid);
//根据重建的哈夫曼树 还原文件;
long long size = f.GetRoot()->_weight._count;
_RestoreFiles(f.GetRoot(), Fileneme,size);
}
到此,此项目基本完成;如遇问题,希望留言,随时解答,如有见解,跪求赐教!