C# 词典数据结构设计【附demo】
分析
要建立词典,最基本的应该有词典的描述信息、词典索引文件以及词典数据文件。
/// <summary>
/// Index file.
/// </summary>
string idxFile = "dic.idx";
/// <summary>
/// Data file.
/// </summary>
string dictfile = "dic.dict";
/// <summary>
/// Dictionary information file.
/// </summary>
string ifoFile = "dic.ifo";
我们建立对应的三个类
详细的代码如下:
/// <summary>
/// Explanation of a word.
/// </summary>
class DictWord
{
    /// <summary>
    /// The explanation text.
    /// </summary>
    public string Description { get; set; }
}

/// <summary>
/// Dictionary index entry.
/// </summary>
class DictIndex
{
    /// <summary>
    /// The word.
    /// </summary>
    public string Word { get; set; }

    /// <summary>
    /// Offset of the explanation inside the data file.
    /// </summary>
    public int Offset { get; set; }

    /// <summary>
    /// Size of the explanation data.
    /// </summary>
    public int DataSize { get; set; }
}

/// <summary>
/// Dictionary information.
/// </summary>
class DictInfo
{
    /// <summary>
    /// Name of the dictionary.
    /// </summary>
    public string BookName { get; set; }

    /// <summary>
    /// Number of words the dictionary contains.
    /// </summary>
    public int WordCount { get; set; }

    /// <summary>
    /// Current offset.
    /// </summary>
    public int CurrentOffset { get; set; }
}
数据结构说明:
- 描述信息包含词典名字,词典词语数量
- 索引文件存储的是排好顺序词语的索引,每个索引包含词语名称、存在数据文件中的偏移量、以及数据块大小,排序的目的在于查找时直接用二分查找节省查找时间。
- 数据块就简单了,就纯粹的数据
建立词典
建立词典比较简单,首先,定义几个变量来存储词典相关信息:
// Dictionary information (name, word count, current offset).
DictInfo info;
// Index entries keyed by word; SortedList keeps them ordered by key.
SortedList<string, DictIndex> indexs;
// Explanations of the added words.
List<DictWord> words;
ps: SortedList能直接排序,不用我们再手动排序了
然后我们来看添加词语:
/// <summary>
/// Adds a word and its explanation to the dictionary being built.
/// </summary>
/// <param name="word">The word; used as the index key.</param>
/// <param name="description">The explanation text; its UTF-8 byte length becomes the entry's data size.</param>
public void Add(string word, string description)
{
    // Measure the UTF-8 length once, without materialising the encoded bytes.
    int dataSize = Encoding.UTF8.GetByteCount(description);

    // Register the index entry first: SortedList.Add throws for a null or duplicate key,
    // and nothing else may have been modified at that point. Otherwise Save() would still
    // write the orphaned explanation and every later offset would point at the wrong data.
    indexs.Add(word, new DictIndex { Word = word, Offset = info.CurrentOffset, DataSize = dataSize });
    words.Add(new DictWord { Description = description });

    // One more word in the dictionary.
    info.WordCount++;
    // The next explanation starts right after this one.
    info.CurrentOffset += dataSize;
}
非常简单,就是添加索引,同时把词典的数量加1
最后来看怎么存储到文件:
/// <summary>
/// Saves the dictionary: writes the information file, the data file and the index file.
/// </summary>
public void Save()
{
    // Width of the word field in an index record; a record is 128 + 4 + 4 = 136 bytes.
    const int WordFieldSize = 128;

    // Information file: one "Key=Value" line per field.
    StringBuilder infoBuilder = new StringBuilder();
    infoBuilder.AppendLine(string.Format("BookName={0}", info.BookName));
    infoBuilder.AppendLine(string.Format("WordCount={0}", info.WordCount));
    infoBuilder.AppendLine(string.Format("CurrentOffset={0}", info.CurrentOffset));
    File.WriteAllText(ifoFile, infoBuilder.ToString(), Encoding.UTF8);

    // Data file: the UTF-8 explanations written back to back, in the order of the words list.
    using (BinaryWriter dictWriter = new BinaryWriter(File.Open(dictfile, FileMode.Create)))
    {
        foreach (var word in words)
        {
            dictWriter.Write(Encoding.UTF8.GetBytes(word.Description));
        }
    }

    // Index file: fixed-size records in the key order of the sorted list.
    using (BinaryWriter idxWriter = new BinaryWriter(File.Open(idxFile, FileMode.Create)))
    {
        foreach (var index in indexs)
        {
            byte[] wordData = Encoding.UTF8.GetBytes(index.Key);
            int length = Math.Min(WordFieldSize, wordData.Length);

            // When the word has to be truncated, do not cut through a multi-byte UTF-8
            // character: back up while the first dropped byte is a continuation byte (10xxxxxx).
            if (length < wordData.Length)
            {
                while (length > 0 && (wordData[length] & 0xC0) == 0x80)
                {
                    length--;
                }
            }

            // Zero-padded word field.
            byte[] wordField = new byte[WordFieldSize];
            Array.Copy(wordData, wordField, length);

            idxWriter.Write(wordField);
            idxWriter.Write(index.Value.Offset);
            idxWriter.Write(index.Value.DataSize);
        }
    }
}
这里注意下word最多能存128个字节,每个index区的大小为128+4+4 = 136字节
查询词典
前面做这么多准备,不都是为了查询吗?木有查询,神马都是浮云!
前面说到了,索引文件存储的是排序好的词语列表,所以查询就比较简单了
先给出两个辅助方法:
// The index and data files are only read during lookups, so open them read-only
// instead of with the read/write access that new FileStream(path, FileMode.Open) requests.
idxStream = File.OpenRead(idxFile);
idxReader = new BinaryReader(idxStream);
dictStream = File.OpenRead(dictfile);
dictReader = new BinaryReader(dictStream);
(1) 获取指定位置的索引
/// <summary>
/// Reads the index record stored at the given position of the index file.
/// </summary>
/// <param name="wordIndex">Zero-based position of the record.</param>
/// <returns>The index entry read from that record.</returns>
public DictIndex GetWordIndex(int wordIndex)
{
    // Record layout: 128-byte zero-padded word, 4-byte offset, 4-byte data size = 136 bytes.
    const int WordFieldSize = 128;
    const int RecordSize = WordFieldSize + 4 + 4;

    // Multiply as long so that a large record position cannot overflow int.
    idxStream.Seek((long)wordIndex * RecordSize, SeekOrigin.Begin);

    byte[] word = idxReader.ReadBytes(WordFieldSize);
    var dicIndex = new DictIndex();
    // Drop the zero padding of the word field.
    dicIndex.Word = Encoding.UTF8.GetString(word).Replace("\0", "");
    dicIndex.Offset = idxReader.ReadInt32();
    dicIndex.DataSize = idxReader.ReadInt32();
    return dicIndex;
}
(2)获取指定索引对应的词语解释
/// <summary>
/// Reads the explanation that an index entry points to.
/// </summary>
/// <param name="dictIndex">Index entry holding the offset and size of the explanation.</param>
/// <returns>The explanation text.</returns>
public string GetWordDescription(DictIndex dictIndex)
{
    // Seeking to offset 0 is the same as rewinding, so no special case is needed.
    dictStream.Seek(dictIndex.Offset, SeekOrigin.Begin);
    byte[] data = dictReader.ReadBytes(dictIndex.DataSize);
    return Encoding.UTF8.GetString(data).Replace("\0", "");
}
现在开始二分查找:
/// <summary>
/// Looks a word up with a binary search over the index file.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>
/// "[word]" followed by a line break and the explanation, or null when the word is not in
/// the dictionary.
/// </returns>
public string GetDescription(string word)
{
    int low = 0;
    // Last valid record position. Starting at WordCount would let the search read past
    // the end of the index file.
    int high = info.WordCount - 1;

    while (low <= high)
    {
        // Overflow-safe midpoint.
        int mid = low + (high - low) / 2;
        DictIndex entry = GetWordIndex(mid);
        int order = string.Compare(entry.Word, word);

        if (order > 0)
        {
            high = mid - 1;
        }
        else if (order < 0)
        {
            low = mid + 1;
        }
        else
        {
            return "[" + entry.Word + "]\n" + GetWordDescription(entry);
        }
    }

    // Not found: do not return whatever entry happened to be probed last.
    return null;
}
此部分完整代码:
/// <summary>
/// Dictionary backed by three files: an information file, an index file of fixed-size
/// records sorted by word, and a data file holding the explanations.
/// Create it with <c>Dict(string)</c> to build and save a dictionary, or with <c>Dict()</c>
/// to query a saved one.
/// </summary>
class Dict : IDisposable
{
    // Index record layout: 128-byte zero-padded word, 4-byte offset, 4-byte data size.
    private const int WordFieldSize = 128;
    private const int IndexRecordSize = WordFieldSize + 4 + 4;

    private DictInfo info;
    private SortedList<string, DictIndex> indexs;
    private List<DictWord> words;

    /// <summary>
    /// Index file.
    /// </summary>
    private string idxFile = "dic.idx";

    /// <summary>
    /// Data file.
    /// </summary>
    private string dictfile = "dic.dict";

    /// <summary>
    /// Dictionary information file.
    /// </summary>
    private string ifoFile = "dic.ifo";

    private BinaryReader idxReader;
    private FileStream idxStream;
    private BinaryReader dictReader;
    private FileStream dictStream;

    /// <summary>
    /// Opens a saved dictionary for querying.
    /// </summary>
    public Dict()
    {
        LoadDictInfo();
        // The files are only read during lookups, so open them read-only.
        idxStream = File.OpenRead(idxFile);
        idxReader = new BinaryReader(idxStream);
        dictStream = File.OpenRead(dictfile);
        dictReader = new BinaryReader(dictStream);
    }

    /// <summary>
    /// Starts a new, empty dictionary that is filled with Add and written with Save.
    /// </summary>
    /// <param name="name">Name of the dictionary.</param>
    public Dict(string name)
    {
        info = new DictInfo { BookName = name, WordCount = 0, CurrentOffset = 0 };
        indexs = new SortedList<string, DictIndex>();
        words = new List<DictWord>();
    }

    /// <summary>
    /// Closes the index and data files opened by the query constructor.
    /// </summary>
    public void Dispose()
    {
        // Closing a BinaryReader also closes the stream it wraps.
        if (idxReader != null)
        {
            idxReader.Close();
            idxReader = null;
            idxStream = null;
        }

        if (dictReader != null)
        {
            dictReader.Close();
            dictReader = null;
            dictStream = null;
        }
    }

    /// <summary>
    /// Looks a word up with a binary search over the index file.
    /// </summary>
    /// <param name="word">The word to look up.</param>
    /// <returns>
    /// "[word]" followed by a line break and the explanation, or null when the word is not
    /// in the dictionary.
    /// </returns>
    public string GetDescription(string word)
    {
        int low = 0;
        // Last valid record position. Starting at WordCount would let the search read past
        // the end of the index file.
        int high = info.WordCount - 1;

        while (low <= high)
        {
            // Overflow-safe midpoint.
            int mid = low + (high - low) / 2;
            DictIndex entry = GetWordIndex(mid);
            // NOTE(review): this comparison and the SortedList built by Dict(string) both use
            // their defaults; confirm lookups run under the culture the index was built with.
            int order = string.Compare(entry.Word, word);

            if (order > 0)
            {
                high = mid - 1;
            }
            else if (order < 0)
            {
                low = mid + 1;
            }
            else
            {
                return "[" + entry.Word + "]\n" + GetWordDescription(entry);
            }
        }

        // Not found: do not return whatever entry happened to be probed last.
        return null;
    }

    /// <summary>
    /// Reads the index record stored at the given position of the index file.
    /// </summary>
    /// <param name="wordIndex">Zero-based position of the record.</param>
    /// <returns>The index entry read from that record.</returns>
    public DictIndex GetWordIndex(int wordIndex)
    {
        // Multiply as long so that a large record position cannot overflow int.
        idxStream.Seek((long)wordIndex * IndexRecordSize, SeekOrigin.Begin);

        byte[] word = idxReader.ReadBytes(WordFieldSize);
        var dicIndex = new DictIndex();
        // Drop the zero padding of the word field.
        dicIndex.Word = Encoding.UTF8.GetString(word).Replace("\0", "");
        dicIndex.Offset = idxReader.ReadInt32();
        dicIndex.DataSize = idxReader.ReadInt32();
        return dicIndex;
    }

    /// <summary>
    /// Reads the explanation that an index entry points to.
    /// </summary>
    /// <param name="dictIndex">Index entry holding the offset and size of the explanation.</param>
    /// <returns>The explanation text.</returns>
    public string GetWordDescription(DictIndex dictIndex)
    {
        // Seeking to offset 0 is the same as rewinding, so no special case is needed.
        dictStream.Seek(dictIndex.Offset, SeekOrigin.Begin);
        byte[] data = dictReader.ReadBytes(dictIndex.DataSize);
        return Encoding.UTF8.GetString(data).Replace("\0", "");
    }

    /// <summary>
    /// Adds a word and its explanation to the dictionary being built.
    /// </summary>
    /// <param name="word">The word; used as the index key.</param>
    /// <param name="description">The explanation text; its UTF-8 byte length becomes the entry's data size.</param>
    public void Add(string word, string description)
    {
        // Measure the UTF-8 length once, without materialising the encoded bytes.
        int dataSize = Encoding.UTF8.GetByteCount(description);

        // Register the index entry first: SortedList.Add throws for a null or duplicate key,
        // and nothing else may have been modified at that point. Otherwise Save() would still
        // write the orphaned explanation and every later offset would point at the wrong data.
        indexs.Add(word, new DictIndex { Word = word, Offset = info.CurrentOffset, DataSize = dataSize });
        words.Add(new DictWord { Description = description });

        // One more word in the dictionary.
        info.WordCount++;
        // The next explanation starts right after this one.
        info.CurrentOffset += dataSize;
    }

    /// <summary>
    /// Loads the dictionary information from the information file.
    /// </summary>
    private void LoadDictInfo()
    {
        // Expects the three lines written by Save, in this order.
        var infos = File.ReadAllLines(ifoFile);
        info = new DictInfo
        {
            BookName = infos[0].Replace("BookName=", "").Trim(),
            WordCount = int.Parse(infos[1].Replace("WordCount=", "").Trim()),
            CurrentOffset = int.Parse(infos[2].Replace("CurrentOffset=", "").Trim()),
        };
    }

    /// <summary>
    /// Saves the dictionary: writes the information file, the data file and the index file.
    /// </summary>
    public void Save()
    {
        // Information file: one "Key=Value" line per field.
        StringBuilder infoBuilder = new StringBuilder();
        infoBuilder.AppendLine(string.Format("BookName={0}", info.BookName));
        infoBuilder.AppendLine(string.Format("WordCount={0}", info.WordCount));
        infoBuilder.AppendLine(string.Format("CurrentOffset={0}", info.CurrentOffset));
        File.WriteAllText(ifoFile, infoBuilder.ToString(), Encoding.UTF8);

        // Data file: the UTF-8 explanations written back to back, in the order they were
        // added, which is the order in which Add assigned their offsets.
        using (BinaryWriter dictWriter = new BinaryWriter(File.Open(dictfile, FileMode.Create)))
        {
            foreach (var word in words)
            {
                dictWriter.Write(Encoding.UTF8.GetBytes(word.Description));
            }
        }

        // Index file: fixed-size records in the key order of the sorted list, which is what
        // the binary search in GetDescription relies on.
        using (BinaryWriter idxWriter = new BinaryWriter(File.Open(idxFile, FileMode.Create)))
        {
            foreach (var index in indexs)
            {
                byte[] wordData = Encoding.UTF8.GetBytes(index.Key);
                int length = Math.Min(WordFieldSize, wordData.Length);

                // When the word has to be truncated, do not cut through a multi-byte UTF-8
                // character: back up while the first dropped byte is a continuation byte (10xxxxxx).
                if (length < wordData.Length)
                {
                    while (length > 0 && (wordData[length] & 0xC0) == 0x80)
                    {
                        length--;
                    }
                }

                // Zero-padded word field.
                byte[] wordField = new byte[WordFieldSize];
                Array.Copy(wordData, wordField, length);

                idxWriter.Write(wordField);
                idxWriter.Write(index.Value.Offset);
                idxWriter.Write(index.Value.DataSize);
            }
        }
    }
}
演示
如图所示
文件夹中放置了许多文本文件,内容为词语的解释
首先、建立词典:
// Build the dictionary: every file in the folder becomes one entry, with the file name
// (minus its suffix) as the word and the file content as the explanation.
Dict dic = new Dict("病症词典");
var sourceFolder = new DirectoryInfo(@"G:\Users\Administrator\Desktop\新建文件夹 (3)\新建文件夹 (3)");
foreach (FileInfo entryFile in sourceFolder.GetFiles())
{
    Console.WriteLine(entryFile.FullName);

    string word = entryFile.Name.Replace("的症状.txt", "");
    string description = File.ReadAllText(entryFile.FullName);
    dic.Add(word, description);
}

// Write the information, data and index files.
dic.Save();
然后、把玩一番:
// Interactive lookup: read a word, print its explanation and the time the lookup took.
var dict = new Dict();
while (true)
{
    Console.Write("请输入词语:");
    var w = Console.ReadLine();
    if (w == null)
    {
        // Console.ReadLine returns null once the input is exhausted; without this check
        // the loop would spin forever.
        break;
    }

    Stopwatch sw = Stopwatch.StartNew();
    Console.WriteLine("找到词语:");
    Console.WriteLine(dict.GetDescription(w));
    sw.Stop();
    Console.WriteLine("耗时:" + sw.ElapsedMilliseconds + "ms");
}
运行结果:
到此为止,谢谢收看!