C# 词典数据结构设计【附demo】
分析#
要建立词典,最基本的应该有词典的描述信息、词典索引文件以及词典数据文件。
/// <summary>
/// Index file
/// </summary>
string idxFile = "dic.idx";
/// <summary>
/// Data file
/// </summary>
string dictfile = "dic.dict";
/// <summary>
/// Dictionary info file
/// </summary>
string ifoFile = "dic.ifo";
我们建立对应的三个类,
详细的代码如下:
/// <summary>
/// The explanation stored for one word.
/// </summary>
class DictWord
{
    /// <summary>
    /// Explanation text of the word.
    /// </summary>
    public string Description { get; set; }
}

/// <summary>
/// One index entry: where the explanation of a word lives in the data file.
/// </summary>
class DictIndex
{
    /// <summary>
    /// The word (headword) itself.
    /// </summary>
    public string Word { get; set; }

    /// <summary>
    /// Offset of the explanation inside the data file.
    /// </summary>
    public int Offset { get; set; }

    /// <summary>
    /// Size of the explanation data.
    /// </summary>
    public int DataSize { get; set; }
}

/// <summary>
/// Descriptive information about a dictionary.
/// </summary>
class DictInfo
{
    /// <summary>
    /// Name of the dictionary.
    /// </summary>
    public string BookName { get; set; }

    /// <summary>
    /// Number of words contained in the dictionary.
    /// </summary>
    public int WordCount { get; set; }

    /// <summary>
    /// Current offset, i.e. where the next explanation would start in the data file.
    /// </summary>
    public int CurrentOffset { get; set; }
}
数据结构说明:
- 描述信息包含词典名字、收录词语数量以及当前数据偏移量
- 索引文件存储的是按词语排好序的索引,每个索引项包含词语、在数据文件中的偏移量以及数据块大小;排序的目的是查找时可以直接使用二分查找,节省查找时间。
- 数据块就简单了,就纯粹的数据
建立词典#
建立词典比较简单,首先,定义几个变量来存储词典相关信息:
// Dictionary metadata: name, word count and running data offset.
DictInfo info;
// Index entries keyed by word; SortedList keeps them ordered by key.
SortedList<string, DictIndex> indexs;
// Explanations of the words, appended in the order the words are added.
List<DictWord> words;
ps: SortedList能直接排序,不用我们再手动排序了
然后我们来看添加词语:
/// <summary>
/// Adds a word and its explanation to the dictionary being built.
/// </summary>
/// <param name="word">The headword; it becomes the key of the sorted index.</param>
/// <param name="description">The explanation text that will be written to the data file.</param>
/// <exception cref="ArgumentNullException"><paramref name="word"/> or <paramref name="description"/> is null.</exception>
/// <exception cref="ArgumentException">An entry with the same word has already been added.</exception>
public void Add(string word, string description)
{
    // Measure the UTF-8 payload once. This also rejects a null description
    // before any state has been modified.
    int dataSize = Encoding.UTF8.GetByteCount(description);

    // Register the index entry first: SortedList.Add throws on a null or duplicate key,
    // and in that case nothing else may have changed. Adding the description first would
    // leave an orphan entry in "words" that Save() still writes to the data file,
    // shifting every offset recorded afterwards.
    indexs.Add(word, new DictIndex { Word = word, Offset = info.CurrentOffset, DataSize = dataSize });
    words.Add(new DictWord { Description = description });

    // One more word ...
    info.WordCount++;
    // ... and the next explanation starts right after this one.
    info.CurrentOffset += dataSize;
}
非常简单,就是添加数据和索引,同时把词典的词数加1,并把当前偏移量后移本条数据的字节数
最后来看怎么存储到文件:
/// <summary>
/// Writes the dictionary to disk: the info file, the data file and the index file.
/// </summary>
/// <remarks>
/// Index records have a fixed size of 128 + 4 + 4 = 136 bytes: a zero-padded UTF-8 word field
/// of 128 bytes, followed by the Int32 offset and the Int32 data size.
/// </remarks>
public void Save()
{
    // Maximum number of bytes stored for a word in an index record.
    const int WordFieldSize = 128;

    // Info file: three "key=value" lines. Numbers are written culture-independently
    // so the file reads back the same on any machine.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    StringBuilder infoText = new StringBuilder();
    infoText.AppendLine(string.Format("BookName={0}", info.BookName));
    infoText.AppendLine(string.Format(invariant, "WordCount={0}", info.WordCount));
    infoText.AppendLine(string.Format(invariant, "CurrentOffset={0}", info.CurrentOffset));
    File.WriteAllText(ifoFile, infoText.ToString(), Encoding.UTF8);

    // Data file: the raw UTF-8 explanations back to back, in the order they were added.
    using (BinaryWriter dataWriter = new BinaryWriter(File.Open(dictfile, FileMode.Create)))
    {
        foreach (var entry in words)
        {
            dataWriter.Write(Encoding.UTF8.GetBytes(entry.Description));
        }
    }

    // Index file: one fixed-size record per word, in the sorted order of the index.
    using (BinaryWriter idxWriter = new BinaryWriter(File.Open(idxFile, FileMode.Create)))
    {
        foreach (var index in indexs)
        {
            byte[] wordData = Encoding.UTF8.GetBytes(index.Key);
            int length = Math.Min(WordFieldSize, wordData.Length);

            // When the word has to be truncated, do not cut a multi-byte UTF-8 sequence in half:
            // back up while the first dropped byte is a continuation byte (10xxxxxx).
            while (length > 0 && length < wordData.Length && (wordData[length] & 0xC0) == 0x80)
            {
                length--;
            }

            // The rest of the field stays zero, which is the padding the reader strips.
            byte[] wordField = new byte[WordFieldSize];
            Array.Copy(wordData, wordField, length);

            idxWriter.Write(wordField);
            idxWriter.Write(index.Value.Offset);
            idxWriter.Write(index.Value.DataSize);
        }
    }
}
这里注意下:word 最多能存 128 个字节,每个 index 区的大小为 128+4+4 = 136 字节
查询词典#
前面做这么多准备,不都是为了查询吗?木有查询,神马都是浮云!
此部分完整代码:
/// <summary>
/// A file-backed dictionary. An instance is either in build mode (created with a name,
/// filled through <see cref="Add"/> and written with <see cref="Save"/>) or in query mode
/// (created with the parameterless constructor, which opens the saved files for lookups).
/// </summary>
class Dict : IDisposable
{
    /// <summary>
    /// Size in bytes of the zero-padded word field of an index record.
    /// </summary>
    const int WordFieldSize = 128;

    /// <summary>
    /// Size in bytes of one index record: word field + Int32 offset + Int32 data size = 136.
    /// </summary>
    const int IndexRecordSize = WordFieldSize + 4 + 4;

    DictInfo info;
    SortedList<string, DictIndex> indexs;
    List<DictWord> words;

    /// <summary>
    /// Index file
    /// </summary>
    string idxFile = "dic.idx";

    /// <summary>
    /// Data file
    /// </summary>
    string dictfile = "dic.dict";

    /// <summary>
    /// Dictionary info file
    /// </summary>
    string ifoFile = "dic.ifo";

    BinaryReader idxReader;
    FileStream idxStream;
    BinaryReader dictReader;
    FileStream dictStream;

    /// <summary>
    /// Opens a previously saved dictionary for querying.
    /// </summary>
    public Dict()
    {
        LoadDictInfo();
        try
        {
            idxStream = new FileStream(idxFile, FileMode.Open);
            idxReader = new BinaryReader(idxStream);
            dictStream = new FileStream(dictfile, FileMode.Open);
            dictReader = new BinaryReader(dictStream);
        }
        catch
        {
            // Do not leave the first file open when the second one cannot be opened.
            Dispose();
            throw;
        }
    }

    /// <summary>
    /// Creates an empty dictionary to be filled with <see cref="Add"/> and written with <see cref="Save"/>.
    /// </summary>
    /// <param name="name">Name of the dictionary, stored in the info file.</param>
    public Dict(string name)
    {
        info = new DictInfo { BookName = name, WordCount = 0, CurrentOffset = 0 };
        indexs = new SortedList<string, DictIndex>();
        words = new List<DictWord>();
    }

    /// <summary>
    /// Looks up a word by binary search over the index file and returns its explanation.
    /// </summary>
    /// <param name="word">The word to look up.</param>
    /// <returns>
    /// "[headword]\n" followed by the explanation. When there is no exact match, the entry probed
    /// last by the search (a neighbour in sort order) is returned, so the bracketed headword tells
    /// the caller what was actually found. For an empty dictionary the result is "[]\n".
    /// </returns>
    public string GetDescription(string word)
    {
        EnsureQueryMode();

        var low = 0;
        // Last valid record index; probing WordCount itself would read past the end of the file.
        var high = info.WordCount - 1;
        DictIndex w = new DictIndex();
        while (low <= high)
        {
            var mid = low + (high - low) / 2;
            w = GetWordIndex(mid);
            // NOTE(review): this uses the default string ordering, like the SortedList that
            // produced the index order; build and query are expected to run under the same culture.
            var order = string.Compare(w.Word, word);
            if (order > 0)
            {
                high = mid - 1;
            }
            else if (order < 0)
            {
                low = mid + 1;
            }
            else
            {
                break;
            }
        }
        return "[" + w.Word + "]\n" + GetWordDescription(w);
    }

    /// <summary>
    /// Reads the index record at the given position from the index file.
    /// </summary>
    /// <param name="wordIndex">Zero-based position of the record.</param>
    /// <returns>The decoded index entry.</returns>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="wordIndex"/> is outside the stored records.</exception>
    public DictIndex GetWordIndex(int wordIndex)
    {
        EnsureQueryMode();
        if (wordIndex < 0 || wordIndex >= info.WordCount)
        {
            throw new ArgumentOutOfRangeException("wordIndex");
        }

        // Compute the position as long so large dictionaries cannot overflow Int32.
        idxStream.Seek((long)wordIndex * IndexRecordSize, SeekOrigin.Begin);
        byte[] word = idxReader.ReadBytes(WordFieldSize);
        var dicIndex = new DictIndex();
        // The word field is zero-padded; strip the padding.
        dicIndex.Word = Encoding.UTF8.GetString(word).Replace("\0", "");
        dicIndex.Offset = idxReader.ReadInt32();
        dicIndex.DataSize = idxReader.ReadInt32();
        return dicIndex;
    }

    /// <summary>
    /// Reads the explanation an index entry points to from the data file.
    /// </summary>
    /// <param name="dictIndex">Index entry providing offset and size of the explanation.</param>
    /// <returns>The explanation text.</returns>
    public string GetWordDescription(DictIndex dictIndex)
    {
        EnsureQueryMode();
        dictStream.Seek(dictIndex.Offset, SeekOrigin.Begin);
        byte[] data = dictReader.ReadBytes(dictIndex.DataSize);
        return Encoding.UTF8.GetString(data).Replace("\0", "");
    }

    /// <summary>
    /// Adds a word and its explanation to the dictionary being built.
    /// </summary>
    /// <param name="word">The headword; it becomes the key of the sorted index.</param>
    /// <param name="description">The explanation text that will be written to the data file.</param>
    /// <exception cref="ArgumentNullException"><paramref name="word"/> or <paramref name="description"/> is null.</exception>
    /// <exception cref="ArgumentException">An entry with the same word has already been added.</exception>
    public void Add(string word, string description)
    {
        EnsureBuildMode();

        // Measure the UTF-8 payload once; this also rejects a null description up front.
        int dataSize = Encoding.UTF8.GetByteCount(description);

        // Index first: SortedList.Add throws on a null or duplicate key, and in that case
        // "words" must stay untouched, otherwise Save() would write an orphan explanation
        // and every offset recorded afterwards would be wrong.
        indexs.Add(word, new DictIndex { Word = word, Offset = info.CurrentOffset, DataSize = dataSize });
        words.Add(new DictWord { Description = description });

        // One more word ...
        info.WordCount++;
        // ... and the next explanation starts right after this one.
        info.CurrentOffset += dataSize;
    }

    /// <summary>
    /// Loads the dictionary information from the info file (three "key=value" lines).
    /// </summary>
    void LoadDictInfo()
    {
        var infos = File.ReadAllLines(ifoFile);
        if (infos.Length < 3)
        {
            throw new InvalidDataException("The dictionary info file must contain BookName, WordCount and CurrentOffset lines.");
        }

        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        info = new DictInfo
        {
            BookName = ReadInfoValue(infos[0], "BookName="),
            WordCount = int.Parse(ReadInfoValue(infos[1], "WordCount="), invariant),
            CurrentOffset = int.Parse(ReadInfoValue(infos[2], "CurrentOffset="), invariant),
        };
    }

    /// <summary>
    /// Returns the trimmed value of a "key=value" line. Only the leading key is removed,
    /// so a value that happens to contain the key text is left intact.
    /// </summary>
    static string ReadInfoValue(string line, string prefix)
    {
        if (!line.StartsWith(prefix, StringComparison.Ordinal))
        {
            throw new InvalidDataException("Expected a line starting with '" + prefix + "' in the dictionary info file.");
        }
        return line.Substring(prefix.Length).Trim();
    }

    /// <summary>
    /// Writes the dictionary to disk: the info file, the data file and the index file.
    /// </summary>
    public void Save()
    {
        EnsureBuildMode();

        // Info file: three "key=value" lines; numbers are written culture-independently.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        StringBuilder infoText = new StringBuilder();
        infoText.AppendLine(string.Format("BookName={0}", info.BookName));
        infoText.AppendLine(string.Format(invariant, "WordCount={0}", info.WordCount));
        infoText.AppendLine(string.Format(invariant, "CurrentOffset={0}", info.CurrentOffset));
        File.WriteAllText(ifoFile, infoText.ToString(), Encoding.UTF8);

        // Data file: the raw UTF-8 explanations back to back, in the order they were added.
        using (BinaryWriter dataWriter = new BinaryWriter(File.Open(dictfile, FileMode.Create)))
        {
            foreach (var entry in words)
            {
                dataWriter.Write(Encoding.UTF8.GetBytes(entry.Description));
            }
        }

        // Index file: one fixed-size record per word, in the sorted order of the index.
        using (BinaryWriter idxWriter = new BinaryWriter(File.Open(idxFile, FileMode.Create)))
        {
            foreach (var index in indexs)
            {
                byte[] wordData = Encoding.UTF8.GetBytes(index.Key);
                int length = Math.Min(WordFieldSize, wordData.Length);

                // When truncating, do not cut a multi-byte UTF-8 sequence in half:
                // back up while the first dropped byte is a continuation byte (10xxxxxx).
                while (length > 0 && length < wordData.Length && (wordData[length] & 0xC0) == 0x80)
                {
                    length--;
                }

                // The rest of the field stays zero, which is the padding GetWordIndex strips.
                byte[] wordField = new byte[WordFieldSize];
                Array.Copy(wordData, wordField, length);

                idxWriter.Write(wordField);
                idxWriter.Write(index.Value.Offset);
                idxWriter.Write(index.Value.DataSize);
            }
        }
    }

    /// <summary>
    /// Closes the index and data files opened by the query constructor. Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        if (idxReader != null)
        {
            idxReader.Dispose();
            idxReader = null;
        }
        if (idxStream != null)
        {
            idxStream.Dispose();
            idxStream = null;
        }
        if (dictReader != null)
        {
            dictReader.Dispose();
            dictReader = null;
        }
        if (dictStream != null)
        {
            dictStream.Dispose();
            dictStream = null;
        }
    }

    /// <summary>
    /// Throws when the files are not open for lookups.
    /// </summary>
    void EnsureQueryMode()
    {
        if (idxReader == null || dictReader == null)
        {
            throw new InvalidOperationException("The dictionary is not open for querying (create it with the parameterless constructor) or has been disposed.");
        }
    }

    /// <summary>
    /// Throws when the instance was not created for building a dictionary.
    /// </summary>
    void EnsureBuildMode()
    {
        if (indexs == null || words == null)
        {
            throw new InvalidOperationException("The dictionary was not created for building (create it with the constructor that takes a name).");
        }
    }
}
演示#
如图所示
文件夹中放置了许多文本文件,内容为词语的解释
首先、建立词典:
// Build a dictionary from a folder of text files: one file per word, file content = explanation.
Dict dictionary = new Dict("病症词典");
DirectoryInfo sourceFolder = new DirectoryInfo(@"G:\Users\Administrator\Desktop\新建文件夹 (3)\新建文件夹 (3)");
foreach (FileInfo entry in sourceFolder.GetFiles())
{
    Console.WriteLine(entry.FullName);

    // The headword is the file name with "的症状.txt" removed.
    string headword = entry.Name.Replace("的症状.txt", "");
    string explanation = File.ReadAllText(entry.FullName);
    dictionary.Add(headword, explanation);
}
dictionary.Save();
然后、把玩一番:
// Interactive lookup: read a word from the console, print what was found and how long it took.
Dict dict = new Dict();
for (;;)
{
    Console.Write("请输入词语:");
    string query = Console.ReadLine();

    // The measured time covers the lookup and the console output of the result.
    Stopwatch timer = Stopwatch.StartNew();
    Console.WriteLine("找到词语:");
    Console.WriteLine(dict.GetDescription(query));
    timer.Stop();

    Console.WriteLine("耗时:" + timer.ElapsedMilliseconds + "ms");
}
运行结果:
到此为止,谢谢收看!
关注作者
作者: JadePeng
出处:https://www.cnblogs.com/xiaoqi/archive/2011/04/02/2003745.html
版权:本文采用「署名-非商业性使用-相同方式共享 4.0 国际」知识共享许可协议进行许可(欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文链接)。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 全程不用写代码,我用AI程序员写了一个飞机大战
· DeepSeek 开源周回顾「GitHub 热点速览」
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了