Code
lucene.net索引文件存储简析2007年12月03日 星期一 下午 05:53在lucene.net中,典型的索引文件操作代码如下:
// Typical Lucene.Net indexing usage: open a writer, add one document,
// then optimize and close in a finally block. Fixes to the original
// excerpt, which did not compile:
// - verbatim path string (@"c:\index") so "\i" is not an invalid escape,
// - missing closing parenthesis on each doc.Add(Field...(...)) call,
// - AddDocument must be passed the Document being indexed.
IndexWriter writer = new IndexWriter(@"c:\index", new StandardAnalyzer(), true);
try
{
Document doc = new Document();
doc.Add(Field.Keyword("name", "name name"));
doc.Add(Field.Text("title", "title title"));
doc.Add(Field.Text("content", "content content"));
writer.AddDocument(doc);
}
finally{
writer.Optimize();
writer.Close();
}
在上面的代码中:
IndexWriter专门用于索引文件的写入操作;
StandardAnalyzer是一个分析器,用于对要索引的内容进行切分处理;
Document表示一条被索引的记录;
下面简单分析一下索引文件的创建过程
01. 创建IndexWriter
IndexWriter有几个重载的构造函数,它们都调用私有构造函数
// Private constructor that all public IndexWriter constructors delegate to.
// d: index storage; a: analyzer used to tokenize content; create: true to
// build a new index, false to append to an existing one; closeDir: whether
// the writer owns (and later closes) the directory.
private IndexWriter(Directory d, Analyzer a, bool create, bool closeDir) {
// initialization and lock setup omitted in this excerpt
lock (directory) {
// in- & inter-process sync: run the index init under the commit lock
new AnonymousClassWith(create, this, directory.MakeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
}
}
参数Directory是Lucene.net内实现的一个存储结构,它有基于文件系统的FSDirectory和基于内存的RAMDirectory两个版本;
Analyzer为分析器,用于索引内容的切分,StandardAnalyzer是Lucene.net内的一个标准的分析器实现,支持中英文,不过中文是按单字切分的;
create指定是否创建索引目录,如果要在同一文件夹内进行增量索引,create应设置为false;
AnonymousClassWith是一个辅助类,用于完成索引目录的初始化操作,
// class Lock.With (lock.cs)
// Acquire the lock (with a timeout), run DoBody() while holding it, and
// always release the lock if it was obtained — even when DoBody() throws.
public virtual Object Run() {
bool locked = false;
try {
// Obtain may wait up to lockWaitTimeout; returns whether the lock was taken
locked = lock_Renamed.Obtain(lockWaitTimeout);
return DoBody();
}
finally {
if (locked)
lock_Renamed.Release();
}
}
先取得一个操作锁对象,然后进行具体的操作,避免多个线程同时进行一项操作。
关于lucene的锁操作在后续文章中介绍,
// class IndexWriter.AnonymousClassWith
// Runs under the commit lock: write a fresh (empty) segments file when
// creating a new index, otherwise read the existing segment list.
public override Object DoBody() {
if (create)
Enclosing_Instance.segmentInfos.Write(Enclosing_Instance.directory);
else
Enclosing_Instance.segmentInfos.Read(Enclosing_Instance.directory);
return null;
}
根据create的值决定是新索引目录(Write)还是使用现有索引目录(Read);
segmentInfos用于记录索引目录下的索引文件信息.
// class SegmentInfos
// Persist the segment list: write to "segments.new" first, then rename it
// to "segments" so readers never observe a half-written segments file.
public void Write(Directory directory) {
OutputStream output = directory.CreateFile("segments.new");
try {
output.WriteInt(FORMAT); // write FORMAT
output.WriteLong(++version); // every write changes the index
output.WriteInt(counter); // write counter
output.WriteInt(Count); // write infos (number of segments)
for (int i = 0; i < Count; i++) {
SegmentInfo si = Info(i);
output.WriteString(si.name); // segment name
output.WriteInt(si.docCount); // documents in this segment
}
}
finally {
output.Close();
}
// atomic-style swap of the new file into place
directory.RenameFile("segments.new", "segments");
}
创建一个索引信息文件,lucene.net实现了一套与平台无关的存储机制,采用按字节进行读取和写入,并定义了以下几种类型:
Int: 整型 四字节
Long: 长整型 八字节
VInt: 变长整型 不定
VLong: 变长长整型 不定
String: 字符串 不定
OutputStream为lucene.net内实现的输出流类
从上面的代码得到segments文件的格式如下:
Format + Version + Counter + segment总数 + (segment名称1 + doc总数1) + (segment名称2 + doc总数2) + () + (segment名称n + doc总数n)
// class SegmentInfos
// Load the existing segment list from the "segments" file produced by
// Write(). NOTE(review): this excerpt reads only the format header and
// then treats the next int as the segment count; the version/counter
// fields that Write() emits are not read here — presumably elided from
// the quoted source; verify against the full library code.
public void Read(Directory directory) {
InputStream input = directory.OpenFile("segments");
try {
int format = input.ReadInt();
for (int i = input.ReadInt(); i > 0; i--) {
// read segmentInfos: (name, docCount) pairs
SegmentInfo si = new SegmentInfo(input.ReadString(), input.ReadInt(), directory);
Add(si);
}
}
finally {
input.Close();
}
}
读取现有索引文件信息。InputStream为lucene.net内实现的输入流类.
02. 添加文档(AddDocument)
// class IndexWriter
// Index one document: invert it into a brand-new single-document segment
// held in ramDirectory, then register that segment and possibly trigger a
// merge, both under the writer's lock.
public virtual void AddDocument(Document doc, Analyzer analyzer) {
DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
System.String segmentName = NewSegmentName();
dw.AddDocument(segmentName, doc);
lock (this) {
// 1 == the new segment contains exactly one document
segmentInfos.Add(new SegmentInfo(segmentName, 1, ramDirectory));
MaybeMergeSegments();
}
}
先构造一个DocumentWriter, 通过NewSegmentName取得一个随机的segment文件名,
然后调用DocumentWriter.AddDocument对文档进行处理,并把内容写入到segment文件中,
最后调用MaybeMergeSegments检查是否合并segment。
// class DocumentWriter
// Build all on-disk structures for one document in segment `segment`:
// field names (.fnm), stored field values, inverted postings, and norms.
public void AddDocument(System.String segment, Document doc) {
// write Field names
fieldInfos = new FieldInfos();
fieldInfos.Add(doc);
fieldInfos.Write(directory, segment + ".fnm");
// write Field values
FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
try {
fieldsWriter.AddDocument(doc);
}
finally {
fieldsWriter.Close();
}
// invert doc into postingTable
postingTable.Clear(); // clear postingTable
fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
float boost = doc.GetBoost();
// every field starts from the document-level boost; per-field boosts
// are multiplied in later by InvertDocument
for (int i = 0; i < fieldBoosts.Length; i++) {
fieldBoosts[i] = boost;
}
InvertDocument(doc);
// sort postingTable into an array
Posting[] postings = SortPostingTable();
// write postings
WritePostings(postings, segment);
// write norms of indexed fields
WriteNorms(doc, segment);
}
1. 将字段信息写入.fnm文件中;
// class FieldInfos
// Serialize field metadata: a VInt field count followed by (name, flags)
// pairs; flag bit 0x1 = field is indexed, bit 0x2 = term vector stored.
public void Write(OutputStream output) {
output.WriteVInt(Size());
for (int i = 0; i < Size(); i++) {
FieldInfo fi = FieldInfo(i);
byte bits = (byte) (0x0);
if (fi.isIndexed)
bits |= (byte) (0x1);
if (fi.storeTermVector)
bits |= (byte) (0x2);
output.WriteString(fi.name);
output.WriteByte(bits);
}
}
由上面的代码得到.fnm文件的格式如下:
字段总数 + (字段名1 + 字段标志1) + (字段名2 + 字段标志2) + () + (字段名n + 字段标志n)
2. 存储字段内容
// class FieldsWriter
// Append one document's stored fields to fieldsStream (.fdt), first
// recording the document's start offset in indexStream (.fdx) so the
// document can later be located by number.
internal void AddDocument(Document doc) {
indexStream.WriteLong(fieldsStream.GetFilePointer());
// first pass: count the fields that are actually stored
int storedCount = 0;
foreach (Field field in doc.Fields()) {
if (field.IsStored())
storedCount++;
}
fieldsStream.WriteVInt(storedCount);
// second pass: write (field number, flags, value) for each stored field
foreach (Field field in doc.Fields()) {
if (field.IsStored()) {
fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));
byte bits = 0;
if (field.IsTokenized())
bits |= 1; // bit 0: value was tokenized
fieldsStream.WriteByte(bits);
fieldsStream.WriteString(field.StringValue());
}
}
}
根据字段的IsStored()判断字段内容是否要存储,从上面的代码得到.fdt的格式如下:
要存储的字段总数 + (字段在.fnm中的序号1 + 字段标识1 + 字段内容1) + (…) + (字段在.fnm中的序号n + 字段标识n + 字段内容n)
3. 倒排文档InvertDocument
// class DocumentWriter
// Tokenize each indexed field and record every term occurrence into
// postingTable via AddPosition; per-field length, position and boost
// accumulate across multiple Field instances sharing the same name.
private void InvertDocument(Document doc) {
foreach(Field field in doc.Fields()) {
System.String fieldName = field.Name();
int fieldNumber = fieldInfos.FieldNumber(fieldName);
int length = fieldLengths[fieldNumber]; // length of Field
int position = fieldPositions[fieldNumber]; // position in Field
if (field.IsIndexed()) {
if (!field.IsTokenized()) {
// un-tokenized Field: the whole value is indexed as one term
AddPosition(fieldName, field.StringValue(), position++);
length++;
}
else {
System.IO.TextReader reader; // find or make Reader
if (field.ReaderValue() != null)
reader = field.ReaderValue();
else if (field.StringValue() != null)
reader = new System.IO.StringReader(field.StringValue());
else
throw new System.ArgumentException("Field must have either String or Reader value");
// Tokenize Field and add to postingTable
TokenStream stream = analyzer.TokenStream(fieldName, reader);
try {
for (Token t = stream.Next(); t != null; t = stream.Next()) {
// honor the token's position increment (gaps between tokens)
position += (t.GetPositionIncrement() - 1);
AddPosition(fieldName, t.TermText(), position++);
// cap the number of indexed terms per field
if (++length > maxFieldLength)
break;
}
}
finally {
stream.Close();
}
}
fieldLengths[fieldNumber] = length; // save Field length
fieldPositions[fieldNumber] = position; // save Field position
fieldBoosts[fieldNumber] *= field.GetBoost();
}
}
}
通过analyzer(分析器)对字段内容进行切分,并保存Term信息, 关于analyzer在后续文章中进行介绍.
fieldLengths 存储处理的term总数;
fieldPositions 保存最后一个被处理term的位置;
fieldBoosts 保存各字段累乘后的boost加权值;
Posting类(包含term, 位置和出现频率).
// class Document.Writer.Posting
// One term's occurrence data within a single document: the term itself,
// its frequency so far, and every position at which it appeared.
sealed class Posting {
// info about a Term in a doc
internal Term term; // the Term
internal int freq; // its frequency in doc
internal int[] positions; // positions it occurs at
// a Posting is created with the term's first occurrence position
internal Posting(Term t, int position)
{
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
}
}
保存Posting信息
// class DocumentWriter
// Record one occurrence of (field, text) at `position`: bump the frequency
// and grow the positions array of an existing Posting, or insert a new
// Posting on first sight of the term.
private void AddPosition(System.String field, System.String text, int position) {
// reuse a shared buffer as the lookup key to avoid allocating a Term per token
termBuffer.Set(field, text);
Posting ti = (Posting) postingTable[termBuffer];
if (ti != null) {
// word seen before
int freq = ti.freq;
if (ti.positions.Length == freq) {
// positions array is full
int[] newPositions = new int[freq * 2]; // double size
int[] positions = ti.positions;
for (int i = 0; i < freq; i++)
// copy old positions to new
newPositions[i] = positions[i];
ti.positions = newPositions;
}
ti.positions[freq] = position; // add new position
ti.freq = freq + 1; // update frequency
}
else {
// word not seen before: allocate a real Term as the permanent hash key
Term term = new Term(field, text, false);
postingTable[term] = new Posting(term, position);
}
}
先查找term是否存在,如存在则增加词频freq, 不存在则创建一个Posting。
postingTable是一个Hashtable,用于存储Posting信息,
4. 对Posting进行排序
// class DocumentWriter
// Snapshot postingTable's values into an array and quick-sort it by Term
// (see Term.CompareTo) so postings can be written in term order.
private Posting[] SortPostingTable() {
// copy postingTable into an array
Posting[] array = new Posting[postingTable.Count];
System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
for (int i = 0; postings.MoveNext(); i++) {
array[i] = (Posting) postings.Current;
}
// sort the array
QuickSort(array, 0, array.Length - 1);
return array;
}
用QuickSort(快速排序)对Posting进行排序,通过比较Term进行排序, QuickSort这里就不列出了,请参考相关源码或算法结构。
// class Term
// Order terms first by field name, then by term text, using ordinal
// (code-unit) comparison rather than culture-sensitive rules.
public int CompareTo(Term other) {
if (field == other.field)
// fields are interned, so reference comparison with == suffices here
return String.CompareOrdinal(text, other.text);
else
return String.CompareOrdinal(field, other.field);
}
从上面可得知,Term是通过比较字符的数值进行排序的。
5. 将Posting写入文件
03. 合并Segments
来源:http://blog.esoutong.com/user1/Lucene/archives/2006/796.html
lucene.net索引文件存储简析2007年12月03日 星期一 下午 05:53在lucene.net中,典型的索引文件操作代码如下:
// Typical Lucene.Net indexing usage: open a writer, add one document,
// then optimize and close in a finally block. Fixes to the original
// excerpt, which did not compile:
// - verbatim path string (@"c:\index") so "\i" is not an invalid escape,
// - missing closing parenthesis on each doc.Add(Field...(...)) call,
// - AddDocument must be passed the Document being indexed.
IndexWriter writer = new IndexWriter(@"c:\index", new StandardAnalyzer(), true);
try
{
Document doc = new Document();
doc.Add(Field.Keyword("name", "name name"));
doc.Add(Field.Text("title", "title title"));
doc.Add(Field.Text("content", "content content"));
writer.AddDocument(doc);
}
finally{
writer.Optimize();
writer.Close();
}
在上面的代码中:
IndexWriter专门用于索引文件的写入操作;
StandardAnalyzer是一个分析器,用于对要索引的内容进行切分处理;
Document表示一条被索引的记录;
下面简单分析一下索引文件的创建过程
01. 创建IndexWriter
IndexWriter有几个重载的构造函数,它们都调用私有构造函数
// Private constructor that all public IndexWriter constructors delegate to.
// d: index storage; a: analyzer used to tokenize content; create: true to
// build a new index, false to append to an existing one; closeDir: whether
// the writer owns (and later closes) the directory.
private IndexWriter(Directory d, Analyzer a, bool create, bool closeDir) {
// initialization and lock setup omitted in this excerpt
lock (directory) {
// in- & inter-process sync: run the index init under the commit lock
new AnonymousClassWith(create, this, directory.MakeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT).Run();
}
}
参数Directory是Lucene.net内实现的一个存储结构,它有基于文件系统的FSDirectory和基于内存的RAMDirectory两个版本;
Analyzer为分析器,用于索引内容的切分,StandardAnalyzer是Lucene.net内的一个标准的分析器实现,支持中英文,不过中文是按单字切分的;
create指定是否创建索引目录,如果要在同一文件夹内进行增量索引,create应设置为false;
AnonymousClassWith是一个辅助类,用于完成索引目录的初始化操作,
// class Lock.With (lock.cs)
// Acquire the lock (with a timeout), run DoBody() while holding it, and
// always release the lock if it was obtained — even when DoBody() throws.
public virtual Object Run() {
bool locked = false;
try {
// Obtain may wait up to lockWaitTimeout; returns whether the lock was taken
locked = lock_Renamed.Obtain(lockWaitTimeout);
return DoBody();
}
finally {
if (locked)
lock_Renamed.Release();
}
}
先取得一个操作锁对象,然后进行具体的操作,避免多个线程同时进行一项操作。
关于lucene的锁操作在后续文章中介绍,
// class IndexWriter.AnonymousClassWith
// Runs under the commit lock: write a fresh (empty) segments file when
// creating a new index, otherwise read the existing segment list.
public override Object DoBody() {
if (create)
Enclosing_Instance.segmentInfos.Write(Enclosing_Instance.directory);
else
Enclosing_Instance.segmentInfos.Read(Enclosing_Instance.directory);
return null;
}
根据create的值决定是新索引目录(Write)还是使用现有索引目录(Read);
segmentInfos用于记录索引目录下的索引文件信息.
// class SegmentInfos
// Persist the segment list: write to "segments.new" first, then rename it
// to "segments" so readers never observe a half-written segments file.
public void Write(Directory directory) {
OutputStream output = directory.CreateFile("segments.new");
try {
output.WriteInt(FORMAT); // write FORMAT
output.WriteLong(++version); // every write changes the index
output.WriteInt(counter); // write counter
output.WriteInt(Count); // write infos (number of segments)
for (int i = 0; i < Count; i++) {
SegmentInfo si = Info(i);
output.WriteString(si.name); // segment name
output.WriteInt(si.docCount); // documents in this segment
}
}
finally {
output.Close();
}
// atomic-style swap of the new file into place
directory.RenameFile("segments.new", "segments");
}
创建一个索引信息文件,lucene.net实现了一套与平台无关的存储机制,采用按字节进行读取和写入,并定义了以下几种类型:
Int: 整型 四字节
Long: 长整型 八字节
VInt: 变长整型 不定
VLong: 变长长整型 不定
String: 字符串 不定
OutputStream为lucene.net内实现的输出流类
从上面的代码得到segments文件的格式如下:
Format + Version + Counter + segment总数 + (segment名称1 + doc总数1) + (segment名称2 + doc总数2) + () + (segment名称n + doc总数n)
// class SegmentInfos
// Load the existing segment list from the "segments" file produced by
// Write(). NOTE(review): this excerpt reads only the format header and
// then treats the next int as the segment count; the version/counter
// fields that Write() emits are not read here — presumably elided from
// the quoted source; verify against the full library code.
public void Read(Directory directory) {
InputStream input = directory.OpenFile("segments");
try {
int format = input.ReadInt();
for (int i = input.ReadInt(); i > 0; i--) {
// read segmentInfos: (name, docCount) pairs
SegmentInfo si = new SegmentInfo(input.ReadString(), input.ReadInt(), directory);
Add(si);
}
}
finally {
input.Close();
}
}
读取现有索引文件信息。InputStream为lucene.net内实现的输入流类.
02. 添加文档(AddDocument)
// class IndexWriter
// Index one document: invert it into a brand-new single-document segment
// held in ramDirectory, then register that segment and possibly trigger a
// merge, both under the writer's lock.
public virtual void AddDocument(Document doc, Analyzer analyzer) {
DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
System.String segmentName = NewSegmentName();
dw.AddDocument(segmentName, doc);
lock (this) {
// 1 == the new segment contains exactly one document
segmentInfos.Add(new SegmentInfo(segmentName, 1, ramDirectory));
MaybeMergeSegments();
}
}
先构造一个DocumentWriter, 通过NewSegmentName取得一个随机的segment文件名,
然后调用DocumentWriter.AddDocument对文档进行处理,并把内容写入到segment文件中,
最后调用MaybeMergeSegments检查是否合并segment。
// class DocumentWriter
// Build all on-disk structures for one document in segment `segment`:
// field names (.fnm), stored field values, inverted postings, and norms.
public void AddDocument(System.String segment, Document doc) {
// write Field names
fieldInfos = new FieldInfos();
fieldInfos.Add(doc);
fieldInfos.Write(directory, segment + ".fnm");
// write Field values
FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
try {
fieldsWriter.AddDocument(doc);
}
finally {
fieldsWriter.Close();
}
// invert doc into postingTable
postingTable.Clear(); // clear postingTable
fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
float boost = doc.GetBoost();
// every field starts from the document-level boost; per-field boosts
// are multiplied in later by InvertDocument
for (int i = 0; i < fieldBoosts.Length; i++) {
fieldBoosts[i] = boost;
}
InvertDocument(doc);
// sort postingTable into an array
Posting[] postings = SortPostingTable();
// write postings
WritePostings(postings, segment);
// write norms of indexed fields
WriteNorms(doc, segment);
}
1. 将字段信息写入.fnm文件中;
// class FieldInfos
// Serialize field metadata: a VInt field count followed by (name, flags)
// pairs; flag bit 0x1 = field is indexed, bit 0x2 = term vector stored.
public void Write(OutputStream output) {
output.WriteVInt(Size());
for (int i = 0; i < Size(); i++) {
FieldInfo fi = FieldInfo(i);
byte bits = (byte) (0x0);
if (fi.isIndexed)
bits |= (byte) (0x1);
if (fi.storeTermVector)
bits |= (byte) (0x2);
output.WriteString(fi.name);
output.WriteByte(bits);
}
}
由上面的代码得到.fnm文件的格式如下:
字段总数 + (字段名1 + 字段标志1) + (字段名2 + 字段标志2) + () + (字段名n + 字段标志n)
2. 存储字段内容
// class FieldsWriter
// Append one document's stored fields to fieldsStream (.fdt), first
// recording the document's start offset in indexStream (.fdx) so the
// document can later be located by number.
internal void AddDocument(Document doc) {
indexStream.WriteLong(fieldsStream.GetFilePointer());
// first pass: count the fields that are actually stored
int storedCount = 0;
foreach (Field field in doc.Fields()) {
if (field.IsStored())
storedCount++;
}
fieldsStream.WriteVInt(storedCount);
// second pass: write (field number, flags, value) for each stored field
foreach (Field field in doc.Fields()) {
if (field.IsStored()) {
fieldsStream.WriteVInt(fieldInfos.FieldNumber(field.Name()));
byte bits = 0;
if (field.IsTokenized())
bits |= 1; // bit 0: value was tokenized
fieldsStream.WriteByte(bits);
fieldsStream.WriteString(field.StringValue());
}
}
}
根据字段的IsStored()判断字段内容是否要存储,从上面的代码得到.fdt的格式如下:
要存储的字段总数 + (字段在.fnm中的序号1 + 字段标识1 + 字段内容1) + (…) + (字段在.fnm中的序号n + 字段标识n + 字段内容n)
3. 倒排文档InvertDocument
// class DocumentWriter
// Tokenize each indexed field and record every term occurrence into
// postingTable via AddPosition; per-field length, position and boost
// accumulate across multiple Field instances sharing the same name.
private void InvertDocument(Document doc) {
foreach(Field field in doc.Fields()) {
System.String fieldName = field.Name();
int fieldNumber = fieldInfos.FieldNumber(fieldName);
int length = fieldLengths[fieldNumber]; // length of Field
int position = fieldPositions[fieldNumber]; // position in Field
if (field.IsIndexed()) {
if (!field.IsTokenized()) {
// un-tokenized Field: the whole value is indexed as one term
AddPosition(fieldName, field.StringValue(), position++);
length++;
}
else {
System.IO.TextReader reader; // find or make Reader
if (field.ReaderValue() != null)
reader = field.ReaderValue();
else if (field.StringValue() != null)
reader = new System.IO.StringReader(field.StringValue());
else
throw new System.ArgumentException("Field must have either String or Reader value");
// Tokenize Field and add to postingTable
TokenStream stream = analyzer.TokenStream(fieldName, reader);
try {
for (Token t = stream.Next(); t != null; t = stream.Next()) {
// honor the token's position increment (gaps between tokens)
position += (t.GetPositionIncrement() - 1);
AddPosition(fieldName, t.TermText(), position++);
// cap the number of indexed terms per field
if (++length > maxFieldLength)
break;
}
}
finally {
stream.Close();
}
}
fieldLengths[fieldNumber] = length; // save Field length
fieldPositions[fieldNumber] = position; // save Field position
fieldBoosts[fieldNumber] *= field.GetBoost();
}
}
}
通过analyzer(分析器)对字段内容进行切分,并保存Term信息, 关于analyzer在后续文章中进行介绍.
fieldLengths 存储处理的term总数;
fieldPositions 保存最后一个被处理term的位置;
fieldBoosts 保存各字段累乘后的boost加权值;
Posting类(包含term, 位置和出现频率).
// class Document.Writer.Posting
// One term's occurrence data within a single document: the term itself,
// its frequency so far, and every position at which it appeared.
sealed class Posting {
// info about a Term in a doc
internal Term term; // the Term
internal int freq; // its frequency in doc
internal int[] positions; // positions it occurs at
// a Posting is created with the term's first occurrence position
internal Posting(Term t, int position)
{
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
}
}
保存Posting信息
// class DocumentWriter
// Record one occurrence of (field, text) at `position`: bump the frequency
// and grow the positions array of an existing Posting, or insert a new
// Posting on first sight of the term.
private void AddPosition(System.String field, System.String text, int position) {
// reuse a shared buffer as the lookup key to avoid allocating a Term per token
termBuffer.Set(field, text);
Posting ti = (Posting) postingTable[termBuffer];
if (ti != null) {
// word seen before
int freq = ti.freq;
if (ti.positions.Length == freq) {
// positions array is full
int[] newPositions = new int[freq * 2]; // double size
int[] positions = ti.positions;
for (int i = 0; i < freq; i++)
// copy old positions to new
newPositions[i] = positions[i];
ti.positions = newPositions;
}
ti.positions[freq] = position; // add new position
ti.freq = freq + 1; // update frequency
}
else {
// word not seen before: allocate a real Term as the permanent hash key
Term term = new Term(field, text, false);
postingTable[term] = new Posting(term, position);
}
}
先查找term是否存在,如存在则增加词频freq, 不存在则创建一个Posting。
postingTable是一个Hashtable,用于存储Posting信息,
4. 对Posting进行排序
// class DocumentWriter
// Snapshot postingTable's values into an array and quick-sort it by Term
// (see Term.CompareTo) so postings can be written in term order.
private Posting[] SortPostingTable() {
// copy postingTable into an array
Posting[] array = new Posting[postingTable.Count];
System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
for (int i = 0; postings.MoveNext(); i++) {
array[i] = (Posting) postings.Current;
}
// sort the array
QuickSort(array, 0, array.Length - 1);
return array;
}
用QuickSort(快速排序)对Posting进行排序,通过比较Term进行排序, QuickSort这里就不列出了,请参考相关源码或算法结构。
// class Term
// Order terms first by field name, then by term text, using ordinal
// (code-unit) comparison rather than culture-sensitive rules.
public int CompareTo(Term other) {
if (field == other.field)
// fields are interned, so reference comparison with == suffices here
return String.CompareOrdinal(text, other.text);
else
return String.CompareOrdinal(field, other.field);
}
从上面可得知,Term是通过比较字符的数值进行排序的。
5. 将Posting写入文件
03. 合并Segments
来源:http://blog.esoutong.com/user1/Lucene/archives/2006/796.html