[Lucene.Net] 应用实践
作者: yuhen
实验一
下面的代码中,我们使用两个线程来模拟并发的读写操作。运行该代码,我们会发现 IndexReader 并不能获得 Directory 的变更数据。
/// <summary>
/// Experiment 1: starts one writer thread and one reader thread over the same
/// in-memory directory. The reader is opened once, before its loop starts,
/// and the same instance is queried on every pass.
/// </summary>
private void Test1()
{
    Directory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer();

    // Writer side: add one document per pass and print the writer's document count.
    ThreadStart writeLoop = delegate()
    {
        IndexWriter writer = new IndexWriter(directory, analyzer, true);
        for (;;)
        {
            Document entry = new Document();
            entry.Add(Field.Keyword("a", "Hello, World!"));
            writer.AddDocument(entry);
            Console.WriteLine("Writer:{0}", writer.DocCount());
            Thread.Sleep(1000);
        }
    };
    new Thread(writeLoop).Start();

    // Reader side: the reader is created outside the loop and never reopened.
    ThreadStart readLoop = delegate()
    {
        IndexReader reader = IndexReader.Open(directory);
        for (;;)
        {
            Console.WriteLine("Reader:{0}", reader.NumDocs());
            Thread.Sleep(1000);
        }
    };
    new Thread(readLoop).Start();
}
// Body of the Experiment 1 listing, repeated here without its method header.
{
Directory directory = new RAMDirectory();
Analyzer analyzer = new StandardAnalyzer();
// Writer thread: adds one document per pass and prints the writer's document count.
new Thread(delegate()
{
IndexWriter writer = new IndexWriter(directory, analyzer, true);
while (true)
{
Document doc = new Document();
doc.Add(Field.Keyword("a", "Hello, World!"));
writer.AddDocument(doc);
Console.WriteLine("Writer:{0}", writer.DocCount());
Thread.Sleep(1000);
}
}).Start();
// Reader thread: the reader is opened once, before the loop, and reused on every pass.
new Thread(delegate()
{
IndexReader reader = IndexReader.Open(directory);
while (true)
{
Console.WriteLine("Reader:{0}", reader.NumDocs());
Thread.Sleep(1000);
}
}).Start();
}
通过分析我们会发现:
1. IndexReader.Open() 实际返回类型是 SegmentReader。
2. SegmentReader.Get() 方法返回其实例,并调用 Init() 和 Initialize() 方法进行初始化。
3. IndexReader 实例保存的是打开时刻的索引状态,而 while() 循环中既没有重新打开也没有更新该实例,因此无法获取 Directory 的变更信息。
解决方法:将 IndexReader reader = IndexReader.Open(directory); 放到 while() 内部,每次都获取新的初始化实例,并在用完后调用 reader.Close() 释放资源。
/// <summary>
/// Experiment 1, corrected reader side: a new IndexReader is opened on every
/// pass of the loop, and closed again once its document count has been printed.
/// </summary>
private void Test1()
{
    // ...
    new Thread(delegate()
    {
        while (true)
        {
            // Open a fresh reader on each pass instead of reusing one instance.
            IndexReader reader = IndexReader.Open(directory);
            try
            {
                Console.WriteLine("Reader:{0}", reader.NumDocs());
            }
            finally
            {
                // Without this, one unclosed reader would be left behind per pass.
                reader.Close();
            }
            Thread.Sleep(1000);
        }
    }).Start();
}
// Body of the corrected Experiment 1 listing, repeated here without its method header.
{
    // ...
    new Thread(delegate()
    {
        while (true)
        {
            // Open a fresh reader on each pass instead of reusing one instance.
            IndexReader reader = IndexReader.Open(directory);
            try
            {
                Console.WriteLine("Reader:{0}", reader.NumDocs());
            }
            finally
            {
                // Without this, one unclosed reader would be left behind per pass.
                reader.Close();
            }
            Thread.Sleep(1000);
        }
    }).Start();
}
实验二
Apache Lucene API 中对 IndexSearcher 有这样一句话:
For performance reasons it is recommended to open only one IndexSearcher and use it for all of your searches.
我们分别用下面两段代码模拟多用户并发访问,以便进行性能测试。结果发现代码1的性能的确比代码2要高出一些,并发线程越多这种差距越明显。
代码1: 多线程共享 IndexSearcher 对象
// Code 1: a single IndexSearcher is created once, before the loop, so that all
// 100 search threads really do share the same instance.
IndexSearcher searcher = new IndexSearcher(directory);
for (int i = 0; i < 100; i++)
{
    new Thread(delegate()
    {
        while (true)
        {
            // Only the query parse and the search itself are timed.
            Stopwatch watch = new Stopwatch();
            watch.Start();
            Hits hits = searcher.Search(QueryParser.Parse("Hello", "a", analyzer));
            watch.Stop();
            Console.WriteLine("Hits:{0}; Time:{1}", hits.Length(), watch.Elapsed);
            Thread.Sleep(1000);
        }
    }).Start();
}
// Loop body of the "code 1" listing, repeated here without its for header.
{
// The searcher is created before the thread starts and reused for every search in that thread.
IndexSearcher searcher = new IndexSearcher(directory);
new Thread(delegate()
{
while (true)
{
// Only the query parse and the search itself are timed.
Stopwatch watch = new Stopwatch();
watch.Start();
Hits hits = searcher.Search(QueryParser.Parse("Hello", "a", analyzer));
watch.Stop();
Console.WriteLine("Hits:{0}; Time:{1}", hits.Length(), watch.Elapsed);
Thread.Sleep(1000);
}
}).Start();
}
代码2: 使用新 IndexSearcher 实例搜索。
// Code 2: every search creates its own IndexSearcher and closes it afterwards.
// Creating and closing the searcher both fall inside the timed region.
for (int threadIndex = 0; threadIndex < 100; threadIndex++)
{
    ThreadStart searchLoop = delegate()
    {
        for (;;)
        {
            Stopwatch timer = new Stopwatch();
            timer.Start();

            IndexSearcher searcher = new IndexSearcher(directory);
            Query query = QueryParser.Parse("Hello", "a", analyzer);
            Hits hits = searcher.Search(query);
            searcher.Close();

            timer.Stop();
            Console.WriteLine("Hits:{0}; Time:{1}", hits.Length(), timer.Elapsed);
            Thread.Sleep(1000);
        }
    };
    new Thread(searchLoop).Start();
}
// Loop body of the "code 2" listing, repeated here without its for header.
{
new Thread(delegate()
{
while (true)
{
Stopwatch watch = new Stopwatch();
watch.Start();
// A new searcher is created for this one search and closed right after it;
// both steps are inside the timed region.
IndexSearcher searcher = new IndexSearcher(directory);
Hits hits = searcher.Search(QueryParser.Parse("Hello", "a", analyzer));
searcher.Close();
watch.Stop();
Console.WriteLine("Hits:{0}; Time:{1}", hits.Length(), watch.Elapsed);
Thread.Sleep(1000);
}
}).Start();
}
具体分析一下:
1. IndexSearcher 内部通过 IndexReader 来读取索引数据。
2. 代码2除了创建 IndexSearcher 对象的开销外,主要性能损失在 IndexReader.Open() 方法中的多线程同步排队上。
// Abridged excerpt: the constructor arguments of AnonymousClassWith are elided ("...").
// Returns an IndexReader for the given directory. The whole call runs while
// holding a lock on the directory object, so concurrent callers passing the
// same Directory instance take turns.
// NOTE(review): closeDirectory is not used in the visible part of this excerpt;
// presumably it is passed on through the elided arguments - confirm against the full source.
private static IndexReader Open(Directory directory, bool closeDirectory)
{
lock (directory)
{
// in- & inter-process sync
return (IndexReader) new AnonymousClassWith(...).Run();
}
}
// Body of the Open excerpt, repeated here without its method header.
{
// The reader is produced while holding a lock on the directory object.
lock (directory)
{
// in- & inter-process sync
return (IndexReader) new AnonymousClassWith(...).Run();
}
}
尽管如此,我们有几个理由放弃代码1方案。
1. 基于实验一的结果,代码1无法实时获得索引库变更信息。
2. 代码1不方便调用 IndexSearcher.Close() 方法,有可能造成内存泄漏。(请参考《内存泄漏问题解决方法 (转载) 》)
实验三
运行下面的代码模拟多线程读写并发,你会发现不定期触发 FileNotFoundException 异常。
/// <summary>
/// Experiment 3: five threads add documents through one IndexWriter opened on
/// a file-system path, while ten threads repeatedly open an IndexSearcher on
/// the same path string, search, print the hit count and close the searcher.
/// </summary>
private void Test3()
{
    const string indexPath = "y:\\index2";

    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = new IndexWriter(indexPath, analyzer, true);

    // All writer threads share the single IndexWriter created above.
    ThreadStart writeLoop = delegate()
    {
        for (;;)
        {
            Document entry = new Document();
            entry.Add(Field.Text("a", "Hello, World!"));
            writer.AddDocument(entry);
        }
    };
    for (int w = 0; w < 5; w++)
    {
        new Thread(writeLoop).Start();
    }

    // Running total of completed searches across all reader threads.
    int count = 0;
    ThreadStart searchLoop = delegate()
    {
        for (;;)
        {
            // The searcher is opened from the path string, not from a shared Directory object.
            IndexSearcher searcher = new IndexSearcher(indexPath);
            Query query = QueryParser.Parse("Hello", "a", analyzer);
            Hits hits = searcher.Search(query);
            Console.WriteLine("{0}: hits:{1}", Interlocked.Increment(ref count), hits.Length());
            searcher.Close();
        }
    };
    for (int r = 0; r < 10; r++)
    {
        new Thread(searchLoop).Start();
    }
}
// Body of the Experiment 3 listing, repeated here without its method header.
{
Analyzer analyzer = new StandardAnalyzer();
IndexWriter writer = new IndexWriter("y:\\index2", analyzer, true);
// Five writer threads, all adding documents through the single writer above.
for (int i = 0; i < 5; i++)
{
new Thread(delegate()
{
while (true)
{
Document doc = new Document();
doc.Add(Field.Text("a", "Hello, World!"));
writer.AddDocument(doc);
}
}).Start();
}
// Running total of completed searches across all reader threads.
int count = 0;
// Ten reader threads, each opening its searcher from the path string on every pass.
for (int i = 0; i < 10; i++)
{
new Thread(delegate()
{
while (true)
{
IndexSearcher searcher = new IndexSearcher("y:\\index2");
Query query = QueryParser.Parse("Hello", "a", analyzer);
Hits hits = searcher.Search(query);
Console.WriteLine("{0}: hits:{1}", Interlocked.Add(ref count, 1), hits.Length());
searcher.Close();
}
}).Start();
}
}
如果我们将读写改用同一个 Directory 对象,则可避免发生异常。
/// <summary>
/// Experiment 3, second variant: the IndexWriter and every IndexSearcher are
/// built from one Directory object obtained once from FSDirectory, rather
/// than from the path string.
/// </summary>
private void Test3()
{
    Analyzer analyzer = new StandardAnalyzer();

    // The single Directory instance handed to both the writer and the searchers.
    Directory directory = FSDirectory.GetDirectory("y:\\index2", true);
    IndexWriter writer = new IndexWriter(directory, analyzer, true);

    // Five writer threads, all adding documents through the same writer.
    ThreadStart writeLoop = delegate()
    {
        for (;;)
        {
            Document entry = new Document();
            entry.Add(Field.Text("a", "Hello, World!"));
            writer.AddDocument(entry);
        }
    };
    for (int w = 0; w < 5; w++)
    {
        new Thread(writeLoop).Start();
    }

    // Running total of completed searches across all reader threads.
    int count = 0;
    ThreadStart searchLoop = delegate()
    {
        for (;;)
        {
            IndexSearcher searcher = new IndexSearcher(directory);
            Query query = QueryParser.Parse("Hello", "a", analyzer);
            Hits hits = searcher.Search(query);
            Console.WriteLine("{0}: hits:{1}", Interlocked.Increment(ref count), hits.Length());
            searcher.Close();
        }
    };
    for (int r = 0; r < 10; r++)
    {
        new Thread(searchLoop).Start();
    }
}
// Body of the second Experiment 3 listing, repeated here without its method header.
{
Analyzer analyzer = new StandardAnalyzer();
// The single Directory instance handed to both the writer and the searchers.
Directory directory = FSDirectory.GetDirectory("y:\\index2", true);
IndexWriter writer = new IndexWriter(directory, analyzer, true);
// Five writer threads, all adding documents through the single writer above.
for (int i = 0; i < 5; i++)
{
new Thread(delegate()
{
while (true)
{
Document doc = new Document();
doc.Add(Field.Text("a", "Hello, World!"));
writer.AddDocument(doc);
}
}).Start();
}
// Running total of completed searches across all reader threads.
int count = 0;
// Ten reader threads, each building its searcher from the shared Directory object.
for (int i = 0; i < 10; i++)
{
new Thread(delegate()
{
while (true)
{
IndexSearcher searcher = new IndexSearcher(directory);
Query query = QueryParser.Parse("Hello", "a", analyzer);
Hits hits = searcher.Search(query);
Console.WriteLine("{0}: hits:{1}", Interlocked.Add(ref count, 1), hits.Length());
searcher.Close();
}
}).Start();
}
}
应用实践
1. 使用 Directory、Analyzer Singleton 对象。
2. 每次创建 IndexSearcher 新实例进行搜索。
3. 如果存在读写并发的情况,最好使用 Directory Singleton 对象,否则可能引发 FileNotFoundException 或 IOException 异常。
4. 对于只读型的搜索,可以使用 new IndexSearcher(FSDirectory.GetDirectory(...)) 创建新 Directory 对象来避免同步锁定,以提高性能。
5. 对于定期更新的应用,需要在几种方案之间进行协调才能获得最佳性能。怎么做就看你自己的了。