Lucene学习笔记(二)
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import junit.framework.TestCase;
public class BaseIndexTestCase extends TestCase
{
    /** Keyword-style values, used as document ids (stored, indexed untokenized). */
    protected String[] keywords = {"1", "2"};
    /** Values stored for retrieval only — never indexed, so not searchable. */
    protected String[] unindexed = {"Netherlands", "Italy"};
    /** Values indexed for search only — never stored in the index. */
    protected String[] unstored = {"Amsterdam has lots of bridges", "Venice has lots of canals"};
    /** Values both stored and tokenized for full-text search. */
    protected String[] text = {"Amsterdam", "Venice"};
    /** Index directory, created fresh before each test by setUp(). */
    protected Directory dir;

    /**
     * Creates (or overwrites) a file-system index under the system temp
     * directory and fills it with one document per entry of the arrays above.
     */
    protected void setUp() throws IOException {
        String indexDir =
            System.getProperty("java.io.tmpdir", "tmp") +
            System.getProperty("file.separator") + "index-dir";
        // The 'true' flag wipes any index left over from a previous run.
        dir = FSDirectory.getDirectory(indexDir, true);
        addDocuments(dir);
    }

    /** Closes the directory opened in setUp() so the handle is not leaked. */
    protected void tearDown() throws IOException {
        if (dir != null) {
            dir.close();
            dir = null;
        }
    }

    /**
     * Populates the given directory with one document per entry of the
     * parallel arrays, then optimizes the index.  The writer is closed even
     * if indexing fails part-way through.
     */
    protected void addDocuments(Directory dir)
        throws IOException {
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(), true);
        try {
            writer.setUseCompoundFile(isCompound());
            for (int i = 0; i < keywords.length; i++)
            {
                Document doc = new Document();
                doc.add(new Field("id", keywords[i], Field.Store.YES, Field.Index.UN_TOKENIZED));
                doc.add(new Field("country", unindexed[i], Field.Store.YES, Field.Index.NO));
                doc.add(new Field("contents", unstored[i], Field.Store.NO, Field.Index.TOKENIZED));
                doc.add(new Field("city", text[i], Field.Store.YES, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
            writer.optimize();
        } finally {
            writer.close();
        }
    }

    /** Analyzer used for indexing; subclasses may override. */
    protected Analyzer getAnalyzer()
    {
        return new SimpleAnalyzer();
    }

    /** Whether the index uses the compound file format; subclasses may override. */
    protected boolean isCompound()
    {
        return true;
    }

    /** The writer reports exactly as many documents as setUp() indexed. */
    public void testIndexWriter() throws IOException
    {
        // 'false' appends to the existing index instead of overwriting it.
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(), false);
        assertEquals(keywords.length, writer.docCount());
        writer.close();
    }

    /** A reader sees the same document count, with no deletions pending. */
    public void testIndexReader() throws IOException
    {
        IndexReader reader = IndexReader.open(dir);
        assertEquals(keywords.length, reader.maxDoc());
        assertEquals(keywords.length, reader.numDocs());
        reader.close();
    }
}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import junit.framework.TestCase;
public class BaseIndexTestCase extends TestCase
{
    /** Keyword-style values, used as document ids (stored, indexed untokenized). */
    protected String[] keywords = {"1", "2"};
    /** Values stored for retrieval only — never indexed, so not searchable. */
    protected String[] unindexed = {"Netherlands", "Italy"};
    /** Values indexed for search only — never stored in the index. */
    protected String[] unstored = {"Amsterdam has lots of bridges", "Venice has lots of canals"};
    /** Values both stored and tokenized for full-text search. */
    protected String[] text = {"Amsterdam", "Venice"};
    /** Index directory, created fresh before each test by setUp(). */
    protected Directory dir;

    /**
     * Creates (or overwrites) a file-system index under the system temp
     * directory and fills it with one document per entry of the arrays above.
     */
    protected void setUp() throws IOException {
        String indexDir =
            System.getProperty("java.io.tmpdir", "tmp") +
            System.getProperty("file.separator") + "index-dir";
        // The 'true' flag wipes any index left over from a previous run.
        dir = FSDirectory.getDirectory(indexDir, true);
        addDocuments(dir);
    }

    /** Closes the directory opened in setUp() so the handle is not leaked. */
    protected void tearDown() throws IOException {
        if (dir != null) {
            dir.close();
            dir = null;
        }
    }

    /**
     * Populates the given directory with one document per entry of the
     * parallel arrays, then optimizes the index.  The writer is closed even
     * if indexing fails part-way through.
     */
    protected void addDocuments(Directory dir)
        throws IOException {
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(), true);
        try {
            writer.setUseCompoundFile(isCompound());
            for (int i = 0; i < keywords.length; i++)
            {
                Document doc = new Document();
                doc.add(new Field("id", keywords[i], Field.Store.YES, Field.Index.UN_TOKENIZED));
                doc.add(new Field("country", unindexed[i], Field.Store.YES, Field.Index.NO));
                doc.add(new Field("contents", unstored[i], Field.Store.NO, Field.Index.TOKENIZED));
                doc.add(new Field("city", text[i], Field.Store.YES, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
            writer.optimize();
        } finally {
            writer.close();
        }
    }

    /** Analyzer used for indexing; subclasses may override. */
    protected Analyzer getAnalyzer()
    {
        return new SimpleAnalyzer();
    }

    /** Whether the index uses the compound file format; subclasses may override. */
    protected boolean isCompound()
    {
        return true;
    }

    /** The writer reports exactly as many documents as setUp() indexed. */
    public void testIndexWriter() throws IOException
    {
        // 'false' appends to the existing index instead of overwriting it.
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(), false);
        assertEquals(keywords.length, writer.docCount());
        writer.close();
    }

    /** A reader sees the same document count, with no deletions pending. */
    public void testIndexReader() throws IOException
    {
        IndexReader reader = IndexReader.open(dir);
        assertEquals(keywords.length, reader.maxDoc());
        assertEquals(keywords.length, reader.numDocs());
        reader.close();
    }
}
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
public class DocumentDeleteTest extends BaseIndexTestCase
{
    /**
     * Deleting through an IndexReader before segments are merged only MARKS
     * the document deleted: numDocs drops but maxDoc keeps counting the
     * slot, and the deletion is persisted across a reader reopen.
     */
    public void testDeleteBeforeIndexMerge() throws IOException
    {
        assertEquals(1, getHitCount("city", "Amsterdam"));
        IndexReader reader = IndexReader.open(dir);
        assertEquals(2, reader.maxDoc());
        assertEquals(2, reader.numDocs());
        reader.deleteDocument(1);
        assertTrue(reader.isDeleted(1));
        assertTrue(reader.hasDeletions());
        // maxDoc still includes the deleted slot; numDocs does not.
        assertEquals(2, reader.maxDoc());
        assertEquals(1, reader.numDocs());
        reader.close();
        // Deletions were flushed on close: a fresh reader sees the same counts.
        reader = IndexReader.open(dir);
        assertEquals(2, reader.maxDoc());
        assertEquals(1, reader.numDocs());
        reader.close();
    }

    /**
     * Optimizing the index after a delete physically expunges the deleted
     * document: maxDoc shrinks and the deletion flags are cleared.
     */
    public void testDeleteAfterIndexMerge() throws IOException
    {
        IndexReader reader = IndexReader.open(dir);
        assertEquals(2, reader.maxDoc());
        assertEquals(2, reader.numDocs());
        reader.deleteDocument(1);
        reader.close();
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(), false);
        writer.optimize();
        writer.close();
        reader = IndexReader.open(dir);
        assertFalse(reader.isDeleted(1));
        assertFalse(reader.hasDeletions());
        // The merge reclaimed the deleted document's slot.
        assertEquals(1, reader.maxDoc());
        assertEquals(1, reader.numDocs());
        reader.close();
    }

    /**
     * Counts hits for a single-term query against the test index.  The
     * searcher is closed even if the search throws.
     */
    private int getHitCount(String fieldName, String searchString)
        throws IOException {
        IndexSearcher searcher = new IndexSearcher(dir);
        try {
            Term t = new Term(fieldName, searchString);
            Query query = new TermQuery(t);
            Hits hits = searcher.search(query);
            return hits.length();
        } finally {
            searcher.close();
        }
    }

    /** Whitespace-only tokenization keeps indexed terms predictable. */
    protected Analyzer getAnalyzer() {
        return new WhitespaceAnalyzer();
    }
}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
public class DocumentDeleteTest extends BaseIndexTestCase
{
    /**
     * Deleting through an IndexReader before segments are merged only MARKS
     * the document deleted: numDocs drops but maxDoc keeps counting the
     * slot, and the deletion is persisted across a reader reopen.
     */
    public void testDeleteBeforeIndexMerge() throws IOException
    {
        assertEquals(1, getHitCount("city", "Amsterdam"));
        IndexReader reader = IndexReader.open(dir);
        assertEquals(2, reader.maxDoc());
        assertEquals(2, reader.numDocs());
        reader.deleteDocument(1);
        assertTrue(reader.isDeleted(1));
        assertTrue(reader.hasDeletions());
        // maxDoc still includes the deleted slot; numDocs does not.
        assertEquals(2, reader.maxDoc());
        assertEquals(1, reader.numDocs());
        reader.close();
        // Deletions were flushed on close: a fresh reader sees the same counts.
        reader = IndexReader.open(dir);
        assertEquals(2, reader.maxDoc());
        assertEquals(1, reader.numDocs());
        reader.close();
    }

    /**
     * Optimizing the index after a delete physically expunges the deleted
     * document: maxDoc shrinks and the deletion flags are cleared.
     */
    public void testDeleteAfterIndexMerge() throws IOException
    {
        IndexReader reader = IndexReader.open(dir);
        assertEquals(2, reader.maxDoc());
        assertEquals(2, reader.numDocs());
        reader.deleteDocument(1);
        reader.close();
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(), false);
        writer.optimize();
        writer.close();
        reader = IndexReader.open(dir);
        assertFalse(reader.isDeleted(1));
        assertFalse(reader.hasDeletions());
        // The merge reclaimed the deleted document's slot.
        assertEquals(1, reader.maxDoc());
        assertEquals(1, reader.numDocs());
        reader.close();
    }

    /**
     * Counts hits for a single-term query against the test index.  The
     * searcher is closed even if the search throws.
     */
    private int getHitCount(String fieldName, String searchString)
        throws IOException {
        IndexSearcher searcher = new IndexSearcher(dir);
        try {
            Term t = new Term(fieldName, searchString);
            Query query = new TermQuery(t);
            Hits hits = searcher.search(query);
            return hits.length();
        } finally {
            searcher.close();
        }
    }

    /** Whitespace-only tokenization keeps indexed terms predictable. */
    protected Analyzer getAnalyzer() {
        return new WhitespaceAnalyzer();
    }
}
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
public class DocumentUpdateTest extends BaseIndexTestCase
{
    /**
     * Demonstrates that this Lucene version has no in-place update: an
     * "update" is a delete through an IndexReader followed by re-adding the
     * replacement document through an IndexWriter.
     */
    public void testUpdate() throws IOException
    {
        assertEquals(1, getHitCount("city", "Amsterdam"));
        // Step 1: delete every document whose "city" field has term "Amsterdam".
        IndexReader reader = IndexReader.open(dir);
        reader.deleteDocuments(new Term("city", "Amsterdam"));
        reader.close();
        // Step 2: re-add the replacement document under the same id.
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(),
            false);
        try {
            Document doc = new Document();
            doc.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("country", "Russia", Field.Store.YES, Field.Index.NO));
            doc.add(new Field("contents", "St. Petersburg has lots of bridges", Field.Store.NO, Field.Index.TOKENIZED));
            doc.add(new Field("city", "St. Petersburg", Field.Store.YES, Field.Index.TOKENIZED));
            writer.addDocument(doc);
            writer.optimize();
        } finally {
            writer.close();
        }
        // The old term is gone and the new city is searchable by its terms.
        assertEquals(0, getHitCount("city", "Amsterdam"));
        assertEquals(1, getHitCount("city", "Petersburg"));
    }

    /** Whitespace-only tokenization, so "St. Petersburg" yields the term "Petersburg". */
    protected Analyzer getAnalyzer() {
        return new WhitespaceAnalyzer();
    }

    /**
     * Counts hits for a single-term query against the test index.  The
     * searcher is closed even if the search throws.
     */
    private int getHitCount(String fieldName, String searchString)
        throws IOException {
        IndexSearcher searcher = new IndexSearcher(dir);
        try {
            Term t = new Term(fieldName, searchString);
            Query query = new TermQuery(t);
            Hits hits = searcher.search(query);
            return hits.length();
        } finally {
            searcher.close();
        }
    }
}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
public class DocumentUpdateTest extends BaseIndexTestCase
{
    /**
     * Demonstrates that this Lucene version has no in-place update: an
     * "update" is a delete through an IndexReader followed by re-adding the
     * replacement document through an IndexWriter.
     */
    public void testUpdate() throws IOException
    {
        assertEquals(1, getHitCount("city", "Amsterdam"));
        // Step 1: delete every document whose "city" field has term "Amsterdam".
        IndexReader reader = IndexReader.open(dir);
        reader.deleteDocuments(new Term("city", "Amsterdam"));
        reader.close();
        // Step 2: re-add the replacement document under the same id.
        IndexWriter writer = new IndexWriter(dir, getAnalyzer(),
            false);
        try {
            Document doc = new Document();
            doc.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.add(new Field("country", "Russia", Field.Store.YES, Field.Index.NO));
            doc.add(new Field("contents", "St. Petersburg has lots of bridges", Field.Store.NO, Field.Index.TOKENIZED));
            doc.add(new Field("city", "St. Petersburg", Field.Store.YES, Field.Index.TOKENIZED));
            writer.addDocument(doc);
            writer.optimize();
        } finally {
            writer.close();
        }
        // The old term is gone and the new city is searchable by its terms.
        assertEquals(0, getHitCount("city", "Amsterdam"));
        assertEquals(1, getHitCount("city", "Petersburg"));
    }

    /** Whitespace-only tokenization, so "St. Petersburg" yields the term "Petersburg". */
    protected Analyzer getAnalyzer() {
        return new WhitespaceAnalyzer();
    }

    /**
     * Counts hits for a single-term query against the test index.  The
     * searcher is closed even if the search throws.
     */
    private int getHitCount(String fieldName, String searchString)
        throws IOException {
        IndexSearcher searcher = new IndexSearcher(dir);
        try {
            Term t = new Term(fieldName, searchString);
            Query query = new TermQuery(t);
            Hits hits = searcher.search(query);
            return hits.length();
        } finally {
            searcher.close();
        }
    }
}
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class IndexTuningDemo
{
    /**
     * Indexes a number of identical single-field documents while printing the
     * merge-related IndexWriter settings and the total elapsed time, so the
     * effect of the tuning knobs can be compared between runs.
     *
     * Usage: IndexTuningDemo &lt;numDocs&gt; &lt;mergeFactor&gt; &lt;maxMergeDocs&gt; &lt;maxBufferedDocs&gt;
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 4) {
            System.err.println(
                "Usage: IndexTuningDemo <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>");
            System.exit(1);
        }
        int docsInIndex = Integer.parseInt(args[0]);
        // create an index called 'index-dir' in a temp directory
        Directory dir = FSDirectory.getDirectory(
            System.getProperty("java.io.tmpdir", "tmp") +
            System.getProperty("file.separator") + "index-dir", true);
        Analyzer analyzer = new SimpleAnalyzer();
        IndexWriter writer = new IndexWriter(dir, analyzer, true);
        // set variables that affect speed of indexing
        writer.setMergeFactor(Integer.parseInt(args[1]));
        writer.setMaxMergeDocs(Integer.parseInt(args[2]));
        writer.setInfoStream(System.out);   // verbose merge diagnostics to stdout
        writer.setMaxBufferedDocs(Integer.parseInt(args[3]));
        System.out.println("Merge factor: " + writer.getMergeFactor());
        System.out.println("Max merge docs: " + writer.getMaxMergeDocs());
        System.out.println("Min merge docs: " + writer.getMaxBufferedDocs());
        long start = System.currentTimeMillis();
        try {
            for (int i = 0; i < docsInIndex; i++) {
                Document doc = new Document();
                doc.add(new Field("fieldname", "Bibamus",
                    Field.Store.YES, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
        } finally {
            // Timing includes close(): pending buffered docs are flushed here.
            writer.close();
        }
        long stop = System.currentTimeMillis();
        System.out.println("Time: " + (stop - start) + " ms");
    }
}
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class IndexTuningDemo
{
    /**
     * Indexes a number of identical single-field documents while printing the
     * merge-related IndexWriter settings and the total elapsed time, so the
     * effect of the tuning knobs can be compared between runs.
     *
     * Usage: IndexTuningDemo &lt;numDocs&gt; &lt;mergeFactor&gt; &lt;maxMergeDocs&gt; &lt;maxBufferedDocs&gt;
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 4) {
            System.err.println(
                "Usage: IndexTuningDemo <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>");
            System.exit(1);
        }
        int docsInIndex = Integer.parseInt(args[0]);
        // create an index called 'index-dir' in a temp directory
        Directory dir = FSDirectory.getDirectory(
            System.getProperty("java.io.tmpdir", "tmp") +
            System.getProperty("file.separator") + "index-dir", true);
        Analyzer analyzer = new SimpleAnalyzer();
        IndexWriter writer = new IndexWriter(dir, analyzer, true);
        // set variables that affect speed of indexing
        writer.setMergeFactor(Integer.parseInt(args[1]));
        writer.setMaxMergeDocs(Integer.parseInt(args[2]));
        writer.setInfoStream(System.out);   // verbose merge diagnostics to stdout
        writer.setMaxBufferedDocs(Integer.parseInt(args[3]));
        System.out.println("Merge factor: " + writer.getMergeFactor());
        System.out.println("Max merge docs: " + writer.getMaxMergeDocs());
        System.out.println("Min merge docs: " + writer.getMaxBufferedDocs());
        long start = System.currentTimeMillis();
        try {
            for (int i = 0; i < docsInIndex; i++) {
                Document doc = new Document();
                doc.add(new Field("fieldname", "Bibamus",
                    Field.Store.YES, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
        } finally {
            // Timing includes close(): pending buffered docs are flushed here.
            writer.close();
        }
        long stop = System.currentTimeMillis();
        System.out.println("Time: " + (stop - start) + " ms");
    }
}
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import junit.framework.TestCase;
import java.io.IOException;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Iterator;
public class FSversusRAMDirectoryTest extends TestCase
{
    private Directory fsDir;
    private Directory ramDir;
    // Test corpus, built once per instance: 3000 documents of 5 words each.
    private Collection docs = loadDocuments(3000, 5);

    /** Opens an in-memory directory and a fresh file-system directory. */
    protected void setUp() throws Exception
    {
        String fsIndexDir = System.getProperty("java.io.tmpdir", "tmp") + System.getProperty("file.separator") + "fs-index";
        ramDir = new RAMDirectory();   // directory held entirely in memory
        fsDir = FSDirectory.getDirectory(fsIndexDir, true);
    }

    /** Closes both directories so their handles are not leaked. */
    protected void tearDown() throws Exception
    {
        if (ramDir != null) {
            ramDir.close();
        }
        if (fsDir != null) {
            fsDir.close();
        }
    }

    /**
     * Indexes the same documents into RAM and onto disk and compares timings.
     * NOTE(review): asserting fsTiming &gt; ramTiming is inherently timing-
     * dependent (OS caching can make the disk run comparably fast); kept
     * as-is to preserve the original demo's behavior.
     */
    public void testTiming() throws IOException
    {
        long ramTiming = timeIndexWriter(ramDir);
        long fsTiming = timeIndexWriter(fsDir);
        assertTrue(fsTiming > ramTiming);
        System.out.println("RAMDirectory Time: " + (ramTiming) + " ms");
        System.out.println("FSDirectory Time : " + (fsTiming) + " ms");
    }

    /** Returns the wall-clock milliseconds taken to index all docs into dir. */
    private long timeIndexWriter(Directory dir) throws IOException
    {
        long start = System.currentTimeMillis();
        addDocuments(dir);
        long stop = System.currentTimeMillis();
        return (stop - start);
    }

    /**
     * Indexes every pre-built document into the given directory, optimizing
     * at the end.  The writer is closed even if indexing throws.
     */
    private void addDocuments(Directory dir) throws IOException
    {
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
        try {
            for (Iterator iter = docs.iterator(); iter.hasNext();)
            {
                String word = (String) iter.next();
                Document doc = new Document();
                doc.add(new Field("keyword", word, Field.Store.YES, Field.Index.UN_TOKENIZED));
                doc.add(new Field("unindexed", word, Field.Store.YES, Field.Index.NO));
                doc.add(new Field("unstored", word, Field.Store.NO, Field.Index.TOKENIZED));
                doc.add(new Field("text", word, Field.Store.YES, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
            writer.optimize();
        } finally {
            writer.close();
        }
    }

    /**
     * Builds numDocs strings, each consisting of wordsPerDoc repetitions of
     * the word "Bibamus".
     */
    private Collection loadDocuments(int numDocs, int wordsPerDoc)
    {
        // Pre-size for the full character length, not just the word count.
        int docLength = "Bibamus ".length() * wordsPerDoc;
        Collection docs = new ArrayList(numDocs);
        for (int i = 0; i < numDocs; i++)
        {
            StringBuffer doc = new StringBuffer(docLength);
            for (int j = 0; j < wordsPerDoc; j++)
            {
                doc.append("Bibamus ");
            }
            docs.add(doc.toString());
        }
        return docs;
    }
}
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import junit.framework.TestCase;
import java.io.IOException;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Iterator;
public class FSversusRAMDirectoryTest extends TestCase
{
    private Directory fsDir;
    private Directory ramDir;
    // Test corpus, built once per instance: 3000 documents of 5 words each.
    private Collection docs = loadDocuments(3000, 5);

    /** Opens an in-memory directory and a fresh file-system directory. */
    protected void setUp() throws Exception
    {
        String fsIndexDir = System.getProperty("java.io.tmpdir", "tmp") + System.getProperty("file.separator") + "fs-index";
        ramDir = new RAMDirectory();   // directory held entirely in memory
        fsDir = FSDirectory.getDirectory(fsIndexDir, true);
    }

    /** Closes both directories so their handles are not leaked. */
    protected void tearDown() throws Exception
    {
        if (ramDir != null) {
            ramDir.close();
        }
        if (fsDir != null) {
            fsDir.close();
        }
    }

    /**
     * Indexes the same documents into RAM and onto disk and compares timings.
     * NOTE(review): asserting fsTiming &gt; ramTiming is inherently timing-
     * dependent (OS caching can make the disk run comparably fast); kept
     * as-is to preserve the original demo's behavior.
     */
    public void testTiming() throws IOException
    {
        long ramTiming = timeIndexWriter(ramDir);
        long fsTiming = timeIndexWriter(fsDir);
        assertTrue(fsTiming > ramTiming);
        System.out.println("RAMDirectory Time: " + (ramTiming) + " ms");
        System.out.println("FSDirectory Time : " + (fsTiming) + " ms");
    }

    /** Returns the wall-clock milliseconds taken to index all docs into dir. */
    private long timeIndexWriter(Directory dir) throws IOException
    {
        long start = System.currentTimeMillis();
        addDocuments(dir);
        long stop = System.currentTimeMillis();
        return (stop - start);
    }

    /**
     * Indexes every pre-built document into the given directory, optimizing
     * at the end.  The writer is closed even if indexing throws.
     */
    private void addDocuments(Directory dir) throws IOException
    {
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
        try {
            for (Iterator iter = docs.iterator(); iter.hasNext();)
            {
                String word = (String) iter.next();
                Document doc = new Document();
                doc.add(new Field("keyword", word, Field.Store.YES, Field.Index.UN_TOKENIZED));
                doc.add(new Field("unindexed", word, Field.Store.YES, Field.Index.NO));
                doc.add(new Field("unstored", word, Field.Store.NO, Field.Index.TOKENIZED));
                doc.add(new Field("text", word, Field.Store.YES, Field.Index.TOKENIZED));
                writer.addDocument(doc);
            }
            writer.optimize();
        } finally {
            writer.close();
        }
    }

    /**
     * Builds numDocs strings, each consisting of wordsPerDoc repetitions of
     * the word "Bibamus".
     */
    private Collection loadDocuments(int numDocs, int wordsPerDoc)
    {
        // Pre-size for the full character length, not just the word count.
        int docLength = "Bibamus ".length() * wordsPerDoc;
        Collection docs = new ArrayList(numDocs);
        for (int i = 0; i < numDocs; i++)
        {
            StringBuffer doc = new StringBuffer(docLength);
            for (int j = 0; j < wordsPerDoc; j++)
            {
                doc.append("Bibamus ");
            }
            docs.add(doc.toString());
        }
        return docs;
    }
}
作者:洞庭散人
出处:http://phinecos.cnblogs.com/
本博客遵从Creative Commons Attribution 3.0 License,若用于非商业目的,您可以自由转载,但请保留原作者信息和文章链接URL。
posted on 2007-08-29 15:55 Phinecos(洞庭散人) 阅读(2234) 评论(0) 编辑 收藏 举报