42、使用Lucene和机器学习进行全文搜索并排序
package com.lucene.test; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.joone.engine.FullSynapse; import org.joone.engine.LinearLayer; import org.joone.engine.Monitor; import org.joone.engine.NeuralNetEvent; import org.joone.engine.NeuralNetListener; import org.joone.engine.SigmoidLayer; import org.joone.engine.learning.TeachingSynapse; import org.joone.io.MemoryInputSynapse; import org.joone.io.MemoryOutputSynapse; import org.joone.net.NeuralNet; import org.junit.Test; import org.wltea.analyzer.lucene.IKAnalyzer; import com.lucene.domain.Article; public class TestLucene implements NeuralNetListener{ private NeuralNet nnet = null; private MemoryInputSynapse inputSynapse,desireOutputSynapse; LinearLayer input; SigmoidLayer hidden,output; boolean singleThreadMode = true; //XOR input private double[][] inputArray = new double[][]{ {0.0,0.0}, {0.0,1.0}, {1.0,0.0}, {1.0,1.0} }; //XOR desired output private double[][] desiredOutputArray = new double[][]{ {0.0}, {1.0}, {1.0}, {1.0} }; /** * 创建索引 * 
@throws Exception */ @Test public void testCreateIndex() throws Exception{ int fileNum = 1; List<String> contents = new ArrayList<String>(); InputStream inputStream = null; String value = null; File directory = new File("./20_newsgroups"); if(directory.isDirectory()){ File[] files = directory.listFiles(); for (int i = 0; i < 1; i++) { if(files[i].isDirectory()){ File[] subFiles = files[i].listFiles(); for (int j = 0; j < 10; j++) { inputStream = new BufferedInputStream(new FileInputStream(subFiles[j])); StringBuffer tempContent = new StringBuffer(); byte[] bytes = new byte[1024*10]; int len = 0; while((len = inputStream.read(bytes))!=-1){ tempContent = tempContent.append(new String(bytes)); } value = tempContent.toString(); System.out.println(value); inputStream.close(); Article article = new Article(fileNum,subFiles[j].getName(),tempContent.toString()); Directory saveDirectory = FSDirectory.open(Paths.get("./indexDir/")); //分词器 Analyzer analyzer = new WhitespaceAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); Document doc = new Document(); doc.add(new TextField("id", article.getId().toString(), Store.YES)); doc.add(new TextField("title", article.getTitle(), Store.YES)); doc.add(new TextField("content", article.getContent(), Store.YES)); IndexWriter indexWriter = new IndexWriter(saveDirectory,iwc); System.out.println("have already add file to fileDocment system"+fileNum); indexWriter.addDocument(doc); indexWriter.close();//释放资源 fileNum = fileNum+1; } } } } //1.将需要添加的实体构造成实体对象 Article article = new Article(1,"Lucene是全文检索框架", "全文检索(Full-Test Retrieval)是以文本作为检索对象,找出含有指定词汇的文本。"+ "全面,准确和快速是衡量全文检索系统的关键指标。"); //2,保存到数据库(此步骤暂时省略) //3、建立索引(lucene) //索引库目录 //将 Article 转换为Document //保存到索引库中 } /** * 测试搜索 * @throws IOException * @throws ParseException */ @Test public void testSearch() throws IOException, ParseException{ //1、搜索条件 String queryCondition = "philosophical"; //2、执行搜索(lucene) List<Article> articles = new ArrayList<Article>(); 
//----------搜索代码------------------------ Directory directory = FSDirectory.open(Paths.get("./indexDir/")); Analyzer analyzer = new WhitespaceAnalyzer();//创建分词器 //把查询字符串转换为Query对象(只在title中查询) QueryParser queryParser = new QueryParser("content",analyzer); Query query = queryParser.parse(queryCondition); //2执行搜索得到搜索结果 IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(indexReader); TopDocs topDocs = indexSearcher.search(query, 100); Integer count = topDocs.totalHits;//总结果数量 ScoreDoc[] scoreDocs = topDocs.scoreDocs;//返回前N条结果 //2.3处理结果 for (int i = 0; i < scoreDocs.length; i++) { ScoreDoc scoreDoc= scoreDocs[i]; int docId = scoreDoc.doc; System.out.println("得分是:"+scoreDoc.score+"内部编号是:"+docId); //根据内部编号取出真正的Document数据 Document doc = indexSearcher.doc(docId); //将document转化为Article Article article = new Article(Integer.parseInt(doc.get("id")),doc.get("title"),doc.get("content")); articles.add(article); } //------------------------------------------ //3、控制台显示结果 System.err.print("总结果数:"+count); for (Article article : articles) { System.out.println("查询结果:ID为:"+article.getId()+",title为:"+article.getTitle()); } indexSearcher.getIndexReader().close(); } @Test public void testNeuralNet(){ TestLucene testLucene = new TestLucene(); testLucene.initNeuralNet(); testLucene.train(); testLucene.interrogate(); } public void initNeuralNet(){ //First create the three layers input = new LinearLayer(); hidden = new SigmoidLayer(); output = new SigmoidLayer(); //set the dimensions of the layers input.setRows(2); hidden.setRows(3); output.setRows(1); input.setLayerName("L.input"); hidden.setLayerName("L.hidden"); output.setLayerName("L.output"); //Now create the two Synapses FullSynapse synapse_IH = new FullSynapse();//input -->hidden conn FullSynapse synapse_HO = new FullSynapse();//hidden -->output conn //Connect the input layer whit the hidden layer input.addOutputSynapse(synapse_IH); hidden.addInputSynapse(synapse_IH); //Connect the 
hidden layer whit the output layer hidden.addOutputSynapse(synapse_HO); output.addInputSynapse(synapse_HO); //the input to the neural net inputSynapse = new MemoryInputSynapse(); input.addInputSynapse(inputSynapse); //The Trainer and its desired output desireOutputSynapse = new MemoryInputSynapse(); TeachingSynapse trainer = new TeachingSynapse(); trainer.setDesired(desireOutputSynapse); //Now we add this structure to a NeuralNet object nnet = new NeuralNet(); nnet.addLayer(input,NeuralNet.INPUT_LAYER); nnet.addLayer(hidden,NeuralNet.HIDDEN_LAYER); nnet.addLayer(output, NeuralNet.OUTPUT_LAYER); nnet.setTeacher(trainer); output.addOutputSynapse(trainer); nnet.addNeuralNetListener(this); } public void train(){ //set the inputs inputSynapse.setInputArray(inputArray); inputSynapse.setAdvancedColumnSelector("1,2"); //set the desired outputs desireOutputSynapse.setInputArray(desiredOutputArray); desireOutputSynapse.setAdvancedColumnSelector("1"); //get the monitor object to train or feed forward Monitor monitor = nnet.getMonitor(); //set the monitor parameters monitor.setLearningRate(0.8); monitor.setMomentum(0.3); monitor.setTrainingPatterns(inputArray.length); monitor.setTotCicles(5000); monitor.setLearning(true); long initms = System.currentTimeMillis(); //Run the network in single-thread,synchronized mode nnet.getMonitor().setSingleThreadMode(singleThreadMode); nnet.go(true); System.out.println("Total time="+(System.currentTimeMillis()-initms)+"ms"); } public void interrogate(){ double[][] inputArray = new double[][]{ {0.0,1.0}, {1.0,0.0}, {1.0,1.0}, {0.0,0.0} }; //set the inputs inputSynapse.setInputArray(inputArray); inputSynapse.setAdvancedColumnSelector("1,2"); Monitor monitor = nnet.getMonitor(); monitor.setTrainingPatterns(4); monitor.setTotCicles(1); monitor.setLearning(false); MemoryOutputSynapse memOut = new MemoryOutputSynapse(); //set the output synapse to write the output of the net if(nnet != null){ nnet.addOutputSynapse(memOut); 
System.out.println(nnet.check()); nnet.getMonitor().setSingleThreadMode(singleThreadMode); nnet.go(); for (int i = 0; i < 4; i++) { double[] pattern = memOut.getNextPattern(); System.out.println("Output pattern #"+(i+1)+"="+pattern[0]); } System.out.println("Interrogating Finished"); } } public void cicleTerminated(NeuralNetEvent arg0) { } public void errorChanged(NeuralNetEvent e) { Monitor mon=(Monitor) e.getSource(); if(mon.getCurrentCicle()%100==0){ System.out.println("Epoch:"+(mon.getTotCicles()-mon.getCurrentCicle())+"RMSE:" +mon.getGlobalError()); } } public void netStarted(NeuralNetEvent e) { Monitor mon = (Monitor) e.getSource(); System.out.println("Network started for "); if(mon.isLearning()){ System.out.println("training"); }else{ System.out.println("interrogation."); } } public void netStopped(NeuralNetEvent e) { Monitor mon = (Monitor) e.getSource(); System.out.println("Network stopped . Last RMSE=" +mon.getGlobalError()); } public void netStoppedError(NeuralNetEvent e, String error) { System.out.println("Network stopped due the following error:" +error); } }
结果
得分是:0.25462872内部编号是:7840 得分是:0.24006625内部编号是:7841 查询结果:ID为:2,title为:51060总结果数:2 查询结果:ID为:1,title为:49960