使用SolrJ生成索引

这个例子用两种方式演示如何使用 SolrJ 生成全量索引:
一种是通过 SQL 从数据库中读取数据来生成全量索引;
另一种是通过 Tika 解析文件来生成全量索引。

  1 package SolrJExample;
  2 
  3 import org.apache.solr.client.solrj.SolrServerException;
  4 import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
  5 import org.apache.solr.client.solrj.impl.XMLResponseParser;
  6 import org.apache.solr.client.solrj.response.UpdateResponse;
  7 import org.apache.solr.common.SolrInputDocument;
  8 import org.apache.tika.metadata.Metadata;
  9 import org.apache.tika.parser.AutoDetectParser;
 10 import org.apache.tika.parser.ParseContext;
 11 import org.apache.tika.sax.BodyContentHandler;
 12 import org.xml.sax.ContentHandler;
 13 
 14 import java.io.File;
 15 import java.io.FileInputStream;
 16 import java.io.IOException;
 17 import java.io.InputStream;
 18 import java.sql.*;
 19 import java.util.ArrayList;
 20 import java.util.Collection;
 21 
 22 /* Example class showing the skeleton of using Tika and
 23    Sql on the client to index documents from
 24    both structured documents and a SQL database.
 25 
 26    NOTE: The SQL example and the Tika example are entirely orthogonal.
 27    Both are included here to make a
 28    more interesting example, but you can omit either of them.
 29 
 30  */
 31 public class SqlTikaExample {
 32   private StreamingUpdateSolrServer _server;
 33   private long _start = System.currentTimeMillis();
 34   private AutoDetectParser _autoParser;
 35   private int _totalTika = 0;
 36   private int _totalSql = 0;
 37 
 38   private Collection _docs = new ArrayList();
 39 
 40   public static void main(String[] args) {
 41     try {
 42       SqlTikaExample idxer = new SqlTikaExample("http://localhost:8983/solr");
 43 
 44       idxer.doTikaDocuments(new File("/Users/Erick/testdocs"));
 45       idxer.doSqlDocuments();
 46 
 47       idxer.endIndexing();
 48     } catch (Exception e) {
 49       e.printStackTrace();
 50     }
 51   }
 52 
 53   private SqlTikaExample(String url) throws IOException, SolrServerException {
 54       // Create a multi-threaded communications channel to the Solr server.
 55       // Could be CommonsHttpSolrServer as well.
 56       //
 57     _server = new StreamingUpdateSolrServer(url, 10, 4);
 58 
 59     _server.setSoTimeout(1000);  // socket read timeout
 60     _server.setConnectionTimeout(1000);
 61     _server.setMaxRetries(1); // defaults to 0.  > 1 not recommended.
 62          // binary parser is used by default for responses
 63     _server.setParser(new XMLResponseParser()); 
 64 
 65       // One of the ways Tika can be used to attempt to parse arbitrary files.
 66     _autoParser = new AutoDetectParser();
 67   }
 68 
 69     // Just a convenient place to wrap things up.
 70   private void endIndexing() throws IOException, SolrServerException {
 71     if (_docs.size() > 0) { // Are there any documents left over?
 72       _server.add(_docs, 300000); // Commit within 5 minutes
 73     }
 74     _server.commit(); // Only needs to be done at the end,
 75                       // commitWithin should do the rest.
 76                       // Could even be omitted
 77                       // assuming commitWithin was specified.
 78     long endTime = System.currentTimeMillis();
 79     log("Total Time Taken: " + (endTime - _start) +
 80          " milliseconds to index " + _totalSql +
 81         " SQL rows and " + _totalTika + " documents");
 82   }
 83 
 84   // I hate writing System.out.println() everyplace,
 85   // besides this gives a central place to convert to true logging
 86   // in a production system.
 87   private static void log(String msg) {
 88     System.out.println(msg);
 89   }
 90 
 91   /**
 92    * ***************************Tika processing here
 93    */
 94   // Recursively traverse the filesystem, parsing everything found.
 95   private void doTikaDocuments(File root) throws IOException, SolrServerException {
 96 
 97     // Simple loop for recursively indexing all the files
 98     // in the root directory passed in.
 99     for (File file : root.listFiles()) {
100       if (file.isDirectory()) {
101         doTikaDocuments(file);
102         continue;
103       }
104         // Get ready to parse the file.
105       ContentHandler textHandler = new BodyContentHandler();
106       Metadata metadata = new Metadata();
107       ParseContext context = new ParseContext();
108 
109       InputStream input = new FileInputStream(file);
110 
111         // Try parsing the file. Note we haven't checked at all to
112         // see whether this file is a good candidate.
113       try {
114         _autoParser.parse(input, textHandler, metadata, context);
115       } catch (Exception e) {
116           // Needs better logging of what went wrong in order to
117           // track down "bad" documents.
118         log(String.format("File %s failed", file.getCanonicalPath()));
119         e.printStackTrace();
120         continue;
121       }
122       // Just to show how much meta-data and what form it's in.
123       dumpMetadata(file.getCanonicalPath(), metadata);
124 
125       // Index just a couple of the meta-data fields.
126       SolrInputDocument doc = new SolrInputDocument();
127 
128       doc.addField("id", file.getCanonicalPath());
129 
130       // Crude way to get known meta-data fields.
131       // Also possible to write a simple loop to examine all the
132       // metadata returned and selectively index it and/or
133       // just get a list of them.
134       // One can also use the LucidWorks field mapping to
135       // accomplish much the same thing.
136       String author = metadata.get("Author");
137 
138       if (author != null) {
139         doc.addField("author", author);
140       }
141 
142       doc.addField("text", textHandler.toString());
143 
144       _docs.add(doc);
145       ++_totalTika;
146 
147       // Completely arbitrary, just batch up more than one document
148       // for throughput!
149       if (_docs.size() >= 1000) {
150           // Commit within 5 minutes.
151         UpdateResponse resp = _server.add(_docs, 300000);
152         if (resp.getStatus() != 0) {
153           log("Some horrible error has occurred, status is: " +
154                   resp.getStatus());
155         }
156         _docs.clear();
157       }
158     }
159   }
160 
161     // Just to show all the metadata that's available.
162   private void dumpMetadata(String fileName, Metadata metadata) {
163     log("Dumping metadata for file: " + fileName);
164     for (String name : metadata.names()) {
165       log(name + ":" + metadata.get(name));
166     }
167     log("\n\n");
168   }
169 
170   /**
171    * ***************************SQL processing here
172    */
173   private void doSqlDocuments() throws SQLException {
174     Connection con = null;
175     try {
176       Class.forName("com.mysql.jdbc.Driver").newInstance();
177       log("Driver Loaded");
178 
179       con = DriverManager.getConnection("jdbc:mysql://192.168.1.103:3306/test?"
180                 + "user=testuser&password=test123");
181 
182       Statement st = con.createStatement();
183       ResultSet rs = st.executeQuery("select id,title,text from test");
184 
185       while (rs.next()) {
186         // DO NOT move this outside the while loop
187         // or be sure to call doc.clear()
188         SolrInputDocument doc = new SolrInputDocument(); 
189         String id = rs.getString("id");
190         String title = rs.getString("title");
191         String text = rs.getString("text");
192 
193         doc.addField("id", id);
194         doc.addField("title", title);
195         doc.addField("text", text);
196 
197         _docs.add(doc);
198         ++_totalSql;
199 
200         // Completely arbitrary, just batch up more than one
201         // document for throughput!
202         if (_docs.size() > 1000) {
203              // Commit within 5 minutes.
204           UpdateResponse resp = _server.add(_docs, 300000);
205           if (resp.getStatus() != 0) {
206             log("Some horrible error has occurred, status is: " +
207                   resp.getStatus());
208           }
209           _docs.clear();
210         }
211       }
212     } catch (Exception ex) {
213       ex.printStackTrace();
214     } finally {
215       if (con != null) {
216         con.close();
217       }
218     }
219   }
220 }

 

posted @ 2013-01-30 13:17  coolbing  阅读(490)  评论(0编辑  收藏  举报