使用SolrJ生成索引
这个例子使用两种方式来演示如何生成全量索引:
一个是从数据库中通过 SQL 查询生成全量索引;
一个是通过 Tika 解析文件生成全量索引。
1 package SolrJExample; 2 3 import org.apache.solr.client.solrj.SolrServerException; 4 import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer; 5 import org.apache.solr.client.solrj.impl.XMLResponseParser; 6 import org.apache.solr.client.solrj.response.UpdateResponse; 7 import org.apache.solr.common.SolrInputDocument; 8 import org.apache.tika.metadata.Metadata; 9 import org.apache.tika.parser.AutoDetectParser; 10 import org.apache.tika.parser.ParseContext; 11 import org.apache.tika.sax.BodyContentHandler; 12 import org.xml.sax.ContentHandler; 13 14 import java.io.File; 15 import java.io.FileInputStream; 16 import java.io.IOException; 17 import java.io.InputStream; 18 import java.sql.*; 19 import java.util.ArrayList; 20 import java.util.Collection; 21 22 /* Example class showing the skeleton of using Tika and 23 Sql on the client to index documents from 24 both structured documents and a SQL database. 25 26 NOTE: The SQL example and the Tika example are entirely orthogonal. 27 Both are included here to make a 28 more interesting example, but you can omit either of them. 29 30 */ 31 public class SqlTikaExample { 32 private StreamingUpdateSolrServer _server; 33 private long _start = System.currentTimeMillis(); 34 private AutoDetectParser _autoParser; 35 private int _totalTika = 0; 36 private int _totalSql = 0; 37 38 private Collection _docs = new ArrayList(); 39 40 public static void main(String[] args) { 41 try { 42 SqlTikaExample idxer = new SqlTikaExample("http://localhost:8983/solr"); 43 44 idxer.doTikaDocuments(new File("/Users/Erick/testdocs")); 45 idxer.doSqlDocuments(); 46 47 idxer.endIndexing(); 48 } catch (Exception e) { 49 e.printStackTrace(); 50 } 51 } 52 53 private SqlTikaExample(String url) throws IOException, SolrServerException { 54 // Create a multi-threaded communications channel to the Solr server. 55 // Could be CommonsHttpSolrServer as well. 
56 // 57 _server = new StreamingUpdateSolrServer(url, 10, 4); 58 59 _server.setSoTimeout(1000); // socket read timeout 60 _server.setConnectionTimeout(1000); 61 _server.setMaxRetries(1); // defaults to 0. > 1 not recommended. 62 // binary parser is used by default for responses 63 _server.setParser(new XMLResponseParser()); 64 65 // One of the ways Tika can be used to attempt to parse arbitrary files. 66 _autoParser = new AutoDetectParser(); 67 } 68 69 // Just a convenient place to wrap things up. 70 private void endIndexing() throws IOException, SolrServerException { 71 if (_docs.size() > 0) { // Are there any documents left over? 72 _server.add(_docs, 300000); // Commit within 5 minutes 73 } 74 _server.commit(); // Only needs to be done at the end, 75 // commitWithin should do the rest. 76 // Could even be omitted 77 // assuming commitWithin was specified. 78 long endTime = System.currentTimeMillis(); 79 log("Total Time Taken: " + (endTime - _start) + 80 " milliseconds to index " + _totalSql + 81 " SQL rows and " + _totalTika + " documents"); 82 } 83 84 // I hate writing System.out.println() everyplace, 85 // besides this gives a central place to convert to true logging 86 // in a production system. 87 private static void log(String msg) { 88 System.out.println(msg); 89 } 90 91 /** 92 * ***************************Tika processing here 93 */ 94 // Recursively traverse the filesystem, parsing everything found. 95 private void doTikaDocuments(File root) throws IOException, SolrServerException { 96 97 // Simple loop for recursively indexing all the files 98 // in the root directory passed in. 99 for (File file : root.listFiles()) { 100 if (file.isDirectory()) { 101 doTikaDocuments(file); 102 continue; 103 } 104 // Get ready to parse the file. 
105 ContentHandler textHandler = new BodyContentHandler(); 106 Metadata metadata = new Metadata(); 107 ParseContext context = new ParseContext(); 108 109 InputStream input = new FileInputStream(file); 110 111 // Try parsing the file. Note we haven't checked at all to 112 // see whether this file is a good candidate. 113 try { 114 _autoParser.parse(input, textHandler, metadata, context); 115 } catch (Exception e) { 116 // Needs better logging of what went wrong in order to 117 // track down "bad" documents. 118 log(String.format("File %s failed", file.getCanonicalPath())); 119 e.printStackTrace(); 120 continue; 121 } 122 // Just to show how much meta-data and what form it's in. 123 dumpMetadata(file.getCanonicalPath(), metadata); 124 125 // Index just a couple of the meta-data fields. 126 SolrInputDocument doc = new SolrInputDocument(); 127 128 doc.addField("id", file.getCanonicalPath()); 129 130 // Crude way to get known meta-data fields. 131 // Also possible to write a simple loop to examine all the 132 // metadata returned and selectively index it and/or 133 // just get a list of them. 134 // One can also use the LucidWorks field mapping to 135 // accomplish much the same thing. 136 String author = metadata.get("Author"); 137 138 if (author != null) { 139 doc.addField("author", author); 140 } 141 142 doc.addField("text", textHandler.toString()); 143 144 _docs.add(doc); 145 ++_totalTika; 146 147 // Completely arbitrary, just batch up more than one document 148 // for throughput! 149 if (_docs.size() >= 1000) { 150 // Commit within 5 minutes. 151 UpdateResponse resp = _server.add(_docs, 300000); 152 if (resp.getStatus() != 0) { 153 log("Some horrible error has occurred, status is: " + 154 resp.getStatus()); 155 } 156 _docs.clear(); 157 } 158 } 159 } 160 161 // Just to show all the metadata that's available. 
162 private void dumpMetadata(String fileName, Metadata metadata) { 163 log("Dumping metadata for file: " + fileName); 164 for (String name : metadata.names()) { 165 log(name + ":" + metadata.get(name)); 166 } 167 log("\n\n"); 168 } 169 170 /** 171 * ***************************SQL processing here 172 */ 173 private void doSqlDocuments() throws SQLException { 174 Connection con = null; 175 try { 176 Class.forName("com.mysql.jdbc.Driver").newInstance(); 177 log("Driver Loaded"); 178 179 con = DriverManager.getConnection("jdbc:mysql://192.168.1.103:3306/test?" 180 + "user=testuser&password=test123"); 181 182 Statement st = con.createStatement(); 183 ResultSet rs = st.executeQuery("select id,title,text from test"); 184 185 while (rs.next()) { 186 // DO NOT move this outside the while loop 187 // or be sure to call doc.clear() 188 SolrInputDocument doc = new SolrInputDocument(); 189 String id = rs.getString("id"); 190 String title = rs.getString("title"); 191 String text = rs.getString("text"); 192 193 doc.addField("id", id); 194 doc.addField("title", title); 195 doc.addField("text", text); 196 197 _docs.add(doc); 198 ++_totalSql; 199 200 // Completely arbitrary, just batch up more than one 201 // document for throughput! 202 if (_docs.size() > 1000) { 203 // Commit within 5 minutes. 204 UpdateResponse resp = _server.add(_docs, 300000); 205 if (resp.getStatus() != 0) { 206 log("Some horrible error has occurred, status is: " + 207 resp.getStatus()); 208 } 209 _docs.clear(); 210 } 211 } 212 } catch (Exception ex) { 213 ex.printStackTrace(); 214 } finally { 215 if (con != null) { 216 con.close(); 217 } 218 } 219 } 220 }