倒排索引的 JAVA 简单实现
下面是倒排索引的一个简单 Java 实现。它只是一个玩具,代码比较粗糙,仅用于演示基本原理:
/**
 * A toy inverted index.
 *
 * <p>For every keyword it remembers the files the keyword was found in (each file
 * listed once, in first-seen order) and how many times the keyword was indexed in
 * total. Files are decoded as GBK and split into keywords on spaces and a fixed set
 * of ASCII / Chinese punctuation characters.
 *
 * <p>Only {@link #createIndexByKeyword(String, String)} is synchronized; the read
 * methods ({@link #search(String)}, {@link #toString()}) take no lock.
 */
public class IntertedIndex {

    /**
     * Characters that separate keywords inside a line: space, CR, LF,
     * {@code , : " ( ) ' - . ! ?}, the curly quotes U+201C/U+201D/U+2018/U+2019 and the
     * ideographic full stop U+3002.
     *
     * <p>Unicode escapes keep this file ASCII-only so it compiles under any javac
     * {@code -encoding}. No letter may appear in this list: a stray {@code n} here
     * would split every word that contains an 'n'.
     */
    private static final String DELIMITERS =
            " \r\n,:\"()'-.!?\u201C\u201D\u2018\u2019\u3002";

    /** Tokens longer than this are skipped instead of being indexed. */
    private static final int MAX_KEYWORD_LENGTH = 100;

    /** Folder indexed by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_FOLDER = "C:\\Users\\Administrator\\Desktop\\logs";

    /** Inverted index: keyword -> files containing it. */
    private final Map<String, List<String>> indexMap;

    /** Keyword -> number of times the keyword has been indexed. */
    private final Map<String, Integer> keywordNums;

    IntertedIndex() {
        this.indexMap = new HashMap<String, List<String>>();
        this.keywordNums = new HashMap<String, Integer>();
    }

    /**
     * Indexes a folder, then answers keyword queries read from standard input until
     * the input ends.
     *
     * @param args optional; {@code args[0]} is the folder to index, otherwise
     *             {@link #DEFAULT_FOLDER} is used
     * @throws Exception if the folder cannot be indexed
     */
    public static void main(String[] args) throws Exception {
        String folder = (args != null && args.length > 0) ? args[0] : DEFAULT_FOLDER;

        IntertedIndex intertedIndex = new IntertedIndex();
        intertedIndex.createIndexByFolder(folder);

        try (java.util.Scanner in = new java.util.Scanner(System.in)) {
            // Scanner.next() never returns null; it throws at end of input, so the
            // loop has to be driven by hasNext().
            while (in.hasNext()) {
                String query = in.next();
                // Prompt text is Chinese for "query keyword :".
                System.out.println("\u67E5\u8BE2\u5173\u952E\u5B57 :" + query);
                try {
                    intertedIndex.search(query);
                } catch (RuntimeException e) {
                    // An unknown keyword is an expected outcome of a query, not a crash.
                    System.err.println(e.getMessage());
                }
            }
        }
    }

    /**
     * Prints the keyword's total count and every file it occurs in to standard output.
     *
     * @param keyword the keyword to look up
     * @throws RuntimeException if the keyword has never been indexed
     */
    public void search(String keyword) {
        List<String> fileList = this.indexMap.get(keyword);
        if (fileList == null) {
            throw new RuntimeException(" keyword : " + keyword + " is not in map ! ");
        }
        System.out.println(">>>>>> " + keyword + " ( " + this.keywordNums.get(keyword) + " ) ");
        for (String file : fileList) {
            System.out.println(" >>> " + file);
        }
    }

    /**
     * Indexes every regular file directly inside the given folder (not recursive).
     *
     * @param folderName path of the folder to index
     * @throws IllegalArgumentException if the path is not a listable directory
     * @throws Exception if one of the files cannot be read
     */
    public void createIndexByFolder(String folderName) throws Exception {
        File folder = new File(folderName);
        File[] files = folder.listFiles();
        if (files == null) {
            // listFiles() returns null for a missing path, a non-directory or an I/O error.
            throw new IllegalArgumentException("not a readable directory : " + folderName);
        }
        for (File file : files) {
            if (!file.isFile()) {
                // Sub-directories cannot be opened as a stream; skip them.
                continue;
            }
            String filePath = file.getAbsolutePath();
            System.out.println("deal file : " + filePath);
            this.createIndexByFile(filePath);
        }
    }

    /**
     * Reads one GBK-encoded text file line by line and indexes every keyword in it.
     *
     * @param fileName path of the file to index; also stored as the file's identity
     *                 in the index
     * @throws Exception if the file cannot be opened or read
     */
    public void createIndexByFile(String fileName) throws Exception {
        // Resolve the charset before opening the file so a lookup failure cannot
        // leave a stream open.
        Charset gbk = Charset.forName("GBK");
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), gbk))) {
            String row;
            while ((row = reader.readLine()) != null) {
                // Tokenize the line.
                StringTokenizer tokens = new StringTokenizer(row, DELIMITERS);
                while (tokens.hasMoreTokens()) {
                    String keyword = tokens.nextToken();
                    if (keyword.length() > MAX_KEYWORD_LENGTH) {
                        continue;
                    }
                    this.createIndexByKeyword(keyword, fileName);
                }
            }
        }
    }

    /**
     * Records one occurrence of {@code keyword} in {@code fileName}: the file is added
     * to the keyword's file list if not already present, and the keyword's count is
     * incremented by one.
     *
     * @param keyword  the keyword that was seen
     * @param fileName the file it was seen in
     */
    public synchronized void createIndexByKeyword(String keyword, String fileName) {
        System.out.println("deal keyword : " + keyword);
        List<String> fileNameList =
                indexMap.computeIfAbsent(keyword, k -> new ArrayList<String>());
        if (!fileNameList.contains(fileName)) {
            fileNameList.add(fileName);
        }
        keywordNums.merge(keyword, 1, Integer::sum);
    }

    /**
     * Renders the whole index: one header line per keyword with its count, followed
     * by one line per file, showing only the file's last path segment.
     */
    @Override
    public String toString() {
        String newline = System.lineSeparator();
        StringBuilder res = new StringBuilder();
        for (Map.Entry<String, List<String>> entry : this.indexMap.entrySet()) {
            String keyword = entry.getKey();
            res.append(">>>>>> ").append(keyword)
               .append(" (").append(this.keywordNums.get(keyword)).append(" ) ")
               .append(" <<<<<<")
               .append(newline);
            for (String filePath : entry.getValue()) {
                res.append(" >>> ").append(baseName(filePath)).append(newline);
            }
        }
        return res.toString();
    }

    /** Returns the text after the last '/' or '\\' in {@code path}, or the path itself. */
    private static String baseName(String path) {
        int cut = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        return cut < 0 ? path : path.substring(cut + 1);
    }
}
当你看清人们的真相,于是你知道了,你可以忍受孤独