A Java implementation of TF-IDF
One open issue remains: the word-segmentation library the code relies on.
It is based on this post:
http://www.cnblogs.com/ywl925/archive/2013/08/26/3275878.html
The code follows (the original copy lives on my old Linux machine).
The first class does two things: (1) lists all file names under a directory, and (2) reads the content of a given file.
package com.bobo.paper.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

public class FileUtil {

    // Shared list of all file paths found so far; note that repeated calls
    // to readDirs keep appending to it.
    public static ArrayList<String> FileList = new ArrayList<String>();

    /**
     * Lists all files under a directory and its subdirectories.
     * @param filepath directory path
     * @return the absolute paths of all files under that directory tree
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("The input is not a directory:");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else {
                String[] flist = file.list();
                for (int i = 0; i < flist.length; i++) {
                    File newfile = new File(filepath + "/" + flist[i]);
                    if (!newfile.isDirectory()) {
                        FileList.add(newfile.getAbsolutePath());
                    } else {
                        // if the entry is a directory, recurse into it
                        readDirs(filepath + "/" + flist[i]);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e.getMessage());
        }
        return FileList;
    }

    /**
     * Reads a file and returns its content as a single string,
     * with lines joined by \r\n. The file is assumed to be GBK-encoded.
     * @param file the file to read
     * @return the file content as one string
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuffer strSb = new StringBuffer();
        // wrap the byte stream into a character stream using GBK
        InputStreamReader inStrR = new InputStreamReader(new FileInputStream(file), "gbk");
        BufferedReader br = new BufferedReader(inStrR);
        try {
            String line = br.readLine();
            while (line != null) {
                strSb.append(line).append("\r\n");
                line = br.readLine();
            }
        } finally {
            br.close();
        }
        return strSb.toString();
    }
}
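A minimal usage sketch for FileUtil; the directory /tmp/corpus is only a placeholder:

import java.util.List;
import com.bobo.paper.util.FileUtil;

public class FileUtilDemo {
    public static void main(String[] args) throws Exception {
        // /tmp/corpus is a hypothetical directory of text files
        List<String> files = FileUtil.readDirs("/tmp/corpus");
        for (String f : files) {
            // print each path with the character count of its content
            System.out.println(f + ": " + FileUtil.readFile(f).length() + " chars");
        }
    }
}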
The next class handles word segmentation. The tokenization here is written against IK Analyzer's IKSegmenter core API (an assumption on my part: this targets IK Analyzer 2012, whose core classes do not need the extra Lucene jar mentioned above).
package com.bobo.paper.util;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class CutWordsUtil {

    /**
     * Segments the content of a file into words.
     * Assumption: uses IKSegmenter/Lexeme from the IK Analyzer 2012 core API
     * (org.wltea.analyzer.core), which avoids the further Lucene dependency
     * that org.wltea.analyzer.lucene.IKAnalyzer pulls in.
     * @param file the file whose content should be segmented
     * @return the list of tokens in reading order
     * @throws IOException
     */
    public static ArrayList<String> cutWords(String file) throws IOException {
        ArrayList<String> words = new ArrayList<String>();
        String text = FileUtil.readFile(file);
        // true = "smart" segmentation; false would give the finest-grained split
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            words.add(lexeme.getLexemeText());
        }
        return words;
    }
}
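A quick design note: org.wltea.analyzer.lucene.IKAnalyzer extends Lucene's Analyzer, so calling it directly drags in lucene-core as a further dependency, which is presumably the missing jar complained about earlier; the IKSegmenter core API stays within the IK jar itself. A minimal check, with a placeholder path:

ArrayList<String> words = CutWordsUtil.cutWords("/tmp/corpus/doc1.txt"); // hypothetical file
System.out.println(words);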
The next class implements the TF-IDF algorithm itself.
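For reference, the quantities computed below are: tf(t, d) = count(t, d) / |d|, the number of occurrences of t in document d divided by the total token count of d; idf(t) = log(|D| / |{j : t ∈ d_j}|), where |D| is the number of documents and the denominator counts the documents containing t; and tf-idf(t, d) = tf(t, d) * idf(t). Java's Math.log, used below, is the natural logarithm.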
package com.bobo.paper.athology;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import com.bobo.paper.util.CutWordsUtil;
import com.bobo.paper.util.FileUtil;

public class TfIdfAthology {

    /**
     * Counts raw occurrences of each word in a token list.
     * @param cutwords token list produced by word segmentation
     * @return a map from word to its occurrence count
     */
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
            } else {
                resTF.put(word, resTF.get(word) + 1);
            }
        }
        return resTF;
    }

    /**
     * Computes term frequency: tf(t, d) = count(t, d) / |d|.
     * @param cutwords token list for one document
     * @return a map from word to its frequency within the document
     */
    public static HashMap<String, Float> tf(ArrayList<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = normalTF(cutwords);
        Iterator iter = intTF.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry entry = (Map.Entry) iter.next();
            resTF.put(entry.getKey().toString(), Float.parseFloat(entry.getValue().toString()) / wordLen);
        }
        return resTF;
    }

    /**
     * Segments every file under a directory and collects raw counts.
     * @param dirc directory name
     * @return outer key: file name; inner key: word; value: occurrence count
     * @throws IOException
     */
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String, Integer>>();
        List<String> filelist = FileUtil.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = CutWordsUtil.cutWords(file); // tokens for one file
            allNormalTF.put(file, normalTF(cutwords));
        }
        return allNormalTF;
    }

    /**
     * Computes tf values for every file under a directory.
     * @param dirc directory name
     * @return outer key: file name; inner key: word; value: tf in that file
     * @throws IOException
     */
    public static HashMap<String, HashMap<String, Float>> tfAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
        List<String> filelist = FileUtil.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = CutWordsUtil.cutWords(file);
            allTF.put(file, tf(cutwords));
        }
        return allTF;
    }

    /**
     * Computes idf for every word: idf(t) = log(|D| / |{j : t in d_j}|),
     * where |D| is the number of documents in the corpus and the denominator
     * is the number of documents containing t. When a term may be absent from
     * the corpus, the denominator can be zero, so the smoothed form
     * log(|D| / (1 + |{j : t in d_j}|)) is commonly used; here the counts come
     * from the corpus itself, so the denominator is always at least 1.
     * @param all_tf outer key: file name; inner key: word; value: tf
     */
    public static HashMap<String, Float> idf(HashMap<String, HashMap<String, Float>> all_tf) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        // dict maps each word to the number of documents containing it
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        int docNum = FileUtil.FileList.size();
        // loop over all files
        for (int i = 0; i < docNum; i++) {
            HashMap<String, Float> temp = all_tf.get(FileUtil.FileList.get(i));
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                // loop over every word seen in this file
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println("IDF for every word is:");
        Iterator iter_dict = dict.entrySet().iterator();
        while (iter_dict.hasNext()) {
            Map.Entry entry = (Map.Entry) iter_dict.next();
            float value = (float) Math.log(docNum / Float.parseFloat(entry.getValue().toString()));
            resIdf.put(entry.getKey().toString(), value);
            System.out.println(entry.getKey().toString() + " = " + value);
        }
        return resIdf;
    }

    /**
     * Computes tf-idf(t, d) = tf(t, d) * idf(t) for every word in every file
     * and prints the result.
     * @param all_tf outer key: file name; inner key: word; value: tf
     * @param idfs key: word; value: idf
     */
    public static void tf_idf(HashMap<String, HashMap<String, Float>> all_tf, HashMap<String, Float> idfs) {
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
        int docNum = FileUtil.FileList.size();
        for (int i = 0; i < docNum; i++) {
            String filepath = FileUtil.FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                Float value = Float.parseFloat(entry.getValue().toString()) * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for every file is:");
        DisTfIdf(resTfIdf);
    }

    // Prints the final tf-idf values, one file per block
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf) {
        Iterator iter1 = tfidf.entrySet().iterator();
        while (iter1.hasNext()) {
            Map.Entry entrys = (Map.Entry) iter1.next();
            System.out.println("FileName: " + entrys.getKey().toString());
            System.out.print("{");
            HashMap<String, Float> temp = (HashMap<String, Float>) entrys.getValue();
            Iterator iter2 = temp.entrySet().iterator();
            while (iter2.hasNext()) {
                Map.Entry entry = (Map.Entry) iter2.next();
                System.out.print(entry.getKey().toString() + " = " + entry.getValue().toString() + ", ");
            }
            System.out.println("}");
        }
    }
}
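A quick sanity check on the idf step: with |D| = 10 documents and a term occurring in 5 of them, idf = ln(10/5) ≈ 0.693; a term occurring in all 10 gets idf = ln(1) = 0, so it contributes nothing to any tf-idf score. That is exactly the intended downweighting of words common to every document.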
Finally, the entry point that ties it all together:
package com.bobo.paper;

import java.io.IOException;
import java.util.HashMap;

import com.bobo.paper.athology.TfIdfAthology;

public class Welcome {

    public static void main(String[] args) {
        String file = "D:/testfiles"; // directory containing the corpus
        try {
            HashMap<String, HashMap<String, Float>> all_tf = TfIdfAthology.tfAllFiles(file);
            System.out.println();
            HashMap<String, Float> idfs = TfIdfAthology.idf(all_tf);
            System.out.println();
            TfIdfAthology.tf_idf(all_tf, idfs);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
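Run against a directory of GBK-encoded text files, the program first prints every word's idf, then for each file a block of the form "FileName: <path>" followed by "{word1 = value1, word2 = value2, ...}", as produced by DisTfIdf above.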