TF-IDF
Reference:
http://www.ruanyifeng.com/blog/2013/03/tf-idf.html (a very clear write-up)
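In short: for a term t in document d, tf(t, d) = (occurrences of t in d) / (total terms in d), and idf(t) = log(N / df(t)), where N is the number of documents in the corpus and df(t) is the number of documents containing t; the tf-idf weight is tf * idf. A term scores high when it is frequent within this document but rare across the corpus, which is what makes it a good keyword. Both implementations below add +1 smoothing to the idf, in slightly different ways.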
package com.data.text.tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

public class TF_IDF {

    private double NUM_DOCS;
    private Map<String, Integer> idf_map;

    public TF_IDF(String fileName) {
        idf_map = new HashMap<String, Integer>();
        File file = new File(fileName);
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            // the first line holds the total number of documents
            String tempString = reader.readLine();
            NUM_DOCS = (double) Integer.parseInt(tempString);
            // read one line at a time until null (end of file)
            while ((tempString = reader.readLine()) != null) {
                String[] arr = tempString.split(" : ");
                String key = arr[0];
                Integer value = Integer.parseInt(arr[1]);
                idf_map.put(key, value);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e1) {
                }
            }
        }
    }

    public List<Feature> cacu(Map<String, Integer> tf_map) {
        // total number of term occurrences in the document
        int word_num_sum = 0;
        for (Entry<String, Integer> entry : tf_map.entrySet()) {
            word_num_sum += entry.getValue();
        }
        // compute tf-idf for each term
        List<Feature> list_fea = new ArrayList<Feature>();
        for (Entry<String, Integer> entry : tf_map.entrySet()) {
            String word = entry.getKey();
            Integer num = entry.getValue();
            double tf = (double) num / word_num_sum;
            // inverse document frequency with +1 smoothing; default to a document
            // frequency of 1 for unseen terms so the lookup cannot throw a NullPointerException
            double idf = Math.log(NUM_DOCS / idf_map.getOrDefault(word, 1) + 1);
            double weight = tf * idf;
            list_fea.add(new Feature(word, num, weight));
        }
        // sort by weight, descending
        Collections.sort(list_fea);
        return list_fea;
    }

    public static void main(String[] args) {
        // entry point left empty in the original
    }
}

package com.data.text.tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

public class StopWord {

    public static Set<String> GetStopWords() {
        String fileName = "stopwords.txt";
        return readwords(fileName);
    }

    /**
     * Read the stop-word list, one word per line.
     * @param fileName path to the stop-word file
     * @return the set of stop words
     */
    private static Set<String> readwords(String fileName) {
        Set<String> set = new HashSet<String>();
        File file = new File(fileName);
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String tempString = null;
            // read one line at a time until null (end of file)
            while ((tempString = reader.readLine()) != null) {
                set.add(tempString.trim());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e1) {
                }
            }
        }
        return set;
    }
}

package com.data.text.tfidf;

/**
 * A feature word with its raw count and tf-idf weight.
 */
public class Feature implements Comparable<Feature> {

    private String word;
    private Integer num;
    private double weight;

    public Feature(String word, Integer num, double weight) {
        this.word = word;
        this.num = num;
        this.weight = weight;
    }

    public String getWord() {
        return word;
    }

    public Integer getNum() {
        return num;
    }

    public double getWeight() {
        return weight;
    }

    @Override
    public int compareTo(Feature o) {
        // descending order by weight; Double.compare avoids comparing doubles with ==
        return Double.compare(o.getWeight(), this.getWeight());
    }

    @Override
    public String toString() {
        return this.word + " freq: " + num + " weight: " + weight;
    }
}
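Note on the two variants: the Java code computes idf as log(NUM_DOCS / df + 1), now falling back to df = 1 for unseen terms, while the Python version below uses log((1 + N) / (1 + df)) and returns a fixed default idf (1.5, tunable via DEFAULT_IDF) for terms absent from the corpus file; both are common smoothing choices that keep the ratio finite. Both expect the same corpus file layout: the first line is the total document count, followed by one "term : document-frequency" pair per line.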
__author__ = 'dell'

import math
import re
from operator import itemgetter


class TfIdf:
    def __init__(self, corpus_filename=None, stopword_filename=None, DEFAULT_IDF=1.5):
        self.num_docs = 0
        self.term_num_docs = {}
        self.stopwords = set()
        self.idf_default = DEFAULT_IDF

        if corpus_filename:
            corpus_file = open(corpus_filename, 'r')
            # the first line holds the number of documents
            line = corpus_file.readline()
            self.num_docs = int(line)
            # read "term : frequency" from each subsequent line in the file
            for line in corpus_file:
                tokens = line.split(':')
                term = tokens[0].strip()
                frequency = int(tokens[1].strip())
                self.term_num_docs[term] = frequency
            corpus_file.close()

        if stopword_filename:
            stopword_file = open(stopword_filename)
            # a set gives O(1) membership tests in get_idf
            self.stopwords = set(line.strip() for line in stopword_file)
            stopword_file.close()

    def get_tokens(self, text):
        # keep anchor tags, other markup tags, and word-like tokens
        return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower())

    def add_input_document(self, doc):
        # count each distinct term at most once per document
        self.num_docs += 1
        words = set(self.get_tokens(doc))
        for word in words:
            if word in self.term_num_docs:
                self.term_num_docs[word] += 1
            else:
                self.term_num_docs[word] = 1

    def get_num_docs(self):
        return self.num_docs

    def get_idf(self, term):
        if term in self.stopwords:
            return 0
        if term not in self.term_num_docs:
            return self.idf_default
        return math.log(float(1 + self.get_num_docs()) / (1 + self.term_num_docs[term]))

    def get_doc_keywords(self, curr_doc):
        tfidf = {}
        tokens = self.get_tokens(curr_doc)
        tokens_set = set(tokens)
        for word in tokens_set:
            # cast before dividing so this also works under Python 2's integer
            # division (the original float(a / b) truncated to 0 first)
            tf = float(tokens.count(word)) / len(tokens)
            idf = self.get_idf(word)
            tfidf[word] = tf * idf
        return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
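A minimal usage sketch for the Python class; the file names corpus.txt and stopwords.txt are hypothetical placeholders for any files in the format described above:

    # hypothetical file names, assumed to follow the corpus layout above
    table = TfIdf(corpus_filename='corpus.txt',
                  stopword_filename='stopwords.txt')

    doc = "the quick brown fox jumps over the lazy dog and the fox runs"
    # get_doc_keywords returns (term, weight) pairs sorted by weight, descending;
    # take the top five as the document's keywords
    for term, weight in table.get_doc_keywords(doc)[:5]:
        print(term, weight)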