利用文本挖掘技术来找出《天龙八部》中的“小鲜词”
问题导读:
1.怎样自动地从文本中找出新词?
2.怎样在处理数据时自动分割大文件?
3.怎样利用JAVA进行抽词?
开始之前,先看一下从人人网中发现的90后用户爱用的词
是不是很好玩,哈哈。写这篇文章就是为了让你能够简单、自动地从文本中找出新词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)
项目结构
当然,text.dat和common.dic这两个文件你可以随意替换,注意text.dat中的数据一定要够份量,否则没啥效果
原理么,看下Matrix67大牛的文章你就懂了
下边开始上代码
common
这个里边包含以下几个类,主要是定义数据结构
CountMap.java
定义一个计数Map来进行数据操作和持久化
package grid.common;
import java.io.Serializable;
import java.util.HashMap;
/**
 * A {@link HashMap} that stores, for every key, how many times that key has
 * been counted. Being a serializable map, it can also be persisted as-is.
 *
 * @param <T> type of the counted keys
 */
public class CountMap<T> extends HashMap<T, Integer> implements Serializable {
    private static final long serialVersionUID = 6097963798841161750L;

    /**
     * Records one more occurrence of the given key. A key that has not been
     * seen before starts at 1.
     *
     * @param t key to count
     */
    public void increase(T t) {
        final Integer previous = get(t);
        final int updated = (previous == null) ? 1 : previous + 1;
        put(t, updated);
    }

    /**
     * @return the sum of the occurrence counts of all keys
     */
    public int count() {
        int total = 0;
        for (Integer occurrences : values()) {
            total += occurrences;
        }
        return total;
    }

    /**
     * Primitive-friendly lookup that never returns {@code null}.
     *
     * @param c character key to look up
     * @return the stored count, or 0 when the key is absent
     */
    public int get(char c) {
        final Integer stored = super.get(c);
        if (stored == null) {
            return 0;
        }
        return stored;
    }
}
Node.java
定义语法树的节点
package grid.common;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * A node of a simple n-ary tree. Every node carries a value, a link to its
 * parent and a lazily created list of children.
 *
 * @param <T> type of the value stored in each node
 */
public class Node<T> {
    /** Child nodes; stays {@code null} until the first child is added. */
    protected List<Node<T>> children;
    protected Node<T> parent;
    protected T value;

    Node(T value) {
        this.value = value;
    }

    /**
     * Appends a new child holding the given value.
     *
     * @param value value of the new child
     * @return the newly created child node
     */
    public Node<T> add(T value) {
        if (children == null) {
            children = new ArrayList<Node<T>>();
        }
        final Node<T> created = new Node<T>(value);
        created.setParent(this);
        children.add(created);
        return created;
    }

    public T getValue() {
        return value;
    }

    public Node<T> getParent() {
        return parent;
    }

    public void setParent(Node<T> parent) {
        this.parent = parent;
    }

    /**
     * Depth-first walk that collects every node without a children list.
     */
    private void collectLeaves(Node<T> current, List<Node<T>> out) {
        final List<Node<T>> kids = current.children;
        if (kids == null) {
            out.add(current);
            return;
        }
        for (Node<T> kid : kids) {
            collectLeaves(kid, out);
        }
    }

    /**
     * @return all leaves below this node in depth-first order; a node that
     *         has no children is its own single leaf
     */
    public List<Node<T>> getLeaves() {
        final List<Node<T>> leaves = new ArrayList<Node<T>>();
        collectLeaves(this, leaves);
        return leaves;
    }

    /**
     * Builds the list of values from the top of the branch down to this
     * node. The walk towards the root stops at the first ancestor that is a
     * {@link Tree}, whose value is left out; the value of this node itself
     * is always included.
     *
     * @return the values along the branch, outermost ancestor first
     */
    public List<T> getBranchPath() {
        final List<T> path = new ArrayList<T>();
        Node<T> cursor = this;
        while (true) {
            path.add(cursor.getValue());
            cursor = cursor.parent;
            if (cursor == null || cursor instanceof Tree<?>) {
                break;
            }
        }
        Collections.reverse(path);
        return path;
    }

    /**
     * Writes one line for the given node (indented by one space per level)
     * and then recurses into its children.
     */
    private void render(StringBuilder out, int depth, Node<T> current) {
        int remaining = depth;
        while (remaining-- > 0) {
            out.append(" ");
        }
        out.append("|--").append(current.getValue()).append("\n");
        if (current.children == null) {
            return;
        }
        for (Node<T> kid : current.children) {
            render(out, depth + 1, kid);
        }
    }

    /**
     * @return a multi-line textual rendering of the subtree rooted here
     */
    public String dump() {
        final StringBuilder out = new StringBuilder();
        render(out, 0, this);
        return out.toString();
    }

    @Override
    public String toString() {
        return value.toString();
    }
}
TextDatReader.java
读取处理数据
package grid.common;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
/**
 * File helpers for the corpus: reads a whole text file into memory and splits
 * a large file into fixed-size pieces stored under {@code ./dat}.
 *
 * Typical use: {@code TextDatReader.divide("text.dat", 1024 * 1024 * 4)} cuts
 * the file into 4 MB pieces named {@code ./dat/text1.dat},
 * {@code ./dat/text2.dat}, ...
 */
public class TextDatReader {
    /** Suffix of the piece files produced by {@link #divide(String, long)}. */
    public static final String SUFFIX = ".dat";

    /** Size of the in-memory buffer used while copying bytes into a piece. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Reads the whole file, decoding it as UTF-8.
     *
     * The returned string always has as many chars as the file has bytes.
     * When decoding yields fewer chars (multi-byte text), the tail is padded
     * with NUL chars. This padding is kept on purpose: the dictionary loader
     * in this project stops at the first NUL char it meets.
     *
     * @param path path of the file to read
     * @return the decoded content, NUL-padded to the byte length of the file
     * @throws IOException if the file cannot be opened or read
     */
    public static String read(String path) throws IOException {
        File file = new File(path);
        char[] buffer = new char[(int) file.length()];
        FileInputStream stream = new FileInputStream(file);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    stream, "utf8"));
            // A single read() call is allowed to return fewer chars than
            // requested, so keep reading until the buffer is full or EOF.
            int filled = 0;
            while (filled < buffer.length) {
                int n = reader.read(buffer, filled, buffer.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } finally {
            // Closing the underlying stream releases the file handle even
            // when decoding fails half-way.
            stream.close();
        }
        return new String(buffer);
    }

    /**
     * Creates the {@code ./dat} output directory when nothing exists at that
     * path yet.
     */
    public static void createDir() {
        File dir = new File("./dat");
        if (!dir.exists()) {
            dir.mkdir();
        }
    }

    /**
     * Splits a file into consecutive pieces of at most {@code size} bytes,
     * written to {@code ./dat/text1.dat}, {@code ./dat/text2.dat}, ...
     *
     * The split is purely byte based, so a multi-byte character sitting on a
     * piece boundary is cut in two.
     *
     * @param name path of the file to split
     * @param size maximum size of each piece in bytes; a value {@code <= 0}
     *            selects half of the file length (at least one byte)
     * @throws Exception if the file does not exist, is not a regular file, or
     *             an I/O error occurs while copying
     */
    public static void divide(String name, long size) throws Exception {
        File file = new File(name);
        if (!file.exists() || !file.isFile()) {
            throw new Exception("指定文件不存在!");
        }
        long fileLength = file.length();
        if (size <= 0) {
            // Never let the piece size drop to 0, otherwise no progress
            // could be made on files of 0 or 1 byte.
            size = Math.max(1, fileLength / 2);
        }
        createDir();
        byte[] buffer = new byte[(int) Math.min(COPY_BUFFER_SIZE, size)];
        FileInputStream in = new FileInputStream(file);
        try {
            long copied = 0;
            for (int i = 1; copied < fileLength; i++) {
                // The last piece simply takes whatever is left.
                long end = Math.min(copied + size, fileLength);
                File outFile = new File("./dat", "text" + i + SUFFIX);
                FileOutputStream out = new FileOutputStream(outFile);
                try {
                    while (copied < end) {
                        int want = (int) Math.min(buffer.length, end - copied);
                        int got = in.read(buffer, 0, want);
                        if (got < 0) {
                            throw new IOException(
                                    "Unexpected end of file while splitting "
                                            + name);
                        }
                        out.write(buffer, 0, got);
                        copied += got;
                    }
                } finally {
                    out.close();
                }
                System.out.println("第"+i+"个子文件生成……");
            }
        } finally {
            in.close();
        }
    }
}
TextUtils.java
用来做文本处理,如判断是否为空、匹配字符等
package grid.common;
/**
 * Small text helpers: character classification, prefix matching at an offset
 * and a blank check.
 */
public class TextUtils {
    /**
     * @return {@code true} when the character lies in U+4E00..U+9FCB, the
     *         range this project treats as Chinese letters
     */
    public static boolean isCnLetter(char c) {
        return 0x4E00 <= c && c <= 0x9FCB;
    }

    /**
     * @return {@code true} for the ASCII digits '0'..'9'
     */
    public static boolean isNumeric(char c) {
        return '0' <= c && c <= '9';
    }

    /**
     * @return {@code true} for the ASCII letters a-z and A-Z
     */
    public static boolean isEnLetter(char c) {
        if ('a' <= c && c <= 'z') {
            return true;
        }
        return 'A' <= c && c <= 'Z';
    }

    /**
     * Tests whether {@code dest} occurs in {@code src} starting exactly at
     * offset {@code off}.
     *
     * @param src text to look into
     * @param off offset in {@code src} where the match has to start
     * @param dest text expected at that offset
     * @return {@code true} when every character of {@code dest} matches;
     *         {@code false} on the first mismatch or when {@code src} ends
     *         too early
     */
    public static boolean match(String src, int off, String dest) {
        final int wanted = dest.length();
        final int available = src.length();
        int i = 0;
        while (i < wanted) {
            final int at = off + i;
            if (at >= available || src.charAt(at) != dest.charAt(i)) {
                return false;
            }
            i++;
        }
        return true;
    }

    /**
     * @return {@code true} when the string is {@code null}, empty, or
     *         consists only of characters removed by {@link String#trim()}
     */
    public static boolean isBlank(String str) {
        if (str == null) {
            return true;
        }
        return str.trim().isEmpty();
    }
}
Tree.java
语法树
package grid.common;
/**
 * Marks the root of a tree built from {@link Node}s. It adds no state or
 * behaviour of its own; {@code Node.getBranchPath()} uses the type to know
 * where to stop when it walks from a node up towards the root, so the value
 * of a {@code Tree} ancestor is not part of the reported path.
 *
 * @param <T> type of the value stored in each node
 */
public class Tree<T> extends Node<T> {
/**
 * @param value value stored in the root node
 */
public Tree(T value) {
super(value);
}
}
dic
里边包含CnDictionary类
CnDictionary.java
词典处理
package grid.text.dic;
import grid.common.CountMap;
import grid.common.TextDatReader;
import grid.common.TextUtils;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
/**
 * Lazily created singleton holding the common-word dictionary and the
 * per-character frequency statistics of the reference corpus.
 */
public class CnDictionary {
    /** Word list, one word per line. */
    private static final String COMMON_WORD_DIC_PATH = "common.dic";

    /**
     * This text data is for character statistic. Change to your own if you
     * like.
     */
    private static final String COMMON_LETTER_RESOURCE_PATH = "text.dat";

    private Set<String> dictionary = new HashSet<String>();
    private CountMap<Character> letterCountMap = new CountMap<Character>();
    private int totalLetterCount;
    private static CnDictionary instance;

    /**
     * Returns the shared instance, loading both resource files on first use.
     *
     * @return the shared dictionary, or {@code null} when loading the
     *         resources failed (the stack trace is printed and the next call
     *         tries again)
     */
    public static CnDictionary Instance() {
        if (null == instance) {
            try {
                instance = new CnDictionary();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return instance;
    }

    private CnDictionary() throws IOException {
        initWordDic();
        initLetterCountMap();
    }

    /**
     * Counts every Chinese character of the corpus file and caches the
     * grand total used by {@link #rate(char)}.
     */
    private void initLetterCountMap() throws IOException {
        String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
        final int len = letterResource.length();
        for (int i = 0; i < len; i++) {
            char c = letterResource.charAt(i);
            if (TextUtils.isCnLetter(c)) {
                letterCountMap.increase(c);
            }
        }
        totalLetterCount = letterCountMap.count();
    }

    /**
     * Loads the word list: the text is cut at line breaks, each piece is
     * trimmed and non-empty pieces become dictionary entries.
     */
    private void initWordDic() throws IOException {
        String content = TextDatReader.read(COMMON_WORD_DIC_PATH);
        // Everything from the first NUL char on is padding, not content.
        int end = content.indexOf('\0');
        if (end < 0) {
            end = content.length();
        }
        int start = 0;
        // Running up to and including 'end' flushes the final word even when
        // the text is not terminated by a line break.
        for (int i = 0; i <= end; i++) {
            if (i == end || '\n' == content.charAt(i)
                    || '\r' == content.charAt(i)) {
                String word = content.substring(start, i).trim();
                if (!word.isEmpty()) {
                    dictionary.add(word);
                }
                start = i + 1;
            }
        }
    }

    /**
     * @param word word to look up
     * @return {@code true} when the word is part of the loaded dictionary
     */
    public boolean contains(String word) {
        return dictionary.contains(word);
    }

    /**
     * @param c character to look up
     * @return the share of this character among all counted Chinese
     *         characters of the corpus; 0 when nothing was counted at all
     */
    public double rate(char c) {
        if (0 == totalLetterCount) {
            // Avoids a NaN result for an empty corpus.
            return 0;
        }
        return (double) letterCountMap.get(c) / totalLetterCount;
    }

    /**
     * @return number of distinct words in the dictionary
     */
    public int size() {
        return dictionary.size();
    }
}
evolution
EntropyJudger.java
计算熵值
package grid.text.evolution;
import grid.common.CountMap;
import grid.common.TextUtils;
import grid.text.index.Pos;
import grid.text.index.TextIndexer;
/**
 * Decides whether a candidate string looks like a real word, using how
 * tightly its characters stick together (solid rate) and how varied the
 * characters directly before and after it are (entropy).
 */
public class EntropyJudger {
    private TextIndexer indexer;

    /**
     * A word least appeared count
     */
    private static final int LEAST_COUNT_THRESHOLD = 5;

    /**
     * Threshold for solid rate calculated by word appeared count and every
     * single letter.
     *
     * The smaller this values is, more new words you will get, but with less
     * accuracy. The greater this value is, less new words you will get, but
     * with high accuracy.
     */
    private static final double SOLID_RATE_THRESHOLD = 0.018;

    /**
     * Threshold for entropy value calculated by candidate word prefix character
     * count and suffix character count
     *
     * The smaller this values is, more new words you will get, but with less
     * accuracy. The greater this value is, less new words you will get, but
     * with high accuracy.
     */
    private static final double ENTROPY_THRESHOLD = 1.92;

    /**
     * @param indexer index over the document the candidates come from
     */
    public EntropyJudger(TextIndexer indexer) {
        this.indexer = indexer;
    }

    /**
     * @param candidate candidate word
     * @return {@code true} when both the solid rate and the entropy of the
     *         candidate reach their thresholds
     */
    public boolean judge(String candidate) {
        if (getSolidRate(candidate) < SOLID_RATE_THRESHOLD) {
            return false;
        }
        return getEntropy(candidate) >= ENTROPY_THRESHOLD;
    }

    /**
     * Collects, over every occurrence of the candidate, the Chinese character
     * right before and right after it, and returns the smaller of the two
     * resulting entropies.
     */
    private double getEntropy(String candidate) {
        Pos pos = new Pos(candidate);
        CountMap<Character> frontCountMap = new CountMap<Character>();
        CountMap<Character> backCountMap = new CountMap<Character>();
        final int candidateLen = candidate.length();
        while (indexer.find(pos).isFound()) {
            int off = pos.getPos();
            // charAt() answers with the NUL char outside of the document,
            // which is not a Chinese letter and is therefore ignored.
            char before = indexer.charAt(off - 1);
            if (TextUtils.isCnLetter(before)) {
                frontCountMap.increase(before);
            }
            char after = indexer.charAt(off + candidateLen);
            if (TextUtils.isCnLetter(after)) {
                backCountMap.increase(after);
            }
        }
        double frontEntropy = entropyOf(frontCountMap);
        double backEntropy = entropyOf(backCountMap);
        return frontEntropy > backEntropy ? backEntropy : frontEntropy;
    }

    /**
     * Computes the entropy (natural logarithm) of the distribution described
     * by the given counts; an empty map yields 0.
     */
    private static double entropyOf(CountMap<Character> counts) {
        // The total is the same for every key, so compute it only once.
        final int total = counts.count();
        double entropy = 0;
        for (char key : counts.keySet()) {
            double rate = (double) counts.get(key) / total;
            entropy -= rate * Math.log(rate);
        }
        return entropy;
    }

    /**
     * Computes the solid rate: the geometric mean, over the characters of the
     * candidate, of (occurrences of the candidate / occurrences of that
     * character), scaled by the square root of the candidate length.
     *
     * @param candidate candidate word
     * @return 1 for candidates shorter than two characters, 0 when the
     *         candidate occurs fewer times than the minimum count, otherwise
     *         the solid rate
     */
    public double getSolidRate(String candidate) {
        final int candidateLen = candidate.length();
        if (candidateLen < 2) {
            return 1;
        }
        final int count = indexer.count(candidate);
        if (count < LEAST_COUNT_THRESHOLD) {
            return 0;
        }
        double rate = 1;
        for (int i = 0; i < candidateLen; i++) {
            rate *= (double) count
                    / indexer.count(String.valueOf(candidate.charAt(i)));
        }
        return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
    }

    public void setIndexer(TextIndexer indexer) {
        this.indexer = indexer;
    }
}
NewWordDiscover.java
抽词程序
package grid.text.evolution;
import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;
import java.util.HashSet;
import java.util.Set;
/**
 * Scans a document for strings that behave like words but are not part of
 * the common dictionary.
 */
public class NewWordDiscover {
    private CnDictionary dictionary;

    /**
     * Minimum word length
     */
    private final static int MIN_CANDIDATE_LEN = 2;

    /**
     * Maximum word length
     */
    private final static int MAX_CANDIDATE_LEN = 6;

    private static Set<Character> structuralLetterSet = new HashSet<Character>();

    /** Characters a reported word may neither start nor end with. */
    private static char[] structuralLetters = { '我', '你', '您', '他', '她', '谁',
            '哪', '那', '这', '的', '了', '着', '也', '是', '有', '不', '在', '与', '呢',
            '啊', '呀', '吧', '嗯', '哦', '哈', '呐' };

    static {
        for (int i = 0; i < structuralLetters.length; i++) {
            structuralLetterSet.add(structuralLetters[i]);
        }
    }

    public NewWordDiscover() {
        dictionary = CnDictionary.Instance();
    }

    /**
     * @return {@code true} when the first or the last character of the
     *         candidate is one of the structural letters
     */
    private static boolean touchesStructuralLetter(String candidate) {
        final char first = candidate.charAt(0);
        if (structuralLetterSet.contains(first)) {
            return true;
        }
        final char last = candidate.charAt(candidate.length() - 1);
        return structuralLetterSet.contains(last);
    }

    /**
     * New word discover is based on statistic and entropy, better to sure
     * document size is in 100kb level, or you may get a unsatisfied result.
     *
     * @param document text to analyse
     * @return the set of newly discovered words
     */
    public Set<String> discover(String document) {
        final Set<String> found = new HashSet<String>();
        final TextIndexer indexer = new CnPreviewTextIndexer(document);
        final TextSelector selector = new CnTextSelector(document,
                MIN_CANDIDATE_LEN, MAX_CANDIDATE_LEN);
        final EntropyJudger judger = new EntropyJudger(indexer);
        while (!selector.end()) {
            final String candidate = selector.next();
            if (TextUtils.isBlank(candidate)
                    || touchesStructuralLetter(candidate)) {
                continue;
            }
            // Reduce the check to "found.contains(candidate)" if you want to
            // find new word without any dictionary
            final boolean alreadyKnown = dictionary.contains(candidate)
                    || found.contains(candidate);
            if (alreadyKnown) {
                selector.select();
                continue;
            }
            if (judger.judge(candidate)) {
                found.add(candidate);
            }
        }
        return found;
    }
}
index
这几个类用于给待分析的文本(而不是词典)创建字符位置索引,方便快速统计候选词的出现次数,并定位它们在文本中出现的位置
CnPreviewTextIndexer.java
package grid.text.index;
import grid.common.TextUtils;
import java.util.HashMap;
import java.util.Map;
import java.util.Vector;
/**
 * {@link TextIndexer} that remembers, for every Chinese character of the
 * document, all positions where it occurs. Looking up a text then only has to
 * check the positions of its first character.
 */
public class CnPreviewTextIndexer implements TextIndexer {
    /** Initial capacity hint for the per-character position map. */
    private final static int CN_LETTER_COUNT = 5021;

    private String document;

    /** Chinese character -> ascending positions of that character. */
    private Map<Character, Vector<Integer>> posMap;

    /**
     * @param document text to index
     */
    public CnPreviewTextIndexer(String document) {
        this.document = document;
        init();
    }

    /** Builds the position lists in a single pass over the document. */
    private void init() {
        final int len = document.length();
        // Initial capacity guess for every position list.
        final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT
                + 1);
        posMap = new HashMap<Character, Vector<Integer>>(CN_LETTER_COUNT);
        for (int i = 0; i < len; i++) {
            char c = document.charAt(i);
            if (!TextUtils.isCnLetter(c)) {
                continue;
            }
            Vector<Integer> posVector = posMap.get(c);
            if (null == posVector) {
                posVector = new Vector<Integer>(supposedMinCount);
                posMap.put(c, posVector);
            }
            posVector.add(i);
        }
    }

    /**
     * Counts the positions at which the text occurs; overlapping occurrences
     * are all counted.
     *
     * @param text text to count
     * @return number of occurrences; 0 for a blank text or one whose first
     *         character is not indexed
     */
    @Override
    public int count(String text) {
        if (TextUtils.isBlank(text)) {
            return 0;
        }
        Vector<Integer> vector = posMap.get(text.charAt(0));
        if (null == vector) {
            return 0;
        }
        if (1 == text.length()) {
            return vector.size();
        }
        final int size = vector.size();
        int count = 0;
        for (int i = 0; i < size; i++) {
            if (TextUtils.match(document, vector.get(i), text)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Moves {@code pos} to the next occurrence of its target after the one it
     * currently points at.
     *
     * @param pos search cursor; updated in place
     * @return the same cursor, flagged as found or not found
     */
    @Override
    public Pos find(Pos pos) {
        String text = pos.getTarget();
        pos.setFound(false);
        if (TextUtils.isBlank(text)) {
            return pos;
        }
        Vector<Integer> vector = posMap.get(text.charAt(0));
        if (null == vector) {
            return pos;
        }
        final int arraySize = vector.size();
        // Resume right behind the position list entry of the previous hit.
        for (int i = pos.arrayIndex + 1; i < arraySize; i++) {
            if (TextUtils.match(document, vector.get(i), text)) {
                pos.setFound(true);
                pos.setPos(vector.get(i));
                pos.arrayIndex = i;
                break;
            }
        }
        return pos;
    }

    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the requested part of the document, or an empty string when the
     *         range does not lie completely inside the document
     */
    @Override
    public String sub(int off, int len) {
        // A range ending exactly at the end of the document is valid.
        if (off < 0 || len < 0 || off + len > document.length()) {
            return "";
        }
        return document.substring(off, off + len);
    }

    /**
     * @param index position in the document
     * @return the character at that position, or the NUL char when the index
     *         is outside of the document
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}
Pos.java
package grid.text.index;
/**
 * Search cursor handed to an indexer's {@code find}: it carries the text
 * being searched for and the state of the most recent match.
 */
public class Pos {
    /** Text that is searched for. */
    private final String target;

    /**
     * Pos for current matched full target text
     */
    private int pos;

    /**
     * Index in position array for current matched full target text
     */
    int arrayIndex;

    /** Whether the most recent search step produced a match. */
    private boolean found;

    /**
     * Creates a cursor that has not matched anything yet.
     *
     * @param target text to search for
     */
    public Pos(String target) {
        this.target = target;
        this.pos = -1;
        this.arrayIndex = -1;
        this.found = false;
    }

    public String getTarget() {
        return this.target;
    }

    public int getPos() {
        return this.pos;
    }

    public boolean isFound() {
        return this.found;
    }

    void setPos(int pos) {
        this.pos = pos;
    }

    void setFound(boolean found) {
        this.found = found;
    }
}
SimpleTextIndexer.java
package grid.text.index;
/**
 * {@link TextIndexer} without any precomputed index: every query is answered
 * with {@link String#indexOf(String, int)} on the document.
 */
public class SimpleTextIndexer implements TextIndexer {
    private String document;

    /**
     * @param document text to search in
     */
    public SimpleTextIndexer(String document) {
        this.document = document;
    }

    /**
     * Counts non-overlapping occurrences of the text, scanning from the
     * start of the document.
     *
     * @param text text to count
     * @return number of occurrences; 0 for a {@code null} or empty text
     */
    @Override
    public int count(String text) {
        // indexOf("") matches everywhere without consuming anything, which
        // would keep the loop below from ever finishing.
        if (null == text || text.isEmpty()) {
            return 0;
        }
        final int len = text.length();
        int off = 0;
        int count = 0;
        while ((off = document.indexOf(text, off)) > -1) {
            count++;
            off += len;
        }
        return count;
    }

    /**
     * Moves {@code pos} to the next occurrence of its target that starts
     * after the end of the occurrence it currently points at.
     *
     * @param pos search cursor; updated in place
     * @return the same cursor, flagged as found or not found; a {@code null}
     *         or empty target is never found
     */
    @Override
    public Pos find(Pos pos) {
        final String text = pos.getTarget();
        pos.setFound(false);
        if (null == text || text.isEmpty()) {
            // An empty target would be "found" at the same place forever.
            return pos;
        }
        final int from = pos.getPos() < 0 ? 0 : pos.getPos() + text.length();
        final int hit = document.indexOf(text, from);
        if (hit > -1) {
            pos.setFound(true);
            pos.setPos(hit);
        }
        return pos;
    }

    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the requested part of the document, or an empty string when the
     *         range does not lie completely inside the document
     */
    @Override
    public String sub(int off, int len) {
        if (off < 0 || len < 0 || off + len > document.length()) {
            return "";
        }
        return document.substring(off, off + len);
    }

    /**
     * @param index position in the document
     * @return the character at that position, or the NUL char when the index
     *         is outside of the document
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}
TextIndexer.java
package grid.text.index;
/**
 * Read-only view on a document that supports counting and locating pieces of
 * text in it.
 */
public interface TextIndexer {
/**
 * Counts how often a text occurs in the document. The implementations in
 * this project differ on whether overlapping occurrences are counted.
 *
 * @param text text to count
 * @return count for specific text
 */
public int count(String text);
/**
 * Advances a search cursor to the next occurrence of its target. The
 * cursor is updated in place: its found flag tells whether another
 * occurrence exists, its position where that occurrence starts.
 *
 * @param pos search cursor holding the target and the previous match
 * @return next position for current pos (the same cursor object)
 */
public Pos find(Pos pos);
/**
 * @return original document length
 */
public int len();
/**
 * Extracts a part of the document. How a range that does not fit into the
 * document is handled is up to the implementation.
 *
 * @param off start offset of the requested part
 * @param len length of the requested part
 * @return the sub string start from <b>off</b> and with a length with
 * <b>len</b>
 */
public String sub(int off, int len);
/**
 * @param index position in the document
 * @return return the character in the specified index; the implementations
 * in this project answer with the NUL char for an index outside of
 * the document
 */
public char charAt(int index);
}
participle
分词处理,具体看实现
Chunk.java
package grid.text.participle;
import grid.text.dic.CnDictionary;
import java.util.List;
/**
 * One possible way to cut the upcoming text into a few consecutive words,
 * together with the statistics used to rank competing ways against each
 * other. Sorting chunks puts the preferred one first.
 */
public class Chunk implements Comparable<Chunk> {
    /** Tolerance used when comparing floating point statistics. */
    private static final double EPSILON = 0.0000001D;

    private List<String> list;

    /** Total number of characters over all words. */
    private int len = 0;

    /** Average word length. */
    private double avg = 0;

    /**
     * Square root of the summed squared deviations of the word lengths from
     * the average (despite the name, not divided by the word count).
     */
    private double variance = 0;

    /**
     * @param list words of this chunk in reading order; {@code null} is
     *            treated like an empty list
     */
    public Chunk(List<String> list) {
        this.list = list;
        init();
    }

    /** Computes the statistics; they all stay 0 when there are no words. */
    private void init() {
        if (null == list || list.isEmpty()) {
            // Avoids a NullPointerException and a NaN average.
            return;
        }
        for (String s : list) {
            len += s.length();
        }
        avg = (double) len / list.size();
        double squaredDeviations = 0;
        for (String s : list) {
            squaredDeviations += Math.pow(avg - s.length(), 2);
        }
        variance = Math.sqrt(squaredDeviations);
    }

    public int getLen() {
        return len;
    }

    public double getAvg() {
        return avg;
    }

    public double getVariance() {
        return variance;
    }

    /**
     * @return the first word of the chunk, or an empty string when the chunk
     *         has no words
     */
    public String getHead() {
        if (null == list || list.isEmpty()) {
            return "";
        }
        return list.get(0);
    }

    /**
     * Compares in descending order: the larger value sorts first. Values
     * closer together than the tolerance count as equal.
     */
    private int compareDouble(double d1, double d2) {
        final double diff = d1 - d2;
        if (diff < -EPSILON) {
            return 1;
        }
        if (diff > EPSILON) {
            return -1;
        }
        return 0;
    }

    /**
     * Sums the corpus frequency of all single-character words in the list.
     */
    private static double singleLetterRate(List<String> words,
            CnDictionary dictionary) {
        double rate = 0;
        if (null == words) {
            return rate;
        }
        for (String s : words) {
            if (1 == s.length()) {
                rate += dictionary.rate(s.charAt(0));
            }
        }
        return rate;
    }

    /**
     * Ranks chunks by, in this order: larger total length, larger average
     * word length, larger length deviation, larger summed frequency of the
     * single-character words.
     */
    @Override
    public int compareTo(Chunk o) {
        if (len != o.len) {
            // Longer chunk first; an explicit comparison instead of a
            // subtraction keeps the sign safe from integer overflow.
            return len > o.len ? -1 : 1;
        }
        int d = compareDouble(avg, o.avg);
        if (0 != d) {
            return d;
        }
        // NOTE(review): the larger deviation sorts first here. Segmentation
        // schemes of this shape (e.g. MMSEG) usually prefer the smaller one -
        // confirm the intended direction before changing it.
        d = compareDouble(variance, o.variance);
        if (0 != d) {
            return d;
        }
        CnDictionary dictionary = CnDictionary.Instance();
        return compareDouble(singleLetterRate(list, dictionary),
                singleLetterRate(o.list, dictionary));
    }

    @Override
    public String toString() {
        return String.valueOf(list);
    }
}
ChunkStream.java
package grid.text.participle;
import grid.common.Node;
import grid.common.TextUtils;
import grid.common.Tree;
import grid.text.dic.CnDictionary;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Picks the next word at a given offset by building every dictionary-backed
 * way to cut the upcoming text into a few words and ranking those ways as
 * {@link Chunk}s.
 */
public class ChunkStream {
    /**
     * Define the max supposed word length
     *
     * You could shorten the value if you don't need too long participle result
     */
    private static final int MAX_WORD_LEN = 7;

    /**
     * Define the predict level while execute participle.
     *
     * Negligible accuracy will be promoted if you increase this value
     */
    private static final int PREDICT_LEVEL = 3;

    private static final CnDictionary dictionary = CnDictionary.Instance();

    /**
     * Determines the word that starts at the given offset.
     *
     * @param text text being segmented
     * @param off offset at which the next word has to start
     * @return the first word of the best ranked chunk, or an empty string
     *         when no word can start at that offset (for example because the
     *         character there is not a Chinese letter or the offset is at or
     *         past the end of the text)
     */
    public String next(String text, int off) {
        Tree<String> root = new Tree<String>("ROOT");
        recurse(root, off, text, 0);
        List<Node<String>> leaves = root.getLeaves();
        List<Chunk> chunkList = new ArrayList<Chunk>(leaves.size());
        for (Node<String> leaf : leaves) {
            if (leaf == root) {
                // Nothing was matched: the childless root is reported as its
                // own leaf. Skip it so that the internal "ROOT" marker never
                // leaks out as a word.
                continue;
            }
            chunkList.add(new Chunk(leaf.getBranchPath()));
        }
        if (chunkList.isEmpty()) {
            return "";
        }
        Collections.sort(chunkList);
        return chunkList.get(0).getHead();
    }

    /**
     * Adds, below {@code node}, one child for every word that can start at
     * {@code off} - each dictionary word of 2 to {@link #MAX_WORD_LEN}
     * characters, plus the single character if it is a Chinese letter - and
     * continues behind each of them until {@link #PREDICT_LEVEL} words are
     * chained.
     */
    private void recurse(Node<String> node, int off, String text,
            int predictDeep) {
        if (predictDeep >= PREDICT_LEVEL) {
            return;
        }
        // Never look past the end of the text.
        final int longest = Math.min(MAX_WORD_LEN, text.length() - off);
        for (int len = longest; len >= 1; len--) {
            String s = text.substring(off, off + len);
            if (1 == len) {
                // A single Chinese letter is always accepted as a word, even
                // when the dictionary does not list it.
                if (TextUtils.isCnLetter(text.charAt(off))) {
                    recurse(node.add(s), off + 1, text, predictDeep + 1);
                }
            } else if (dictionary.contains(s)) {
                recurse(node.add(s), off + len, text, predictDeep + 1);
            }
        }
    }
}
MechanicalParticiple.java
package grid.text.participle;
import grid.common.TextUtils;
import java.util.Vector;
/**
 * Dictionary based word segmentation: runs of ASCII letters and digits are
 * kept together as one token, Chinese text is cut into words with the help
 * of a {@link ChunkStream}, every other character acts as a separator and is
 * dropped.
 */
public class MechanicalParticiple {
    /**
     * Cuts the document into tokens.
     *
     * @param document text to segment
     * @return the tokens in reading order
     */
    public Vector<String> partition(String document) {
        Vector<String> vector = new Vector<String>();
        final int docLen = document.length();
        // Collects the current run of ASCII letters and digits.
        StringBuilder seg = new StringBuilder();
        ChunkStream stream = new ChunkStream();
        int off = 0;
        while (off < docLen) {
            char c = document.charAt(off);
            if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
                seg.append(c);
                off++;
                continue;
            }
            // Any other character ends the pending ASCII run.
            flush(seg, vector);
            if (TextUtils.isCnLetter(c)) {
                String word = stream.next(document, off);
                if (TextUtils.isBlank(word)) {
                    // Always make progress, otherwise the loop would spin
                    // forever on this position.
                    off++;
                } else {
                    vector.add(word);
                    off += word.length();
                }
            } else {
                // Punctuation and whitespace are dropped. To keep them, add
                // the character as a token of its own here when it is not
                // blank.
                off++;
            }
        }
        flush(seg, vector);
        return vector;
    }

    /**
     * Moves a non-empty pending run into the result and resets the buffer.
     */
    private static void flush(StringBuilder seg, Vector<String> vector) {
        if (seg.length() > 0) {
            vector.add(seg.toString());
            seg.setLength(0);
        }
    }
}
selector
文本选择器,筛选出可能为新词的词汇
CnTextSelector.java
package grid.text.selector;
import grid.common.TextUtils;
public class CnTextSelector extends CommonTextSelector {
public CnTextSelector(String document, int minSelectLen, int maxSelectLen) {
super(document, minSelectLen, maxSelectLen);
}
protected void adjustCurLen() {
while (pos < docLen && !TextUtils.isCnLetter(document.charAt(pos))) {
pos++;
}
for (int i = 0; i < maxSelectLen && pos + i < docLen; i++) {
if (!TextUtils.isCnLetter(document.charAt(pos + i))) {
curLen = i;
if (curLen < minSelectLen) {
pos++;
adjustCurLen();
}
return;
}
}
curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen;
}
}
CommonTextSelector.java
package grid.text.selector;
/**
 * Produces candidate substrings of a document: at every start position the
 * candidates go from the longest allowed length down to the shortest, then
 * the start position moves on by one character.
 */
public class CommonTextSelector implements TextSelector {
    protected String document;

    /** Start offset of the current candidate window. */
    protected int pos = 0;
    protected int maxSelectLen = 5;
    protected int minSelectLen = 2;

    /** Length of the candidate the next call to {@link #next()} returns. */
    protected int curLen;
    protected final int docLen;

    /**
     * @param document text to select candidates from
     * @param minSelectLen minimum candidate length
     * @param maxSelectLen maximum candidate length
     */
    public CommonTextSelector(String document, int minSelectLen,
            int maxSelectLen) {
        this.document = document;
        this.minSelectLen = minSelectLen;
        this.maxSelectLen = maxSelectLen;
        this.docLen = document.length();
        adjustCurLen();
    }

    /**
     * Accepts the candidate most recently returned by {@link #next()}: the
     * window moves to the first character behind that candidate.
     */
    @Override
    public void select() {
        // next() has already shortened the window by one; undo that to get
        // back the length of the candidate that was handed out.
        curLen = curLen + 1;
        pos = pos + curLen;
        adjustCurLen();
    }

    /**
     * Resets the window length for the current position to the maximum
     * length, or to whatever is left of the document.
     */
    protected void adjustCurLen() {
        if (pos + maxSelectLen > docLen) {
            curLen = docLen - pos;
        } else {
            curLen = maxSelectLen;
        }
    }

    /**
     * @return the next candidate, or an empty string when the current window
     *         does not yield one
     */
    @Override
    public String next() {
        if (curLen < minSelectLen) {
            // All lengths at this position are used up: move on.
            pos++;
            adjustCurLen();
        }
        final int len = curLen;
        // The window shrinks by one on every call, candidate or not.
        curLen = len - 1;
        if (len >= minSelectLen && pos + len <= docLen) {
            return document.substring(pos, pos + len);
        }
        return "";
    }

    @Override
    public boolean end() {
        if (curLen >= minSelectLen) {
            return false;
        }
        return curLen + pos >= docLen - 1;
    }

    @Override
    public int getCurPos() {
        return pos;
    }
}
TextSelector.java
package grid.text.selector;
/**
 * Iterator-like source of candidate substrings of a document. A typical
 * caller loops while {@link #end()} is false, fetches a candidate with
 * {@link #next()} and calls {@link #select()} when it accepts that candidate.
 */
public interface TextSelector {
/**
 * @return {@code true} when the caller should stop asking for candidates
 */
public boolean end();
/**
 * Accepts the candidate most recently returned by {@link #next()}, so that
 * the selection continues behind it.
 */
public void select();
/**
 * @return the next candidate; the implementation in this project returns
 * an empty string when the current position yields no candidate
 */
public String next();
/**
 * @return the start offset, in the document, of the current position
 */
public int getCurPos();
}
测试代码
NewWordDiscoverTest.java
package grid.test;
import grid.common.TextDatReader;
import grid.text.evolution.NewWordDiscover;
import grid.text.index.CnPreviewTextIndexer;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Scanner;
import java.util.Set;
/**
 * Command line driver for the new-word discovery: asks for a file name,
 * extracts the new words of that file (or of its pieces, for very large
 * files), prints them and appends them to {@code result.txt}.
 */
public class NewWordDiscoverTest {
    /**
     * Appends the given text to {@code result.txt} in the working directory,
     * creating the file when necessary. Failures are reported on stderr and
     * otherwise ignored.
     *
     * @param m text to append
     */
    public static void writefile(String m) {
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter("result.txt", true));
            writer.write(m);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the file handle even when writing failed.
            if (null != writer) {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Entry point: reads the file name (and, for files of 1 GB or more, the
     * decision whether to split) from standard input.
     *
     * @throws Exception if the given file does not exist or processing fails
     */
    @SuppressWarnings("resource")
    public static void main(String[] args) throws Exception {
        // Start from an empty result.txt so results are not duplicated.
        new File("result.txt").delete();
        // Not closed on purpose: closing the scanner would close System.in.
        Scanner scan = new Scanner(System.in);
        System.out.println("请输入您要处理的文件名称:\n");
        String path = scan.next();
        File file = new File(path);
        if (!file.exists() || (!file.isFile())) {
            throw new Exception("指定文件不存在!");
        }
        long maxsize = 1024 * 1024 * 1024; // 1 GB: larger files are split
        long size = 1024 * 1024 * 5; // each piece is at most 5 MB
        long fileLength = file.length();
        if (fileLength < maxsize) {
            System.out.println("开始提取文件……");
            discovrWord(path);
            return;
        }
        // Number of pieces the split produces (rounded up).
        int num = (int) ((fileLength + size - 1) / size);
        System.out.println("文件大小超出1G,是否开始进行文件切割?1:是 0:否\n");
        int t = scan.nextInt();
        if (t == 1) {
            TextDatReader.divide(path, size);
            System.out.println("切割完成\n");
            System.out.println("结果保存在当前目录下的dat文件夹中\n");
        }
        // NOTE(review): when the user answers 0 the pieces are still read
        // from ./dat, so they have to exist from an earlier run - confirm
        // that this is the intended behaviour.
        for (int m = 1; m <= num; m++) {
            String pathdived = "./dat/text" + m + ".dat";
            System.out.println("开始提取第" + m + "个文件……");
            discovrWord(pathdived);
        }
    }

    /**
     * Extracts the new words of one file, prints each word with its number
     * of occurrences and appends the same lines to {@code result.txt}.
     *
     * @param path file to analyse
     * @throws IOException if the file cannot be read
     */
    private static void discovrWord(String path) throws IOException {
        String document = TextDatReader.read(path);
        NewWordDiscover discover = new NewWordDiscover();
        Set<String> words = discover.discover(document);
        // Separate index used to report how often each new word occurs.
        CnPreviewTextIndexer ci = new CnPreviewTextIndexer(document);
        System.out.println("新词个数: " + words.size());
        System.out.println("发现的新词:" + "\n");
        StringBuilder report = new StringBuilder();
        for (String newword : words) {
            // Count once and reuse the line for console and file.
            String line = newword + "," + ci.count(newword) + "\n";
            System.out.println(line);
            report.append(line);
        }
        if (report.length() > 0) {
            // One append for the whole report instead of reopening the file
            // for every single word.
            writefile(report.toString());
        }
    }
}
抽词测试,结果如下
ParticipleTest.java
package grid.test;
import grid.text.participle.MechanicalParticiple;
import java.util.Vector;
public class ParticipleTest {
private static String document = "我是中国人";
public static void main(String args[]) {
MechanicalParticiple participle = new MechanicalParticiple();
Vector<String> vec = participle.partition(document);
System.out.println(vec);
}
}
分词测试,结果如下
怎么样,很酷吧,你还可以试着用《天龙八部》数据集玩下,看看主角是不是乔帮主。如果发现了什么新鲜词,请告诉博主,咱也不落后哈!
VIP独享–天龙八部新词,如果想看结果请心里默夸博主一百次
执行以上步骤后再送您一份哈利波特版的