java_inverted index (single-threaded version + initial multi-threaded version)

package experiment10.exp2;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import experiment9.FileName;

import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/* Inverted index
   The inverted index is a core technique of search engines: to search huge volumes of text
   (documents, web pages), a search engine has to build a search index. */
public class ReverseIndex {
    public static void main(String[] args) throws IOException {
        BufferedReader bufferedReader1 = new BufferedReader(new FileReader(FileName.fileName10_1, Charset.forName("utf-8")));
        BufferedReader bufferedReader2 = new BufferedReader(new FileReader(FileName.fileName10_2, Charset.forName("utf-8")));
        BufferedReader bufferedReader3 = new BufferedReader(new FileReader(FileName.fileName10_3, Charset.forName("utf-8")));
        // Output:
        BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(FileName.fileNameOut10_1));
        HashMap<String, Integer> countMap1 = new HashMap<>();
        HashMap<String, Integer> countMap2 = new HashMap<>();
        HashMap<String, Integer> countMap3 = new HashMap<>();
        // List<WordNode> list = new ArrayList<>(); // aggregated statistics.
        HashMap<String, WordNode> map = new HashMap<>();
        String line = ""; // a variable that is only passed as an argument should still be initialized.
        // Call countWords to count the word frequencies of each document, then merge them into the same map.
        countWords(bufferedReader1, countMap1, line);
        for (String word : countMap1.keySet()) {
            WordNode node = new WordNode(word);
            node.addToList("倚天屠龙记.txt->" + countMap1.get(word));
            map.put(word, node);
        }
        countWords(bufferedReader2, countMap2, line);
        for (String word : countMap2.keySet()) {
            if (!map.containsKey(word)) {
                WordNode node = new WordNode(word);
                node.addToList("鹿鼎记.txt->" + countMap2.get(word));
                map.put(word, node);
            } else {
                map.get(word).addToList("鹿鼎记.txt->" + countMap2.get(word));
            }
        }
        countWords(bufferedReader3, countMap3, line);
        for (String word : countMap3.keySet()) {
            if (!map.containsKey(word)) {
                WordNode node = new WordNode(word);
                node.addToList("笑傲江湖.txt->" + countMap3.get(word));
                map.put(word, node);
            } else {
                map.get(word).addToList("笑傲江湖.txt->" + countMap3.get(word));
            }
        }
        // Print
        StringBuffer stringBuffer = new StringBuffer();
        for (String item : map.keySet()) {
            stringBuffer.append(map.get(item));
        }
        System.out.println(stringBuffer);
        bufferedWriter.write(stringBuffer.toString());
        bufferedWriter.flush();
        bufferedReader1.close();
        bufferedReader2.close();
        bufferedReader3.close();
        bufferedWriter.close();
    } // end main

    // countWords(); overloading could also be used to emulate default arguments (but that only makes sense for constants).
    static void countWords(BufferedReader bufferedReader, HashMap<String, Integer> countMap, String line) throws IOException {
        while ((line = bufferedReader.readLine()) != null) {
            // Count word frequencies for one document.
            List<Term> segment = HanLP.segment(line);
            for (Term x : segment) {
                // if (x.nature == Nature.w /*|| x.nature == Nature.e*/) continue; // optionally skip certain parts of speech
                String keyString = x.toString();
                if (countMap.containsKey(keyString)) {
                    countMap.put(keyString, countMap.get(keyString) + 1);
                } else {
                    countMap.put(keyString, 1);
                }
            }
        } // end while
    }
}
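
As an aside, the containsKey/put pair in countWords can be collapsed with Map.merge from the standard library; this is only a sketch of an alternative, not part of the original code, and the behaviour is the same:

for (Term x : segment) {
    // merge() stores 1 for a key seen for the first time, otherwise adds 1 to the existing count.
    countMap.merge(x.toString(), 1, Integer::sum);
}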

A helper class, WordNode, is used to collect the statistics for each word.

package experiment10.exp2;

import java.util.ArrayList;
import java.util.List;

public class WordNode {
    String word;
    List<String> list = new ArrayList<>();

    public WordNode(String word) {
        this.word = word;
        // this.list = list;
    }

    public String getWord() {
        return word;
    }

    public List<String> getList() {
        return list;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public void addToList(String fileNameAndFrequency) {
        list.add(fileNameAndFrequency);
    }

    @Override
    public String toString() {
        StringBuffer buffer = new StringBuffer();
        for (String phrase : list) {
            buffer.append(phrase + "\t");
        }
        return word + "\t" + buffer + "\n";
    }
}
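
Each entry of the finished map ties a word to a WordNode whose list holds its "file->count" postings, so querying the inverted index is just a map lookup. A minimal sketch (the query word and the counts are hypothetical, not taken from a real run):

WordNode node = map.get("武功"); // postings list for one word, if it was seen
if (node != null) {
    // toString() yields the word followed by tab-separated postings,
    // e.g. 武功  倚天屠龙记.txt->123  鹿鼎记.txt->45  笑傲江湖.txt->67  (hypothetical counts)
    System.out.print(node);
}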
The initial multi-threaded version: the three per-document counting jobs each run on their own thread, and main waits for all of them before writing the result.

package experiment10.exp2;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import experiment9.FileName;

import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/* Inverted index
   The inverted index is a core technique of search engines: to search huge volumes of text
   (documents, web pages), a search engine has to build a search index. */
public class ReverseIndex {
    public static void main(String[] args) throws IOException, InterruptedException {
        BufferedReader bufferedReader1 = new BufferedReader(new FileReader(FileName.fileName10_1, Charset.forName("utf-8")));
        BufferedReader bufferedReader2 = new BufferedReader(new FileReader(FileName.fileName10_2, Charset.forName("utf-8")));
        BufferedReader bufferedReader3 = new BufferedReader(new FileReader(FileName.fileName10_3, Charset.forName("utf-8")));
        // Output:
        BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(FileName.fileNameOut10_1));
        HashMap<String, Integer> countMap1 = new HashMap<>();
        HashMap<String, Integer> countMap2 = new HashMap<>();
        HashMap<String, Integer> countMap3 = new HashMap<>();
        // List<WordNode> list = new ArrayList<>(); // aggregated statistics.
        HashMap<String, WordNode> map = new HashMap<>();
        String line = ""; // a variable that is only passed as an argument should still be initialized.
        /* The three countWords jobs run on three threads. The counting itself is independent,
         * because each thread fills its own countMap. The merge step, however, writes
         * <String, WordNode> entries into the same shared map: if the word key already exists,
         * the entry "document name -> number of occurrences of the word in that document" is
         * appended to its WordNode. Note that a plain HashMap is not thread-safe for these
         * concurrent writes, so this initial version still needs synchronization
         * (see the sketch after the listing). */
        Thread thread1 = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    countWords(bufferedReader1, countMap1, line);
                    bufferedReader1.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                for (String word : countMap1.keySet()) {
                    WordNode node = new WordNode(word);
                    node.addToList("倚天屠龙记.txt->" + countMap1.get(word));
                    map.put(word, node);
                }
            }
        });
        thread1.start();
        Thread thread2 = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    countWords(bufferedReader2, countMap2, line);
                    bufferedReader2.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                for (String word : countMap2.keySet()) {
                    if (!map.containsKey(word)) {
                        WordNode node = new WordNode(word);
                        node.addToList("鹿鼎记.txt->" + countMap2.get(word));
                        map.put(word, node);
                    } else {
                        map.get(word).addToList("鹿鼎记.txt->" + countMap2.get(word));
                    }
                }
            }
        });
        thread2.start();
        Thread thread3 = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    countWords(bufferedReader3, countMap3, line);
                    bufferedReader3.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                for (String word : countMap3.keySet()) {
                    if (!map.containsKey(word)) {
                        WordNode node = new WordNode(word);
                        node.addToList("笑傲江湖.txt->" + countMap3.get(word));
                        map.put(word, node);
                    } else {
                        map.get(word).addToList("笑傲江湖.txt->" + countMap3.get(word));
                    }
                }
            }
        });
        thread3.start();
        /* Before main writes the output file it must wait for all three worker threads to finish,
           otherwise the result would be incomplete (join is used to wait for them). */
        thread1.join();
        thread2.join();
        thread3.join();
        // Write to the output file:
        StringBuffer stringBuffer = new StringBuffer();
        for (String item : map.keySet()) {
            stringBuffer.append(map.get(item));
        }
        /* Also watch the result on the console (the console may not show everything: earlier output
           is usually scrolled away, so only the tail is visible). */
        System.out.println(stringBuffer);
        bufferedWriter.write(stringBuffer.toString());
        bufferedWriter.flush();
        bufferedWriter.close();
    } // end main

    // countWords(); overloading could also be used to emulate default arguments (but that only makes sense for constants).
    static void countWords(BufferedReader bufferedReader, HashMap<String, Integer> countMap, String line) throws IOException {
        while ((line = bufferedReader.readLine()) != null) {
            // Count word frequencies for one document.
            List<Term> segment = HanLP.segment(line);
            for (Term x : segment) {
                // if (x.nature == Nature.w /*|| x.nature == Nature.e*/) continue; // optionally skip certain parts of speech
                String keyString = x.toString();
                if (countMap.containsKey(keyString)) {
                    countMap.put(keyString, countMap.get(keyString) + 1);
                } else {
                    countMap.put(keyString, 1);
                }
            }
        } // end while
    }
}
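
Because all three worker threads above write into the same plain HashMap, the merge step can lose entries or corrupt the map under contention. Below is a minimal sketch of a thread-safe merge, assuming the rest of the program stays as written; the helper mergeInto and the switch to ConcurrentHashMap are additions of this note, not part of the original code:

import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

// Shared index declared as ConcurrentHashMap instead of HashMap:
// ConcurrentHashMap<String, WordNode> map = new ConcurrentHashMap<>();

// Each worker thread calls this once its own countMap is filled,
// e.g. mergeInto(map, countMap1, "倚天屠龙记.txt").
static void mergeInto(ConcurrentHashMap<String, WordNode> map,
                      HashMap<String, Integer> countMap, String docName) {
    for (String word : countMap.keySet()) {
        // computeIfAbsent is atomic on ConcurrentHashMap: only one thread creates the node.
        WordNode node = map.computeIfAbsent(word, WordNode::new);
        // WordNode's ArrayList is not thread-safe, so lock the node while appending.
        synchronized (node) {
            node.addToList(docName + "->" + countMap.get(word));
        }
    }
}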