java_inverted index (single-threaded version + initial multi-threaded version)

package experiment10.exp2;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import experiment9.FileName;

import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/* Inverted index
   The inverted index is a core technique of search engines: to search huge volumes of text
   (documents, web pages), a search engine has to build a search index. */
public class ReverseIndex {
    public static void main(String[] args) throws IOException {
        BufferedReader bufferedReader1 = new BufferedReader(new FileReader(FileName.fileName10_1, Charset.forName("utf-8")));
        BufferedReader bufferedReader2 = new BufferedReader(new FileReader(FileName.fileName10_2, Charset.forName("utf-8")));
        BufferedReader bufferedReader3 = new BufferedReader(new FileReader(FileName.fileName10_3, Charset.forName("utf-8")));
        // Output:
        BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(FileName.fileNameOut10_1));
        HashMap<String, Integer> countMap1 = new HashMap<>();
        HashMap<String, Integer> countMap2 = new HashMap<>();
        HashMap<String, Integer> countMap3 = new HashMap<>();
        // List<WordNode> list = new ArrayList<>(); // aggregated statistics.
        HashMap<String, WordNode> map = new HashMap<>();
        String line = ""; // a variable that is only passed as an argument should still be initialized.
        // Call countWords to count the word frequencies of each document, then merge them into the same map.
        countWords(bufferedReader1, countMap1, line);
        for (String word : countMap1.keySet()) {
            WordNode node = new WordNode(word);
            node.addToList("倚天屠龙记.txt->" + countMap1.get(word));
            map.put(word, node);
        }
        countWords(bufferedReader2, countMap2, line);
        for (String word : countMap2.keySet()) {
            if (!map.containsKey(word)) {
                WordNode node = new WordNode(word);
                node.addToList("鹿鼎记.txt->" + countMap2.get(word));
                map.put(word, node);
            } else {
                map.get(word).addToList("鹿鼎记.txt->" + countMap2.get(word));
            }
        }
        countWords(bufferedReader3, countMap3, line);
        for (String word : countMap3.keySet()) {
            if (!map.containsKey(word)) {
                WordNode node = new WordNode(word);
                node.addToList("笑傲江湖.txt->" + countMap3.get(word));
                map.put(word, node);
            } else {
                map.get(word).addToList("笑傲江湖.txt->" + countMap3.get(word));
            }
        }
        // Print
        StringBuffer stringBuffer = new StringBuffer();
        for (String item : map.keySet()) {
            stringBuffer.append(map.get(item));
        }
        System.out.println(stringBuffer);
        bufferedWriter.write(stringBuffer.toString());
        bufferedWriter.flush();
        bufferedReader1.close();
        bufferedReader2.close();
        bufferedReader3.close();
        bufferedWriter.close();
    } // end main

    // countWords(); overloading could also be used to emulate default arguments (but that only makes sense for constants).
    static void countWords(BufferedReader bufferedReader, HashMap<String, Integer> countMap, String line) throws IOException {
        while ((line = bufferedReader.readLine()) != null) {
            // Count word frequencies for one document.
            List<Term> segment = HanLP.segment(line);
            for (Term x : segment) {
                // if (x.nature == Nature.w /*|| x.nature == Nature.e*/) continue; // optionally skip certain parts of speech
                String keyString = x.toString();
                if (countMap.containsKey(keyString)) {
                    countMap.put(keyString, countMap.get(keyString) + 1);
                } else {
                    countMap.put(keyString, 1);
                }
            }
        } // end while
    }
}
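
As an aside, the containsKey/put pair in countWords can be collapsed with Map.merge from the standard library; this is only a sketch of an alternative, not part of the original code, and the behaviour is the same:

for (Term x : segment) {
    // merge() stores 1 for a key seen for the first time, otherwise adds 1 to the existing count.
    countMap.merge(x.toString(), 1, Integer::sum);
}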

A helper class, WordNode, is used to collect the statistics for each word.

package experiment10.exp2;

import java.util.ArrayList;
import java.util.List;

public class WordNode {
    String word;
    List<String> list = new ArrayList<>();

    public WordNode(String word) {
        this.word = word;
        // this.list = list;
    }

    public String getWord() {
        return word;
    }

    public List<String> getList() {
        return list;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public void addToList(String fileNameAndFrequency) {
        list.add(fileNameAndFrequency);
    }

    @Override
    public String toString() {
        StringBuffer buffer = new StringBuffer();
        for (String phrase : list) {
            buffer.append(phrase + "\t");
        }
        return word + "\t" + buffer + "\n";
    }
}
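
Each entry of the finished map ties a word to a WordNode whose list holds its "file->count" postings, so querying the inverted index is just a map lookup. A minimal sketch (the query word and the counts are hypothetical, not taken from a real run):

WordNode node = map.get("武功"); // postings list for one word, if it was seen
if (node != null) {
    // toString() yields the word followed by tab-separated postings,
    // e.g. 武功  倚天屠龙记.txt->123  鹿鼎记.txt->45  笑傲江湖.txt->67  (hypothetical counts)
    System.out.print(node);
}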
The initial multi-threaded version: the three per-document counting jobs each run on their own thread, and main waits for all of them before writing the result.

package experiment10.exp2;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import experiment9.FileName;

import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/* Inverted index
   The inverted index is a core technique of search engines: to search huge volumes of text
   (documents, web pages), a search engine has to build a search index. */
public class ReverseIndex {
    public static void main(String[] args) throws IOException, InterruptedException {
        BufferedReader bufferedReader1 = new BufferedReader(new FileReader(FileName.fileName10_1, Charset.forName("utf-8")));
        BufferedReader bufferedReader2 = new BufferedReader(new FileReader(FileName.fileName10_2, Charset.forName("utf-8")));
        BufferedReader bufferedReader3 = new BufferedReader(new FileReader(FileName.fileName10_3, Charset.forName("utf-8")));
        // Output:
        BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(FileName.fileNameOut10_1));
        HashMap<String, Integer> countMap1 = new HashMap<>();
        HashMap<String, Integer> countMap2 = new HashMap<>();
        HashMap<String, Integer> countMap3 = new HashMap<>();
        // List<WordNode> list = new ArrayList<>(); // aggregated statistics.
        HashMap<String, WordNode> map = new HashMap<>();
        String line = ""; // a variable that is only passed as an argument should still be initialized.
        /* The three countWords jobs run on three threads. The counting itself is independent,
         * because each thread fills its own countMap. The merge step, however, writes
         * <String, WordNode> entries into the same shared map: if the word key already exists,
         * the entry "document name -> number of occurrences of the word in that document" is
         * appended to its WordNode. Note that a plain HashMap is not thread-safe for these
         * concurrent writes, so this initial version still needs synchronization
         * (see the sketch after the listing). */
        Thread thread1 = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    countWords(bufferedReader1, countMap1, line);
                    bufferedReader1.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                for (String word : countMap1.keySet()) {
                    WordNode node = new WordNode(word);
                    node.addToList("倚天屠龙记.txt->" + countMap1.get(word));
                    map.put(word, node);
                }
            }
        });
        thread1.start();
        Thread thread2 = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    countWords(bufferedReader2, countMap2, line);
                    bufferedReader2.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                for (String word : countMap2.keySet()) {
                    if (!map.containsKey(word)) {
                        WordNode node = new WordNode(word);
                        node.addToList("鹿鼎记.txt->" + countMap2.get(word));
                        map.put(word, node);
                    } else {
                        map.get(word).addToList("鹿鼎记.txt->" + countMap2.get(word));
                    }
                }
            }
        });
        thread2.start();
        Thread thread3 = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    countWords(bufferedReader3, countMap3, line);
                    bufferedReader3.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                for (String word : countMap3.keySet()) {
                    if (!map.containsKey(word)) {
                        WordNode node = new WordNode(word);
                        node.addToList("笑傲江湖.txt->" + countMap3.get(word));
                        map.put(word, node);
                    } else {
                        map.get(word).addToList("笑傲江湖.txt->" + countMap3.get(word));
                    }
                }
            }
        });
        thread3.start();
        /* Before main writes the output file it must wait for all three worker threads to finish,
           otherwise the result would be incomplete (join is used to wait for them). */
        thread1.join();
        thread2.join();
        thread3.join();
        // Write to the output file:
        StringBuffer stringBuffer = new StringBuffer();
        for (String item : map.keySet()) {
            stringBuffer.append(map.get(item));
        }
        /* Also watch the result on the console (the console may not show everything: earlier output
           is usually scrolled away, so only the tail is visible). */
        System.out.println(stringBuffer);
        bufferedWriter.write(stringBuffer.toString());
        bufferedWriter.flush();
        bufferedWriter.close();
    } // end main

    // countWords(); overloading could also be used to emulate default arguments (but that only makes sense for constants).
    static void countWords(BufferedReader bufferedReader, HashMap<String, Integer> countMap, String line) throws IOException {
        while ((line = bufferedReader.readLine()) != null) {
            // Count word frequencies for one document.
            List<Term> segment = HanLP.segment(line);
            for (Term x : segment) {
                // if (x.nature == Nature.w /*|| x.nature == Nature.e*/) continue; // optionally skip certain parts of speech
                String keyString = x.toString();
                if (countMap.containsKey(keyString)) {
                    countMap.put(keyString, countMap.get(keyString) + 1);
                } else {
                    countMap.put(keyString, 1);
                }
            }
        } // end while
    }
}
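
Because all three worker threads above write into the same plain HashMap, the merge step can lose entries or corrupt the map under contention. Below is a minimal sketch of a thread-safe merge, assuming the rest of the program stays as written; the helper mergeInto and the switch to ConcurrentHashMap are additions of this note, not part of the original code:

import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;

// Shared index declared as ConcurrentHashMap instead of HashMap:
// ConcurrentHashMap<String, WordNode> map = new ConcurrentHashMap<>();

// Each worker thread calls this once its own countMap is filled,
// e.g. mergeInto(map, countMap1, "倚天屠龙记.txt").
static void mergeInto(ConcurrentHashMap<String, WordNode> map,
                      HashMap<String, Integer> countMap, String docName) {
    for (String word : countMap.keySet()) {
        // computeIfAbsent is atomic on ConcurrentHashMap: only one thread creates the node.
        WordNode node = map.computeIfAbsent(word, WordNode::new);
        // WordNode's ArrayList is not thread-safe, so lock the node while appending.
        synchronized (node) {
            node.addToList(docName + "->" + countMap.get(word));
        }
    }
}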