java_利用hanlp对文件“三国演义(罗贯中).txt”进行分词，去掉标点符号和停用词，最后统计词频，排序输出到文件“三国演义词频.txt“

配置好hanlp(从官网下载字典数据包:) 在这里插入图片描述

 package experiment8.exp3;
 
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import experiment6.exp4.Tuple;
 
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
 
/*对文件“三国演义(罗贯中).txt”进行分词，去掉标点符号和停用词，
最后统计词频，排序输出到文件“三国演义词频.txt"*/
public class StatisticsWords {
    public static void main(String[] args) {
        HashMap<String, Integer> map = new HashMap<>();//用于统计词频
        System.out.println("输入文件目录:"); //String fileName = new Scanner(System.in).nextLine();
          /*     File file =new File(fileName);//无法正常读入中文。
            Scanner scanner = new Scanner(file);*/
        String fileName = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment8\\三国演义(罗贯中).txt";
        /*读入目录: D:\ecloud\textbooks\java\experiment_doc\dataExperiment8\三国演义(罗贯中).txt
    D:\ecloud\textbooks\java\experiment_doc\dataExperiment8\english.txt    */
        String fileNameOut = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment8\\三国演义词频.txt";
 
        try {
            /*创建输入流和输出流*/
            FileInputStream inputStream = new FileInputStream(fileName);
            ;/*产生了一个(节点)流对象
            方便后期用流对其数据操作:一般是使用该流对象对应具有的方法进行操作:
            比如.read()方法来取用某些内容*/
            FileOutputStream outputStream = new FileOutputStream(fileNameOut);
 
            //String str;
            byte[] bytes = new byte[1024];/*申请一个byte[]类型的数组(对象),该对象的规格通过new Byte[1024] 来定义,用bytes引用变量类管理这个数组对象;
            这个数组用于存储届时读取的数据,另外注意不要使用byte的包装类Byte,两者有所不容*/
            int len = 0;
            try {
                while ((len = inputStream.read(bytes)) != -1) {
                    //System.out.println(new String(bytes,"utf-8"));//默认是用gbk编码,所以灾打印输出的时候将出现乱码(故这里指定用utf-8)来处理.
                    //String(byte[] bytes, int offset, int length):把字节数组的一部分转换为字符串
//                    StringTokenizer tokenizer = new StringTokenizer(new String(bytes/*, StandardCharsets.UTF_8*/), "《》；，。！【】‘? ”“：￥~—`\\？|");/*《》；，。！【】‘ “：￥~`\?  */
//                    String str;
//                    while (tokenizer.hasMoreElements()) {
//                        str = (String) tokenizer.nextElement();
//                        if (map.containsKey(str)) {
//                            map.put(str, map.get(str) + 1);
//                        } else {
//                            map.put(str, 1);
//                        }
//                        //System.out.println(str);
//                    }//endWhileTokenizer
                    /*使用hanlp包来做：此处受教于同学.*/
                    String s = new String(bytes, StandardCharsets.UTF_8);
                    /*System.out.println(HanLP.segment(s));*/
                  /*  CoreStopWordDictionary.add("??");
                    CoreStopWordDictionary.add(" ");*/
 
                    /*hanlp处理字符串s:光用函数的效果并不如意：*/
      /*              for (Term x : HanLP.segment(s)) {
                        str = x.toString();
                        if (!CoreStopWordDictionary.contains(str)) {
                            if (map.containsKey(str)) {
                                map.put(str, map.get(str) + 1);
                            } else {
                                map.put(str, 1);
                            }
                        }
                    }//endfor
              */
                    /*使用Term类中的nature成员来处理符号问题比较满意。*/
                    //Segment segment = HanLP.newSegment().enableNameRecognize(true);
 
                    List<Term> termList = StandardTokenizer.segment(s);/*标准分词器*/
                   // System.out.println(termList);//查看分词效果,发现符号和词语都别列入列表中。
                    for (Term x : termList) {
 
//                        if (x.nature == Nature.w) {//Term类下的nature成员是词性。而Nature类里有诸词性（名词，连词，乃至人名等其他分类标准..等细致的划分。public static final Nature nr = new Nature("nr");nr成员就是人名,w就是标点符号。但注意统计人名时应当是x.nature!=Nature.nr作为条件
//                            continue;/*如果比较得知该x的属性不是人名，那么跳过后续的统计操作，判断下个记录*/
//                        }
                        if (x.nature!= Nature.nr) {
                            continue;
                        }
 
                        String str = x.toString();
                        if (map.containsKey(str)) {
                            map.put(str, map.get(str) + 1);
                        } else {
                            map.put(str, 1);
                        }
                    }
 
                }
            }//whileEndIO 要特别注意io流（的关闭）不能在循环里写（出现）
            catch (IOException ioException) {
                ioException.printStackTrace();
            }
//
            List<Tuple> list = new ArrayList<>();
            StringBuffer stringBuffer = new StringBuffer();
            for (String x : map.keySet()) {
                list.add(new Tuple(x, map.get(x)));
            }//endFor
            //System.out.println(stringBuffer);
            /*创建Tuple对象，并将Tuple对象放入列表中一遍执行排序：*/
            Collections.sort(list, new Comparator<>() {
                @Override
                public int compare(Tuple o1, Tuple o2) {
                    return o2.getValue() - o1.getValue();
                }
            });
            /*链接成一整个字符串*/
            for (Tuple x : list) {
                stringBuffer.append(x);
            }
           System.out.println(stringBuffer);
            /*写入文件*/
            outputStream.write(stringBuffer.toString().getBytes());
            /*关闭io流，释放资源*/
            inputStream.close();
            outputStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
 
 
    }
}

 package experiment6.exp4;
 
public class Tuple {
    String string;
    int num;
 
    public Tuple(String string, int num) {
        this.string = string;
        this.num = num;
    }
 
    public String getKey() {
        return string;
    }
 
    public int getValue() {
        return num;
    }
 
    @Override
    public String toString() {
        return getKey()+"\t"+getValue() +"\n";// +"\n"
    }
}

posted @ 2023-08-18 10:18 xuchaoxin1375 阅读(25) 评论(0) 编辑收藏举报来源

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

· java_倒排索引(单线程版+多线程(初)版)

· hanlp分词

· Spark下中文分词常用项目

· HanLP02

阅读排行：
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了

公告

昵称： xuchaoxin1375
园龄： 4年10个月
粉丝： 1
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

xuchaoxin1375

java_利用hanlp对文件“三国演义(罗贯中).txt”进行分词，去掉标点符号和停用词，最后统计词频，排序输出到文件“三国演义词频.txt“

公告

搜索

常用链接

随笔档案

阅读排行榜

推荐排行榜

	package experiment8.exp3;

	import com.hankcs.hanlp.corpus.tag.Nature;
	import com.hankcs.hanlp.seg.common.Term;
	import com.hankcs.hanlp.tokenizer.StandardTokenizer;
	import experiment6.exp4.Tuple;

	import java.io.FileInputStream;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.nio.charset.StandardCharsets;
	import java.util.*;

	/*对文件“三国演义(罗贯中).txt”进行分词，去掉标点符号和停用词，
	最后统计词频，排序输出到文件“三国演义词频.txt"*/
	public class StatisticsWords {
	public static void main(String[] args) {
	HashMap<String, Integer> map = new HashMap<>();//用于统计词频
	System.out.println("输入文件目录:"); //String fileName = new Scanner(System.in).nextLine();
	/* File file =new File(fileName);//无法正常读入中文。
	Scanner scanner = new Scanner(file);*/
	String fileName = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment8\\三国演义(罗贯中).txt";
	/*读入目录: D:\ecloud\textbooks\java\experiment_doc\dataExperiment8\三国演义(罗贯中).txt
	D:\ecloud\textbooks\java\experiment_doc\dataExperiment8\english.txt */
	String fileNameOut = "D:\\ecloud\\textbooks\\java\\experiment_doc\\dataExperiment8\\三国演义词频.txt";

	try {
	/创建输入流和输出流/
	FileInputStream inputStream = new FileInputStream(fileName);
	;/*产生了一个(节点)流对象
	方便后期用流对其数据操作:一般是使用该流对象对应具有的方法进行操作:
	比如.read()方法来取用某些内容*/
	FileOutputStream outputStream = new FileOutputStream(fileNameOut);

	//String str;
	byte[] bytes = new byte[1024];/*申请一个byte[]类型的数组(对象),该对象的规格通过new Byte[1024] 来定义,用bytes引用变量类管理这个数组对象;
	这个数组用于存储届时读取的数据,另外注意不要使用byte的包装类Byte,两者有所不容*/
	int len = 0;
	try {
	while ((len = inputStream.read(bytes)) != -1) {
	//System.out.println(new String(bytes,"utf-8"));//默认是用gbk编码,所以灾打印输出的时候将出现乱码(故这里指定用utf-8)来处理.
	//String(byte[] bytes, int offset, int length):把字节数组的一部分转换为字符串
	// StringTokenizer tokenizer = new StringTokenizer(new String(bytes/, StandardCharsets.UTF_8/), "《》；，。！【】‘? ”“：￥~—`\\？\|");/《》；，。！【】‘ “：￥~`\? /
	// String str;
	// while (tokenizer.hasMoreElements()) {
	// str = (String) tokenizer.nextElement();
	// if (map.containsKey(str)) {
	// map.put(str, map.get(str) + 1);
	// } else {
	// map.put(str, 1);
	// }
	// //System.out.println(str);
	// }//endWhileTokenizer
	/使用hanlp包来做：此处受教于同学./
	String s = new String(bytes, StandardCharsets.UTF_8);
	/System.out.println(HanLP.segment(s));/
	/* CoreStopWordDictionary.add("??");
	CoreStopWordDictionary.add(" ");*/

	/hanlp处理字符串s:光用函数的效果并不如意：/
	/* for (Term x : HanLP.segment(s)) {
	str = x.toString();
	if (!CoreStopWordDictionary.contains(str)) {
	if (map.containsKey(str)) {
	map.put(str, map.get(str) + 1);
	} else {
	map.put(str, 1);
	}
	}
	}//endfor
	*/
	/使用Term类中的nature成员来处理符号问题比较满意。/
	//Segment segment = HanLP.newSegment().enableNameRecognize(true);

	List<Term> termList = StandardTokenizer.segment(s);/标准分词器/
	// System.out.println(termList);//查看分词效果,发现符号和词语都别列入列表中。
	for (Term x : termList) {

	// if (x.nature == Nature.w) {//Term类下的nature成员是词性。而Nature类里有诸词性（名词，连词，乃至人名等其他分类标准..等细致的划分。public static final Nature nr = new Nature("nr");nr成员就是人名,w就是标点符号。但注意统计人名时应当是x.nature!=Nature.nr作为条件
	// continue;/如果比较得知该x的属性不是人名，那么跳过后续的统计操作，判断下个记录/
	// }
	if (x.nature!= Nature.nr) {
	continue;
	}

	String str = x.toString();
	if (map.containsKey(str)) {
	map.put(str, map.get(str) + 1);
	} else {
	map.put(str, 1);
	}
	}

	}
	}//whileEndIO 要特别注意io流（的关闭）不能在循环里写（出现）
	catch (IOException ioException) {
	ioException.printStackTrace();
	}
	//
	List<Tuple> list = new ArrayList<>();
	StringBuffer stringBuffer = new StringBuffer();
	for (String x : map.keySet()) {
	list.add(new Tuple(x, map.get(x)));
	}//endFor
	//System.out.println(stringBuffer);
	/创建Tuple对象，并将Tuple对象放入列表中一遍执行排序：/
	Collections.sort(list, new Comparator<>() {
	@Override
	public int compare(Tuple o1, Tuple o2) {
	return o2.getValue() - o1.getValue();
	}
	});
	/链接成一整个字符串/
	for (Tuple x : list) {
	stringBuffer.append(x);
	}
	System.out.println(stringBuffer);
	/写入文件/
	outputStream.write(stringBuffer.toString().getBytes());
	/关闭io流，释放资源/
	inputStream.close();
	outputStream.close();
	} catch (IOException e) {
	e.printStackTrace();
	}


	}
	}

	package experiment6.exp4;

	public class Tuple {
	String string;
	int num;

	public Tuple(String string, int num) {
	this.string = string;
	this.num = num;
	}

	public String getKey() {
	return string;
	}

	public int getValue() {
	return num;
	}

	@Override
	public String toString() {
	return getKey()+"\t"+getValue() +"\n";// +"\n"
	}
	}

xuchaoxin1375

java_利用hanlp对文件“三国演义(罗贯中).txt”进行分词，去掉标点符号和停用词， 最后统计词频，排序输出到文件“三国演义词频.txt“

公告

搜索

常用链接

随笔档案

阅读排行榜

推荐排行榜

java_利用hanlp对文件“三国演义(罗贯中).txt”进行分词，去掉标点符号和停用词，最后统计词频，排序输出到文件“三国演义词频.txt“