Hanlp学习笔记
一、首先要引入mawen依赖包:
<dependency> <groupId>com.hankcs</groupId> <artifactId>hanlp</artifactId> <version>portable-1.7.2</version> </dependency> <dependency> <groupId>com.alibaba</groupId> <artifactId>druid</artifactId> <version>1.1.10</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.7.3</version> </dependency>
二、提取语句中的关键字
java.util.List<String> keyword = HanLP.extractKeyword(model.getExamineeAnswer(), model.getKeywordList().size());//extractKeyword方法第二个参数为获取关键字个数
,第一个参数为你要提取关键字的语句
三、计算两个语句的相似度
double result=getSimilarity(model.getStandardAnswer(),model.getExamineeAnswer());
计算相似度使用的方法
/* * 获得两个句子的相似度 * @param sentence1 * @param sentence2 * @return */ public static double getSimilarity(String sentence1, String sentence2) { List<String> sent1Words = getSplitWords(sentence1); System.out.println(sent1Words); List<String> sent2Words = getSplitWords(sentence2); System.out.println(sent2Words); List<String> allWords = mergeList(sent1Words, sent2Words); int[] statistic1 = statistic(allWords, sent1Words); int[] statistic2 = statistic(allWords, sent2Words); double dividend = 0; double divisor1 = 0; double divisor2 = 0; for (int i = 0; i < statistic1.length; i++) { dividend += statistic1[i] * statistic2[i]; divisor1 += Math.pow(statistic1[i], 2); divisor2 += Math.pow(statistic2[i], 2); } return dividend / (Math.sqrt(divisor1) * Math.sqrt(divisor2)); } private static int[] statistic(List<String> allWords, List<String> sentWords) { int[] result = new int[allWords.size()]; for (int i = 0; i < allWords.size(); i++) { result[i] = Collections.frequency(sentWords, allWords.get(i)); } return result; } private static List<String> mergeList(List<String> list1, List<String> list2) { List<String> result = new ArrayList<>(); result.addAll(list1); result.addAll(list2); return result.stream().distinct().collect(Collectors.toList()); } private static List<String> getSplitWords(String sentence) { // 去除掉html标签 sentence = Jsoup.parse(sentence.replace(" ","")).body().text(); // 标点符号会被单独分为一个Term,去除之 return HanLP.segment(sentence).stream().map(a -> a.word).
filter(s -> !"`~!@#$^&*()=|{}':;',\\[\\].<>/?~!@#¥……&*()——|{}【】‘;:”“'。,、? ".contains(s)).collect(Collectors.toList()); }
四、提取语句的摘要
List<String> sentenceList = HanLP.extractSummary(str, 3);//摘要
五、hanlp分词
List<Term> termList = NLPTokenizer.segment(str);
六、提取句子中的词
List<String> sentenceList= HanLP.extractPhrase(str, 3);//词
www.baidu.com