python之NLP词性标注
1、知识点
包括中文和英文的词性标注
主要使用的库是nltk和jieba
2、代码
# coding=utf-8
"""Part-of-speech tagging examples for English (nltk) and Chinese (jieba).

Tagging steps:
    1. Clean the text and tokenize.
    2. Tag each token.

FAQ (missing-resource errors raised by nltk):
    1. "Resource punkt not found": install the nltk 'punkt' model.
    2. Install the nltk 'averaged_perceptron_tagger' model.
    3. "Resource sinica_treebank not found": install the
       'sinica_treebank' corpus.
"""
import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown  # kept from original (currently unused)
import numpy as np  # kept from original (currently unused)


def english_label():
    """POS-tag an English paragraph with nltk.

    Tokenizes the text, strips punctuation tokens and English stop
    words, then tags the remaining tokens.

    Returns:
        list[tuple[str, str]]: (token, POS tag) pairs.
    """
    text = ("Sentiment analysis is a challenging subject in machine learning. "
            "People express their emotions in language that is often obscured "
            "by sarcasm, ambiguity, and plays on words, all of which could be "
            "very misleading for both humans and computers.").lower()
    tokens = nltk.word_tokenize(text)

    # Drop punctuation tokens emitted by the tokenizer.
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']',
                            '&', '!', '*', '@', '#', '$', '%']
    tokens = [word for word in tokens if word not in english_punctuations]

    # Drop English stop words (set built once for O(1) membership tests).
    stops = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stops]

    # was assigned to `list`, which shadowed the builtin
    tagged = nltk.pos_tag(tokens)
    print(tagged)
    return tagged


def chineses_label():
    """POS-tag a Chinese sentence with jieba's posseg module.

    fool and HanLP can also tag Chinese text; this example uses jieba.

    Returns:
        str: space-separated "word/tag" pairs.
    """
    import jieba.posseg as pseg
    import re

    # was assigned to `str`, which shadowed the builtin
    sentence = "我爱你,是粉色,舒服 ,舒服,士大夫"
    # Replace full-width commas with spaces before segmentation.
    cleaned = re.sub(r'[,]', " ", sentence)
    # pseg.cut returns a lazy generator of (word, flag) pairs; the
    # original printed its repr, which is meaningless noise — removed.
    pairs = pseg.cut(cleaned)
    result = ' '.join('%s/%s' % (word, tag) for (word, tag) in pairs)
    print(result)
    return result
本文来自博客园,作者:小白啊小白,Fighting,转载请注明原文链接:https://www.cnblogs.com/ywjfx/p/11026712.html