NLTK Data Cleaning Example
Contents
Data Cleaning
- Remove extra whitespace
- Remove unwanted special characters
- Remove useless content such as URLs
Cleaning with regular expressions and stopwords
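Both `stopwords.words('english')` and `word_tokenize` rely on NLTK data packages that are not shipped with the library itself. If they are not already installed on your machine, a one-time download along the following lines is usually needed (a minimal sketch; on newer NLTK releases the tokenizer resource may be named 'punkt_tab' instead of 'punkt'):

import nltk

nltk.download('stopwords')  # English stopword list used below
nltk.download('punkt')      # tokenizer models used by word_tokenize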
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Input data
s = '    RT @Amila #Test\nTom\'s newly listed Co  &amp; Mary\'s unlisted     Group to supply tech for nlTK.\nh $TSLA $AAPL https:// t.co/x34afsfQsh'
# Cache the English stopword list
cache_english_stopwords = stopwords.words('english')
def text_clean(text):
    print('Original text:', text, '\n')
    # Remove HTML entities (e.g. &amp;), hashtags and @mentions
    text_no_special_entities = re.sub(r'\&\w*;|#\w*|@\w*', '', text)
    print('After removing special entities:', text_no_special_entities, '\n')
    # Remove ticker symbols (e.g. $TSLA)
    text_no_tickers = re.sub(r'\$\w*', '', text_no_special_entities)
    print('After removing ticker symbols:', text_no_tickers, '\n')
    # Remove hyperlinks
    text_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', text_no_tickers)
    print('After removing hyperlinks:', text_no_hyperlinks, '\n')
    # Remove short words (1-2 characters), e.g. abbreviations
    text_no_small_words = re.sub(r'\b\w{1,2}\b', '', text_no_hyperlinks)
    print('After removing short words:', text_no_small_words, '\n')
    # Collapse extra whitespace
    text_no_whitespace = re.sub(r'\s\s+', ' ', text_no_small_words)
    text_no_whitespace = text_no_whitespace.lstrip(' ')
    print('After removing extra whitespace:', text_no_whitespace, '\n')
    # Tokenize
    tokens = word_tokenize(text_no_whitespace)
    print('Tokens:', tokens, '\n')
    # Remove stopwords
    list_no_stopwords = [i for i in tokens if i not in cache_english_stopwords]
    print('After removing stopwords:', list_no_stopwords, '\n')
    # Final filtered result
    text_filtered = ' '.join(list_no_stopwords)  # ''.join() would join without spaces between words.
    print('Filtered text:', text_filtered)
text_clean(s)
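With the sample tweet above, the final filtered text should come out roughly as "Tom ' newly listed Mary ' unlisted Group supply tech nlTK ." (the stray apostrophes remain because the tokenizer splits them off as separate tokens that are not in the stopword list; exact tokens may vary slightly between NLTK versions).

For reuse on more than one document, it can be convenient to wrap the same steps in a helper that returns the tokens instead of printing every stage. The sketch below is only an illustrative variant (the name clean_tokens is not part of the original example); it assumes the same regexes and stopword cache defined above:

def clean_tokens(text):
    """Apply the same cleaning steps as text_clean, but return the tokens."""
    text = re.sub(r'\&\w*;|#\w*|@\w*', '', text)    # entities, hashtags, mentions
    text = re.sub(r'\$\w*', '', text)               # ticker symbols
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)  # hyperlinks
    text = re.sub(r'\b\w{1,2}\b', '', text)         # words of 1-2 characters
    text = re.sub(r'\s\s+', ' ', text).lstrip(' ')  # extra whitespace
    return [t for t in word_tokenize(text) if t not in cache_english_stopwords]

cleaned = [clean_tokens(doc) for doc in [s]]  # e.g. clean a small batch of texts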