Data Cleaning, Part 2
Word Frequency Statistics
1. Handling words hyphenated across lines
English papers often hyphenate a word across a line break, splitting it in two, so these splits are repaired before any other processing. The approach: read the plain-text file line by line, join the lines into one string, and then use a regular expression to merge the hyphenated fragments.
import re

def open_file(file_path):
    with open(file_path, encoding='utf-8') as f:
        lines = f.readlines()
    stripped = [line.strip() for line in lines]
    joined = " ".join(stripped)
    # merge words hyphenated across line breaks: "infor- mation" -> "information"
    merged = re.sub(r'-\s', '', joined)
    return merged
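As a quick sanity check of the hyphen merge (the two strings here are invented examples):

lines = ["statistical infor-", "mation retrieval"]
merged = re.sub(r'-\s', '', " ".join(line.strip() for line in lines))
print(merged)  # statistical information retrieval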
2. Expanding contractions with regular expressions
# Contraction patterns; the original post defines these elsewhere, so the
# definitions below are assumed reconstructions.
pat_letter = re.compile(r"[^a-zA-Z \']+")
pat_is = re.compile(r"(it|he|she|that|this|there|here)(\'s)", re.I)
pat_s = re.compile(r"(?<=[a-zA-Z])\'s")   # possessive 's
pat_s2 = re.compile(r"(?<=s)\'s?")        # plural possessive s'
pat_not = re.compile(r"(?<=[a-zA-Z])n\'t")
pat_would = re.compile(r"(?<=[a-zA-Z])\'d")
pat_will = re.compile(r"(?<=[a-zA-Z])\'ll")
pat_am = re.compile(r"(?<=[Ii])\'m")
pat_are = re.compile(r"(?<=[a-zA-Z])\'re")
pat_ve = re.compile(r"(?<=[a-zA-Z])\'ve")

def replace_abbreviations(text):
    new_text = pat_letter.sub(' ', text).strip().lower()
    new_text = pat_is.sub(r"\1 is", new_text)
    new_text = pat_s.sub("", new_text)
    new_text = pat_s2.sub("", new_text)
    new_text = pat_not.sub(" not", new_text)
    new_text = pat_would.sub(" would", new_text)
    new_text = pat_will.sub(" will", new_text)
    new_text = pat_am.sub(" am", new_text)
    new_text = pat_are.sub(" are", new_text)
    new_text = pat_ve.sub(" have", new_text)
    new_text = new_text.replace("'", " ")
    return new_text
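Assuming the pattern definitions above, a quick call shows the expansions (the sample sentence is invented):

sample = "It's known that they're bigger, but it doesn't work"
print(replace_abbreviations(sample))
# it is known that they are bigger but it does not work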
3. Removing punctuation, the many digits in the text, and single characters
def text_washing(text):
    # strip punctuation, digits, and stray apostrophes
    new_text = re.sub(r'[,\.()":;!?@#$%^&*\d]|\'s|\'', '', text)
    new_text = re.sub(r'\W|[0-9]', ' ', new_text)
    # drop short tokens; note this keeps only tokens longer than two characters
    tokens = new_text.split(" ")
    kept = []
    for tok in tokens:
        tok = tok.strip()
        if len(tok) > 2:
            kept.append(tok)
    return " ".join(kept)
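A short example of what the washing keeps and drops (invented input):

print(text_washing("A cat, 2 dogs & 100 birds!"))
# cat dogs birds  (punctuation, digits, and tokens of two characters or fewer are gone)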
4. Lemmatizing words and removing stopwords with a stopword dictionary
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lmtzr = WordNetLemmatizer()

def merge(text):
    words = text.split()
    new_words = []
    for word in words:
        if word:
            tag = nltk.pos_tag(word_tokenize(word))  # tag is like [('bigger', 'JJR')]
            pos = get_wordnet_pos(tag[0][1])
            if pos:
                new_words.append(lmtzr.lemmatize(word, pos))
            else:
                new_words.append(word)
    stopwords = [w.strip().lower() for w in open("stopwords.txt")]
    clean_tokens = [tok for tok in new_words if len(tok) > 1 and tok not in stopwords]
    return clean_tokens
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return ''
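A hedged usage sketch; it assumes the NLTK data packages (punkt, averaged_perceptron_tagger, wordnet) have been downloaded, and that stopwords.txt contains common function words:

# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')
print(merge("the bigger cats were running"))
# e.g. ['big', 'cat', 'run'] if "the" and "be" appear in stopwords.txt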
5. Building the word-frequency counting function
import collections

def append_ext(words_list):
    counter = collections.Counter(words_list)
    new_words = []
    for word, freq in counter.most_common():
        tag = nltk.pos_tag(word_tokenize(word))[0][1]  # e.g. 'JJR' for 'bigger'
        new_words.append((word, freq, tag))
    return new_words
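For example (the tags come from pos_tag on isolated words, so exact values may vary):

print(append_ext(['cat', 'run', 'cat']))
# e.g. [('cat', 2, 'NN'), ('run', 1, 'NN')]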
6. Writing the data to a file
import xlwt

def data_write(file_path, datas):
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)  # create the sheet
    # write each (word, count, tag) tuple to row i, column j
    for i, data in enumerate(datas):
        for j in range(len(data)):
            sheet1.write(i, j, data[j])
    f.save(file_path)  # save the workbook
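Putting the steps together, a minimal end-to-end sketch (paper.txt and result.xls are placeholder file names):

if __name__ == '__main__':
    raw = open_file('paper.txt')
    cleaned = text_washing(replace_abbreviations(raw))
    tokens = merge(cleaned)
    data_write('result.xls', append_ext(tokens))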
Original post:
https://blog.csdn.net/weixin_38224930/article/details/106010575
Acknowledgements