Counting word frequencies in articles with Python
The script below walks every Word document in a folder, keeps only the paragraphs that langid classifies as English, runs NLTK tokenization, POS tagging, and named-entity chunking over the combined text, counts the proper nouns (NNP/NNPS), and writes the tallies to a CSV file:

import os
import re

import langid
import pandas as pd
from docx import Document
from nltk import Tree, ne_chunk, pos_tag, word_tokenize


def readWord():
    """Read every .docx file in the folder and collect its English paragraphs."""
    text = ""
    rootdir = 'C:\\Users\\Administrator\\Desktop\\一季度'
    for name in os.listdir(rootdir):  # list every entry in the folder
        path = os.path.join(rootdir, name)
        print(path)
        document = Document(path)
        # Keep only the paragraphs that langid identifies as English
        for paragraph in document.paragraphs:
            if langid.classify(paragraph.text)[0] == 'en':
                text += paragraph.text + "\n"
    return text


def get_entities():
    counts = {}
    # Tokenize the text, tag parts of speech, then chunk named entities
    sentence = readWord()
    tagged_sentence = ne_chunk(pos_tag(word_tokenize(sentence)))
    # Noise tokens to discard: stray punctuation and fragments left over from
    # the documents. (The original pattern was not a raw string, so every \b
    # was parsed as a backspace character and never matched; the condition
    # also counted a token only when it DID match the noise pattern, which
    # inverted the intended exclusion.)
    noise = re.compile(r"[’”“‘—–\[\]…/@|•]|^(?:s|P|II|R|A)$")
    for tagged in tagged_sentence:
        if isinstance(tagged, Tree):
            # A named-entity subtree: its leaves are (word, tag) pairs.
            # As in the original, only the first leaf of a multi-word
            # entity is counted.
            word, tag = tagged[0]
        else:
            # A plain (word, tag) pair outside any entity chunk
            word, tag = tagged
        # Count only proper nouns that are not noise tokens
        if tag in ("NNP", "NNPS") and not noise.search(word):
            counts[word] = counts.get(word, 0) + 1

    # Convert the dict into a list of records so pandas can turn it into a
    # DataFrame and write it out. A DataFrame is a tabular storage structure,
    # roughly a collection of Series, with both a row index and a column index.
    rows = [{"word": word, "num": num} for word, num in counts.items()]
    p = pd.DataFrame(rows)
    # utf_8_sig writes a BOM so Excel opens the CSV with the correct encoding
    p.to_csv('c4i.csv', encoding='utf_8_sig')


if __name__ == '__main__':
    get_entities()
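Before the first run, note that word_tokenize, pos_tag, and ne_chunk each rely on NLTK data packages that are not bundled with the library itself. A one-time download sketch (these are the standard NLTK 3.x resource identifiers; very recent NLTK releases may additionally ask for variants such as punkt_tab):

import nltk

# Each call is a no-op if the resource is already present
nltk.download('punkt')                       # tokenizer models for word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger behind pos_tag
nltk.download('maxent_ne_chunker')           # NE chunker behind ne_chunk
nltk.download('words')                       # word list the chunker depends on

The isinstance(tagged, Tree) branch exists because ne_chunk wraps recognized entities in Tree nodes while leaving all other tokens as plain (word, tag) tuples. A minimal check with a hypothetical sample sentence (exact output depends on the installed models):

from nltk import ne_chunk, pos_tag, word_tokenize

for node in ne_chunk(pos_tag(word_tokenize("John works in Paris"))):
    print(type(node).__name__, node)
# Typical shape: Tree nodes like (PERSON John/NNP) for entities,
# plain tuples like ('works', 'VBZ') for everything else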
The dependencies used are as follows:
python-docx==0.8.11
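The script also imports nltk, langid, and pandas; the original does not pin their versions, so recent releases of each should work:

nltk
langid
pandas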