不为别的,只为做一个连自己都羡慕的人

python中命名实体标注

import json
import re
import pandas as pd
import nltk
import save_csv

# NLTK named-entity example: extract person names, place names, organizations, etc. from text

def parse_document(document):
    """Split a raw document into a list of stripped sentences.

    Every newline is replaced by a space and the result is stripped before
    it is handed to ``nltk.sent_tokenize``; each returned sentence is
    stripped as well.

    Args:
        document: The text to split. Must be a ``str``.

    Returns:
        A list of sentence strings with surrounding whitespace removed.

    Raises:
        ValueError: If ``document`` is not a string.
    """
    # Validate before touching the value: re.sub() on a non-string raises
    # TypeError on its own, which would make this check unreachable.
    if not isinstance(document, str):
        raise ValueError('Document is not string!')
    document = re.sub('\n', ' ', document).strip()
    return [sentence.strip() for sentence in nltk.sent_tokenize(document)]


# sample document
# Read the JSON input inside a context manager so the file handle is closed
# as soon as the content is in memory. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale encoding.
with open(r'test.json', "r", encoding="utf-8") as source_file:
    text = source_file.read()

# Concatenate the "text" field of every record, each prefixed by one space.
text1 = "".join(" " + item["text"] for item in json.loads(text))
# print(text1)
# text = """
# FIFA was founded in 1904 to oversee international competition among the national associations of Belgium,
# Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its
# membership now comprises 211 national associations. Member countries must each also be members of one of
# the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America
# and the Caribbean, Oceania, and South America.
# """

# tokenize sentences
sentences = parse_document(text1)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
# Tag every sentence in one batch call. NLTK documents pos_tag_sents as the
# efficient way to tag many sentences, versus calling pos_tag per sentence.
tagged_sentences = nltk.pos_tag_sents(tokenized_sentences)
# run nltk's Named Entity Chunker on each tagged sentence
ne_chunked_sents = [nltk.ne_chunk(tagged) for tagged in tagged_sentences]



# extract all named entities
# (name, type) pairs are stored as dict keys so duplicates collapse as they
# are seen and first-seen order is kept. De-duplicating with list(set(...))
# after every append rescans the whole list each time and leaves the result
# in arbitrary set order.
unique_entities = {}
for ne_tagged_sentence in ne_chunked_sents:
    for tagged_tree in ne_tagged_sentence:
        # only chunks that expose a label are named entities; skip the rest
        if not hasattr(tagged_tree, 'label'):
            continue
        entity_name = ' '.join(c[0] for c in tagged_tree.leaves())  # get NE name
        entity_type = tagged_tree.label()  # get NE category
        unique_entities[(entity_name, entity_type)] = None
# unique named entities, in order of first appearance
named_entities = list(unique_entities)


# show the raw entity list before it is written out
print(named_entities)
# store named entities in a data frame
entity_columns = ['Entity Name', 'Entity Type']
entity_frame = pd.DataFrame(data=named_entities, columns=entity_columns)
# write the frame to a CSV file
entity_frame.to_csv(path_or_buf='data_df.csv', encoding='utf_8_sig')
# display results
print(entity_frame)
# save_csv.save_csv_data(entity_frame)

 

 

posted @ 2021-01-30 17:15  升级打怪  阅读(477)  评论(0)  编辑  收藏  举报