GENIA命名实体数据集解析代码

GENIA命名实体数据集解析

欢迎联系2448267954@qq.com指正交流。

代码

import xml.sax
# Path of the GENIA corpus XML file that is handed to the SAX parser below.
xml_ds="GENIA-term.xml"
# Intended output path. NOTE(review): nothing visible in this file reads this
# name; the handler opens "result.txt" on its own -- confirm before relying on it.
out_file="result.txt"

class SentenceHandler(xml.sax.ContentHandler):
    """SAX handler that flattens ``<cons>``-annotated sentences into token lines.

    Every whitespace-separated word found in the document is written to the
    output file as ``"<index> <word> <tag1|tag2|...>"``, where the tags are the
    ``sem`` attributes of all ``<cons>`` elements currently enclosing the word
    (outermost first).  A blank line is written at the start of every
    ``<sentence>`` element.  ``index`` is a running counter over the whole
    document.

    Args:
        out_path: File the token lines are written to (UTF-8).  Defaults to
            ``"result.txt"``.

    Attributes kept for callers: ``totag`` (active ``sem`` labels),
    ``indexer`` (next token index), ``writer`` (the open output file),
    ``CurrentData`` (name of the most recently opened element).
    """

    def __init__(self, out_path="result.txt"):
        super().__init__()
        self.text = ""
        self.labels = ""
        self.totag = []
        self.indexer = 0
        self.CurrentData = ""
        # One boolean per currently open <cons>: True when that element pushed
        # a label onto ``totag``.  Keeps the label stack balanced for <cons>
        # elements that carry no ``sem`` attribute.
        self._cons_pushed = []
        # Character chunks received since the last element boundary.
        self._pending = []
        self.writer = open(out_path, 'w', encoding='utf8')

    # Element start handling
    def startElement(self, tag, attributes):
        """Open an element: emit buffered text, then update the label stack."""
        # Text seen so far belongs to the context *before* this element opens.
        self._flush_text()
        self.CurrentData = tag
        if tag == "cons":
            has_sem = "sem" in attributes
            if has_sem:
                self.totag.append(attributes["sem"])
            self._cons_pushed.append(has_sem)
        if tag == "sentence":
            self.writer.write("\n")

    # Element end handling
    def endElement(self, tag):
        """Close an element: emit buffered text, then drop its label (if any)."""
        # Text seen so far is still inside the element that is closing.
        self._flush_text()
        # Decide on the tag being closed (not the last opened one) and remove
        # exactly the one label this <cons> contributed.
        if tag == "cons" and self._cons_pushed:
            if self._cons_pushed.pop() and self.totag:
                self.totag.pop()

    # Character data handling
    def characters(self, content):
        """Buffer a chunk of character data.

        The parser may deliver one run of text in several chunks, so words are
        only tokenised once the next element boundary is reached.
        """
        self._pending.append(content)

    def endDocument(self):
        """Write any remaining text and close the output file."""
        self.close()

    def close(self):
        """Flush buffered text and close the output file; safe to call twice."""
        if self.writer.closed:
            return
        self._flush_text()
        self.writer.close()

    def _flush_text(self):
        """Tokenise the buffered text and write one line per word."""
        if not self._pending:
            return
        text = "".join(self._pending)
        self._pending.clear()
        tags = "|".join(self.totag)
        # split() without arguments treats spaces, tabs and newlines alike and
        # never yields empty tokens.
        for word in text.split():
            self.writer.write(' '.join([str(self.indexer), word, tags]) + '\n')
            self.indexer += 1


if __name__ == "__main__":
    # Script entry point: stream the corpus file named by ``xml_ds`` through a
    # SentenceHandler using the default SAX parser.

    parser = xml.sax.make_parser()
    # Namespace processing off, so the handler's startElement/endElement
    # callbacks (rather than the *NS variants) are the ones invoked.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    handler = SentenceHandler()
    try:
        parser.setContentHandler(handler)
        parser.parse(xml_ds)
    finally:
        # Always release the output file, even when parsing fails part-way;
        # closing an already closed file is a no-op.
        handler.writer.close()

输出格式

index word tag1|tag2|tag3

每行对应一个词，三列以空格分隔：全局递增的序号、词本身、该词所在实体（cons 元素）的 sem 语义标签；嵌套实体的多个标签以“|”连接，不属于任何实体的词该列为空。每个句子之前输出一个空行。

posted @ 2020-01-10 15:43  Github-Haltz  Views(1161)  Comments(0)  Edit  收藏  举报