HanLP学习二：自定义命名实体识别语料库建设

1.利用成熟的语料库对工程的生语料进行分词+词性标注

# 第一步 生成分词+词性标注的模型

from  pyhanlp import *
import zipfile
import os

from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    Return the directory used for test data, creating it when it is missing.

    The directory is the ``test`` sub-folder of ``HANLP_DATA_PATH``.

    :return: path of the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs with exist_ok=True also creates missing parent directories and
    # does not fail if another process creates the folder between the check
    # and the creation, unlike an isdir() test followed by mkdir().
    os.makedirs(data_path, exist_ok=True)
    return data_path


def ensure_data(data_name, data_url):
    """
    Make sure a named data set is present under the test data directory.

    If ``<test data dir>/<data_name>`` already exists it is returned as is.
    Otherwise ``data_url`` is downloaded; a URL ending in ``.zip`` is saved
    next to the target as ``<data_name>.zip``, extracted into the test data
    directory and the archive is removed afterwards.

    :param data_name: name of the file or folder expected after download
    :param data_url: URL to fetch the data from
    :return: path ``<test data dir>/<data_name>``
    """
    base_dir = test_data_path()
    target = os.path.join(base_dir, data_name)

    # Already fetched on an earlier run: nothing to do.
    if os.path.exists(target):
        return target

    # Plain file: download straight to its final location.
    if not data_url.endswith('.zip'):
        download(data_url, target)
        return target

    # Archive: download beside the target, unpack, then discard the archive.
    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as bundle:
        bundle.extractall(base_dir)
    remove_file(archive_path)
    return target


# Module-level setup. NOTE: the first statement runs at import time and calls
# ensure_data, which downloads and unpacks the pku98 archive when it is not
# already present in the test data directory.
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
# Paths of corpus files inside the pku98 folder.
# NOTE(review): presumably the full corpus plus its train/test split — confirm
# against the archive contents.
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
# Path handed to POSTrainer.train() as the model file in train_perceptron_pos.
# NOTE(review): hard-coded, machine-specific Windows directory — adjust per host.
POS_MODEL = os.path.join('C:\\Users\\DELL\\Desktop\\NL_supplier', 'f_pos.bin') # model file path (original note: "get empty model")
# Handles to HanLP Java classes, looked up by fully-qualified class name.
POSTrainer = JClass('com.hankcs.hanlp.model.perceptron.POSTrainer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')


def train_perceptron_pos(corpus):
    """
    Train a perceptron POS tagger and return a lexical analyzer built on it.

    The tagger is trained on ``corpus`` with the model file given by
    ``POS_MODEL``, then loaded back from that same path. The same constant is
    used for both steps so the file that is trained is always the file that
    is loaded (previously the load path was a separate hard-coded string that
    only matched ``POS_MODEL`` on Windows).

    :param corpus: path of the annotated training corpus passed to
        ``POSTrainer.train``
    :return: an ``AbstractLexicalAnalyzer`` combining ``PerceptronSegmenter``
        with the trained tagger, with the custom dictionary disabled; call
        ``analyze(text)`` on it for segmentation plus POS tagging
    """
    # Train and persist the model file.
    POSTrainer().train(corpus, POS_MODEL)

    # Load the tagger from the file that was just written.
    tagger = PerceptronPOSTagger(POS_MODEL)

    # Build the segmentation + POS tagging pipeline.
    return AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger).enableCustomDictionary(False)

# 第二步 利用模型对工程生语料进行分词与词性标注，生成原始训练语料库
# 第三步 在原始训练语料库标识出命名实体
如：

1/m ;/w 安吉县/ns 高级中学/n 监控/vn 设备/n 改造/vn ;/w 湖州/ns 腾云计算机/n 科技/n 有限公司/n ;/w 728860.00/m

标注为
1/m ;/w 安吉县/ns 高级中学/n 监控/vn 设备/n 改造/vn ;/w [湖州/ns 腾云计算机/n 科技/n 有限公司/n]/supplier ;/w 728860.00/m

posted @ 2020-01-25 20:27 秋华 阅读(1080) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

秋华

hanlp学习二：自定义命名体识别语料库建设

公告