Sharing a decent English sentence-splitting snippet — optimized from a Stack Overflow version

Without further ado, here's the code:

import re

# Regex fragments for spotting periods that do NOT end a sentence.
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
digits = r"([0-9])"
pics = r"(FIG|FIGS|fig|figs|sub|TM|sup)"  # abbreviations common in patent/figure text

def split_into_sentences2(text):
    # text = open(item, 'r').read()
    # text = text.replace('\n', '')
    text = " " + text + "  "
    # Protect periods that are not sentence boundaries with a <prd> placeholder.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
    # text = re.sub(digits + "[.]", "\\1<prd>", text)
    text = re.sub(pics + "[.]", "\\1<prd>", text)
    # Move terminal punctuation outside closing quotes so <stop> lands after the quote.
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    if "e.g." in text: text = text.replace("e.g.", "e<prd>g<prd>")
    if "i.e." in text: text = text.replace("i.e.", "i<prd>e<prd>")
    if "Fig. " in text: text = text.replace("Fig. ", "Fig<prd> ")
    if "No. " in text: text = text.replace("No. ", "No<prd> ")
    if "NO. " in text: text = text.replace("NO. ", "No<prd> ")
    if "CO." in text: text = text.replace("CO.", "Co<prd>")
    # Mark the remaining sentence-ending punctuation with <stop>.
    text = text.replace(". ", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    # text = text.replace(";", ";<stop>")
    # text = text.replace(",", ",<stop>")
    # Restore the protected periods, then split on the <stop> markers.
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    # Strip whitespace and drop the empty trailing fragment left by the padding above.
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences
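For a quick sanity check, here is a minimal usage example; the sample text is made up for illustration and exercises the Fig., decimal, prefix, e.g., and acronym rules:

if __name__ == "__main__":
    sample = ("As shown in Fig. 3, the value is 3.5 m. "
              "Dr. Smith confirmed it, e.g. in U.S.A. tests. Was it expected? Yes!")
    for s in split_into_sentences2(sample):
        print(s)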

Some lines are commented out because of the needs of the literature text processed in my own work; you can adjust and fine-tune them for your own use case. This version works quite well so far, noticeably better than NLTK's sent_tokenize.
My impression is that sent_tokenize mostly just splits on ". ", and in my tests it performed poorly, so I modified the version I found while searching. Sharing and recording it here first.
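For reference, here is one way to compare the two splitters side by side. This sketch assumes nltk is installed and the punkt model has been downloaded; the actual NLTK output depends on the punkt model version:

import nltk
# nltk.download('punkt')  # one-time download of the punkt model
from nltk.tokenize import sent_tokenize

sample = "As shown in Fig. 3, the value is 3.5 m. Dr. Smith confirmed it."
print(sent_tokenize(sample))          # NLTK / punkt
print(split_into_sentences2(sample))  # this post's function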


Life is beautiful. And the meaning of life is to be recognized. Keep Going, All.
