Python:合并段落方法

合并段落:
将以非法段落结束符结束的段落和下一段落进行合并,并去掉空白行和段落首尾的空白字符。

def isEndOfP(line):
    """Return True when *line* ends with a recognised paragraph terminator.

    A line ending with one of the explicitly excluded suffixes is never
    treated as a paragraph end, even if it also ends with a terminator
    (for example ``"www."`` ends with ``"."``).
    """
    # Exclusions are tested first so they override the terminator list.
    excluded_suffixes = ("www.", "文章")
    if line.endswith(excluded_suffixes):
        return False

    terminators = (
        "\"", ".", "”", "。", "!", "?", "!", "?", "……", "…", "》",
        ":", ":", ";", ";",
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
        "章", "部", "录", "著", "译", "言", "~", "---", "」",
    )
    # str.endswith accepts a tuple and returns a plain bool.
    return line.endswith(terminators)

def isStrD(line):
    """Return True when every paired delimiter in *line* is balanced.

    For delimiters with distinct opening and closing characters the number
    of openers must equal the number of closers.  For a delimiter whose
    opener and closer are the same character (the straight double quote)
    the count must be even; comparing that count with itself would always
    succeed and never detect an unclosed quote.

    ``None`` and the empty string are treated as balanced.
    """
    if not line:
        return True

    # NOTE(review): the "(" / ")" and "{" / "}" pairs appear twice; the
    # second occurrences may have been meant as full-width variants - confirm.
    delimiter_pairs = (
        ("\"", "\""),
        ("(", ")"),
        ("{", "}"),
        ("[", "]"),
        ("《", "》"),
        ("“", "”"),
        ("‘", "’"),
        ("(", ")"),
        ("{", "}"),
        ("【", "】"),
    )
    for opener, closer in delimiter_pairs:
        if opener == closer:
            # Self-closing delimiter: an odd count means one is left open.
            if line.count(opener) % 2 != 0:
                return False
        elif line.count(opener) != line.count(closer):
            return False
    return True

def countSubString(line, substr):
    """Count occurrences of *substr* in *line*, overlapping matches included.

    Returns 0 when *line* is ``None`` or empty.  Each search resumes one
    character after the start of the previous hit, so ``"aaa"`` contains
    ``"aa"`` twice.
    """
    if line is None or line == "":
        return 0

    total = 0
    start = 0
    length = len(line)
    while start < length:
        hit = line.find(substr, start)
        if hit < 0:
            break
        total += 1
        # Advance by a single character so overlapping matches are seen.
        start = hit + 1
    return total

def isP(line):
    """Return True when *line* is accepted as the end of a paragraph.

    The line must pass both the ``isEndOfP`` suffix check and the
    ``isStrD`` delimiter check; the delimiter check is skipped when the
    suffix check already fails.
    """
    if not isEndOfP(line):
        return False
    return isStrD(line)

def formated(content):
    """Merge the lines of *content* into paragraphs and return the result.

    The text is split on ``"\\n"`` and every line is stripped of surrounding
    whitespace.  A newline is emitted only after a stripped line that
    ``isP`` accepts; any other line (including a blank one) is joined
    directly to whatever follows it.
    """
    pieces = []
    for raw_line in content.split("\n"):
        text = raw_line.strip()
        if isP(text):
            pieces.append(text + "\n")
        else:
            pieces.append(text)
    return "".join(pieces)
posted @ 2018-12-26 09:27  xuejianbest  阅读(1173)  评论(0)  编辑  收藏  举报