Python:合并段落方法
合并段落:
将以非法段落结束符结束的段落和下一段落进行合并,并去掉空白行和段落首尾的空白字符。
def isEndOfP(line):
    """Report whether *line* ends with an accepted paragraph terminator.

    The deny list is consulted first: a line ending in "www." or "文章" is
    never a paragraph end, even though "www." also ends with ".". Otherwise
    the line is a paragraph end exactly when it finishes with one of the
    accepted suffixes (closing punctuation, an ASCII digit, or one of a few
    specific trailing characters such as "章" or "录").

    Args:
        line: Text to inspect. No stripping is performed here.

    Returns:
        bool: ``True`` if the line ends with an accepted terminator and not
        with a denied suffix, otherwise ``False``.
    """
    denied_suffixes = ("www.", "文章")
    accepted_suffixes = (
        "\"", ".", "”", "。", "!", "?", "!", "?", "……", "…", "》", ":", ":", ";", ";", "1", "2", "3",
        "4", "5", "6", "7", "8", "9", "0", "章", "部", "录", "著", "译", "言", "~", "---", "」",
    )
    # str.endswith accepts a tuple and answers True if any member matches.
    if line.endswith(denied_suffixes):
        return False
    return line.endswith(accepted_suffixes)
def isStrD(line):
    """Report whether the paired delimiters in *line* are balanced by count.

    For every opener/closer pair, the number of openers in the line must
    equal the number of closers. Only the counts are compared; ordering and
    nesting are not checked, so ")(" counts as balanced.

    The straight double quote is paired with itself, so that pair always
    compares equal and can never make the result ``False``.

    Args:
        line: Text to inspect. ``None`` or an empty string is treated as
            balanced.

    Returns:
        bool: ``True`` if every pair has matching counts, otherwise
        ``False``.
    """
    # Nothing to count: every pair trivially matches at zero occurrences.
    if not line:
        return True
    openers = ("\"", "(", "{", "[", "《", "“", "‘", "(", "{", "【")
    closers = ("\"", ")", "}", "]", "》", "”", "’", ")", "}", "】")
    # Every delimiter is a single character, so str.count gives the exact
    # number of occurrences without a hand-written scanning loop.
    return all(
        line.count(opener) == line.count(closer)
        for opener, closer in zip(openers, closers)
    )
def countSubString(line, substr):
    """Count occurrences of *substr* in *line*, overlapping ones included.

    Each search resumes one character after the start of the previous
    match, so overlapping matches are counted (for example "aa" occurs
    twice in "aaa"). Searching stops once the resume position reaches the
    end of the line.

    Args:
        line: Text to search. ``None`` or an empty string yields 0.
        substr: Text to look for.

    Returns:
        int: The number of matches found.
    """
    total = 0
    if line is not None and line != "":
        cursor = 0
        limit = len(line)
        while cursor < limit:
            hit = line.find(substr, cursor)
            if hit == -1:
                # No further match from the current position onward.
                break
            total += 1
            # Resume just past the start of this match, not past its end.
            cursor = hit + 1
    return total
def isP(line):
    """Report whether *line* qualifies as a complete paragraph.

    A line qualifies when ``isEndOfP`` accepts its ending and ``isStrD``
    accepts its delimiter counts. ``isStrD`` is only consulted when the
    ending check has already passed.

    Args:
        line: Text to evaluate.

    Returns:
        bool: ``True`` if both checks pass, otherwise ``False``.
    """
    if not isEndOfP(line):
        return False
    return isStrD(line)
def formated(content):
    """Merge lines of *content* that do not end a paragraph with what follows.

    The text is split on "\n" and each line is stripped of surrounding
    whitespace. A stripped line that ``isP`` accepts is emitted followed by
    a newline; any other line is emitted with no separator, so it runs
    directly into the next line's text.

    NOTE(review): ``isP`` is applied to each input line on its own, not to
    the paragraph accumulated so far. A line that carries only the closing
    half of a delimiter pair therefore fails the check and is also joined
    to the line after it. Confirm this is intended.

    Args:
        content: The full text to reflow.

    Returns:
        str: The reflowed text.
    """
    pieces = []
    for raw_line in content.split("\n"):
        text = raw_line.strip()
        pieces.append(text + "\n" if isP(text) else text)
    return "".join(pieces)