Python: 最长公共子串

  

# Sample inputs: s1/s2 share long substrings, s3/s4 share no character.
s1, s2 = 'abcdefg', 'efgabcd'
s3, s4 = 'zdxc', 'mn'
def findcom(str1, str2):
    """Return all longest common substrings of two strings (brute force).

    Every substring of the shorter string is tested against the longer one,
    starting with the longest candidates, so the first length that yields a
    match is the maximum common length.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A list of the distinct common substrings of maximal length, in the
        order they first appear in the shorter string, or the string
        'no common substr' when the inputs share no character.

    Side effects:
        Prints a trace line (candidates examined so far and the current
        candidate length) for every matching candidate.
    """
    # Slice candidates out of the shorter string: fewer substrings to test.
    if len(str1) > len(str2):
        str1, str2 = str2, str1

    count = 0  # total number of candidates examined so far

    for sublen in range(len(str1), 0, -1):
        commonstr = []
        for b in range(len(str1) - sublen + 1):  # b is the candidate's start index
            count += 1
            substr = str1[b:b + sublen]

            if substr in str2:
                print('count={} sublen={}'.format(count, sublen))
                # The same text can occur at several offsets of str1;
                # report each distinct substring only once.
                if substr not in commonstr:
                    commonstr.append(substr)

        # Any hit at this length is a longest match; shorter ones are irrelevant.
        if commonstr:
            return commonstr
    return 'no common substr'

 

# Sample inputs for the matrix-based findcom demo.
s1, s2 = 'abcdefgaa', 'defgabc'

def findcom(str1,str2):
    xmax=0 # 记录最大的值,即最大的字串长度
    xindex=0 # 记录最大值的索引位置

    matrix=[]

    for y,yitem in enumerate(str2):
        matrix.append([]) # 每次对str2迭代,生成新的子列表保存比对结果
        for x,xitem in enumerate(str1):
            if xitem != yitem:
                matrix[y].append(0) # 矩阵比较中,元素不同,置矩阵元素为0
            else:
                if x==0 or y==0: # 对处于矩阵第一行,第一列的直接置1,防止索引超界
                    matrix[y].append(1)
                else:
                    matrix[y].append(matrix[y-1][x-1]+1) # 左上角的值+1

                if matrix[y][x] > xmax: # 此轮中最大的字串长度超过记录值
                    xmax=matrix[y][x]
                    xindex=x # 最大值的索引位置

    return str1[xindex+1-xmax:xindex+1] # xindex+1因为后开特性,xindex+1后需往前回溯3个位置

# Demo: print what findcom returns for the sample inputs s1 and s2.
print(findcom(s1,s2))

 

# Sample inputs for the demo call further down.
s1, s2 = 'defabcd', 'abcdefghi'


# short -> long
def find_longest_substring(s1, s2):
    """Return a longest common substring of s1 and s2, growing from short to long.

    Candidate lengths are tried from 1 upwards. For each length the first
    substring of the shorter string that also occurs in the longer string
    is remembered; as soon as some length has no match at all, the match
    remembered from the previous length is the answer.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A longest common substring, or '' when the strings share no
        character (including when either string is empty).
    """
    # Take candidates from the shorter string.
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    len1 = len(s1)

    substr = ''
    for length in range(1, len1 + 1):  # candidate substring length
        for offset in range(len1 - length + 1):  # candidate start index
            candidate = s1[offset:offset + length]
            if candidate in s2:
                substr = candidate
                break
        else:
            # No candidate of this length matched, so the previous round's
            # match is the longest one.
            return substr
    # Every length up to len1 matched (the whole shorter string occurs in
    # the longer one), or the shorter string is empty.
    return substr


# Demo: print what find_longest_substring returns for the sample inputs.
print(find_longest_substring(s1, s2))

 

# Sample inputs for the demo call further down.
s1, s2 = 'defabcd', 'abcdefghi'


# long -> short
def find_longest_substring(t1: str, t2: str) -> str:
    """Return a longest common substring of t1 and t2, trying long candidates first.

    Substrings of the shorter string are tested against the longer string
    from the longest length down to 1, so the first match found is a
    longest common substring.

    Args:
        t1: First string.
        t2: Second string.

    Returns:
        A longest common substring (the leftmost one in the shorter string
        on ties), or '' when the strings share no character.
    """
    # Take candidates from the shorter string.
    if len(t1) > len(t2):
        t1, t2 = t2, t1
    len1 = len(t1)

    for length in range(len1, 0, -1):  # candidate substring length
        for offset in range(len1 - length + 1):  # candidate start index
            candidate = t1[offset:offset + length]
            # Use a membership test: str.find returns -1 (truthy) on a miss
            # and 0 (falsy) for a hit at the start, so its result must not
            # be used directly as a condition.
            if candidate in t2:
                return candidate
    return ''


# Demo: print what find_longest_substring returns for the sample inputs.
print(find_longest_substring(s1, s2))

 

# Sample inputs for the demo calls further down.
s1, s2 = 'defabcd', 'abcdefghi'


# Matrix
def find_longest_substring(t1: str, t2: str) -> str:
    """Return the longest common substring of t1 and t2 using a run-length table.

    The shorter string indexes the rows and the longer string the columns.
    A cell holds the length of the common run ending at that pair of
    characters, or 0 when the characters differ. The largest cell gives the
    answer's length and its column gives where the answer ends in the
    longer string.

    Returns '' when the strings share no character. On ties, the run that
    reaches the maximum length first in row-major order wins.
    """
    # Rows follow the shorter string, columns the longer one.
    if len(t1) > len(t2):
        shorter, longer = t2, t1
    else:
        shorter, longer = t1, t2

    best_len = 0   # length of the longest run seen so far
    best_end = -1  # column (index in `longer`) where that run ends
    table = []

    for r, ch_short in enumerate(shorter):
        line = [0] * len(longer)  # mismatching cells stay 0
        table.append(line)
        for c, ch_long in enumerate(longer):
            if ch_short != ch_long:
                continue
            # A run can only extend an upper-left neighbour that exists.
            run = 1 if r == 0 or c == 0 else table[r - 1][c - 1] + 1
            line[c] = run
            if run > best_len:
                best_len, best_end = run, c

    if best_len == 0:  # no character in common
        return ''
    return longer[best_end - best_len + 1:best_end + 1]


# Demo: print the longest common substring of the sample inputs.
print(find_longest_substring(s1, s2))
# 'B' does not occur in s2, so the result is empty; printing its repr
# makes the empty string visible as ''.
print(find_longest_substring('B', s2).__repr__())

 

posted @ 2020-09-07 20:38  ascertain  阅读(1033)  评论(0编辑  收藏  举报