基于编辑距离和最长公共子串计算字符串相似度

基于编辑距离和最长公共子串实现的文本相似度计算:

计算公式为

s=lc/(led+lc)

lc为最长公共子串的长度
led为编辑距离
考虑到汉字和字符的不同,增加了Str2Word()进行字符串分词,实现单字的比较。代码写了一段时间了,有些遗忘。
ld()为编辑距离求解
lcsLen()为动态规划求解最长公共子序列(不要求连续)的长度
lcs()输出最长公共子串
Str2Word()进行字符串分词



# -*- coding: utf-8 -*-
from __future__ import division
import re

def ld(firstStr,secondStr):
    """Return the Levenshtein edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.

    Args:
        firstStr: A string or list of tokens.
        secondStr: A string or list of tokens.

    Returns:
        The edit distance as an int. If one input is empty the result is
        the length of the other one.
    """
    # The distance is symmetric, so keep the shorter sequence in the inner
    # dimension; only two rows of that length are held at any time.
    if len(firstStr) > len(secondStr):
        firstStr, secondStr = secondStr, firstStr

    # previous[i] is the distance between firstStr[:i] and the prefix of
    # secondStr consumed so far; against an empty prefix that is i.
    previous = list(range(len(firstStr) + 1))
    for j, right in enumerate(secondStr, 1):
        # Turning an empty prefix of firstStr into secondStr[:j] costs j.
        current = [j]
        for i, left in enumerate(firstStr, 1):
            substitution = previous[i - 1] + (0 if left == right else 1)
            current.append(min(previous[i] + 1,      # skip `right`
                               current[i - 1] + 1,   # skip `left`
                               substitution))
        previous = current
    return previous[-1]

def lcsLen(firstStr,secondStr):
    """Return the length of the longest common subsequence.

    Matching elements must appear in the same order in both inputs but do
    not have to be adjacent.

    Args:
        firstStr: A string or list of tokens.
        secondStr: A string or list of tokens.

    Returns:
        The subsequence length as an int; 0 when either input is empty.
    """
    # The result is symmetric, so keep the shorter sequence in the inner
    # dimension; only two rows of that length are held at any time.
    if len(firstStr) > len(secondStr):
        firstStr, secondStr = secondStr, firstStr

    # previous[i] is the answer for firstStr[:i] against the prefix of
    # secondStr consumed so far; nothing is shared with an empty prefix.
    previous = [0] * (len(firstStr) + 1)
    for right in secondStr:
        current = [0]
        for i, left in enumerate(firstStr, 1):
            extended = previous[i - 1] + (1 if left == right else 0)
            current.append(max(previous[i], current[i - 1], extended))
        previous = current
    return previous[-1]

def midleI(i,firstStr,secondStr):
    """Greedily match firstStr[i:] against a left-to-right scan of secondStr.

    secondStr is scanned once from its start. Each time the current element
    of firstStr (initially at index i) equals the scanned element, that
    element is appended to the result and the position in firstStr advances
    by one. The scan stops when either sequence is exhausted.

    Returns:
        The matched elements concatenated in order ('' if nothing matched).
    """
    matched = ''
    limit = len(firstStr)
    for item in secondStr:
        if i >= limit:
            break
        if firstStr[i] == item:
            matched += item
            i += 1
    return matched

def lcs(firstStr,secondStr):
    """Return the longest greedy match of firstStr against secondStr.

    For every start index in firstStr, midleI() is asked for the elements
    it can match in order within secondStr; the longest result is kept.
    On ties the earliest start index wins.

    Args:
        firstStr: A string or list of string tokens.
        secondStr: A string or list of string tokens.

    Returns:
        The concatenated matched elements, or '' when nothing matches or
        firstStr is empty.
    """
    best = ''
    for start in range(len(firstStr)):
        candidate = midleI(start, firstStr, secondStr)
        # Strict comparison keeps the first candidate among equal lengths.
        if len(candidate) > len(best):
            best = candidate
    return best

def similarity(firstStr,secondStr):
    """Return a similarity score in [0, 1] for two sentences.

    Both inputs are tokenised with Str2Word() and scored as

        s = lc / (led + lc)

    where ``lc`` is the length of the match returned by lcs() and ``led``
    is the edit distance returned by ld().

    Args:
        firstStr: First sentence, as accepted by Str2Word().
        secondStr: Second sentence, as accepted by Str2Word().

    Returns:
        A float. When one input has no tokens the score is 0.0; when
        neither has tokens the inputs are treated as identical and the
        score is 1.0 (the formula itself would be 0/0 there).
    """
    s1 = Str2Word(firstStr)
    s2 = Str2Word(secondStr)

    # Empty or punctuation-only input yields no tokens; answer directly
    # instead of running the formula on an empty token list.
    if not s1 or not s2:
        return 1.0 if not s1 and not s2 else 0.0

    # NOTE(review): lcs() returns the matched tokens joined into one string,
    # so lc counts characters while led counts tokens; the two differ for
    # multi-character tokens -- confirm this is intended.
    lc = len(lcs(s1, s2))
    led = ld(s1, s2)
    # float() keeps true division without relying on the module's
    # __future__ import.
    return lc / float(led + lc)

def preProcess(sen,edcode='utf-8'):
    """Return *sen* as text with the listed punctuation marks removed.

    Args:
        sen: The sentence, either encoded bytes or already-decoded text.
        edcode: Encoding used to decode *sen* when it is bytes.

    Returns:
        The decoded text with every character in the punctuation class
        deleted.
    """
    # Only decode real byte strings: already-decoded text has no usable
    # decode() (Python 3) or breaks on non-ASCII characters (Python 2).
    if isinstance(sen, bytes):
        sen = sen.decode(edcode)
    # The doubled backslashes produce the same pattern text as the former
    # "\?" and "\|" without relying on invalid string escapes.
    # NOTE(review): several ASCII marks are listed twice in the class; the
    # duplicates may originally have been full-width forms -- confirm.
    return re.sub(u"[。,、.,!……!《》<>\"'::?\\?、\\|“”‘’;]", "", sen)

def Str2Word(sen,edcode='utf-8'):
    """Split a sentence into comparison tokens.

    The sentence is first cleaned with preProcess(). Then:

    * each maximal run of ``0-9 A-Z a-z - + # @ _`` becomes one lower-cased
      token;
    * every other character except the ASCII space becomes a token of its
      own (this is how individual Chinese characters are compared);
    * ASCII spaces are dropped.

    Args:
        sen: The sentence, as accepted by preProcess().
        edcode: Encoding passed to preProcess() for decoding *sen*.

    Returns:
        A list of text tokens in their original order. Every token is text
        of the same type, so tokens can be compared and joined uniformly.
    """
    sen = preProcess(sen, edcode)
    result = []
    # Group 1: an alphanumeric/symbol run; group 2: any single non-space
    # character. Spaces match neither alternative and are skipped.
    for match in re.finditer(r"([0-9A-Za-z\-+#@_]+)|([^ ])", sen):
        run, single = match.groups()
        if run is not None:
            result.append(run.lower())
        else:
            result.append(single)
    return result

if __name__ == '__main__':
    # Demo: score two sample sentences and print the result.
    first='一个全球性问题尽管大部分移民并没有跨出国界 '
    second='气候移民是一个全球性问题移民并没有跨出国界但其影响却是跨国界的'
    s=similarity(first,second)
    # A single parenthesised argument is valid both as the Python 2 print
    # statement and as the Python 3 print function, with the same output.
    print('%s %s' % ('相似度为:',s))


posted @ 2013-07-28 23:32  kuduogedi  阅读(389)  评论(0)  编辑  收藏  举报