字符串匹配算法

本文用Python代码实现了CLRS中提到的字符串匹配算法：

Python作为一种动态语言，用来实现以伪代码描述的算法十分方便。

分别是：

1、朴素（naive）字符串匹配：

　　暴力搜索匹配，最直观，但是运行效率也最低。

2、Rabin-Karp字符串匹配：

　　将模式字符串表示成整数并取模（hash），再与文本中每个等长子串的hash值比较；hash值相等时还需逐字符确认，以排除伪命中（spurious hit）。

3、有限自动机字符串匹配：

　　根据模式字符串建立一个状态转换函数，以文本字符串作为输入逐字符驱动自动机；每当进入接受状态（accepting state），就表示找到了一次匹配。

4、KMP字符串匹配：

　　融合了有限自动机算法的思想，以模式字符串为输入建立一个前缀函数，在单次扫描文本字符串的过程中查询该前缀函数；运行效率最好，代码简单，构思也最为巧妙。

# string matching algorithm.
import math

def naive_string_matcher(t, p):
    """Print every shift at which pattern p occurs in text t (brute force).

    Each candidate shift s in [0, n - m] is tested by comparing p against
    the window t[s:s + m].  Nothing is printed when p is longer than t;
    an empty pattern occurs at every shift from 0 to len(t).

    t -- the text to search in.
    p -- the pattern to search for.
    Returns None; results are written to standard output.
    """
    n = len(t)
    m = len(p)
    # Valid shifts are 0..n-m inclusive, hence the "+ 1": without it a
    # match that ends exactly at the end of the text is never reported.
    for s in range(n - m + 1):
        if p == t[s:s + m]:
            # Single pre-formatted argument: prints the same line under
            # both the Python 2 print statement and the Python 3 function.
            print('Pattern occurs with shift %d' % s)


# text, pattern, radix, prime
def rabin_karp_matcher(T, P, d, q):
    """Print every shift at which pattern P occurs in text T (Rabin-Karp).

    The pattern and each length-m window of the text are hashed as base-d
    numbers modulo q.  A window is compared character by character only
    when its hash equals the pattern's hash; a hash match that turns out
    not to be a real match is reported as a spurious hit.

    T -- the text (a string).
    P -- the pattern (a string).
    d -- integer radix used to interpret characters as digits.
    q -- integer modulus for the hash (normally a prime).
    Returns None; results are written to standard output.
    """
    n = len(T)
    m = len(P)
    if m > n:
        # No window of length m exists; hashing T[0:m] would index past
        # the end of the text.
        return
    if m == 0:
        # The empty pattern occurs at every shift; the rolling hash below
        # is only meaningful for a non-empty window.
        for s in range(n + 1):
            print('Pattern occurs with shift %d' % s)
        return

    # Weight of the window's leading digit.  Integer modular
    # exponentiation stays exact for any pattern length, whereas
    # math.pow works in floating point and overflows / loses precision.
    h = pow(d, m - 1, q)

    # Preprocessing: hash of the pattern and of the first text window.
    p = 0
    t = 0
    for i in range(m):
        p = (d * p + ord(P[i])) % q
        t = (d * t + ord(T[i])) % q

    for s in range(n - m + 1):  # shifts 0..n-m
        if p == t:
            if P == T[s:s + m]:
                print('Pattern occurs with shift %d' % s)
            else:
                # Equal hashes but different text: a spurious hit.
                print('spurious hit @ %d' % s)
        if s < n - m:
            # Roll the hash: drop T[s], append T[s + m].  Python's %
            # yields a non-negative result even if the difference is
            # negative.
            t = (d * (t - ord(T[s]) * h) + ord(T[s + m])) % q


# automaton machine method
def compute_transition_function(P, S):
    """Build the string-matching automaton's transition table for P.

    P -- the pattern.
    S -- the input alphabet (sigma), an iterable of characters.

    Returns a dict mapping (state, character) to the next state, where a
    state is the number of pattern characters matched so far (0..len(P)).
    The next state is the length of the longest prefix of P that is a
    suffix of P[:state] + character.
    """
    m = len(P)
    delta = {}
    for state in range(m + 1):  # states 0..m
        matched = P[:state]
        for symbol in S:
            seen = matched + symbol
            # Try the longest candidate prefix first and shrink it; the
            # empty prefix (k == 0) is always a suffix, so this always
            # finds a value.
            for k in range(min(m, state + 1), -1, -1):
                if seen.endswith(P[:k]):
                    delta[state, symbol] = k
                    break
    return delta


# Text, Delta, m of P
def finite_automaton_matcher(T, D, m):
    """Run a string-matching automaton over text T and print match shifts.

    T -- the text to scan.
    D -- transition table mapping (state, character) to the next state,
         in the form built by compute_transition_function.
    m -- the pattern length, which is also the accepting state.

    Returns None; results are written to standard output.
    Raises KeyError if T contains a character for which D has no entry.
    """
    q = 0
    if q == m:
        # Empty pattern: the start state is already accepting, so the
        # pattern also occurs at shift 0, before any character is read.
        print('Pattern occurs with shift %d' % 0)
    for i, ch in enumerate(T):
        q = D[q, ch]
        if q == m:
            # The match ends at index i, so it starts m - 1 positions back.
            print('Pattern occurs with shift %d' % (i - m + 1))


def fa_matcher(T, P):
    """Print every shift at which pattern P occurs in text T (automaton).

    Builds the transition table for P and then scans T with it.

    T -- the text (a string).
    P -- the pattern (a string).
    Returns None; results are written to standard output.
    """
    # The automaton is only ever queried with characters that occur in T,
    # so the text's own character set is a sufficient alphabet.  A fixed
    # hard-coded alphabet would raise KeyError on any other character
    # (uppercase letters, punctuation, ...).
    S = set(T)  # sigma
    D = compute_transition_function(P, S)
    m = len(P)
    finite_automaton_matcher(T, D, m)


# Knuth-Morris-Pratt string match algorithm.
def compute_prefix_function(P):
    """Compute the Knuth-Morris-Pratt prefix function of pattern P.

    Returns a list pi of length len(P) + 1 using 1-based positions:
    for q in 1..len(P), pi[q] is the length of the longest proper prefix
    of P[:q] that is also a suffix of P[:q].  pi[0] is unused padding
    and is 0.  For an empty pattern the result is [0].
    """
    m = len(P)
    # Zero-filled, so pi[1] == 0 needs no explicit assignment (which
    # would index out of range when the pattern is empty).
    pi = [0] * (m + 1)
    k = 0  # length of the currently matched prefix
    for q in range(2, m + 1):  # 1-based position q is P[q - 1]
        while k > 0 and P[k] != P[q - 1]:  # next char does not match
            k = pi[k]  # fall back to the next shorter border
        if P[k] == P[q - 1]:  # next char matches
            k += 1
        pi[q] = k
    return pi


def kmp_matcher(T, P):
    """Print every shift at which pattern P occurs in text T (KMP).

    Scans T once; on a mismatch the prefix function of P says how many
    already-matched characters can be kept, so no text character is
    re-read.

    T -- the text.
    P -- the pattern.
    Returns None; results are written to standard output.
    """
    n, m = len(T), len(P)
    if m == 0:
        # The empty pattern occurs at every shift; the scan below needs
        # at least one pattern character to compare against.
        for s in range(n + 1):
            print('Pattern occurs with shift %d' % s)
        return
    # pi uses 1-based positions: pi[q] is the fallback for q matched chars.
    pi = compute_prefix_function(P)
    q = 0  # number of pattern characters matched so far
    for i in range(n):
        while q > 0 and P[q] != T[i]:  # next char does not match
            q = pi[q]
        if P[q] == T[i]:  # next char matches
            q += 1
        if q == m:
            # The match ends at index i, so it starts at i - m + 1.
            print('Pattern occurs with shift %d' % (i - m + 1))
            q = pi[q]  # look for the next (possibly overlapping) match

if __name__ == "__main__":
    # Demo: run all four matchers on the same text and pattern so their
    # reported shifts can be compared.  Each print takes one string
    # argument, so the output is the same on Python 2 and Python 3.
    t = 'one twooxx three ooxx foobar'*10
    p = 'x three o'
    print('The naive matcher:')
    naive_string_matcher(t, p)

    print('The Rabin-Karp matcher:')
    rabin_karp_matcher(t, p, 256, 57)  # radix 256, modulus 57

    print('FA matcher:')
    fa_matcher(t, p)

    print('KMP matcher:')
    kmp_matcher(t, p)

posted @ 2013-10-21 18:09 wilem 阅读(139) 评论(0) 编辑收藏举报

刷新页面返回顶部

字符串匹配算法

公告