正则表达式1.0 - Jackiesteed

公告

正则表达式1.0

Regular Expression即“描述某种规则的表达式”之意。

正则表达式匹配目前大致有两种实现方法:

1, 回溯的方法, 广泛用于Perl, Python, Java等语言里面, 效率一般; 对于某些特殊的case, 由于使用递归回溯, 可能会爆栈, 或者匹配时间随输入长度呈指数级增长.

2,把正则表达式转换为NFA, 利用NFA或者DFA的方法处理匹配, awk和grep是这样实现的, 效率比较高.

最基本的正则表达式定义如下:

这里用c/c++实现的正则表达式, 只支持 +,*.?,| 这几种符号, 后面我会实现一下其他的特性, 比如[a-z], 零宽断言,反向引用等.

回溯的方法, 这个是来自《Beautiful Code》(代码之美) 里面的实现, 代码很短:

/* Declared here because match() uses it before its definition below. */
int matchhere(char* regexp, char* text);

/*
 * match: search for regexp anywhere in text.
 *
 * A leading '^' anchors the pattern to the start of text; otherwise
 * the pattern is tried at every position, including the empty suffix
 * at the terminating '\0'.
 *
 * Returns 1 if a match is found, 0 otherwise.
 */
int match(char* regexp, char* text)
{
    /* Skip the '^' itself: it is an anchor, not a character to match. */
    if(regexp[0] == '^') return matchhere(regexp + 1, text);
    do
    {
        /* Must try even when text is empty, so use do/while. */
        if(matchhere(regexp, text))
            return 1;
    }while(*text++ != '\0');

    return 0;
}

/* Declared here because matchhere() uses it before its definition below. */
int matchstar(int c, char* regexp, char* text);

/*
 * matchhere: test whether regexp matches at the beginning of text.
 *
 * Handles, in order: end of pattern (success), "x*" (delegated to
 * matchstar), a final '$' (matches only at end of text), and a single
 * literal character or '.' (any character), after which it recurses on
 * the rest of the pattern and the rest of the text.
 *
 * Returns 1 on match, 0 otherwise.
 */
int matchhere(char* regexp, char* text)
{
    if(regexp[0] == '\0') return 1;
    if(regexp[1] == '*') return matchstar(regexp[0], regexp + 2, text);
    if(regexp[0] == '$' && regexp[1] == '\0') return text[0] == '\0';
    if(text[0] != '\0' && (regexp[0] == '.' || regexp[0] == text[0]))
        return matchhere(regexp + 1, text + 1);

    return 0;
}

/*
 * matchstar: match c* followed by regexp at the beginning of text.
 *
 * Tries the remainder of the pattern after consuming zero, one, two, ...
 * occurrences of c (any character when c is '.'), stopping at the first
 * success. Returns 1 on match, 0 otherwise.
 */
int matchstar(int c, char* regexp, char* text)
{
    for(;;)
    {
        if(matchhere(regexp, text))
            return 1;
        if(*text == '\0')
            break;
        /* text advances even when the character does not match c. */
        if(*text++ != c && c != '.')
            break;
    }

    return 0;
}

/*
 * matchplus: match c+ followed by regexp at the beginning of text.
 *
 * Like matchstar, but at least one occurrence of c (any character when
 * c is '.') must be consumed before the rest of the pattern is tried.
 * Returns 1 on match, 0 otherwise.
 *
 * NOTE(review): nothing in the visible source calls this function.
 */
int matchplus(int c, char* regexp, char* text)
{
    for(;;)
    {
        if(*text == '\0')
            return 0;
        /* text advances even when the character does not match c. */
        if(*text++ != c && c != '.')
            return 0;
        if(matchhere(regexp, text))
            return 1;
    }
}

下面这个是使用NFA实现的正则表达式匹配, 代码来自Russ Cox, 这个人是golang的作者之一.

/*
 * Regular expression implementation.
 * Supports only ( | ) * + ?.  No escapes.
 * Compiles to NFA and then simulates NFA
 * using Thompson's algorithm.
 *
 * See also http://swtch.com/~rsc/regexp/ and
 * Thompson, Ken.  Regular Expression Search Algorithm,
 * Communications of the ACM 11(6) (June 1968), pp. 419-422.
 * 
 * Copyright (c) 2007 Russ Cox.
 * Can be distributed under the MIT license, see bottom of file.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Convert infix regexp re to postfix notation, inserting '.' as the
 * explicit concatenation operator.
 *
 * Returns a pointer to a static buffer (overwritten by the next call),
 * or NULL when re is too long, nests parentheses too deeply, or is
 * malformed: unbalanced parentheses, an operator with no operand, an
 * empty alternative or empty expression, or a literal '.' (which would
 * be indistinguishable from the concatenation operator in the output).
 */
char*
re2post(char *re)
{
    enum { MaxParen = 100 };
    int nalt, natom;
    static char buf[8000];
    char *dst;
    struct {
        int nalt;
        int natom;
    } paren[MaxParen], *p;

    p = paren;
    dst = buf;
    nalt = 0;
    natom = 0;
    /* Each input character emits at most two output characters. */
    if(strlen(re) >= sizeof buf/2)
        return NULL;
    for(; *re; re++){
        switch(*re){
        case '(':
            if(natom > 1){
                --natom;
                *dst++ = '.';
            }
            if(p >= paren+MaxParen)
                return NULL;
            /* Save the enclosing counters and start a fresh group. */
            p->nalt = nalt;
            p->natom = natom;
            p++;
            nalt = 0;
            natom = 0;
            break;
        case '|':
            if(natom == 0)
                return NULL;
            while(--natom > 0)
                *dst++ = '.';
            nalt++;
            break;
        case ')':
            if(p == paren)
                return NULL;
            if(natom == 0)
                return NULL;
            while(--natom > 0)
                *dst++ = '.';
            for(; nalt > 0; nalt--)
                *dst++ = '|';
            --p;
            nalt = p->nalt;
            natom = p->natom;
            /* The whole group counts as one atom of the enclosing level. */
            natom++;
            break;
        case '*':
        case '+':
        case '?':
            if(natom == 0)
                return NULL;
            *dst++ = *re;
            break;
        case '.':
            /* '.' is reserved as the postfix concatenation operator. */
            return NULL;
        default:
            if(natom > 1){
                --natom;
                *dst++ = '.';
            }
            *dst++ = *re;
            natom++;
            break;
        }
    }
    if(p != paren)
        return NULL;
    /*
     * An empty expression or a trailing empty alternative ("a|") would
     * produce postfix with too few operands for its operators.
     */
    if(natom == 0)
        return NULL;
    while(--natom > 0)
        *dst++ = '.';
    for(; nalt > 0; nalt--)
        *dst++ = '|';
    *dst = 0;
    return buf;
}

/*
 * Represents an NFA state plus zero or one or two arrows exiting.
 * if c == Match, no arrows out; matching state.
 * If c == Split, unlabeled arrows to out and out1 (if != NULL).
 * If c < 256, labeled arrow with character c to out.
 */
enum
{
    Match = 256,
    Split = 257
};
typedef struct State State;
struct State
{
    int c;          /* character code (< 256), or Match, or Split */
    State *out;     /* first outgoing arrow */
    State *out1;    /* second outgoing arrow; followed only for Split states */
    int lastlist;   /* value of listid when addstate last visited this state */
};
State matchstate = { Match };    /* matching state */
int nstate;         /* number of states created by state(); main sizes the state lists with it */

/*
 * Allocate and initialize a State with label c and outgoing arrows
 * out and out1, and count it in nstate.
 *
 * Never returns NULL: if the allocation fails, an error is printed
 * and the program exits, because callers dereference the result
 * immediately.
 */
State*
state(int c, State *out, State *out1)
{
    State *s;

    s = malloc(sizeof *s);
    if(s == NULL){
        fprintf(stderr, "out of memory\n");
        exit(1);
    }
    nstate++;
    s->lastlist = 0;
    s->c = c;
    s->out = out;
    s->out1 = out1;
    return s;
}

/*
 * A partially built NFA without the matching state filled in.
 * Frag.start points at the start state.
 * Frag.out is a list of places that need to be set to the
 * next state for this fragment.
 */
typedef struct Frag Frag;
typedef union Ptrlist Ptrlist;
struct Frag
{
    State *start;   /* entry state of the fragment */
    Ptrlist *out;   /* list of dangling out pointers, filled in later by patch() */
};

/* Build a Frag value from its start state and list of dangling arrows. */
Frag
frag(State *start, Ptrlist *out)
{
    Frag f;

    f.start = start;
    f.out = out;
    return f;
}

/*
 * Since the out pointers in the list are always 
 * uninitialized, we use the pointers themselves
 * as storage for the Ptrlists.
 */
union Ptrlist
{
    Ptrlist *next;  /* link to the next dangling pointer while still unpatched */
    State *s;       /* the real target state, written by patch() */
};

/*
 * Make a one-element Ptrlist out of the pointer slot outp.
 * The slot itself is reused as the list node, so its next link is
 * cleared to terminate the list.
 */
Ptrlist*
list1(State **outp)
{
    Ptrlist *head = (Ptrlist*)outp;

    head->next = NULL;
    return head;
}

/*
 * Point every dangling arrow in list l at state s.
 * The link must be read before the slot is overwritten, because the
 * node and the arrow share the same storage.
 */
void
patch(Ptrlist *l, State *s)
{
    while(l != NULL){
        Ptrlist *following = l->next;

        l->s = s;
        l = following;
    }
}

/*
 * Concatenate l2 onto the end of l1 and return the head of l1.
 * l1 must be a non-empty list.
 */
Ptrlist*
append(Ptrlist *l1, Ptrlist *l2)
{
    Ptrlist *tail;

    for(tail = l1; tail->next != NULL; tail = tail->next)
        ;
    tail->next = l2;
    return l1;
}

/*
 * Convert postfix regular expression to NFA.
 *
 * Operators: '.' concatenation, '|' alternation, '?' zero or one,
 * '*' zero or more, '+' one or more; every other byte is a literal.
 *
 * Returns the start state, or NULL if postfix is NULL, is malformed
 * (an operator lacks operands, or the expression does not reduce to
 * exactly one fragment), or needs more than MaxFrag pending fragments.
 * States allocated before an error is detected are not freed.
 */
State*
post2nfa(char *postfix)
{
    enum { MaxFrag = 1000 };
    char *p;
    Frag stack[MaxFrag], *stackp, e1, e2, e;
    State *s;

    if(postfix == NULL)
        return NULL;

    #define push(s) *stackp++ = s
    #define pop() *--stackp
    #define depth() (stackp - stack)

    stackp = stack;
    for(p=postfix; *p; p++){
        switch(*p){
        default:
            /* Only literals grow the stack; operators never increase it. */
            if(depth() >= MaxFrag)
                return NULL;
            /*
             * Mask to 0..255 so the label compares equal to the
             * "*s & 0xFF" value used when stepping the NFA, even
             * where plain char is signed.
             */
            s = state(*p & 0xFF, NULL, NULL);
            push(frag(s, list1(&s->out)));
            break;
        case '.':    /* catenate */
            if(depth() < 2)
                return NULL;
            e2 = pop();
            e1 = pop();
            patch(e1.out, e2.start);
            push(frag(e1.start, e2.out));
            break;
        case '|':    /* alternate */
            if(depth() < 2)
                return NULL;
            e2 = pop();
            e1 = pop();
            s = state(Split, e1.start, e2.start);
            push(frag(s, append(e1.out, e2.out)));
            break;
        case '?':    /* zero or one */
            if(depth() < 1)
                return NULL;
            e = pop();
            s = state(Split, e.start, NULL);
            push(frag(s, append(e.out, list1(&s->out1))));
            break;
        case '*':    /* zero or more */
            if(depth() < 1)
                return NULL;
            e = pop();
            s = state(Split, e.start, NULL);
            patch(e.out, s);
            push(frag(s, list1(&s->out1)));
            break;
        case '+':    /* one or more */
            if(depth() < 1)
                return NULL;
            e = pop();
            s = state(Split, e.start, NULL);
            patch(e.out, s);
            push(frag(e.start, list1(&s->out1)));
            break;
        }
    }

    /* A well-formed expression leaves exactly one fragment. */
    if(depth() != 1)
        return NULL;
    e = pop();

    patch(e.out, &matchstate);
    return e.start;
#undef depth
#undef pop
#undef push
}

/* A set of NFA states, stored as an array of pointers plus a count. */
typedef struct List List;
struct List
{
    State **s;  /* array of state pointers; allocated in main with nstate entries */
    int n;      /* number of entries of s currently in use */
};
List l1, l2;            /* the two lists match() alternates between */
static int listid;      /* incremented by startlist and step; compared with State.lastlist in addstate */

void addstate(List*, State*);
void step(List*, int, List*);

/*
 * Reset l and fill it with the states reachable from start
 * without consuming input. Returns l.
 */
List*
startlist(State *start, List *l)
{
    listid += 1;
    l->n = 0;
    addstate(l, start);
    return l;
}

/* Return 1 if the matching state is among the states in l, else 0. */
int
ismatch(List *l)
{
    int k = l->n;

    while(k-- > 0){
        if(l->s[k] == &matchstate)
            return 1;
    }
    return 0;
}

/*
 * Add s to l, expanding Split states into their targets.
 * A state already stamped with the current listid is skipped, so each
 * state is visited at most once per list.
 */
void
addstate(List *l, State *s)
{
    if(s == NULL)
        return;
    if(s->lastlist == listid)
        return;
    s->lastlist = listid;

    if(s->c != Split){
        l->s[l->n] = s;
        l->n++;
        return;
    }

    /* Split itself is not stored; follow both unlabeled arrows. */
    addstate(l, s->out);
    addstate(l, s->out1);
}

/*
 * Advance the NFA over one input character: for every state in clist
 * labeled c, add its successor to nlist. nlist is emptied first and a
 * new listid is started.
 */
void
step(List *clist, int c, List *nlist)
{
    int k;

    nlist->n = 0;
    listid++;
    for(k = 0; k < clist->n; k++){
        State *cur = clist->s[k];

        if(cur->c != c)
            continue;
        addstate(nlist, cur->out);
    }
}

/*
 * Simulate the NFA rooted at start over the whole string s.
 * Returns 1 if the matching state is active once all input has been
 * consumed, 0 otherwise.
 */
int
match(State *start, char *s)
{
    List *cur, *next, *tmp;

    cur = startlist(start, &l1);
    next = &l2;
    while(*s != '\0'){
        step(cur, *s & 0xFF, next);
        /* The freshly built list becomes current; reuse the old one. */
        tmp = cur;
        cur = next;
        next = tmp;
        s++;
    }
    return ismatch(cur);
}

/*
 * Usage: nfa regexp string...
 * Compiles argv[1] to an NFA and prints each following argument that
 * the regexp matches in full. Returns 1 on a usage error, a bad
 * regexp, or an allocation failure; 0 otherwise.
 */
int
main(int argc, char **argv)
{
    int i;
    char *post;
    State *start;

    if(argc < 3){
        fprintf(stderr, "usage: nfa regexp string...\n");
        return 1;
    }
    
    post = re2post(argv[1]);
    if(post == NULL){
        fprintf(stderr, "bad regexp %s\n", argv[1]);
        return 1;
    }

    start = post2nfa(post);
    if(start == NULL){
        fprintf(stderr, "error in post2nfa %s\n", post);
        return 1;
    }
    
    /* Each list is sized by the number of states created. */
    l1.s = malloc(nstate*sizeof l1.s[0]);
    l2.s = malloc(nstate*sizeof l2.s[0]);
    if(l1.s == NULL || l2.s == NULL){
        fprintf(stderr, "out of memory\n");
        free(l1.s);
        free(l2.s);
        return 1;
    }
    for(i=2; i<argc; i++)
        if(match(start, argv[i]))
            printf("%s\n", argv[i]);
    free(l1.s);
    free(l2.s);
    return 0;
}

/*
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the
 * Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute,
 * sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall
 * be included in all copies or substantial portions of the
 * Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
 * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS
 * OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

推荐一下这个链接: http://swtch.com/~rsc/regexp/

posted on 2013-05-12 20:02 Jackiesteed 阅读(580) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部