Wu-Manber实现
WM.h
/*
 * wm.h - public interface of a Wu-Manber multi-pattern string matcher
 * that hashes 2-byte blocks (B = 2).
 *
 * Typical use: wmNew() -> wmAddPattern() for each pattern ->
 * wmPrepPatterns() -> wmSearch() -> wmFree().
 */
#ifndef WM_H
#define WM_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HASHTABLESIZE (256*256)   /* one slot per possible 2-byte block */
#define MAXLEN 256

/* One pattern; doubles as a node of the singly linked insertion list. */
typedef struct wm_pattern_struct
{
    struct wm_pattern_struct *next; /* next pattern in the insertion list */
    unsigned char *psPat;           /* pattern bytes, NUL-terminated copy */
    unsigned psLen;                 /* length of pattern in bytes */
} WM_PATTERN_STRUCT;

#define HASH_TYPE short           /* element type of the hash/prefix tables */
#define SHIFTTABLESIZE (256*256)  /* one slot per possible 2-byte block */

/* Matcher state: the pattern set plus the tables derived from it. */
typedef struct wm_struct
{
    WM_PATTERN_STRUCT *plist;       /* pattern list (most recently added first) */
    WM_PATTERN_STRUCT *msPatArray;  /* array of patterns, sorted by suffix hash */
    unsigned short *msNumArray;     /* array of group counts, # of patterns in each hash group */
    int msNumPatterns;              /* number of patterns loaded */
    unsigned msNumHashEntries;      /* number of slots in msHash */
    HASH_TYPE *msHash;              /* 2-byte suffix block -> index of first pattern in its group */
    unsigned char *msShift;         /* bad-block shift table, indexed by 2-byte block */
    HASH_TYPE *msPrefix;            /* hash of the first 2 bytes of each pattern */
    int msSmallest;                 /* shortest length of all patterns */
} WM_STRUCT;

/* Allocates an empty matcher; returns NULL (0) when out of memory. */
WM_STRUCT *wmNew(void);

/* Releases a matcher obtained from wmNew(). */
void wmFree(WM_STRUCT *ps);

/* Adds a copy of the m-byte pattern P; returns 0 on success, -1 on failure. */
int wmAddPattern(WM_STRUCT *ps, unsigned char *P, int m);

/* Builds the search tables from the added patterns; returns 0 on success, -1 on failure. */
int wmPrepPatterns(WM_STRUCT *ps);

/* Searches the n-byte text Tx and reports every pattern occurrence. */
void wmSearch(WM_STRUCT *ps, unsigned char *Tx, int n);

#endif /* WM_H */
WM.c
#include #include #include #include #include "wm.h" extern int nline=1; extern int nfound=0; #define MAXN 10001 //模式串的最大长度MAXN - 1 #define MAXM 51//单词最大长度为MAXM - 1 WM_STRUCT * wmNew() { WM_STRUCT *p=(WM_STRUCT *)malloc(sizeof(WM_STRUCT)); if(!p) return 0; p->msNumPatterns=0; p->msSmallest=1000; return p; } void wmFree(WM_STRUCT *ps) { if(ps->msPatArray) { if(ps->msPatArray->psPat) free(ps->msPatArray->psPat); free(ps->msPatArray ); } if(ps->msNumArray) free(ps->msNumArray); if(ps->msHash) free(ps->msHash); if(ps->msPrefix) free(ps->msPrefix); if(ps->msShift) free(ps->msShift); free(ps); } int wmAddPattern(WM_STRUCT *ps,unsigned char *q,int m)//m字符串长度 { WM_PATTERN_STRUCT *p; p=(WM_PATTERN_STRUCT *)malloc(sizeof(WM_PATTERN_STRUCT)); if(!p) return -1; p->psPat=(unsigned char*)malloc(m+1); memset(p->psPat+m,0,1); memcpy(p->psPat,q,m); p->psLen=m; ps->msNumPatterns++; if(p->psLen<(unsigned)ps->msSmallest) ps->msSmallest=p->psLen; p->next=ps->plist; ps->plist=p; return 0; } static unsigned HASH16(unsigned char *T) { return (unsigned short) (((*T)<<8) | *(T+1)); } void sort(WM_STRUCT *ps)//字符串哈希值从小到大排列 { int m=ps->msSmallest; int i,j; unsigned char *temp; int flag; for(i=ps->msNumPatterns-1,flag=1;i>0 && flag;i--) { flag=0; for(j=0;j { if(HASH16(&(ps->msPatArray[j+1].psPat[m-2]))msPatArray[j].psPat[m-2]))) { flag=1; temp=ps->msPatArray[j+1].psPat; ps->msPatArray[j+1].psPat=ps->msPatArray[j].psPat; ps->msPatArray[j].psPat=temp; } } } } static void wmPrepHashedPatternGroups(WM_STRUCT *ps)//计算有多少个不同哈希值,且从小到大 { unsigned sindex,hindex,ningroup; int i; int m=ps->msSmallest; ps->msNumHashEntries=HASHTABLESIZE; ps->msHash=(HASH_TYPE*)malloc(sizeof(HASH_TYPE)* ps->msNumHashEntries); if(!ps->msHash) { printf("No memory in wmPrepHashedPatternGroups()\n"); return; } for(i=0;i<(int)ps->msNumHashEntries;i++) { ps->msHash[i]=(HASH_TYPE)-1; } for(i=0;imsNumPatterns;i++) { hindex=HASH16(&ps->msPatArray[i].psPat[m-2]); sindex=ps->msHash[hindex]=i; ningroup=1; while((++imsNumPatterns) && 
(hindex==HASH16(&ps->msPatArray[i].psPat[m-2]))) ningroup++; ps->msNumArray[sindex]=ningroup; i--; } } static void wmPrepShiftTable(WM_STRUCT *ps)//建立shift表 { int i; unsigned short m,k,cindex; unsigned shift; m=(unsigned short)ps->msSmallest; ps->msShift=(unsigned char*)malloc(SHIFTTABLESIZE*sizeof(char)); if(!ps->msShift) return; for(i=0;i { ps->msShift[i]=(unsigned)(m-2+1); } for(i=0;imsNumPatterns;i++) { for(k=0;k { shift=(unsigned short)(m-2-k); cindex=((ps->msPatArray[i].psPat[k]<<8) | (ps->msPatArray[i].psPat[k+1]));//B为2 if(shiftmsShift[cindex]) ps->msShift[cindex]=shift;//k=m-2时,shift=0, } } } static void wmPrepPrefixTable(WM_STRUCT *ps)//建立Prefix表 { int i; ps->msPrefix=(HASH_TYPE*)malloc(sizeof(HASH_TYPE)* ps->msNumPatterns); if(!ps->msPrefix) { printf("No memory in wmPrepPrefixTable()\n"); return; } for(i=0;imsNumPatterns;i++) { ps->msPrefix[i]=HASH16(ps->msPatArray[i].psPat); } } void wmGroupMatch(WM_STRUCT *ps,//后缀哈希值相同,比较前缀以及整个字符匹配 int lindex, unsigned char *Tx, unsigned char *T) { WM_PATTERN_STRUCT *patrn; WM_PATTERN_STRUCT *patrnEnd; int text_prefix; unsigned char *px,*qx; patrn=&ps->msPatArray[lindex]; patrnEnd=patrn+ps->msNumArray[lindex]; text_prefix=HASH16(T); for(;patrn { if(ps->msPrefix[lindex++]!=text_prefix) continue; else { px=patrn->psPat; qx=T; while(*(px++)==*(qx++) && *(qx-1)!='\0'); if(*(px-1)=='\0') { printf("Match pattern \"%s\" at line %d column %d\n",patrn->psPat,nline,T-Tx+1); nfound++; } } } } int wmPrepPatterns(WM_STRUCT *ps)//由plist得到msPatArray { int kk; WM_PATTERN_STRUCT *plist; ps->msPatArray=(WM_PATTERN_STRUCT*)malloc(sizeof(WM_PATTERN_STRUCT)*ps->msNumPatterns); if(!ps->msPatArray) return -1; ps->msNumArray=(unsigned short*)malloc(sizeof(short)*ps->msNumPatterns); if(!ps->msNumArray) return -1; for(kk=0,plist=ps->plist;plist!=NULL && kkmsNumPatterns;plist=plist->next) { memcpy(&ps->msPatArray[kk++],plist,sizeof(WM_PATTERN_STRUCT)); } sort(ps); wmPrepHashedPatternGroups(ps); wmPrepShiftTable(ps); wmPrepPrefixTable(ps); return 
0; } void wmSearch(WM_STRUCT *ps,unsigned char *Tx,int n)//字符串查找 { int Tleft,lindex,tshift; unsigned char *T,*Tend,*window; Tleft=n; Tend=Tx+n; if(nmsSmallest) return; for(T=Tx,window=Tx+ps->msSmallest-1;window { tshift=ps->msShift[(*(window-1)<<8) | *window]; while(tshift) { window+=tshift; T+=tshift; Tleft-=tshift; if(window>Tend) return; tshift=ps->msShift[(*(window-1)<<8) | *window]; } if((lindex=ps->msHash[(*(window-1)<<8) | *window])==(HASH_TYPE)-1) continue; lindex=ps->msHash[(*(window-1)<<8) | *window]; wmGroupMatch(ps,lindex,Tx,T); } } int main() { int length,n; WM_STRUCT *p; char keyword[MAXM]; //单词 char str[MAXN]; //模式串 p=wmNew(); printf("scanf the number of words-->\n"); scanf("%d", &n); printf("scanf the words-->\n"); while(n --) { scanf("%s", keyword); length=strlen(keyword); wmAddPattern(p,keyword,length); } wmPrepPatterns(p); printf("scanf the text string-->\n"); scanf("%s", str); length=strlen(str); wmSearch(p,str,length); wmFree(p); return(0); }