对于海量字符串的查找,一般有两种方法:一种是建树,另一种是BF(Bloom Filter)算法,即布隆过滤器。后者从原理上讲比较简单,也易于实现,主要就是根据哈希算法来实现。
int len(char *ch) { int m=0; while(ch[m]!='\0') { m++; } return m; } bool judge(char *vertor,char ch[]){ if (GETBIT(vertor,RSHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,JSHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,PJWHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,ELFHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,BKDRHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,SDBMHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,DJBHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,DEKHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,BPHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,FNVHash(ch,len(ch)))==0) return false; if (GETBIT(vertor,APHash(ch,len(ch)))==0) return false; else return true; } int main(int argc,char *argv[]){ argv[1]="/Users/emaillist.dat"; argv[2]="/Users/checklist.dat"; argv[3]="/Users/result2222.dat"; clock_t a=clock(); int pos=1,k=0,j=0; FILE *fp_strpool,*fp_checkedstr,*fp_result; fp_strpool=fopen(argv[1], "r");//打开三个文件 fp_checkedstr=fopen(argv[2], "r"); fp_result=fopen(argv[3], "w"); char ch[ARRAY_SIZE]; char *vertor; char yes[5]="yes\n"; char no[4]="no\n"; vertor=(char *)calloc(SIZE , sizeof(char) );//申请位数组 for (int i=0; i<SIZE; i++) { vertor[i]=0; } while (fscanf(fp_strpool, "%s",ch)==1) { //fgets(ch, ARRAY_SIZE, fp_strpool); SETBIT(vertor, RSHash(ch,len(ch))); SETBIT(vertor, JSHash(ch,len(ch))); SETBIT(vertor, PJWHash(ch,len(ch))); SETBIT(vertor, ELFHash(ch,len(ch))); SETBIT(vertor, BKDRHash(ch,len(ch))); SETBIT(vertor, SDBMHash(ch,len(ch))); SETBIT(vertor, DJBHash(ch,len(ch))); SETBIT(vertor, DEKHash(ch,len(ch))); SETBIT(vertor, BPHash(ch,len(ch))); SETBIT(vertor, FNVHash(ch,len(ch))); SETBIT(vertor, APHash(ch,len(ch))); j++; } while (fscanf(fp_checkedstr, "%s",ch)==1) { k++; //fgets(ch, ARRAY_SIZE, fp_checkedstr); if (judge(vertor,ch)) { printf("%d\n",pos); pos++; fputs(yes, fp_result); } else fputs(no, fp_result); } printf("%d %d\n",j,k); fclose(fp_result); fclose(fp_checkedstr); 
fclose(fp_strpool);//关闭文件 clock_t b=clock(); double duration = (double)(b - a) / CLOCKS_PER_SEC; printf( "%f seconds\n", duration ); }