[原]字典树处理单词集

引言：昨天写了一个简单的通过字典树来索引比较大的字母集合的程序。通过字典树，确实能够大大减少查询时间，是一种不错的字母表的匹配方案。这里我就拿出来分享一下。(ps:英文单词集大概有35W+ 条记录，数据量确实不小，在操作中为了简化，去除了英文单词中的 " ' " "-" 等等，作为字典树，必须以26 个英文字母作为树的子节点的索引。)

看看字典树的结构定义：

/*
 * One node of the dictionary tree (trie).
 * A child slot is selected by (character - 97), i.e. the offset of a
 * lowercase letter from 'a'.
 */
typedef struct _dict_tree_
{
	/* Child nodes, one pointer per slot; NULL when no child exists. */
	struct _dict_tree_ * dt[TREENODENUM];
	/* Character stored by insert() when it creates this node. */
	char 	c ;
	/* OCCUPY when a stored word ends at this node, EMPTY otherwise. */
	char	flag ;
}DT ;

dt 指针指向的是通过字母转化的子节点，c可以忽略。

flag后面会提到。

这里只要把字典树看做一个26叉树就可以。

比如英文单词 banana：首先把首字母 b 的 ASCII 码转换为子节点下标，index = 'b' - 97（97 是 'a' 的 ASCII 码），得到 1；

把整个单词看做字符串， index = str[i] - 97 ;

然后通过遍历字符串来构建这棵树。

字符串大小写转换：

/*
 * Convert every ASCII capital letter of str to lowercase, in place.
 * All other characters are left untouched.
 *
 * Returns 1 on success, -1 when str is NULL.
 */
static int toSmall( char *str )
{
	char *cursor ;

	if( !str )
		return -1 ;

	for( cursor = str ; *cursor != '\0' ; ++cursor )
	{
		int is_upper = ( *cursor >= 'A' ) && ( *cursor <= 'Z' ) ;

		/* 'A'..'Z' and 'a'..'z' differ only in bit 0x20. */
		if( is_upper )
			*cursor |= 0x20 ;
	}
	return 1 ;
}

简单的宏定义：

/*
 * Number of child slots in every tree node.  The letters 'a'..'z' use
 * slots 0..25; slot 26 is not reached by any lowercase letter.
 */
#define TREENODENUM   27
/* Node flag values: OCCUPY marks the end of a stored word, EMPTY does not. */
#define OCCUPY        0x0001
#define EMPTY         0x0000

上述第 2、3 个宏（OCCUPY 和 EMPTY）就是字典树结构体中 flag 的取值。

创建一棵树：实际上这里只是创建一个根节点

DT * createTree()
{
	DT * retdt = (DT *)malloc( sizeof(DT) );
	int i = 0 ; 
	for( ; i < TREENODENUM ; i++ )
	{
		retdt->dt[i] = NULL ;
		retdt->flag = EMPTY ;
	}
	return retdt ;
}

插入一个单词到树：

/*
 * Insert one word into the dictionary tree rooted at t.
 *
 * str is lower-cased in place by toSmall() first, so it must point to
 * writable memory.  Each character then selects the child slot
 * (character - 'a'); missing nodes are created on the way down and the
 * node reached by the last character is flagged OCCUPY.
 *
 * The whole word is validated before the tree is modified, so a word
 * containing a character without a slot leaves the tree unchanged.
 *
 * Returns 1 on success, -1 when t or str is NULL, when str contains a
 * character whose slot would lie outside the child array, or when a node
 * cannot be allocated.
 */
int insert( DT * t , char *str )
{
	size_t i ;
	size_t len ;
	int j ;
	int index ;
	DT *pt ;

	if( t == NULL )
		return -1;
	if( toSmall(str) == -1 )
	{
		printf( "toSmall \n");
		return -1;
	}

	/* Reject the word up front: an unchecked index would read and write
	 * outside pt->dt[] for characters such as '\'' or '-'. */
	len = strlen(str);
	for( i = 0 ; i < len ; i++ )
	{
		index = str[i] - 'a' ;
		if( index < 0 || index >= TREENODENUM )
			return -1;
	}

	pt = t ;
	for( i = 0 ; i < len ; i++ )
	{
		index = str[i] - 'a' ;
		/* Follow the child for this character, creating it if absent. */
		if( pt->dt[index] == NULL )
		{
			DT *node = malloc( sizeof *node );

			if( node == NULL )
				return -1;
			node->c = str[i] ;
			node->flag = EMPTY ;
			for( j = 0 ; j < TREENODENUM ; j++ )
				node->dt[j] = NULL ;
			/* Link the node only once it is fully initialised. */
			pt->dt[index] = node ;
		}
		pt = pt->dt[index] ;
	}
	pt->flag = OCCUPY ;
	return 1;
}

在这里可以看到 flag 标志位的作用：它用来表示一个单词在树中到此结束。以 abuse 为例，如果插入最后一个字符 e 时不做结束标记，而仅凭节点是否存在来判断，那么查找它的前缀 abu 时也会被误认为存在于字典中。因此需要一个标志来表明某个节点可以作为单词的结尾：查询时如果走到的节点为空，或者最后一个节点的标志位为 EMPTY，就表示这个单词不在树中。乍看可能难以理解，不过仔细思考之后就能明白。

查找一个单词是否在树中：

/*
 * Look up a word in the dictionary tree rooted at t.
 *
 * str is lower-cased in place by toSmall() first, so a string that
 * contains capital letters must be writable.  The lookup then follows
 * the child slot (character - 'a') for every character.
 *
 * Returns 1 when the word is stored (the final node is flagged OCCUPY),
 * 0 when it is not (a node is missing, the final node is not flagged, or
 * a character has no slot in the child array), and -1 when t or str is
 * NULL.
 */
int findstr( DT * t , char *str )
{
	size_t i ;
	size_t len ;
	int index ;
	DT *pt ;

	if( t == NULL )
		return -1;
	if( toSmall(str) == -1 )
	{
		printf( "toSmall \n");
		return -1;
	}

	pt = t ;
	len = strlen(str);
	for( i = 0 ; i < len ; i++ )
	{
		index = str[i] - 'a' ;
		/* A character without a slot cannot be part of any stored word;
		 * indexing with it would read outside pt->dt[]. */
		if( index < 0 || index >= TREENODENUM )
			return 0 ;
		if( pt->dt[index] == NULL )
			return 0 ;
		pt = pt->dt[index] ;
	}

	/* Reaching a node is not enough: it may only be a prefix of a word. */
	return ( pt->flag == OCCUPY ) ? 1 : 0 ;
}

当然，我们需要从一个buff中批量的将单词插入树中，实际上顶部的节点的复用度是很高的。

/*
 * Insert every acceptable word of buf into the dictionary tree t.
 *
 * buf is a NUL-terminated text holding one candidate word per line,
 * lines being separated by '\n'.  A line is inserted only when it is
 * non-empty, consists solely of the letters 'a'..'z' and fits into the
 * local word buffer; any other line is skipped as a whole.  A last line
 * without a trailing '\n' is processed like any other line.
 *
 * Returns the number of words for which insert() succeeded, or -1 when
 * t or buf is NULL.
 */
int puttodic( DT * t ,char * buf )
{
	char lword[1024];
	const char *p ;
	int count = 0 ;

	if( t == NULL || buf == NULL )
		return -1;

	p = buf ;
	while( *p )
	{
		size_t n = 0 ;
		int valid = 1 ;

		/* Consume one line, stopping at '\n' or at the end of buf. */
		while( *p && *p != '\n' )
		{
			/* Leave room for the terminator; one bad character or an
			 * over-long line disqualifies the whole line. */
			if( *p < 'a' || *p > 'z' || n >= sizeof lword - 1 )
				valid = 0 ;
			else if( valid )
				lword[n++] = *p ;
			p++ ;
		}
		if( *p == '\n' )
			p++ ;

		if( valid && n > 0 )
		{
			/* Terminate directly after the last copied letter. */
			lword[n] = '\0' ;
			if( insert(t,lword) == 1 )
				count++ ;
		}
	}
	return count ;
}

在这里，博主过滤了很多非标准的英文单词形式，如果需要可以做相应的改变。

这里的buffer 来自：

/*
 * Allocate a 30 MiB buffer and zero it, so the bytes stored by read() are
 * followed by NUL bytes as long as the file is smaller than the buffer.
 */
char *buf  = (char *)malloc( 1024*1024*30);
	memset(buf,0,1024*1024*30);
/*
 * Open the word list read-only and read up to 30 MiB of it in one call.
 * NOTE(review): the results of malloc(), open() and read() are not
 * checked here, and a file of 30 MiB or more would leave buf without a
 * terminating NUL.
 */
int fd = open("Dict.txt",O_RDONLY);
	read(fd,buf,1024*1024*30);

测试代码：

 1 #ifdef _DT_UNIT_TEST_
 2 int main()
 3 {
 4     
 5     DT * mytree = createTree();
 6 
 7     char *buf  = (char *)malloc( 1024*1024*30);
 8     memset(buf,0,1024*1024*30);
 9     int fd = open("Dict.txt",O_RDONLY);
10     read(fd,buf,1024*1024*30);
11     //printf("%s",buf);
12     
13     puttodic( mytree,buf );
14     fprintf(stderr,"%d\n",findstr(mytree,"abuse"));
15     fprintf(stderr,"%d\n",findstr(mytree,"banana"));
16     fprintf(stderr,"%d\n",findstr(mytree,"diandian"));
17     fprintf(stderr,"%d\n",findstr(mytree,"abus"));
18     
19     close(fd);
20     free(buf);
21     return 0;
22 }
23 
24 
25 #endif

　　Dict.txt 可以从Linux 的 /usr/share/dict 目录导出，Debian体系可能没有，我的Ubuntu就没有。

posted @ 2012-04-20 21:13 _Boz 阅读(1158) 评论(1) 收藏举报

刷新页面返回顶部

破修电脑的

新书《自主实现SDN虚拟网络与企业私有云》出版，欢迎拍砖

[原]字典树处理单词集

公告