C语言解析星际译王词典
扫盲时间:
星际译王(StarDict)是利用 GTK(GIMP Toolkit)开发的国际化、跨平台的自由桌面字典软件。它本身并不包含字典档,使用者须自行下载字典配合使用。它可以运行于多种平台,如 Linux、Microsoft Windows、FreeBSD 及 Solaris,并以 GPL 授权发布。
星际译王项目:http://stardict.sourceforge.net/
星际译王词典:
下载:http://stardict.sourceforge.net/Dictionaries.php
好,言归正传:
有个学长用 C+Python 写了个 web 版的在线词典:C 做词典服务器,Python 做 CGI 服务器,两者通过 FIFO 通信,词典用的是星际译王格式的词典,很有意思。本人模仿学长的思路,仅为唤醒内心深处对纯 C 的美好回忆-_-。当然这是在业余时间写的,不过由于工作中需要学的东西渐渐多了一些,就少有时间继续做了。现在先公布一些代码给大家分享,免得以后忘光光-_-
最关键的就是对星际译王词典文件的解析,对于其格式,读者可以自己安装一个,在其安装目录里有个文件专门介绍了词典格式,英文的。
我简单说一下:一个词典由三个文件组成。ifo:描述词典信息;idx:存放单词索引,每条记录的格式为:单词(以'\0'结尾的字符串)+偏移值(4字节,大端序)+长度(4字节,大端序);dict:只存放解释,可以被压缩。
思路:根据输入的单词,在idx中查找其在dict文件中的偏移值和长度,直接取出,很简单。
以下代码使用emacs NT在winXP+MinGW环境下编译通过:
词典信息头文件(dict_info.h):
2 * dict_info.h
3 * Author:shoru
4 * 2009-08-23 12:53
5 */
6
7 #ifndef _DICT_IFO_H
8 #define _DICT_IFO_H
9
10 /*
11 * 测试开关
12 */
13 #define DEBUG
14
15 /*
16 * 行缓冲区大小
17 */
18 #define BUFFER_SIZE 500
19
20 /*
21 * ifo文件后缀
22 */
23 #define IFO_EXT ".ifo"
24
/*
 * In-memory form of a dictionary ".ifo" description file.
 * Filled in by get_dict_info().
 */
typedef struct
{
    char version[100];          /* version string */
    int  wordcount;             /* number of words */
    int  idxfilesize;           /* size of the index file */
    char bookname[100];         /* dictionary name */
    char sametypesequence[10];
    char other_info[1000];      /* any other information we do not care about */
} DICT_INFO;
37
38 /*
39 * 解析词典,返回一个DICT_INFO结构体指针
40 */
41 DICT_INFO *get_dict_info(char *file);
42
43 /*
44 * 解析每行
45 */
46 static void parse_line(char *line, DICT_INFO *dict_info);
47
48 #endif /* _DICT_IFO_H */
词典信息源码(dict_info.c):
2 * dict_info.c
3 * Author:shoru
4 * 2009-08-23 12:54
5 */
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <errno.h>
11 #include "dict_info.h"
12
13 /*
14 * 将词典的信息文件装入结构体,并返回该结构体指针
15 * 失败返回NULL
16 */
17 DICT_INFO* get_dict_info(char *info_file)
18 {
19 FILE *ifo;
20 char *line;
21 char buffer[BUFFER_SIZE];
22
23 DICT_INFO* dict_info=(DICT_INFO*)malloc(sizeof(DICT_INFO));
24
25 ifo=fopen(info_file,"r");
26 if(ifo == NULL)
27 {
28 fprintf(stderr,"%s",strerror(errno));
29 return NULL;
30 }
31
32 while((line=fgets(buffer,BUFFER_SIZE,ifo)) != NULL)
33 {
34 parse_line(line,dict_info);
35 }
36 fclose(ifo);
37
38 return dict_info;
39 }
40
41 /*
42 * 逐行解析文件,将信息装入特定字段
43 */
44 static void parse_line(char *line,DICT_INFO *dict_info)
45 {
46 char *idx;
47
48 if((idx=strchr(line,'='))!=NULL)
49 {
50 if(strstr(line,"version")!=NULL)
51 {
52 strcpy(dict_info->version,idx+1);
53 }else if(strstr(line,"wordcount")!=NULL)
54 {
55 dict_info->wordcount=atoi(idx+1);
56 }else if(strstr(line,"idxfilesize")!=NULL)
57 {
58 dict_info->idxfilesize=atoi(idx+1);
59 }else if(strstr(line,"bookname")!=NULL)
60 {
61 strcpy(dict_info->bookname,idx+1);
62 }else if(strstr(line,"sametypesequence")!=NULL)
63 {
64 strcpy(dict_info->sametypesequence,idx+1);
65 }else{
66 strcat(dict_info->other_info,line);
67 }
68 }
69 }
70
71
72 #ifdef DEBUG
73
74 int main(int argc,char **argv)
75 {
76 DICT_INFO * tmp=get_dict_info("../dict/oxford-gb/oxford-gb-formated.ifo");
77 if(tmp == NULL)
78 {
79 printf("error\n");
80 exit(EXIT_FAILURE);
81 }else{
82
83 }
84 printf("version:%s",tmp->version);
85 printf("bookname:%s",tmp->bookname);
86 printf("wordcount:%d\n",tmp->wordcount);
87 printf("idxfilesize:%d\n",tmp->idxfilesize);
88 printf("sts:%s\n",tmp->sametypesequence);
89 printf("%s",tmp->other_info);
90 free(tmp);
91 return EXIT_SUCCESS;
92 }
93 #endif /* DEBUG */
词典索引头文件(dict_idx.h):
2 * dict_idx.h
3 * Author:shoru
4 * 2009-09-09 12:27
5 */
6
7 #ifndef _DICT_IDX_H
8 #define _DICT_IDX_H
9
10 #include "dict_info.h"
11 /*
12 * 测试开关
13 */
14 #define DEBUG
15
16 #define TRUE 1
17 /*
18 * idx文件后缀
19 */
20 #define IDX_EXT "idx"
21
/*
 * One entry of the idx file: a word plus the offset and length that are
 * stored with it.
 */
typedef struct
{
    char word[100];     /* the word, as a NUL-terminated string */
    int  offset;        /* offset stored for this word */
    int  length;        /* length stored for this word */
} WORD_IDX;
31
32 /*
33 * Get a OFF_LEN struct of a word.
34 */
35 static void *get_words(char *filename, DICT_INFO *dict_info, WORD_IDX *word_idx);
36
37 /*
38 * Binary search for word's idx information.
39 */
40 WORD_IDX *get_idx(char *word,WORD_IDX *word_idx, DICT_INFO *dict_info0);
41 inline static int to_int(unsigned char* from_int);
42 #endif /* _DICT_IDX_H */
词典索引源码(dict_idx.c):
2 * dict_idx.c
3 * Author:shoru
4 * 2009-09-09 12:27
5 */
6
7 #include <stdlib.h>
8 #include <stdio.h>
9 #include <string.h>
10 #include "dict_idx.h"
11 #include "dict_info.h"
12
13 static void *get_words(char *filename, DICT_INFO *dict_info, WORD_IDX *word_idx)
14 {
15 FILE *fd= fopen(filename,"rb");
16 size_t nread=0;
17
18 if(fd == NULL || dict_info == NULL)
19 {
20 return NULL;
21 }
22 unsigned char buffer[dict_info->idxfilesize];
23
24 nread = fread(buffer,dict_info->idxfilesize,1,fd);
25
26 unsigned char *head,*tail;
27 head=tail=buffer;
28 int it=0;
29 int total=1;
30 for(; it < dict_info->idxfilesize; it++)
31 {
32 if(*head == '\0')
33 {
34 strncpy((word_idx+total)->word,tail,head-tail+1);
35 (word_idx+total)->offset=to_int(head+1);
36 (word_idx+total)->length=to_int(head+5);
37 total++;
38 head+=9;
39 tail=head;
40 if(total==dict_info->wordcount)break;
41 }else{
42 head++;
43 continue;
44 }
45 }
46 }
47
/*
 * Decode the 4-byte big-endian integer stored at from_int.
 *
 * from_int: pointer to at least 4 readable bytes, most significant first.
 *
 * The bytes are combined in unsigned arithmetic so that a first byte of
 * 0x80 or above is never shifted into the sign bit of an int (undefined
 * behaviour). The final conversion to int is implementation-defined for
 * values above INT_MAX.
 */
inline static int to_int(unsigned char *from_int)
{
    unsigned long value = ((unsigned long)from_int[0] << 24)
                        | ((unsigned long)from_int[1] << 16)
                        | ((unsigned long)from_int[2] << 8)
                        |  (unsigned long)from_int[3];

    return (int)value;
}
52
53 WORD_IDX *get_idx(char *word,WORD_IDX *word_idx, DICT_INFO *dict_info)
54 {
55 if(word == NULL || word_idx == NULL || dict_info == NULL)
56 {
57 return NULL;
58 }
59 int head=0,tail=dict_info->wordcount,cur=tail/2;
60
61 int i=0;
62
63 while(TRUE)
64 {
65 int cmp=strcasecmp(word,word_idx[cur].word);
66 if(0 == cmp)
67 {
68 return &word_idx[cur];
69 }else if(0 > cmp){
70 tail = cur;
71 }else{
72 head = cur;
73 }
74 cur=(tail+head)/2;
75 }
76 }
77
78
79
80
81 #ifdef DEBUG
82
83 int main(int argc, char** argv)
84 {
85 char * filename="../dict/oxford-gb/oxford-gb-formated.idx";
86 char * dictname="../dict/oxford-gb/oxford-gb-formated.dict";
87
88 DICT_INFO dict_info;
89 dict_info.wordcount=39429;
90 dict_info.idxfilesize=721264;
91 WORD_IDX *idx=(WORD_IDX*)malloc(sizeof(WORD_IDX)*dict_info.wordcount);
92 get_words(filename,&dict_info,idx);
93
94 WORD_IDX *word=get_idx("a",idx,&dict_info);
95
96 printf("%s,%d,%d\n",word->word,word->offset,word->length);
97
98 FILE *dict=fopen(dictname,"r");
99 if(dict == NULL)
100 {
101 printf("dict error\n");
102 return -1;
103 }
104 if(0 != fseek(dict,word->offset,SEEK_SET)){
105 printf("seek error\n");
106 return -1;
107 }
108
109 char explain[word->length+1];
110 memset(explain,'\0',word->length+1);
111 fread(explain,word->length,1,dict);
112
113 printf("%s\n",explain);
114 free(idx);
115 return EXIT_SUCCESS;
116 }
117
118 #endif /* DEBUG */