一个用于读unicode文本的迭代器(iterator)

written by chenghuige at gmail.com

也许有更好的方法我没有想到，但是我没有在linux下面找到比较方便的读取unicode文本的方法。

用ICU,QT都太重量级了，于是自己写了一个包装好的unicode_iterator,当然还可以进一步

包装，比如提出一个类提供begin和end。但是还要考虑很多，比如有的是little endian格式的，有的是big endian

格式的，同时有的文本可能并没有标准的开头表明它是unicode格式的，以及是little endian还是big endian，

需要用户自己指出。

当前初始化给定一个std::filebuf,然后每次*iter提取的是一个UTF16格式的字符，定义

typedef unsigned short UTF16

注意没有用wchar_t因为这个在windows下是2byte但是在linux下GCC默认是4byte，为UTF32准备的。

迭代器用boost iterator帮助简化书写，类似的我们也可以写出给定一个utf8或者GBK等等编码的流转换

到unicode(UTF16)的iterator,内部统一用unicode处理数据还是比较方便的，C++0X也出现了u16string.

用法示意：假设是一个标准的little endian的unicode文本

代码

1
2   using namespace std;
3   using namespace glseg;
4
5   ifstream istr(infilename.c_str());
6   filebuf* pbuf = istr.rdbuf();
7
8   unsigned char ch  = pbuf->sbumpc();
9   unsigned char ch1 = pbuf->sbumpc();
10
11   if (ch == encoding_type[Utf16LittleEndian][0] && ch1 == encoding_type[Utf16LittleEndian][1])
12     cout << "The encoding of this file is  utf16 little endian" << endl;
13   if (ch == encoding_type[Utf16BigEndian][0] && ch1 == encoding_type[Utf16BigEndian][1])
14     cout << "The encoding of this file is  utf16 big endian" << endl;
15
16   unicode_iterator<> first(pbuf);
17   unicode_iterator<> end;
18
19   UTF8 utf8_array[4];
20   for (;first != end; ++first) {
21     unicode2utf8(*first, utf8_array); //将utf16字符转换到utf8,这样用cout就可以显示了因为默认都是utf8
22     cout << utf8_array;
23   }

代码

/**
 *  ==============================================================================
 *
 *          \file   type.h
 *
 *        \author   chenghuige at gmail.com
 *
 *          \date   2009-12-09 19:02:16.223408
 *
 *   Description:   Declarations of the basic code-unit types, the list of
 *                  recognised text encodings and their byte-order marks.
 *  ==============================================================================
 */

#ifndef TYPE_H_
#define TYPE_H_

namespace glseg {

typedef unsigned long   UTF32;  /* at least 32 bits */
typedef unsigned short  UTF16;  /* at least 16 bits */
typedef unsigned char   UTF8;   /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/**
 * Text encodings known to the library. The enumerator values double as row
 * indices into encoding_type; encoding_num is the number of rows.
 */
enum EncodingType {
    Unknown,
    Utf16LittleEndian,  // Default on Windows
    Utf16BigEndian,
    Utf8,
    encoding_num
};

/**
 * Leading marker bytes for each EncodingType, indexed by the enumerator.
 * Every row has three bytes; the two UTF-16 markers occupy only the first
 * two and are padded with 0x00.
 */
const UTF8 encoding_type[encoding_num][3] =
{
    {0x00, 0x00, 0x00},  // Unknown
    {0xFF, 0xFE, 0x00},  // Little endian
    {0xFE, 0xFF, 0x00},  // Big endian
    {0xEF, 0xBB, 0xBF},  // UTF8
};

}  //----end of namespace glseg

#endif  //----end of TYPE_H_

代码

  1 /**
  2  *  ==============================================================================
  3  *
  4  *          \file   unicode_iterator.h
  5  *
  6  *        \author   chenghuige at gmail.com
  7  *
  8  *          \date   2009-12-09 16:56:59.395999
  9  *
10  *   Description:   An unicode iterator for unicode encoding file
11  *  ==============================================================================
12  */
13
14 #ifndef UNICODE_ITERATOR_H_
15 #define UNICODE_ITERATOR_H_
16
17 #include <boost/iterator/iterator_facade.hpp>
18 #include <fstream>
19 #include "type.h"
20
21 namespace glseg {
22
23 //this is mainly for file because when we read the file
24 //we can not read two bytes and decide the little or
25 //big edian easy, this class will help
26 //Perhaps name it as unicode or utf16 fstream buf itreator is better:)
27
28 //the big endian case
29 template <bool isLittleEndian>
30 struct UTF8_2_UTF16 {
31   static void convert(const UTF8 ch1, const UTF8 ch2, UTF16* result) {
32     *result = ch1;
33     *result <<= 8;
34     *result |= ch2;
35   }
36 };
37
38 //the little endian case
39 template <>
40 struct UTF8_2_UTF16<true> {
41   static void convert(const UTF8 ch1, const UTF8 ch2, UTF16* result) {
42     *result = ch2;
43     *result <<= 8;
44     *result |= ch1;
45   }
46 };
47
48
49 template<bool isLittleEndian = true>
50 class unicode_iterator
51   : public boost::iterator_facade<
52       unicode_iterator<isLittleEndian>
53     , UTF16
54     , boost::forward_traversal_tag
55     , UTF16&
56     >
57 {
58 public:
59   unicode_iterator()
60     : pbuf_(0), valid_(false) {}
61
62   explicit unicode_iterator(std::filebuf* pbuf)
63     :pbuf_(pbuf), valid_(true) { increment(); } //need to be ready for first *iter
64
65 private:
66   friend class boost::iterator_core_access;
67
68   void increment() {
69     if (pbuf_->sgetc()!=EOF) {
70       ch1_ = pbuf_->sbumpc();
71       ch2_ = pbuf_->sbumpc();
72       UTF8_2_UTF16<isLittleEndian>::convert(ch1_, ch2_, &result_);
73     }
74     else {
75       valid_ = false;
76     }
77   }
78
79   bool equal(unicode_iterator<isLittleEndian> const& other) const {
80     return (other.valid_ && valid_)
81            ?(other.pbuf_ == pbuf_)
82           :(other.valid_ == valid_);
83   }
84
85   UTF16& dereference() const {
86     return result_;
87   }
88
89 private:
90   std::filebuf* pbuf_;
91   bool valid_;
92
93   UTF8  ch1_;
94   UTF8  ch2_;
95   mutable UTF16 result_;
96   //FIXME if not mutable
97   //for 86 error: invalid initialization of reference oftype 'glseg::UTF16&' from expression of type 'const glseg::UTF16'
98 };
99
100 }  //----end of namespace glseg
101
102 #endif  //----end of UNICODE_ITERATOR_H_

posted @ 2009-12-09 21:17 阁子阅读(1856) 评论(0) 编辑收藏举报

刷新页面返回顶部

游园惊梦(https://github.com/chenghuige)

一个用于读unicode文本的迭代器(iterator)

公告