一个用于读unicode文本的迭代器(iterator)
written by chenghuige at gmail.com
也许有更好的方法我没有想到,但是我没有在linux下面找到比较方便的读取unicode文本的方法。
用ICU,QT都太重量级了,于是自己写了一个包装好的unicode_iterator,当然还可以进一步
包装比如提出一个类提供begin和end.但是还要考虑很多,比如有的是little endian格式的有的是big endian
格式的,同时有的文本可能并没有标准的开头表明它是unicode格式的,以及是little endian 还是big endian,
需要用户自己指出。
当前初始化给定一个std::filebuf,然后每次*iter提取的是一个UTF16格式的字符,定义
typedef unsigned short UTF16
注意没有用wchar_t因为这个在windows下是2byte但是在linux下GCC默认是4byte,为UTF32准备的。
迭代器用boost iterator帮助简化书写,类似的我们也可以写出给定一个utf8或者GBK等等编码的流转换
到unicode(UTF16)的iterator,内部统一用unicode处理数据还是比较方便的,C++0X也出现了u16string.
用法示意:假设是一个标准的little endian 的 unicode文本
代码
1
2 using namespace std;
3 using namespace glseg;
4
5 ifstream istr(infilename.c_str());
6 filebuf* pbuf = istr.rdbuf();
7
8 unsigned char ch = pbuf->sbumpc();
9 unsigned char ch1 = pbuf->sbumpc();
10
11 if (ch == encoding_type[Utf16LittleEndian][0] && ch1 == encoding_type[Utf16LittleEndian][1])
12 cout << "The encoding of this file is utf16 little endian" << endl;
13 if (ch == encoding_type[Utf16BigEndian][0] && ch1 == encoding_type[Utf16BigEndian][1])
14 cout << "The encoding of this file is utf16 big endian" << endl;
15
16 unicode_iterator<> first(pbuf);
17 unicode_iterator<> end;
18
19 UTF8 utf8_array[4];
20 for (;first != end; ++first) {
21 unicode2utf8(*first, utf8_array); //将utf16字符转换到utf8,这样用cout就可以显示了因为默认都是utf8
22 cout << utf8_array;
23 }
2 using namespace std;
3 using namespace glseg;
4
5 ifstream istr(infilename.c_str());
6 filebuf* pbuf = istr.rdbuf();
7
8 unsigned char ch = pbuf->sbumpc();
9 unsigned char ch1 = pbuf->sbumpc();
10
11 if (ch == encoding_type[Utf16LittleEndian][0] && ch1 == encoding_type[Utf16LittleEndian][1])
12 cout << "The encoding of this file is utf16 little endian" << endl;
13 if (ch == encoding_type[Utf16BigEndian][0] && ch1 == encoding_type[Utf16BigEndian][1])
14 cout << "The encoding of this file is utf16 big endian" << endl;
15
16 unicode_iterator<> first(pbuf);
17 unicode_iterator<> end;
18
19 UTF8 utf8_array[4];
20 for (;first != end; ++first) {
21 unicode2utf8(*first, utf8_array); //将utf16字符转换到utf8,这样用cout就可以显示了因为默认都是utf8
22 cout << utf8_array;
23 }
代码
1 /**
2 * ==============================================================================
3 *
4 * \file type.h
5 *
6 * \author chenghuige at gmail.com
7 *
8 * \date 2009-12-09 19:02:16.223408
9 *
10 * Description: different types declaration
11 * ==============================================================================
12 */
13
14 #ifndef TYPE_H_
15 #define TYPE_H_
16
namespace glseg {

// Basic code-unit types shared by the encoding helpers.
typedef unsigned long  UTF32;    /* at least 32 bits */
typedef unsigned short UTF16;    /* at least 16 bits */
typedef unsigned char  UTF8;     /* typically 8 bits */
typedef unsigned char  Boolean;  /* 0 or 1 */

// Text encodings identified by a leading byte signature.
// The enumerator values double as row indices into encoding_type below;
// encoding_num is the number of rows, not an encoding.
enum EncodingType {
    Unknown           = 0,
    Utf16LittleEndian = 1,  // Default on Windows
    Utf16BigEndian    = 2,
    Utf8              = 3,
    encoding_num      = 4
};

// Signature bytes for each EncodingType, indexed by the enumerator value.
// Every row has three bytes; the two-byte UTF-16 signatures are padded
// with a trailing 0x00.
const UTF8 encoding_type[encoding_num][3] = {
    /* Unknown           */ {0x00, 0x00, 0x00},
    /* Utf16LittleEndian */ {0xFF, 0xFE, 0x00},
    /* Utf16BigEndian    */ {0xFE, 0xFF, 0x00},
    /* Utf8              */ {0xEF, 0xBB, 0xBF},
};

} //----end of namespace glseg
41
42 #endif //----end of TYPE_H_
2 * ==============================================================================
3 *
4 * \file type.h
5 *
6 * \author chenghuige at gmail.com
7 *
8 * \date 2009-12-09 19:02:16.223408
9 *
10 * Description: different types declaration
11 * ==============================================================================
12 */
13
14 #ifndef TYPE_H_
15 #define TYPE_H_
16
namespace glseg {

// Basic code-unit types shared by the encoding helpers.
typedef unsigned long  UTF32;    /* at least 32 bits */
typedef unsigned short UTF16;    /* at least 16 bits */
typedef unsigned char  UTF8;     /* typically 8 bits */
typedef unsigned char  Boolean;  /* 0 or 1 */

// Text encodings identified by a leading byte signature.
// The enumerator values double as row indices into encoding_type below;
// encoding_num is the number of rows, not an encoding.
enum EncodingType {
    Unknown           = 0,
    Utf16LittleEndian = 1,  // Default on Windows
    Utf16BigEndian    = 2,
    Utf8              = 3,
    encoding_num      = 4
};

// Signature bytes for each EncodingType, indexed by the enumerator value.
// Every row has three bytes; the two-byte UTF-16 signatures are padded
// with a trailing 0x00.
const UTF8 encoding_type[encoding_num][3] = {
    /* Unknown           */ {0x00, 0x00, 0x00},
    /* Utf16LittleEndian */ {0xFF, 0xFE, 0x00},
    /* Utf16BigEndian    */ {0xFE, 0xFF, 0x00},
    /* Utf8              */ {0xEF, 0xBB, 0xBF},
};

} //----end of namespace glseg
41
42 #endif //----end of TYPE_H_
代码
1 /**
2 * ==============================================================================
3 *
4 * \file unicode_iterator.h
5 *
6 * \author chenghuige at gmail.com
7 *
8 * \date 2009-12-09 16:56:59.395999
9 *
10 * Description: An unicode iterator for unicode encoding file
11 * ==============================================================================
12 */
13
14 #ifndef UNICODE_ITERATOR_H_
15 #define UNICODE_ITERATOR_H_
16
17 #include <boost/iterator/iterator_facade.hpp>
18 #include <fstream>
19 #include "type.h"
20
21 namespace glseg {
22
23 //This is mainly for files: when reading a file we cannot simply read
24 //two bytes and easily tell whether they are little or big endian;
25 //this class helps with that.
26 //Perhaps naming it a unicode (or utf16) fstream buf iterator would be better :)
27
28 //the big endian case
29 template <bool isLittleEndian>
30 struct UTF8_2_UTF16 {
31 static void convert(const UTF8 ch1, const UTF8 ch2, UTF16* result) {
32 *result = ch1;
33 *result <<= 8;
34 *result |= ch2;
35 }
36 };
37
38 //the little endian case
39 template <>
40 struct UTF8_2_UTF16<true> {
41 static void convert(const UTF8 ch1, const UTF8 ch2, UTF16* result) {
42 *result = ch2;
43 *result <<= 8;
44 *result |= ch1;
45 }
46 };
47
48
49 template<bool isLittleEndian = true>
50 class unicode_iterator
51 : public boost::iterator_facade<
52 unicode_iterator<isLittleEndian>
53 , UTF16
54 , boost::forward_traversal_tag
55 , UTF16&
56 >
57 {
58 public:
59 unicode_iterator()
60 : pbuf_(0), valid_(false) {}
61
62 explicit unicode_iterator(std::filebuf* pbuf)
63 :pbuf_(pbuf), valid_(true) { increment(); } //need to be ready for first *iter
64
65 private:
66 friend class boost::iterator_core_access;
67
68 void increment() {
69 if (pbuf_->sgetc()!=EOF) {
70 ch1_ = pbuf_->sbumpc();
71 ch2_ = pbuf_->sbumpc();
72 UTF8_2_UTF16<isLittleEndian>::convert(ch1_, ch2_, &result_);
73 }
74 else {
75 valid_ = false;
76 }
77 }
78
79 bool equal(unicode_iterator<isLittleEndian> const& other) const {
80 return (other.valid_ && valid_)
81 ?(other.pbuf_ == pbuf_)
82 :(other.valid_ == valid_);
83 }
84
85 UTF16& dereference() const {
86 return result_;
87 }
88
89 private:
90 std::filebuf* pbuf_;
91 bool valid_;
92
93 UTF8 ch1_;
94 UTF8 ch2_;
95 mutable UTF16 result_;
96 //FIXME if not mutable
97 //for 86 error: invalid initialization of reference oftype 'glseg::UTF16&' from expression of type 'const glseg::UTF16'
98 };
99
100 } //----end of namespace glseg
101
102 #endif //----end of UNICODE_ITERATOR_H_
2 * ==============================================================================
3 *
4 * \file unicode_iterator.h
5 *
6 * \author chenghuige at gmail.com
7 *
8 * \date 2009-12-09 16:56:59.395999
9 *
10 * Description: An unicode iterator for unicode encoding file
11 * ==============================================================================
12 */
13
14 #ifndef UNICODE_ITERATOR_H_
15 #define UNICODE_ITERATOR_H_
16
17 #include <boost/iterator/iterator_facade.hpp>
18 #include <fstream>
19 #include "type.h"
20
21 namespace glseg {
22
23 //This is mainly for files: when reading a file we cannot simply read
24 //two bytes and easily tell whether they are little or big endian;
25 //this class helps with that.
26 //Perhaps naming it a unicode (or utf16) fstream buf iterator would be better :)
27
28 //the big endian case
29 template <bool isLittleEndian>
30 struct UTF8_2_UTF16 {
31 static void convert(const UTF8 ch1, const UTF8 ch2, UTF16* result) {
32 *result = ch1;
33 *result <<= 8;
34 *result |= ch2;
35 }
36 };
37
38 //the little endian case
39 template <>
40 struct UTF8_2_UTF16<true> {
41 static void convert(const UTF8 ch1, const UTF8 ch2, UTF16* result) {
42 *result = ch2;
43 *result <<= 8;
44 *result |= ch1;
45 }
46 };
47
48
49 template<bool isLittleEndian = true>
50 class unicode_iterator
51 : public boost::iterator_facade<
52 unicode_iterator<isLittleEndian>
53 , UTF16
54 , boost::forward_traversal_tag
55 , UTF16&
56 >
57 {
58 public:
59 unicode_iterator()
60 : pbuf_(0), valid_(false) {}
61
62 explicit unicode_iterator(std::filebuf* pbuf)
63 :pbuf_(pbuf), valid_(true) { increment(); } //need to be ready for first *iter
64
65 private:
66 friend class boost::iterator_core_access;
67
68 void increment() {
69 if (pbuf_->sgetc()!=EOF) {
70 ch1_ = pbuf_->sbumpc();
71 ch2_ = pbuf_->sbumpc();
72 UTF8_2_UTF16<isLittleEndian>::convert(ch1_, ch2_, &result_);
73 }
74 else {
75 valid_ = false;
76 }
77 }
78
79 bool equal(unicode_iterator<isLittleEndian> const& other) const {
80 return (other.valid_ && valid_)
81 ?(other.pbuf_ == pbuf_)
82 :(other.valid_ == valid_);
83 }
84
85 UTF16& dereference() const {
86 return result_;
87 }
88
89 private:
90 std::filebuf* pbuf_;
91 bool valid_;
92
93 UTF8 ch1_;
94 UTF8 ch2_;
95 mutable UTF16 result_;
96 //FIXME if not mutable
97 //for 86 error: invalid initialization of reference oftype 'glseg::UTF16&' from expression of type 'const glseg::UTF16'
98 };
99
100 } //----end of namespace glseg
101
102 #endif //----end of UNICODE_ITERATOR_H_