浅谈 ZipInteger

我最近在看《Expert .NET 2.0 IL Assembler》这本书(早在2009年11月就购买了这本书，当时看了前面几章就放下了，最近又拣起来继续看)。在“第5章元数据表的组织 5.2 堆和表”中提到：

Blob堆。这种类型的堆包括了一些任意大小的二进制对象。每个二进制对象都以它的长度（以压缩的形式）开始。二进制对象在4字节的边界上对齐。

长度压缩公式非常简单。如果长度（无符号的整数）是0x7F或者更少，就将其表示为一个1字节的整数；如果长度大于0x7F但是不大于0x3FFF，就将其表示为设置了最高权重位的2字节无符号整数。否则，就将其表示为设置了2个最高权重位的4字节无符号整数。表5-1总结了这个公式。

这个压缩公式在元数据中广泛使用。当然，这种压缩只适用于不超过0x1FFFFFFF（536870911）的数字。但这种限制并不是问题，因为压缩通常应用于长度和计数这样的值。

在“第8章基本类型和签名 8.1 CLR中的基本类型 8.1.4 向量和数组”中提到：

有符号整数的值（下界的值）是根据不同的压缩过程进行压缩的。首先，获取原始的有符号整数的绝对值，将有符号的整数编码为无符号整数，左移1位，并根据原始值的最高权重位（符号位）来设置最低权重位。然后根据表8-4的公式进行压缩。

于是，我想不仅在 Microsoft .NET CLR 的元数据中需要广泛使用这个压缩公式，其它应用也会需要用到的。设想我们需要保存一些表示数据长度和计数等的数字，这些数据一般情况下是很小的，但是也许偶尔有一两个很大的数字。我们就难以选择使用 byte、short、int、long 等数据类型中哪一个来保存这些数字。

我的设想是将 long 按以下公式进行压缩编码，得到一个 ZipInteger 结构：

.
                     7F 0xxxxxxx
                  3F.FF 10xxxxxx XX
               1F.FF.FF 110xxxxx XX XX
            0F.FF.FF.FF 1110xxxx XX XX XX
         07.FF.FF.FF.FF 11110xxx XX XX XX XX
      03.FF.FF.FF.FF.FF 111110xx XX XX XX XX XX
   01.FF.FF.FF.FF.FF.FF 1111110x XX XX XX XX XX XX
   FF.FF.FF.FF.FF.FF.FF 11111110 XX XX XX XX XX XX XX
FF.FF.FF.FF.FF.FF.FF.FF 11111111 XX XX XX XX XX XX XX XX

这个 ZipInteger 结构的表示范围和 long 的表示范围完全一致，可以无损地和 long 进行双向转换。long 总是占用 8 个字节，而 ZipInteger 结构在编码为字节序列（写入流或转换为字节数组）后，根据其数字的大小占用 1 到 9 个字节。

下面是 ZipInteger.cs 源程序:

001:  using System;
002:  using System.IO;
003:  using System.Drawing;
004:  
005:  namespace Skyiv.Numerics
006:  {
007:    /// <summary>
008:    /// A 64-bit signed integer whose encoded form (see ToByteArray, Write, Read) takes 1 to 9 bytes, so values of small magnitude save space.
009:    /// </summary>
010:    public struct ZipInteger : IEquatable<ZipInteger>, IComparable<ZipInteger>
011:    {
012:      long data; // the plain (decoded) value; the variable-length encoding is produced only in ToByteArray
013:  
014:      public static readonly ZipInteger MinValue = long.MinValue;
015:      public static readonly ZipInteger MaxValue = long.MaxValue;
016:  
017:      /// <summary>
018:      /// Initializes a new ZipInteger from a 64-bit signed integer value.
019:      /// </summary>
020:      /// <param name="value">The 64-bit signed integer.</param>
021:      public ZipInteger(long value)
022:      {
023:        data = value;
024:      }
025:  
026:      /// <summary>
027:      /// Initializes a new ZipInteger from its encoded bytes.
028:      /// Note: this constructor overwrites the contents of the bits array passed in.
029:      /// </summary>
030:      /// <param name="bits">Encoded bytes in big-endian order (1 to 9 bytes); the length prefix in bits[0] is not checked against bits.Length.</param>
031:      public ZipInteger(byte[] bits)
032:      {
033:        if (bits == null) throw new ArgumentNullException("bits");
034:        if (bits.Length < 1 || bits.Length > 9) throw new ArgumentException("Invalid length", "bits");
035:        byte[] mask = { 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00 }; // mask[len - 1] keeps only the payload bits of the first byte of a len-byte encoding
036:        if (bits.Length > 1 && bits.Length < 9) bits[0] &= mask[bits.Length - 1]; // a 1-byte input is used as is; a 9-byte input loses its first byte in the resize below
037:        Array.Reverse(bits); // big-endian input -> least significant byte first (in place, on the caller's array)
038:        Array.Resize(ref bits, 8); // zero-pads short encodings; for 9 bytes it drops the prefix byte, now at index 8
039:        if (!BitConverter.IsLittleEndian) Array.Reverse(bits); // BitConverter.ToInt64 reads in host byte order
040:        data = Decode(BitConverter.ToInt64(bits, 0));
041:      }
042:  
043:      public static implicit operator long(ZipInteger value) // lossless: returns the stored long
044:      {
045:        return value.data;
046:      }
047:  
048:      public static implicit operator ZipInteger(long value) // lossless: wraps the long
049:      {
050:        return new ZipInteger(value);
051:      }
052:  
053:      public int CompareTo(ZipInteger other) // orders by the underlying long values
054:      {
055:        return data.CompareTo(other.data);
056:      }
057:  
058:      public bool Equals(ZipInteger other) // equal when the underlying long values are equal
059:      {
060:        return data == other.data;
061:      }
062:  
063:      public override bool Equals(object obj) // false for null or for any object that is not a ZipInteger
064:      {
065:        return (obj is ZipInteger) && data == ((ZipInteger)obj).data;
066:      }
067:  
068:      public override int GetHashCode()
069:      {
070:        return (int)data ^ (int)(data >> 32); // XOR of the low and high 32-bit halves (relies on unchecked int casts)
071:      }
072:  
073:      public override string ToString() // same text as long.ToString() of the value
074:      {
075:        return data.ToString();
076:      }
077:  
078:      /// <summary>
079:      /// Converts the ZipInteger value to its encoded byte array (1 to 9 bytes).
080:      /// </summary>
081:      /// <returns>The encoded bytes in big-endian order; the leading bits of the first byte carry the length prefix.</returns>
082:      public byte[] ToByteArray()
083:      {
084:        var data2 = Encode(data); // sign moved into bit 0, so small magnitudes become small unsigned values
085:        var bits = BitConverter.GetBytes(data2);
086:        if (!BitConverter.IsLittleEndian) Array.Reverse(bits); // normalize to least significant byte first
087:        var mask = GetMask((ulong)data2); // X = total encoded length in bytes, Y = prefix bits for the first byte
088:        Array.Resize(ref bits, mask.X); // drops unused high-order bytes, or appends a zero byte for the 9-byte form
089:        Array.Reverse(bits); // to big-endian
090:        bits[0] |= (byte)mask.Y; // stamp the length prefix into the first byte
091:        return bits;
092:      }
093:  
094:      static Point GetMask(ulong udata) // picks the shortest form that fits udata: Point(X: byte count, Y: prefix bits)
095:      {
096:        if (udata <= 0x7F) return new Point(1, 0);
097:        if (udata <= 0x3FFF) return new Point(2, 0x80);
098:        if (udata <= 0x1FFFFF) return new Point(3, 0xC0);
099:        if (udata <= 0x0FFFFFFF) return new Point(4, 0xE0);
100:        if (udata <= 0x07FFFFFFFF) return new Point(5, 0xF0);
101:        if (udata <= 0x03FFFFFFFFFF) return new Point(6, 0xF8);
102:        if (udata <= 0x01FFFFFFFFFFFF) return new Point(7, 0xFC);
103:        if (udata <= 0xFFFFFFFFFFFFFF) return new Point(8, 0xFE);
104:        return new Point(9, 0xFF);
105:      }
106:  
107:      static int GetCount(byte value) // number of leading 1 bits in the first byte = number of bytes that follow it
108:      {
109:        if ((value & 0x80) == 0) return 0;
110:        if ((value & 0x40) == 0) return 1;
111:        if ((value & 0x20) == 0) return 2;
112:        if ((value & 0x10) == 0) return 3;
113:        if ((value & 0x08) == 0) return 4;
114:        if ((value & 0x04) == 0) return 5;
115:        if ((value & 0x02) == 0) return 6;
116:        if ((value & 0x01) == 0) return 7;
117:        return 8;
118:      }
119:  
120:      /// <summary>
121:      /// Writes the ZipInteger value to the stream and advances the position within the stream by the number of bytes written.
122:      /// </summary>
123:      /// <param name="writer">The stream to write to.</param>
124:      public void Write(Stream writer)
125:      {
126:        var bits = ToByteArray();
127:        writer.Write(bits, 0, bits.Length);
128:      }
129:  
130:      /// <summary>
131:      /// Reads one ZipInteger from the stream and advances the position within the stream by the number of bytes read.
132:      /// </summary>
133:      /// <param name="reader">The stream to read from.</param>
134:      /// <returns>The ZipInteger that was read, or null if the stream is already at its end.</returns>
135:      public static ZipInteger? Read(Stream reader)
136:      {
137:        var value = reader.ReadByte();
138:        if (value == -1) return null; // ReadByte returns -1 at end of stream
139:        var count = GetCount((byte)value); // how many more bytes belong to this value
140:        var bits = new byte[count + 1];
141:        bits[0] = (byte)value;
142:        if (Read(reader, bits, 1, count) != count) throw new EndOfStreamException(); // stream ended in the middle of a value
143:        return new ZipInteger(bits);
144:      }
145:  
146:      static int Read(Stream reader, byte[] buffer, int index, int count) // fills buffer from index until count bytes are read or the stream returns 0; returns the bytes read
147:      {
148:        var offset = index;
149:        for (int n = -1; n != 0 && count > 0; count -= n, offset += n) // Stream.Read may return fewer bytes than requested, so keep asking
150:          n = reader.Read(buffer, offset, count);
151:        return offset - index;
152:      }
153:  
154:      static long Encode(long x) // maps x to (|x| << 1) | sign bit
155:      {
156:        if (x == long.MinValue) return 1; // long.MinValue cannot be negated; it takes the code 1 ("negative zero"), which no other value produces
157:        return (x >= 0) ? (x << 1) : ((-x << 1) | 1);
158:      }
159:  
160:      static long Decode(long x) // inverse of Encode
161:      {
162:        if (x == 1) return long.MinValue;
163:        var n = (x >> 1) & long.MaxValue; // >> on long is arithmetic, so clear the copied sign bit to recover the magnitude
164:        return ((x & 1) == 0) ? n : -n;
165:      }
166:    }
167:  }

注意“Stream.Read 方法在尚未到达流的末尾情况下可以返回少于所请求的字节”，所以需要上述程序中第 146 行到第 152 行的 Read 方法来进行处理。请参见：“浅谈 Stream.Read 方法”。

上述程序中第 154 行到第 158 行的 Encode 方法是为了将小的负整数编码为占用字节数比较少的值，如果不进行这种转换，负整数不管大小将总占用 9 个字节。而第 160 行到第 165 行的 Decode 方法进行反向解码。

上述程序中第 43 行到第 51 行的两个方法对 long 和 ZipInteger 结构进行隐式双向转换。因为这两个转换都是无损的，所以两个转换都声明为隐式的。有了从 ZipInteger 到 long 的隐式转换，ZipInteger 结构中虽然没有重载 +、-、*、/、%、>、<、>=、<=、==、!= 等算术和逻辑运算符，也可以在程序中使用这些运算符。有了 long 到 ZipInteger 的隐式转换，就可以在程序中直接使用 byte、short、int、long 的变量和常量对 ZipInteger 进行赋值。

下面就是测试程序 ZipIntegerTester.cs ：

01:  using System;
02:  using System.IO;
03:  using System.Collections.Generic;
04:  using Skyiv.Numerics;
05:  
06:  namespace Skyiv.Tester
07:  {
08:    sealed class ZipIntegerTester // writes the same values as raw Int64 and as ZipInteger, then reads both files back and compares them
09:    {
10:      static void Main(string[] args) // optional args[0]: how many integers to generate (default 1000000)
11:      {
12:        try
13:        {
14:          var count = (args.Length > 0) ? int.Parse(args[0]) : 1000000;
15:          new ZipIntegerTester().Run(count, "Int64.bin", "ZipInteger.bin");
16:        }
17:        catch (Exception ex)
18:        {
19:          Console.WriteLine(ex); // any failure (including a Verify mismatch) is printed, not rethrown
20:        }
21:      }
22:  
23:      void Run(int count, string fileName1, string fileName2) // prints environment info, then writes and verifies the two files
24:      {
25:        Console.WriteLine("    OS Version: " + Environment.OSVersion);
26:        Console.WriteLine("   CLR Version: " + Environment.Version);
27:        Console.WriteLine("IsLittleEndian: " + BitConverter.IsLittleEndian);
28:        Write(count, fileName1, fileName2);
29:        Verify(count, fileName1, fileName2);
30:      }
31:  
32:      IEnumerable<long> GetValues(int count) // yields exactly count values (none if count <= 0): boundary cases first, then random ones
33:      {
34:        if (count-- > 0) yield return long.MaxValue;
35:        if (count-- > 0) yield return long.MinValue;
36:        if (count-- > 0) yield return long.MaxValue / 2;
37:        if (count-- > 0) yield return -(long.MinValue / 2);
38:        var rand = new Random();
39:        while (count-- > 0)
40:        {
41:          var n = rand.Next((count % 10 == 0) ? 100000 : 100); // one value in ten is drawn from [0, 100000), the rest from [0, 100)
42:          yield return (rand.Next() % 2 == 0) ? n : -n; // random sign
43:        }
44:      }
45:  
46:      void Write(int count, string name1, string name2) // writes each value to name1 as a fixed 8-byte Int64 and to name2 as a ZipInteger, then prints both file sizes
47:      {
48:        var fmt = "{0,14}: {1,9:N0}";
49:        Console.WriteLine(fmt, "Integer Count", count);
50:        using (var bw = new BinaryWriter(new FileStream(name1, FileMode.Create, FileAccess.Write)))
51:        using (var fs = new FileStream(name2, FileMode.Create, FileAccess.Write))
52:        {
53:          foreach (var n in GetValues(count))
54:          {
55:            bw.Write(n);
56:            ((ZipInteger)n).Write(fs);
57:          }
58:          Console.WriteLine(fmt + " bytes", name1, bw.BaseStream.Length);
59:          Console.WriteLine(fmt + " bytes", name2, fs.Length);
60:        }
61:      }
62:  
63:      void Verify(int count, string name1, string name2) // reads both files in step; throws if a value differs or the number of ZipIntegers read is not count
64:      {
65:        using (var br = new BinaryReader(File.OpenRead(name1)))
66:        using (var fs = File.OpenRead(name2))
67:        {
68:          var i = 0;
69:          for (ZipInteger? v; (v = ZipInteger.Read(fs)).HasValue; i++) // Read returns null at end of stream, which ends the loop
70:            if (br.ReadInt64() != v) throw new Exception("整数值不符");
71:          if (i != count) throw new Exception("计数不符");
72:        }
73:      }
74:    }
75:  }

在 openSUSE 11.3 操作系统的 mono 2.8.1 环境中编译和运行：

ben@ben1520:~/work/ZipInteger> dmcs ZipIntegerTester.cs ZipInteger.cs -r:System.Drawing.dll
ben@ben1520:~/work/ZipInteger> mono ZipIntegerTester.exe
    OS Version: Unix 2.6.34.7
   CLR Version: 4.0.30319.1
IsLittleEndian: True
 Integer Count: 1,000,000
     Int64.bin: 8,000,000 bytes
ZipInteger.bin: 1,515,957 bytes
ben@ben1520:~/work/ZipInteger>

在 Ubuntu 10.10 操作系统的 mono 2.6.7 环境中编译和运行：

ben@ben-1520:~/work/ZipInteger$ gmcs ZipIntegerTester.cs ZipInteger.cs -r:System.Drawing.dll
ben@ben-1520:~/work/ZipInteger$ ./ZipIntegerTester.exe
    OS Version: Unix 2.6.35.24
   CLR Version: 2.0.50727.1433
IsLittleEndian: True
 Integer Count: 1,000,000
     Int64.bin: 8,000,000 bytes
ZipInteger.bin: 1,515,436 bytes
ben@ben-1520:~/work/ZipInteger$

在 Windows Vista 操作系统的 .NET Framework 4 环境中编译和运行：

E:\work\ZipInteger> csc ZipIntegerTester.cs ZipInteger.cs
Microsoft(R) Visual C# 2010 编译器 4.0.30319.1 版
版权所有(C) Microsoft Corporation。保留所有权利。

E:\work\ZipInteger> ZipIntegerTester
    OS Version: Microsoft Windows NT 6.0.6002 Service Pack 2
   CLR Version: 4.0.30319.1
IsLittleEndian: True
 Integer Count: 1,000,000
     Int64.bin: 8,000,000 bytes
ZipInteger.bin: 1,515,451 bytes

E:\work\ZipInteger>

下面是测试程序生成的数据文件：

在 Int64.bin 文件中，每一个数都占用 8 个字节。而在 ZipInteger.bin 文件中，每一个数根据其大小占用 1 到 9 个字节不等。这两个数据文件的头两个数分别是 long.MaxValue 和 long.MinValue。这两个数在 ZipInteger.bin 文件中分别占用 9 个字节和 1 个字节。

在上面的测试中，总共生成一百万个数，其中约百分之九十都是绝对值小于一百的数。测试结果是 Int64.bin 占用八百万字节，而 ZipInteger.bin 占用一百五十多万字节。这是因为测试数据中绝大部分都是很小的数的缘故。如果测试数据中大部分都是很大的数的话，ZipInteger.bin 占用的空间有可能会比 Int64.bin 占用的空间大。

也就是说，ZipInteger 结构适用于以下场合：需要保存一些一般情况下是很小的，但偶尔也有一两个很大的数字。

posted on 2010-12-21 22:07 银河阅读(1866) 评论(4) 编辑收藏举报

刷新页面返回顶部

银河

公告