在线CHM阅读器（2）——文件提取及关键文件解析

1、文件提取

在在线CHM阅读器(1)一文中已提到，CHM其实就是一个结构化存储文件(Structured Storage)。如果要阅读CHM文档，就必须将网页文件、图片等从CHM文件中提取出来。提取需要用到Structured Storage的StgOpenStorage函数以及IStorage和IStream接口，不过这些在.NET中都不能直接使用，需要先“包装”一下。如何使用IStorage和IStream可以参考这篇文章：

CHM Help File Extractor

不过这篇文章提供的源代码是用于反编译出CHM的所有文件的，开发在线CHM阅读器并不需要先反编译出所有的文件，只需要把浏览器当前请求的那个文件提取出来发送到客户端即可。提取文件的代码如下：

/// <summary>
/// Extracts individual files from a CHM archive through the structured-storage
/// COM interfaces (IStorage / IStream).
/// </summary>
public class CHH
{
    /// <summary>
    /// Extracts one file from a CHM archive.
    /// </summary>
    /// <param name="chm">Path of the CHM file.</param>
    /// <param name="res">Full '/'-separated path, inside the CHM, of the file to extract.</param>
    /// <returns>
    /// A stream over the requested file, or <c>null</c> when the file cannot be opened
    /// or <paramref name="res"/> contains no path segment. When <paramref name="res"/>
    /// is "#SYSTEM" (case-insensitive) the binary content is parsed by
    /// <c>ChmInfo</c> and the JSON text produced by <c>ChmInfo.MakeStream</c> is returned instead.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="chm"/> or <paramref name="res"/> is <c>null</c>.
    /// </exception>
    public static Stream Find(string chm, string res)
    {
        if (chm == null) throw new ArgumentNullException("chm");
        if (res == null) throw new ArgumentNullException("res");

        string[] segments = res.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries);

        // "" or "/" names no file at all; indexing into the empty segment array
        // would otherwise throw IndexOutOfRangeException further down.
        if (segments.Length == 0) return null;

        // NOTE(review): 0x20 looks like STGM_SHARE_DENY_WRITE (read access, deny writers) -- confirm against the interop declaration.
        IStorage storage = ((ITStorage)new ITStorageClass()).StgOpenStorage(chm, IntPtr.Zero, 0x20, IntPtr.Zero, 0);
        try
        {
            IStream stream = Find(storage, segments, 0);
            if (stream == null) return null;

            if (string.Equals(res, "#SYSTEM", StringComparison.OrdinalIgnoreCase))
            {
                // #SYSTEM is a binary file that JavaScript cannot process, so it is
                // converted to JSON text (main page, title, contents file, encoding).
                // The using block closes the wrapper once the JSON has been built.
                using (Stream cs = new ComStream(stream))
                {
                    return (new ChmInfo(cs)).MakeStream();
                }
            }

            return new ComStream(stream);
        }
        finally
        {
            Marshal.ReleaseComObject(storage);
        }
    }

    /// <summary>
    /// Walks the path segments starting at <paramref name="first"/>: intermediate
    /// segments are opened as sub-storages, the last one is opened as a stream.
    /// </summary>
    /// <param name="storage">Storage in which <c>res[first]</c> is looked up.</param>
    /// <param name="res">Path segments of the requested file.</param>
    /// <param name="first">Index of the segment handled by this call.</param>
    /// <returns>The opened stream, or <c>null</c> when the final segment cannot be opened.</returns>
    static IStream Find(IStorage storage, string[] res, int first)
    {
        // Nothing left to resolve (empty path or index past the end).
        if (first < 0 || first >= res.Length) return null;

        if (first < res.Length - 1)
        {
            // Intermediate segment: descend into the folder and recurse.
            IStorage next = storage.OpenStorage(res[first], IntPtr.Zero, 0x20, IntPtr.Zero, 0);
            try
            {
                return Find(next, res, first + 1);
            }
            finally
            {
                Marshal.ReleaseComObject(next);
            }
        }

        IStream stream = null;
        try
        {
            // Last segment: open the target file.
            stream = storage.OpenStream(res[first], IntPtr.Zero, 0x20, 0);
        }
        catch
        {
            // Deliberate best effort: a failure to open is reported to the caller as null.
        }

        if (stream != null) return stream;

        // The file name of the contents file usually is not known in advance, so a
        // request for ".hhc" means "the first stream whose extension is .hhc".
        if (!string.Equals(res[first], ".HHC", StringComparison.OrdinalIgnoreCase)) return null;

        System.Runtime.InteropServices.ComTypes.STATSTG stats;
        IEnumSTATSTG enumStats;
        int fetched = 0;

        storage.EnumElements(0, IntPtr.Zero, 0, out enumStats);
        try
        {
            enumStats.Reset();

            // Enumerate every element of this storage; Next returns 0 (S_OK) while elements remain.
            while (enumStats.Next(1, out stats, out fetched) == 0)
            {
                // type == 2 -- presumably STGTY_STREAM, i.e. a file rather than a folder.
                bool isHhc = string.Equals(
                    System.IO.Path.GetExtension(stats.pwcsName), ".HHC", StringComparison.OrdinalIgnoreCase);

                if (isHhc && stats.type == 2)
                {
                    return storage.OpenStream(stats.pwcsName, IntPtr.Zero, 0x20, 0);
                }
            }
        }
        finally
        {
            Marshal.ReleaseComObject(enumStats);
        }

        return null;
    }
}

2、处理#SYSTEM文件

上文提取文件的代码已提到，#SYSTEM是一个二进制文件，而javascript是不能处理二进制文件的，因此，必须在后台处理#SYSTEM文件，转换成json格式的文本发送到客户端。#SYSTEM的格式如下图所示：

根据上图所示的规律，就可以处理#SYSTEM文件了，代码如下：

    /// <summary>
    /// Parses the binary #SYSTEM file of a CHM archive and exposes the values the
    /// browser needs (main page, title, contents file name, text encoding).
    /// </summary>
    class ChmInfo
    {
        const UInt16 ID_HHC = 0x0000;       // entry decoded into HHC (contents file)
        const UInt16 ID_MAIN = 0x0002;      // ID of the start page
        const UInt16 ID_Version = 0x0009;   // ID of the version string
        const UInt16 ID_TITLE = 0x0003;     // ID of the title
        const UInt16 ID_PRJNAME = 0x0006;   // ID of the project name
        const UInt16 ID_LAN = 0x0004;       // ID of the locale

        // Size of an entry header: 16-bit ID followed by a 16-bit payload length.
        const int EntryHeaderSize = 4;

        public String MainPage = String.Empty, PrjName = String.Empty, Title = String.Empty, HHA_Version = String.Empty, HHC = String.Empty;
        public Encoding Encoding = null;
        Hashtable _session = new Hashtable();

        /// <summary>
        /// Reads one (ID, length, payload) entry and stores a non-empty payload under its ID.
        /// </summary>
        /// <param name="reader">Reader positioned at the start of an entry.</param>
        /// <returns>
        /// <c>true</c> when a complete entry was consumed; <c>false</c> when the data is
        /// exhausted or the remaining bytes do not hold a complete entry.
        /// </returns>
        private bool ReadSession(BinaryReader reader)
        {
            Stream source = reader.BaseStream;

            // A complete header is required; checking for a single remaining byte would
            // let ReadUInt16 throw EndOfStreamException on truncated input.
            if (source.Length - source.Position < EntryHeaderSize) return false;

            UInt16 id = reader.ReadUInt16();
            UInt16 count = reader.ReadUInt16();

            // Payload runs past the end of the data: treat the entry as truncated.
            if (count + source.Position > source.Length) return false;

            if (count > 0)
            {
                _session[id] = reader.ReadBytes(count);
            }
            return true;
        }

        /// <summary>
        /// Decodes the entry stored under <paramref name="id"/> with the current
        /// <see cref="Encoding"/>, stopping at the first NUL byte.
        /// </summary>
        /// <param name="id">ID of the entry.</param>
        /// <returns>The decoded text, or an empty string when the entry is absent.</returns>
        private string ReadText(UInt16 id)
        {
            Byte[] data = _session[id] as Byte[];
            if (data == null) return String.Empty;

            // Only the terminator (and anything after it) is dropped, so an entry
            // without a terminator keeps its last character.
            int length = Array.IndexOf(data, (byte)0);
            if (length < 0) length = data.Length;

            return Encoding.GetString(data, 0, length);
        }

        /// <summary>
        /// Returns GB2312 when that code page is available, otherwise the platform default encoding.
        /// </summary>
        private static System.Text.Encoding GetFallbackEncoding()
        {
            try
            {
                return System.Text.Encoding.GetEncoding("GB2312");
            }
            catch (ArgumentException)
            {
            }
            catch (NotSupportedException)
            {
            }
            return System.Text.Encoding.Default;
        }

        /// <summary>
        /// Reads every entry from <paramref name="stream"/> and decodes the known ones.
        /// </summary>
        /// <param name="stream">Stream over the #SYSTEM data. It is read but not closed here.</param>
        public ChmInfo(Stream stream)
        {
            BinaryReader reader = new BinaryReader(stream);

            // Read every entry with its ID into the Hashtable.
            // NOTE(review): published descriptions of #SYSTEM mention a leading 4-byte version
            // field; here those bytes are consumed as an ordinary zero-length entry -- confirm.
            while (ReadSession(reader)) ;

            // The locale entry selects the ANSI code page used for all text entries.
            Byte[] lan = _session[ID_LAN] as Byte[];
            if (lan != null && lan.Length >= 2)
            {
                try
                {
                    // Little-endian 16-bit locale identifier.
                    CultureInfo info = new CultureInfo(lan[1] * 0x100 + lan[0]);
                    Encoding = System.Text.Encoding.GetEncoding(info.TextInfo.ANSICodePage);
                }
                catch (ArgumentException)
                {
                    // Unknown culture or code page: use the fallback below.
                }
                catch (NotSupportedException)
                {
                    // Culture or code page not available on this platform: use the fallback below.
                }
            }
            if (Encoding == null) Encoding = GetFallbackEncoding();

            MainPage = ReadText(ID_MAIN);
            Title = ReadText(ID_TITLE);
            PrjName = ReadText(ID_PRJNAME);
            HHA_Version = ReadText(ID_Version);
            HHC = ReadText(ID_HHC);
        }

        /// <summary>
        /// Builds the JSON description sent to the browser.
        /// </summary>
        /// <returns>
        /// A <see cref="MemoryStream"/>, positioned at 0, holding UTF-8 JSON with the keys
        /// MainPage, Title, HHC and Encoding.
        /// </returns>
        public Stream MakeStream()
        {
            String json = String.Format(
                "{{\"MainPage\":\"{0}\",\"Title\":\"{1}\",\"HHC\":\"{2}\",\"Encoding\":\"{3}\"}}",
                TransferCharJavascript(MainPage),
                TransferCharJavascript(Title),
                TransferCharJavascript(HHC),
                TransferCharJavascript(Encoding.HeaderName)
            );

            Byte[] buffer = Encoding.UTF8.GetBytes(json);
            Stream stream = new MemoryStream(buffer.Length);
            stream.Write(buffer, 0, buffer.Length);
            stream.Seek(0, SeekOrigin.Begin);
            return stream;
        }

        /// <summary>
        /// Escapes a string so it can be placed between double quotes in JSON / JavaScript.
        /// </summary>
        /// <param name="s">Text to escape; <c>null</c> is treated as empty.</param>
        /// <returns>
        /// The text with every control character below U+0020, the characters
        /// <c>" \ ' &lt; &gt;</c> and the line separators U+2028 / U+2029 written as \uXXXX.
        /// </returns>
        public static string TransferCharJavascript(string s)
        {
            if (String.IsNullOrEmpty(s)) return String.Empty;

            StringBuilder ret = new StringBuilder(s.Length);
            foreach (char c in s)
            {
                // JSON forbids raw control characters; U+2028/U+2029 end a line in JavaScript source.
                bool escape = c < 0x20 || c == '\u2028' || c == '\u2029';
                if (!escape)
                {
                    switch (c)
                    {
                    case '\"':
                    case '\\':
                    case '\'':
                    case '<':
                    case '>':
                        escape = true;
                        break;
                    }
                }

                if (escape)
                {
                    ret.Append("\\u").Append(((int)c).ToString("X4", CultureInfo.InvariantCulture));
                }
                else
                {
                    ret.Append(c);
                }
            }
            return ret.ToString();
        }
    }

3、处理目录(*.hhc)文件

目录文件保存着一个CHM文件的目录结构，它是一个文本文件，为了减轻服务器的负担，将目录文件放到浏览器来处理。在在线CHM阅读器(1)一文中已提到，目录文件大概的规律是：每一个<LI><OBJECT>…</OBJECT>对应着目录树中的一个节点，<OBJECT>…</OBJECT>中的参数记录着该节点的属性(对应的页面，名称等)。如果这个节点有子节点的话，那么<LI>后面会紧跟着一个<UL></UL>，<UL>里面所有的节点都是其子节点。处理的代码如下：

/**
 * Parser for the contents (*.hhc) file of a CHM archive.
 *
 * Each <LI><OBJECT>...</OBJECT> becomes one tree node whose properties come from the
 * <PARAM name=... value=...> tags inside the OBJECT; every <UL> that directly follows
 * an LI supplies that node's children.
 *
 * Usage: var hhc = new ChmHHC(text); hhc.Render(); var tree = hhc.GetNodes();
 *
 * @param {string} buffer Text of the .hhc file; null/undefined is treated as empty.
 */
function ChmHHC(buffer)
{
    buffer = (buffer == null) ? "" : String(buffer);

    var position = 0;

    var RegxTagName = /(<|<\/)([a-zA-Z]+)(\s[\S\s]*|)>/i;
    // Attribute names may contain any digit, including 0.
    var RegxAttrs = /([a-zA-Z0-9]+)\s*=\s*\x22([^\x22]+)\x22/ig;

    /**
     * Reads the next tag (<name ...> or </name>) at or after the current position.
     * Spans between '<' and '>' that do not look like a tag are skipped.
     *
     * @returns {{Name: string, Type: string, Attrs: Object}|null}
     *          Name is upper-cased, Type is "Begin" or "End", Attrs is keyed by the
     *          lower-cased attribute name; null once the input is exhausted.
     */
    function ReadTag()
    {
        var match = null;

        while (match == null)
        {
            var start = buffer.indexOf('<', position);
            if (start < 0)
            {
                position = buffer.length;
                return null;
            }

            var end = buffer.indexOf('>', start);
            if (end < 0)
            {
                position = buffer.length;
                return null;
            }

            position = end + 1;

            RegxTagName.lastIndex = 0;
            match = RegxTagName.exec(buffer.substring(start, end + 1));
        }

        var tag = {
            Name: match[2].toUpperCase(),
            Type: match[1] == '<' ? "Begin" : "End",
            Attrs: {}
        };

        if (tag.Type == "Begin" && match.length > 3 && match[3] != "")
        {
            // The regex is global, so restart it for every tag.
            RegxAttrs.lastIndex = 0;
            var attr = null;
            while ((attr = RegxAttrs.exec(match[3])) != null)
            {
                tag.Attrs[attr[1].toLowerCase()] = attr[2];
            }
        }

        return tag;
    }

    // The tag currently being examined by RenderTag; null at end of input.
    var current = null;

    function IsBeginTag(tag, name)
    {
        return tag.Type == "Begin" && tag.Name == name;
    }

    function IsEndTag(tag, name)
    {
        return tag.Type == "End" && tag.Name == name;
    }

    /**
     * Consumes the construct starting at `current` and always advances by at least one tag.
     *
     * @returns {Object|null} An "LI" node ({NodeType, SubNodes, type, ...params}),
     *          a "UL" node ({NodeType, Nodes}), or null when the current tag starts
     *          neither (or an LI is not followed by an OBJECT).
     */
    function RenderTag()
    {
        if (current == null) return null;

        if (IsBeginTag(current, "LI"))
        {
            current = ReadTag();
            if (current == null || !IsBeginTag(current, "OBJECT")) return null;

            var item = {
                NodeType: "LI",
                SubNodes: []
            };
            item.type = current.Attrs["type"];

            // Copy every <PARAM name value> inside the OBJECT onto the node.
            current = ReadTag();
            while (current != null && !IsEndTag(current, "OBJECT"))
            {
                if (IsBeginTag(current, "PARAM"))
                {
                    item[current.Attrs["name"]] = current.Attrs["value"];
                }
                current = ReadTag();
            }

            // Step over </OBJECT> and an optional </LI>.
            if (current != null && IsEndTag(current, "OBJECT")) current = ReadTag();
            if (current != null && IsEndTag(current, "LI")) current = ReadTag();

            // Every UL trailing the LI contributes its nodes as children of this LI.
            while (current != null && IsBeginTag(current, "UL"))
            {
                var list = RenderTag();
                if (list != null)
                {
                    // Index loop: for...in would also visit enumerable Array.prototype extensions.
                    for (var i = 0; i < list.Nodes.length; i++)
                    {
                        item.SubNodes.push(list.Nodes[i]);
                    }
                }
            }

            return item;
        }

        if (IsBeginTag(current, "UL"))
        {
            var group = {
                NodeType: "UL",
                Nodes: []
            };

            current = ReadTag();
            while (current != null && !IsEndTag(current, "UL"))
            {
                var child = RenderTag();
                if (child != null) group.Nodes.push(child);
            }

            // Step over </UL>. When the input ends before </UL>, the nodes collected
            // so far are still returned instead of being discarded.
            if (current != null) current = ReadTag();
            return group;
        }

        // Any other tag is skipped.
        current = ReadTag();
        return null;
    }

    var roots = [];

    /**
     * Parses the whole buffer from the start. The result is available through GetNodes().
     * Each call starts from an empty result, so re-rendering does not duplicate nodes.
     */
    this.Render = function()
    {
        roots = [];
        position = 0;
        current = ReadTag();

        while (current != null)
        {
            var node = RenderTag();
            if (node != null) roots.push(node);
        }
        current = null;
    }

    /**
     * @returns {Array} Top-level nodes produced by the most recent Render() call.
     */
    this.GetNodes = function()
    {
        return roots;
    }
}