在线CHM阅读器(2)——文件提取及关键文件解析
1、文件提取
在《在线CHM阅读器(1)》一文中已提到,CHM其实就是一个结构化存储文件(Structured Storage)。如果要阅读CHM文档,就必须将网页、图片等文件从CHM文件中提取出来。提取需要用到Structured Storage的StgOpenStorage函数以及IStorage和IStream接口,不过这些在.NET中都不能直接使用,需要先“包装”一下。如何使用IStorage和IStream可以参考这篇文章:
不过这篇文章提供的源代码是用于反编译出CHM的所有文件的,开发在线CHM阅读器并不需要先反编译出所有的文件,只需要把浏览器当前请求的那个文件提取出来发送到客户端即可。提取文件的代码如下:
/// <summary>
/// Extracts a single file from a CHM file on demand, so that the whole
/// archive never has to be decompiled to serve one request.
/// </summary>
public class CHH
{
    // Mode value handed to every StgOpenStorage / OpenStorage / OpenStream call.
    // NOTE(review): 0x20 looks like STGM_SHARE_DENY_WRITE combined with read access — confirm against the STGM constants.
    const int OpenMode = 0x20;

    // STATSTG.type value that the contents-file scan accepts.
    // NOTE(review): 2 presumably means "stream" (STGTY_STREAM) — confirm.
    const int StreamElementType = 2;

    /// <summary>
    /// Extracts one file from a CHM file.
    /// </summary>
    /// <param name="chm">Path of the CHM file.</param>
    /// <param name="res">Full path of the file inside the CHM, with '/' as separator.</param>
    /// <returns>
    /// A stream over the requested file, or <c>null</c> when <paramref name="res"/> is
    /// null/empty or the file cannot be opened. When <paramref name="res"/> is "#SYSTEM"
    /// (case-insensitive) the binary content is converted by <c>ChmInfo</c> and the
    /// resulting JSON text stream is returned instead.
    /// </returns>
    public static Stream Find(string chm, string res)
    {
        // A path without any segment ("", "/", null) names no file. Without this guard the
        // recursive lookup would index into an empty segment array.
        if (string.IsNullOrEmpty(res))
        {
            return null;
        }

        string[] segments = res.Split(new char[] { '/' }, StringSplitOptions.RemoveEmptyEntries);
        if (segments.Length == 0)
        {
            return null;
        }

        IStorage storage = ((ITStorage)new ITStorageClass()).StgOpenStorage(chm, IntPtr.Zero, OpenMode, IntPtr.Zero, 0);
        try
        {
            IStream stream = Find(storage, segments, 0);
            if (stream == null)
            {
                return null;
            }

            if (string.Equals(res, "#SYSTEM", StringComparison.OrdinalIgnoreCase))
            {
                // #SYSTEM is binary and cannot be consumed by JavaScript, so it is
                // re-encoded as JSON text before being sent to the client.
                // The using statement already closes the wrapper stream on every path.
                using (Stream cs = new ComStream(stream))
                {
                    return new ChmInfo(cs).MakeStream();
                }
            }

            // Ownership of the wrapper passes to the caller.
            return new ComStream(stream);
        }
        finally
        {
            Marshal.ReleaseComObject(storage);
        }
    }

    /// <summary>
    /// Walks the storage tree along <paramref name="res"/>, starting at index
    /// <paramref name="first"/>, and opens the stream named by the last segment.
    /// </summary>
    /// <param name="storage">Storage that corresponds to segment <paramref name="first"/>'s parent.</param>
    /// <param name="res">Path segments of the requested file.</param>
    /// <param name="first">Index of the segment to resolve inside <paramref name="storage"/>.</param>
    /// <returns>The opened stream, or <c>null</c> when the last segment could not be opened.</returns>
    static IStream Find(IStorage storage, string[] res, int first)
    {
        if (first != res.Length - 1)
        {
            // Intermediate segment: descend into the sub-storage and release it once the
            // recursive lookup has finished.
            IStorage next = storage.OpenStorage(res[first], IntPtr.Zero, OpenMode, IntPtr.Zero, 0);
            try
            {
                return Find(next, res, first + 1);
            }
            finally
            {
                Marshal.ReleaseComObject(next);
            }
        }

        IStream stream = null;
        try
        {
            stream = storage.OpenStream(res[first], IntPtr.Zero, OpenMode, 0);
        }
        catch
        {
            // Deliberate best effort: any failure to open the stream is reported to the
            // caller as "not found" (null) rather than as an exception.
        }

        if (stream == null && string.Equals(res[first], ".HHC", StringComparison.OrdinalIgnoreCase))
        {
            // The contents file has no fixed name, so a request for ".HHC" is answered with
            // the first element in this storage whose extension is .hhc.
            IEnumSTATSTG enumStats;
            storage.EnumElements(0, IntPtr.Zero, 0, out enumStats);
            try
            {
                enumStats.Reset();

                System.Runtime.InteropServices.ComTypes.STATSTG stats;
                int fetched = 0;
                while (enumStats.Next(1, out stats, out fetched) == 0)
                {
                    bool isHhc = string.Equals(
                        System.IO.Path.GetExtension(stats.pwcsName), ".HHC", StringComparison.OrdinalIgnoreCase);
                    if (isHhc && stats.type == StreamElementType)
                    {
                        return storage.OpenStream(stats.pwcsName, IntPtr.Zero, OpenMode, 0);
                    }
                }
            }
            finally
            {
                Marshal.ReleaseComObject(enumStats);
            }
        }

        return stream;
    }
}
2、处理#SYSTEM文件
上文提取文件的代码已提到,#SYSTEM是一个二进制文件,而javascript是不能处理二进制文件的,因此,必须在后台处理#SYSTEM文件,转换成json格式的文本发送到客户端。#SYSTEM的格式如下图所示:
根据上图所示的规律,即可处理#SYSTEM文件了,代码如下:
/// <summary>
/// Reads the binary #SYSTEM stream of a CHM file as a sequence of
/// (UInt16 id, UInt16 length, data) entries and exposes the entries of interest
/// as strings, plus a JSON rendering for the browser.
/// </summary>
class ChmInfo
{
    const UInt16 ID_HHC = 0x0000;     // ID of the entry decoded into HHC
    const UInt16 ID_MAIN = 0x0002;    // ID of the start page
    const UInt16 ID_Version = 0x0009; // ID of the version
    const UInt16 ID_TITLE = 0x0003;   // ID of the title
    const UInt16 ID_PRJNAME = 0x0006; // ID of the project name
    const UInt16 ID_LAN = 0x0004;     // ID of the locale

    // Every entry starts with two UInt16 values: the ID and the payload length.
    const int EntryHeaderSize = 4;

    public String MainPage = String.Empty,
        PrjName = String.Empty,
        Title = String.Empty,
        HHA_Version = String.Empty,
        HHC = String.Empty;
    public Encoding Encoding = null;

    // Raw payload (Byte[]) of every non-empty entry, keyed by its UInt16 ID.
    Hashtable _session = new Hashtable();

    /// <summary>
    /// Reads one entry from the reader and stores its payload under its ID.
    /// </summary>
    /// <returns>
    /// <c>true</c> when an entry was consumed; <c>false</c> when fewer than a full
    /// header remains or the declared payload would run past the end of the stream.
    /// </returns>
    private bool ReadSession(BinaryReader reader)
    {
        Stream source = reader.BaseStream;

        // Require the complete header; a 1-3 byte tail would otherwise make
        // ReadUInt16 throw EndOfStreamException.
        if (source.Length - source.Position < EntryHeaderSize)
        {
            return false;
        }

        UInt16 id = reader.ReadUInt16();
        UInt16 count = reader.ReadUInt16();
        if (count + source.Position > source.Length)
        {
            return false;
        }

        // Zero-length entries are consumed but not recorded.
        if (count > 0)
        {
            _session[id] = reader.ReadBytes(count);
        }
        return true;
    }

    /// <summary>
    /// Parses all entries from <paramref name="stream"/> and fills the public fields.
    /// </summary>
    /// <param name="stream">Stream positioned at the start of the #SYSTEM data. It is left open.</param>
    public ChmInfo(Stream stream)
    {
        // The reader is intentionally not disposed: disposing it would close the
        // caller's stream, which the caller owns.
        // NOTE(review): if the data begins with a 4-byte header rather than an entry, it is
        // consumed by the loop as an ordinary entry — confirm against the #SYSTEM layout.
        BinaryReader reader = new BinaryReader(stream);
        while (ReadSession(reader))
        {
        }

        Encoding = DetectEncoding();
        if (Encoding == null)
        {
            Encoding = System.Text.Encoding.GetEncoding("GB2312");
        }

        MainPage = ReadString(ID_MAIN);
        Title = ReadString(ID_TITLE);
        PrjName = ReadString(ID_PRJNAME);
        HHA_Version = ReadString(ID_Version);
        HHC = ReadString(ID_HHC);
    }

    /// <summary>
    /// Derives the text encoding from the locale entry: its first two bytes are read as a
    /// little-endian culture identifier whose ANSI code page selects the encoding.
    /// </summary>
    /// <returns>The encoding, or <c>null</c> when the entry is missing, too short, or names
    /// a culture/code page that is not available.</returns>
    private Encoding DetectEncoding()
    {
        Byte[] data = _session[ID_LAN] as Byte[];
        if (data == null || data.Length < 2)
        {
            return null;
        }

        try
        {
            CultureInfo info = new CultureInfo(data[1] * 0x100 + data[0]);
            return System.Text.Encoding.GetEncoding(info.TextInfo.ANSICodePage);
        }
        catch (ArgumentException)
        {
            // Unknown culture identifier or invalid code page: fall back to the default.
            return null;
        }
        catch (NotSupportedException)
        {
            // Code page not supported on this platform: fall back to the default.
            return null;
        }
    }

    /// <summary>
    /// Decodes the entry with the given ID as text, stopping at the first NUL byte
    /// (or at the end of the payload when there is no terminator).
    /// </summary>
    /// <returns>The decoded text, or an empty string when the entry is absent.</returns>
    private String ReadString(UInt16 id)
    {
        Byte[] data = _session[id] as Byte[];
        if (data == null)
        {
            return String.Empty;
        }

        int length = Array.IndexOf(data, (Byte)0);
        if (length < 0)
        {
            length = data.Length;
        }
        return this.Encoding.GetString(data, 0, length);
    }

    /// <summary>
    /// Builds the JSON description sent to the browser and returns it as a UTF-8
    /// stream positioned at the beginning.
    /// </summary>
    public Stream MakeStream()
    {
        String json = String.Format(
            "{{\"MainPage\":\"{0}\",\"Title\":\"{1}\",\"HHC\":\"{2}\",\"Encoding\":\"{3}\"}}",
            TransferCharJavascript(MainPage),
            TransferCharJavascript(Title),
            TransferCharJavascript(HHC),
            TransferCharJavascript(this.Encoding.HeaderName));

        Byte[] buffer = System.Text.Encoding.UTF8.GetBytes(json);
        Stream stream = new MemoryStream(buffer.Length);
        stream.Write(buffer, 0, buffer.Length);
        stream.Seek(0, SeekOrigin.Begin);
        return stream;
    }

    /// <summary>
    /// Escapes a string for embedding inside a double-quoted JSON/JavaScript string literal.
    /// Characters that need escaping are written as \uXXXX (upper-case hex).
    /// </summary>
    /// <param name="s">Text to escape; <c>null</c> is treated as empty.</param>
    /// <returns>The escaped text.</returns>
    public static string TransferCharJavascript(string s)
    {
        if (String.IsNullOrEmpty(s))
        {
            return String.Empty;
        }

        StringBuilder ret = new StringBuilder(s.Length);
        foreach (char c in s)
        {
            if (NeedsEscape(c))
            {
                ret.Append("\\u").Append(((int)c).ToString("X4", CultureInfo.InvariantCulture));
            }
            else
            {
                ret.Append(c);
            }
        }
        return ret.ToString();
    }

    /// <summary>
    /// Tells whether a character must be written as a \uXXXX escape.
    /// </summary>
    private static bool NeedsEscape(char c)
    {
        // JSON forbids every raw control character below U+0020 inside a string,
        // not only the common whitespace ones.
        if (c < ' ')
        {
            return true;
        }

        switch (c)
        {
            case '\"':
            case '\\':
            case '\'':
            case '<':
            case '>':
            // Line/paragraph separators terminate string literals in older JavaScript engines.
            case '\u2028':
            case '\u2029':
                return true;
            default:
                return false;
        }
    }
}
3、处理目录(*.hhc)文件
目录文件保存着一个CHM文件的目录结构,它是一个文本文件,为了减轻服务器的负担,将目录文件放到浏览器来处理。在《在线CHM阅读器(1)》一文中已提到,目录文件大概的规律是:每一个<LI><OBJECT>…</OBJECT>对应着目录树中的一个节点,<OBJECT>…</OBJECT>中的参数记录着该节点的属性(对应的页面,名称等)。如果这个节点有子节点的话,那么<LI>后面会紧跟着一个<UL></UL>,<UL>里面所有的节点都是其子节点。处理的代码如下:
/**
 * Parser for the text of a CHM contents (.hhc) file.
 *
 * Call Render() to parse, then GetNodes() to obtain the result. The result is an
 * array of nodes of two shapes:
 *   { NodeType: "UL", Nodes: [...] }                  - a list and its entries
 *   { NodeType: "LI", SubNodes: [...], type: ..., <param name>: <param value>, ... }
 * An LI node carries one property per <param name=".." value=".."> found inside its
 * <OBJECT>; the entries of every <UL> that directly follows it become its SubNodes.
 *
 * @param {string} buffer Full text of the .hhc file.
 */
function ChmHHC(buffer) {
    var position = 0;
    // Groups: 1 = "<" or "</", 2 = tag name, 3 = attribute text (may be empty).
    var RegxTagName = /(<|<\/)([a-zA-Z]+)(\s[\S\s]*|)>/i;
    // Groups: 1 = attribute name, 2 = non-empty double-quoted value.
    var RegxAttrs = /([a-zA-Z0-9]+)\s*=\s*\x22([^\x22]+)\x22/ig;

    /**
     * Reads the next tag at or after the current position.
     * Anything between "<" and ">" that does not look like a tag (comments,
     * doctype, ...) is skipped.
     * @returns {{Name: string, Type: string, Attrs: Object}|null} The tag, with Name
     *   upper-cased, Type "Begin" or "End" and attribute names lower-cased, or null at
     *   the end of the buffer.
     */
    function ReadTag() {
        var res = null;
        while (res == null) {
            while (position < buffer.length && buffer.charAt(position) != '<') position++;
            if (position >= buffer.length) return null;
            var start = position;

            while (position < buffer.length && buffer.charAt(position) != '>') position++;
            if (position >= buffer.length) return null;
            var end = position;
            position++;

            res = RegxTagName.exec(buffer.substring(start, end + 1));
        }

        var tag = {
            Name: res[2].toUpperCase(),
            Type: res[1] == '<' ? "Begin" : "End",
            Attrs: {}
        };

        if (tag.Type == "Begin" && res[3]) {
            // The regex is global, so it must be rewound before each new attribute string.
            RegxAttrs.lastIndex = 0;
            var attr;
            while ((attr = RegxAttrs.exec(res[3])) != null) {
                tag.Attrs[attr[1].toLowerCase()] = attr[2];
            }
        }
        return tag;
    }

    // The tag currently being looked at by the parser (one-tag lookahead).
    var current = null;

    function IsBeginTag(tag, name) {
        return tag.Type == "Begin" && tag.Name == name;
    }

    function IsEndTag(tag, name) {
        return tag.Type == "End" && tag.Name == name;
    }

    /**
     * Parses the construct that starts at the current tag and advances past it.
     * @returns {Object|null} An LI or UL node, or null when the current tag starts
     *   neither (the tag is then simply skipped) or an LI has no OBJECT.
     */
    function RenderTag() {
        if (current != null && IsBeginTag(current, "LI")) {
            var item = { NodeType: "LI", SubNodes: [] };
            current = ReadTag();

            if (current != null && IsBeginTag(current, "OBJECT")) {
                item.type = current.Attrs["type"];
                current = ReadTag();

                // Collect every <param> up to the closing </OBJECT>.
                while (current != null && !IsEndTag(current, "OBJECT")) {
                    if (IsBeginTag(current, "PARAM")) {
                        var paramName = current.Attrs["name"];
                        // A <param> without a name would otherwise create an "undefined" property.
                        if (paramName !== undefined) {
                            item[paramName] = current.Attrs["value"];
                        }
                    }
                    current = ReadTag();
                }

                if (current != null && IsEndTag(current, "OBJECT")) current = ReadTag();
                // The closing </LI> is optional.
                if (current != null && IsEndTag(current, "LI")) current = ReadTag();

                // Entries of every UL that trails this LI become its children.
                while (current != null && IsBeginTag(current, "UL")) {
                    var list = RenderTag();
                    if (list != null) {
                        for (var i = 0; i < list.Nodes.length; i++) {
                            item.SubNodes.push(list.Nodes[i]);
                        }
                    }
                }
                return item;
            }
        } else if (current != null && IsBeginTag(current, "UL")) {
            var group = { NodeType: "UL", Nodes: [] };
            current = ReadTag();

            while (current != null && !IsEndTag(current, "UL")) {
                var subNode = RenderTag();
                if (subNode != null) group.Nodes.push(subNode);
            }

            // Step over </UL>. When the input ends before the list is closed, keep what was
            // parsed instead of discarding the whole list.
            if (current != null) current = ReadTag();
            return group;
        } else {
            current = ReadTag();
        }
        return null;
    }

    var roots = [];

    /**
     * Parses the whole buffer from the start. Results of a previous call are replaced,
     * not appended to.
     */
    this.Render = function() {
        position = 0;
        roots = [];
        current = ReadTag();
        while (current != null) {
            var node = RenderTag();
            if (node != null) roots.push(node);
        }
        current = null;
    };

    /**
     * @returns {Array} The top-level nodes produced by the last Render() call
     *   (empty before the first call).
     */
    this.GetNodes = function() {
        return roots;
    };
}
调用ChmHHC的Render方法后,将HHC文件转换成一个数组,保存着所有的节点,其结构与目录的对应关系如下图所示:
在上文中,已经介绍了如何提取出CHM文件中的文件(网页,图片等)以及如何解析目录文件,下一篇文章,将介绍如何使用ISAPI筛选器和IHttpHandler来开发一个在线CHM阅读器。
如果觉得文章不错的话,欢迎点一下右下角的推荐。