C#生成HTML(图片)

 使用C#将Word文档转换成HTML,可以直接调用Word程序自带的HTML转换功能。

首先,需要设置权限

1、在命令行中输入:dcomcnfg,会显示出“组件服务”管理器

2、打开“组件服务->计算机->我的电脑->DCOM 配置”,找到“Microsoft Word文档”,单击右键,选择“属性”

3、在“属性”对话框中单击“标识”选项卡,选择“交互式用户”。

4、在“属性”对话框中单击“安全”选项卡,点击“启动和激活权限”自定义,增加“Internet 来宾账号”和“NETWORK SERVICE”,两个账号,并且权限全部选择“允许”;

5、在“属性”对话框中单击“安全”选项卡,点击“访问权限”自定义,增加“Internet 来宾账号”和“NETWORK SERVICE”,两个账号,并且权限全部选择“允许”;

 

(注:有的计算机上需要将第3步设置为“下列用户”,并且填入计算机登录账号和密码。郁闷,上传到服务器上的时候碰到了这个问题,调试了半天,最后采用这个方法解决。)

 

设置完权限,接下来就是开始编写代码

 

新建一个项目WordToHtml

引用WORD组件MSWORD.OLB,在OFFICE的安装文件夹下面可以找到(前提是要安装OFFICE,废话。。。)

新建类 WordToHtml

using Microsoft.Office.Interop.Word;

using System.IO;

using System.Text;

using System.Text.RegularExpressions;

 

#region 生成Html

     // path 是word文件的物理路径

    /// <summary>
    /// Converts a Word document to an HTML file by automating Word through the
    /// interop assembly, then post-processes the saved HTML with a series of
    /// regular-expression replacements intended to remove Word-specific markup.
    /// </summary>
    /// <param name="path">Physical path of the Word file to convert.</param>
    /// <returns>
    /// The string "/upfile/Html/" + generated file name + ".html".
    /// </returns>
    /// <remarks>
    /// NOTE(review): there is no try/finally around the Word calls, so if Open,
    /// SaveAs or Close throws, the Quit call is never reached.
    /// NOTE(review): the output folder is resolved with Server.MapPath on the
    /// relative path "upfile/Html" while the returned URL is root-relative;
    /// confirm both refer to the same location for every calling page.
    /// NOTE(review): several string literals in the regex section look as if
    /// their backslashes were turned into double quotes when this listing was
    /// extracted, and some patterns appear truncated; restore them from the
    /// original source before compiling.
    /// </remarks>
    public string ConverToHtml(string path)

    {

        Microsoft.Office.Interop.Word.Application word = new Microsoft.Office.Interop.Word.ApplicationClass();

        Type wordType = word.GetType();

        Microsoft.Office.Interop.Word.Documents docs = word.Documents;

 

        // Open the file

        Type docsType = docs.GetType();

        object fileName = path;

        // Late-bound call to Documents.Open with three positional arguments.
        // NOTE(review): in the Word object model the 2nd and 3rd parameters of Open are
        // ConfirmConversions and ReadOnly - confirm that "true, true" is what is wanted.
        Microsoft.Office.Interop.Word.Document doc = (Microsoft.Office.Interop.Word.Document)docsType.InvokeMember("Open", System.Reflection.BindingFlags.InvokeMethod, null, (object)docs, new Object[] { fileName, true, true });

 

        //Make sure the folder that stores the generated static pages exists

        if (!Directory.Exists(HttpContext.Current.Server.MapPath("upfile/Html")))

        {

            Directory.CreateDirectory(HttpContext.Current.Server.MapPath("upfile/Html"));

        }

 

        //Name the new file after the current time
        // NOTE(review): the date/time parts are concatenated without zero padding and
        // DateTime.Now is read once per part, so different instants can yield the same name.

        string filename = System.DateTime.Now.Year.ToString() + System.DateTime.Now.Month.ToString() + System.DateTime.Now.Day.ToString() + System.DateTime.Now.Hour.ToString() + System.DateTime.Now.Minute.ToString() + System.DateTime.Now.Second.ToString() + System.DateTime.Now.Millisecond.ToString();

        string ConfigPath = HttpContext.Current.Server.MapPath("upfile/Html/" + filename + ".html");

 

        // Convert the format: "Save As"

        Type docType = doc.GetType();

        object saveFileName = ConfigPath;

        //What follows is the form for the Microsoft Word 9 (11.0) Object Library; for version 10 (untested) it might be written as:

        /*

        docType.InvokeMember("SaveAs", System.Reflection.BindingFlags.InvokeMethod,

         null, doc, new object[]{saveFileName, Word.WdSaveFormat.wdFormatFilteredHTML});

        */

        ///Other formats:

        ///wdFormatHTML

        ///wdFormatDocument

        ///wdFormatDOSText

        ///wdFormatDOSTextLineBreaks

        ///wdFormatEncodedText

        ///wdFormatRTF

        ///wdFormatTemplate

        ///wdFormatText

        ///wdFormatTextLineBreaks

        ///wdFormatUnicodeText

        docType.InvokeMember("SaveAs", System.Reflection.BindingFlags.InvokeMethod, null, doc, new object[] { saveFileName, Microsoft.Office.Interop.Word.WdSaveFormat.wdFormatHTML });

 

        // Close the document (no arguments are passed to Close).
        docType.InvokeMember("Close", System.Reflection.BindingFlags.InvokeMethod, null, doc, null);

        // Quit Word

        wordType.InvokeMember("Quit", System.Reflection.BindingFlags.InvokeMethod, null, word, null);

 

        //Clean the Word formatting out of the generated HTML

        if (File.Exists(ConfigPath))

        {

            string textContent = "";

 

            // The file is read back, and later rewritten, with Encoding.Default.
            using (StreamReader sr = new StreamReader(ConfigPath, Encoding.Default))

            {

                textContent = sr.ReadToEnd();

                textContent = rep(textContent); //strip the attributes from the strong/head/html/div/body tags

                // NOTE(review): in the patterns below a doubled quote ("") most likely stands
                // for an escaped backslash (\\) that was lost in extraction, e.g. [""s""S] for
                // [\\s\\S]. As written these literals do not form the intended patterns.
                textContent = Regex.Replace(textContent, "<o:p>|<""/o:p>", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "<!--""[if[""s""S]*?<!""[endif""]-->", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "<style>[""s""S]*?</style>", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "v:shapes=[""s""S]*?"">", ">", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "<(meta|/?st""d|/?o:|!""[|link)[^>]*?>", "", RegexOptions.IgnoreCase);

                // Remove lang=... attributes.
                textContent = Regex.Replace(textContent, " lang=[^>^ ]*", "", RegexOptions.IgnoreCase);

                // Remove class=... attributes.
                textContent = Regex.Replace(textContent, " class=[^>^ ]*", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "<!--""[if gte[""s""S]*?<!""[endif""]-->", "", RegexOptions.IgnoreCase);

                // NOTE(review): the next four patterns appear truncated (their beginnings are
                // missing); their intent cannot be determined from this listing - recover
                // them from the original source.
                textContent = Regex.Replace(textContent, "]*);", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, ""]{1,}?([^"'^"";]*)[""]{1,}?"";", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "]*)", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, ""]{1,}?([^"'^"";]*)[""]{1,}?", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "<span[^"">]*></span>", "", RegexOptions.IgnoreCase);

                textContent = Regex.Replace(textContent, "<p[^"">]*></p>", "", RegexOptions.IgnoreCase);

                // Drop the attributes of a span that directly wraps an img.
                textContent = Regex.Replace(textContent, "<span[^>]*><img", "<span><img", RegexOptions.IgnoreCase);

 

                // NOTE(review): the result of Trim() is discarded, so this statement has no
                // effect on textContent.
                textContent.Trim();

                // Explicit close inside the using block; the using statement disposes sr as well.
                sr.Close();

            }

 

            // Overwrite the saved HTML file with the cleaned text.
            FileStream fs = new FileStream(ConfigPath, FileMode.Create, FileAccess.Write);

            StreamWriter sw = new StreamWriter(fs, Encoding.Default);

            sw.Write(textContent);

            sw.Flush();

            sw.Close();

        }

        return "/upfile/Html/" + filename + ".html";

    }

 

    /// <summary>

    /// 清除HTML标签的多余

    /// </summary>

    /// <param name="el">标签名</param>

    /// <param name="str">源字符串</param>

    /// <returns></returns>

    private string repElement(string el, string str)

    {

        string pat = @"<" + el + "[^>]+>";

        string rep = "<" + el + ">";

        str = Regex.Replace(str.ToString(), pat, rep);

        return str;

    }

 

    private string rep(string str)

    {

        string[] el = new string[] {"strong", "head", "html", "div", "body" };

        foreach (string s in el)

        {

            str = repElement(s, str);

        }

        return str;

    }

#endregion

 

WORD转换HTML需要用正则表达式过滤掉WORD的格式,不然生成的HTML文档带有WORD格式,显示可能会出问题。

 

 

缺点:word转换html,并不能保证html的排版跟原来的word格式相同。本来以为是HTML过滤WORD格式发生问题,后来研究了QQ邮箱的WORD转换HTML功能,发现其也无法保证排版的一致,QQ邮箱的WORD转换HTML功能,重新对HTML进行排版,过滤了所有的WORD格式代码,并且把标签的样式统一到CSS样式中,重新使用CSS类进行命名。这种方法生成的HTML代码简洁,符合XHTML规范,但也无法解决排版一致的问题。

 

有的项目,比如OA,会涉及到公文,这个时候客户就会要求排版与原文一致。如何保证排版的一致性呢?可以采用另外一种方法:把Word生成图片。实现的思路是,使用Office自带的Microsoft Office Document Image Writer虚拟打印机,将文档打印成tiff格式(一种传真格式),然后转换成图片。

 

具体实现如下:

首先是将Microsoft Office Document Image Writer打印机设置成默认打印机,点击打印机属性,点击“常规”选项卡里面的“打印首选项”,找到“高级”选项卡,设置“输出格式”为TIFF(标准100DPI),默认文件夹选择到你想要保存的地方。接着打开任意一个WORD文档,选择打印,打印机默认选择“Microsoft Office Document Image Writer”,点击“打印”,保存,默认生成tif格式的图片,使用Microsoft Office Document Imaging可以打开tif文档,进行查看。

注:如果打印机列表中没有Microsoft Office Document Image Writer,请使用Office完整安装盘重新安装。

然后是设置word调用权限,方法如上文

最后是代码实现

#region 生成图片,并且创建Html

    /// <summary>

    /// 将word转成tiff图片,并且创建Html文档

    /// </summary>

    /// <param name="sourceFile"></param>

    /// <returns></returns>

    public string ConverToPic(object sourceFile)

   {

        //定义存储图片的文件夹

        if (!Directory.Exists(HttpContext.Current.Server.MapPath("upfile/Html/images")))

        {

            Directory.CreateDirectory(HttpContext.Current.Server.MapPath("upfile/Html/images"));

        }

        //定义图片名称,使用时间为名称

        string fileName = System.DateTime.Now.Year.ToString() + System.DateTime.Now.Month.ToString() + System.DateTime.Now.Day.ToString() + System.DateTime.Now.Hour.ToString() + System.DateTime.Now.Minute.ToString() + System.DateTime.Now.Second.ToString() + System.DateTime.Now.Millisecond.ToString();

        //图片不加后缀名,防止word打印程序打开预览窗口

        string tifPath = HttpContext.Current.Server.MapPath("upfile/Html/images/" + fileName);

 

        try

        {

            //生成的tif路径

            object OutputFileName = (object)tifPath;

            object varMissing = Type.Missing;

            //定义Word实例对象

            Microsoft.Office.Interop.Word.Application varWord = new Microsoft.Office.Interop.Word.Application();

            Type wordType = varWord.GetType();

 

            //设置Word的虚拟打印机

            varWord.ActivePrinter = "Microsoft Office Document Image Writer";

            //定义Word文档实例对象

            Microsoft.Office.Interop.Word.Document varDoc = varWord.Documents.Open(

                ref sourceFile, ref varMissing, ref varMissing, ref varMissing, ref varMissing,

                ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref varMissing,

                ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref varMissing);

            Type docType = varDoc.GetType();

            //激活文档

            varDoc.Activate();

            object PrintToFile = true;

            //打印成tif文件

            varDoc.PrintOut(ref varMissing, ref varMissing, ref varMissing, ref OutputFileName, ref varMissing,

                            ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref PrintToFile,

                            ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref varMissing, ref varMissing);

            object saveChange = WdSaveOptions.wdSaveChanges; //关闭word文档时不打开新窗体

            docType.InvokeMember("Close", System.Reflection.BindingFlags.InvokeMethod, null, varDoc, new object[] { saveChange, varMissing, varMissing });

            wordType.InvokeMember("Quit", System.Reflection.BindingFlags.InvokeMethod, null, varWord, null);

 

        }

        catch (Exception ex)

        {

            throw new Exception(ex.Message);

        }

 

        //等待word打印tif,线程暂停3秒,因为打印会出现延迟

        System.Threading.Thread.Sleep(3000);

 

//生成HTML格式的文档,并把转换的图片插入到HTML中

        StringBuilder sbd = new StringBuilder();

        sbd.Append("<html>"n"r");

        sbd.Append("<head>"n"r");

        sbd.Append("<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"n"r");

        sbd.Append("<style type='text/css'>"n"r");

        sbd.Append("body{margin:0; padding:0; text-align:center; background:#fff;}"n"r");

        sbd.Append("</style>"n"r");

        sbd.Append("</head>"n"r");

        sbd.Append("<body>"n"r");

 

        if (File.Exists(tifPath))

        {

            System.Drawing.Image imgInFile = System.Drawing.Image.FromFile(tifPath);

            //获取图片的维数

            Guid objGuid = (imgInFile.FrameDimensionsList[0]);

            System.Drawing.Imaging.FrameDimension objDimension = new System.Drawing.Imaging.FrameDimension(objGuid);

            //总页数

            int totFrame = 0;

            totFrame = imgInFile.GetFrameCount(objDimension);

            //循环tif页数,生成gif图片

            for (int i = 0; i < totFrame; i++)

            {

                imgInFile.SelectActiveFrame(objDimension, i);

                string newGifPath = string.Format("{0}-{1}.gif", tifPath, i);

                imgInFile.Save(newGifPath, ImageFormat.Gif);

                sbd.AppendFormat("<img src='images/{0}-{1}.gif' border='0' alt='第{2}页'/>"n"r", fileName, i, i + 1);

            }

            imgInFile.Dispose();

 

        }

 

        sbd.Append("</body>"n"r");

        sbd.Append("</html>"n"r");

 

        //定义生成的html路径

        string htmlPath = HttpContext.Current.Server.MapPath(string.Format("upfile/Html/{0}.html", fileName));

 

        //生成html文件

        try

        {

            FileStream fs = new FileStream(htmlPath, FileMode.OpenOrCreate, FileAccess.Write);

            StreamWriter sw = new StreamWriter(fs, Encoding.UTF8);

            sw.WriteLine(sbd.ToString());

            sw.Flush();

            sw.Close();

            fs.Close();

        }

        catch (Exception ex)

        {

            throw new Exception(ex.Message);

        }

 

        //删除fileName文件

        try

        {

            File.Delete(fileName);

        }

        catch

        { }

 

        return string.Format("/upfile/Html/{0}.html", fileName);

 

    }

 

    #endregion

 

使用这种方法,可以保证生成的HTML页面,排版一致,缺点是图片只能是黑白,无法使用彩色。(暂时没有找到可以生成彩色的方法)

以上方法参考网上的资料

posted @ 2009-02-07 08:50  阿sam  阅读(3527)  评论(2)  编辑  收藏  举报