部分说明文档

本博客仅用于展示文档,内容为文档的部分截取,完整内容可在项目文件夹中查看。

C705团队代码说明文档

该文档由C705团队在以前学霸项目Pipeline以及本团队所实现的全部代码的基础上添加注释说明而成。

C705团队主要修改的类为:OtherToHtml, DataMining, Denoising, WordSegment, GoogleTranslator, MainWindow

其他类大部分为以前团队所写代码,其中大部分注释为本团队标注,仅供参考用。

class OtherToHtml

    {

        /// <summary>
        /// Contract for a document that can be transformed.
        /// </summary>
        public interface IcDocument

        {

            /// <summary>
            /// Runs the transformation of this document.
            /// </summary>
            void TransformDocument();

        }

        public abstract class BaseDocument

        {

            /// <summary>

            /// 目標文件夾

            /// </summary>

            protected string TargetFolder;

            /// <summary>

            /// 原文件

            /// </summary>

            protected string source;

 

            /// <summary>

            /// 目標文件

            /// </summary>

            protected string Target;

 

            protected virtual void GetCurrentTarget()

            {

 

                if (!Directory.Exists(TargetFolder))

                {

 

                    Directory.CreateDirectory(TargetFolder);

                }

 

                FileInfo temp = new FileInfo(source);

                string fileName = temp.Name + ".html";

                Target = TargetFolder + @"\" + fileName;

            }

 

            public BaseDocument(string TargetFolder, string source)

            {

                this.source = source;

                this.TargetFolder = TargetFolder;

                GetCurrentTarget();

            }

        }

        public class FactoryDocument

        {

            /// <summary>

            /// 得到操作的文檔

            /// </summary>

            /// <param name="TargetFolder">生成的文件夾</param>

            /// <param name="source">要讀取的文件</param>

            /// <returns></returns>

            public static IcDocument GetDocoment(string TargetFolder, string source)

            {

 

                FileInfo file = new FileInfo(source);

                IcDocument document = null;

                if (file.Exists)

                {

                    switch (Path.GetExtension(source).ToUpper())

                    {

 

 

                        case ".PDF":

                      

                            document = new PdfDocument(TargetFolder, source);

                            break;

 

                    }

                }

                else

                {

                    MessageBox.Show("文件沒有找到");

                }

                return document;

            }

 

            internal static IcDocument GetDocoment(DirectoryInfo directoryInfo, string curItem)

            {

                throw new NotImplementedException();

            }

        }

        public class PdfDocument : BaseDocument, IcDocument

        {

 

            /// <summary>
            /// Creates a PDF document handler; both arguments are forwarded
            /// unchanged to the base-class constructor.
            /// </summary>
            /// <param name="TargetFolder">Folder that receives the generated file.</param>
            /// <param name="source">Path of the PDF file to read.</param>
            public PdfDocument(string TargetFolder, string source)

                : base(TargetFolder, source)

            {

 

            }

函数功能:将读到的pdf文件转化为txt文件

输入:要转化的pdf文件(FileInfo对象)

输出:转化好的文本文件(写入Target所指向的路径)

            public void pdf2txt(FileInfo file)

            {

                PDDocument doc = PDDocument.load(file.FullName);

 

                PDFTextStripper pdfStripper = new PDFTextStripper();

 

                string text = pdfStripper.getText(doc);

 

                StreamWriter swPdfChange = new StreamWriter(Target, false, Encoding.GetEncoding(65001));

 

                swPdfChange.Write(text);

                swPdfChange.Close();

 

            }

                            函数功能:处理txt文件,清理转化失败的标识符、图片等产生的乱码

                            输入:无参数(处理Target所指向的文件)

                            输出:处理好的文件(结果写回原文件)

            public void handletxt()

            {

                String path = Target;

                String[] lines = File.ReadAllLines(path);

                List<String> list = new List<String>();

                foreach (String line in lines)

                {

                    if (line.Length > 4)//长度小于4的行,视为处理失败的行

                        list.Add(line);

                }

                lines = list.ToArray();

                File.WriteAllLines(path, lines);//将处理结果写回文件

            }

                            函数功能:处理pdf文件

                            输入:无参数(处理source所指向的pdf文件)

                            输出:已经处理过并且去除部分乱码的文件

            public void TransformDocument()

            {

                FileInfo pdffile = new FileInfo(source);

 

                if (pdffile.Exists)

                {

                    pdf2txt(pdffile);

                    handletxt();

                }

                else

                {

                    Console.WriteLine("The File is NOT Exist.");

                }

            }

        }

    }

}

 

posted @ 2015-01-15 13:46  C705  阅读(145)  评论(0编辑  收藏  举报