使用NPOI读取Word、Excel文档内容
使用NPOI读取Excel的例子很多,读取Word的例子不多。
Excel的解析方式有多种,可以使用ODBC查询,把Excel作为一个数据集对待。也可以使用文档结构模型的方式进行解析,即解析Workbook(工作簿)、Sheet、Row、Column。
Word的解析比较复杂,因为Word的文档结构模型定义较为复杂。解析Word或者Excel,关键是理解Word、Excel的文档对象模型。
Word、Excel文档对象模型的解析,可以通过COM接口调用,此类方式使用较广。(可以录制宏代码,然后替换为对应的语言)
也可以使用XML模型解析,尤其是对于2007、2010版本的文档的解析。
using NPOI.POIFS.FileSystem;
using NPOI.SS.UserModel;
using NPOI.XSSF.UserModel;
using NPOI.XWPF.UserModel;
using System;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Text;

namespace eyuan
{
    /// <summary>
    /// Helpers that extract the textual content of .xlsx workbooks and .docx
    /// documents through NPOI. Separators and capture switches are read from
    /// the application's appSettings on every call.
    /// </summary>
    public static class NOPIHandler
    {
        /// <summary>
        /// Reads every sheet of an .xlsx workbook into nested lists:
        /// workbook -> sheets -> rows -> cell texts.
        /// </summary>
        /// <param name="fileName">Path of the workbook file to open for reading.</param>
        /// <returns>
        /// One list per sheet, each holding one list per existing row, each holding
        /// the <c>ToString()</c> text of the row's cells. If the file cannot be
        /// opened the failure is logged and an empty list is returned.
        /// </returns>
        public static List<List<List<string>>> ReadExcel(string fileName)
        {
            List<List<List<string>>> workBookContent = new List<List<List<string>>>();

            // Open the workbook. The stream is only needed while the workbook is constructed.
            XSSFWorkbook workbook = null;
            try
            {
                using (FileStream file = new FileStream(fileName, FileMode.Open, FileAccess.Read))
                {
                    workbook = new XSSFWorkbook(file);
                }
            }
            catch (Exception e)
            {
                LogHandler.LogWrite(string.Format("文件{0}打开失败,错误:{1}", fileName, e.ToString()));
                // Nothing was loaded; return the empty result instead of
                // dereferencing the null workbook below.
                return workBookContent;
            }

            // Sheet indexes start at 0.
            int sheetsCount = workbook.NumberOfSheets;
            for (int i = 0; i < sheetsCount; i++)
            {
                ISheet sheet = workbook.GetSheetAt(i);
                List<List<string>> sheetContent = new List<List<string>>();

                // Only walk the row range when the sheet actually has rows.
                if (sheet.PhysicalNumberOfRows > 0)
                {
                    // Walk the full logical row range rather than 0..PhysicalNumberOfRows-1:
                    // the physical count says how many rows exist, not which indexes they
                    // occupy, so a sheet with gaps would otherwise hit a null row or miss
                    // trailing rows.
                    for (int j = sheet.FirstRowNum; j <= sheet.LastRowNum; j++)
                    {
                        IRow row = sheet.GetRow(j);
                        if (row == null)
                        {
                            // No row object at this index; skip the gap.
                            continue;
                        }

                        // Rows may have different numbers of cells; row.Cells holds
                        // only the cells present in this row.
                        List<string> rowContent = new List<string>();
                        foreach (ICell cell in row.Cells)
                        {
                            rowContent.Add(cell == null ? "NIL" : cell.ToString());
                        }

                        sheetContent.Add(rowContent);
                    }
                }

                workBookContent.Add(sheetContent);
            }

            return workBookContent;
        }

        /// <summary>
        /// Flattens a workbook into a single string using the separators configured
        /// under the appSettings keys ExcelCellSeparator, ExcelRowSeparator and
        /// ExcelSheetSeparator.
        /// </summary>
        /// <param name="fileName">Path of the workbook file to read.</param>
        /// <returns>
        /// Cell texts joined by the cell separator, with the row separator appended
        /// after each row and the sheet separator appended after each sheet.
        /// </returns>
        public static string ReadExcelText(string fileName)
        {
            string ExcelCellSeparator = ConfigurationManager.AppSettings["ExcelCellSeparator"];
            string ExcelRowSeparator = ConfigurationManager.AppSettings["ExcelRowSeparator"];
            string ExcelSheetSeparator = ConfigurationManager.AppSettings["ExcelSheetSeparator"];

            List<List<List<string>>> excelContent = ReadExcel(fileName);
            StringBuilder sbFileText = new StringBuilder();

            foreach (List<List<string>> sheetContent in excelContent)
            {
                foreach (List<string> rowContent in sheetContent)
                {
                    sbFileText.Append(string.Join(ExcelCellSeparator, rowContent.ToArray()));
                    sbFileText.Append(ExcelRowSeparator);
                }

                sbFileText.Append(ExcelSheetSeparator);
            }

            return sbFileText.ToString();
        }

        /// <summary>
        /// Reads the text content of a .docx document. Headers, footers, tables and
        /// embedded pictures are captured only when the matching appSettings switch
        /// (CaptureWordHeader, CaptureWordFooter, CaptureWordTable, CaptureWordImage)
        /// equals "true"; body paragraphs are always captured.
        /// </summary>
        /// <param name="fileName">Path of the document file to open for reading.</param>
        /// <returns>
        /// The captured sections, each wrapped in "Capture ... Begin"/"Capture ... End"
        /// marker lines. If the file cannot be opened the failure is logged and an
        /// empty string is returned.
        /// </returns>
        public static string ReadWordText(string fileName)
        {
            string WordTableCellSeparator = ConfigurationManager.AppSettings["WordTableCellSeparator"];
            string WordTableRowSeparator = ConfigurationManager.AppSettings["WordTableRowSeparator"];
            string WordTableSeparator = ConfigurationManager.AppSettings["WordTableSeparator"];

            string CaptureWordHeader = ConfigurationManager.AppSettings["CaptureWordHeader"];
            string CaptureWordFooter = ConfigurationManager.AppSettings["CaptureWordFooter"];
            string CaptureWordTable = ConfigurationManager.AppSettings["CaptureWordTable"];
            string CaptureWordImage = ConfigurationManager.AppSettings["CaptureWordImage"];

            // Used below as a string.Format pattern whose {0} receives the generated
            // picture file name.
            string CaptureWordImageFileName = ConfigurationManager.AppSettings["CaptureWordImageFileName"];

            StringBuilder sbFileText = new StringBuilder();

            #region Open document
            XWPFDocument document = null;
            try
            {
                using (FileStream file = new FileStream(fileName, FileMode.Open, FileAccess.Read))
                {
                    document = new XWPFDocument(file);
                }
            }
            catch (Exception e)
            {
                LogHandler.LogWrite(string.Format("文件{0}打开失败,错误:{1}", fileName, e.ToString()));
                // Nothing was loaded; return an empty result instead of
                // dereferencing the null document below.
                return string.Empty;
            }
            #endregion

            #region Headers and footers
            if (CaptureWordHeader == "true")
            {
                sbFileText.AppendLine("Capture Header Begin");
                foreach (XWPFHeader xwpfHeader in document.HeaderList)
                {
                    sbFileText.AppendLine(xwpfHeader.Text);
                }
                sbFileText.AppendLine("Capture Header End");
            }

            if (CaptureWordFooter == "true")
            {
                sbFileText.AppendLine("Capture Footer Begin");
                foreach (XWPFFooter xwpfFooter in document.FooterList)
                {
                    sbFileText.AppendLine(xwpfFooter.Text);
                }
                sbFileText.AppendLine("Capture Footer End");
            }
            #endregion

            #region Tables
            if (CaptureWordTable == "true")
            {
                sbFileText.AppendLine("Capture Table Begin");
                foreach (XWPFTable table in document.Tables)
                {
                    foreach (XWPFTableRow row in table.Rows)
                    {
                        foreach (XWPFTableCell cell in row.GetTableCells())
                        {
                            // The separator follows every cell, including the last one in the row.
                            sbFileText.Append(cell.GetText());
                            sbFileText.Append(WordTableCellSeparator);
                        }

                        sbFileText.Append(WordTableRowSeparator);
                    }
                    sbFileText.Append(WordTableSeparator);
                }
                sbFileText.AppendLine("Capture Table End");
            }
            #endregion

            #region Pictures
            if (CaptureWordImage == "true")
            {
                sbFileText.AppendLine("Capture Image Begin");
                foreach (XWPFPictureData pictureData in document.AllPictures)
                {
                    string picExtName = pictureData.suggestFileExtension();
                    string picFileName = pictureData.GetFileName();
                    byte[] picFileContent = pictureData.GetData();

                    // A fresh GUID prefix gives each extracted picture its own file name.
                    string picTempName = string.Format(
                        CaptureWordImageFileName,
                        Guid.NewGuid().ToString() + "_" + picFileName + "." + picExtName);

                    // The using block closes the stream; no explicit Close() is needed.
                    using (FileStream fs = new FileStream(picTempName, FileMode.Create, FileAccess.Write))
                    {
                        fs.Write(picFileContent, 0, picFileContent.Length);
                    }

                    // Record where the picture was written.
                    sbFileText.AppendLine(picTempName);
                }
                sbFileText.AppendLine("Capture Image End");
            }
            #endregion

            // Body paragraphs are always captured.
            sbFileText.AppendLine("Capture Paragraph Begin");
            foreach (XWPFParagraph paragraph in document.Paragraphs)
            {
                sbFileText.AppendLine(paragraph.ParagraphText);
            }
            sbFileText.AppendLine("Capture Paragraph End");

            return sbFileText.ToString();
        }
    }
}
作者:马洪彪
出处:http://www.cnblogs.com/mahongbiao/
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文链接,否则保留追究法律责任的权利。