c#读取doc,pdf,ppt,txt文件
doc pdf ppt与 txt之间的转换 :
组件的作用一般是将文件读出成字符格式,并不是单纯的转换文件名后缀,所以需要将读出的东西写入txt文件 。
添加office引用
在 .NET 中对 Office 的 Word 和 PowerPoint 进行编程时,需确保安装 Office 时已经选装了 Word、PowerPoint 的“.NET 可编程性支持”组件(自定义安装时可查看),或者单独安装“Microsoft Office 2003 Primary Interop Assemblies”。
安装后,在编程页面添加引用:
添加引用 → COM → Microsoft PowerPoint 11.0 Object Library / Microsoft Word 11.0 Object Library;
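还需添加 Office 核心组件的引用(Microsoft Office 11.0 Object Library,提供下文用到的 Microsoft.Office.Core.MsoTriState),并在代码文件顶部加入以下 using 指令: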
using System.IO;
using System.Text;
using Microsoft.Office.Interop.PowerPoint;
using Microsoft.Office.Interop.Word;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
/// <summary>
/// Extracts the text of a PDF file with PDFBox and writes it to a text file
/// encoded as GB2312.
/// </summary>
/// <param name="file">The PDF file to read.</param>
/// <param name="txtfile">The text file to create; an existing file is overwritten.</param>
public void pdf2txt(FileInfo file, FileInfo txtfile)
{
    string text;
    PDDocument doc = PDDocument.load(file.FullName);
    try
    {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        text = pdfStripper.getText(doc);
    }
    finally
    {
        // Close the document even when text extraction fails so the
        // resources held by PDFBox are released.
        doc.close();
    }

    // "using" guarantees the writer is flushed and closed even if Write throws.
    using (StreamWriter swPdfChange = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("gb2312")))
    {
        swPdfChange.Write(text);
    }
}
对于doc文件中的表格,读出的结果是去除掉了网格线,内容按行读取。
/// <summary>
/// Opens a Word document read-only through Word automation, reads the text of
/// its main content and writes that text to a text file encoded as GB2312.
/// </summary>
/// <param name="file">The Word document to read.</param>
/// <param name="txtfile">The text file to create; an existing file is overwritten.</param>
public void word2text(FileInfo file, FileInfo txtfile)
{
    object readOnly = true;
    object missing = System.Reflection.Missing.Value;
    object fileName = file.FullName;
    string text;

    Microsoft.Office.Interop.Word.ApplicationClass wordapp = new Microsoft.Office.Interop.Word.ApplicationClass();
    try
    {
        // The third argument (ReadOnly) is the only optional one supplied;
        // everything else is left at Word's default.
        Microsoft.Office.Interop.Word.Document doc = wordapp.Documents.Open(ref fileName,
            ref missing, ref readOnly, ref missing, ref missing, ref missing,
            ref missing, ref missing, ref missing, ref missing, ref missing,
            ref missing, ref missing, ref missing, ref missing, ref missing);
        try
        {
            text = doc.Content.Text;
        }
        finally
        {
            doc.Close(ref missing, ref missing, ref missing);
        }
    }
    finally
    {
        // Always shut the automation instance down; otherwise a failure above
        // leaves an orphaned WINWORD.EXE process behind.
        wordapp.Quit(ref missing, ref missing, ref missing);
    }

    // "using" guarantees the writer is flushed and closed even if Write throws.
    using (StreamWriter swWordChange = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("gb2312")))
    {
        swWordChange.Write(text);
    }
}
/// <summary>
/// Opens a PowerPoint presentation read-only and without a window, concatenates
/// the text of every shape on every slide (no separator is inserted between
/// shapes) and writes the result to a text file encoded as GB2312.
/// </summary>
/// <param name="file">The presentation to read.</param>
/// <param name="txtfile">The text file to create; an existing file is overwritten.</param>
public void ppt2txt(FileInfo file, FileInfo txtfile)
{
    StringBuilder pps = new StringBuilder();

    Microsoft.Office.Interop.PowerPoint.Application pa = new Microsoft.Office.Interop.PowerPoint.ApplicationClass();
    try
    {
        Microsoft.Office.Interop.PowerPoint.Presentation pp = pa.Presentations.Open(file.FullName,
            Microsoft.Office.Core.MsoTriState.msoTrue,
            Microsoft.Office.Core.MsoTriState.msoFalse,
            Microsoft.Office.Core.MsoTriState.msoFalse);
        try
        {
            foreach (Microsoft.Office.Interop.PowerPoint.Slide slide in pp.Slides)
            {
                foreach (Microsoft.Office.Interop.PowerPoint.Shape shape in slide.Shapes)
                {
                    // Shapes such as pictures or connectors have no text frame;
                    // reading TextFrame.TextRange on them can throw, so skip them.
                    if (shape.HasTextFrame == Microsoft.Office.Core.MsoTriState.msoTrue
                        && shape.TextFrame.HasText == Microsoft.Office.Core.MsoTriState.msoTrue)
                    {
                        pps.Append(shape.TextFrame.TextRange.Text);
                    }
                }
            }
        }
        finally
        {
            pp.Close();
        }
    }
    finally
    {
        // Quit only when no other presentation is open, so an instance that
        // is also in use elsewhere is not torn down.
        if (pa.Presentations.Count == 0)
        {
            pa.Quit();
        }
    }

    // "using" guarantees the writer is flushed and closed even if Write throws.
    using (StreamWriter swPPtChange = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("gb2312")))
    {
        swPPtChange.Write(pps.ToString());
    }
}
读取不同类型的文件
/// <summary>
/// Returns a GB2312 <see cref="StreamReader"/> over the textual content of a file.
/// A .txt file is opened directly; .doc, .pdf and .ppt files are first converted
/// to a temporary text file by word2text, pdf2txt or ppt2txt.
/// </summary>
/// <param name="file">The file to read; the type is chosen by its extension, ignoring case.</param>
/// <returns>
/// A reader the caller must close, or <c>null</c> when the extension is not one
/// of .txt, .doc, .pdf or .ppt.
/// </returns>
public StreamReader text2reader(FileInfo file)
{
    Encoding gb2312 = Encoding.GetEncoding("gb2312");
    string extension = file.Extension.ToLowerInvariant();

    if (extension == ".txt")
    {
        return new StreamReader(file.FullName, gb2312);
    }

    if (extension != ".doc" && extension != ".pdf" && extension != ".ppt")
    {
        return null;
    }

    // A unique file in the system temp directory replaces the former hard-coded
    // absolute paths, so the method works on any machine and concurrent calls
    // no longer overwrite each other's output.
    string tempPath = System.IO.Path.GetTempFileName();
    try
    {
        FileInfo converted = new FileInfo(tempPath);
        switch (extension)
        {
            case ".doc":
                word2text(file, converted);
                break;
            case ".pdf":
                pdf2txt(file, converted);
                break;
            default:
                ppt2txt(file, converted);
                break;
        }

        // DeleteOnClose removes the temporary file once the caller closes the reader.
        FileStream stream = new FileStream(tempPath, FileMode.Open, FileAccess.Read,
            FileShare.Read, 4096, FileOptions.DeleteOnClose);
        return new StreamReader(stream, gb2312);
    }
    catch
    {
        TryDeleteTempFile(tempPath);
        throw;
    }
}

/// <summary>
/// Best-effort removal of a temporary file; a failure to delete is ignored so
/// that it cannot hide the exception that triggered the cleanup.
/// </summary>
private static void TryDeleteTempFile(string path)
{
    try
    {
        System.IO.File.Delete(path);
    }
    catch (IOException)
    {
        // Deliberately ignored: the file is only a leftover in the temp directory.
    }
    catch (System.UnauthorizedAccessException)
    {
        // Deliberately ignored for the same reason.
    }
}