C#读取PDF文档文字内容

C#读取PDF文档文字内容

通过iTextSharp读取PDF文件内容,下载地址,下载后解压itextsharp-dll-core.zip。

只能读取英文和数字,文档中包含的汉字无法正常读取:

private string ReadPdfContent(string filepath)  
{  
    try  
    {  
        string pdffilename = filepath;  
        PdfReader pdfReader = new PdfReader(pdffilename);  
        int numberOfPages = pdfReader.NumberOfPages;  
        string text = string.Empty;  
  
        for (int i = 1; i <= numberOfPages; ++i)  
        {  
            byte[] bufferOfPageContent = pdfReader.GetPageContent(i);  
            text += System.Text.Encoding.UTF8.GetString(bufferOfPageContent);  
        }  
        pdfReader.Close();  
  
        return text;  
    }  
    catch (Exception ex)  
    {  
        StreamWriter log = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase+"\\log.log");  
        log.WriteLine("出错文件:" + e.FullPath + "原因:" + ex.ToString());  
        log.Flush();  
        log.Close();return null;  
    } 
}  

 

可以读取中英文

private string OnCreated(string filepath)  
{  
    try  
    {  
        string pdffilename = filepath;  
        PdfReader pdfReader = new PdfReader(pdffilename);  
        int numberOfPages = pdfReader.NumberOfPages;  
        string text = string.Empty;  
  
        for (int i = 1; i <= numberOfPages; ++i)  
        {  
            iTextSharp.text.pdf.parser.ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
            text += iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(pdfReader, i, strategy);
        }  
        pdfReader.Close();  
  
        return text;  
    }  
    catch (Exception ex)  
    {  
        StreamWriter wlog = File.AppendText(System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase+"\\mylog.log");  
        wlog.WriteLine("出错文件:" + e.FullPath + "原因:" + ex.ToString());  
        wlog.Flush();  
        wlog.Close();return null;  
    }  
  
} 

 

posted @ 2016-11-18 14:26  tzdk  阅读(15190)  评论(5编辑  收藏  举报