PDFToText with iTextSharp -- Extract text from PDF in C# (100% .NET)（推荐）

Posted on 2008-03-25 11:34 黑*马阅读(1807) 评论(1) 编辑收藏举报

using System;

using System.IO;

using iTextSharp.text.pdf;

namespace PdfToText

{

/// <summary>

/// Parses a PDF file and extracts the text from it.

/// </summary>

public class PDFParser

{

// PDF text operator reference:
//   BT     begins a text object
//   ET     ends a text object
//   Td     moves to the start of the next line
//   5 Ts   sets a positive text rise (superscript)
//   -5 Ts  sets a negative text rise (subscript)

#region Fields

#region _numberOfCharsToKeep

/// <summary>
/// Length of the look-behind buffer used while scanning a content stream.
/// ExtractTextFromPDFBytes allocates a char array of this size, shifts every
/// input character through it, and hands it to CheckToken so that recently
/// seen operator tokens (for example "BT" or "ET") can be recognised.
/// </summary>
private static int _numberOfCharsToKeep = 15;

#endregion

#region ExtractText

/// <summary>
/// Extracts the text of every page of a PDF file and writes it, pages separated
/// by a single space, to a UTF-8 text file. While working it draws a fixed-width
/// bar of '#' characters on the console as a progress indicator.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>
/// <c>true</c> if all pages were processed and written; <c>false</c> if any
/// exception occurred (the reason is reported on the standard error stream).
/// </returns>
public bool ExtractText(string inFileName, string outFileName)
{
    // Total number of '#' characters that make up a complete progress bar.
    const int progressBarWidth = 68;

    try
    {
        // Open the PDF first so that no output file is created for an unreadable input.
        PdfReader reader = new PdfReader(inFileName);
        try
        {
            using (StreamWriter outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8))
            {
                Console.Write("Processing: ");

                int pageCount = reader.NumberOfPages;
                int written = 0;

                for (int page = 1; page <= pageCount; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)));
                    outFile.Write(" ");

                    // Advance the bar to the share of its width that the pages
                    // processed so far have earned. Integer arithmetic keeps the
                    // bar exact: it reaches full width on the last page.
                    int target = page * progressBarWidth / pageCount;
                    if (target > written)
                    {
                        Console.Write(new string('#', target - written));
                        written = target;
                    }
                }

                // A document without pages never enters the loop; still show a full bar.
                if (written < progressBarWidth)
                {
                    Console.Write(new string('#', progressBarWidth - written));
                }
            }

            return true;
        }
        finally
        {
            // Release the reader's hold on the input file.
            reader.Close();
        }
    }
    catch (Exception ex)
    {
        // The bool return value is the contract with callers, so do not rethrow;
        // but do not hide the reason for the failure either.
        Console.Error.WriteLine("Error: " + ex.Message);
        return false;
    }
}

#endregion

#region ExtractTextFromPDFBytes

/// <summary>
/// Scans the bytes of an uncompressed page content stream and collects the
/// characters that appear inside parenthesised strings within text objects
/// (between the BT and ET operators). Line-positioning operators are turned
/// into line breaks and a space is emitted after each Tj and each ET.
/// </summary>
/// <remarks>
/// Every input byte is treated as one character and only bytes in the ranges
/// 32..126 and 128..254 are kept, so multi-byte encoded text is not decoded.
/// </remarks>
/// <param name="input">the uncompressed content stream bytes; may be null or empty</param>
/// <returns>
/// the extracted text; an empty string if <paramref name="input"/> is null or
/// empty, or if any exception occurs while scanning
/// </returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return "";
    }

    // Operator groups, created once per call instead of once per input byte.
    string[] moveTokens = new string[] { "TD", "Td" };
    string[] nextLineTokens = new string[] { "'", "T*", "\"" };
    string[] showTextTokens = new string[] { "Tj" };
    string[] endTextTokens = new string[] { "ET" };
    string[] beginTextTokens = new string[] { "BT" };

    try
    {
        // A builder avoids re-copying the whole result for every appended character.
        System.Text.StringBuilder result = new System.Text.StringBuilder();

        // True while the scan position is inside a BT ... ET text object.
        bool inTextObject = false;

        // True when the previous character was an unescaped backslash, so the
        // current character is literal (e.g. "\\" yields '\' and "\(" yields '(').
        bool nextLiteral = false;

        // 1 while inside a (...) string, 0 otherwise. Text appears inside ().
        int bracketDepth = 0;

        // Look-behind buffer of the most recent characters, newest last.
        int keep = _numberOfCharsToKeep;
        char[] previousCharacters = new char[keep];
        for (int j = 0; j < keep; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Outside a string: translate positioning operators into whitespace.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveTokens, previousCharacters))
                    {
                        // CR LF; the pair was previously emitted reversed ("\n\r").
                        result.Append("\r\n");
                    }
                    else if (CheckToken(nextLineTokens, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(showTextTokens, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                {
                    // End of the text object.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start of a string: begin collecting text.
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // End of the string: stop collecting text.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Escape character: take the next character as-is.
                        nextLiteral = true;
                    }
                    else
                    {
                        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                        {
                            result.Append(c);
                        }

                        nextLiteral = false;
                    }
                }
            }

            // Slide the look-behind buffer one position and append the current character.
            Array.Copy(previousCharacters, 1, previousCharacters, 0, keep - 1);
            previousCharacters[keep - 1] = c;

            // Start of a text object.
            if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return result.ToString();
    }
    catch
    {
        // Deliberate best effort: a page that cannot be scanned contributes no text.
        return "";
    }
}

#endregion

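// [CheckToken: this appears to be a collapsed #region in the original post; the method body is not shown here]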

}

usage:

using System;

using System.Text;

using System.IO;

namespace PdfToText
{
    /// <summary>
    /// The main entry point to the program.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Converts the PDF file named by the first command-line argument to a
        /// text file. The output file takes the input's file name with a ".txt"
        /// extension and no directory part, so it is written relative to the
        /// current working directory.
        /// </summary>
        /// <param name="args">command-line arguments; args[0] is the path to the PDF file</param>
        static void Main(string[] args)
        {
            try
            {
                if (args.Length < 1)
                {
                    DisplayUsage();
                    return;
                }

                string file = args[0];
                if (!File.Exists(file))
                {
                    file = Path.GetFullPath(file);
                    if (!File.Exists(file))
                    {
                        Console.WriteLine("Please give in the path to the PDF file.");

                        // Nothing to convert: stop here instead of handing a
                        // non-existent file to the parser.
                        return;
                    }
                }

                PDFParser pdfParser = new PDFParser();
                string outputFile = Path.GetFileNameWithoutExtension(file) + ".txt";

                // ExtractText signals failure through its return value, not by throwing.
                if (!pdfParser.ExtractText(file, outputFile))
                {
                    Console.WriteLine();
                    Console.WriteLine("Could not extract text from " + file);
                }
            }
            catch (Exception exc)
            {
                Console.WriteLine(exc);
            }
        }

        /// <summary>
        /// Prints the command-line usage text to the console.
        /// </summary>
        static void DisplayUsage()
        {
            Console.WriteLine();
            Console.WriteLine("Usage:\tpdftotext FILE");
            Console.WriteLine();
            Console.WriteLine("\tFILE\t the path to the PDF file, it may be relative or absolute.");
            Console.WriteLine();
        }
    }
}

问题：不支持中文，没有布局，仅仅是把每页的所有文字抽取出来。如果想真正实现 PDFtoTxt，仍然有很长的路要走，但毕竟是个好的开始。

from http://www.codeproject.com/useritems/PDFToText.asp

posted on 2006-06-16 07:26 RubyPDF 阅读(3302) 评论(8) 编辑收藏所属分类: iTextSharp(iText#)

发表评论

回复引用查看

2006-06-16 08:14 | 自適應軟件......

暈,這是個開源的,早就Release出來了.不過,有點遺憾的是,WritePDF的時候,換行的時候,不能判斷行首位字母的時候,自動換行!

另外,把文字從PDF抽出來的時候,怎麼不支持中文呢?不解.還有,既然你已經把文字抽出來了,為甚麼不能實現PDFtoTxt呢,你直接寫到Txt,或者Word不就行了?

回复引用查看

2006-06-16 09:17 | HardRock

这个不是软件，只是一段应用代码，而且也不是我写的，至于为什么不支持中文，这个就不想多说了，至少目前我还没有能力实现它。
关于你说的首位字母的问题，这个有人已经实现，但不愿意公布方法，我也没有研究过，一是能力问题，另外一个是因为写PDF不是我的研究重点。
PDFToText涉及很多问题的，有兴趣你可以看看XPDF的一个工具就比较清楚了。

回复引用查看

2006-07-20 11:46 | zwg51666 [未注册用户]

谢谢HardRock ,那多页tif怎么处理,我现在每次只能得到第一页,谢谢

回复引用查看

2006-07-20 11:55 | HardRock

@zwg51666
你在搞什么？再把一个问题发几遍或者到处乱发，我删除了!

回复引用查看

2006-07-20 12:01 | zwg51666 [未注册用户]

你好,你看下我的代码,为什么,他还是裁图了,我想得到缩放的效果,
if (tif1.ScaledWidth>760)
{
float tempin=(760/tif1.ScaledWidth);
tempin*=100;
this.textBox1.Text=tempin.ToString();
tif1.ScalePercent(tempin);

}
谢谢

回复引用查看

2006-07-20 18:13 | zwg51666 [未注册用户]

我的问题解决了,谢

回复引用查看

2007-03-07 16:40 | minghong [未注册用户]

用xpdf（http://www.foolabs.com/xpdf/about.html）中的pdftotext，加上enc就可以抽出中文了，例如
pdftotext -layout -enc UTF-8 test.pdf

回复引用查看

2007-10-11 05:06 | Ottoniel [未注册用户]

Esta bien, pero no entiendo nada :D

刷新页面返回顶部

diction

公告

PDFToText with iTextSharp -- Extract text from PDF in C# (100% .NET)（推荐）