PDFToText with iTextSharp -- Extract text from PDF in C# (100% .NET)（推荐） - RubyPDF

PDFToText with iTextSharp -- Extract text from PDF in C# (100% .NET)（推荐）

using System;

using System.IO;

using iTextSharp.text.pdf;

namespace PdfToText

{

/// <summary>

/// Parses a PDF file and extracts the text from it.

/// </summary>

public class PDFParser

{

/// BT = Beginning of a text object operator

/// ET = End of a text object operator

/// Td move to the start of next line

/// 5 Ts = superscript

/// -5 Ts = subscript

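// [Collapsed region "Fields": its contents were not captured in this copy. It presumably declares _numberOfCharsToKeep, which ExtractTextFromPDFBytes reads.]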

#region ExtractText

/// <summary>
/// Extracts the text of every page of a PDF file and writes it, UTF-8 encoded,
/// to a text file, while drawing a 68-character progress bar on the console.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name; an existing file is overwritten.</param>
/// <returns>
/// true when every page was processed; false when any exception was raised
/// while reading the PDF or writing the output.
/// </returns>
public bool ExtractText(string inFileName, string outFileName)
{
    // Width of the progress bar, in '#' marks.
    const int totalLen = 68;

    PdfReader reader = null;
    StreamWriter outFile = null;

    try
    {
        // Create a reader for the given PDF file
        reader = new PdfReader(inFileName);

        // 'false' = do not append: the output file is replaced.
        outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

        Console.Write("Processing: ");

        int pageCount = reader.NumberOfPages;
        int totalWritten = 0;

        for (int page = 1; page <= pageCount; page++)
        {
            // Pages are separated by a single space in the output.
            outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

            // Write the progress. The number of marks due after this page is
            // computed cumulatively, so no fractional progress is lost and the
            // bar is exactly full after the last page.
            int marksDue = (int)((long)page * totalLen / pageCount);
            for (; totalWritten < marksDue; totalWritten++)
            {
                Console.Write("#");
            }
        }

        // Only reached with marks missing when the document has no pages:
        // the bar is still drawn completely.
        for (; totalWritten < totalLen; totalWritten++)
        {
            Console.Write("#");
        }

        return true;
    }
    catch
    {
        // The method's contract is a success flag, so failures are reported
        // through the return value rather than propagated.
        return false;
    }
    finally
    {
        // Release both resources; the nested finally makes sure the reader is
        // closed even if closing the writer throws.
        try
        {
            if (outFile != null) outFile.Close();
        }
        finally
        {
            if (reader != null) reader.Close();
        }
    }
}

#endregion

#region ExtractTextFromPDFBytes

/// <summary>
/// Scans the bytes of an uncompressed PDF content stream and collects the
/// characters of the literal strings "( ... )" that occur inside text objects
/// (between the BT and ET operators).
/// </summary>
/// <param name="input">uncompressed content-stream bytes; may be null or empty.</param>
/// <returns>
/// the collected text; an empty string when <paramref name="input"/> is null
/// or empty, or when any exception is raised during the scan.
/// </returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0) return "";

    try
    {
        // A builder avoids re-copying the whole result for every appended character.
        System.Text.StringBuilder resultString = new System.Text.StringBuilder();

        // Operator groups handed to CheckToken. They are allocated once instead
        // of once per input byte; this assumes CheckToken only reads them
        // (its body is not visible here -- TODO confirm).
        string[] moveOperators = new string[] { "TD", "Td" };
        string[] nextLineOperators = new string[] { "'", "T*", "\"" };
        string[] showOperators = new string[] { "Tj" };
        string[] endTextOperators = new string[] { "ET" };
        string[] beginTextOperators = new string[] { "BT" };

        // Flag showing if we are currently inside a text object
        bool inTextObject = false;

        // Flag showing if the next character is literal
        // e.g. '\\' to get a '\' character or '\(' to get '('
        bool nextLiteral = false;

        // () Bracket nesting level. Text appears inside ()
        int bracketDepth = 0;

        // Sliding window of the most recently seen characters, used by
        // CheckToken to recognise operators. It starts filled with spaces.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < previousCharacters.Length; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            // Each byte is widened to a char as-is; no text decoding is applied.
            char c = (char)input[i];

            if (inTextObject)
            {
                // Position the text: outside a string, a positioning or
                // show-text operator in the window emits a separator.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveOperators, previousCharacters))
                    {
                        // NOTE(review): "\n\r" looks like a reversed CR/LF pair;
                        // kept unchanged so existing output does not change.
                        resultString.Append("\n\r");
                    }
                    else if (CheckToken(nextLineOperators, previousCharacters))
                    {
                        resultString.Append("\n");
                    }
                    else if (CheckToken(showOperators, previousCharacters))
                    {
                        resultString.Append(" ");
                    }
                }

                // End of a text object: leave text mode and emit a separator.
                if (bracketDepth == 0 && CheckToken(endTextOperators, previousCharacters))
                {
                    inTextObject = false;
                    resultString.Append(" ");
                }
                else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                {
                    // Start outputting text
                    bracketDepth = 1;
                }
                else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                {
                    // Stop outputting text
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    // Just a normal text character:
                    if (c == '\\' && !nextLiteral)
                    {
                        // A backslash makes the following character literal;
                        // the backslash itself is not output.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the range 128..254; every
                        // other value is silently dropped.
                        if (((c >= ' ') && (c <= '~')) ||
                            ((c >= 128) && (c < 255)))
                        {
                            resultString.Append(c);
                        }

                        nextLiteral = false;
                    }
                }
            }

            // Store the recent characters for when we have to go back for a
            // check: shift the window left by one (Array.Copy handles the
            // overlapping ranges) and put the current character at the end.
            Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
            previousCharacters[previousCharacters.Length - 1] = c;

            // Start of a text object
            if (!inTextObject && CheckToken(beginTextOperators, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return resultString.ToString();
    }
    catch
    {
        // Any failure while scanning yields an empty result for this input.
        return "";
    }
}

#endregion

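// [Collapsed region "CheckToken": its contents were not captured in this copy. ExtractTextFromPDFBytes calls CheckToken(string[], char[]) and uses the result as a condition.]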

}

usage:

using System;

using System.Text;

using System.IO;

namespace PdfToText
{
    /// <summary>
    /// Console front end: converts the PDF named on the command line to a
    /// text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// The main entry point to the program.
        /// </summary>
        /// <param name="args">
        /// command-line arguments; args[0] is the path to the PDF file,
        /// relative or absolute.
        /// </param>
        static void Main(string[] args)
        {
            try
            {
                if (args.Length < 1)
                {
                    DisplayUsage();
                    return;
                }

                string file = args[0];
                if (!File.Exists(file))
                {
                    // Retry with the path made absolute.
                    file = Path.GetFullPath(file);
                    if (!File.Exists(file))
                    {
                        Console.WriteLine("Please give in the path to the PDF file.");
                        // Stop here: there is no input file to process.
                        return;
                    }
                }

                // The output is "<input name without extension>.txt"; no
                // directory is given, so it is resolved against the current
                // directory.
                string textFile = Path.GetFileNameWithoutExtension(file) + ".txt";

                PDFParser pdfParser = new PDFParser();
                if (!pdfParser.ExtractText(file, textFile))
                {
                    // ExtractText reports failure only through its return
                    // value, so surface it. The blank WriteLine ends the
                    // progress line that may have been started.
                    Console.WriteLine();
                    Console.WriteLine("Could not extract text from: " + file);
                }
            }
            catch (Exception exc)
            {
                Console.WriteLine(exc);
            }
        }

        /// <summary>
        /// Prints the command-line usage to the console.
        /// </summary>
        static void DisplayUsage()
        {
            Console.WriteLine();
            Console.WriteLine("Usage:\tpdftotext FILE");
            Console.WriteLine();
            Console.WriteLine("\tFILE\t the path to the PDF file, it may be relative or absolute.");
            Console.WriteLine();
        }
    }
}

问题：不支持中文，没有布局，仅仅是把每页的所有文字抽取出来。如果想真正实现 PDFtoTxt，仍然有好多路要走，但毕竟是个好的开始。

from http://www.codeproject.com/useritems/PDFToText.asp

posted on 2006-06-16 07:26 RubyPDF 阅读(8977) 评论(8) 收藏举报

刷新页面返回顶部