PDFToText with ITextSharp--Extract text from PDF in C# (100% .NET)(推荐)
Posted on 2008-03-25 11:34 黑*马 阅读(1804) 评论(1) 编辑 收藏 举报
using System;
using System.IO;
using iTextSharp.text.pdf;
namespace PdfToText
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
// PDF content-stream operators referenced by this parser:
// BT = Beginning of a text object operator
// ET = End of a text object operator
// Td move to the start of next line
// 5 Ts = superscript
// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// Length of the sliding window of most recently read characters that
/// ExtractTextFromPDFBytes maintains and passes to CheckToken.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts the text of every page of a PDF file and writes it, pages separated
/// by a single space, to a UTF-8 text file. A 68-character '#' progress bar is
/// drawn on the console while the pages are processed.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>
/// true when every page was written; false when any exception occurred
/// (the reason is reported on the standard error stream).
/// </returns>
public bool ExtractText(string inFileName, string outFileName)
{
    // Width of the console progress bar, in '#' characters.
    const int totalLen = 68;

    PdfReader reader = null;
    StreamWriter outFile = null;
    try
    {
        // Create a reader for the given PDF file
        reader = new PdfReader(inFileName);
        outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
        Console.Write("Processing: ");

        int pageCount = reader.NumberOfPages;
        int totalWritten = 0;
        for (int page = 1; page <= pageCount; page++)
        {
            outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

            // Draw the bar up to this page's proportional share. Working from
            // the running page index avoids the fraction loss of accumulating
            // (and then resetting) a per-page float increment.
            int target = (int)((long)page * totalLen / pageCount);
            for (; totalWritten < target; totalWritten++)
            {
                Console.Write("#");
            }
        }

        // Complete the bar; this also covers a document without pages.
        for (; totalWritten < totalLen; totalWritten++)
        {
            Console.Write("#");
        }
        return true;
    }
    catch (Exception ex)
    {
        // Callers only receive a bool, so say why the extraction failed
        // instead of discarding the exception silently.
        Console.Error.WriteLine();
        Console.Error.WriteLine("Failed to extract text from '" + inFileName + "': " + ex.Message);
        return false;
    }
    finally
    {
        // Release the PDF first; the original never closed the reader.
        if (reader != null) reader.Close();
        if (outFile != null) outFile.Close();
    }
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// Scans the bytes of an uncompressed PDF page content stream and collects the
/// characters of its string operands, i.e. the text found between '(' and ')'
/// inside BT ... ET text objects.
/// </summary>
/// <param name="input">the uncompressed content stream; may be null or empty.</param>
/// <returns>
/// the extracted text, or an empty string when <paramref name="input"/> is
/// null or empty, or when scanning throws.
/// </returns>
/// <remarks>
/// Only one level of parentheses is tracked, and the character that follows a
/// backslash is copied as-is; nested parentheses and escape sequences are
/// therefore not decoded.
/// </remarks>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0) return "";

    // Operator groups are built once here instead of once per input byte.
    string[] moveTokens = new string[] { "TD", "Td" };
    string[] newLineTokens = new string[] { "'", "T*", "\"" };
    string[] showTokens = new string[] { "Tj" };
    string[] endTokens = new string[] { "ET" };
    string[] beginTokens = new string[] { "BT" };

    try
    {
        // A builder keeps the cost linear in the stream size; appending to a
        // string for every character was quadratic.
        System.Text.StringBuilder result = new System.Text.StringBuilder();

        // True while the scan position is between BT and ET.
        bool inTextObject = false;
        // True when the previous character was an unescaped backslash, so the
        // current character must be taken literally (e.g. "\(" gives "(").
        bool nextLiteral = false;
        // 1 while inside a "( ... )" string operand, otherwise 0.
        int bracketDepth = 0;

        // Sliding window of the most recent characters, used to recognise
        // operators after they have been read.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Operators only count outside of string operands.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveTokens, previousCharacters))
                    {
                        // Was "\n\r": the carriage return and line feed were
                        // emitted in reverse order.
                        result.Append("\r\n");
                    }
                    else if (CheckToken(newLineTokens, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(showTokens, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endTokens, previousCharacters))
                {
                    // End of the text object: separate it from what follows.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start outputting text
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // Stop outputting text
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Escape introducer: emit nothing, take the next
                        // character verbatim.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the high range 128..254;
                        // everything else (control bytes, 127, 255) is dropped,
                        // exactly as before.
                        if (((c >= ' ') && (c <= '~')) ||
                            ((c >= 128) && (c < 255)))
                        {
                            result.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Shift the window left by one and append the current character.
            for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[_numberOfCharsToKeep - 1] = c;

            // Start of a text object
            if (!inTextObject && CheckToken(beginTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }
        return result.ToString();
    }
    catch
    {
        // Deliberate best effort, kept from the original: a stream that cannot
        // be scanned contributes no text instead of aborting the document.
        return "";
    }
}
#endregion
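// [Collapsed region "CheckToken": the CheckToken(string[], char[]) helper called above was not included in this listing.]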
}
}
usage:
问题,不支持中文,没有布局,仅仅是把每页的所有文字抽取出来,如果想真正实现PDFtoTxt,仍然有好多路要走,但毕竟是个好的开始。
from http://www.codeproject.com/useritems/PDFToText.asp
using System;
using System.IO;
using iTextSharp.text.pdf;
namespace PdfToText
{
/// <summary>
/// Parses a PDF file and extracts the text from it.
/// </summary>
public class PDFParser
{
// PDF content-stream operators referenced by this parser:
// BT = Beginning of a text object operator
// ET = End of a text object operator
// Td move to the start of next line
// 5 Ts = superscript
// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// Length of the sliding window of most recently read characters that
/// ExtractTextFromPDFBytes maintains and passes to CheckToken.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractText
/// <summary>
/// Extracts the text of every page of a PDF file and writes it, pages separated
/// by a single space, to a UTF-8 text file. A 68-character '#' progress bar is
/// drawn on the console while the pages are processed.
/// </summary>
/// <param name="inFileName">the full path to the pdf file.</param>
/// <param name="outFileName">the output file name.</param>
/// <returns>
/// true when every page was written; false when any exception occurred
/// (the reason is reported on the standard error stream).
/// </returns>
public bool ExtractText(string inFileName, string outFileName)
{
    // Width of the console progress bar, in '#' characters.
    const int totalLen = 68;

    PdfReader reader = null;
    StreamWriter outFile = null;
    try
    {
        // Create a reader for the given PDF file
        reader = new PdfReader(inFileName);
        outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);
        Console.Write("Processing: ");

        int pageCount = reader.NumberOfPages;
        int totalWritten = 0;
        for (int page = 1; page <= pageCount; page++)
        {
            outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

            // Draw the bar up to this page's proportional share. Working from
            // the running page index avoids the fraction loss of accumulating
            // (and then resetting) a per-page float increment.
            int target = (int)((long)page * totalLen / pageCount);
            for (; totalWritten < target; totalWritten++)
            {
                Console.Write("#");
            }
        }

        // Complete the bar; this also covers a document without pages.
        for (; totalWritten < totalLen; totalWritten++)
        {
            Console.Write("#");
        }
        return true;
    }
    catch (Exception ex)
    {
        // Callers only receive a bool, so say why the extraction failed
        // instead of discarding the exception silently.
        Console.Error.WriteLine();
        Console.Error.WriteLine("Failed to extract text from '" + inFileName + "': " + ex.Message);
        return false;
    }
    finally
    {
        // Release the PDF first; the original never closed the reader.
        if (reader != null) reader.Close();
        if (outFile != null) outFile.Close();
    }
}
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// Scans the bytes of an uncompressed PDF page content stream and collects the
/// characters of its string operands, i.e. the text found between '(' and ')'
/// inside BT ... ET text objects.
/// </summary>
/// <param name="input">the uncompressed content stream; may be null or empty.</param>
/// <returns>
/// the extracted text, or an empty string when <paramref name="input"/> is
/// null or empty, or when scanning throws.
/// </returns>
/// <remarks>
/// Only one level of parentheses is tracked, and the character that follows a
/// backslash is copied as-is; nested parentheses and escape sequences are
/// therefore not decoded.
/// </remarks>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0) return "";

    // Operator groups are built once here instead of once per input byte.
    string[] moveTokens = new string[] { "TD", "Td" };
    string[] newLineTokens = new string[] { "'", "T*", "\"" };
    string[] showTokens = new string[] { "Tj" };
    string[] endTokens = new string[] { "ET" };
    string[] beginTokens = new string[] { "BT" };

    try
    {
        // A builder keeps the cost linear in the stream size; appending to a
        // string for every character was quadratic.
        System.Text.StringBuilder result = new System.Text.StringBuilder();

        // True while the scan position is between BT and ET.
        bool inTextObject = false;
        // True when the previous character was an unescaped backslash, so the
        // current character must be taken literally (e.g. "\(" gives "(").
        bool nextLiteral = false;
        // 1 while inside a "( ... )" string operand, otherwise 0.
        int bracketDepth = 0;

        // Sliding window of the most recent characters, used to recognise
        // operators after they have been read.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Operators only count outside of string operands.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveTokens, previousCharacters))
                    {
                        // Was "\n\r": the carriage return and line feed were
                        // emitted in reverse order.
                        result.Append("\r\n");
                    }
                    else if (CheckToken(newLineTokens, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(showTokens, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endTokens, previousCharacters))
                {
                    // End of the text object: separate it from what follows.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start outputting text
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // Stop outputting text
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Escape introducer: emit nothing, take the next
                        // character verbatim.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Keep printable ASCII and the high range 128..254;
                        // everything else (control bytes, 127, 255) is dropped,
                        // exactly as before.
                        if (((c >= ' ') && (c <= '~')) ||
                            ((c >= 128) && (c < 255)))
                        {
                            result.Append(c);
                        }
                        nextLiteral = false;
                    }
                }
            }

            // Shift the window left by one and append the current character.
            for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[_numberOfCharsToKeep - 1] = c;

            // Start of a text object
            if (!inTextObject && CheckToken(beginTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }
        return result.ToString();
    }
    catch
    {
        // Deliberate best effort, kept from the original: a stream that cannot
        // be scanned contributes no text instead of aborting the document.
        return "";
    }
}
#endregion
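// [Collapsed region "CheckToken": the CheckToken(string[], char[]) helper called above was not included in this listing.]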
}
}
usage:
using System;
using System.Text;
using System.IO;
namespace PdfToText
{
/// <summary>
/// The main entry point to the program.
/// </summary>
class Program
{
    /// <summary>
    /// Converts the PDF named by the first command-line argument into a text
    /// file called "&lt;pdf name without extension&gt;.txt". The output name carries
    /// no directory part, so it is resolved against the current directory.
    /// </summary>
    /// <param name="args">command-line arguments; args[0] is the PDF path.</param>
    static void Main(string[] args)
    {
        try
        {
            if (args.Length < 1)
            {
                DisplayUsage();
                return;
            }

            string file = args[0];
            if (!File.Exists(file))
            {
                // Retry with the absolute form of the path before giving up.
                file = Path.GetFullPath(file);
                if (!File.Exists(file))
                {
                    Console.WriteLine("Please give in the path to the PDF file.");
                    // Stop here: previously execution fell through and tried
                    // to parse the file that had just been reported missing.
                    return;
                }
            }

            PDFParser pdfParser = new PDFParser();
            bool succeeded = pdfParser.ExtractText(file, Path.GetFileNameWithoutExtension(file) + ".txt");
            if (!succeeded)
            {
                // ExtractText signals failure only through its return value;
                // the blank line ends the progress-bar line it may have begun.
                Console.WriteLine();
                Console.WriteLine("Could not extract text from: " + file);
            }
        }
        catch (Exception exc)
        {
            Console.WriteLine(exc);
        }
    }

    /// <summary>
    /// Prints the command-line usage help to the console.
    /// </summary>
    static void DisplayUsage()
    {
        Console.WriteLine();
        Console.WriteLine("Usage:\tpdftotext FILE");
        Console.WriteLine();
        Console.WriteLine("\tFILE\t the path to the PDF file, it may be relative or absolute.");
        Console.WriteLine();
    }
}
}
using System.Text;
using System.IO;
namespace PdfToText
{
/// <summary>
/// The main entry point to the program.
/// </summary>
class Program
{
    /// <summary>
    /// Converts the PDF named by the first command-line argument into a text
    /// file called "&lt;pdf name without extension&gt;.txt". The output name carries
    /// no directory part, so it is resolved against the current directory.
    /// </summary>
    /// <param name="args">command-line arguments; args[0] is the PDF path.</param>
    static void Main(string[] args)
    {
        try
        {
            if (args.Length < 1)
            {
                DisplayUsage();
                return;
            }

            string file = args[0];
            if (!File.Exists(file))
            {
                // Retry with the absolute form of the path before giving up.
                file = Path.GetFullPath(file);
                if (!File.Exists(file))
                {
                    Console.WriteLine("Please give in the path to the PDF file.");
                    // Stop here: previously execution fell through and tried
                    // to parse the file that had just been reported missing.
                    return;
                }
            }

            PDFParser pdfParser = new PDFParser();
            bool succeeded = pdfParser.ExtractText(file, Path.GetFileNameWithoutExtension(file) + ".txt");
            if (!succeeded)
            {
                // ExtractText signals failure only through its return value;
                // the blank line ends the progress-bar line it may have begun.
                Console.WriteLine();
                Console.WriteLine("Could not extract text from: " + file);
            }
        }
        catch (Exception exc)
        {
            Console.WriteLine(exc);
        }
    }

    /// <summary>
    /// Prints the command-line usage help to the console.
    /// </summary>
    static void DisplayUsage()
    {
        Console.WriteLine();
        Console.WriteLine("Usage:\tpdftotext FILE");
        Console.WriteLine();
        Console.WriteLine("\tFILE\t the path to the PDF file, it may be relative or absolute.");
        Console.WriteLine();
    }
}
}
问题,不支持中文,没有布局,仅仅是把每页的所有文字抽取出来,如果想真正实现PDFtoTxt,仍然有好多路要走,但毕竟是个好的开始。
from http://www.codeproject.com/useritems/PDFToText.asp
发表评论
2006-06-16 08:14 | 自適應軟件......
暈,這是個開源的,早就Release出來了.不過,有點遺憾的是,WritePDF的時候,換行的時候,不能判斷行首位字母的時候,自動換行!
另外,把文字從PDF抽出來的時候,怎麼不支持中文呢?不解.還有,既然你已經把文字抽出來了,為甚麼不能實現PDFtoTxt呢,你直接寫到Txt,或者Word不就行了?
另外,把文字從PDF抽出來的時候,怎麼不支持中文呢?不解.還有,既然你已經把文字抽出來了,為甚麼不能實現PDFtoTxt呢,你直接寫到Txt,或者Word不就行了?
这个不是软件,只是一段应用代码,而且也不是我写的,至于为什么不支持中文,这个就不想多说了,至少目前我还没有能力实现它。
关于你说的首位字母的问题,这个有人已经实现,但不愿意公布方法,我也没有研究过,一是能力问题,另外一个是因为写PDF不是我的研究重点。
PDFToText涉及很多问题的,有兴趣你可以看看XPDF的一个工具就比较清楚了。
关于你说的首位字母的问题,这个有人已经实现,但不愿意公布方法,我也没有研究过,一是能力问题,另外一个是因为写PDF不是我的研究重点。
PDFToText涉及很多问题的,有兴趣你可以看看XPDF的一个工具就比较清楚了。
2006-07-20 11:46 | zwg51666 [未注册用户]
谢谢HardRock ,那多页tif怎么处理,我现在每次只能得到第一页,谢谢
2006-07-20 12:01 | zwg51666 [未注册用户]
你好,你看下我的代码,为什么,他还是裁图了,我想得到缩放的效果,
// Scale the image down only when its scaled width exceeds 760.
if (760 < tif1.ScaledWidth)
{
    // Ratio of the 760 limit to the current width, then turned into a percentage.
    float percent = 760 / tif1.ScaledWidth;
    percent = percent * 100;
    // Show the computed percentage in the text box before applying it.
    this.textBox1.Text = percent.ToString();
    tif1.ScalePercent(percent);
}
谢谢
// Scale the image down only when its scaled width exceeds 760.
if (760 < tif1.ScaledWidth)
{
    // Ratio of the 760 limit to the current width, then turned into a percentage.
    float percent = 760 / tif1.ScaledWidth;
    percent = percent * 100;
    // Show the computed percentage in the text box before applying it.
    this.textBox1.Text = percent.ToString();
    tif1.ScalePercent(percent);
}
谢谢
2006-07-20 18:13 | zwg51666 [未注册用户]
我的问题解决了,谢
2007-03-07 16:40 | minghong [未注册用户]
用xpdf(http://www.foolabs.com/xpdf/about.html)中的pdftotext,加上enc就可以抽出中文了,例如
pdftotext -layout -enc UTF-8 test.pdf
pdftotext -layout -enc UTF-8 test.pdf
2007-10-11 05:06 | Ottoniel [未注册用户]
Esta bien, pero no entiendo nada :D