type
  PPdfObj = ^TPdfObj;
  TPdfObj = record
    number,
    offset: integer;
  end;

function GetPdfPageCount(const filename: string): integer;
var
  ms: TMemoryStream;
  k, cnt, pagesNum, rootNum: integer;
  p, p2: pchar;
  PdfObj: PPdfObj;
  PdfObjList: TList;

  //Summary of steps taken to parse PDF file for page count :-
  //1. Locate 'startxref' at end of file
  //2. get 'xref' offset and go to xref table
  //3. fill my pdfObj List with object numbers and offsets
  //4. handle subsections within xref table.
  //5. read 'trailer' section at end of each xref
  //6. store 'Root' object number if found in 'trailer'
  //7. if 'Prev' xref found in 'trailer' - loop back to step 2
  //8. locate Root in my full pdfObj List
  //9. locate 'Pages' object from Root
  //10. get Count from Pages.

  function GetNumber(out num: integer): boolean;
  var
    tmpStr: string;
  begin
    tmpStr := '';
    while p^ < #33 do inc(p); //skip leading CR,LF & SPC
    while (p^ in ['0'..'9']) do
    begin
      tmpStr := tmpStr + p^;
      inc(p);
    end;
    result := tmpStr <> '';
    if not result then exit;
    num := strtoint(tmpStr);
  end;

  function IsString(const str: string): boolean;
  var
    len: integer;
  begin
    len := length(str);
    result := CompareMem( p, pchar(str), len);
    inc(p, len);
  end;

  function FindStrInDict(const str: string): boolean;
  begin
    //PDF 'dictionaries' (assoc. arrays) terminate with '>>'
    result := false;
    while not result do
    begin
      while (p^ <> '>') and (p^ <> str[1]) do inc(p);
      if (p^ = '>') then
      begin
        inc(p);
        if (p^ = '>') then exit else continue;
      end;
      result := IsString(str);
    end;
  end;

begin
  //on error return -1 as page count
  result := -1;
  try
    ms := TMemoryStream.Create;
    PdfObjList := TList.Create;
    screen.Cursor := crHourGlass;
    application.ProcessMessages;
    try
      ms.LoadFromFile(filename);

      //find 'startxref' ignoring '%%EOF'
      p := pchar(ms.Memory) + ms.Size -5;
      //21-Jun-05: bugfix
      //sometimes rubbish is appended to the pdf so
      //look deeper for 'startxref'
      p2 := pchar(ms.Memory);
      repeat
        while (p > p2) and (p^ <> 'f') do dec(p);
        if (p = p2) then exit;
        if StrLComp( (p-8), 'startxref', 9) = 0 then break;
        dec(p);
      until false;
      inc(p);

      rootNum := -1; //ie flag not yet found

      //xref offset ==> k
      if not GetNumber(k) then exit;
      p :=  pchar(ms.Memory) + k +4;

      while true do //top of loop  //////////////////////////////
      begin
        //get base object number ==> k
        if not GetNumber(k) then exit;
        //get object count ==> cnt
        if not GetNumber(cnt) then exit;
        while not (p^ in ['0'..'9']) do inc(p); //skip CR, LF
        p2 := p;
        //add all objects in section to list ...
        for cnt := 0 to cnt-1 do
        begin
          new(PdfObj);
          PdfObjList.Add(PdfObj);
          PdfObj.number := k + cnt;
          if not GetNumber(PdfObj.offset) then exit;
          inc(p2,20);
          p := p2;
        end;
        //check for and process further subsections ...
        if p^ in ['0'..'9'] then continue;

        // parse 'trailer dictionary' ...
        if not IsString('trailer') then exit;
        p2 := p;
        // get Root (aka /Catalog) ...
        if (rootNum = -1) and FindStrInDict('/Root') then
          if not GetNumber(rootNum) then exit;
        p := p2;
        if not FindStrInDict('/Prev') then break; //no more xrefs

        //next xref offset ==> k
        if not GetNumber(k) then exit;
        p :=  pchar(ms.Memory) + k +4;

      end; //bottom of loop /////////////////////////////////////

      //Make sure we've got Root the object number ...
      if rootNum < 0 then exit;
      //Find Root object in list and go to its offset ...
      k := 0;
      while k < PdfObjList.Count do
        if PPdfObj(PdfObjList[k]).number = rootNum then
          break else
          inc(k);
      if k = PdfObjList.Count then exit;
      p := pchar(ms.Memory) + PPdfObj(PdfObjList[k]).offset;
      //double check that this is the Root object ...
      if not GetNumber(k) or (k <> rootNum) then exit;
      if not FindStrInDict('/Pages') then exit;
      //get Pages object number ==> pagesNum
      if not GetNumber(pagesNum) then exit;
      k := 0;
      while k < PdfObjList.Count do
        if PPdfObj(PdfObjList[k]).number = pagesNum then
          break else
          inc(k);
      if k = PdfObjList.Count then exit;
      //Pages object found in list, now go to offset ...
      p := pchar(ms.Memory) + PPdfObj(PdfObjList[k]).offset;
      //make sure it's the Pages object ...
      if not GetNumber(k) or (k <> pagesNum) then exit;
      if not FindStrInDict('/Count') then exit;
      if not GetNumber(cnt) then exit;
      //21-Jun-05: bugfix
      //occasionally the 'count' value is an indirect object
      if GetNumber(k) and IsString(' R') then
      begin
        //this is an indirect object to the count value,
        //so find the obj ...
        k := 0;
        while k < PdfObjList.Count do
          if PPdfObj(PdfObjList[k]).number = cnt then break else inc(k);
        if k = PdfObjList.Count then exit;
        p := pchar(ms.Memory) + PPdfObj(PdfObjList[k]).offset;
        if not GetNumber(k) or //skip the object num
          not GetNumber(k) or //skip the generation num
          not IsString(' obj') or
          not GetNumber(cnt) then exit;
      end;
      result := cnt;
    finally
      screen.Cursor := crDefault;
      for k := 0 to PdfObjList.Count -1 do
        dispose(PPdfObj(PdfObjList[k]));
      PdfObjList.Free;
      ms.Free;
    end;
  except
    //nb: errors are flagged by returning -1
  end;
end;

from http://www.angusj.com/delphitips/pdfpagecount.php
posted on 2006-06-26 16:12  RubyPDF  阅读(2902)  评论(0编辑  收藏  举报