铷铯

导航

 

完成工作:

今天比较忙,没干什么。poppler基本整理清楚了,开始考虑接口的问题。

一个比较容易实现的想法是,显示采用CGPDF*库,取词用poppler。我暂时写了一个接口如下,如果有任何不方便的地方或修改建议,请告诉我。

1 /*
2 * PdfAnalyze.h
3 * PaperReader
4 *
5 * Created by Guangda Hu on 11-4-20.
6 * Copyright 2011 Ruse. All rights reserved.
7 *
8 */
9
10 #ifndef PDFANALYZE_H
11 #define PDFANALYZE_H
12
13 #include <cstdlib>
14 #include "TextOutputDev.h"
15 #include "PDFDoc.h"
16 #include "GooString.h"
17 #include "CharTypes.h"
18 #include "UnicodeMap.h"
19 #include "Dict.h"
20
21 class PdfAnalyze { // Word lookup, text extraction and search interface for one PDF file.
22 public:
23
24 /*
25 Constructor.
26 filename: PDF file to open.
27 ownerPW: owner password (defaults to NULL).
28 userPW: user password (defaults to NULL).
29 */
30 PdfAnalyze(const char *filename, const char *ownerPW = NULL, const char *userPW = NULL);
31
32 /*
33 Set the encoding. There is no need to call this if "UTF-8" is wanted.
34 Available encodings are "Latin1", "ASCII7", "Symbol", "ZapfDingbats", "UTF-8" and "UCS-2".
35 */
36 void setEncoding(const char *encName);
37
38 /*
39 Set the end-of-line convention. Candidates are "unix" (LF, default), "dos" (CR + LF), "mac" (CR).
40 */
41 void setEOL(const char *eolName);
42
43 /*
44 Return the number of pages.
45 */
46 int getNumPages();
47
48 /*
49 Query the PDF information dictionary.
50 key: can be "Title", "Subject", "Keywords", "Author", "Creator", "Producer", "CreationData" or "LastModifiedData". NOTE(review): the PDF info dictionary itself names the date entries "CreationDate" and "ModDate" -- confirm which spellings the implementation accepts.
51 If the dictionary is not usable, or key is not in the dictionary, NULL is returned.
52 For date information, the returned string may start with "D:".
53 */
54 const char * pdfInfo(const char *key);
55
56 /*
57 Process a page. This is called automatically by the first query on that page.
58 If processing takes long, you can process all pages at the beginning to provide a smoother interface.
59 */
60 void touchPage(int page);
61
62 /*
63 Query a word and its bounding box by page number and position. The four bounding-box pointers default to NULL.
64 The returned string is normalized to NFKC (ready for search).
65 See http://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms for full definition.
66 NULL is returned if there is no word at that position.
67 */
68 const char * wordQuery(int page, double x, double y, double *xMin = NULL, double *yMin = NULL,
69 double *xMax = NULL, double *yMax = NULL);
70
71 /*
72 Return the TextWord object hit by the position.
73 Font information, color and raw content can be accessed from the TextWord.
74 Do not modify the TextWord's content, or wordQuery may not work as expected because results are cached.
75 */
76 TextWord * getWord(int page, double x, double y);
77
78 /*
79 Get the text inside a rectangle.
80 The returned text is NOT normalized, so it may contain ligatures.
81 */
82 const char * textQuery(int page, double xMin, double yMin, double xMax, double yMax);
83
84 /*
85 Search for text in a page.
86
87 Start & End:
88 If <startAtTop> is true, starts looking at the top of the page;
89 else if <startAtLast> is true, starts looking immediately after the last find result;
90 else starts looking at <xMin>,<yMin>.
91 If <stopAtBottom> is true, stops looking at the bottom of the page;
92 else if <stopAtLast> is true, stops looking just before the last find result;
93 else stops looking at <xMax>,<yMax>.
94
95 Encoding:
96 See http://en.wikipedia.org/wiki/UTF-8 for the conversion between UTF-32 and UTF-8.
97 The UTF-8 overload is the usual choice for ASCII-based strings, but it converts to UTF-32 implicitly and then calls the second overload.
98 A Unicode (UTF-32, no higher than 0x10ffff) overload is therefore also provided for efficiency.
99
100 Automatically NFKC-normalized.
101 */
102 bool findText(int page, const char *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast,
103 bool caseSensitive, bool backward, double *xMin, double *yMin, double *xMax, double *yMax); // UTF-8
104 bool findText(int page, const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast,
105 bool caseSensitive, bool backward, double *xMin, double *yMin, double *xMax, double *yMax); // Unicode
106
107 /*
108 Start over from the first reference.
109 */
110 void resetRef();
111
112 /*
113 Return the next reference, without its "[1]" or "[2]" marker.
114 NULL is returned when the last reference has been reached.
115 */
116 const char * nextReference();
117
118 ~PdfAnalyze();
119
120 private:
121
122 const double resolution;
123
124 UnicodeMap *uMap;
125
126 PDFDoc *doc;
127
128 Object info;
129 Dict *infoDict;
130 GooString lastDict;
131
132 int numPages;
133 TextPage **pages;
134
135 const int cache_size;
136 int *cacheCnt;
137 TextWord **wordCache;
138 GooString *queryCache;
139
140 GooString *lastText;
141
142 int refPage, refStartPage;
143 TextFlow *refFlow, *refStartFlow;
144 TextBlock *refBlock, *refStartBlock;
145 TextLine *refLine, *refStartLine;
146 GooString lastRef;
147 };
148
149 #endif /* PDFANALYZE_H */

另外,我不太清楚objective c和c++传递字符串的方法。改成.mm就可以混合编程,但不知道内存分配会不会有问题,尤其是这种指针在objective c里能不能用我完全不知道。(objective c怎么写hello world我都不会。)

明天计划:

写完PdfAnalyze.cpp,脱离对poppler纠结状态。

posted on 2011-04-22 00:44  se2012  阅读(258)  评论(0)  编辑  收藏  举报