iText实现URL页面转PDF

原文：http://www.micmiu.com/opensource/expdoc/itext-url-pdf/

目录：

概述
软件要求
实现过程

[一]、概述

前面已经介绍了如何实现对HTML中文字符的转换以及HTML文件生成PDF文件的基本方法，本文主要演示下如何把URL地址对应的内容直接转换生成PDF文件，这个需求也有很多的应用场景，最简单的应用场景比如：自己blog中的文章如何转PDF，如果能生成PDF文件，一方面可以方便自己的阅读，亦可作为一种备份。

[二]、软件要求

如果URL地址内容包含中文字符，需要XML Worker能支持中文字符转换（详见：http://www.micmiu.com/opensource/expdoc/itext-xml-worker-cn/）
Java 的HTML解析器，这里选择：jsoup （官网：http://jsoup.org/），如果是 maven 构建项目的，直接在pom文件中增加jsoup的依赖配置即可：

XHTML

1

2

3

4

5

6

7

<!-- jsoup: the HTML parser the demo uses to fetch and parse the page -->
<dependency>

<groupId>org.jsoup</groupId>

<artifactId>jsoup</artifactId>

<version>1.7.1</version>

<type>jar</type>

<scope>compile</scope>

</dependency>

[三]、实现过程

以我的blog：http://www.micmiu.com/os/linux/shell-dev-null/ 为例，和HTML文件转PDF类似同样有两种方法，详细介绍见下面的具体实现代码中的注释。

Java实现代码：Demo4URL2PDF.java

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

package com.micmiu.pdf.itext;

import java.io.ByteArrayInputStream;

import java.io.FileOutputStream;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.util.ArrayList;

import java.util.List;

import org.jsoup.Jsoup;

import com.itextpdf.text.BaseColor;

import com.itextpdf.text.Chapter;

import com.itextpdf.text.Chunk;

import com.itextpdf.text.Document;

import com.itextpdf.text.Element;

import com.itextpdf.text.Font;

import com.itextpdf.text.PageSize;

import com.itextpdf.text.Paragraph;

import com.itextpdf.text.Section;

import com.itextpdf.text.WritableDirectElement;

import com.itextpdf.text.pdf.BaseFont;

import com.itextpdf.text.pdf.PdfWriter;

import com.itextpdf.text.pdf.draw.LineSeparator;

import com.itextpdf.tool.xml.ElementHandler;

import com.itextpdf.tool.xml.Writable;

import com.itextpdf.tool.xml.XMLWorkerHelper;

import com.itextpdf.tool.xml.pipeline.WritableElement;

/**
 * Demo: converts the page behind a URL into a PDF, either directly or via
 * iText elements.
 *
 * @author <a href="http://www.micmiu.com">Michael Sun</a>
 */

public class Demo4URL2PDF {

/**

* @param args

public static void main(String[] args) throws Exception {

String blogURL = "http://www.micmiu.com/os/linux/shell-dev-null/";

// 直接把网页内容转为PDF文件

String pdfFile = "d:/test/itext/demo-URL.pdf";

Demo4URL2PDF.parseURL2PDFFile(pdfFile, blogURL);

// 把网页内容转为PDF中的Elements

String pdfFile2 = "d:/test/itext/demo-URL2.pdf";

Demo4URL2PDF.parseURL2PDFElement(pdfFile2, blogURL);

}

/**

* 根据URL提前blog的基本信息，返回结果>>:[主题 ,分类,日期,内容]等.

* @param blogURL

* @return

* @throws Exception

public static String[] extractBlogInfo(String blogURL) throws Exception {

String[] info = new String[4];

org.jsoup.nodes.Document doc = Jsoup.connect(blogURL).get();

org.jsoup.nodes.Element e_title = doc.select("h2.title").first();

info[0] = e_title.text();

org.jsoup.nodes.Element e_category = doc.select("a[rel=category tag]")

.first();

info[1] = e_category.attr("href").replace("http://www.micmiu.com/", "");

org.jsoup.nodes.Element e_date = doc.select("span.post-info-date")

.first();

String dateStr = e_date.text().split("日期")[1].trim();

info[2] = dateStr;

org.jsoup.nodes.Element entry = doc.select("div.entry").first();

info[3] = formatContentTag(entry);

return info;

}

/**

* 格式化 img标签

* @param entry

* @return

private static String formatContentTag(org.jsoup.nodes.Element entry) {

try {

entry.select("div").remove();

// 把 <a href="*.jpg" ><img src="*.jpg"/></a> 替换为 <img

// src="*.jpg"/>

for (org.jsoup.nodes.Element imgEle : entry

.select("a[href~=(?i)\\.(png|jpe?g)]")) {

imgEle.replaceWith(imgEle.select("img").first());

}

return entry.html();

} catch (Exception e) {

return "";

}

/**

* 把String 转为 InputStream

* @param content

* @return

public static InputStream parse2Stream(String content) {

try {

ByteArrayInputStream stream = new ByteArrayInputStream(

content.getBytes("utf-8"));

return stream;

} catch (Exception e) {

return null;

}

/**

* 直接把网页内容转为PDF文件

* @param fileName

* @throws Exception

public static void parseURL2PDFFile(String pdfFile, String blogURL)

throws Exception {

BaseFont bfCN = BaseFont.createFont("STSongStd-Light", "UniGB-UCS2-H",

false);

// 中文字体定义

Font chFont = new Font(bfCN, 14, Font.NORMAL, BaseColor.BLUE);

Font secFont = new Font(bfCN, 12, Font.NORMAL, new BaseColor(0, 204,

255));

Font textFont = new Font(bfCN, 12, Font.NORMAL, BaseColor.BLACK);

Document document = new Document();

PdfWriter pdfwriter = PdfWriter.getInstance(document,

new FileOutputStream(pdfFile));

pdfwriter.setViewerPreferences(PdfWriter.HideToolbar);

document.open();

String[] blogInfo = extractBlogInfo(blogURL);

int chNum = 1;

Chapter chapter = new Chapter(new Paragraph("URL转PDF测试", chFont),

chNum++);

Section section = chapter

.addSection(new Paragraph(blogInfo[0], secFont));

section.setIndentation(10);

section.setIndentationLeft(10);

section.setBookmarkOpen(false);

section.setNumberStyle(Section.NUMBERSTYLE_DOTTED_WITHOUT_FINAL_DOT);

section.add(new Chunk("分类：" + blogInfo[1] + " 日期：" + blogInfo[2],

textFont));

LineSeparator line = new LineSeparator(1, 100, new BaseColor(204, 204,

204), Element.ALIGN_CENTER, -2);

Paragraph p_line = new Paragraph(" ");

p_line.add(line);

section.add(p_line);

section.add(Chunk.NEWLINE);

document.add(chapter);

// html文件

XMLWorkerHelper.getInstance().parseXHtml(pdfwriter, document,

parse2Stream(blogInfo[3]));

document.close();

}

/**

* 把网页内容转为PDF中的Elements

* @param pdfFile

* @param htmlFileStream

public static void parseURL2PDFElement(String pdfFile, String blogURL) {

try {

Document document = new Document(PageSize.A4);

FileOutputStream outputStream = new FileOutputStream(pdfFile);

PdfWriter pdfwriter = PdfWriter.getInstance(document, outputStream);

// pdfwriter.setViewerPreferences(PdfWriter.HideToolbar);

document.open();

BaseFont bfCN = BaseFont.createFont("STSongStd-Light",

"UniGB-UCS2-H", false);

// 中文字体定义

Font chFont = new Font(bfCN, 14, Font.NORMAL, BaseColor.BLUE);

Font secFont = new Font(bfCN, 12, Font.NORMAL, new BaseColor(0,

204, 255));

Font textFont = new Font(bfCN, 12, Font.NORMAL, BaseColor.BLACK);

int chNum = 1;

Chapter chapter = new Chapter(new Paragraph("URL转PDF元素，便于追加其他内容",

chFont), chNum++);

String[] blogInfo = extractBlogInfo(blogURL);

Section section = chapter.addSection(new Paragraph(blogInfo[0],

secFont));

section.setIndentation(10);

section.setIndentationLeft(10);

section.setBookmarkOpen(false);

section.setNumberStyle(Section.NUMBERSTYLE_DOTTED_WITHOUT_FINAL_DOT);

section.add(new Chunk("分类：" + blogInfo[1] + " 发表日期：" + blogInfo[2],

textFont));

LineSeparator line = new LineSeparator(1, 100, new BaseColor(204,

204, 204), Element.ALIGN_CENTER, -2);

Paragraph p_line = new Paragraph();

p_line.add(line);

section.add(p_line);

section.add(Chunk.NEWLINE);

final List<Element> pdfeleList = new ArrayList<Element>();

ElementHandler elemH = new ElementHandler() {

public void add(final Writable w) {

if (w instanceof WritableElement) {

pdfeleList.addAll(((WritableElement) w).elements());

}

};

XMLWorkerHelper.getInstance().parseXHtml(elemH,

new InputStreamReader(parse2Stream(blogInfo[3]), "utf-8"));

List<Element> list = new ArrayList<Element>();

for (Element ele : pdfeleList) {

if (ele instanceof LineSeparator

|| ele instanceof WritableDirectElement) {

continue;

}

list.add(ele);

}

section.addAll(list);

section = chapter.addSection(new Paragraph("继续添加章节", secFont));

section.setIndentation(10);

section.setIndentationLeft(10);

section.setBookmarkOpen(false);

section.setNumberStyle(Section.NUMBERSTYLE_DOTTED_WITHOUT_FINAL_DOT);

section.add(new Chunk("测试URL转为PDF元素，方便追加其他内容", textFont));

document.add(chapter);

document.close();

} catch (Exception e) {

e.printStackTrace();

}

运行后生成的两个PDF的效果如下：

从上面的效果图可见：根据URL地址生成的PDF和浏览器中页面效果以及之前HTML文件生成的PDF效果完全一致。

posted on 2016-08-01 15:07 杭州糊涂虫阅读(7798) 评论(0) 收藏举报

刷新页面返回顶部

杭州糊涂虫

iText实现URL页面转PDF

导航

公告