Code snippet (1)
[File] HtmlParser.java (~8 KB)
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/**
 * Parses the news items on the www.cheshi.com home page.
 * @author j.li
 */
public class HtmlParser {
    private static Logger logger;
    private Connection conn = null;
    private static final String SiteName = "";

    public void indexNewsContent(String sitepath) throws Exception {
        logger.info("Parsing the news list on the home page of [" + sitepath
                + "]; the target is the HTML inside <div class=\"hotjd\"></div> holding the news article links.");
        Parser myParser = new Parser(sitepath);
        myParser.setEncoding("GBK");
        // Match opening <div class="w_box"> tags (see the note after the
        // snippet for an equivalent built from htmlparser's filter classes).
        NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() {
            public boolean accept(Node node) {
                return ((node instanceof Tag)
                        && !((Tag) node).isEndTag()
                        && ((Tag) node).getTagName().equals("DIV")
                        && ((Tag) node).getAttribute("class") != null
                        && ((Tag) node).getAttribute("class").equals("w_box"));
            }
        });
        Node node = nodeList.elementAt(1); // the second matching div
        logger.debug(node.toHtml());
        extractText(node.toHtml());
    }

    public void extractText(String inputHtml) throws Exception {
        Parser parser = Parser.createParser(inputHtml, "GBK");
        TagNameFilter filter = new TagNameFilter("a");
        NodeList nodeList = parser.extractAllNodesThatMatch(filter);
        NodeIterator it = nodeList.elements();
        getConnection();
        while (it.hasMoreNodes()) {
            LinkTag node = (LinkTag) it.nextNode();
            String href = node.getLink();
            String title = node.getLinkText();
            logger.info("Parsing home-page news [" + title + "], link [" + href + "]");
            try {
                if (!newsExist(title)) {
                    insertDataBase(title, extractContent(href));
                } else {
                    logger.info("News [" + title + "] already exists in the database; skipping to the next item.");
                }
            } catch (SQLException e) {
                logger.error("Failed to insert the news record into the database: " + e.getMessage());
                e.printStackTrace();
            } catch (Exception e) {
                logger.error(e.getMessage());
                logger.info("Failed to parse news [" + title + "], link [" + href + "]; moving on to the next item.");
                e.printStackTrace();
            }
        }
        closeConnection();
    }

    public String extractContent(String content) throws Exception {
        try {
            Parser myParser = new Parser(content);
            myParser.setEncoding("GBK");
            // Match opening <div class="cs_content"> tags.
            NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter() {
                public boolean accept(Node node) {
                    return ((node instanceof Tag)
                            && !((Tag) node).isEndTag()
                            && ((Tag) node).getTagName().equals("DIV")
                            && ((Tag) node).getAttribute("class") != null
                            && ((Tag) node).getAttribute("class").equals("cs_content"));
                }
            });
            int size = nodeList.size();
            Node node = nodeList.elementAt(size - 1);
            content = node.toHtml();
            logger.debug("==========extractContent==============");
            logger.debug(content);
        } catch (Exception pe) {
            logger.error("Failed to parse the news page: " + pe.getMessage()
                    + " (the page probably lacks a <div class=\"cs_content\"></div> tag).");
            throw pe;
        }
        return removeTagA(content);
    }

    /**
     * Removes <a> tags whose href contains cheshi.com from the news HTML.
     * @param content the HTML to process
     * @return the processed HTML
     */
    public String removeTagA(String content) throws ParserException {
        Parser myParser = new Parser(content);
        myParser.setEncoding("GBK");
        NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("a"));
        SimpleNodeIterator it = nodeList.elements();
        while (it.hasMoreNodes()) {
            LinkTag node = (LinkTag) it.nextNode();
            logger.info("Removing text/image link [" + node.toHtml() + "] from the news content.");
            if (node.getLink().indexOf("cheshi.com") > -1)
                content = content.replace(node.toHtml(), node.getStringText());
        }
        logger.debug("==========removeTagA==============");
        logger.debug(content);
        return downloadImages(content, "D:\\autodata\\upload\\intersite", SiteName + "upload/intersite");
    }

    public String downloadImages(String content, String uploadImgPath, String localhost) throws ParserException {
        File f = new File(uploadImgPath);
        if (!f.exists()) {
            f.mkdirs();
        }
        Parser myParser = new Parser(content);
        myParser.setEncoding("GBK");
        NodeList nodeList = myParser.extractAllNodesThatMatch(new TagNameFilter("img"));
        SimpleNodeIterator it = nodeList.elements();
        while (it.hasMoreNodes()) {
            Tag tag = (Tag) it.nextNode();
            String src = tag.getAttribute("src");
            String filename = src.substring(src.lastIndexOf("/") + 1);
            InputStream is = null;
            FileOutputStream fos = null;
            try {
                // Download the image, then rewrite its src to point at the
                // local copy (a try-with-resources sketch follows the snippet).
                URL url = new URL(src);
                is = url.openStream();
                int bytesRead = 0;
                byte[] buff = new byte[1024];
                fos = new FileOutputStream(uploadImgPath + "/" + filename);
                while ((bytesRead = is.read(buff, 0, buff.length)) != -1) {
                    fos.write(buff, 0, bytesRead);
                }
                content = content.replace(src, localhost + "/" + filename);
            } catch (FileNotFoundException notFoundException) {
                notFoundException.printStackTrace();
            } catch (IOException ioe) {
                ioe.printStackTrace();
            } finally {
                try {
                    if (fos != null) fos.close();
                    if (is != null) is.close();
                } catch (IOException ioe) {
                    ioe.printStackTrace();
                }
            }
        }
        logger.debug("=================downloadImages==================");
        logger.debug(content);
        return content;
    }

    public void getConnection() {
        try {
            Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
            String strCon = "jdbc:microsoft:sqlserver://192.168.99.188:12580;databaseName=Project2009;SelectMethod=cursor";
            String strUserName = "sa";
            String strPWD = "qsyjcsxdl@@@web2009@@@";
            conn = DriverManager.getConnection(strCon, strUserName, strPWD);
        } catch (java.lang.ClassNotFoundException cnfe) {
            cnfe.printStackTrace();
        } catch (SQLException se) {
            se.printStackTrace();
        }
    }

    public void closeConnection() {
        try {
            if (conn != null && !conn.isClosed()) conn.close();
        } catch (SQLException se) {
            se.printStackTrace();
        }
    }

    public void insertDataBase(String newsTitle, String newsContent) throws SQLException {
        PreparedStatement pstmt = null;
        try {
            pstmt = conn.prepareStatement("INSERT INTO FumNews(NewsTitle, NewsContext, NewsState) values(?, ?, ?)");
            pstmt.setString(1, newsTitle);
            pstmt.setString(2, newsContent);
            pstmt.setInt(3, 1);
            pstmt.executeUpdate();
        } catch (SQLException e) {
            throw e;
        } finally {
            try {
                if (pstmt != null) pstmt.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    public boolean newsExist(String title) throws SQLException {
        PreparedStatement pstmt = null;
        try {
            pstmt = conn.prepareStatement("SELECT top 1 NewsId from FumNews where NewsTitle = ?");
            pstmt.setString(1, title);
            ResultSet rs = pstmt.executeQuery();
            return rs.next();
        } catch (SQLException e) {
            throw e;
        } finally {
            try {
                if (pstmt != null) pstmt.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) {
        HtmlParser html = new HtmlParser();
        // Configure an HTTP proxy for the connection if needed:
        // System.getProperties().put("proxySet", "true");
        // System.getProperties().put("proxyHost", "192.168.99.100");
        // System.getProperties().put("proxyPort", "80");
        URL url = html.getClass().getResource("log4j.properties");
        PropertyConfigurator.configure(url);
        logger = Logger.getLogger(HtmlParser.class);
        try {
            html.indexNewsContent("http://www.cheshi.com/");
        } catch (Exception e) {
            e.printStackTrace();
            logger.error("Error while parsing the page, cause: " + e.getMessage());
        }
        logger.info("Finished parsing the page content.");
    }
}
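
A note on the two anonymous NodeFilter classes above: htmlparser 1.6 ships filter combinators in org.htmlparser.filters that express the same match declaratively. A minimal sketch (untested against the live page), with myParser as in indexNewsContent:

    import org.htmlparser.filters.AndFilter;
    import org.htmlparser.filters.HasAttributeFilter;

    // Matches <div class="w_box">; end tags carry no attributes, so they
    // are excluded automatically, as in the hand-written filter.
    NodeList divs = myParser.extractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("div"),
                          new HasAttributeFilter("class", "w_box")));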
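
The copy loop in downloadImages closes its streams by hand in a finally block. On Java 7 or newer, try-with-resources does the same with less room for leaks; a sketch of just the download step, with src, filename and uploadImgPath as in the method above:

    // Streams are closed automatically, even when read or write throws.
    try (InputStream in = new URL(src).openStream();
         FileOutputStream out = new FileOutputStream(uploadImgPath + "/" + filename)) {
        byte[] buff = new byte[1024];
        int bytesRead;
        while ((bytesRead = in.read(buff, 0, buff.length)) != -1) {
            out.write(buff, 0, bytesRead);
        }
    }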
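
getConnection loads com.microsoft.jdbc.sqlserver.SQLServerDriver, the SQL Server 2000-era driver. If the newer Microsoft driver (2005 and later) is on the classpath instead (an assumption about your environment; check which driver jar you actually have), both the class name and the URL format change. Keeping the same host, port, database and credentials:

    Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver");
    conn = DriverManager.getConnection(
            "jdbc:sqlserver://192.168.99.188:12580;databaseName=Project2009",
            strUserName, strPWD);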