整理搜狗2008版搜狗新闻分类语料库和全网新闻分类语料库--转存至mysql数据库

注解:在读了finallyliyu 的文章 《菜鸟学习C++练笔之整理搜狗2008版语料库--获取分类语料库》之后萌生了用java来处理搜狐新闻数据库的想法。

作者:Alex.XW.Wang (博文转载请注明作者和出处)

代码下载:http://download.csdn.net/user/raindreams

首先,对于搜狗的语料库,有几点说明: 

根据搜狗实验室的数据使用许可说明(http://www.sogou.com/labs/dl/license.html):

有如下条款  三. 受益者的义务  

禁止将搜狗实验室数据提供给第三方使用,第三方如有需要可直接访问搜狗实验室主页下载或提出介质(硬盘、光盘)拷贝申请。

因此无法提供程序处理后的搜狗语料库,需要语料库的可以自行下载搜狗语料库,然后使用我提供的程序进行处理至mysql数据库。

之所以对finallyliyu的算法进行改进是因为:

1. 她的程序没有对分类(主题)进行标注,只是提取出了新闻的url,而没有对其url所对应新闻主题的解析。

2. 感觉java对于这则匹配来处理数据比finallyliuyu用的c++的boost库方便些,自己对于java也稍微熟练些,故用java来写。

以下是正文:数据库结构如下:

这是数据库和表格:sohunews_all对于搜狐分类的全部语料,souhunews_reduced对应其reduced版本。

   

 表格字段如下:

 搜狗数据格式如下:

<doc>
<url>页面URL</url>
<docno>页面ID</docno>
<contenttitle>页面标题</Contenttitle>
<content>页面内容</content>
</doc>

得到的效果图如下

以下是对于程序的解释:

  主程序调用如下:在设置好数据库配置后,只是简单地添加语料库目录就可以将搜狗语料导入到数据库中

public static void main(String args[]) {
SogouCSProcessor pro
= new SogouCSProcessor();
resultToFile(
"data/Sogou.txt");
pro.processor(
"D:\\语料库\\Sogou语料库\\SogouCA.reduced");
}
SogouCSProcessor类,用于对数据处理有一个整体的控制,其中主要的正则匹配处理在getDocBean()函数中,抽取出来以后再对其
去掉<>和<>内部的元素.
View Code
package zju.dawn.ai.corpus.sogou;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SogouCSProcessor {
private List<DocBean> docList = new ArrayList<DocBean>();
// 匹配表达式.
private String docPatternString = "(?s)<doc>.*?</doc>";
private String noPatternString = "<docno>(.*?)</docno>";
private String urlPatternString = "<url>(.*?)</url>";
private String titlePatternString = "<contenttitle>(.*?)</contenttitle>";
private String contentPatternString = "<content>(.*?)</content>";
private Pattern docPattern = Pattern.compile(docPatternString);
private Pattern noPattern = Pattern.compile(noPatternString);
private Pattern urlPattern = Pattern.compile(urlPatternString);
private Pattern titlePattern = Pattern.compile(titlePatternString);
private Pattern contentPattern = Pattern.compile(contentPatternString);
/**
* 使用正则表达式匹配产生出文档DocBean.
*
*
@param rawText
*/
private void getDocBean(String rawText) {
docList.clear();
boolean completed = true;
String no
= null;
String url
= null;
String title
= null;
String content
= null;
String category
= null;
String docMatchString
= "";
Matcher docMatcher
= docPattern.matcher(rawText);
while (docMatcher.find()) {
DocBean docbean
= new DocBean();
docMatchString
= docMatcher.group();
Matcher noMatcher
= noPattern.matcher(docMatchString);
Matcher urlMatcher
= urlPattern.matcher(docMatchString);
Matcher titleMatcher
= titlePattern.matcher(docMatchString);
Matcher contentMatcher
= contentPattern.matcher(docMatchString);
if (noMatcher.find() && urlMatcher.find() && titleMatcher.find()
&& contentMatcher.find()) {
no
= reProcess(noMatcher.group());
url
= reProcess(urlMatcher.group());
title
= reProcess(titleMatcher.group());
content
= reProcess(contentMatcher.group());
docbean.setNo(no);
docbean.setUrl(url);
docbean.setTitle(title);
docbean.setContent(content);
}
else {
completed
= false;
}
if (completed) {
category
= CategoryDetector.detectCategory(url);
docbean.setCategory(category);
//displayDocBean(docbean);
docList.add(docbean);
}
}
System.out.println(
"txt contains " + docList.size() + " docs!");
}
/**
* 展示docbean的信息.
*
*
@param docbean
*/
private void displayDocBean(DocBean docbean) {
// System.out.println("docbeanNO:" + docbean.getNo());
System.out.println("docbeanURL:" + docbean.getUrl());
// System.out.println("docbeanTitle:" + docbean.getTitle());
// System.out.println("docbeanContent:" + docbean.getContent());
//System.out.println("docbeanCategory:" + docbean.getCategory());
}
/**
* 处理掉<doc></doc>这些信息.
*
*
@param text
*
@return 去噪信息.
*/
private String reProcess(String text) {
int beginIndex = text.indexOf(">");
int endIndex = text.lastIndexOf("<");
return text.substring(beginIndex + 1, endIndex).trim();
}
/**
* 处理文件目录下的所有指定文件.将这些doc导入到数据库中.
*
*
@param fileDir
*/
public void processor(String fileDir) {
List
<String> filePath = FileListViewer
.getFileList(fileDir,
"txt", true);
System.out.println(
"文件数目:" + filePath.size());
Iterator
<String> iter = filePath.iterator();
while (iter.hasNext()) {
String path
= iter.next();
System.out.println(
"===========================================");
System.out.println(path);
String rawText
= FileUtil.readEntire(path);
// System.out.println(rawText);
this.getDocBean(rawText);
SogouDBManager.savaDocBeanToDB(docList);
}
}
/**
* 将输出结果输出到文件中.
*
*
@param path
*
@throws FileNotFoundException
*/
public static void resultToFile(String path) {
File test
= new File(path);
PrintStream printStream
= null;
try {
printStream
= new PrintStream(new FileOutputStream(test));
}
catch (FileNotFoundException e) {
System.out.println(
"文件没有发现!");
e.printStackTrace();
}
PrintStream out
= printStream;
System.setOut(out);
}
public static void main(String args[]) {
SogouCSProcessor pro
= new SogouCSProcessor();
resultToFile(
"data/Sogou.txt");
pro.processor(
"D:\\语料库\\Sogou语料库\\SogouCA.reduced");
}
}

类别探测其如下:找出与category对应的类别,注意:在这里对国内,国际和社会分类都归于社会分类。原因有2:

1是给的一些网站的语料库无法根据url区分出这三个主题,2:这三个主题内容本来就比较相似,区分度不大。

View Code
package zju.dawn.ai.corpus.sogou;

public final class CategoryDetector {

private static String category = null;

public static String detectCategory(String url) {
// xinhuanet.
if (url.startsWith("http://www.xinhuanet.com/auto/")) {
category
= "car";
}
else if (url.startsWith("http://www.xinhuanet.com/fortune")) {
category
= "finance";
}
else if (url.startsWith("http://www.xinhuanet.com/internet/")) {
category
= "IT";
}
else if (url.startsWith("http://www.xinhuanet.com/health/")) {
category
= "health";
}
else if (url.startsWith("http://www.xinhuanet.com/sports")) {
category
= "sports";
}
else if (url.startsWith("http://www.xinhuanet.com/travel")) {
category
= "travel";
}
else if (url.startsWith("http://www.xinhuanet.com/edu")) {
category
= "education";
}
else if (url.startsWith("http://www.xinhuanet.com/employment")) {
category
= "employment";
}
else if (url.startsWith("http://www.xinhuanet.com/life")) {
category
= "culture";
}
else if (url.startsWith("http://www.xinhuanet.com/mil")) {
category
= "military";
}
else if (url.startsWith("http://www.xinhuanet.com/olympics/")) {
category
= "olympics";
}
else if (url.startsWith("http://www.xinhuanet.com/society")
|| url.startsWith("http://www.xinhuanet.com/local/")
|| url.startsWith("http://www.xinhuanet.com/world")) {
category
= "society";
}
else if (url.startsWith("http://www.xinhuanet.com/house")) {
category
= "house";
}
else if (url.startsWith("http://www.xinhuanet.com/ent")) {
category
= "ent";
}
else if (url.startsWith("http://www.xinhuanet.com/lady")) {
category
= "lady";
}
else if (url.startsWith("http://www.xinhuanet.com/school")) {
category
= "school";
}
// china
if (url.startsWith("http://auto.china.com/")) {
category
= "car";
}
else if (url.startsWith("http://caifu.china.com/")) {
category
= "finance";
}
else if (url.startsWith("http://tech.china.com/zh_cn/news/net/")) {
category
= "IT";
}
else if (url.startsWith("http://health.china.com/")) {
category
= "health";
}
else if (url.startsWith("http://sports.china.com/")) {
category
= "sports";
}
else if (url.startsWith("http://goo66.china.com/")) {
category
= "travel";
}
else if (url.startsWith("http://edu.533.com/")) {
category
= "education";
}
else if (url.startsWith("http://culture.china.com/")) {
category
= "culture";
}
else if (url.startsWith("http://military.china.com/")) {
category
= "military";
}
else if (url.startsWith("http://2008.china.com/")) {
category
= "olympics";
}
else if (url.startsWith("http://news.china.com/zh_cn/social/")
|| url.startsWith("http://news.china.com/zh_cn/domestic/")
|| url.startsWith("http://news.china.com/zh_cn/international/")) {
category
= "society";
}
else if (url.startsWith("http://china.soufun.com/")) {
category
= "house";
}
else if (url.startsWith("http://fun.china.com/zh_cn/star/")) {
category
= "ent";
}
else if (url.startsWith("http://meirong.533.com/")) {
category
= "lady";
}
else if (url.startsWith("http://edu.533.com/news/xiaoyuan/")) {
category
= "school";
}
// sina.com.cn
if (url.startsWith("http://auto.sina.com.cn/")) {
category
= "car";
}
else if (url.startsWith("http://finance.sina.com.cn/")) {
category
= "finance";
}
else if (url.startsWith("http://tech.sina.com.cn/it/")) {
category
= "IT";
}
else if (url.startsWith("http://sina.kangq.com/")) {
category
= "health";
}
else if (url.startsWith("http://sports.sina.com.cn/")) {
category
= "sports";
}
else if (url.startsWith("http://tour.sina.com.cn/")) {
category
= "travel";
}
else if (url.startsWith("http://edu.sina.com.cn/j/")) {
// employment和education不能互换.
category = "employment";
}
else if (url.startsWith("http://edu.sina.com.cn/")) {
category
= "education";
}
else if (url.startsWith("http://cul.book.sina.com.cn/")) {
category
= "culture";
}
else if (url.startsWith("http://mil.news.sina.com.cn/")) {
category
= "military";
}
else if (url.startsWith("http://2008.sina.com.cn/")) {
category
= "olympics";
}
else if (url.startsWith("http://news.sina.com.cn/society/")
|| url.startsWith("http://news.sina.com.cn/china/")
|| url.startsWith("http://news.sina.com.cn/world/")) {
category
= "society";
}
else if (url.startsWith("http://house.sina.com.cn/")) {
category
= "house";
}
else if (url.startsWith("http://ent.sina.com.cn/")) {
category
= "ent";
}
else if (url.startsWith("http://eladies.sina.com.cn/")) {
category
= "lady";
}
else if (url.startsWith("http://edu.sina.com.cn/y/")) {
category
= "school";
}
// 163.com
if (url.startsWith("http://auto.163.com/")) {
category
= "car";
}
else if (url.startsWith("http://money.163.com/")) {
category
= "finance";
}
else if (url.startsWith("http://tech.163.com/it/")) {
category
= "IT";
}
else if (url.startsWith("http://163.39.net/")) {
category
= "health";
}
else if (url.startsWith("http://sports.163.com/")) {
category
= "sports";
}
else if (url.startsWith("http://war.163.com/")) {
category
= "military";
}
else if (url.startsWith("http://2008.163.com/")) {
category
= "olympics";
}
else if (url.startsWith("http://news.163.com/shehui/")
|| url.startsWith("http://news.163.com/domestic/")
|| url.startsWith("http://news.163.com/world/")) {
category
= "society";
}
else if (url.startsWith("http://house.163.com/")) {
category
= "house";
}
else if (url.startsWith("http://ent.163.com/")) {
category
= "ent";
}
else if (url.startsWith("http://lady.163.com/")) {
category
= "lady";
}
// qq.com
if (url.startsWith("http://auto.qq.com/")) {
category
= "car";
}
else if (url.startsWith("http://finance.qq.com/")) {
category
= "finance";
}
else if (url.startsWith("http://tech.qq.com/a/")) {
category
= "IT";
}
else if (url.startsWith("http://sports.qq.com/")) {
category
= "sports";
}
else if (url.startsWith("http://edu.qq.com/job/")) {
// employment和education不能互换.
category = "employment";
}
else if (url.startsWith("http://edu.qq.com/")) {
category
= "education";
}
else if (url.startsWith("http://cul.qq.com/")) {
category
= "culture";
}
else if (url.startsWith("http://mil.qq.com/")) {
category
= "military";
}
else if (url.startsWith("http://news.qq.com/a/")) {
category
= "society";
}
else if (url.startsWith("http://2008.qq.com/")) {
category
= "olympics";
}
else if (url.startsWith("http://house.qq.com/")) {
category
= "house";
}
else if (url.startsWith("http://ent.qq.com/")) {
category
= "ent";
}
else if (url.startsWith("http://lady.qq.com/")) {
category
= "lady";
}
else if (url.startsWith("http://campus.qq.com/")) {
category
= "school";
}
// sohu.com
if (url.startsWith("http://auto.sohu.com/")) {
category
= "car";
}
else if (url.startsWith("http://business.sohu.com/")) {
category
= "finance";
}
else if (url.startsWith("http://it.sohu.com/")) {
category
= "IT";
}
else if (url.startsWith("http://health.sohu.com/")) {
category
= "health";
}
else if (url.startsWith("http://sports.sohu.com/")) {
category
= "sports";
}
else if (url.startsWith("http://travel.sohu.com/")) {
category
= "travel";
}
else if (url.startsWith("http://learning.sohu.com/")) {
category
= "education";
}
else if (url.startsWith("http://career.sohu.com/")) {
category
= "employment";
}
else if (url.startsWith("http://cul.sohu.com/")) {
category
= "culture";
}
else if (url.startsWith("http://news.sohu.com/")) {
category
= "society";
}
else if (url.startsWith("http://mil.news.sohu.com/")) {
category
= "military";
}
else if (url.startsWith("http://2008.sohu.com/")) {
category
= "olympics";
}
else if (url.startsWith("http://house.sohu.com/")) {
category
= "house";
}
else if (url.startsWith("http://yule.sohu.com/")) {
category
= "ent";
}
else if (url.startsWith("http://women.sohu.com/")) {
category
= "lady";
}
return category;
}

public static void main(String args[]) {
category
= CategoryDetector.detectCategory("http://edu.sina.com.cn/");
System.out.println(category);
}
}

DocBean类如下:

View Code
package zju.dawn.ai.corpus.sogou;

public class DocBean {

private Integer docid;
private String no;
private String url;
private String title;
private String content;
private String category;

public String getCategory() {
return category;
}

public void setCategory(String category) {
this.category = category;
}

public String getUrl() {
return url;
}

public void setUrl(String url) {
this.url = url;
}

public Integer getDocid() {
return docid;
}

public void setDocid(Integer docid) {
this.docid = docid;
}

public String getTitle() {
return title;
}

public void setTitle(String title) {
this.title = title;
}

public String getContent() {
return content;
}

public void setContent(String content) {
this.content = content;
}

public String getNo() {
return no;
}

public void setNo(String no) {
this.no = no;
}
}

其他类都是一些支持类,在此不再一一详述。代码如下:

FileListViewer类:

View Code
package zju.dawn.ai.corpus.sogou;

import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* 读取目录及子目录下指定文件名的路径 并放到一个数组里面返回遍历
*
*
@author DAWN
*
*/
public final class FileListViewer {

public static List<String> fileList = new ArrayList<String>();

/**
* 获取path目录下所有以suffix为后缀名的文件路径列表,(可以递归遍历).
*
*
@param path
* 文件路径
*
@param suffix
* 后缀名
*
@param isdepth
* 是否遍历子目录
*
@return
*/
public static List<String> getFileList(String fileDir, String suffix,
boolean isdepth) {
File file
= new File(fileDir);
FileListViewer.generateFilePathList(file, suffix, isdepth);
return fileList;
}

private static void generateFilePathList(File f, String suffix,
boolean isdepth) {
// 是目录,同时需要遍历子目录
if (f.isDirectory() && isdepth == true) {
File[] t
= f.listFiles();
for (int i = 0; i < t.length; i++) {
generateFilePathList(t[i], suffix, isdepth);
}
}
else {
String filePath
= f.getAbsolutePath();
if (suffix != null && suffix != "") {
// 最后一个.(即后缀名前面的.)的索引
int begIndex = filePath.lastIndexOf(".");
String tempsuffix
= "";
if (begIndex != -1)// 有后缀名.
{
tempsuffix
= filePath.substring(begIndex + 1, filePath
.length());
}
if (tempsuffix.equals(suffix)) {// 满足后缀匹配
fileList.add(filePath);
}
}
else {
// 后缀名为null则为所有文件
fileList.add(filePath);
}
}
}

public static void main(String[] args) {
List
<String> arrayList = FileListViewer.getFileList(
"D:\\语料库\\Sogou语料库\\SogouCA.reduced", "txt", true);
if (arrayList.isEmpty()) {
System.out.println(
"没有找到符号后缀要求的文件");
}
else {
String message
= "";
message
+= "符号要求的文件数:" + arrayList.size() + "\r\n";
System.out.println(message);

for (Iterator<String> i = arrayList.iterator(); i.hasNext();) {
String temp
= (String) i.next();
System.out.println(temp);
message
+= temp + "\r\n";
}
}
}
}

FileUtil类:

View Code
package zju.dawn.ai.corpus.sogou;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

/**
* 文件相关的处理函数
*
*
@author DAWN
*
*/
public class FileUtil {

/**
* 用GBK编码读取一个文件,返回文件内容为一个字符串
*
*
@param file
* 文件名
*
@return 一个字符串代表文件内容
*
@throws IOException
*/
public static String readEntire(String file) {
return readEntire(file, "gbk");
}

/**
* 用指定编码读取一个文件,返回文件内容为一个字符串
*
*
@param file
* 文件名
*
@param encoding
* 编码
*
@return 一个字符串代表文件内容
*
@throws IOException
*/
public static String readEntire(String file, String encoding) {
BufferedReader reader
= null;
try {
reader
= new BufferedReader(new InputStreamReader(
new FileInputStream(file), encoding));
}
catch (UnsupportedEncodingException e) {
System.out.println(
"编码格式不支持!");
e.printStackTrace();
}
catch (FileNotFoundException e) {
System.out.println(
"文件没有发现 !");
e.printStackTrace();
}
StringBuilder sb
= new StringBuilder();
String line
= null;
try {
while (true) {
line
= reader.readLine();
if (line == null)
break;
sb.append(line).append(
"\n");
}
}
catch (IOException e) {
e.printStackTrace();
}
finally {
try {
reader.close();
}
catch (IOException e) {
System.out.println(
"输入流没有关闭!");
e.printStackTrace();
}
}
return sb.toString();
}

/**
* 读取一个文件,返回文件内容为一个字符串.(默认情况下使用GBK,中英文支持).
*
*
@param file
* 文件对象.
*
@return 文件内容的字符串.
*/
public static String readEntire(File file) throws IOException {
StringBuffer sb
= new StringBuffer();
String line;
BufferedReader br
= new BufferedReader(new InputStreamReader(
new FileInputStream(file), "GBK"));
while ((line = br.readLine()) != null) {
sb
= sb.append(line).append("\n\r");
}
br.close();
String text
= sb.toString();
return text;
}

/**
* 读取文件路径,返回文件内容为一个字符串.
*
*
@param dir
* 文件所在父路径.
*
@param filename
* 文件名称.
*
@return 文件内容的字符串.
*/
public static String getFileContentString(String dir, String filename)
throws IOException {
File file
= new File(dir, filename);
String text
= readEntire(file);
return text;
}

/**
* 返回文件夹fileDir下面的文件名称String[].
*
*
@param fileDir
*/
public static String[] getFileList(String fileDir) throws IOException {
File files
= new File(fileDir);
if (files.list() == null) {
System.out.println(
"files.list()指针为空!");
}
if (files.list().length == 0)
System.out.println(
"文件夹" + fileDir + "下面没有文件");
return files.list();

}

/**
* 判断一个文件(夹)是否存在.
*
*
@param filepath
*
@return true=存在 false=不存在
*/
public static boolean isFileExisted(String filepath) {
if (!new File(filepath).exists()) {
System.out.println(
"File:" + filepath + " did not exist!");
return false;
}
return true;
}

}

DBUtil类和SogouDBManager类:

View Code
package zju.dawn.ai.corpus.sogou;

import java.sql.*;

public class DBUtil {
public static Connection getConnection() {
try {
// 第一步:加载MySQL的JDBC的驱动
Class.forName("com.mysql.jdbc.Driver");
}
catch (ClassNotFoundException e) {
System.out.println(
"类com.mysql.jdbc.Driver没有发现!");
e.printStackTrace();
}

// 取得连接的url,能访问MySQL数据库的用户名,密码
// String url = "jdbc:mysql://localhost/copynews";
String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
String username
= "root";
String password
= "root";

// 第二步:创建与MySQL数据库的连接类的实例
Connection con = null;
try {
con
= DriverManager.getConnection(url, username, password);
}
catch (SQLException e) {
System.out.println(
"获取数据库连接失败!");
e.printStackTrace();
}
return con;
}

public static Connection getConnection(String url) {

try {
// 第一步:加载MySQL的JDBC的驱动
Class.forName("com.mysql.jdbc.Driver");
}
catch (ClassNotFoundException e) {
System.out.println(
"类com.mysql.jdbc.Driver没有发现!");
e.printStackTrace();
}
// 取得能访问MySQL数据库的用户名,密码;
String username = "root";
String password
= "dawndon";
Connection con
= null;
try {
// 第二步:创建与MySQL数据库的连接类的实例
con = DriverManager.getConnection(url, username, password);
}
catch (SQLException e) {
System.out.println(
"获取数据库连接失败!");
e.printStackTrace();
}
return con;
}

public static Connection getConnection(String url, String username,
String password) {
try {
// 第一步:加载MySQL的JDBC的驱动
Class.forName("com.mysql.jdbc.Driver");
}
catch (ClassNotFoundException e) {
System.out.println(
"类com.mysql.jdbc.Driver没有发现!");
e.printStackTrace();
}
// 取得能访问MySQL数据库的用户名,密码;
Connection con = null;
try {
// 第二步:创建与MySQL数据库的连接类的实例
con = DriverManager.getConnection(url, username, password);
}
catch (SQLException e) {
System.out.println(
"获取数据库连接失败!");
e.printStackTrace();
}
return con;
}

/**
* 根据传入的SQL语句返回一个结果集合.
*
*
@param sql
*
@return
*
@throws Exception
*/
public static ResultSet select(String sql) throws Exception {
Connection conn
= null;
Statement stmt
= null;
ResultSet rs
= null;
try {
conn
= getConnection();
stmt
= conn.createStatement();
rs
= stmt.executeQuery(sql);
return rs;
}
catch (SQLException sqle) {
throw new SQLException("select data exception:" + sqle.getMessage());
}
catch (Exception e) {
throw new Exception("System e exception: " + e.getMessage());
}
}

/**
* 根据传入的SQL语句和数据库连接字符串返回一个查询结果集合.
*
*
@param sql
* sql语句.
*
@param constring
* 数据库链接字符串.
*
@return ResultSet 结果集.
*
@throws Exception
*/
public static ResultSet select(String sql, String constring)
throws Exception {
Connection conn
= null;
Statement stmt
= null;
ResultSet rs
= null;
try {
conn
= getConnection(constring);
stmt
= conn.createStatement();
rs
= stmt.executeQuery(sql);
return rs;
}
catch (SQLException sqle) {
throw new SQLException("select data exception:" + sqle.getMessage());
}
catch (Exception e) {
throw new Exception("System e exception: " + e.getMessage());
}
}

/**
* 根据传入的SQL语句向数据库增加一条记录
*
*
@param sql
* sql语句.
*
@throws Exception
*/
public static void insert(String sql) throws Exception {
Connection conn
= null;
PreparedStatement ps
= null;
try {
conn
= getConnection();
ps
= conn.prepareStatement(sql);
ps.executeUpdate();
}
catch (SQLException sqle) {
throw new Exception("insert data exception:" + sqle.getMessage());
}
finally {
try {
if (ps != null) {
ps.close();
}
}
catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
try {
if (conn != null) {
conn.close();
}
}
catch (Exception e) {
throw new Exception("connection close exception: " + e.getMessage());
}
}
/**
* 根据传入的SQL语句和链接函数向数据库增加一条记录
*
*
@param sql
* sql语句.
*
@throws Exception
*/
public static void insert(Connection conn,String sql) throws Exception {
PreparedStatement ps
= null;
try {
ps
= conn.prepareStatement(sql);
ps.executeUpdate();
}
catch (SQLException sqle) {
throw new Exception("insert data exception:" + sqle.getMessage());
}
finally {
try {
if (ps != null) {
ps.close();
}
}
catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
}
/**
* 根据传入的SQL语句更新数据库记录
*
*
@param sql
* sql语句.
*
@throws Exception
*/
public static void update(String sql) throws Exception {
Connection conn
= null;
PreparedStatement ps
= null;
try {
conn
= getConnection();
ps
= conn.prepareStatement(sql);
ps.executeUpdate();
}
catch (SQLException sqle) {
throw new Exception("update data exception:" + sqle.getMessage());
}
finally {
try {
if (ps != null) {
ps.close();
}
}
catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
try {
if (conn != null) {
conn.close();
}
}
catch (Exception e) {
throw new Exception("connection close exception: " + e.getMessage());
}
}

/**
* 根据传入的SQL语句删除一条数据库记录
*
*
@param sql
* sql语句.
*
@throws Exception
*/
public static void delete(String sql) throws Exception {
Connection conn
= null;
PreparedStatement ps
= null;
try {
conn
= getConnection();
ps
= conn.prepareStatement(sql);
ps.executeUpdate();
}
catch (SQLException sqle) {
throw new Exception("delete data exception:" + sqle.getMessage());
}
finally {
try {
if (ps != null) {
ps.close();
}
}
catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
try {
if (conn != null) {
conn.close();
}
}
catch (Exception e) {
throw new Exception("connection close exception: " + e.getMessage());
}
}
}
View Code
package zju.dawn.ai.corpus.sogou;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Iterator;
import java.util.List;

public class SogouDBManager {

private static String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
private static String username = "root";
private static String password = "";

public static boolean savaDocBeanToDB(List<DocBean> docList) {
Connection conn
= DBUtil.getConnection(url, username, password);
PreparedStatement ps
= null;
// 成功与否.
boolean flag = false;
// 查询字符串.
String sqlString = "insert into sohunews_reduced (newsno,newstitle,newscontent,newsurl,category) values(?,?,?,?,?)";
Iterator
<DocBean> docIter = docList.iterator();
DocBean docbean
= null;
while (docIter.hasNext()) {
docbean
= docIter.next();
try {
ps
= conn.prepareStatement(sqlString);
ps.setString(
1, docbean.getNo());
ps.setString(
2, docbean.getTitle());
ps.setString(
3, docbean.getContent());
ps.setString(
4, docbean.getUrl());
ps.setString(
5, docbean.getCategory());
// 执行sql语句.
flag = ps.execute();
}
catch (SQLException e) {
System.out.println(
"插入newsbean时,Sql语句执行错误!");
System.out.println(
"内容是:"+docbean.getContent());
e.printStackTrace();
return false;
}
}
try {
if (conn != null) {
conn.close();
}
}
catch (Exception e) {
System.out.println(
"connection close exception: " + e.getMessage());
}
return flag;
}
}
posted @ 2011-05-25 20:27  Alex.XW.Wang  阅读(2133)  评论(1编辑  收藏  举报