整理搜狗2008版搜狗新闻分类语料库和全网新闻分类语料库--转存至mysql数据库
注解:在读了finallyliyu 的文章 《菜鸟学习C++练笔之整理搜狗2008版语料库--获取分类语料库》之后萌生了用java来处理搜狐新闻数据库的想法。
作者:Alex.XW.Wang (博文转载请注明作者和出处)
代码下载:http://download.csdn.net/user/raindreams
首先,对于搜狗的语料库,有几点说明:
根据搜狗实验室的数据使用许可说明(http://www.sogou.com/labs/dl/license.html):
有如下条款 三. 受益者的义务
禁止将搜狗实验室数据提供给第三方使用,第三方如有需要可直接访问搜狗实验室主页下载或提出介质(硬盘、光盘)拷贝申请。
因此无法提供程序处理后的搜狗语料库,需要语料库的可以自行下载搜狗语料库,然后使用我提供的程序进行处理至mysql数据库。
之所以对finallyliyu的算法进行改进是因为:
1. 她的程序没有对分类(主题)进行标注,只是提取出了新闻的url,而没有对其url所对应新闻主题的解析。
2. 感觉java对于这则匹配来处理数据比finallyliuyu用的c++的boost库方便些,自己对于java也稍微熟练些,故用java来写。
以下是正文:数据库结构如下:
这是数据库和表格:sohunews_all对于搜狐分类的全部语料,souhunews_reduced对应其reduced版本。
表格字段如下:
搜狗数据格式如下:
<doc>
<url>页面URL</url>
<docno>页面ID</docno>
<contenttitle>页面标题</Contenttitle>
<content>页面内容</content>
</doc>
得到的效果图如下
以下是对于程序的解释:
主程序调用如下:在设置好数据库配置后,只是简单地添加语料库目录就可以将搜狗语料导入到数据库中。
public static void main(String args[]) {
SogouCSProcessor pro = new SogouCSProcessor();
resultToFile("data/Sogou.txt");
pro.processor("D:\\语料库\\Sogou语料库\\SogouCA.reduced");
}
去掉<>和<>内部的元素.
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
package zju.dawn.ai.corpus.sogou;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SogouCSProcessor {
private List<DocBean> docList = new ArrayList<DocBean>();
// 匹配表达式.
private String docPatternString = "(?s)<doc>.*?</doc>";
private String noPatternString = "<docno>(.*?)</docno>";
private String urlPatternString = "<url>(.*?)</url>";
private String titlePatternString = "<contenttitle>(.*?)</contenttitle>";
private String contentPatternString = "<content>(.*?)</content>";
private Pattern docPattern = Pattern.compile(docPatternString);
private Pattern noPattern = Pattern.compile(noPatternString);
private Pattern urlPattern = Pattern.compile(urlPatternString);
private Pattern titlePattern = Pattern.compile(titlePatternString);
private Pattern contentPattern = Pattern.compile(contentPatternString);
/**
* 使用正则表达式匹配产生出文档DocBean.
*
* @param rawText
*/
private void getDocBean(String rawText) {
docList.clear();
boolean completed = true;
String no = null;
String url = null;
String title = null;
String content = null;
String category = null;
String docMatchString = "";
Matcher docMatcher = docPattern.matcher(rawText);
while (docMatcher.find()) {
DocBean docbean = new DocBean();
docMatchString = docMatcher.group();
Matcher noMatcher = noPattern.matcher(docMatchString);
Matcher urlMatcher = urlPattern.matcher(docMatchString);
Matcher titleMatcher = titlePattern.matcher(docMatchString);
Matcher contentMatcher = contentPattern.matcher(docMatchString);
if (noMatcher.find() && urlMatcher.find() && titleMatcher.find()
&& contentMatcher.find()) {
no = reProcess(noMatcher.group());
url = reProcess(urlMatcher.group());
title = reProcess(titleMatcher.group());
content = reProcess(contentMatcher.group());
docbean.setNo(no);
docbean.setUrl(url);
docbean.setTitle(title);
docbean.setContent(content);
} else {
completed = false;
}
if (completed) {
category = CategoryDetector.detectCategory(url);
docbean.setCategory(category);
//displayDocBean(docbean);
docList.add(docbean);
}
}
System.out.println("txt contains " + docList.size() + " docs!");
}
/**
* 展示docbean的信息.
*
* @param docbean
*/
private void displayDocBean(DocBean docbean) {
// System.out.println("docbeanNO:" + docbean.getNo());
System.out.println("docbeanURL:" + docbean.getUrl());
// System.out.println("docbeanTitle:" + docbean.getTitle());
// System.out.println("docbeanContent:" + docbean.getContent());
//System.out.println("docbeanCategory:" + docbean.getCategory());
}
/**
* 处理掉<doc></doc>这些信息.
*
* @param text
* @return 去噪信息.
*/
private String reProcess(String text) {
int beginIndex = text.indexOf(">");
int endIndex = text.lastIndexOf("<");
return text.substring(beginIndex + 1, endIndex).trim();
}
/**
* 处理文件目录下的所有指定文件.将这些doc导入到数据库中.
*
* @param fileDir
*/
public void processor(String fileDir) {
List<String> filePath = FileListViewer
.getFileList(fileDir, "txt", true);
System.out.println("文件数目:" + filePath.size());
Iterator<String> iter = filePath.iterator();
while (iter.hasNext()) {
String path = iter.next();
System.out.println("===========================================");
System.out.println(path);
String rawText = FileUtil.readEntire(path);
// System.out.println(rawText);
this.getDocBean(rawText);
SogouDBManager.savaDocBeanToDB(docList);
}
}
/**
* 将输出结果输出到文件中.
*
* @param path
* @throws FileNotFoundException
*/
public static void resultToFile(String path) {
File test = new File(path);
PrintStream printStream = null;
try {
printStream = new PrintStream(new FileOutputStream(test));
} catch (FileNotFoundException e) {
System.out.println("文件没有发现!");
e.printStackTrace();
}
PrintStream out = printStream;
System.setOut(out);
}
public static void main(String args[]) {
SogouCSProcessor pro = new SogouCSProcessor();
resultToFile("data/Sogou.txt");
pro.processor("D:\\语料库\\Sogou语料库\\SogouCA.reduced");
}
}
类别探测其如下:找出与category对应的类别,注意:在这里对国内,国际和社会分类都归于社会分类。原因有2:
1是给的一些网站的语料库无法根据url区分出这三个主题,2:这三个主题内容本来就比较相似,区分度不大。
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
package zju.dawn.ai.corpus.sogou;
public final class CategoryDetector {
private static String category = null;
public static String detectCategory(String url) {
// xinhuanet.
if (url.startsWith("http://www.xinhuanet.com/auto/")) {
category = "car";
} else if (url.startsWith("http://www.xinhuanet.com/fortune")) {
category = "finance";
} else if (url.startsWith("http://www.xinhuanet.com/internet/")) {
category = "IT";
} else if (url.startsWith("http://www.xinhuanet.com/health/")) {
category = "health";
} else if (url.startsWith("http://www.xinhuanet.com/sports")) {
category = "sports";
} else if (url.startsWith("http://www.xinhuanet.com/travel")) {
category = "travel";
} else if (url.startsWith("http://www.xinhuanet.com/edu")) {
category = "education";
} else if (url.startsWith("http://www.xinhuanet.com/employment")) {
category = "employment";
} else if (url.startsWith("http://www.xinhuanet.com/life")) {
category = "culture";
} else if (url.startsWith("http://www.xinhuanet.com/mil")) {
category = "military";
} else if (url.startsWith("http://www.xinhuanet.com/olympics/")) {
category = "olympics";
} else if (url.startsWith("http://www.xinhuanet.com/society")
|| url.startsWith("http://www.xinhuanet.com/local/")
|| url.startsWith("http://www.xinhuanet.com/world")) {
category = "society";
} else if (url.startsWith("http://www.xinhuanet.com/house")) {
category = "house";
} else if (url.startsWith("http://www.xinhuanet.com/ent")) {
category = "ent";
} else if (url.startsWith("http://www.xinhuanet.com/lady")) {
category = "lady";
} else if (url.startsWith("http://www.xinhuanet.com/school")) {
category = "school";
}
// china
if (url.startsWith("http://auto.china.com/")) {
category = "car";
} else if (url.startsWith("http://caifu.china.com/")) {
category = "finance";
} else if (url.startsWith("http://tech.china.com/zh_cn/news/net/")) {
category = "IT";
} else if (url.startsWith("http://health.china.com/")) {
category = "health";
} else if (url.startsWith("http://sports.china.com/")) {
category = "sports";
} else if (url.startsWith("http://goo66.china.com/")) {
category = "travel";
} else if (url.startsWith("http://edu.533.com/")) {
category = "education";
} else if (url.startsWith("http://culture.china.com/")) {
category = "culture";
} else if (url.startsWith("http://military.china.com/")) {
category = "military";
} else if (url.startsWith("http://2008.china.com/")) {
category = "olympics";
} else if (url.startsWith("http://news.china.com/zh_cn/social/")
|| url.startsWith("http://news.china.com/zh_cn/domestic/")
|| url.startsWith("http://news.china.com/zh_cn/international/")) {
category = "society";
} else if (url.startsWith("http://china.soufun.com/")) {
category = "house";
} else if (url.startsWith("http://fun.china.com/zh_cn/star/")) {
category = "ent";
} else if (url.startsWith("http://meirong.533.com/")) {
category = "lady";
} else if (url.startsWith("http://edu.533.com/news/xiaoyuan/")) {
category = "school";
}
// sina.com.cn
if (url.startsWith("http://auto.sina.com.cn/")) {
category = "car";
} else if (url.startsWith("http://finance.sina.com.cn/")) {
category = "finance";
} else if (url.startsWith("http://tech.sina.com.cn/it/")) {
category = "IT";
} else if (url.startsWith("http://sina.kangq.com/")) {
category = "health";
} else if (url.startsWith("http://sports.sina.com.cn/")) {
category = "sports";
} else if (url.startsWith("http://tour.sina.com.cn/")) {
category = "travel";
} else if (url.startsWith("http://edu.sina.com.cn/j/")) {
// employment和education不能互换.
category = "employment";
} else if (url.startsWith("http://edu.sina.com.cn/")) {
category = "education";
} else if (url.startsWith("http://cul.book.sina.com.cn/")) {
category = "culture";
} else if (url.startsWith("http://mil.news.sina.com.cn/")) {
category = "military";
} else if (url.startsWith("http://2008.sina.com.cn/")) {
category = "olympics";
} else if (url.startsWith("http://news.sina.com.cn/society/")
|| url.startsWith("http://news.sina.com.cn/china/")
|| url.startsWith("http://news.sina.com.cn/world/")) {
category = "society";
} else if (url.startsWith("http://house.sina.com.cn/")) {
category = "house";
} else if (url.startsWith("http://ent.sina.com.cn/")) {
category = "ent";
} else if (url.startsWith("http://eladies.sina.com.cn/")) {
category = "lady";
} else if (url.startsWith("http://edu.sina.com.cn/y/")) {
category = "school";
}
// 163.com
if (url.startsWith("http://auto.163.com/")) {
category = "car";
} else if (url.startsWith("http://money.163.com/")) {
category = "finance";
} else if (url.startsWith("http://tech.163.com/it/")) {
category = "IT";
} else if (url.startsWith("http://163.39.net/")) {
category = "health";
} else if (url.startsWith("http://sports.163.com/")) {
category = "sports";
} else if (url.startsWith("http://war.163.com/")) {
category = "military";
} else if (url.startsWith("http://2008.163.com/")) {
category = "olympics";
} else if (url.startsWith("http://news.163.com/shehui/")
|| url.startsWith("http://news.163.com/domestic/")
|| url.startsWith("http://news.163.com/world/")) {
category = "society";
} else if (url.startsWith("http://house.163.com/")) {
category = "house";
} else if (url.startsWith("http://ent.163.com/")) {
category = "ent";
} else if (url.startsWith("http://lady.163.com/")) {
category = "lady";
}
// qq.com
if (url.startsWith("http://auto.qq.com/")) {
category = "car";
} else if (url.startsWith("http://finance.qq.com/")) {
category = "finance";
} else if (url.startsWith("http://tech.qq.com/a/")) {
category = "IT";
} else if (url.startsWith("http://sports.qq.com/")) {
category = "sports";
} else if (url.startsWith("http://edu.qq.com/job/")) {
// employment和education不能互换.
category = "employment";
} else if (url.startsWith("http://edu.qq.com/")) {
category = "education";
} else if (url.startsWith("http://cul.qq.com/")) {
category = "culture";
} else if (url.startsWith("http://mil.qq.com/")) {
category = "military";
} else if (url.startsWith("http://news.qq.com/a/")) {
category = "society";
} else if (url.startsWith("http://2008.qq.com/")) {
category = "olympics";
} else if (url.startsWith("http://house.qq.com/")) {
category = "house";
} else if (url.startsWith("http://ent.qq.com/")) {
category = "ent";
} else if (url.startsWith("http://lady.qq.com/")) {
category = "lady";
} else if (url.startsWith("http://campus.qq.com/")) {
category = "school";
}
// sohu.com
if (url.startsWith("http://auto.sohu.com/")) {
category = "car";
} else if (url.startsWith("http://business.sohu.com/")) {
category = "finance";
} else if (url.startsWith("http://it.sohu.com/")) {
category = "IT";
} else if (url.startsWith("http://health.sohu.com/")) {
category = "health";
} else if (url.startsWith("http://sports.sohu.com/")) {
category = "sports";
} else if (url.startsWith("http://travel.sohu.com/")) {
category = "travel";
} else if (url.startsWith("http://learning.sohu.com/")) {
category = "education";
} else if (url.startsWith("http://career.sohu.com/")) {
category = "employment";
} else if (url.startsWith("http://cul.sohu.com/")) {
category = "culture";
} else if (url.startsWith("http://news.sohu.com/")) {
category = "society";
} else if (url.startsWith("http://mil.news.sohu.com/")) {
category = "military";
} else if (url.startsWith("http://2008.sohu.com/")) {
category = "olympics";
} else if (url.startsWith("http://house.sohu.com/")) {
category = "house";
} else if (url.startsWith("http://yule.sohu.com/")) {
category = "ent";
} else if (url.startsWith("http://women.sohu.com/")) {
category = "lady";
}
return category;
}
public static void main(String args[]) {
category = CategoryDetector.detectCategory("http://edu.sina.com.cn/");
System.out.println(category);
}
}
DocBean类如下:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
package zju.dawn.ai.corpus.sogou;
public class DocBean {
private Integer docid;
private String no;
private String url;
private String title;
private String content;
private String category;
public String getCategory() {
return category;
}
public void setCategory(String category) {
this.category = category;
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
public Integer getDocid() {
return docid;
}
public void setDocid(Integer docid) {
this.docid = docid;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getNo() {
return no;
}
public void setNo(String no) {
this.no = no;
}
}
其他类都是一些支持类,在此不再一一详述。代码如下:
FileListViewer类:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
package zju.dawn.ai.corpus.sogou;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* 读取目录及子目录下指定文件名的路径 并放到一个数组里面返回遍历
*
* @author DAWN
*
*/
public final class FileListViewer {
public static List<String> fileList = new ArrayList<String>();
/**
* 获取path目录下所有以suffix为后缀名的文件路径列表,(可以递归遍历).
*
* @param path
* 文件路径
* @param suffix
* 后缀名
* @param isdepth
* 是否遍历子目录
* @return
*/
public static List<String> getFileList(String fileDir, String suffix,
boolean isdepth) {
File file = new File(fileDir);
FileListViewer.generateFilePathList(file, suffix, isdepth);
return fileList;
}
private static void generateFilePathList(File f, String suffix,
boolean isdepth) {
// 是目录,同时需要遍历子目录
if (f.isDirectory() && isdepth == true) {
File[] t = f.listFiles();
for (int i = 0; i < t.length; i++) {
generateFilePathList(t[i], suffix, isdepth);
}
} else {
String filePath = f.getAbsolutePath();
if (suffix != null && suffix != "") {
// 最后一个.(即后缀名前面的.)的索引
int begIndex = filePath.lastIndexOf(".");
String tempsuffix = "";
if (begIndex != -1)// 有后缀名.
{
tempsuffix = filePath.substring(begIndex + 1, filePath
.length());
}
if (tempsuffix.equals(suffix)) {// 满足后缀匹配
fileList.add(filePath);
}
} else {
// 后缀名为null则为所有文件
fileList.add(filePath);
}
}
}
public static void main(String[] args) {
List<String> arrayList = FileListViewer.getFileList(
"D:\\语料库\\Sogou语料库\\SogouCA.reduced", "txt", true);
if (arrayList.isEmpty()) {
System.out.println("没有找到符号后缀要求的文件");
} else {
String message = "";
message += "符号要求的文件数:" + arrayList.size() + "\r\n";
System.out.println(message);
for (Iterator<String> i = arrayList.iterator(); i.hasNext();) {
String temp = (String) i.next();
System.out.println(temp);
message += temp + "\r\n";
}
}
}
}
FileUtil类:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
package zju.dawn.ai.corpus.sogou;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
/**
* 文件相关的处理函数
*
* @author DAWN
*
*/
public class FileUtil {
/**
* 用GBK编码读取一个文件,返回文件内容为一个字符串
*
* @param file
* 文件名
* @return 一个字符串代表文件内容
* @throws IOException
*/
public static String readEntire(String file) {
return readEntire(file, "gbk");
}
/**
* 用指定编码读取一个文件,返回文件内容为一个字符串
*
* @param file
* 文件名
* @param encoding
* 编码
* @return 一个字符串代表文件内容
* @throws IOException
*/
public static String readEntire(String file, String encoding) {
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(
new FileInputStream(file), encoding));
} catch (UnsupportedEncodingException e) {
System.out.println("编码格式不支持!");
e.printStackTrace();
} catch (FileNotFoundException e) {
System.out.println("文件没有发现 !");
e.printStackTrace();
}
StringBuilder sb = new StringBuilder();
String line = null;
try {
while (true) {
line = reader.readLine();
if (line == null)
break;
sb.append(line).append("\n");
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
reader.close();
} catch (IOException e) {
System.out.println("输入流没有关闭!");
e.printStackTrace();
}
}
return sb.toString();
}
/**
* 读取一个文件,返回文件内容为一个字符串.(默认情况下使用GBK,中英文支持).
*
* @param file
* 文件对象.
* @return 文件内容的字符串.
*/
public static String readEntire(File file) throws IOException {
StringBuffer sb = new StringBuffer();
String line;
BufferedReader br = new BufferedReader(new InputStreamReader(
new FileInputStream(file), "GBK"));
while ((line = br.readLine()) != null) {
sb = sb.append(line).append("\n\r");
}
br.close();
String text = sb.toString();
return text;
}
/**
* 读取文件路径,返回文件内容为一个字符串.
*
* @param dir
* 文件所在父路径.
* @param filename
* 文件名称.
* @return 文件内容的字符串.
*/
public static String getFileContentString(String dir, String filename)
throws IOException {
File file = new File(dir, filename);
String text = readEntire(file);
return text;
}
/**
* 返回文件夹fileDir下面的文件名称String[].
*
* @param fileDir
*/
public static String[] getFileList(String fileDir) throws IOException {
File files = new File(fileDir);
if (files.list() == null) {
System.out.println("files.list()指针为空!");
}
if (files.list().length == 0)
System.out.println("文件夹" + fileDir + "下面没有文件");
return files.list();
}
/**
* 判断一个文件(夹)是否存在.
*
* @param filepath
* @return true=存在 false=不存在
*/
public static boolean isFileExisted(String filepath) {
if (!new File(filepath).exists()) {
System.out.println("File:" + filepath + " did not exist!");
return false;
}
return true;
}
}
DBUtil类和SogouDBManager类:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
package zju.dawn.ai.corpus.sogou;
import java.sql.*;
public class DBUtil {
public static Connection getConnection() {
try {
// 第一步:加载MySQL的JDBC的驱动
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e) {
System.out.println("类com.mysql.jdbc.Driver没有发现!");
e.printStackTrace();
}
// 取得连接的url,能访问MySQL数据库的用户名,密码
// String url = "jdbc:mysql://localhost/copynews";
String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
String username = "root";
String password = "root";
// 第二步:创建与MySQL数据库的连接类的实例
Connection con = null;
try {
con = DriverManager.getConnection(url, username, password);
} catch (SQLException e) {
System.out.println("获取数据库连接失败!");
e.printStackTrace();
}
return con;
}
public static Connection getConnection(String url) {
try {
// 第一步:加载MySQL的JDBC的驱动
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e) {
System.out.println("类com.mysql.jdbc.Driver没有发现!");
e.printStackTrace();
}
// 取得能访问MySQL数据库的用户名,密码;
String username = "root";
String password = "dawndon";
Connection con = null;
try {
// 第二步:创建与MySQL数据库的连接类的实例
con = DriverManager.getConnection(url, username, password);
} catch (SQLException e) {
System.out.println("获取数据库连接失败!");
e.printStackTrace();
}
return con;
}
public static Connection getConnection(String url, String username,
String password) {
try {
// 第一步:加载MySQL的JDBC的驱动
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e) {
System.out.println("类com.mysql.jdbc.Driver没有发现!");
e.printStackTrace();
}
// 取得能访问MySQL数据库的用户名,密码;
Connection con = null;
try {
// 第二步:创建与MySQL数据库的连接类的实例
con = DriverManager.getConnection(url, username, password);
} catch (SQLException e) {
System.out.println("获取数据库连接失败!");
e.printStackTrace();
}
return con;
}
/**
* 根据传入的SQL语句返回一个结果集合.
*
* @param sql
* @return
* @throws Exception
*/
public static ResultSet select(String sql) throws Exception {
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
conn = getConnection();
stmt = conn.createStatement();
rs = stmt.executeQuery(sql);
return rs;
} catch (SQLException sqle) {
throw new SQLException("select data exception:" + sqle.getMessage());
} catch (Exception e) {
throw new Exception("System e exception: " + e.getMessage());
}
}
/**
* 根据传入的SQL语句和数据库连接字符串返回一个查询结果集合.
*
* @param sql
* sql语句.
* @param constring
* 数据库链接字符串.
* @return ResultSet 结果集.
* @throws Exception
*/
public static ResultSet select(String sql, String constring)
throws Exception {
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
conn = getConnection(constring);
stmt = conn.createStatement();
rs = stmt.executeQuery(sql);
return rs;
} catch (SQLException sqle) {
throw new SQLException("select data exception:" + sqle.getMessage());
} catch (Exception e) {
throw new Exception("System e exception: " + e.getMessage());
}
}
/**
* 根据传入的SQL语句向数据库增加一条记录
*
* @param sql
* sql语句.
* @throws Exception
*/
public static void insert(String sql) throws Exception {
Connection conn = null;
PreparedStatement ps = null;
try {
conn = getConnection();
ps = conn.prepareStatement(sql);
ps.executeUpdate();
} catch (SQLException sqle) {
throw new Exception("insert data exception:" + sqle.getMessage());
} finally {
try {
if (ps != null) {
ps.close();
}
} catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
try {
if (conn != null) {
conn.close();
}
} catch (Exception e) {
throw new Exception("connection close exception: " + e.getMessage());
}
}
/**
* 根据传入的SQL语句和链接函数向数据库增加一条记录
*
* @param sql
* sql语句.
* @throws Exception
*/
public static void insert(Connection conn,String sql) throws Exception {
PreparedStatement ps = null;
try {
ps = conn.prepareStatement(sql);
ps.executeUpdate();
} catch (SQLException sqle) {
throw new Exception("insert data exception:" + sqle.getMessage());
} finally {
try {
if (ps != null) {
ps.close();
}
} catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
}
/**
* 根据传入的SQL语句更新数据库记录
*
* @param sql
* sql语句.
* @throws Exception
*/
public static void update(String sql) throws Exception {
Connection conn = null;
PreparedStatement ps = null;
try {
conn = getConnection();
ps = conn.prepareStatement(sql);
ps.executeUpdate();
} catch (SQLException sqle) {
throw new Exception("update data exception:" + sqle.getMessage());
} finally {
try {
if (ps != null) {
ps.close();
}
} catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
try {
if (conn != null) {
conn.close();
}
} catch (Exception e) {
throw new Exception("connection close exception: " + e.getMessage());
}
}
/**
* 根据传入的SQL语句删除一条数据库记录
*
* @param sql
* sql语句.
* @throws Exception
*/
public static void delete(String sql) throws Exception {
Connection conn = null;
PreparedStatement ps = null;
try {
conn = getConnection();
ps = conn.prepareStatement(sql);
ps.executeUpdate();
} catch (SQLException sqle) {
throw new Exception("delete data exception:" + sqle.getMessage());
} finally {
try {
if (ps != null) {
ps.close();
}
} catch (Exception e) {
throw new Exception("ps close exception: " + e.getMessage());
}
}
try {
if (conn != null) {
conn.close();
}
} catch (Exception e) {
throw new Exception("connection close exception: " + e.getMessage());
}
}
}
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
package zju.dawn.ai.corpus.sogou;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Iterator;
import java.util.List;
public class SogouDBManager {
private static String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
private static String username = "root";
private static String password = "";
public static boolean savaDocBeanToDB(List<DocBean> docList) {
Connection conn = DBUtil.getConnection(url, username, password);
PreparedStatement ps = null;
// 成功与否.
boolean flag = false;
// 查询字符串.
String sqlString = "insert into sohunews_reduced (newsno,newstitle,newscontent,newsurl,category) values(?,?,?,?,?)";
Iterator<DocBean> docIter = docList.iterator();
DocBean docbean = null;
while (docIter.hasNext()) {
docbean = docIter.next();
try {
ps = conn.prepareStatement(sqlString);
ps.setString(1, docbean.getNo());
ps.setString(2, docbean.getTitle());
ps.setString(3, docbean.getContent());
ps.setString(4, docbean.getUrl());
ps.setString(5, docbean.getCategory());
// 执行sql语句.
flag = ps.execute();
} catch (SQLException e) {
System.out.println("插入newsbean时,Sql语句执行错误!");
System.out.println("内容是:"+docbean.getContent());
e.printStackTrace();
return false;
}
}
try {
if (conn != null) {
conn.close();
}
} catch (Exception e) {
System.out.println("connection close exception: " + e.getMessage());
}
return flag;
}
}