整理搜狗2008版搜狗新闻分类语料库和全网新闻分类语料库--转存至mysql数据库

注解：在读了finallyliyu 的文章《菜鸟学习C++练笔之整理搜狗2008版语料库--获取分类语料库》之后萌生了用java来处理搜狐新闻数据库的想法。

作者：Alex.XW.Wang (博文转载请注明作者和出处)

代码下载：http://download.csdn.net/user/raindreams

首先，对于搜狗的语料库，有几点说明：

根据搜狗实验室的数据使用许可说明(http://www.sogou.com/labs/dl/license.html):

有如下条款三. 受益者的义务

禁止将搜狗实验室数据提供给第三方使用，第三方如有需要可直接访问搜狗实验室主页下载或提出介质(硬盘、光盘)拷贝申请。

因此无法提供程序处理后的搜狗语料库，需要语料库的可以自行下载搜狗语料库，然后使用我提供的程序进行处理至mysql数据库。

之所以对finallyliyu的算法进行改进是因为：

1. 她的程序没有对分类（主题）进行标注，只是提取出了新闻的url，而没有对其url所对应新闻主题的解析。

2. 感觉java对于这则匹配来处理数据比finallyliuyu用的c++的boost库方便些，自己对于java也稍微熟练些，故用java来写。

以下是正文：数据库结构如下：

这是数据库和表格：sohunews_all对于搜狐分类的全部语料，souhunews_reduced对应其reduced版本。

表格字段如下：

搜狗数据格式如下：

<doc>
<url>页面URL</url>
<docno>页面ID</docno>
<contenttitle>页面标题</Contenttitle>
<content>页面内容</content>
</doc>

得到的效果图如下

以下是对于程序的解释：

主程序调用如下：在设置好数据库配置后，只是简单地添加语料库目录就可以将搜狗语料导入到数据库中。

public static void main(String args[]) {
        SogouCSProcessor pro = new SogouCSProcessor();
        resultToFile("data/Sogou.txt");
        pro.processor("D:\\语料库\\Sogou语料库\\SogouCA.reduced");
    }

SogouCSProcessor类，用于对数据处理有一个整体的控制，其中主要的正则匹配处理在getDocBean()函数中，抽取出来以后再对其

去掉<>和<>内部的元素.

View Code

package zju.dawn.ai.corpus.sogou;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SogouCSProcessor {
private List<DocBean> docList = new ArrayList<DocBean>();
// 匹配表达式.
private String docPatternString = "(?s)<doc>.*?</doc>";
private String noPatternString = "<docno>(.*?)</docno>";
private String urlPatternString = "<url>(.*?)</url>";
private String titlePatternString = "<contenttitle>(.*?)</contenttitle>";
private String contentPatternString = "<content>(.*?)</content>";
private Pattern docPattern = Pattern.compile(docPatternString);
private Pattern noPattern = Pattern.compile(noPatternString);
private Pattern urlPattern = Pattern.compile(urlPatternString);
private Pattern titlePattern = Pattern.compile(titlePatternString);
private Pattern contentPattern = Pattern.compile(contentPatternString);
/**
* 使用正则表达式匹配产生出文档DocBean.
*
* @param rawText
*/
private void getDocBean(String rawText) {
docList.clear();
boolean completed = true;
String no = null;
String url = null;
String title = null;
String content = null;
String category = null;
String docMatchString = "";
Matcher docMatcher = docPattern.matcher(rawText);
while (docMatcher.find()) {
DocBean docbean = new DocBean();
docMatchString = docMatcher.group();
Matcher noMatcher = noPattern.matcher(docMatchString);
Matcher urlMatcher = urlPattern.matcher(docMatchString);
Matcher titleMatcher = titlePattern.matcher(docMatchString);
Matcher contentMatcher = contentPattern.matcher(docMatchString);
if (noMatcher.find() && urlMatcher.find() && titleMatcher.find()
&& contentMatcher.find()) {
no = reProcess(noMatcher.group());
url = reProcess(urlMatcher.group());
title = reProcess(titleMatcher.group());
content = reProcess(contentMatcher.group());
docbean.setNo(no);
docbean.setUrl(url);
docbean.setTitle(title);
docbean.setContent(content);
} else {
completed = false;
}
if (completed) {
category = CategoryDetector.detectCategory(url);
docbean.setCategory(category);
//displayDocBean(docbean);
docList.add(docbean);
}
}
System.out.println("txt contains " + docList.size() + " docs!");
}
/**
* 展示docbean的信息.
*
* @param docbean
*/
private void displayDocBean(DocBean docbean) {
// System.out.println("docbeanNO:" + docbean.getNo());
System.out.println("docbeanURL:" + docbean.getUrl());
// System.out.println("docbeanTitle:" + docbean.getTitle());
// System.out.println("docbeanContent:" + docbean.getContent());
//System.out.println("docbeanCategory:" + docbean.getCategory());
}
/**
* 处理掉<doc></doc>这些信息.
*
* @param text
* @return 去噪信息.
*/
private String reProcess(String text) {
int beginIndex = text.indexOf(">");
int endIndex = text.lastIndexOf("<");
return text.substring(beginIndex + 1, endIndex).trim();
}
/**
* 处理文件目录下的所有指定文件.将这些doc导入到数据库中.
*
* @param fileDir
*/
public void processor(String fileDir) {
List<String> filePath = FileListViewer
.getFileList(fileDir, "txt", true);
System.out.println("文件数目:" + filePath.size());
Iterator<String> iter = filePath.iterator();
while (iter.hasNext()) {
String path = iter.next();
System.out.println("===========================================");
System.out.println(path);
String rawText = FileUtil.readEntire(path);
// System.out.println(rawText);
this.getDocBean(rawText);
SogouDBManager.savaDocBeanToDB(docList);
}
}
/**
* 将输出结果输出到文件中.
*
* @param path
* @throws FileNotFoundException
*/
public static void resultToFile(String path) {
File test = new File(path);
PrintStream printStream = null;
try {
printStream = new PrintStream(new FileOutputStream(test));
} catch (FileNotFoundException e) {
System.out.println("文件没有发现!");
e.printStackTrace();
}
PrintStream out = printStream;
System.setOut(out);
}
public static void main(String args[]) {
SogouCSProcessor pro = new SogouCSProcessor();
resultToFile("data/Sogou.txt");
pro.processor("D:\\语料库\\Sogou语料库\\SogouCA.reduced");
}
}

类别探测其如下：找出与category对应的类别，注意：在这里对国内，国际和社会分类都归于社会分类。原因有2:

1是给的一些网站的语料库无法根据url区分出这三个主题，2：这三个主题内容本来就比较相似，区分度不大。

View Code

package zju.dawn.ai.corpus.sogou;

public final class CategoryDetector {

    private static String category = null;

    public static String detectCategory(String url) {
        // xinhuanet.
        if (url.startsWith("http://www.xinhuanet.com/auto/")) {
            category = "car";
        } else if (url.startsWith("http://www.xinhuanet.com/fortune")) {
            category = "finance";
        } else if (url.startsWith("http://www.xinhuanet.com/internet/")) {
            category = "IT";
        } else if (url.startsWith("http://www.xinhuanet.com/health/")) {
            category = "health";
        } else if (url.startsWith("http://www.xinhuanet.com/sports")) {
            category = "sports";
        } else if (url.startsWith("http://www.xinhuanet.com/travel")) {
            category = "travel";
        } else if (url.startsWith("http://www.xinhuanet.com/edu")) {
            category = "education";
        } else if (url.startsWith("http://www.xinhuanet.com/employment")) {
            category = "employment";
        } else if (url.startsWith("http://www.xinhuanet.com/life")) {
            category = "culture";
        } else if (url.startsWith("http://www.xinhuanet.com/mil")) {
            category = "military";
        } else if (url.startsWith("http://www.xinhuanet.com/olympics/")) {
            category = "olympics";
        } else if (url.startsWith("http://www.xinhuanet.com/society")
                || url.startsWith("http://www.xinhuanet.com/local/")
                || url.startsWith("http://www.xinhuanet.com/world")) {
            category = "society";
        } else if (url.startsWith("http://www.xinhuanet.com/house")) {
            category = "house";
        } else if (url.startsWith("http://www.xinhuanet.com/ent")) {
            category = "ent";
        } else if (url.startsWith("http://www.xinhuanet.com/lady")) {
            category = "lady";
        } else if (url.startsWith("http://www.xinhuanet.com/school")) {
            category = "school";
        }
        // china
        if (url.startsWith("http://auto.china.com/")) {
            category = "car";
        } else if (url.startsWith("http://caifu.china.com/")) {
            category = "finance";
        } else if (url.startsWith("http://tech.china.com/zh_cn/news/net/")) {
            category = "IT";
        } else if (url.startsWith("http://health.china.com/")) {
            category = "health";
        } else if (url.startsWith("http://sports.china.com/")) {
            category = "sports";
        } else if (url.startsWith("http://goo66.china.com/")) {
            category = "travel";
        } else if (url.startsWith("http://edu.533.com/")) {
            category = "education";
        } else if (url.startsWith("http://culture.china.com/")) {
            category = "culture";
        } else if (url.startsWith("http://military.china.com/")) {
            category = "military";
        } else if (url.startsWith("http://2008.china.com/")) {
            category = "olympics";
        } else if (url.startsWith("http://news.china.com/zh_cn/social/")
                || url.startsWith("http://news.china.com/zh_cn/domestic/")
                || url.startsWith("http://news.china.com/zh_cn/international/")) {
            category = "society";
        } else if (url.startsWith("http://china.soufun.com/")) {
            category = "house";
        } else if (url.startsWith("http://fun.china.com/zh_cn/star/")) {
            category = "ent";
        } else if (url.startsWith("http://meirong.533.com/")) {
            category = "lady";
        } else if (url.startsWith("http://edu.533.com/news/xiaoyuan/")) {
            category = "school";
        }
        // sina.com.cn
        if (url.startsWith("http://auto.sina.com.cn/")) {
            category = "car";
        } else if (url.startsWith("http://finance.sina.com.cn/")) {
            category = "finance";
        } else if (url.startsWith("http://tech.sina.com.cn/it/")) {
            category = "IT";
        } else if (url.startsWith("http://sina.kangq.com/")) {
            category = "health";
        } else if (url.startsWith("http://sports.sina.com.cn/")) {
            category = "sports";
        } else if (url.startsWith("http://tour.sina.com.cn/")) {
            category = "travel";
        } else if (url.startsWith("http://edu.sina.com.cn/j/")) {
            // employment和education不能互换.
            category = "employment";
        } else if (url.startsWith("http://edu.sina.com.cn/")) {
            category = "education";
        } else if (url.startsWith("http://cul.book.sina.com.cn/")) {
            category = "culture";
        } else if (url.startsWith("http://mil.news.sina.com.cn/")) {
            category = "military";
        } else if (url.startsWith("http://2008.sina.com.cn/")) {
            category = "olympics";
        } else if (url.startsWith("http://news.sina.com.cn/society/")
                || url.startsWith("http://news.sina.com.cn/china/")
                || url.startsWith("http://news.sina.com.cn/world/")) {
            category = "society";
        } else if (url.startsWith("http://house.sina.com.cn/")) {
            category = "house";
        } else if (url.startsWith("http://ent.sina.com.cn/")) {
            category = "ent";
        } else if (url.startsWith("http://eladies.sina.com.cn/")) {
            category = "lady";
        } else if (url.startsWith("http://edu.sina.com.cn/y/")) {
            category = "school";
        }
        // 163.com
        if (url.startsWith("http://auto.163.com/")) {
            category = "car";
        } else if (url.startsWith("http://money.163.com/")) {
            category = "finance";
        } else if (url.startsWith("http://tech.163.com/it/")) {
            category = "IT";
        } else if (url.startsWith("http://163.39.net/")) {
            category = "health";
        } else if (url.startsWith("http://sports.163.com/")) {
            category = "sports";
        } else if (url.startsWith("http://war.163.com/")) {
            category = "military";
        } else if (url.startsWith("http://2008.163.com/")) {
            category = "olympics";
        } else if (url.startsWith("http://news.163.com/shehui/")
                || url.startsWith("http://news.163.com/domestic/")
                || url.startsWith("http://news.163.com/world/")) {
            category = "society";
        } else if (url.startsWith("http://house.163.com/")) {
            category = "house";
        } else if (url.startsWith("http://ent.163.com/")) {
            category = "ent";
        } else if (url.startsWith("http://lady.163.com/")) {
            category = "lady";
        }
        // qq.com
        if (url.startsWith("http://auto.qq.com/")) {
            category = "car";
        } else if (url.startsWith("http://finance.qq.com/")) {
            category = "finance";
        } else if (url.startsWith("http://tech.qq.com/a/")) {
            category = "IT";
        } else if (url.startsWith("http://sports.qq.com/")) {
            category = "sports";
        } else if (url.startsWith("http://edu.qq.com/job/")) {
            // employment和education不能互换.
            category = "employment";
        } else if (url.startsWith("http://edu.qq.com/")) {
            category = "education";
        } else if (url.startsWith("http://cul.qq.com/")) {
            category = "culture";
        } else if (url.startsWith("http://mil.qq.com/")) {
            category = "military";
        } else if (url.startsWith("http://news.qq.com/a/")) {
            category = "society";
        } else if (url.startsWith("http://2008.qq.com/")) {
            category = "olympics";
        } else if (url.startsWith("http://house.qq.com/")) {
            category = "house";
        } else if (url.startsWith("http://ent.qq.com/")) {
            category = "ent";
        } else if (url.startsWith("http://lady.qq.com/")) {
            category = "lady";
        } else if (url.startsWith("http://campus.qq.com/")) {
            category = "school";
        }
        // sohu.com
        if (url.startsWith("http://auto.sohu.com/")) {
            category = "car";
        } else if (url.startsWith("http://business.sohu.com/")) {
            category = "finance";
        } else if (url.startsWith("http://it.sohu.com/")) {
            category = "IT";
        } else if (url.startsWith("http://health.sohu.com/")) {
            category = "health";
        } else if (url.startsWith("http://sports.sohu.com/")) {
            category = "sports";
        } else if (url.startsWith("http://travel.sohu.com/")) {
            category = "travel";
        } else if (url.startsWith("http://learning.sohu.com/")) {
            category = "education";
        } else if (url.startsWith("http://career.sohu.com/")) {
            category = "employment";
        } else if (url.startsWith("http://cul.sohu.com/")) {
            category = "culture";
        } else if (url.startsWith("http://news.sohu.com/")) {
            category = "society";
        } else if (url.startsWith("http://mil.news.sohu.com/")) {
            category = "military";
        } else if (url.startsWith("http://2008.sohu.com/")) {
            category = "olympics";
        } else if (url.startsWith("http://house.sohu.com/")) {
            category = "house";
        } else if (url.startsWith("http://yule.sohu.com/")) {
            category = "ent";
        } else if (url.startsWith("http://women.sohu.com/")) {
            category = "lady";
        }
        return category;
    }

    public static void main(String args[]) {
        category = CategoryDetector.detectCategory("http://edu.sina.com.cn/");
        System.out.println(category);
    }
}

DocBean类如下：

View Code

package zju.dawn.ai.corpus.sogou;

public class DocBean {

    private Integer docid;
    private String no;
    private String url;
    private String title;
    private String content;
    private String category;

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public Integer getDocid() {
        return docid;
    }

    public void setDocid(Integer docid) {
        this.docid = docid;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getNo() {
        return no;
    }

    public void setNo(String no) {
        this.no = no;
    }
}

其他类都是一些支持类，在此不再一一详述。代码如下：

FileListViewer类：

View Code

package zju.dawn.ai.corpus.sogou;

import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * 读取目录及子目录下指定文件名的路径 并放到一个数组里面返回遍历
 * 
 * @author DAWN
 * 
 */
public final class FileListViewer {

    public static List<String> fileList = new ArrayList<String>();

    /**
     * 获取path目录下所有以suffix为后缀名的文件路径列表,(可以递归遍历）.
     * 
     * @param path
     *            文件路径
     * @param suffix
     *            后缀名
     * @param isdepth
     *            是否遍历子目录
     * @return
     */
    public static List<String> getFileList(String fileDir, String suffix,
            boolean isdepth) {
        File file = new File(fileDir);
        FileListViewer.generateFilePathList(file, suffix, isdepth);
        return fileList;
    }

    private static void generateFilePathList(File f, String suffix,
            boolean isdepth) {
        // 是目录，同时需要遍历子目录
        if (f.isDirectory() && isdepth == true) {
            File[] t = f.listFiles();
            for (int i = 0; i < t.length; i++) {
                generateFilePathList(t[i], suffix, isdepth);
            }
        } else {
            String filePath = f.getAbsolutePath();
            if (suffix != null && suffix != "") {
                // 最后一个.(即后缀名前面的.)的索引
                int begIndex = filePath.lastIndexOf(".");
                String tempsuffix = "";
                if (begIndex != -1)// 有后缀名.
                {
                    tempsuffix = filePath.substring(begIndex + 1, filePath
                            .length());
                }
                if (tempsuffix.equals(suffix)) {// 满足后缀匹配
                    fileList.add(filePath);
                }
            } else {
                // 后缀名为null则为所有文件
                fileList.add(filePath);
            }
        }
    }

    public static void main(String[] args) {
        List<String> arrayList = FileListViewer.getFileList(
                "D:\\语料库\\Sogou语料库\\SogouCA.reduced", "txt", true);
        if (arrayList.isEmpty()) {
            System.out.println("没有找到符号后缀要求的文件");
        } else {
            String message = "";
            message += "符号要求的文件数：" + arrayList.size() + "\r\n";
            System.out.println(message);

            for (Iterator<String> i = arrayList.iterator(); i.hasNext();) {
                String temp = (String) i.next();
                System.out.println(temp);
                message += temp + "\r\n";
            }
        }
    }
}

FileUtil类：

View Code

package zju.dawn.ai.corpus.sogou;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

/**
 * 文件相关的处理函数
 * 
 * @author DAWN
 * 
 */
public class FileUtil {

    /**
     * 用GBK编码读取一个文件，返回文件内容为一个字符串
     * 
     * @param file
     *            文件名
     * @return 一个字符串代表文件内容
     * @throws IOException
     */
    public static String readEntire(String file) {
        return readEntire(file, "gbk");
    }

    /**
     * 用指定编码读取一个文件，返回文件内容为一个字符串
     * 
     * @param file
     *            文件名
     * @param encoding
     *            编码
     * @return 一个字符串代表文件内容
     * @throws IOException
     */
    public static String readEntire(String file, String encoding) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(file), encoding));
        } catch (UnsupportedEncodingException e) {
            System.out.println("编码格式不支持!");
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            System.out.println("文件没有发现 !");
            e.printStackTrace();
        }
        StringBuilder sb = new StringBuilder();
        String line = null;
        try {
            while (true) {
                line = reader.readLine();
                if (line == null)
                    break;
                sb.append(line).append("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                System.out.println("输入流没有关闭!");
                e.printStackTrace();
            }
        }
        return sb.toString();
    }

    /**
     * 读取一个文件，返回文件内容为一个字符串.(默认情况下使用GBK，中英文支持).
     * 
     * @param file
     *            文件对象.
     * @return 文件内容的字符串.
     */
    public static String readEntire(File file) throws IOException {
        StringBuffer sb = new StringBuffer();
        String line;
        BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), "GBK"));
        while ((line = br.readLine()) != null) {
            sb = sb.append(line).append("\n\r");
        }
        br.close();
        String text = sb.toString();
        return text;
    }

    /**
     * 读取文件路径，返回文件内容为一个字符串.
     * 
     * @param dir
     *            文件所在父路径.
     * @param filename
     *            文件名称.
     * @return 文件内容的字符串.
     */
    public static String getFileContentString(String dir, String filename)
            throws IOException {
        File file = new File(dir, filename);
        String text = readEntire(file);
        return text;
    }

    /**
     * 返回文件夹fileDir下面的文件名称String[].
     * 
     * @param fileDir
     */
    public static String[] getFileList(String fileDir) throws IOException {
        File files = new File(fileDir);
        if (files.list() == null) {
            System.out.println("files.list()指针为空！");
        }
        if (files.list().length == 0)
            System.out.println("文件夹" + fileDir + "下面没有文件");
        return files.list();

    }

    /**
     * 判断一个文件(夹)是否存在.
     * 
     * @param filepath
     * @return true=存在 false=不存在
     */
    public static boolean isFileExisted(String filepath) {
        if (!new File(filepath).exists()) {
            System.out.println("File:" + filepath + " did not exist!");
            return false;
        }
        return true;
    }

}

DBUtil类和SogouDBManager类：

View Code

package zju.dawn.ai.corpus.sogou;

import java.sql.*;

public class DBUtil {
    public static Connection getConnection() {
        try {
            // 第一步：加载MySQL的JDBC的驱动
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            System.out.println("类com.mysql.jdbc.Driver没有发现!");
            e.printStackTrace();
        }

        // 取得连接的url,能访问MySQL数据库的用户名,密码
        // String url = "jdbc:mysql://localhost/copynews";
        String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
        String username = "root";
        String password = "root";

        // 第二步：创建与MySQL数据库的连接类的实例
        Connection con = null;
        try {
            con = DriverManager.getConnection(url, username, password);
        } catch (SQLException e) {
            System.out.println("获取数据库连接失败!");
            e.printStackTrace();
        }
        return con;
    }

    public static Connection getConnection(String url) {

        try {
            // 第一步：加载MySQL的JDBC的驱动
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            System.out.println("类com.mysql.jdbc.Driver没有发现!");
            e.printStackTrace();
        }
        // 取得能访问MySQL数据库的用户名,密码；
        String username = "root";
        String password = "dawndon";
        Connection con = null;
        try {
            // 第二步：创建与MySQL数据库的连接类的实例
            con = DriverManager.getConnection(url, username, password);
        } catch (SQLException e) {
            System.out.println("获取数据库连接失败!");
            e.printStackTrace();
        }
        return con;
    }

    public static Connection getConnection(String url, String username,
            String password) {
        try {
            // 第一步：加载MySQL的JDBC的驱动
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            System.out.println("类com.mysql.jdbc.Driver没有发现!");
            e.printStackTrace();
        }
        // 取得能访问MySQL数据库的用户名,密码；
        Connection con = null;
        try {
            // 第二步：创建与MySQL数据库的连接类的实例
            con = DriverManager.getConnection(url, username, password);
        } catch (SQLException e) {
            System.out.println("获取数据库连接失败!");
            e.printStackTrace();
        }
        return con;
    }

    /**
     * 根据传入的SQL语句返回一个结果集合.
     * 
     * @param sql
     * @return
     * @throws Exception
     */
    public static ResultSet select(String sql) throws Exception {
        Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        try {
            conn = getConnection();
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);
            return rs;
        } catch (SQLException sqle) {
            throw new SQLException("select data exception:" + sqle.getMessage());
        } catch (Exception e) {
            throw new Exception("System e exception: " + e.getMessage());
        }
    }

    /**
     * 根据传入的SQL语句和数据库连接字符串返回一个查询结果集合.
     * 
     * @param sql
     *            sql语句.
     * @param constring
     *            数据库链接字符串.
     * @return ResultSet 结果集.
     * @throws Exception
     */
    public static ResultSet select(String sql, String constring)
            throws Exception {
        Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        try {
            conn = getConnection(constring);
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);
            return rs;
        } catch (SQLException sqle) {
            throw new SQLException("select data exception:" + sqle.getMessage());
        } catch (Exception e) {
            throw new Exception("System e exception: " + e.getMessage());
        }
    }

    /**
     * 根据传入的SQL语句向数据库增加一条记录
     * 
     * @param sql
     *            sql语句.
     * @throws Exception
     */
    public static void insert(String sql) throws Exception {
        Connection conn = null;
        PreparedStatement ps = null;
        try {
            conn = getConnection();
            ps = conn.prepareStatement(sql);
            ps.executeUpdate();
        } catch (SQLException sqle) {
            throw new Exception("insert data exception:" + sqle.getMessage());
        } finally {
            try {
                if (ps != null) {
                    ps.close();
                }
            } catch (Exception e) {
                throw new Exception("ps close exception: " + e.getMessage());
            }
        }
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (Exception e) {
            throw new Exception("connection close exception: " + e.getMessage());
        }
    }
    /**
     * 根据传入的SQL语句和链接函数向数据库增加一条记录
     * 
     * @param sql
     *            sql语句.
     * @throws Exception
     */
    public static void insert(Connection conn,String sql) throws Exception {
        PreparedStatement ps = null;
        try {
            ps = conn.prepareStatement(sql);
            ps.executeUpdate();
        } catch (SQLException sqle) {
            throw new Exception("insert data exception:" + sqle.getMessage());
        } finally {
            try {
                if (ps != null) {
                    ps.close();
                }
            } catch (Exception e) {
                throw new Exception("ps close exception: " + e.getMessage());
            }
        }
    }
    /**
     * 根据传入的SQL语句更新数据库记录
     * 
     * @param sql
     *            sql语句.
     * @throws Exception
     */
    public static void update(String sql) throws Exception {
        Connection conn = null;
        PreparedStatement ps = null;
        try {
            conn = getConnection();
            ps = conn.prepareStatement(sql);
            ps.executeUpdate();
        } catch (SQLException sqle) {
            throw new Exception("update data exception:" + sqle.getMessage());
        } finally {
            try {
                if (ps != null) {
                    ps.close();
                }
            } catch (Exception e) {
                throw new Exception("ps close exception: " + e.getMessage());
            }
        }
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (Exception e) {
            throw new Exception("connection close exception: " + e.getMessage());
        }
    }

    /**
     * 根据传入的SQL语句删除一条数据库记录
     * 
     * @param sql
     *            sql语句.
     * @throws Exception
     */
    public static void delete(String sql) throws Exception {
        Connection conn = null;
        PreparedStatement ps = null;
        try {
            conn = getConnection();
            ps = conn.prepareStatement(sql);
            ps.executeUpdate();
        } catch (SQLException sqle) {
            throw new Exception("delete data exception:" + sqle.getMessage());
        } finally {
            try {
                if (ps != null) {
                    ps.close();
                }
            } catch (Exception e) {
                throw new Exception("ps close exception: " + e.getMessage());
            }
        }
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (Exception e) {
            throw new Exception("connection close exception: " + e.getMessage());
        }
    }
}

View Code

package zju.dawn.ai.corpus.sogou;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Iterator;
import java.util.List;

public class SogouDBManager {

    private static String url = "jdbc:mysql://localhost:3306/sogou?useUnicode=true&characterEncoding=UTF-8";
    private static String username = "root";
    private static String password = "";

    public static boolean savaDocBeanToDB(List<DocBean> docList) {
        Connection conn = DBUtil.getConnection(url, username, password);
        PreparedStatement ps = null;
        // 成功与否.
        boolean flag = false;
        // 查询字符串.
        String sqlString = "insert into sohunews_reduced (newsno,newstitle,newscontent,newsurl,category) values(?,?,?,?,?)";
        Iterator<DocBean> docIter = docList.iterator();
        DocBean docbean = null;        
        while (docIter.hasNext()) {
            docbean = docIter.next();
            try {
                ps = conn.prepareStatement(sqlString);
                ps.setString(1, docbean.getNo());
                ps.setString(2, docbean.getTitle());
                ps.setString(3, docbean.getContent());
                ps.setString(4, docbean.getUrl());
                ps.setString(5, docbean.getCategory());
                // 执行sql语句.
                flag = ps.execute();
            } catch (SQLException e) {
                System.out.println("插入newsbean时,Sql语句执行错误!");
                System.out.println("内容是:"+docbean.getContent());
                e.printStackTrace();
                return false;
            }
        }
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (Exception e) {
            System.out.println("connection close exception: " + e.getMessage());
        }
        return flag;
    }
}

posted @ 2011-05-25 20:27 Alex.XW.Wang 阅读(2133) 评论(1) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

雨龙啸梦

浙大读研中.....信息检索，信息抽取，情感分析(SA)....

整理搜狗2008版搜狗新闻分类语料库和全网新闻分类语料库--转存至mysql数据库

公告