朴素贝叶斯新闻分类系统

基于搜狗语料库,建立的一个新闻分类系统;类别包括:

classifierMap.put(0, "IT");
classifierMap.put(1, "体育");
classifierMap.put(2, "健康");
classifierMap.put(3, "军事");
classifierMap.put(4, "招聘");
classifierMap.put(5, "教育");
classifierMap.put(6, "文化");
classifierMap.put(7, "旅游");
classifierMap.put(8, "财经");

分词器:可选中科院分词工具或者IK;本人采用IK分词器,经测试发现它速度快、内存消耗低,训练数据时不会导致电脑死机;训练集是下载的搜狗新闻数据集,用于对新闻分类

算法步骤:

1. 首先下载IK分词器、搜狗新闻训练集和搜狗词典(词典对词进行了词性标注;出于内存、速度和准确率的考虑,本人只选用了名词)

2. 对训练集按新闻类别分别进行分词处理,并过滤掉词频低于10的词,以节省内存、提高速度;结果以文本形式保存,并以类别名作为文件名

3. 编写朴素贝叶斯分类函数,对输入文本进行分类处理,选择概率最大的作为分类类别

4. Web系统采用JSP+JavaBean+Servlet的架构,软件平台是新浪云;网址:http://naivebayes.sinaapp.com;如果无法访问,应该是服务器没有开启

使用方式:输入文本,并点击新闻分类;

主程序代码:

package com.sogou.servlet;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.servlet.RequestDispatcher;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.sogou.util.BayesUtil;

/**
 * Servlet implementation class BayesServlet
 */
@WebServlet("/bayes.do")
public class BayesServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /**
     * @see HttpServlet#HttpServlet()
     */
    public BayesServlet() {
        super();
        // TODO Auto-generated constructor stub
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse
     *      response)
     */
    protected void doGet(HttpServletRequest request,
            HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        this.doPost(request, response);
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse
     *      response)
     */
    @SuppressWarnings("unchecked")
    protected void doPost(HttpServletRequest request,
            HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        String newsText = request.getParameter("newsText");
        newsText = new String(newsText.getBytes("ISO8859-1"), "utf-8");
        ServletContext st = this.getServletContext();
        List<Map<String, Integer>> trainSets = (List<Map<String, Integer>>) st
                .getAttribute("trainSets");
        Map<Integer, String> classifierMap = (Map<Integer, String>) st
                .getAttribute("classifierMap");
        if (classifierMap == null) {
            classifierMap = new HashMap<Integer, String>();
            classifierMap.put(0, "IT");
            classifierMap.put(1, "体育");
            classifierMap.put(2, "健康");
            classifierMap.put(3, "军事");
            classifierMap.put(4, "招聘");
            classifierMap.put(5, "教育");
            classifierMap.put(6, "文化");
            classifierMap.put(7, "旅游");
            classifierMap.put(8, "财经");
            st.setAttribute("classifierMap", classifierMap);
        }
        BayesUtil bayes = new BayesUtil();
        if (trainSets == null) {
            String dirName = "D:/dataMing/bys";
            trainSets = bayes.loadTrainSet(dirName);
            st.setAttribute("trainSets", trainSets);
        }
        String classifier = bayes.bayesClassifierText(trainSets, newsText,
                classifierMap);
        System.out.println(classifier);
        request.setAttribute("classifier", classifier);
        RequestDispatcher rd = request.getRequestDispatcher("./index.jsp");
        rd.forward(request, response);
    }

}
package com.sogou.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Loads per-category word-count dictionaries and uses them to pick the most
 * likely category for a piece of text by summing log word probabilities.
 */
public class BayesUtil {

    /**
     * Dictionary key whose value is used as the denominator when turning a
     * word count into a probability (presumably the category's total word
     * count -- confirm against the tool that writes the dictionaries).
     */
    private static final String WORDS_COUNT_KEY = "wordsCount";

    /**
     * Loads every regular file in a directory as a word-count dictionary.
     *
     * <p>Each non-blank line must have the form {@code word<TAB>count}. Files
     * are processed in sorted path order so that the position of a dictionary
     * in the returned list is deterministic; callers use that position as the
     * category index. Sub-directories are skipped.
     *
     * <p>NOTE(review): files are read with the platform default charset, as
     * before; confirm it matches the encoding the dictionaries were saved in.
     *
     * @param dirName path of the directory holding one dictionary file per category
     * @return one map per file, in sorted file order
     * @throws IllegalArgumentException if {@code dirName} cannot be listed as a directory
     * @throws IllegalStateException if a file cannot be read or a line has no tab-separated count
     * @throws NumberFormatException if a count is not a valid integer
     */
    public List<Map<String, Integer>> loadTrainSet(String dirName) {
        File[] files = new File(dirName).listFiles();
        if (files == null) {
            // listFiles() returns null for a missing path, a non-directory or an I/O error.
            throw new IllegalArgumentException(
                    "Training set directory cannot be listed: " + dirName);
        }
        // listFiles() gives no ordering guarantee, but list position is the category index.
        Arrays.sort(files);

        List<Map<String, Integer>> list = new ArrayList<>(files.length);
        for (File file : files) {
            if (!file.isFile()) {
                continue;
            }
            list.add(loadDictionary(file));
        }
        return list;
    }

    /**
     * Reads one {@code word<TAB>count} dictionary file into a map.
     *
     * @param file the dictionary file to read
     * @return the word-to-count map for that file
     */
    private static Map<String, Integer> loadDictionary(File file) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue; // tolerate blank lines, e.g. a trailing newline
                }
                String[] values = line.split("\t");
                if (values.length < 2) {
                    throw new IllegalStateException("Malformed line in " + file
                            + " (expected word<TAB>count): " + line);
                }
                counts.put(values[0], Integer.parseInt(values[1]));
            }
        } catch (IOException e) {
            // Skipping the file would shift the index of every later category,
            // so fail loudly instead of returning a misaligned list.
            throw new IllegalStateException(
                    "Failed to read training dictionary " + file, e);
        }
        return counts;
    }

    /**
     * Classifies a piece of text.
     *
     * <p>The text is segmented with IK; tokens shorter than two characters
     * are dropped. For each category the score is the sum over tokens of
     * {@code log(count / wordsCount)}, using a count of 1 for tokens the
     * category dictionary does not contain. The category with the highest
     * score wins; ties go to the lowest index. With no usable tokens every
     * score is 0 and index 0 is chosen.
     *
     * @param trainSets per-category dictionaries, indexed by category id
     * @param content the text to classify; {@code null} is treated as empty
     * @param textClassifier map from category index to category name
     * @return the name mapped to the best-scoring index, or {@code null} if
     *         {@code trainSets} is {@code null} or empty or the index has no name
     * @throws IllegalStateException if a dictionary lacks a positive
     *         {@code wordsCount} entry, or if segmentation fails
     */
    public String bayesClassifierText(List<Map<String, Integer>> trainSets,
            String content, Map<Integer, String> textClassifier) {
        if (trainSets == null || trainSets.isEmpty()) {
            return null;
        }
        List<String> tokens = tokenize(content);

        int bestIndex = 0;
        double bestScore = logLikelihood(trainSets.get(0), tokens);
        for (int i = 1; i < trainSets.size(); i++) {
            double score = logLikelihood(trainSets.get(i), tokens);
            if (score > bestScore) {
                bestIndex = i;
                bestScore = score;
            }
        }
        return textClassifier.get(bestIndex);
    }

    /**
     * Segments text with IK (smart mode) and keeps tokens of length two or more.
     *
     * @param content the text to segment; may be {@code null}
     * @return the retained tokens in order of appearance
     */
    private static List<String> tokenize(String content) {
        List<String> tokens = new ArrayList<String>();
        if (content == null) {
            return tokens;
        }
        IKSegmenter segmenter = new IKSegmenter(new StringReader(content), true);
        try {
            Lexeme lexeme;
            while ((lexeme = segmenter.next()) != null) {
                String text = lexeme.getLexemeText();
                if (text.length() >= 2) {
                    tokens.add(text);
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException("Failed to segment input text", e);
        }
        return tokens;
    }

    /**
     * Sums {@code log(count / wordsCount)} over the tokens for one category.
     *
     * <p>The sum is kept in a {@code double}: accumulating into a
     * {@code long} would truncate the fractional part on every addition.
     *
     * @param classCounts the category's word-count dictionary
     * @param tokens the tokens of the text being classified
     * @return the log-likelihood score, or 0 when there are no tokens
     */
    private static double logLikelihood(Map<String, Integer> classCounts,
            List<String> tokens) {
        if (tokens.isEmpty()) {
            return 0.0;
        }
        Integer total = classCounts.get(WORDS_COUNT_KEY);
        if (total == null || total <= 0) {
            throw new IllegalStateException("Training set has no positive '"
                    + WORDS_COUNT_KEY + "' entry");
        }
        double score = 0.0;
        for (String token : tokens) {
            Integer count = classCounts.get(token);
            // Unseen words count as 1 so the logarithm stays finite.
            double numerator = (count != null) ? count : 1;
            score += Math.log(numerator / total);
        }
        return score;
    }

    /**
     * Classifies the contents of a text file. Not implemented: the method
     * currently does nothing.
     *
     * @param fileName path of the file to classify
     */
    public void bayesClassifierFile(String fileName) {

    }
}

 

posted @ 2014-07-07 14:54  曹守鑫  阅读(800)  评论(0编辑  收藏  举报