文本倾向性分析

package test;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;



/**
 * Lexicon-based opinion (sentiment) analyser.
 *
 * <p>Three word lists are read from the database by {@link #OpinionAnalyser()}.
 * A piece of text is scored by looking for opinion words and for
 * "modifier + opinion word" pairs; the per-sentence results are accumulated in
 * {@link #posCount} and {@link #negCount}, and {@link #getPolar()} reports the
 * overall polarity.
 *
 * <p>Instances hold mutable public state and perform no synchronisation.
 */
public class OpinionAnalyser {
    /** Opinion word list (filled from table {@code twordlist}). */
    public Vector <Word> words=new Vector <Word>();
    /** Modifier word list (filled from table {@code twordlist_xs}). */
    public Vector <Word> adjectives=new Vector <Word>();
    /** Description word list (filled from table {@code twordlist_ms}); not used for scoring in this class. */
    public Vector <Word> descriptions=new Vector <Word>();
    /** Weighted tally of positive hits from the last {@code opinion}/{@code shortTextOpinion} run. */
    public int posCount;
    /** Weighted tally of negative hits from the last {@code opinion}/{@code shortTextOpinion} run. */
    public int negCount;

    // Database connection settings passed to ConnDB.
    // NOTE(review): connection details are hard-coded; consider moving them to external configuration.
    static String SERVER="59.77.233.*";
    static String USER="";
    static String PASSWORD="";
    static String DATABASE="skycent";

    /** Weight of negative evidence; 2 means a negative hit counts twice as much as a positive one. */
    static int NEG_WEIGHT=2;
    /** Weight applied to a hit found by {@link #shortTextOpinion(Set, String)} (title or short text). */
    static int TITLE_WEIGHT=10;

    /** Parses a decimal integer; thin wrapper around {@link Integer#parseInt(String)}. */
    private static int atoi(String s)
    {
        return Integer.parseInt(s);
    }

    /**
     * Loads the three word lists from the database and resets both counters.
     *
     * <p>Despite its name this is an ordinary method, not a constructor (it
     * declares a {@code void} return type), so it must be called explicitly
     * after {@code new OpinionAnalyser()}. Loaded words are appended to the
     * existing lists.
     *
     * @throws SQLException if a query cannot be prepared or executed, or a row cannot be read
     */
    public void OpinionAnalyser() throws SQLException
    {
        ConnDB conndb = new ConnDB(SERVER, USER, PASSWORD, DATABASE);
        try
        {
            conndb.executeUpdate("SET NAMES 'utf8mb4'");

            // Opinion words
            loadWordTable(conndb, "select word,polar,weight from twordlist",
                    "twordlist", "polar", "weight", words);
            // Description words (no weight column; weight is stored as 0)
            loadWordTable(conndb, "select word,type from twordlist_ms",
                    "twordlist_ms", "type", null, descriptions);
            // Modifier words
            loadWordTable(conndb, "select word,polar,weight from twordlist_xs",
                    "twordlist_xs", "polar", "weight", adjectives);

            posCount=0;
            negCount=0;
        }
        finally
        {
            // Release the connection even when loading fails.
            conndb.close();
        }
    }

    /**
     * Runs one word-list query and appends a {@code Word} per row to {@code target}.
     *
     * <p>Rows are consumed with a single forward pass; the result set is never
     * rewound, so a forward-only result set is sufficient. Query errors are
     * propagated instead of being swallowed.
     *
     * @param conndb       open database wrapper
     * @param sql          select statement returning a {@code word} column
     * @param tableName    table name used in the "no words" console message
     * @param polarColumn  column holding the integer stored as the word's polarity
     * @param weightColumn column holding the integer weight, or {@code null} to use 0
     * @param target       list receiving the loaded words
     * @throws SQLException if the query fails or a row cannot be read
     */
    private static void loadWordTable(ConnDB conndb, String sql, String tableName,
            String polarColumn, String weightColumn, Vector<Word> target) throws SQLException
    {
        PreparedStatement stmt = null;
        ResultSet rs = null;
        try
        {
            stmt = conndb.getConnection().prepareStatement(sql);
            rs = stmt.executeQuery();

            int loaded = 0;
            while (rs.next())
            {
                // Columns are read in the same order as before: polarity, weight, word.
                int polar = atoi(rs.getString(polarColumn));
                int weight = (weightColumn == null) ? 0 : atoi(rs.getString(weightColumn));
                target.addElement(new Word(rs.getString("word"), polar, weight));
                loaded++;
            }
            if (loaded == 0)
            {
                System.out.println(tableName + "没有词!");
            }
        }
        finally
        {
            try
            {
                if (rs != null)
                {
                    rs.close();
                }
            }
            finally
            {
                if (stmt != null)
                {
                    stmt.close();
                }
            }
        }
    }

    /**
     * Computes the opinion score of one sentence.
     *
     * <p>For every opinion word contained in the sentence: if one or more
     * "modifier + word" pairs occur, each pair contributes
     * {@code wordWeight * modifierWeight * wordPolar * modifierPolar}
     * (multiplied by {@link #NEG_WEIGHT} when that product is not positive);
     * otherwise the word alone contributes {@code weight * polar}, multiplied
     * by {@link #NEG_WEIGHT} for polarity -1. Words with any other polarity
     * contribute nothing on their own.
     *
     * @param sentence text to score; {@code null} scores 0
     * @return the accumulated score (positive, negative or 0)
     */
    public int sentenceScore(String sentence)
    {
        if (sentence == null)
        {
            return 0;
        }

        int opinionScore = 0;
        for (Word word : words)
        {
            String token = word.getWord();
            if (sentence.indexOf(token) == -1)
            {
                continue;
            }

            boolean pairFound = false;
            for (Word adjective : adjectives)
            {
                if (sentence.indexOf(adjective.getWord() + token) == -1)
                {
                    continue;
                }
                pairFound = true;
                int pairScore = word.getWeight() * adjective.getWeight()
                        * word.getPolar() * adjective.getPolar();
                opinionScore += (pairScore > 0) ? pairScore : pairScore * NEG_WEIGHT;
            }

            // No modifier in front of the word: use the word's own weight.
            if (!pairFound)
            {
                if (word.getPolar() == 1)
                {
                    opinionScore += word.getWeight() * word.getPolar();
                }
                else if (word.getPolar() == -1)
                {
                    opinionScore += word.getWeight() * word.getPolar() * NEG_WEIGHT;
                }
            }
        }
        return opinionScore;
    }

    /** Counts how many of the given keywords occur in {@code text}. */
    private static int countKeywordHits(Set<String> keyword, String text)
    {
        int hits = 0;
        for (String kw : keyword)
        {
            if (text.indexOf(kw) != -1)
            {
                hits++;
            }
        }
        return hits;
    }

    /**
     * Computes the opinion tallies for a regular news item.
     *
     * <p>The title is scored first via {@link #shortTextOpinion(Set, String)}
     * (which resets the counters). The body is then split on single spaces
     * into a set of distinct sentences; every (keyword, sentence) pair where
     * the sentence contains the keyword adds 1 to {@link #posCount} when the
     * sentence scores positive, or {@link #NEG_WEIGHT} to {@link #negCount}
     * when it scores negative. Every keyword is checked against every
     * sentence.
     *
     * @param keyword keywords of interest
     * @param text    body text; {@code null} means only the title is considered
     * @param title   title text
     */
    public void opinion(Set<String> keyword,String text,String title)
    {
        posCount=0;
        negCount=0;
        System.out.println("opinion");
        // Title contribution (also resets the counters).
        shortTextOpinion(keyword,title);

        if (keyword == null || text == null)
        {
            return;
        }

        Set<String> sentences = new HashSet<String>();
        for (String part : text.split(" "))
        {
            sentences.add(part);
        }

        for (String sentence : sentences)
        {
            int hits = countKeywordHits(keyword, sentence);
            if (hits == 0)
            {
                continue;
            }
            // Score once per sentence; the score does not depend on the keyword.
            int value = sentenceScore(sentence);
            if (value > 0)
            {
                posCount += hits;
            }
            else if (value < 0)
            {
                negCount += hits * NEG_WEIGHT;
            }
        }
    }

    /**
     * Computes the opinion tallies for a short text such as a microblog post.
     *
     * <p>Resets both counters, then for every keyword contained in the text
     * adds {@link #TITLE_WEIGHT} to {@link #posCount} when the text scores
     * positive, or {@code TITLE_WEIGHT * NEG_WEIGHT} to {@link #negCount} when
     * it scores negative. The text is scored as a whole with
     * {@link #sentenceScore(String)}.
     *
     * @param keyword keywords of interest; {@code null} leaves the counters at 0
     * @param text    text to analyse; {@code null} leaves the counters at 0
     */
    public void shortTextOpinion(Set<String> keyword,String text)
    {
        System.out.println("shortTextOpinion");
        posCount=0;
        negCount=0;

        if (keyword == null || text == null)
        {
            return;
        }

        int hits = countKeywordHits(keyword, text);
        if (hits == 0)
        {
            return;
        }

        int opinionScore = sentenceScore(text);
        if (opinionScore > 0)
        {
            posCount += hits * TITLE_WEIGHT;
        }
        else if (opinionScore < 0)
        {
            negCount += hits * TITLE_WEIGHT * NEG_WEIGHT;
        }
    }

    /**
     * Entry point that picks the analysis by media type.
     *
     * <p>{@code media == 3} (microblog) uses the short-text analysis on
     * {@code title} and ignores {@code text}; any other value runs the
     * regular news analysis on both.
     *
     * @param media   media type code
     * @param keyword keywords of interest
     * @param text    body text (unused when {@code media == 3})
     * @param title   title, or the whole post when {@code media == 3}
     */
    public void analyse(int media,Set<String> keyword,String text,String title)
    {
        if(media ==3)
        {
            System.out.println("media=3");
            shortTextOpinion(keyword,title);
        }
        else
        {
            System.out.println("media=1");
            opinion(keyword,text,title);
        }
    }

    /**
     * Returns the overall polarity from the current counters.
     *
     * @return 1 if {@link #posCount} is larger, -1 if {@link #negCount} is larger, 0 if they are equal
     */
    public int getPolar()
    {
        if(posCount>negCount)
            return 1;
        else if(negCount>posCount)
            return -1;
        else
            return 0;
    }

    /**
     * Small manual check: loads the word lists and prints the score of a sample sentence.
     *
     * @param args unused
     * @throws SQLException if the word lists cannot be loaded
     */
    public static void main(String[] args) throws SQLException
    {
        OpinionAnalyser a=new OpinionAnalyser();
        a.OpinionAnalyser();
        a.sentenceScore("好不好!");
        String str="心情很好";
        System.out.println("文本倾向性:"+a.sentenceScore(str));
    }


}

 

posted @ 2016-03-29 08:49  陈泽泽  阅读(1437)  评论(0编辑  收藏  举报