作者:finallyliuyu 转载使用等请注明出处

功能:根据文档集合大小,特征词数目,交叉验证折数的不同需求,快速生成Libsvm格式数据

数据Demo请见:新闻文本分类libsvm格式数据

预处理部分的框架图

此模块的各个子模块的功能我就不详细介绍了,与此系列博客K-means文本聚类系列(已经完成) 异曲同工。

下面开始给出各个子模块的代码:

# -*- coding: cp936 -*-
########################################################################3
#
#自动建立文件夹
#
########################################################################
        
def CreateFolders(path):
    '''Create ``path`` together with its ``class1`` and ``class2`` sub-folders.

    The sub-folders are addressed through joined paths instead of by
    temporarily changing the working directory, so the process-wide current
    directory is never modified and cannot be left pointing inside ``path``
    when one of the ``mkdir`` calls fails.

    path -- folder to create; its parent folder must already exist.

    Raises OSError when ``path`` (or one of the sub-folders) already exists
    or cannot be created.
    '''
    import os
    os.mkdir(path)
    for classname in ('class1', 'class2'):
        os.mkdir(os.path.join(path, classname))
def CreateAssist(toCalInfoGain):
    '''Create the working folder tree below D:/TextCategorization.

    toCalInfoGain -- the tree is only created when this flag equals 0;
                     for any other value the function just prints its
                     completion message.

    Five folders (corpus, testing, training, segmented, tsegmented) are
    created through CreateFolders and therefore each get a class1 and a
    class2 sub-folder; the VITdata and data folders and their nested
    ``data`` folders are created as plain directories.
    '''
    import os
    if toCalInfoGain == 0:
        class_roots = (
            r'D:\TextCategorization\corpus',
            r'D:\TextCategorization\testing',
            r'D:\TextCategorization\training',
            r'D:\TextCategorization\segmented',
            r'D:\TextCategorization\tsegmented',
        )
        for class_root in class_roots:
            CreateFolders(class_root)
        # Parents are listed before their nested "data" folders.
        plain_dirs = (
            r'D:\TextCategorization\VITdata',
            r'D:\TextCategorization\data',
            r'D:\TextCategorization\VITdata\data',
            r'D:\TextCategorization\data\data',
        )
        for plain_dir in plain_dirs:
            os.mkdir(plain_dir)

    print('finish,congratulations')
if __name__=="__main__":
    import os
    # CreateAssist only creates children of this folder, so it has to exist first.
    os.mkdir(r'D:\TextCategorization')
    # CreateAssist requires its toCalInfoGain flag; 0 is the value for which
    # it actually builds the folder tree.
    CreateAssist(0)
    

文档集分割:多少篇文章归为训练集,多少篇文章归为测试集。注意:参数N为整个文档集合(包括测试集和训练集)的每一类的文章数目。这里设置两个类别有相等的文章数目,即整个文档集合的文章数目为2N

# -*- coding: cp936 -*-
#此模块用于对文本语料库进行预处理
###################################################
#origidir:原语料库目录如E:\新闻语料\EntireTrainingSet\ClassFile\C000024
#destidir:目标语料库目录D:\corpus\class1
#N:需要移动的文本的个数
#####################################################

def CorpusFormation(origidir,destidir,N):
    '''Copy the files ``0.txt`` .. ``<N-1>.txt`` from origidir into destidir.

    origidir -- folder holding the numbered source documents
    destidir -- existing folder that receives the copies
    N        -- number of documents to copy; nothing is copied for N <= 0

    The source paths are built with os.path.join rather than a hard-coded
    backslash, so the function does not depend on the Windows separator.
    shutil.copy raises (IOError / OSError) when a numbered file is missing.
    '''
    import os
    import shutil
    for i in range(0, N):
        shutil.copy(os.path.join(origidir, str(i) + '.txt'), destidir)

#####################################################
        #移动语料库
#####################################################
def MoveCorpus(N,toCalInfoGain):
    '''Copy N documents of each of the two source classes into the corpus folders.

    N             -- number of documents taken from every class
    toCalInfoGain -- the copy only happens when this flag equals 0

    The work itself is delegated to CorpusFormation, once per class.
    '''
    if toCalInfoGain == 0:
        # (source class folder, corpus class folder) pairs, handled in order.
        copy_plan = (
            (r'E:\新闻语料\EntireTrainingSet\ClassFile\C000024',
             r'D:\TextCategorization\corpus\class1'),
            (r'E:\新闻语料\EntireTrainingSet\ClassFile\C000013',
             r'D:\TextCategorization\corpus\class2'),
        )
        for source_dir, target_dir in copy_plan:
            CorpusFormation(source_dir, target_dir, N)
    print('finish')

#####################################################################
#origidir:原语料库目录,如D:\corpus\class1
#destdir1:目标目录:如D:\training\class1
#destidir2:目标目录:如D:\testing\class1
#Vfold:几折交叉验证 count:已经进行了几次实验count=0,1,2,3
#N语料库的总规模
#####################################################################

def CorpusPartition(origidir,destdir1,destdir2,count,N,vfold=5):
    '''Split the numbered documents of one class between two folders.

    With ``step = N // vfold`` the files ``<i>.txt`` for
    ``count*step <= i < (count+1)*step`` are moved to destdir1; every entry
    that is still left in origidir afterwards is moved to destdir2.

    origidir -- folder holding the documents ``0.txt`` .. ``<N-1>.txt``
    destdir1 -- existing folder receiving the selected fold
    destdir2 -- existing folder receiving all remaining entries
    count    -- zero-based number of the fold to select
    N        -- number of documents of this class
    vfold    -- number of folds the class is divided into

    Floor division is spelled out with ``//`` so that the fold size stays an
    integer, and all paths are built with os.path.join instead of a mix of
    a hard-coded backslash and os.sep.
    '''
    import os
    import shutil
    step = N // vfold
    for i in range(count * step, (count + 1) * step):
        shutil.move(os.path.join(origidir, str(i) + '.txt'), destdir1)
    # Whatever was not selected above goes to the second destination.
    for name in os.listdir(origidir):
        shutil.move(os.path.join(origidir, name), destdir2)

    
def moveAccordingPartition(N,count,toCalInfoGain):
    '''Distribute both corpus classes over the training and testing folders.

    N             -- number of documents per class
    count         -- zero-based fold number handed on to CorpusPartition
    toCalInfoGain -- nothing is done unless this flag equals 0

    For each class CorpusPartition is called with the corpus folder as
    source, the training folder as first and the testing folder as second
    destination; a progress line is printed after every class.
    '''
    if not toCalInfoGain == 0:
        return
    corpus_root = r'D:\TextCategorization\corpus'
    training_root = r'D:\TextCategorization\training'
    testing_root = r'D:\TextCategorization\testing'
    for i, classname in enumerate(('class1', 'class2')):
        CorpusPartition(corpus_root + '\\' + classname,
                        training_root + '\\' + classname,
                        testing_root + '\\' + classname,
                        count, N)
        print('第%s finish'%i)
        

    


if __name__=="__main__":
    N=500
    count=0
    # Optional first step (disabled): MoveCorpus(N,0) copies the source
    # documents into the corpus folders before they are partitioned.
    # moveAccordingPartition takes the toCalInfoGain flag as third argument;
    # 0 is the value for which it performs the partition.
    moveAccordingPartition(N,count,0)
    
# -*- coding: cp936 -*-
#此模块用于建立词袋子模型

def BagOfWordsConstruction(root,toCalInfoGain):
    '''Build the per-term statistics of the training corpus and pickle them.

    root          -- folder of segmented training documents, passed on to
                     TrainingFileProcess
    toCalInfoGain -- the statistics are only built when this flag equals 0

    The pickled object is a dictionary
        term -> [(document index, occurrences in that document), ...]
    whose lists are ordered by ascending document index; the document index
    is the position of the document in the list returned by
    TrainingFileProcess.  It is written to vocabularystatistics.dat in the
    VITdata folder and the vocabulary size is printed.

    Every document is counted in a single pass, so the cost is linear in the
    corpus size instead of (vocabulary size) x (corpus size).
    '''
    if toCalInfoGain == 0:
        import cPickle as mypickle
        rawVSMMatrix = TrainingFileProcess(root)
        vocabularystatistics = {}
        for index, rawVSM in enumerate(rawVSMMatrix):
            # Occurrences of each term inside this one document.
            occurrences = {}
            for word in rawVSM:
                occurrences[word] = occurrences.get(word, 0) + 1
            for word, count in occurrences.items():
                vocabularystatistics.setdefault(word, []).append((index, count))
        # Open the output only once the data exists, and always close it.
        file_dest = open(r'D:\TextCategorization\VITdata\vocabularystatistics.dat', 'w')
        try:
            mypickle.dump(vocabularystatistics, file_dest)
        finally:
            file_dest.close()
        print(len(vocabularystatistics))
    print('BagOfWordsConstructionFinish')
       
############################################################################################
#将文章内容变成词集合    
def FilePreProcess(rawtext):
    '''Turn a "|"-separated segmented document into its list of content tokens.

    rawtext -- document text whose tokens are separated by "|"

    Returns the tokens in document order (duplicates kept) after dropping
      * empty strings,
      * tokens listed in the stop-word file, and
      * tokens containing a byte 0xA3 followed by a byte in 0xA1-0xFE
        (presumably GB2312 full-width symbols/letters -- confirm).

    The stop words are held in a set so that each membership test is a hash
    lookup instead of a scan of the whole stop-word list.
    '''
    import re
    stopwords = set(FilterNoiseWord(r'C:\Python26\SVM\stopwords.txt'))
    fullwidth = re.compile('\xa3[\xa1-\xfe]')
    finalrawVSM = []
    for token in rawtext.split("|"):
        if token == '' or token in stopwords:
            continue
        if fullwidth.search(token) is None:
            finalrawVSM.append(token)
    return finalrawVSM

#################################################################################################
#训练集文档预处理
def TrainingFileProcess(root):
    '''Read every training document below root and tokenise it.

    root -- folder whose sub-folders hold the segmented documents

    The sub-folders and their file names are obtained from
    DataManager.DataManager (project helper; presumably it lists directory
    contents -- verify against its implementation).  Each file is read
    completely and handed to FilePreProcess.

    Returns a list with one token list per document, in the order the
    sub-folders and files were reported.
    '''
    from SVM import DataManager
    import cPickle as mypickle
    import os

    documents = []  # token lists of the whole document collection
    manager = DataManager.DataManager(root)
    for classfolder in manager.GetSubDir():
        classpath = root + os.sep + classfolder
        manager.SetFilePathsFromsubDir(classpath)
        for filename in manager.GetFilePaths():
            handle = open(classpath + os.sep + filename)
            content = handle.read()
            handle.close()
            documents.append(FilePreProcess(content))
    return documents

####################################################################################
#生成停用词列表
def FilterNoiseWord(stopword_file_name):
    '''Return the stop words stored one per line in stopword_file_name.

    stopword_file_name -- path of the stop-word text file

    The file content is split on "\\n" only, so a file ending with a newline
    yields a trailing empty string and an empty file yields [''].

    The file is opened with open() and closed in a finally clause, so the
    handle is released even when reading fails.
    '''
    f = open(stopword_file_name)
    try:
        stopword = f.read()
    finally:
        f.close()
    return stopword.split('\n')
    

   
if __name__=="__main__":
    # BagOfWordsConstruction requires its toCalInfoGain flag; 0 is the value
    # for which the statistics are actually built and pickled.
    BagOfWordsConstruction(r'D:\TextCategorization\segmented',0)
    

        
    

    
class IG:
    '''
       Information gain of every vocabulary term for a two-class corpus.

       The term statistics come from the pickled vocabularystatistics.dat
       file.  Both classes get the prior 0.5 and P(t|C) is estimated from
       term occurrence counts with add-one smoothing.
    '''
    #######################################################################################
    def __init__(self,n_size):
        '''Load the pickled term statistics and total them per class.

           n_size -- number of documents the statistics were built from;
                     documents with index < n_size // 2 are counted as
                     class 1, all others as class 2.

           Members (parallel lists, one entry per term):
           mykeys       -- the terms
           labelOneNums -- occurrences of the term summed over class-1 documents
           labelTwoNums -- occurrences of the term summed over class-2 documents
           probs, conProbs -- start empty; filled by GetTermProbability and
                              GetCategoryProbConditionTerm
        '''
        import cPickle as mypickle
        statfile = open(r'D:\TextCategorization\VITData\vocabularystatistics.dat')
        try:
            mydict = mypickle.load(statfile)
        finally:
            statfile.close()
        self.mykeys=[]
        self.labelOneNums=[]
        self.labelTwoNums=[]
        self.probs=[]
        self.conProbs=[]

        half = n_size // 2  # first document index that belongs to class 2
        for key, value in mydict.items():
            class1_count = 0  # occurrences of the term in class 1
            class2_count = 0  # occurrences of the term in class 2
            # value is a list of (document index, occurrences) pairs.
            for docindex, occurrences in value:
                if docindex < half:
                    class1_count = class1_count + occurrences
                else:
                    class2_count = class2_count + occurrences
            self.mykeys.append(key)
            self.labelOneNums.append(class1_count)
            self.labelTwoNums.append(class2_count)

    ###################################################################################
    def GetConditionProbabilityBaseC(self,index,termcount):
        '''
            Return the smoothed estimate of P(t|C).

            index     -- 1 selects class 1, any other value selects class 2
            termcount -- occurrences of the term in the selected class

            P(t|C) = (termcount + 1) / (vocabulary size + occurrences of all
            terms in the class)
        '''
        if index == 1:
            classtotal = sum(self.labelOneNums)
        else:
            classtotal = sum(self.labelTwoNums)
        return float(termcount + 1) / (len(self.mykeys) + classtotal)

    def _TermLikelihoods(self):
        '''
            Return [(P(t|C1), P(t|C2)), ...] in mykeys order.

            Same estimate as GetConditionProbabilityBaseC, but the two class
            totals are summed once instead of once per term.
        '''
        vocabulary = len(self.mykeys)
        denominator1 = vocabulary + sum(self.labelOneNums)
        denominator2 = vocabulary + sum(self.labelTwoNums)
        return [(float(count1 + 1) / denominator1, float(count2 + 1) / denominator2)
                for count1, count2 in zip(self.labelOneNums, self.labelTwoNums)]

    def GetTermProbability(self,n_size):
        '''
            Fill self.probs with P(t) = 0.5*P(t|C1) + 0.5*P(t|C2) per term.

            n_size is accepted for interface compatibility and not used.
            The list is rebuilt on every call, so calling twice is harmless.
        '''
        self.probs = [0.5 * p1 + 0.5 * p2 for p1, p2 in self._TermLikelihoods()]

    ###################################################################################

    def GetCategoryProbConditionTerm(self,n_size):
        '''
           Fill self.conProbs with one tuple per term:
           (P(C1|t), P(C2|t), P(C1|not t), P(C2|not t)).

           All four values follow Bayes' rule with the class prior 0.5:
               P(C|t)     = P(t|C) * 0.5 / P(t)
               P(C|not t) = (1 - P(t|C)) * 0.5 / (1 - P(t))
           GetTermProbability must have been called before (IndexError
           otherwise).  n_size is accepted for interface compatibility and
           not used.
        '''
        self.conProbs = []
        for i, (p1, p2) in enumerate(self._TermLikelihoods()):
            prob = self.probs[i]
            conprob1 = p1 * 0.5 / prob
            conprob2 = p2 * 0.5 / prob
            absent = 1 - prob
            if absent > 0:
                nonconprob1 = (1 - p1) * 0.5 / absent
                nonconprob2 = (1 - p2) * 0.5 / absent
            else:
                # P(t) == 1: "term absent" never happens, and its weight in
                # the information gain is 0, so the value is irrelevant.
                nonconprob1 = 0.0
                nonconprob2 = 0.0
            self.conProbs.append((conprob1, conprob2, nonconprob1, nonconprob2))

    ########################################################################################
    def _ComputeInfoGainRanking(self,n_size):
        '''
            Return [(term, information gain), ...] sorted by descending gain.

            IG(t) = H(C) + P(t)     * sum_C P(C|t)     * log2 P(C|t)
                         + P(not t) * sum_C P(C|not t) * log2 P(C|not t)
            with H(C) = 1 bit, the entropy of two equiprobable classes.
        '''
        import math

        def plogp(p):
            # p*log2(p), with the limit value 0 for p == 0
            if p != 0:
                return p * math.log(p, 2)
            return 0

        self.GetTermProbability(n_size)
        self.GetCategoryProbConditionTerm(n_size)
        infoGain = {}
        for i in range(0, len(self.mykeys)):
            conprob1, conprob2, nonconprob1, nonconprob2 = self.conProbs[i]
            present = plogp(conprob1) + plogp(conprob2)
            missing = plogp(nonconprob1) + plogp(nonconprob2)
            infoGain[self.mykeys[i]] = 1 + self.probs[i] * present + (1 - self.probs[i]) * missing
        return sorted(infoGain.items(), key=lambda item: item[1], reverse=True)

    def CalInformationGain(self,n_size):
        '''
            Compute the information gain of every term and pickle the ranking.

            The list of (term, gain) pairs, best term first, is written to
            infoGain.dat in the VITData folder; the number of ranked terms
            is printed.  n_size is only passed on to the helper methods.
        '''
        import cPickle as mypickle

        infoGainResult = self._ComputeInfoGainRanking(n_size)
        print('共计算了%s个词的IG值' %len(infoGainResult))
        fid = open(r'D:\TextCategorization\VITData\infoGain.dat','w')
        try:
            mypickle.dump(infoGainResult, fid)
        finally:
            fid.close()
if __name__=="__main__":
    # Stand-alone run: rank the terms of statistics built from 200 documents.
    document_total = 200
    MyIG = IG(document_total)
    MyIG.CalInformationGain(document_total)
    
    
# -*- coding: cp936 -*-
'''
此模块根据信息增益选择特征词
'''
###########################################################################
def featureSelectionIG(N,flag,n_size):
    '''
        Return the N best feature words according to information gain.

        N      -- number of feature words to keep (fewer are returned when
                  the stored ranking is shorter)
        flag   -- 0 means the ranking has not been computed yet: it is then
                  produced with InformationGain.IG before being loaded
        n_size -- document count handed on to InformationGain.IG

        The ranking is always read back from the pickled infoGain.dat file;
        three progress lines with list lengths are printed.
    '''
    from SVM import InformationGain
    import cPickle as mypickle
    if flag == 0:
        ranker = InformationGain.IG(n_size)
        ranker.CalInformationGain(n_size)
    rankfile = open(r'D:\TextCategorization\VITdata\infoGain.dat')
    ranking = mypickle.load(rankfile)
    rankfile.close()
    print('infoGainResult的长度%s'%len(ranking))
    # The stored list is already sorted, so the prefix holds the best terms.
    best = ranking[0:N]
    print('infoGainfinal的长度%s' %len(best))
    featurewords = [entry[0] for entry in best]
    print('共有%s个特征词'%len(featurewords))
    return featurewords
#####################################################################
if __name__=="__main__":
    import cPickle as mypickle
    # Stand-alone run: compute the ranking for 200 documents, keep the best
    # 1000 words and pickle them as the keyword list.
    featurewords = featureSelectionIG(1000, 0, 200)
    fid = open(r'D:\TextCategorization\VITData\data\keywords.dat', 'w')
    mypickle.dump(featurewords, fid)
    fid.close()
    
 
'''
    此模块用于形成文档向量模型
'''
################################################################
def FormatVSM(sub,root,keywordsaddress):
    '''
        Build the document vectors of one class folder.

        sub             -- name of the class sub-folder below root
        root            -- folder of segmented documents
        keywordsaddress -- path of the pickled feature-word list

        Returns one list per document; each list holds, for every feature
        word, the pair (1-based feature index, occurrences of that word in
        the document).  File names come from DataManager.DataManager
        (project helper; verify its ordering guarantees there).
    '''
    from SVM import DataManager
    import cPickle as mypickle
    import re
    import os
    keywords = mypickle.load(open(keywordsaddress))
    manager = DataManager.DataManager(root)
    classpath = root + os.sep + sub
    manager.SetFilePathsFromsubDir(classpath)
    VSMMatrix = []
    for filename in manager.GetFilePaths():
        handle = open(classpath + os.sep + filename)
        content = handle.read()
        handle.close()
        tokens = FilePreProcess(content)
        vector = [(position + 1, tokens.count(keyword))
                  for position, keyword in enumerate(keywords)]
        VSMMatrix.append(vector)
    return VSMMatrix
####################################################################


def LibSVMFormat(dest,root,keywordsaddress):
    '''
        Append the documents below root to dest in libsvm text format.

        dest            -- output file, opened in append mode
        root            -- folder of segmented documents with the class1 and
                           class2 sub-folders
        keywordsaddress -- path of the pickled feature-word list

        class1 documents are written first with label 1, then class2
        documents with label 0.  A line consists of the label followed by
        " \\t<index>:<count>" for every feature with a non-zero count and is
        terminated by " \\t\\n".
    '''
    fid = open(dest, 'a')
    for label, classname in (('1', 'class1'), ('0', 'class2')):
        for VSM in FormatVSM(classname, root, keywordsaddress):
            pieces = [label]
            for featureindex, count in VSM:
                if count != 0:
                    pieces.append(' \t' + str(featureindex) + ':' + str(count))
            pieces.append(' \t\n')
            fid.write(''.join(pieces))
    fid.close()
    print('functionfinish')
#############################################################33
def FilePreProcess(rawtext):
    '''Split a segmented document on its "|" token separator.

    Nothing is filtered: empty strings between adjacent separators are kept.
    '''
    separator = "|"
    return rawtext.split(separator)

###################################################################
        
if __name__=="__main__":
    root1=r'D:\TextCategorization\segmented'
    root2=r'D:\TextCategorization\tsegmented'
    # Pickled feature-word list; LibSVMFormat needs its path, and without
    # this definition the calls below fail with a NameError.
    keywordsaddress=r'D:\TextCategorization\VITData\data\keywords.dat'
    print('begin.....')
    LibSVMFormat(r'D:\TextCategorization\data\train.libsvm',root1,keywordsaddress)
    print('训练语料库转化完毕')
    LibSVMFormat(r'D:\TextCategorization\data\test.libsvm',root2,keywordsaddress)
    print('测试语料库转化完毕')
    
    
        
文本预处理的主程序模块,该模块调用上面的各个子模块完成“根据文档集合大小,特征词数目,交叉验证折数的不同需求,快速生成Libsvm格式数据”的功能。
# -*- coding: cp936 -*-
#coding gb2312
from SVM import FoldersCreation
import os
##############################################################################################
#Parameter settings
corpus_size=[1500]
#N: half of total corpus size
vfold=5 #vfold: number of cross-validation folds
featureDimensions=[10,20,30,40,50,60,70,80,90,100,110,120,130,140,150] #featureDimension: feature dimension of the VSM model
toCalInfoGain=0#whether to compute the information gain of the bag-of-words vocabulary; 1 means do not compute
times=[2]
#count_done_research_times=0#number of experiments already carried out
# N,count_done_research are the arguments of CorpusPartition.moveAccordingPartition
#featureDimension,toCalInfoGain and 2*N/vfold are the arguments of FeatureSelectionModel.featureSelectionIG
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# Batch driver: for every fold number in times and every corpus size, one
# libsvm train/test pair is produced per feature dimension.
for count_done_research_times in times:
    
    for N in corpus_size:
        print '目前文档集规模为%s'%N
        print '目前在该规模文档集上面已经进行了%s次实验'%count_done_research_times
        for  featureDimension in featureDimensions:
            # Only the first pass (dimension 10) runs with toCalInfoGain==0.
            # Every step below that is guarded by this flag (folder creation,
            # corpus copy/partition, segmentation, bag of words, IG ranking)
            # is therefore skipped for the larger dimensions, which reuse the
            # stored ranking.  The flag is reset after the dimension loop.
            if featureDimension>10:
                toCalInfoGain=1
            print '目前处理的特征维数是%s'%featureDimension
    ##############Create folders####################################################################
            if toCalInfoGain==0:
                os.mkdir(r'D:\TextCategorization')
            FoldersCreation.CreateAssist(toCalInfoGain)
            print '创建文件夹模块运行结束'
            print '***************************************************************************'
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    ################Process the corpus: partition it into the test set and the training set#########
            from SVM import CorpusPartition
            CorpusPartition.MoveCorpus(N,toCalInfoGain)
            CorpusPartition.moveAccordingPartition(N,count_done_research_times,toCalInfoGain)
            print '分割文本集模块运行结束'
            print '*******************************************************************'
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    #########################Word segmentation of the corpus#######################################
            from SVM import DataManager
            from ctypes import *
            import os
            import cPickle as p
            import re
            if toCalInfoGain==0:   
                # roots[i] is segmented into rootfinals[i]; the class
                # sub-folder and file name are kept.
                roots=[r'D:\TextCategorization\training',r'D:\TextCategorization\testing']
                rootfinals=[r'D:\TextCategorization\segmented',r'D:\TextCategorization\tsegmented']
                for i in range(0,2):
                    dm=DataManager.DataManager(roots[i])
                    subdir=dm.GetSubDir()
                    # Collect "<sub-folder><sep><file name>" for every document.
                    filepathstotalsrc=[]
                    for sub  in subdir:
                        dm.SetFilePathsFromsubDir(roots[i]+os.sep+sub)
                        filepaths=dm.GetFilePaths()
                        filepathsassist=[sub+os.sep+path for path in filepaths ]
                        filepathstotalsrc=filepathstotalsrc+filepathsassist  
                    for path in filepathstotalsrc:
                        myfile=file(roots[i]+os.sep+path)
                        s=myfile.read()
                        myfile.close()
                        # NOTE(review): the ICTCLAS DLL is loaded, initialised
                        # and shut down again for every single document.
                        dll=cdll.LoadLibrary("ICTCLAS30.dll")    
                        dll.ICTCLAS_Init(c_char_p("."))  
                        # The return value is reinterpreted as a C string
                        # below (presumably a pointer to the segmented text
                        # -- confirm against the ICTCLAS API).
                        bSuccess = dll.ICTCLAS_ParagraphProcess(c_char_p(s),0)
                        segmented=c_char_p(bSuccess).value
                        # Whitespace runs become the "|" token separator.
                        segmentedtmp=re.sub("\s+",'|',segmented,0)
                        # Remove the byte pair A1 A1 (presumably the GB2312
                        # full-width space -- confirm).
                        segmentedfinal=re.sub('\xa1\xa1','',segmentedtmp)
                        fid=file(rootfinals[i]+os.sep+path,'w')
                        fid.write(segmentedfinal)
                        fid.close()
                        dll.ICTCLAS_Exit()
            #print 'finalfinish congratulations!'     
            print '文档集分词模块运行结束'
            print '**********************************************************************'
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    ##################Build the bag-of-words model########################################################
            from SVM import BagOfWordsConstruction
            BagOfWordsConstruction.BagOfWordsConstruction(r'D:\TextCategorization\segmented',toCalInfoGain)
            print '建立词袋子模型模块运行结束'
            print '***********************************************************************************'
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #######################Feature word selection######################################################
            from SVM import FeatureSelectionModel
            # 2*N/vfold is passed as n_size (presumably the number of training
            # documents: N/vfold per class for two classes -- confirm).
            featurewords=FeatureSelectionModel.featureSelectionIG(featureDimension,toCalInfoGain,2*N/vfold)#feature
            import cPickle as mypickle
            fid=file(r'D:\TextCategorization\VITData\data\keywords.dat','w')
            mypickle.dump(featurewords,fid)
            fid.close()
            print '特征词选择模块运行结束'
            print '*******************************************************************************************'
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    #######################Document vector model construction###############################################
            from SVM import VSMformation
            import shutil
            root1=r'D:\TextCategorization\segmented'
            root2=r'D:\TextCategorization\tsegmented'
            keywordsaddress=r'D:\TextCategorization\VITData\data\keywords.dat'
            print 'begin.....'
            VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\train.libsvm',root1,keywordsaddress)
            print '训练语料库转化完毕'
            VSMformation.LibSVMFormat(r'D:\TextCategorization\data\data\test.libsvm',root2,keywordsaddress)
            print '测试语料库转化完毕'
            print '文档向量模型建立模块运行结束'
            print '批处理完毕,congratulations!'
            # Archive this run: keywords.dat, train.libsvm and test.libsvm are
            # moved into D:\TextCategorization\TextCategorization_<fold>_<N>_<dimension>\data.
            os.chdir(r'C:\\Python26')
            os.chdir('D:\\TextCategorization')
            new_dir='TextCategorization_'+str(count_done_research_times)+'_'+str(N)+'_'+str(featureDimension)
            os.mkdir(new_dir)
            os.chdir(new_dir)
            os.mkdir('data')
            # Leave the new folder again before moving files into it.
            os.chdir(r'C:\\Python26')
            print os.getcwd()
            shutil.move(r'D:\TextCategorization\VITdata\data\keywords.dat','D:\\TextCategorization\\'+new_dir+'\\data')
            shutil.move(r'D:\TextCategorization\data\data\train.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
            shutil.move(r'D:\TextCategorization\data\data\test.libsvm','D:\\TextCategorization\\'+new_dir+'\\data')
            print'恭喜,文件夹重命名完毕'
            print '###########################finish##################################'
        # All dimensions done for this corpus size: rename the whole working
        # folder to "<fold>_<N>_rfinish" so the next run can create a fresh
        # D:\TextCategorization, and re-enable the full pipeline.
        os.chdir('D:\\')
        print os.getcwd()
        if os.path.isdir('TextCategorization'):
            os.rename('TextCategorization',str(count_done_research_times)+'_'+str(N)+'_rfinish')
        os.chdir(r'C:\Python26')
        toCalInfoGain=0
        print str(count_done_research_times)+'_'+str(N)+'finish'

        
       
posted on 2010-09-04 10:19  finallyly  阅读(6942)  评论(4)  编辑  收藏  举报