公告

pyscws4 是一个python的分词程序

pyscws4 是一个python的分词程序 | mei year-美叶专注思想。

     pyscws4 是一个python的分词程序
    Posted on 2012 年 11 月 15 日 by dingyangfan

    注意：pyscws4 是一个 Python 分词程序，抄袭自马明练开发的 PHP 版 pscws4，地址是：http://www.ftphp.com/scws/ 。

    翻译了两个文件:

    1. pscws4.php

    2. xdb_r.php

    希望高人可以帮我优化一下代码

    规则文件和词典下载：

    分词.tar

    pyscws4.py文件源码：
    view source
    001    #coding=gbk
    002    from __future__ import division
    003    from collections import OrderedDict
    004    from xdb_r import XDB_R
    005    import math ,struct,copy
    006    import sys,time
    007    reload(sys)
    008    sys.setdefaultencoding('gbk')
    009    ''' defines for ruleset '''
    010    PSCWS4_RULE_MAX     = 31    # just 31, PHP do not support unsigined Int
    011    PSCWS4_RULE_SPECIAL=    0x80000000
    012    PSCWS4_RULE_NOSTATS=    0x40000000
    013    PSCWS4_ZRULE_NONE= 0x00
    014    PSCWS4_ZRULE_PREFIX=    0x01
    015    PSCWS4_ZRULE_SUFFIX=    0x02
    016    PSCWS4_ZRULE_INCLUDE=   0x04    # with include
    017    PSCWS4_ZRULE_EXCLUDE=   0x08    # with exclude
    018    PSCWS4_ZRULE_RANGE =    0x10    # with znum range
    019
    020    ''' defines for mode of scws <= 0x800 '''
    021    PSCWS4_IGN_SYMBOL= 0x01
    022    PSCWS4_DEBUG=           0x02
    023    PSCWS4_DUALITY=     0x04
    024
    025    ''' multi segment policy >= 0x1000 '''
    026    PSCWS4_MULTI_NONE=    0x0000        # nothing
    027    PSCWS4_MULTI_SHORT= 0x1000      # split long words to short words from left to right
    028    PSCWS4_MULTI_DUALITY=   0x2000      # split every long words(3 chars?) to two chars
    029    PSCWS4_MULTI_ZMAIN=   0x4000        # split to main single chinese char atr = j¦a¦n?¦v?
    030    PSCWS4_MULTI_ZALL= 0x8000      # attr = ** , all split to single chars
    031    PSCWS4_MULTI_MASK= 0xf000      # mask check for multi set
    032    PSCWS4_ZIS_USED=        0x8000000
    033
    034    ''' single bytes segment flag (纯单字节字符) '''
    035    PSCWS4_PFLAG_WITH_MB=   0x01
    036    PSCWS4_PFLAG_ALNUM= 0x02
    037    PSCWS4_PFLAG_VALID= 0x04
    038    PSCWS4_PFLAG_DIGIT= 0x08
    039    PSCWS4_PFLAG_ADDSYM=    0x10
    040
    041    ''' constant var define '''
    042    PSCWS4_WORD_FULL=       0x01    # 多字: 整词
    043    PSCWS4_WORD_PART=       0x02    # 多字: 前词段
    044    PSCWS4_WORD_USED=       0x04    # 多字: 已使用
    045    PSCWS4_WORD_RULE=       0x08    # 多字: 自动识别的
    046
    047    PSCWS4_ZFLAG_PUT=       0x02    # 单字: 已使用
    048    PSCWS4_ZFLAG_N2=        0x04    # 单字: 双字名词头
    049    PSCWS4_ZFLAG_NR2=       0x08    # 单字: 词头且为双字人名
    050    PSCWS4_ZFLAG_WHEAD= 0x10    # 单字: 词头
    051    PSCWS4_ZFLAG_WPART= 0x20    # 单字: 词尾或词中
    052    PSCWS4_ZFLAG_ENGLISH=   0x40    # 单字: 夹在中间的英文
    053    PSCWS4_ZFLAG_SYMBOL=    0x80    # 单字: 符号系列
    054
    055    PSCWS4_MAX_EWLEN=       16
    056    PSCWS4_MAX_ZLEN=        128
    057
    058    class PSCWS4(object):
    059        _xd = None # xdb dict handler
    060        _rs = None      # ruleset resource
    061        _rd = None      # ruleset data
    062        _cs = ''    # charset
    063        _ztab = []      # zi len table
    064        _mode = 0   # scws mode
    065        _txt = None     # text string
    066        _res = None
    067        _zis = None     # z if used?(duality)
    068        _off = 0
    069        _len = 0
    070        _wend = 0
    071        _wmap = []
    072        _zmap = []
    073        i = 0
    074
    075        def __init__(self,charset='gbk'):
    076            self._xd = False
    077            self._rs = self._rd = OrderedDict()
    078            self.set_charset(charset)
    079        def __del__(self):
    080            self.close()
    def debug(self):
        """Print a one-line summary of the segmenter's internal counters."""
        # _res stays None until get_result() has run at least once; report 0
        # rather than failing on len(None).
        res_count = len(self._res) if self._res is not None else 0
        # The original literal relied on backslash continuations, which glued
        # neighbouring fields together ("{2}len(_zmap)"); same fields here,
        # separated by single spaces.
        line = ("off:{0} len(_res):{1} len(_wmap):{2} "
                "len(_zmap):{3} _wend:{4} _zis:{5} "
                "len(_rs):{6} len(_rd):{7}").format(
                    self._off, res_count, len(self._wmap), len(self._zmap),
                    self._wend, self._zis, len(self._rs), len(self._rd))
        # Single-argument call form prints identically on Python 2 and 3.
        print(line)
    089        #设置字符集(ztab)
    090        def set_charset(self,charset='gbk'):
    091            charset = charset.strip().lower()
    092            if(charset != self._cs):
    093                self._cs = charset
    094                self._ztab = [1 for i in range(0,0x81)]
    095                if(charset == 'utf-8' or charset == 'utf8'):
    096                    self._ztab.extend([1 for i in range(0x81,0xc0)])
    097                    self._ztab.extend([2 for i in range(0xc0,0xe0)])
    098                    self._ztab.extend([3 for i in range(0xe0,0xf0)])
    099                    self._ztab.extend([4 for i in range(0xf0,0xf8)])
    100                    self._ztab.extend([5 for i in range(0xf8,0xfc)])
    101                    self._ztab.extend([6 for i in range(0xfc,0xfe)])
    102                    self._ztab.extend([1])
    103                else:
    104                    self._ztab.extend([2 for i in range(0x81,0xff)])
    105                self._ztab.extend([1])
    106                #print len(self._ztab)
    107            # 设置词典
    def set_dict(self, fpath, mem=False):
        """Open the XDB dictionary at fpath and make it the active dictionary.

        mem is passed straight to the XDB_R constructor.  Returns False when
        Open() does not return exactly True; otherwise stores the handle in
        self._xd and returns None.
        """
        handle = XDB_R(mem)
        opened = handle.Open(fpath)
        if opened is not True:
            return False
        self._xd = handle
    112        #设置规则集
    def set_rule(self, fpath):
        """Load the ruleset file at fpath; the loader's result is discarded."""
        self._rule_load(fpath)
    115        #设置忽略符号与无用字符
    def set_igonre(self, yes):
        """Enable or disable skipping of symbols and useless characters.

        The misspelled method name is part of the public interface and is
        kept as is.  Any truthy value enables the flag; the original
        `yes is True` test silently disabled it for values such as 1, unlike
        the loose `== true` comparison in the PHP code this was ported from.
        """
        if yes:
            self._mode |= PSCWS4_IGN_SYMBOL
        else:
            self._mode &= ~PSCWS4_IGN_SYMBOL
    119        #设置复合分词等级 ($level = 0,15)
    def set_multi(self, level):
        """Set the compound-segmentation level (0-15).

        The level is shifted into the PSCWS4_MULTI_MASK bit range and
        replaces whatever multi policy was set before.  Level 0 clears it.
        """
        # Mask after shifting so that a level above 15 cannot leak bits
        # outside the multi-policy field into unrelated mode flags.
        level_bits = (int(level) << 12) & PSCWS4_MULTI_MASK
        self._mode &= ~PSCWS4_MULTI_MASK
        self._mode |= level_bits
    124        #设置是否显示分词调试信息
    def set_debug(self, yes):
        """Enable or disable printing of segmentation debug information.

        Any truthy value enables the flag (the original `yes is True` test
        rejected truthy non-bool values such as 1).
        """
        if yes:
            self._mode |= PSCWS4_DEBUG
        else:
            self._mode &= ~PSCWS4_DEBUG
    128        #设置是否自动将散字二元化
    def set_duality(self, yes):
        """Enable or disable automatic pairing of leftover single characters.

        Any truthy value enables the flag (the original `yes is True` test
        rejected truthy non-bool values such as 1).
        """
        if yes:
            self._mode |= PSCWS4_DUALITY
        else:
            self._mode &= ~PSCWS4_DUALITY
    132        # 设置要分词的文本字符串
    133        def send_text(self,text):
    134            self._txt = str(text)
    135            self._len = len(self._txt)
    136            self._off =0
    137        # 取回一批分词结果(需要多次调用, 直到返回 false)
    def get_result(self):
        """Return the next batch of segmented words, or False when finished.

        Call repeatedly after send_text().  Each batch is a list of dicts
        with the keys 'word', 'off', 'idf', 'len' and 'attr' (as built by
        _put_res).  One call scans a run of text starting at self._off,
        hands it to _msegment (run contains multi-byte characters),
        _ssegment (single-byte run that is not purely alphanumeric, or is
        PSCWS4_MAX_EWLEN bytes or longer) or emits it directly as one 'en'
        word, then advances self._off.  If the run produced nothing the
        method calls itself for the next run.
        """
        txt = self._txt
        tlen = self._len
        off = self._off
        self._res = []

        # Skip leading bytes <= 0x20; CR and LF are reported as tokens.
        while off < tlen and ord(txt[off]) <= 0x20:
            if txt[off] == "\r" or txt[off] == "\n":
                self._off = off + 1
                self._put_res(off, 0, 1, 'un')
                return self._res
            off += 1
        if off >= tlen:
            return False

        self._off = off
        ch = txt[off]
        cx = ord(ch)
        if self._char_token(ch):
            # Bracket/colon/quote characters are always emitted alone.
            self._off += 1
            self._put_res(off, 0, 1, 'un')
            return self._res
        clen = self._ztab[cx]

        zlen = 1
        if clen > 1:
            pflag = PSCWS4_PFLAG_WITH_MB
        elif self._is_alnum(cx):
            pflag = PSCWS4_PFLAG_ALNUM
        else:
            pflag = 0
        off += clen

        while off < tlen:
            ch = txt[off]
            cx = ord(ch)
            if cx <= 0x20 or self._char_token(ch):
                break
            clen = self._ztab[cx]
            if not (pflag & PSCWS4_PFLAG_WITH_MB):
                if clen == 1:
                    if (pflag & PSCWS4_PFLAG_ALNUM) and not self._is_alnum(cx):
                        pflag ^= PSCWS4_PFLAG_ALNUM
                else:
                    if not (pflag & PSCWS4_PFLAG_ALNUM) or zlen > 2:
                        break
                    pflag |= PSCWS4_PFLAG_WITH_MB
            elif clen == 1:
                # multi-byte run followed by a single byte;
                # allowed: letters/digits directly followed by more multi-byte text
                if not self._is_alnum(cx):
                    break
                pflag &= ~PSCWS4_PFLAG_VALID
                i = off + 1
                while i < (off + 3):
                    # Test the bound BEFORE indexing: the original read
                    # txt[i] first and raised IndexError whenever the
                    # alphanumeric tail reached the end of the text.
                    if i >= tlen:
                        pflag |= PSCWS4_PFLAG_VALID
                        break
                    cx = ord(txt[i])
                    if cx <= 0x20 or self._ztab[cx] > 1:
                        pflag |= PSCWS4_PFLAG_VALID
                        break
                    if not self._is_alnum(cx):
                        break
                    i += 1
                if not (pflag & PSCWS4_PFLAG_VALID):
                    break
                clen += (i - off - 1)
            # enforce the maximum run length
            zlen += 1
            if zlen >= PSCWS4_MAX_ZLEN:
                break
            off += clen

        # Handle a truncated ("half") multi-byte character at the end of text.
        overshoot = off > tlen
        if overshoot:
            off -= clen

        # do the real segmentation
        if off <= self._off:
            return False
        elif pflag & PSCWS4_PFLAG_WITH_MB:
            self._msegment(off, zlen)
        elif not (pflag & PSCWS4_PFLAG_ALNUM) or (off - self._off) >= PSCWS4_MAX_EWLEN:
            self._ssegment(off)
        else:
            zlen = off - self._off
            self._put_res(self._off, 2.5 * math.log(zlen), zlen, 'en')

        self._off = tlen if overshoot else off
        if len(self._res) == 0:
            return self.get_result()
        return self._res
    def get_tops(self, limit=10, xattr=''):
        """Return up to `limit` words ranked by accumulated idf weight.

        Re-segments the whole text from offset 0 (the caller's read position
        is restored afterwards) and sums 'idf' per lower-cased word.  Words
        with idf < 0.2, an attribute starting with '#', or the NOSTATS rule
        bit are ignored.

        xattr -- comma-separated attribute names, optionally prefixed with
                 '~', used as an attribute filter.
        Returns a list of dicts with keys 'word', 'times', 'weight', 'attr',
        heaviest first, or False when no text has been sent.
        """
        if self._txt is None:
            return False

        xmode = False
        attrs = {}
        if xattr != '':
            if xattr[0:1] == '~':
                xattr = xattr[1:]
                xmode = True
            for name in xattr.split(','):
                name = name.strip().lower()
                if name != '':
                    attrs[name] = True

        saved_off = self._off
        self._off = 0
        tlist = {}
        try:
            while True:
                batch = self.get_result()
                if not batch:
                    break
                for tmp in batch:
                    if tmp['idf'] < 0.2 or tmp['attr'][0:1] == '#':
                        continue
                    if attrs:
                        # NOTE(review): as written, a '~' prefix keeps ONLY
                        # the listed attributes and a plain list EXCLUDES
                        # them.  Preserved unchanged; confirm against the
                        # upstream PSCWS4 semantics.
                        if xmode and tmp['attr'] not in attrs:
                            continue
                        if not xmode and tmp['attr'] in attrs:
                            continue
                    word = tmp['word'].lower()
                    if self._rule_checkbit(word, PSCWS4_RULE_NOSTATS):
                        continue
                    if word in tlist:
                        tlist[word]['weight'] += tmp['idf']
                        tlist[word]['times'] += 1
                    else:
                        tlist[word] = {'word': tmp['word'], 'times': 1,
                                       'weight': tmp['idf'], 'attr': tmp['attr']}
        finally:
            # Restore the caller's position even if segmentation raised.
            self._off = saved_off

        # Descending by weight.  The original passed cmp=, which exists only
        # on Python 2 and never returned 0, so it was not a consistent order.
        ranked = sorted(tlist.values(), key=lambda d: d['weight'], reverse=True)
        return ranked[0:limit]
    246        def close(self):
    247            if(self._xd):
    248                self._xd.Close()
    249                self._xd = False
    250            self._rd = []
    251            self._rs = []
    252        def version(self):
    253            return 'pySCWS/1.0 - by donghongyi'
    def _rule_load(self, fpath):
        """Parse the ruleset file at fpath into self._rs and self._rd.

        Pass 1 collects the `[section]` headers (at most PSCWS4_RULE_MAX) into
        self._rs, giving each a default attribute record and a bit value.
        Pass 2 re-reads the file: `;` lines are comments, `:key=value` lines
        set parameters of the current section, and every other non-blank line
        is rule data stored in self._rd -- whole line as key when the
        section is in line mode, otherwise character by character.

        Returns False when the file cannot be opened, otherwise None.
        """
        try:
            fd = open(fpath, 'r')
        except IOError:
            return False

        # `with` guarantees the handle is closed; the original never closed it.
        with fd:
            # ---- pass 1: section headers ----
            i = j = 0
            self._rs = OrderedDict()
            while True:
                buf = fd.readline()
                if not buf:
                    break
                if buf[0:1] != '[':
                    continue
                pos = buf.find(']')
                if pos == -1 or pos == 1 or pos > 15:
                    continue
                key = buf[1:pos].lower()
                if key in self._rs:
                    continue
                item = {'tf': 5.0, 'idf': 3.5, 'attr': 'un', 'bit': 0, 'flag': 0,
                        'zmin': 0, 'zmax': 0, 'inc': 0, 'exc': 0}
                if key == 'special':
                    item['bit'] = PSCWS4_RULE_SPECIAL
                elif key == 'nostats':
                    item['bit'] = PSCWS4_RULE_NOSTATS
                else:
                    item['bit'] = (1 << j)
                    j += 1
                self._rs[key] = item
                # (original author's note: "this may be wrong")
                i += 1
                if i >= PSCWS4_RULE_MAX:
                    break

            # ---- pass 2: parameters and rule data ----
            fd.seek(0)
            rbl = False
            item = {}
            while True:
                buf = fd.readline()
                if not buf:
                    break
                ch = buf[0:1]
                if ch == ';':
                    continue
                if ch == '[':
                    item = {}
                    pos = buf.find(']')
                    if pos > 1:
                        key = buf[1:pos].lower()
                        if key in self._rs:
                            rbl = True
                            item = self._rs[key]
                    continue
                if ch == ':':
                    # Parameters of a section that was not registered in
                    # pass 1 are ignored.  The original applied them to a
                    # throwaway dict, which raised KeyError on `flag`/`inc`
                    # updates or made the half-built dict look like a real
                    # section to the data lines that followed.
                    if not item:
                        continue
                    buf = buf[1:]
                    if buf.find('=') == -1:
                        continue
                    # maxsplit=1 -> exactly two parts (PHP explode limit 2);
                    # the original maxsplit of 2 broke on values holding '='.
                    pkey, pval = buf.split('=', 1)
                    pkey = pkey.strip()
                    pval = pval.strip()
                    if pkey == 'line':
                        rbl = pval[0:1] != 'n'
                    elif pkey == 'tf':
                        item['tf'] = float(pval)
                    elif pkey == 'idf':
                        item['idf'] = float(pval)
                    elif pkey == 'attr':
                        item['attr'] = pval
                    elif pkey == 'znum':
                        pos = pval.find(',')
                        if pos > -1:
                            item['zmax'] = int(pval[pos + 1:].strip())
                            item['flag'] |= PSCWS4_ZRULE_RANGE
                            pval = pval[0:pos]
                        item['zmin'] = int(pval)
                    elif pkey == 'type':
                        if pval == 'prefix':
                            item['flag'] |= PSCWS4_ZRULE_PREFIX
                        if pval == 'suffix':
                            item['flag'] |= PSCWS4_ZRULE_SUFFIX
                    elif pkey == 'include' or pkey == 'exclude':
                        clude = 0
                        for tmp in pval.split(','):
                            tmp = tmp.strip().lower()
                            if tmp not in self._rs:
                                continue
                            clude |= self._rs[tmp]['bit']
                        if pkey == 'include':
                            item['inc'] |= clude
                            item['flag'] |= PSCWS4_ZRULE_INCLUDE
                        else:
                            item['exc'] |= clude
                            item['flag'] |= PSCWS4_ZRULE_EXCLUDE
                    continue
                if item == {}:
                    continue
                buf = buf.strip()
                if buf == '':
                    continue
                if rbl:
                    # line mode: the whole line is one rule key
                    self._rd[buf] = item
                else:
                    # character mode: register each character of the line
                    tlen = len(buf)
                    off = 0
                    while off < tlen:
                        tord = ord(buf[off:off + 1])
                        zlen = self._ztab[tord]
                        # NOTE(review): this stops before the last character
                        # of the (already stripped) line is registered --
                        # preserved as written; confirm whether intended.
                        if off + zlen >= tlen:
                            break
                        zch = buf[off:off + zlen]
                        self._rd[zch] = item
                        off += zlen
    353        #get the ruleset
    354        def _rule_get(self,str):
    355            if(not self._rd.has_key(str)): return False
    356            return self._rd[str]
    357        #check the bit with str
    358        def _rule_checkbit(self,str,bit):
    359            if(not self._rd.has_key(str)): return False
    360            bit2 = self._rd[str]['bit']
    361            return (True if (bit & bit2) else False)
    362        #check the rule include ¦ exclude
    def _rule_check(self, rule, str):
        """Check `str` against the include/exclude constraints of `rule`.

        Returns False when the rule has an include list and `str` belongs to
        none of the included sections, or has an exclude list and `str`
        belongs to one of the excluded sections; True otherwise.
        """
        # Use the 'inc'/'exc' masks that _rule_load() builds from the
        # `:include=` / `:exclude=` parameters.  The original tested the
        # rule's own 'bit' in both places, which left those masks unused.
        # NOTE(review): matches my reading of the C SCWS scws_rule_check --
        # confirm against upstream before relying on it.
        if (rule['flag'] & PSCWS4_ZRULE_INCLUDE) and not self._rule_checkbit(str, rule['inc']):
            return False
        if (rule['flag'] & PSCWS4_ZRULE_EXCLUDE) and self._rule_checkbit(str, rule['exc']):
            return False
        return True
    367        #bulid res
    368        def _put_res(self,o,i,l,a):
    369            word = self._txt[o:o+l]
    370            item = {'word':word,'off':o,'idf':i,'len':l,'attr':a}
    371            self._res.append(item)
    372        #alpha, numeric check by ORD value
    373        def _is_alnum(self,c):
    374            return ((c>=48 and c<=57) or (c>=65 and c<=90) or (c>=97 and c<=122))
    375        def _is_alpha(self,c):
    376            return ((c>=65 and c<=90) or ( c>=97 and c<=122))
    377        def _is_ualpha(self,c):
    378            return (c>=65 and c<=90)
    379        def _is_digit(self,c):
    380            return (c>=48 and c<=57)
    def _no_rule1(self, f):
        """Return a truthy value when a character with flags f must not start a rule match.

        That is the case when f marks a symbol or embedded English, or when
        f has the word-head flag set without the two-char-name flag.
        """
        # `|` restored: the listing had it mis-encoded as a broken bar.
        blocked = f & (PSCWS4_ZFLAG_SYMBOL | PSCWS4_ZFLAG_ENGLISH)
        head_only = (f & (PSCWS4_ZFLAG_WHEAD | PSCWS4_ZFLAG_NR2)) == PSCWS4_ZFLAG_WHEAD
        return blocked or head_only
    def _no_rule2(self, f):
        """Same test as _no_rule1; kept as a separate name for its callers."""
        verdict = self._no_rule1(f)
        return verdict
    385        def _char_token(self,c):
    386            return (c=='('or c==')'or c=='['or c==']'or c=='{'or c=='}'or c==':'or c=='"')
    387        # query the dict
    388        def _dict_query(self,word):
    389            if(not self._xd): return False
    390            value = self._xd.Get(word)
    391            if(not value): return False
    392            tmp = struct.unpack('f f B 3s',value)
    393            return {'tf':tmp[0],'idf':tmp[1],'flag':tmp[2],'attr':tmp[3].rstrip(b'\x00')}
    394        #ssegment, 单字节用语切割
    def _ssegment(self, end):
        """Segment the single-byte run self._txt[self._off:end].

        Order of attempts:
        1. the whole run (lower-cased) is a SPECIAL rule word -> one 'nz';
        2. the run is a dotted abbreviation such as S.H.E -> one 'nz';
        3. otherwise split into 'en' words (digits may contain one '.'
           followed by a digit, letters one "'" followed by a letter) and
           single symbols ('un', skipped when PSCWS4_IGN_SYMBOL is set).
        """
        start = self._off
        wlen = end - start

        # special words are matched on the lower-cased text
        if wlen > 1:
            lowered = self._txt[start:start + wlen].lower()
            if self._rule_checkbit(lowered, PSCWS4_RULE_SPECIAL):
                self._put_res(start, 9.5, wlen, 'nz')
                return

        txt = self._txt
        tlen = len(txt)

        # abbreviations such as S.H.E or M.R.
        if (self._is_ualpha(ord(txt[start])) and start + 1 < tlen
                and txt[start + 1] == '.'):
            ch = start + 2
            while ch < end:
                if not self._is_alpha(ord(txt[ch])):
                    break
                ch += 1
                if ch == end or txt[ch] != '.':
                    break
                ch += 1
            if ch == end:
                self._put_res(start, 7.5, wlen, 'nz')
                return

        # Split into words and punctuation.
        while start < end:
            ch = txt[start]
            start += 1
            cx = ord(ch)
            if self._is_alnum(cx):
                pflag = PSCWS4_PFLAG_DIGIT if self._is_digit(cx) else 0
                wlen = 1
                while start < end:
                    ch = txt[start]
                    cx = ord(ch)
                    # The look-ahead at start+1 is bounds-checked: the
                    # original indexed past the end of the text and raised
                    # IndexError for input ending in e.g. "3." or "it'".
                    has_next = start + 1 < tlen
                    if pflag & PSCWS4_PFLAG_DIGIT:
                        if not self._is_digit(cx):
                            if ((pflag & PSCWS4_PFLAG_ADDSYM) or cx != 0x2e
                                    or not has_next
                                    or not self._is_digit(ord(txt[start + 1]))):
                                break
                            pflag |= PSCWS4_PFLAG_ADDSYM
                    else:
                        if not self._is_alpha(cx):
                            if ((pflag & PSCWS4_PFLAG_ADDSYM) or cx != 0x27
                                    or not has_next
                                    or not self._is_alpha(ord(txt[start + 1]))):
                                break
                            pflag |= PSCWS4_PFLAG_ADDSYM
                    start += 1
                    wlen += 1
                    if wlen >= PSCWS4_MAX_EWLEN:
                        break
                self._put_res(start - wlen, 2.5 * math.log(wlen), wlen, 'en')
            elif not (self._mode & PSCWS4_IGN_SYMBOL):
                self._put_res(start - 1, 0, 1, 'un')
    448        #get one z by ZMAP
    449        def _get_zs(self,i,j = -1):
    450            if(j == -1): j = i
    451            return self._txt[self._zmap[i]['start']:self._zmap[i]['start']+( self._zmap[j]['end'] - self._zmap[i]['start'])]
    452        #mget_word
    def _mget_word(self, i, j):
        """Return the end index of the longest full word starting at i, within i..j.

        Returns i itself when character i is not flagged as a word head or
        when no full word starting at i ends at or before j.
        """
        row = self._wmap[i]
        if not (row[i]['flag'] & PSCWS4_ZFLAG_WHEAD):
            return i
        longest = i
        for k in range(i + 1, j + 1):
            cell = row[k]
            if cell and cell['flag'] & PSCWS4_WORD_FULL:
                longest = k
        return longest
    464        #mset_word
    def _mset_word(self, i, j):
        """Emit the word spanning characters i..j, plus any derived results.

        Besides the word itself this applies, depending on self._mode:
        symbol skipping, automatic pairing of loose single characters
        (DUALITY), and the multi-segmentation policies (SHORT, DUALITY,
        ZMAIN, ZALL) which add shorter sub-words or single characters.
        """
        wmap = self._wmap
        zmap = self._zmap
        item = wmap[i][j]
        if (item is False) or ((self._mode & PSCWS4_IGN_SYMBOL)
                               and not (item['flag'] & PSCWS4_ZFLAG_ENGLISH)
                               and item['attr'] == 'un'):
            return

        # automatic pairing of loose single characters
        if self._mode & PSCWS4_DUALITY:
            k = self._zis
            if i == j and not (item['flag'] & PSCWS4_ZFLAG_ENGLISH) and item['attr'] == 'un':
                self._zis = i
                if k < 0:
                    return
                i = (k & ~PSCWS4_ZIS_USED)
                if (i != (j - 1)) or (not (k & PSCWS4_ZIS_USED) and self._wend == i):
                    self._put_res(zmap[i]['start'], wmap[i][i]['idf'],
                                  zmap[i]['end'] - zmap[i]['start'], wmap[i][i]['attr'])
                    if i != (j - 1):
                        return
                self._zis |= PSCWS4_ZIS_USED
            else:
                if (k >= 0) and (not (k & PSCWS4_ZIS_USED) or (j > i)):
                    k &= ~PSCWS4_ZIS_USED
                    self._put_res(zmap[k]['start'], wmap[k][k]['idf'],
                                  zmap[k]['end'] - zmap[k]['start'], wmap[k][k]['attr'])
                if j > i:
                    self._wend = j + 1
                self._zis = -1

        # save the result itself
        self._put_res(zmap[i]['start'], item['idf'],
                      zmap[j]['end'] - zmap[i]['start'], item['attr'])

        if (j - i) > 1:
            m = i
            if self._mode & PSCWS4_MULTI_SHORT:
                while m < j:
                    k = m
                    n = m + 1
                    while n <= j:
                        if n == j and m == i:
                            break
                        item = wmap[m][n]
                        if item and item['flag'] & PSCWS4_WORD_FULL:
                            k = n
                            self._put_res(zmap[m]['start'], item['idf'],
                                          zmap[n]['end'] - zmap[m]['start'], item['attr'])
                            if not (item['flag'] & PSCWS4_WORD_PART):
                                break
                        n += 1
                    if k == m:
                        if m == i:
                            break
                        item = wmap[m][m]
                        self._put_res(zmap[m]['start'], item['idf'],
                                      zmap[m]['end'] - zmap[m]['start'], item['attr'])
                    m = k + 1
                    if m == j:
                        m -= 1
                        break
            if self._mode & PSCWS4_MULTI_DUALITY:
                while m < j:
                    self._put_res(zmap[m]['start'], wmap[m][m]['idf'],
                                  zmap[m + 1]['end'] - zmap[m]['start'], wmap[m][m]['attr'])
                    m += 1

        if (j > i) and (self._mode & (PSCWS4_MULTI_ZMAIN | PSCWS4_MULTI_ZALL)):
            if (j - i) == 1 and not wmap[i][j]:
                if wmap[i][i]['flag'] & PSCWS4_ZFLAG_PUT:
                    i += 1
                else:
                    wmap[i][i]['flag'] |= PSCWS4_ZFLAG_PUT
                wmap[j][j]['flag'] |= PSCWS4_ZFLAG_PUT
            # A for-loop always advances.  The original `while i <= j` hit
            # `continue` without incrementing i and looped forever (the PHP
            # do/while increments in its condition).
            for z in range(i, j + 1):
                cell = wmap[z][z]
                if cell['flag'] & PSCWS4_ZFLAG_PUT:
                    continue
                # ZMAIN keeps only characters whose attribute starts with
                # j, n or v.  The original `ssss[ssss.find('jnv'):]`
                # searched for the literal "jnv" inside one character and so
                # never filtered anything.
                if (not (self._mode & PSCWS4_MULTI_ZALL)
                        and cell['attr'][0:1] not in ('j', 'n', 'v')):
                    continue
                self._put_res(zmap[z]['start'], cell['idf'],
                              zmap[z]['end'] - zmap[z]['start'], cell['attr'])
    532        #mseg_zone
    def _mseg_zone(self, f, t):
        """Choose the best word path through characters f..t and emit it.

        For every candidate word (i..j) inside the zone a full path is built
        around it using the longest words to its left and right; the path
        weight is the product of tf * length of its words (with a bonus for
        a candidate touching either zone edge) divided by (word count - 1)
        to the 4th power.  The heaviest path is passed, word by word, to
        _mset_word.  Paths are lists of (end - start) spans terminated by
        0xff.
        """
        weight = nweight = 0.0
        wmap = self._wmap
        mpath = []   # best path so far
        npath = []   # scratch path for the current candidate
        x = f
        for i in range(f, t + 1):
            j = self._mget_word(i, t)
            if j == i or j <= x or (wmap[i][j]['flag'] & PSCWS4_WORD_USED):
                continue
            # one word only
            if i == f and j == t:
                mpath = [(j - i), 0xff]
                break
            if i != f and (wmap[i][j]['flag'] & PSCWS4_WORD_RULE):
                continue

            # create the new path
            wmap[i][j]['flag'] |= PSCWS4_WORD_USED
            nweight = wmap[i][j]['tf'] * (j - i + 1)
            if i == f:
                nweight *= 1.2
            elif j == t:
                nweight *= 1.4
            if npath == []:
                npath = [0xff for _ in range(t - f + 2)]

            # look backward
            x = 0
            m = f
            while m < i:
                n = self._mget_word(m, i - 1)
                nweight *= wmap[m][n]['tf'] * (n - m + 1)
                npath[x] = n - m
                x += 1
                if n > m:
                    wmap[m][n]['flag'] |= PSCWS4_WORD_USED
                m = n + 1

            # the candidate itself
            npath[x] = j - i
            x += 1

            # look forward
            m = j + 1
            while m <= t:
                n = self._mget_word(m, t)
                nweight *= wmap[m][n]['tf'] * (n - m + 1)
                npath[x] = n - m
                x += 1
                if n > m:
                    wmap[m][n]['flag'] |= PSCWS4_WORD_USED
                m = n + 1

            npath[x] = 0xff
            nweight /= pow(x - 1, 4)

            # draw the path for debug; single-argument print() calls emit
            # the same bytes as the original Python 2 print statements
            if self._mode & PSCWS4_DEBUG:
                print("PATH by keyword = {0} (weight={1}):\n".format(self._get_zs(i, j), nweight))
                m = f
                x = 0
                n = npath[x]
                while n != 0xff:
                    n += m
                    print("{0}  ".format(self._get_zs(m, n)))
                    m = n + 1
                    x += 1
                    n = npath[x]
                print("\n--\n")

            x = j
            # keep the better path; the two lists are distinct objects, so a
            # plain swap is enough (the original deep-copied both)
            if nweight > weight:
                weight = nweight
                mpath, npath = npath, mpath

        # set the result, if any path was found
        if mpath == []:
            return
        m = f
        x = 0
        n = mpath[x]
        while n != 0xff:
            n += m
            self._mset_word(m, n)
            m = n + 1
            x += 1
            n = mpath[x]
    614        #msegment (the key function)
    615        def _msegment(self,end,zlen):
                   # Segment the span of self._txt from self._off up to `end`.
                   # Steps visible below:
                   #   1. build zmap (start/end offsets of each unit) and the
                   #      diagonal of wmap (one entry per unit),
                   #   2. fill wmap[i][j] from dictionary queries on units i..j,
                   #   3. when self._rd is non-empty, add rule-based words,
                   #   4. emit words through _mset_word / _mseg_zone.
                   # end  -- offset in self._txt at which scanning stops
                   # zlen -- size used to allocate the maps; it is replaced
                   #         further down by the number of units actually found
                   # NOTE(review): `¦` in this listing is presumably a garbled `|`
                   # (bitwise OR) -- confirm against the original file.
    616            self._wmap = [[False for ooooo in range(zlen)] for i in range(zlen)]
    617            self._zmap = [False for ooooo in range(zlen)]
    618            wmap = self._wmap
    619            zmap = self._zmap
    620            txt = self._txt
    621            start = self._off
    622            self._zis = -1
    623            #load the zmap
    624            i =0
    625            #load the zmap
    626            while start < end:
    627                ch = txt[start]
    628                cx = ord(ch)
    629                clen = self._ztab[cx]
    630                if(clen == 1):
                           # run of single-byte chars: extend until a char whose
                           # _ztab length is > 1 is met
                           # NOTE(review): after the increment start may equal end,
                           # so txt[start] reads index `end` -- confirm it is in range
    631                    while start < end:
    632                        start +=1 #modified
    633                        cx = ord(txt[start])
    634                        if(self._ztab[cx] > 1): break
    635                        clen +=1
    636                    wmap[i][i] = {'tf':0.5, 'idf':0, 'flag':PSCWS4_ZFLAG_ENGLISH, 'attr':'un'}
    637                else:
    638                    query = self._dict_query(txt[start:start+clen])
    639                    if(not query):
    640                        wmap[i][i] = {'tf':0.5, 'idf':0, 'flag':0, 'attr':'un'}
    641                    else:
    642                        if(query['attr'][0:1] == '#'): query['flag'] ¦= PSCWS4_ZFLAG_SYMBOL
    643                        wmap[i][i] = query
    644                    start += clen
    645                zmap[i] = {'start':start-clen, 'end':start}
    646                i+=1
    647
    648            #fixed real zlength
    649            zlen = i
    650            #create word query table
    651            for i in range(zlen):
    652                k=0
    653                j = i +1
    654                while j<zlen:
    655                    query = self._dict_query(self._get_zs(i,j))
    656                    if (not query):break
    657                    ch = query['flag']
    658                    if(ch & PSCWS4_WORD_FULL):
    659                        wmap[i][j] = query
    660                        wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
    661                        k = i+1
    662                        while k<=j:
    663                            wmap[k][k]['flag'] ¦= PSCWS4_ZFLAG_WPART
    664                            k+=1
    665                    if (not (ch & PSCWS4_WORD_PART)): break
    666                    j +=1
                       # when a full word was found k was left at j+1 by the inner
                       # loop, so k-1 is the end index of the last full word;
                       # otherwise k becomes -1 and the branch below is skipped
    667                k-=1
    668                if(k and k>=0):
    669                    #set nr2 to some short name
    670                    if(k == (i+1)):
    671                        if(wmap[i][k]['attr'] == 'nr'):
    672                            wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_NR2
    673                    #clean the PART flag for the last word
    674                    if(k < j):
    675                        wmap[i][k]['flag'] ^= PSCWS4_WORD_PART
    676            # try to do the ruleset match
    677            # for name & zone & chinese numeric
    678            if(len(self._rd) > 0):
    679                #check for 'one word'
    680                for i in range(zlen):
    681                    if(self._no_rule1(wmap[i][i]['flag'])): continue
    682                    r1 = self._rule_get(self._get_zs(i))
    683                    if(not r1): continue
    684                    clen = r1['zmin'] if r1['zmin'] >0 else 1
    685                    if(( r1['flag'] & PSCWS4_ZRULE_PREFIX) and (i < (zlen-clen))):
    686                        #first check that every char within zmin satisfies the rule, then collect the matching chars within the zmax range
    687                        ch =1
    688                        while ch <=clen:
    689                            j = i + ch
    690                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])):break
    691                            if(not self._rule_check(r1,self._get_zs(j))): break
    692                            ch+=1
    693                        if(ch <= clen): continue
    694                        #no limit znum or limit to a range
    695                        j = i +ch
    696                        while 1:
    697                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax']))): break
    698                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
    699                            if( not self._rule_check(r1,self._get_zs(j))): break
    700                            clen +=1
    701                            j +=1
    702                        # note the case where an original 2-char person name is still 2 chars after recognition
    703                        if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_NR2):
    704                            if(clen ==1): continue
    705                            wmap[i][i+1]['flag'] ¦= PSCWS4_WORD_PART
    706                        #ok, got: i & clen
    707                        k = i + clen
    708                        wmap[i][k] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':(PSCWS4_WORD_RULE¦PSCWS4_WORD_FULL), 'attr':r1['attr']}
    709                        wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
    710
    711                        j = i+1
    712                        while j<=k:
    713                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    714                            j+=1
                               # NOTE(review): rebinding i inside `for i in range(...)`
                               # does not change the value of the next iteration
    715                        if(not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART)): i =k
    716                        continue
    717                    if( (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen)):
    718                        #suffix, check before
    719                        ch = 1
    720                        while ch<=clen:
    721                            j = i -ch
    722                            if(j < 0 or self._no_rule1(wmap[j][j]['flag'])): break
    723                            if(not self._rule_check(r1, self._get_zs(j))):break
    724                            ch+=1
    725                        if (ch <= clen): continue
    726                        #no limit znum or limit to a range
    727                        j = i - ch
    728                        while 1:
    729                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax']))): break
    730                            if( j < 0 or self._no_rule2(wmap[j][j]['flag'])): break
    731                            if( not self._rule_check(r1,self._get_zs(j))): break
    732                            clen +=1
    733                            j -=1
    734                        #ok, got: i & clen (maybe clen=1 & [k][i] isset)
    735                        k = i -clen
    736                        if(wmap[k][i] is not False): continue
    737                        wmap[k][i] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
    738                        wmap[k][k]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
    739                        j = k+1
    740                        while j <=i:
    741                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    742                            if( (j != i) and (wmap[k][i] is not False) ): wmap[k][j]['flag'] ¦= PSCWS4_WORD_PART
    743                            j+=1
    744                        continue
    745                #check for 'two words' (such as: Ouyang** , **Xilu)
    746                #print wmap[6]
    747                for i in range(zlen-2,-1,-1):
    748                    #with value ==> must be have SCWS_WORD_FULL, so needn't check it ag.
    749                    if( (wmap[i][i+1] is False) or wmap[i][i+1]['flag'] & PSCWS4_WORD_PART): continue
    750                    k = i +1
    751                    #print k
    752                    r1= self._rule_get(self._get_zs(i,k))
    753                    if(not r1): continue
    754                    clen =r1['zmin'] if r1['zmin'] else 1
    755                    if( (r1['flag'] & PSCWS4_ZRULE_PREFIX) and (k < (zlen-clen))):
    756                        ch = 1
    757                        while ch<=clen:
    758                            j = k +ch
    759                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
    760                            if(not self._rule_check(r1,self._get_zs(j))): break
    761                            ch +=1
    762                        if(ch <= clen):continue
    763                        #no limit znum or limit to a range
    764                        j = k+ch
    765                        while 1:
    766                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >=r1['zmax']))): break
    767                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
    768                            if(not self._rule_check(r1,self._get_zs(j))): break
    769                            clen +=1
    770                            j +=1
    771                        #ok, got: i & clen
    772                        k = k + clen
    773                        wmap[i][k] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
    774                        wmap[i][i+1]['flag'] ¦= PSCWS4_WORD_PART
    775                        j=i+2
    776                        while j<=k:
    777                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    778                            j+=1
                               # NOTE(review): this decrement has no effect on the
                               # next value produced by range()
    779                        i -=1
    780                        continue
    781                    if ( (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen)):
    782                        # suffix, check before
    783                        ch = 1
    784                        while ch<=clen:
    785                            j = i -ch
    786                            if(j < 0 or self._no_rule1(wmap[j][j]['flag'])): break
    787                            if ( not self._rule_check(r1,self._get_zs(j))): break
    788                            ch +=1
    789                        if (ch <= clen): continue
    790                        #no limit znum or limit to a range
    791                        j = i - ch
    792                        while 1:
    793                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax'])) ): break
    794                            if(j < 0 or self._no_rule2(wmap[j][j]['flag'])): break
    795                            if( not self._rule_check(r1,self._get_zs(j))): break
    796                            clen +=1
    797                            j -=1
    798                        #ok, got: i & clen (maybe clen=1 & [k][i] isset)
    799                        k = i - clen
                               # i is moved to the second unit of the matched pair so
                               # that wmap[k][i] spans the prefix plus both units
    800                        i = i +1
    801                        wmap[k][i] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
    802                        wmap[k][k]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
    803                        j = k+1
    804                        while j<=i:
    805                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    806                            if(wmap[k][j] is not False): wmap[k][j]['flag'] ¦= PSCWS4_WORD_PART
    807                            j+=1
                               # NOTE(review): as above, this adjustment of i is
                               # discarded when range() yields the next value
    808                        i -= (clen +1)
    809                        continue
    810            # do the segment really
    811            # find the easy break point
    812
    813            j=0
    814            i=0
    815            for i in range(zlen):
    816                if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART): continue
    817                if(i > j):
    818                    self._mseg_zone(j,i-1)
    819                j = i
    820                if (not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WHEAD)):
    821                    self._mset_word(i,i)
    822                    j+=1
                   # after the loop i is the last index visited (or 0 when zlen is 0);
                   # the increment makes it one past that index
    823            i+=1
    824            #the error is here
    825            #the lastest zone
    826            if(i > j):
    827                self._mseg_zone(j,i-1)
    828            if( (self._mode & PSCWS4_DUALITY) and (self._zis >=0) and not (self._zis & PSCWS4_ZIS_USED) ):
    829                i = self._zis
    830                self._put_res(zmap[i]['start'],wmap[i][i]['idf'],zmap[i]['end'] - zmap[i]['start'],wmap[i][i]['attr'])
    831
    832    def test(text):
    833            st = time.time()
    834            text = text
    835            for i in range(100):
    836                cws.send_text(text)
    837                while cws.get_result():
    838                    pass
    839
    840            ret = cws.get_tops(10,'r,v,p')
    841            print "No.\tWord\t\t\tAttr\tTimes\tRank\n------------------------------------------------------\n"
    842            i = 0
    843            for tmp in ret:
    844                i+=1
    845                print "%02d.\t%-8s\t%s\t%d\t%.2f" %( i, tmp['word'].decode('gbk'),tmp['attr'], tmp['times'], tmp['weight'])
    846
    847            print u'所花时间：',time.time()-st
    848    if __name__=='__main__':
    849        cws = PSCWS4('gbk')
    850        cws.set_dict('dict.xdb',True)
    851        cws.set_rule('rules.ini')
    852        cws.send_text("""中国航天官员应邀到美国与太空总署官员开会发展中国家上海大学城书店表面的东西今天我买了一辆面的，于是我坐着面的去上班化妆和服装这个门把手坏了，请把手拿开将军任命了一名中将，产量三年中将增长两倍王军虎去广州了，王军虎头虎脑的欧阳明练功很厉害可是马明练不厉害北京华烟云人中出吕布马中出赤兔Q1,中我要买Q币充值""")
    853        cws.set_igonre(False) #设置忽略符号与无用字符
    854        #cws.set_debug(True) #设置是否显示分词调试信息
    855        cws.set_multi(3) #设置复合分词等级 ($level = 0,15)
    856        cws.set_duality(True) #设置是否自动将散字二元化
    857        #test("中国航天官员应邀到美国与太空总署官员开会发展中国家上海大学城书店表面的东西今天我买了一辆面的，于是我坐着面的去上班化妆和服装这个门把手坏了，请把手拿开将军任命了一名中将，产量三年中将增长两倍王军虎去广州了，王军虎头虎脑的欧阳明练功很厉害可是马明练不厉害京华烟云人中出吕布马中出赤兔Q1,中我要买Q币充值")
    858
    859    while 1:
    860        tmp = cws.get_result()
    861        if(not tmp):break
    862        line = ''
    863        for w in tmp:
    864            if (w['word'] == "\r"): continue
    865            if (w['word'] == "\n"):
    866                line = line.rstrip(' ') + "\n"
    867            #else: line .= w['word'] . "/{w['attr']} "
    868            else: line += w['word'] + " "
    869        print line
    870    #t = '，'
    871    #print len(t[0:2])
    872    #print ord(t[1])
    873
    874    #    ret = cws.get_tops(10,'r,v,p')
    875
    876    #    print "No.\tWord\t\t\tAttr\tTimes\tRank\n------------------------------------------------------\n"
    877    #    i = 0
    878    #    for tmp in ret:
    879    #        i+=1
    880    #       print "%02d.\t%-8s\t%s\t%d\t%.2f" %( i, tmp['word'].decode('gbk'),tmp['attr'], tmp['times'], tmp['weight'])

    xdb_r.py文件源码：
    view source
    001    #coding=gbk
    002    import os
    003    import struct
    004    import sys
    005    reload(sys)
    006    sys.setdefaultencoding('gbk')
    XDB_VERSION = 34  # 0x01 ~ 0xff
    XDB_TAGNAME = b'XDB'  # first bytes of a valid file
    XDB_MAXKLEN = 0xf0  # maximum key length: < 255


    class XDB_R(object):
        """Reader for XDB files: validates the header and looks values up by key.

        File layout as read by this class:
          * 32-byte header: tag, version, hash base, hash prime, file size,
            a float and 12 trailing bytes (the last two are not used here);
          * a hash table starting at offset 32 made of 8-byte slots
            ``(offset, length)`` pointing at the root record of a tree;
          * records: ``loff, llen, roff, rlen`` (4 unsigned ints), a 1-byte key
            length, the key, then the value.

        All integers are unpacked with the platform's native byte order.
        Keys are byte strings; text keys are encoded with ``key.encode()``.
        """

        fd = False        # open file object, or False when no file is open
        hash_base = 0     # initial hash value, read from the header
        hash_prime = 0    # number of hash slots, read from the header
        memread = None    # whole file content when memory mode is on
        mem = False       # memory mode: serve reads from `memread`
        off = 0           # current read position in `memread`
        len = 0           # length of `memread`

        def __init__(self, mem=False):
            """Create a reader; with ``mem=True`` the file is loaded into memory."""
            self.mem = mem

        def __del__(self):
            self.Close()

        def Open(self, fpath):
            """Open *fpath* and validate its header.

            Any previously opened file is closed first.  Returns True on
            success; raises Exception when the file cannot be opened or when
            its header is invalid (the file handle is released in that case).
            """
            self.Close()
            try:
                fd = open(fpath, 'rb')
            except IOError:
                raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb failed.')
            self.fd = fd
            if self.mem:
                self.memread = fd.read()
                self.len = len(self.memread)
                self.off = 0
            if self._check_header(fd) is False:
                # Release the handle *before* raising; code placed after the
                # raise would never run and the reader would look opened.
                self._close()
                raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb format.')
            return True

        def _read(self, size):
            """Return up to *size* bytes from the current position and advance it."""
            if self.mem:
                data = self.memread[self.off:self.off + size]
                # advance like a file read does, so both backends behave alike
                self.off += len(data)
                return data
            return self.fd.read(size)

        def _seek(self, seek, flag=False):
            """Move the read position to *seek*.

            *flag* is the ``whence`` value handed to the file object; it is
            ignored in memory mode, where *seek* is always absolute.
            Raises Exception in memory mode when *seek* lies past the data.
            """
            if self.mem:
                # validate the requested offset, not the previous one
                if seek > self.len:
                    raise Exception('Mem offset !')
                self.off = seek
            else:
                self.fd.seek(seek, flag)

        def _close(self):
            """Drop the in-memory copy and close the file handle (both modes)."""
            self.memread = None
            self.len = 0
            self.off = 0
            if self.fd is not False:
                self.fd.close()
            self.fd = False

        def Get(self, key):
            """Return the value stored under *key*, or False when absent.

            False is also returned for an empty key, a key longer than
            XDB_MAXKLEN bytes, or a record whose value is empty.
            Raises Exception when no file is open.
            """
            if self.fd is False:
                raise Exception('XDB:Get(), null db handler.')
            if not isinstance(key, (bytes, bytearray)):
                # text key: hash and compare its encoded bytes
                key = key.encode()
            klen = len(key)
            if klen == 0 or klen > XDB_MAXKLEN:
                return False
            rec = self._get_record(key)
            if not rec.get('vlen'):
                return False
            return rec['value']

        def Close(self):
            """Close the file if one is open; safe to call repeatedly."""
            if self.fd is False:
                return
            self._close()

        def _get_index(self, key):
            """Return the hash-table slot of byte string *key*.

            Bytes are folded in from the last to the first:
            ``h = ((h * 33) ^ byte) & 0x7fffffff``, starting at ``hash_base``;
            the result is reduced modulo ``hash_prime``.
            """
            h = self.hash_base
            for byte in reversed(bytearray(key)):
                h += (h << 5)
                h ^= byte
                h &= 0x7fffffff
            return h % self.hash_prime

        def _check_header(self, fd):
            """Read the 32-byte header from *fd* and store its fields on self.

            Returns False when the header is short, the tag is not
            XDB_TAGNAME, or the recorded size differs from the real file size;
            returns True otherwise.
            """
            fd.seek(0, os.SEEK_SET)
            buf = fd.read(32)
            if len(buf) != 32:
                return False
            # the float and the 12 trailing bytes are read but not used
            tag, ver, base, prime, fsize, _check, _tail = struct.unpack('3s B I I I f 12s', buf)
            if tag != XDB_TAGNAME:
                return False
            if os.fstat(fd.fileno()).st_size != fsize:
                return False
            self.hash_base = base
            self.hash_prime = prime
            self.version = ver
            self.fsize = fsize
            return True

        def _get_record(self, key):
            """Locate the record for *key*: hash slot first, then the slot's tree."""
            self._io_times = 1
            index = self._get_index(key) if self.hash_prime > 1 else 0
            poff = index * 8 + 32
            self._seek(poff, os.SEEK_SET)
            buf = self._read(8)
            if len(buf) == 8:
                off, length = struct.unpack('I I', buf)
            else:
                off, length = 0, 0
            return self._tree_get_record(off, length, poff, key)

        def _tree_get_record(self, off, len, poff=0, key=''):
            """Walk the binary tree rooted at (*off*, *len*) looking for *key*.

            Returns ``{'poff': ...}`` when the walk ends on an empty child, or
            the full record dict (pointers, key, ``voff``/``vlen``, ``value``)
            when the key is found.  An empty *key* matches the first record.
            *poff* is the offset of the pointer that led to the current node.
            """
            while len != 0:
                self._io_times += 1
                self._seek(off, os.SEEK_SET)
                # a record head is 17 bytes and a key at most XDB_MAXKLEN bytes
                buf = self._read(min(XDB_MAXKLEN + 17, len))
                loff, llen, roff, rlen, klen = struct.unpack('I I I I B', buf[0:17])
                fkey = buf[17:17 + klen]
                if key and key != fkey:
                    if key > fkey:
                        # right child pointer sits 8 bytes into the record
                        poff, off, len = off + 8, roff, rlen
                    else:
                        poff, off, len = off, loff, llen
                    continue
                rec = {
                    'loff': loff, 'llen': llen, 'roff': roff, 'rlen': rlen,
                    'klen': klen,
                    'poff': poff,
                    'off': off,
                    'len': len,
                    'voff': off + 17 + klen,
                    'vlen': len - 17 - klen,
                    'key': fkey,
                }
                self._seek(rec['voff'], os.SEEK_SET)
                rec['value'] = self._read(rec['vlen'])
                return rec
            return {'poff': poff}
    143    #
    144
    145    #aa = XDB_R(True)
    146    #aa.Open('./dict.xdb')
    147    #aab = aa.Get('上海')
    148    #print aab

posted on 2012-12-12 15:52 lexus 阅读(1105) 评论(0) 收藏举报

刷新页面返回顶部

浙江省高等学校教师教育理论培训

公告

pyscws4 是一个python的分词程序