【python】字符串编码问题

参考:http://blog.csdn.net/tingsking18/article/details/4033645

Python 2 内部用 unicode 对象来表示文本字符串,而普通的 str 是按某种编码(例如 gbk、utf-8)存储的字节串

decode函数用来将其他编码解码为unicode

encode函数将unicode编码为指定的编码类型,例如gbk,utf-8

# -*- coding: utf-8 -*-
"""
Created on Wed Jan 15 15:20:59 2014

@author: hp
"""


import urllib2
import re
import time
import jieba


# Address of the blog post that geturl() is called with.
url = "http://blog.sina.com.cn/s/blog_608e1afd0102e5ym.html"
def geturl(url):
    html=urllib2.urlopen(url).read()
    html=unicode(html,'utf-8')
    word=re.findall(ur"[\u4e00-\u9fa5]+",html)
    
    s=""
    for w in word:
        s+=w
    return s  #return web content
def separate_word(s):
    """Split text into words using ``jieba.cut`` with ``cut_all=False``.

    Args:
        s: Text to segment.

    Returns:
        A list with every token produced by ``jieba.cut``, in the order
        they were produced. Empty when jieba yields no tokens.
    """
    # Use jieba's tokens directly. Joining them with "/ " and re-parsing the
    # result character by character lost the final word (no "/" follows it),
    # left a leading space on every word after the first (the in-loop
    # ``i += 1`` cannot skip an iteration of a ``for`` loop), and would split
    # any token that itself contains "/".
    return list(jieba.cut(s, cut_all=False))
    
def count_word(word_list):
    """Count how many times each word occurs in ``word_list``.

    Args:
        word_list: Sequence of words (text strings) to tally.

    Returns:
        A plain dict mapping each distinct word, encoded as GBK bytes, to
        the number of times it appears in ``word_list``. Empty input gives
        an empty dict.

    Raises:
        UnicodeEncodeError: If a word contains a character GBK cannot
            represent.
    """
    dic = {}
    for word in word_list:
        # Keys are GBK-encoded because that is the form callers receive.
        key = word.encode('gbk')
        # One pass with a dict lookup replaces the nested index loops that
        # rescanned the whole list for every distinct word.
        dic[key] = dic.get(key, 0) + 1
    return dic
    
contant=geturl(url)
word=separate_word(contant)
result=count_word(word)
for key in result.keys():
    print key.encode('gbk'),"--->",result[key]
#print result

time.sleep(10)

 

posted on 2014-01-16 16:59  colipso  阅读(353)  评论(0)  编辑  收藏  举报

导航