Python机器学习-中文文本特征提取

# Chinese word segmentation
def cut_word(text):
    """Segment ``text`` with ``jieba.cut`` and join the tokens with spaces.

    Args:
        text: The string to segment.

    Returns:
        A single string in which the tokens produced by ``jieba.cut`` are
        separated by one space each, so that a whitespace-based vectorizer
        can treat them as individual words.
    """
    tokens = jieba.cut(text)
    return " ".join(tokens)

# Feature extraction for Chinese text (bag-of-words counts)
def count_chinese_dome():
    """Demonstrate ``CountVectorizer`` on a jieba-segmented Chinese text.

    The sample text is segmented with ``cut_word`` and then vectorized.
    Three things are printed, in order: the dense count matrix, the list of
    feature names (vocabulary), and the sparse matrix returned by
    ``fit_transform``.

    Returns:
        None. The results are only printed.
    """
    # NOTE: the three literals are adjacent with no commas between them, so
    # Python concatenates them into ONE string; the corpus therefore holds a
    # single document.
    data = [
        (
            "10艘中俄军舰穿过津轻海峡,这一举措合乎国际法,无可指摘,却引起日本国内“异样反应”。"
            "19日,日本内阁官房副长官矶崎仁彦称,日方对此“高度关注”,"
            "“将对我国周边海空域进行警戒和监视,采取万全的应对姿态”。"
        )
    ]
    # Segment every document so that words are separated by spaces.
    data_new = [cut_word(sent) for sent in data]

    # 1. Instantiate the transformer (an empty stop-word list is passed).
    transfer = CountVectorizer(stop_words=[])
    # 2. Learn the vocabulary and build the document-term matrix.
    data_new_2 = transfer.fit_transform(data_new)
    print(data_new_2.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back for old versions.
    # The array is converted to a list so the printed form stays the same.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print(feature_names)
    print(data_new_2)

# TF-IDF text feature extraction
def tfidf_demo():
    """Demonstrate ``TfidfVectorizer`` on a jieba-segmented Chinese text.

    The sample text is segmented with ``cut_word`` and then vectorized.
    Two things are printed, in order: the dense TF-IDF matrix and the list
    of feature names (vocabulary).

    Returns:
        None. The results are only printed.
    """
    # NOTE: the three literals are adjacent with no commas between them, so
    # Python concatenates them into ONE string; the corpus therefore holds a
    # single document, and every term has the same document frequency.
    data = [
        (
            "10艘中俄军舰穿过津轻海峡,这一举措合乎国际法,无可指摘,却引起日本国内“异样反应”。"
            "19日,日本内阁官房副长官矶崎仁彦称,日方对此“高度关注”,"
            "“将对我国周边海空域进行警戒和监视,采取万全的应对姿态”。"
        )
    ]
    # Segment every document so that words are separated by spaces.
    data_new = [cut_word(sent) for sent in data]

    # 1. Instantiate the transformer (an empty stop-word list is passed).
    transfer = TfidfVectorizer(stop_words=[])
    # 2. Learn the vocabulary/IDF and build the TF-IDF matrix.
    data_new_2 = transfer.fit_transform(data_new)
    print(data_new_2.toarray())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back for old versions.
    # The array is converted to a list so the printed form stays the same.
    if hasattr(transfer, "get_feature_names_out"):
        feature_names = transfer.get_feature_names_out().tolist()
    else:
        feature_names = transfer.get_feature_names()
    print(feature_names)

 

posted @ 2021-10-16 21:23  风吹过半夏  阅读(605)  评论(0)  编辑  收藏  举报