# -*- coding: utf-8 -*-
"""Toy text-classification demo: TF-IDF features + Multinomial Naive Bayes.

Running (or importing) this module fits the classifier on a small hard-coded
corpus and prints a cross-validated confusion matrix.
"""
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn import metrics
from text.textpredict import *

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the legacy module on old installations.
# Imported after the project star-import so this name always wins, exactly
# as the original import order guaranteed.
try:
    from sklearn.model_selection import cross_val_predict
except ImportError:
    from sklearn.cross_validation import cross_val_predict


def chi22():
    """Train a TF-IDF + MultinomialNB model and print its confusion matrix.

    The corpus is ten short Chinese texts, several of them already split
    into space-separated tokens, paired with binary labels.  Label 1 appears
    to mark advertisement/recruitment-style posts and label 0 ordinary
    posts -- TODO confirm with the data owner.

    Returns:
        The confusion matrix computed from cross-validated predictions
        (also printed to stdout).
    """
    train_words = [
        "急需 钱用 不用 出售 如图 价值 千多 便宜 出售 出售 急 ",
        "读 读 重复 读好输 不变 绿 求高人 指点迷津 ",
        "诚召搛只呆家小时工,全职妈妈、在校学生、在家待业者、上班族、游戏者皆可做!每天5",
        "发福利了 火熱找小莳工,每天在綫2--3小莳,莳涧地點没限制,薪资鈤结80--150/",
        "急招小时工,每天在綫2--3小拭,时间地点没限制,薪资日结80--150/天,适 急招小时工,每天在綫2--3小拭,时间地点没限制,薪资日结80--150/天,适合学生党,手机党,上班族,有空闲时间者,有興趣缪系,QQ(937117723)咨询,此处不回!!",
        "发福利来 火熱找小莳工,每天在綫2--3小莳,莳涧地點没限制,薪资鈤结80--150/",
        " 读 不好 呜呜 ",
        "这句 话 总是 知道 连读 ",
        "求 师傅 交 口语 求有 耐心 老师 基础 学 ",
        "听到 读 ",
    ]
    train_tags = [1, 0, 1, 1, 1, 1, 0, 0, 0, 0]

    # NOTE: an earlier, disabled variant of this experiment used raw term
    # counts (CountVectorizer with max_df=0.5) followed by chi-squared
    # feature selection (SelectKBest(chi2, 10)) instead of TF-IDF.  It was
    # kept only as an inert string literal and has been dropped here.

    # TF-IDF weighted bag-of-words features for every document.
    vectorizer = TfidfVectorizer()
    tfidf_train = vectorizer.fit_transform(train_words)

    clf = MultinomialNB(alpha=0.01)
    clf.fit(tfidf_train, np.asarray(train_tags))

    # Out-of-fold predictions from cross-validation (library-default folds).
    predicted = cross_val_predict(clf, tfidf_train, train_tags)

    confusion = metrics.confusion_matrix(train_tags, predicted)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(confusion)
    return confusion


chi22()
# From novice to expert: this is the path.