Reposted
"""
原始数据,用于建立模型
"""
courses = [
u'Writing II: Rhetorical Composing',
u'Genetics and Society: A Course for Educators',
u'General Game Playing',
u'Genes and the Human Condition (From Behavior to Biotechnology)',
u'A Brief History of Humankind',
u'New Models of Business in Society',
u'Analyse Numérique pour Ingénieurs',
u'Evolution: A Course for Educators',
u'Coding the Matrix: Linear Algebra through Computer Science Applications',
u'The Dynamic Earth: A Course for Educators',
u'Tiny Wings\tYou have always dreamed of flying - but your wings are tiny. Luckily the world is full of beautiful hills. Use the hills as jumps - slide down, flap your wings and fly! At least for a moment - until this annoying gravity brings you back down to earth. But the next hill is waiting for you already. Watch out for the night and fly as fast as you can. ',
u'Angry Birds Free',
u'没有\\它很相似',
u'没有\t它很相似',
u'没有\t他很相似',
u'没有\t他不很相似',
u'没有',
u'可以没有',
u'也没有',
u'有没有也不管',
u'Angry Birds Stella',
u'Flappy Wings - FREE\tFly into freedom!A parody of the #1 smash hit game!',
u'没有一个',
u'没有一个2',
]
courses_name = courses
"""
预处理(easy_install nltk)
"""
def pre_process_cn(courses, low_freq_filter=True):
    """
    Simplified Chinese + English preprocessing:
    1. remove stop words
    2. remove punctuation
    3. reduce words to stems
    4. drop low-frequency words
    """
    import nltk
    import jieba.analyse
    from nltk.tokenize import word_tokenize
    # word_tokenize needs the NLTK 'punkt' data: nltk.download('punkt')

    # Tokenize with NLTK, then let jieba extract keywords from each token;
    # jieba's extract_tags also filters common stop words (step 1)
    texts_tokenized = []
    for document in courses:
        texts_tokenized_tmp = []
        for word in word_tokenize(document):
            texts_tokenized_tmp += jieba.analyse.extract_tags(word, 10)
        texts_tokenized.append(texts_tokenized_tmp)
    texts_filtered_stopwords = texts_tokenized

    # Step 2: drop punctuation tokens
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document if word not in english_punctuations]
                      for document in texts_filtered_stopwords]

    # Step 3: stem English words (Chinese tokens are left essentially unchanged)
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]

    # Step 4: drop stems that occur only once in the whole corpus
    if low_freq_filter:
        all_stems = sum(texts_stemmed, [])
        stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
        texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
    else:
        texts = texts_stemmed
    return texts
lib_texts = pre_process_cn(courses)
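"""
A quick sanity check (not in the original post): print the processed token
lists for the first few courses to confirm jieba/NLTK produced something
sensible before training.
"""
for name, tokens in zip(courses_name[:3], lib_texts[:3]):
    print(name, '->', tokens)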
"""
引入gensim,正式开始处理(easy_install gensim)
"""
def train_by_lsi(lib_texts):
    """
    Train an LSI model over the corpus and build a similarity index.
    """
    from gensim import corpora, models, similarities

    dictionary = corpora.Dictionary(lib_texts)
    corpus = [dictionary.doc2bow(text) for text in lib_texts]  # bag-of-words vectors
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Fold the tf-idf corpus into a 10-topic latent semantic space
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    # Index every document's LSI vector for fast cosine-similarity lookups
    index = similarities.MatrixSimilarity(lsi[corpus])

    return (index, dictionary, lsi)
(index,dictionary,lsi) = train_by_lsi(lib_texts)
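"""
Optional sketch (not in the original post): gensim objects support save/load,
so the trained artifacts can be persisted and later runs can skip retraining.
The file names below are arbitrary choices.
"""
dictionary.save('courses.dict')
lsi.save('courses.lsi')
index.save('courses.index')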
target_courses = [u'没有']
target_text = pre_process_cn(target_courses, low_freq_filter=False)
"""
对具体对象相似度匹配
"""
ml_course = target_text[0]
ml_bow = dictionary.doc2bow(ml_course)

# Map the query into the LSI space and rank all courses by cosine similarity
ml_lsi = lsi[ml_bow]
sims = index[ml_lsi]
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

print(sort_sims[0:10])  # top 10 (document index, similarity) pairs
print(courses_name[sort_sims[1][0]])
print(courses_name[sort_sims[2][0]])
print(courses_name[sort_sims[3][0]])
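"""
A small helper (an assumption, not from the original post) that wraps the
query steps above into one reusable function: preprocess the query, project
it into the LSI space, and return the top-n most similar course names.
"""
def recommend(query, topn=3):
    tokens = pre_process_cn([query], low_freq_filter=False)[0]
    bow = dictionary.doc2bow(tokens)
    ranked = sorted(enumerate(index[lsi[bow]]), key=lambda item: -item[1])
    return [(courses_name[i], score) for i, score in ranked[:topn]]

print(recommend(u'Angry Birds'))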