1、Building a list with for...if (list comprehension)
segs = [v for v in segs if not str(v).isdigit()]  # remove digits
https://www.cnblogs.com/eniac1946/p/7327144.html
Basic for/if syntax and examples:
https://www.cnblogs.com/huchong/p/9328687.html
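A minimal sketch of the for...if pattern above (the tokens are made up for illustration):

```python
# Keep only non-numeric tokens, mirroring the snippet in this item
segs = ['价格', '123', '上涨', '45', '明显']
segs = [v for v in segs if not str(v).isdigit()]  # remove digits
print(segs)  # ['价格', '上涨', '明显']
```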
2、Usage of Python's lambda, filter, map, and reduce
https://www.cnblogs.com/yufeihlf/p/6179982.html
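A self-contained sketch of all four; note that in Python 3, reduce must be imported from functools:

```python
from functools import reduce

nums = [1, 2, 3, 4, 5]

square = lambda x: x ** 2                           # anonymous function
evens = list(filter(lambda x: x % 2 == 0, nums))    # filter: keep elements matching a predicate
squares = list(map(square, nums))                   # map: apply a function to each element
total = reduce(lambda a, b: a + b, nums)            # reduce: fold the list into one value

print(evens, squares, total)  # [2, 4] [1, 4, 9, 16, 25] 15
```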
3、pandas DataFrame grouping, concatenation, and statistical operations
https://blog.csdn.net/cymy001/article/details/78300900
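A toy sketch of the DataFrame operations the linked article covers (the frame and column names here are made up):

```python
import pandas as pd

df = pd.DataFrame({'city': ['北京', '北京', '上海'], 'sales': [10, 20, 30]})

grouped = df.groupby('city')['sales'].agg(['sum', 'mean'])  # group + aggregate
combined = pd.concat([df, df], ignore_index=True)           # stack two frames row-wise
print(grouped)
print(combined.shape)  # (6, 2)
```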
4、jieba word segmentation: introduction and beginner examples
https://www.cnblogs.com/tonglin0325/p/6298456.html
jieba, advanced: https://www.cnblogs.com/wangbaihan/p/9474295.html
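A minimal jieba sketch using the library's standard example sentence, showing its three cut modes:

```python
import jieba

text = '我来到北京清华大学'
print(jieba.lcut(text))                # accurate mode: ['我', '来到', '北京', '清华大学']
print(jieba.lcut(text, cut_all=True))  # full mode: every possible word
print(jieba.lcut_for_search(text))     # search-engine mode: finer-grained cuts
```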
5、The bag-of-words model
https://baike.baidu.com/item/%E8%AF%8D%E8%A2%8B%E6%A8%A1%E5%9E%8B/22776998?fr=aladdin
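A minimal bag-of-words sketch with scikit-learn's CountVectorizer (toy documents; on scikit-learn < 1.0 use get_feature_names() instead):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: each document becomes a vector of word counts; word order is discarded
docs = ['the cat sat', 'the cat sat on the mat']
vec = CountVectorizer()
X = vec.fit_transform(docs)
print(vec.get_feature_names_out())  # the vocabulary
print(X.toarray())                  # count matrix, one row per document
```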
6、Comparing the similarity of two documents with docsim/doc2vec/LSH
https://blog.csdn.net/vs412237401/article/details/52238248
https://blog.csdn.net/qq_16633405/article/details/80578804
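A minimal docsim-style sketch with gensim (the tokenized corpus is made up; MatrixSimilarity computes cosine similarity over the TF-IDF vectors):

```python
from gensim import corpora, models, similarities

# Tokenized corpus (pretend these came from jieba)
texts = [['人工', '智能', '发展'], ['智能', '手机', '发展'], ['今天', '天气', '不错']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
tfidf = models.TfidfModel(corpus)
index = similarities.MatrixSimilarity(tfidf[corpus])

query = dictionary.doc2bow(['智能', '发展'])
print(index[tfidf[query]])  # similarity of the query against each document
```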
7、Python file operations
https://blog.csdn.net/qq_37383691/article/details/76060972
Common open() modes:
- w: open for writing (truncates the file)
- a: open for appending (starts at EOF; creates the file if needed)
- r+: open for reading and writing
- w+: open for reading and writing (see w)
- a+: open for reading and writing (see a)
- rb: open for reading in binary mode
- wb: open for writing in binary mode (see w)
- ab: open for appending in binary mode (see a)
- rb+: open for reading and writing in binary mode (see r+)
- wb+: open for reading and writing in binary mode (see w+)
- ab+: open for reading and writing in binary mode (see a+)
fp.read([size]): read up to size characters/bytes; the whole file if size is omitted
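A short sketch of the most common modes (demo.txt is a hypothetical file):

```python
# 'w' truncates, 'a' appends from EOF, 'r' reads
with open('demo.txt', 'w', encoding='utf-8') as fp:
    fp.write('first line\n')

with open('demo.txt', 'a', encoding='utf-8') as fp:
    fp.write('second line\n')

with open('demo.txt', 'r', encoding='utf-8') as fp:
    print(fp.read(5))  # read at most 5 characters
    print(fp.read())   # read the rest of the file
```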
8、Short-text similarity with LSHForest
"LSH | Locally sensitive random projection forests in Python — LSHForest/sklearn (part 1)": introduces the concepts
"Comparing the similarity of two documents with docsim/doc2vec/LSH"
"Text similarity with LSHForest": includes sample code and data
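A minimal LSHForest sketch with made-up documents. Note that LSHForest was deprecated in scikit-learn 0.19 and removed in 0.21, so this only runs on older versions (e.g. 0.20.x):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import LSHForest  # removed in scikit-learn 0.21

docs = ['今天 天气 不错', '今天 天气 很好', '股票 大幅 上涨']
X = TfidfVectorizer().fit_transform(docs)

lshf = LSHForest(random_state=42)
lshf.fit(X)
# Approximate nearest neighbors of the first document
distances, indices = lshf.kneighbors(X[0], n_neighbors=2)
print(indices, distances)
```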
9、Extracting industry keywords with TF-IDF
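One common approach is jieba's built-in TF-IDF keyword extractor; a minimal sketch with a made-up sentence:

```python
import jieba.analyse

text = '人工智能行业发展迅速,机器学习和深度学习人才需求旺盛'
# TF-IDF based extraction; withWeight=True also returns each keyword's score
for keyword, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(keyword, weight)
```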
10、scikit-learn
11、Document classification with jieba, TfidfVectorizer, and LogisticRegression
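A minimal sketch of this pipeline; the four pre-tokenized training documents and their labels are made up, and in practice jieba produces the space-joined tokens:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

texts = ['老公 动手 打人', '股票 持续 上涨', '被 老公 家暴', '大盘 震荡 下跌']
labels = ['家庭', '财经', '家庭', '财经']

clf = make_pipeline(TfidfVectorizer(), LogisticRegression())
clf.fit(texts, labels)
print(clf.predict(['股市 行情 上涨']))  # expected: ['财经']
```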
12、CountVectorizer vs. TfidfVectorizer
"CountVectorizer and TfidfVectorizer: parameter details"
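A quick sketch contrasting the two on the same toy documents: CountVectorizer produces raw term counts, while TfidfVectorizer reweights them by inverse document frequency and normalizes:

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ['apple banana apple', 'banana cherry']
print(CountVectorizer().fit_transform(docs).toarray())  # raw counts
print(TfidfVectorizer().fit_transform(docs).toarray())  # idf-weighted, normalized
```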
13、Merging multiple lists into one in Python
1、Use the "+" operator: c = a + b
2、Use the extend method: a.extend(b)
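A small sketch showing the practical difference: "+" builds a new list, while extend mutates in place:

```python
a = [1, 2]
b = [3, 4]
c = a + b    # '+' creates a new list; a and b are unchanged
a.extend(b)  # extend appends b's elements to a in place
print(c, a)  # [1, 2, 3, 4] [1, 2, 3, 4]
```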
14、String prefix and suffix checks
item.endswith('.mp4')
item.startswith('demo')
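For example, filtering a hypothetical file list by suffix or prefix:

```python
files = ['demo1.mp4', 'demo2.txt', 'intro.mp4']
mp4s = [f for f in files if f.endswith('.mp4')]     # filter by suffix
demos = [f for f in files if f.startswith('demo')]  # filter by prefix
print(mp4s, demos)  # ['demo1.mp4', 'intro.mp4'] ['demo1.mp4', 'demo2.txt']
```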
16、Unsupervised text classification
Article: http://blogspring.cn/view/234
Source code: https://blog.csdn.net/lhxsir/article/details/83310136
```python
import random
import multiprocessing  # imported in the source but unused in this excerpt

import jieba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim                       # unused in this excerpt
from gensim.models import Word2Vec  # unused in this excerpt
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import scale  # unused in this excerpt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Load stopwords
stopwords = pd.read_csv('D://input_py//day07//stopwords.txt', index_col=False, quoting=3,
                        sep="\t", names=['stopword'], encoding='utf-8')
stopwords = stopwords['stopword'].values

# Load the corpus (note: the source reads beilaogongda.csv twice; a separate
# 'laopo' corpus file is presumably intended for the second read)
laogong_df = pd.read_csv('D://input_py//day07//beilaogongda.csv', encoding='utf-8', sep=',')
laopo_df = pd.read_csv('D://input_py//day07//beilaogongda.csv', encoding='utf-8', sep=',')
erzi_df = pd.read_csv('D://input_py//day07//beierzida.csv', encoding='utf-8', sep=',')
nver_df = pd.read_csv('D://input_py//day07//beinverda.csv', encoding='utf-8', sep=',')

# Drop NaN rows
laogong_df.dropna(inplace=True)
laopo_df.dropna(inplace=True)
erzi_df.dropna(inplace=True)
nver_df.dropna(inplace=True)

# Convert the 'segment' column to plain lists
laogong = laogong_df.segment.values.tolist()
laopo = laopo_df.segment.values.tolist()
erzi = erzi_df.segment.values.tolist()
nver = nver_df.segment.values.tolist()

# Tokenize and clean each line, appending the space-joined result to sentences
def preprocess_text(content_lines, sentences):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
            segs = [v for v in segs if not str(v).isdigit()]         # remove digits
            segs = list(filter(lambda x: x.strip(), segs))           # remove whitespace-only tokens
            segs = list(filter(lambda x: len(x) > 1, segs))          # drop single-character tokens
            segs = list(filter(lambda x: x not in stopwords, segs))  # remove stopwords
            sentences.append(" ".join(segs))
        except Exception:
            print(line)
            continue

sentences = []
preprocess_text(laogong, sentences)
preprocess_text(laopo, sentences)
preprocess_text(erzi, sentences)
preprocess_text(nver, sentences)
random.shuffle(sentences)

# Print the first 10 processed sentences
for sentence in sentences[:10]:
    print(sentence)

# Turn the text into a term matrix: element a[i][j] is the weight of term j in document i
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
# Compute tf-idf weights on top of the vectorizer output
transformer = TfidfTransformer()
# The inner fit_transform builds the term matrix; the outer one applies tf-idf
tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences))
# All terms in the bag-of-words vocabulary (get_feature_names_out() on scikit-learn >= 1.0)
word = vectorizer.get_feature_names()
# Dense tf-idf matrix: w[i][j] is the tf-idf weight of term j in document i
weight = tfidf.toarray()
print('Features length: ' + str(len(word)))

# K-means clustering of the tf-idf vectors
numClass = 4  # number of clusters
clf = KMeans(n_clusters=numClass, max_iter=10000, init="k-means++", tol=1e-6)  # init="random" also works
pca = PCA(n_components=10)            # reduce dimensionality before clustering
TnewData = pca.fit_transform(weight)
s = clf.fit(TnewData)

# Plot the clustering result
def plot_cluster(result, newData, numClass):
    plt.figure(2)
    Lab = [[] for i in range(numClass)]
    index = 0
    for labi in result:
        Lab[labi].append(index)
        index += 1
    color = ['oy', 'ob', 'og', 'cs', 'ms', 'bs', 'ks', 'ys', 'yv', 'mv', 'bv',
             'kv', 'gv', 'y^', 'm^', 'b^', 'k^', 'g^'] * 3
    for i in range(numClass):
        x1 = []
        y1 = []
        for ind1 in newData[Lab[i]]:
            try:
                y1.append(ind1[1])
                x1.append(ind1[0])
            except:
                pass
        plt.plot(x1, y1, color[i])
    # Plot the cluster centers
    x1 = []
    y1 = []
    for ind1 in clf.cluster_centers_:
        try:
            y1.append(ind1[1])
            x1.append(ind1[0])
        except:
            pass
    plt.plot(x1, y1, "rv")
    plt.show()

# Option 1: reduce to 2 dimensions with PCA alone and plot
# pca = PCA(n_components=2)
# newData = pca.fit_transform(weight)
# result = list(clf.predict(TnewData))
# plot_cluster(result, newData, numClass)

# Option 2: reduce with PCA first, then t-SNE, and plot
newData = PCA(n_components=4).fit_transform(weight)
newData = TSNE(2).fit_transform(newData)
result = list(clf.predict(TnewData))
plot_cluster(result, newData, numClass)
```
17、Clustering Chinese text with K-means and TF-IDF, with visualization
18、Python jieba segmentation: extracting words, loading words, adjusting word frequency, and defining a custom dictionary
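A minimal sketch of jieba's dictionary-management APIs (userdict.txt is a hypothetical path; the suggest_freq example is from jieba's README):

```python
import jieba

# jieba.load_userdict('userdict.txt')  # hypothetical path; one 'word freq pos' entry per line
jieba.add_word('云计算', freq=10000)    # add a word at runtime
jieba.del_word('自定义词')              # remove a word
# Force '中将' to be split into '中'/'将'
jieba.suggest_freq(('中', '将'), True)
print(jieba.lcut('如果放到post中将出错'))  # ['如果', '放到', 'post', '中', '将', '出错']
```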
19、Naive Bayes and SVM text classification
SVM: a support vector machine (commonly abbreviated SVM, also called a support vector network) is a supervised learning model used for classification and regression analysis.
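A minimal sketch comparing the two classifiers on the same made-up TF-IDF features; MultinomialNB and LinearSVC stand in for "Naive Bayes" and "SVM":

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Made-up, pre-tokenized training data
texts = ['老公 动手 打人', '股票 大幅 上涨', '被 老公 家暴', '大盘 持续 下跌']
labels = [0, 1, 0, 1]

X = TfidfVectorizer().fit_transform(texts)
for model in (MultinomialNB(), LinearSVC()):
    model.fit(X, labels)
    print(type(model).__name__, model.predict(X[:1]))  # both should predict class 0
```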