DataMining-相似性度量

(2017-04-05 银河统计)

相似性度量

  相似性和相异性被许多数据挖掘技术所使用,如聚类、最近邻分类、异常检测等。样本之间的相似度是样本间相似程度的数值度量:两组样本越相似,它们的相似度就越高,相异度就越低。通常用各种“距离”来衡量样本(观测值)的相似性,用相似系数来衡量指标(变量)的相似性。

  原理详细讲解和网页(JS)计算实现,见银河统计相似性度量 - 数据挖掘算法

  R和Python计算实现见下文。


目录概览

1) R语言实战

  • A) "距离"计算

    1、欧氏距离(Euclidean Distance)

    2、曼哈顿距离(绝对值距离)(Manhattan Distance)

    3、切比雪夫距离(Chebyshev Distance)

    4、闵氏距离(Minkowski Distance)

    5、马氏距离(Mahalanobis Distance)

  • B) 相似系数计算

    1、皮尔逊相关系数(Pearson Correlation Coefficient)

    2、斯皮尔曼秩相关系数(Spearman Rank Correlation)

    3、肯德尔秩相关系数(Kendall Rank Correlation)

    4、余弦相似度(Cosine Similarity)

2) Python实战

3) R语言函数封装

  • A) "距离"计算函数封装

  • B) 相似系数计算函数封装

4) Python函数封装


Data - 10名学生六门课程成绩表

序号 概率论 统计学 英语 政治 数据挖掘 线性代数
1 67 63 73 75 44 91
2 74 69 66 94 81 55
3 76 93 93 79 71 27
4 65 38 85 85 61 45
5 80 39 48 75 41 52
6 72 80 70 88 86 43
7 60 50 91 95 42 64
8 77 49 69 50 89 55
9 65 89 50 70 99 85
10 78 41 55 89 71 28

1) R语言实战[返回]

A) "距离"计算


1、欧氏距离(Euclidean Distance)

 Code

	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table that was copied to the clipboard beforehand; the first
	# line holds the column names. The "clipboard" connection is platform-dependent
	# (on macOS, read from pipe("pbpaste") instead).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7; column 1 is the row id)
	A <- mydata[3,2:7]
	# Scores of student 5 (row 5, columns 2-7)
	B <- mydata[5,2:7]
	# Euclidean distance between students 3 and 5: stack both rows and let
	# dist() compare them.
	x <- rbind(A, B)
	# The four calls compute the same distance; diag/upper only change how the
	# resulting "dist" object is printed (diagonal / upper triangle shown or not).
	D35 <- dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
	D35
	D35 <- dist(x, method = "euclidean", diag = TRUE, upper = TRUE)
	D35
	D35 <- dist(x, method = "euclidean", diag = FALSE, upper = TRUE)
	D35
	D35 <- dist(x, method = "euclidean", diag = TRUE, upper = FALSE)
	D35
	class(D35)
	# cat() prints the single stored distance value.
	cat("欧氏距离 =", D35, "\n")

 Result

	> 
	> options(digits=4)
	> mydata <- read.table("clipboard",header=T)
	> class(mydata)
	[1] "data.frame"
	> dim(mydata)
	[1] 10  7
	> head(mydata)
	  序号 概率论 统计学 英语 政治 数据挖掘 线性代数
	1    1     67     63   73   75       44       91
	2    2     74     69   66   94       81       55
	3    3     76     93   93   79       71       27
	4    4     65     38   85   85       61       45
	5    5     80     39   48   75       41       52
	6    6     72     80   70   88       86       43
	> # 第3名学生成绩
	> A <- mydata[3,2:7]
	> A
	  概率论 统计学 英语 政治 数据挖掘 线性代数
	3     76     93   93   79       71       27
	> # 第5名学生成绩
	> B <- mydata[5,2:7]
	> B
	  概率论 统计学 英语 政治 数据挖掘 线性代数
	5     80     39   48   75       41       52
	> # 第3名和第5名学生成绩之间的欧氏距离
	> x <- rbind(A, B)
	> x
	  概率论 统计学 英语 政治 数据挖掘 线性代数
	3     76     93   93   79       71       27
	5     80     39   48   75       41       52
	> D35 <- dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
	> D35
	      3
	5 80.61
	> D35 <- dist(x, method = "euclidean", diag = TRUE, upper = TRUE)
	> D35
	      3     5
	3  0.00 80.61
	5 80.61  0.00
	> D35 <- dist(x, method = "euclidean", diag = FALSE, upper = TRUE)
	> D35
	      3     5
	3       80.61
	5 80.61      
	> D35 <- dist(x, method = "euclidean", diag = TRUE, upper = FALSE)
	> D35
	      3     5
	3  0.00      
	5 80.61  0.00
	> class(D35)
	[1] "dist"
	> cat("欧氏距离 =", D35, "\n")
	欧氏距离 = 80.61 
	> 

 Explanation

	1.读取"剪贴板"中的数据到R变量mydata中。【首先,复制数据Data,然后,运行Code程序!】

		mydata <- read.table("clipboard",header=T)

	2.距离计算
	    dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2) 
	    其中
			x 是样本矩阵或者数据框;
			method表示计算哪种距离;
			diag为TRUE的时候给出对角线上的距离;
			upper为TRUE的时候给出上三角矩阵上的值。
			
			method的取值有:
				euclidean        欧几里德距离(欧氏距离)(Euclidean Distance) 
				manhattan		 曼哈顿距离(绝对值距离)(Manhattan Distance)
				maximum          切比雪夫距离(Chebyshev Distance)
				minkowski        闵可夫斯基距离(Minkowski Distance)(要指定p值)   
				canberra         兰氏距离(Canberra Distance)

2、曼哈顿距离(绝对值距离)(Manhattan Distance)

 Code

	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7)
	A <- mydata[3,2:7]
	# Scores of student 5 (row 5, columns 2-7)
	B <- mydata[5,2:7]
	# Manhattan (absolute-value) distance between students 3 and 5
	x <- rbind(A, B)
	# Same distance each time; diag/upper only affect how the "dist" object prints.
	D35 <- dist(x, method = "manhattan", diag = FALSE, upper = FALSE)
	D35
	D35 <- dist(x, method = "manhattan", diag = TRUE, upper = TRUE)
	D35
	D35 <- dist(x, method = "manhattan", diag = FALSE, upper = TRUE)
	D35
	D35 <- dist(x, method = "manhattan", diag = TRUE, upper = FALSE)
	D35
	class(D35)
	cat("曼哈顿距离 =", D35, "\n")

 Result

	> 
	> D35 <- dist(x, method = "manhattan", diag = FALSE, upper = FALSE)
	> D35
	    3
	5 162
	> class(D35)
	[1] "dist"
	> cat("曼哈顿距离 =", D35, "\n")
	曼哈顿距离 = 162 
	> 

3、切比雪夫距离(Chebyshev Distance)

 Code

	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7)
	A <- mydata[3,2:7]
	# Scores of student 5 (row 5, columns 2-7)
	B <- mydata[5,2:7]
	# Chebyshev distance between students 3 and 5; dist() calls this
	# method "maximum".
	x <- rbind(A, B)
	# Same distance each time; diag/upper only affect how the "dist" object prints.
	D35 <- dist(x, method = "maximum", diag = FALSE, upper = FALSE)
	D35
	D35 <- dist(x, method = "maximum", diag = TRUE, upper = TRUE)
	D35
	D35 <- dist(x, method = "maximum", diag = FALSE, upper = TRUE)
	D35
	D35 <- dist(x, method = "maximum", diag = TRUE, upper = FALSE)
	D35
	class(D35)
	cat("切比雪夫距离 =", D35, "\n")

 Result

	> 
	> D35 <- dist(x, method = "maximum", diag = FALSE, upper = FALSE)
	> D35
	   3
	5 54
	> class(D35)
	[1] "dist"
	> cat("切比雪夫距离 =", D35, "\n")
	切比雪夫距离 = 54 
	> 

4、闵氏距离(Minkowski Distance)

 Code

	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7)
	A <- mydata[3,2:7]
	# Scores of student 5 (row 5, columns 2-7)
	B <- mydata[5,2:7]
	# Minkowski distance between students 3 and 5, with exponent p = 1.5
	x <- rbind(A, B)
	# Same distance each time; diag/upper only affect how the "dist" object prints.
	D35 <- dist(x, method = "minkowski", diag = FALSE, upper = FALSE, p = 1.5)
	D35
	D35 <- dist(x, method = "minkowski", diag = TRUE, upper = TRUE, p = 1.5)
	D35
	D35 <- dist(x, method = "minkowski", diag = FALSE, upper = TRUE, p = 1.5)
	D35
	D35 <- dist(x, method = "minkowski", diag = TRUE, upper = FALSE, p = 1.5)
	D35
	class(D35)
	cat("闵可夫斯基距离 =", D35, "\n")

 Result

	> 
	> D35 <- dist(x, method = "minkowski", diag = FALSE, upper = FALSE, p = 1.5)
	> D35
	        3
	5 100.267
	> class(D35)
	[1] "dist"
	> cat("闵可夫斯基距离 =", D35, "\n")
	闵可夫斯基距离 = 100.267 
	> 

5、马氏距离(Mahalanobis Distance)

 Code

	# Mahalanobis distance between two observations.
	#   A, B: numeric vectors of equal length (the two observations).
	#   C:    sample matrix (rows = observations) whose covariance matrix
	#         scales the difference.
	# Returns a 1x1 matrix holding sqrt((A-B)' S^-1 (A-B)), with S = cov(C).
	Mahalanobis_Distance <- function(A,B,C){
	  delta <- A - B
	  inv_cov <- solve(cov(C))
	  # A plain vector acts as a row on the left of %*% and as a column on the right.
	  sqrt(delta %*% inv_cov %*% delta)
	}
	
	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7)
	A <- mydata[3,2:7]
	A
	# Scores of student 5 (row 5, columns 2-7)
	B <- mydata[5,2:7]
	B
	# Scores of all students (every column except the first, the row id)
	C <- mydata[,-1]
	C
	# Mahalanobis distance between students 3 and 5. The data-frame rows are
	# converted to plain numeric vectors and the table to a matrix first, because
	# the matrix products inside Mahalanobis_Distance need numeric vectors/matrices.
	A <- as.numeric(A)
	B <- as.numeric(B)
	C <- as.matrix(C)
	result <- Mahalanobis_Distance(A,B,C)
	cat("马氏距离 =", result, "\n")

 Result

	> 
	> A <- as.numeric(A)
	> B <- as.numeric(B)
	> C <- as.matrix(C)
	> result <- Mahalanobis_Distance(A,B,C)
	> cat("马氏距离 =", result, "\n")
	马氏距离 = 3.841 
	> 

B) "相似系数"计算


1、皮尔逊相关系数(Pearson Correlation Coefficient)

 Code

	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7) as a plain numeric vector
	A <- mydata[3,2:7]
	A <- as.numeric(A)
	A
	# Scores of student 5 (row 5, columns 2-7) as a plain numeric vector
	B <- mydata[5,2:7]
	B <- as.numeric(B)
	B
	# One column per student, one row per course.
	x <- data.frame(A,B)
	x
	# cor() returns the 2x2 correlation matrix of the columns; element [1,2] is
	# the Pearson coefficient between A and B.
	result <- cor(x, method=c("pearson"))
	cat("皮尔逊相关系数 =", result[1,2], "\n")

 Result

	> 
	> result <- cor(x, method=c("pearson"))
	> cat("皮尔逊相关系数 =", result[1,2], "\n")
	皮尔逊相关系数 = -0.04686 
	> 

2、斯皮尔曼秩相关系数(Spearman Rank Correlation)

 Code

	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7) as a plain numeric vector
	A <- mydata[3,2:7]
	A <- as.numeric(A)
	A
	# Scores of student 5 (row 5, columns 2-7) as a plain numeric vector
	B <- mydata[5,2:7]
	B <- as.numeric(B)
	B
	# One column per student, one row per course.
	x <- data.frame(A,B)
	x
	# cor() returns the 2x2 correlation matrix of the columns; element [1,2] is
	# the Spearman rank coefficient between A and B.
	result <- cor(x, method=c("spearman"))
	cat("斯皮尔曼秩相关系数 =", result[1,2], "\n")

 Result

	>
	> result <- cor(x, method=c("spearman"))
	> cat("斯皮尔曼秩相关系数 =", result[1,2], "\n")
	斯皮尔曼秩相关系数 = -0.3189
	>

3、肯德尔秩相关系数(Kendall Rank Correlation)

 Code

	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7) as a plain numeric vector
	A <- mydata[3,2:7]
	A <- as.numeric(A)
	A
	# Scores of student 5 (row 5, columns 2-7) as a plain numeric vector
	B <- mydata[5,2:7]
	B <- as.numeric(B)
	B
	# One column per student, one row per course.
	x <- data.frame(A,B)
	x
	# cor() returns the 2x2 correlation matrix of the columns; element [1,2] is
	# the Kendall rank coefficient between A and B.
	result <- cor(x, method=c("kendall"))
	cat("肯德尔秩相关系数 =", result[1,2], "\n")

 Result

	>
	> result <- cor(x, method=c("kendall"))
	> cat("肯德尔秩相关系数 =", result[1,2], "\n")
	肯德尔秩相关系数 = -0.276
	>

4、余弦相似度(Cosine Similarity)

 Code

	# Cosine similarity of two numeric vectors of equal length.
	# Returns a 1x1 matrix: the dot product of A and B divided by the product
	# of their Euclidean norms.
	Cosine_Similarity <- function(A, B){
	  dot <- t(A) %*% B
	  norm_product <- sqrt(sum(A^2) * sum(B^2))
	  dot / norm_product
	}
	
	# Print numbers with 4 significant digits.
	options(digits=4)
	# Load the score table previously copied to the clipboard (first line = header).
	mydata <- read.table("clipboard",header=TRUE)
	class(mydata)
	dim(mydata)
	head(mydata)
	# Scores of student 3 (row 3, columns 2-7) as a plain numeric vector
	A <- mydata[3,2:7]
	A <- as.numeric(A)
	A
	# Scores of student 5 (row 5, columns 2-7) as a plain numeric vector
	B <- mydata[5,2:7]
	B <- as.numeric(B)
	B
	# Cosine similarity between the two score vectors
	result <- Cosine_Similarity(A,B)
	cat("余弦相似度 =", result, "\n")

 Result

	> 
	> result <- Cosine_Similarity(A,B)
	> cat("余弦相似度 =", result, "\n")
	余弦相似度 = 0.9162 
	> 

2) Python实战[返回]

3) R语言函数封装[返回]

A) "距离"计算函数封装


	# euclidean        欧几里德距离(欧氏距离)(Euclidean Distance) 
	# manhattan        曼哈顿距离(绝对值距离)(Manhattan Distance)
	# maximum          切比雪夫距离(Chebyshev Distance)
	# minkowski        闵可夫斯基距离(Minkowski Distance)(要指定p值)   
	# mahalanobis      马氏距离(Mahalanobis Distance)
	
	# Distance between two observations under a chosen metric.
	#   A, B:  the observations to compare; numeric vectors of equal length or
	#          single-row data frames with the same columns.
	#   oType: metric name, one of 'euclidean', 'manhattan', 'maximum'
	#          (Chebyshev), 'minkowski', 'mahalanobis'.
	#   C:     sample matrix / data frame used for the covariance matrix;
	#          required for 'mahalanobis' only.
	#   P:     Minkowski exponent; required for 'minkowski' only.
	# Returns a "dist" object for the first four metrics and a 1x1 matrix for
	# 'mahalanobis'. Stops with an error for an unknown or malformed oType, or
	# when the argument a metric needs (P or C) is missing.
	Similarity_Distance <- function(A, B, oType, C=NULL, P=NULL){
	  
	  # Reject anything that is not a single, non-missing type name up front, so
	  # the caller gets this function's own error instead of an obscure one
	  # raised by the `if` conditions below.
	  if(length(oType) != 1L || is.na(oType)){
	    stop("Error, Please Checking !!!")
	  }
	  
	  if(oType=='euclidean'){
	    
	    x <- rbind(A, B)
	    result <- dist(x, method = "euclidean", diag = TRUE, upper = FALSE)
	    
	  }else if(oType=='manhattan'){
	    
	    x <- rbind(A, B)
	    result <- dist(x, method = "manhattan", diag = TRUE, upper = FALSE)
	    
	  }else if(oType=='maximum'){
	    
	    x <- rbind(A, B)
	    result <- dist(x, method = "maximum", diag = TRUE, upper = FALSE)
	    
	  }else if(oType=='minkowski'){
	    
	    # Without this check a missing P only fails deep inside dist().
	    if(is.null(P)){
	      stop("P must be supplied when oType = 'minkowski'")
	    }
	    x <- rbind(A, B)
	    result <- dist(x, method = "minkowski", diag = TRUE, upper = FALSE, p = P)
	    
	  }else if(oType=='mahalanobis'){
	    
	    # Without this check a missing C only fails inside cov().
	    if(is.null(C)){
	      stop("C must be supplied when oType = 'mahalanobis'")
	    }
	    # The dist() branches accept single-row data frames, so flatten them here
	    # as well; %*% only works on numeric vectors/matrices.
	    if(is.data.frame(A)) A <- unlist(A, use.names = FALSE)
	    if(is.data.frame(B)) B <- unlist(B, use.names = FALSE)
	    delta <- A - B
	    # A plain vector acts as a row on the left of %*% and a column on the right.
	    result <- sqrt(delta %*% solve(cov(C)) %*% delta)
	    
	  }else {
	    
	    stop("Error, Please Checking !!!")
	    
	  }
	  
	  result
	  
	}
	
	# Example calls, one per supported distance type. A, B (and C for the last
	# call) are not defined in this snippet; presumably they are the score
	# vectors and score matrix left in the workspace by the earlier examples.
	# TODO confirm they exist before running.
	Similarity_Distance(A, B, oType='euclidean')
	Similarity_Distance(A, B, oType='manhattan')
	Similarity_Distance(A, B, oType='maximum')
	# The Minkowski exponent is passed through P.
	Similarity_Distance(A, B, oType='minkowski', P=1.5)
	# The sample data for the covariance matrix is passed through C.
	Similarity_Distance(A, B, oType='mahalanobis', C=C)

B) 相似系数计算函数封装


	# pearson        皮尔逊相关系数(Pearson Correlation Coefficient)
	# spearman       斯皮尔曼秩相关系数(Spearman Rank Correlation)
	# kendall        肯德尔秩相关系数(Kendall Rank Correlation)
	# cosine         余弦相似度(Cosine Similarity)
	
	# Similarity coefficient between two numeric vectors of equal length.
	#   A, B:  the vectors to compare.
	#   oType: 'pearson', 'spearman', 'kendall' or 'cosine'.
	# Returns the 2x2 correlation matrix of columns "A" and "B" for the three
	# correlation types (the coefficient itself is element [1,2]) and a 1x1
	# matrix for 'cosine'. Any other oType stops with an error.
	Similarity_coefficient <- function(A, B, oType){
	
	  if(oType=='pearson'){
	    return(cor(data.frame(A,B), method="pearson"))
	  }
	  
	  if(oType=='spearman'){
	    return(cor(data.frame(A,B), method="spearman"))
	  }
	  
	  if(oType=='kendall'){
	    return(cor(data.frame(A,B), method="kendall"))
	  }
	  
	  if(oType=='cosine'){
	    # Dot product divided by the product of the two Euclidean norms.
	    dot <- t(A) %*% B
	    return(dot / sqrt(sum(A^2)*sum(B^2)))
	  }
	  
	  # No known type matched.
	  stop("Error, Please Checking !!!")
	  
	}
	
	# Example calls, one per supported coefficient type. A and B are not defined
	# in this snippet; presumably they are the numeric score vectors left in the
	# workspace by the earlier examples. TODO confirm they exist before running.
	Similarity_coefficient(A, B, oType='pearson')
	Similarity_coefficient(A, B, oType='spearman')
	Similarity_coefficient(A, B, oType='kendall')
	Similarity_coefficient(A, B, oType='cosine')

4) Python函数封装[返回]

posted @ 2017-04-05 22:20  银河统计  阅读(687)  评论(0)  编辑  收藏  举报