应用高斯分布来解决异常检测问题(二)
(原创文章,转载请注明出处!)
在文章应用高斯分布来解决异常检测问题(一)中对如何使用高斯分布来解决异常检测问题进行了描述,本篇是使用R编程实现了第一篇中所描述的两个模型:多个一元高斯分布模型和一个多元高斯分布模型。
一、 多个一元高斯分布模型
1 ## parameters: 2 ## xNew - a vector, which is the data of new samples. 3 ## or a matrix, each line is one new sample 4 ## X - a matrix, which stores samples' data. 5 ## parameterFile - path of paramter file, 6 ## the paramter file stores the paramters of the MultiUnivariate Norm model. 7 ## isTraining - flag, TRUE will trigger the training, 8 ## FALSE will skip the training. 9 ## return value: 10 ## a vector of new samples' probability value. 11 craEx8RS_funMultiUnivariateNorm <- function(xNew, X = NULL, parameterFile = ".MultiUnivariateNorm", isTraining = FALSE) 12 { 13 if (isTraining == TRUE) { 14 if (is.null(X) == TRUE) { 15 cat("X is NULL, MultiUnivariateNorm model Can't be trained\n") 16 return 17 } 18 numOfSamples <- dim(X)[1] 19 numOfFeatures <- dim(X)[2] 20 21 vectrMean <- colMeans(X) 22 vectrSD <- numeric(0) 23 for (i in 1:numOfFeatures) { 24 vectrSD[i] <- sd(X[,i]) 25 } 26 27 ## write the parameters to the file 28 ## 1st line is means divided by one blank 29 ## 2nd line is SDs divided by one blank 30 matrixMeanSD <- matrix(c(vectrMean, vectrSD), ncol=numOfFeatures, byrow=TRUE) 31 # checking of parameterFile leaves to write.table 32 write.table(x=matrixMeanSD, file=parameterFile, row.names=FALSE, col.names=FALSE, sep=" ") 33 } else { 34 matrixMeanSD <- read.table(file=parameterFile) 35 matrixMeanSD <- as.matrix(matrixMeanSD) 36 vectrMean <- matrixMeanSD[1,] 37 vectrSD <- matrixMeanSD[2,] 38 } 39 40 if (is.vector(xNew) == TRUE) { 41 vectrProbabilityNewSample <- dnorm(xNew, mean = vectrMean, sd = vectrSD, log = FALSE) 42 return( c(prod(vectrProbabilityNewSample)) ) # probability of the new sample 43 } else if (is.matrix(xNew) == TRUE) { 44 45 retVec <- numeric(0) 46 numOfnewSample <- dim(xNew)[1] 47 for (i in 1:numOfnewSample) { 48 vectrProbabilityNewSample <- dnorm(xNew[i,], mean = vectrMean, sd = vectrSD, log = FALSE) 49 retVec[i] <- prod(vectrProbabilityNewSample) 50 } 51 return(retVec) 52 } else { 53 cat("error! parameter xNew is ", class(xNew), " . It must be a vector, or a matrix.\n") 54 } 55 }
二、 一个多元高斯分布模型
1 ## Before using this function the package mvtnorm need to be installed. 2 ## To install package mvtnorm, issuing command install.packages("mvtnorm") 3 ## and using command library(mvtnorm) to load the package to R workspace. 4 ## 5 ## parameters: 6 ## xNew - a vector, the data of one samples that need to be calculate the output by the Multivariate Norm model. 7 ## or a matrix, each line is one sample that need to be calculate the output by the Multivariate Norm model. 8 ## X - a matrix, which stores samples' data. 9 ## parameterFile - path of paramter file, 10 ## the paramter file stores the paramters of the Multivariate Norm model. 11 ## isTraining - flag, TRUE will trigger the training, 12 ## FALSE will skip the training. 13 ## return value: 14 ## a vector of new samples' probability value. 15 craEx8RS_funMultivariateNorm <- function(xNew, X = NULL, parameterFile = ".MultivariateNorm", isTraining = FALSE) 16 { 17 if (isTraining == TRUE) { 18 if (is.null(X) == TRUE) { 19 cat("X is NULL, MultivariateNorm model Can't be trained\n") 20 return 21 } 22 23 vectrMean <- colMeans(X) 24 matrixSigma <- cov(X) 25 ## write the parameters to the file 26 ## 1st line is means divided by one blank 27 ## from the 2nd line to the last line are variances divided by one blank 28 matrixMeanCov <- rbind(vectrMean, matrixSigma) 29 # checking of parameterFile leaves to write.table 30 write.table(x=matrixMeanCov, file=parameterFile, row.names=FALSE, col.names=FALSE, sep=" ") 31 } else { 32 matrixMeanCov <- read.table(file=parameterFile) 33 matrixMeanCov <- as.matrix(matrixMeanCov) 34 vectrMean <- matrixMeanCov[1,] 35 matrixSigma <- matrixMeanCov[c(2:dim(matrixMeanCov)[1]),] 36 } 37 38 return( dmvnorm(xNew, mean = vectrMean, sigma = matrixSigma, log = FALSE) ) # probability of the new samples 39 }