练习3 循环读取、分布拟合
path <- "C:/Users/lujian/Desktop/input1" ##文件目录 fileNames <- dir(path)[1:366] ##获取该路径下的文件名 filePath <- sapply(fileNames, function(x){ paste(path,x,sep='/')}) ##生成读取文件路径 ##读取数据,结果为list data <- sapply(filePath, function(x){ read.csv(x, header=FALSE, stringsAsFactors = FALSE)}) ## 计算每天帖子矩阵列数 everyday.ncol <- c(length=366) for(i in 1:366){ everyday.ncol[i] <- ncol(data[[i]]) } ## 选出每天一条帖子评论数评论数大于5000,51天符合要求 table(everyday.ncol-6 > 5000) useful.day <- which(everyday.ncol-6 > 5000) index2012 <- array(dim=c(length(useful.day), 2)) for(i in useful.day){ index2012[which(i == useful.day), ] <- c(i, which(data[[i]][ , 5006] != "")[1]) } # 某些天不止一条帖子评论数超过5000,仅选取那天第一条 ## 摘出51条帖子数据放入数据框 df2012 <- data.frame(matrix(NA, 51, 16384)) # EXCEL最多显示16384列,也只能读取这 for(i in index2012[ , 1]){ which.row <- which(index2012[ , 1] == i) df2012[which.row, ] <- data[[i]][index2012[which.row, 2], ] # 将循环填充的数据删去 true.ncol <- ncol(data[[i]][index2012[which.row, 2], ]) if(true.ncol < 16384){ df2012[which.row, (true.ncol+1):16384] <- NA } } ## 第50行时间格式与其他行不同,不做复杂转换处理,直接手动删去 df2012 <- df2012[-50, ] ## 时间格式数据转化为每小时评论数(前8小时每小时累计) comment2012 <- matrix(nrow=nrow(df2012), ncol=8) for(i in 1:nrow(df2012)){ initial.time <- strptime(df2012[i, 6], "%Y/%m/%d %H:%M") count <- c(length=8) for(j in 1:8){ number <- 0 reply.time <- initial.time + j*3600 #从首发帖时间j小时后 for (k in 7:ncol(df2012)){ if ((strptime(df2012[i, k], "%Y/%m/%d %H:%M") <= reply.time) & !is.na(df2012[i, k])){ number <- number+1 }else{ break } } count[j] <- number } comment2012[i, ] <- count } ## 差分得出每小时评论数 for(i in 8:2){ comment2012[ , i] <- comment2012[ , i]-comment2012[ , i-1] } ## Pearson拟合优度卡方检验是否为泊松分布 table(comment2012[ , 1]) X <- as.numeric(names(table(comment2012[ , 1]))) Y <- as.numeric(table(comment2012[ , 1])) q <- ppois(X, mean(rep(X,Y))) p <- numeric(length(Y)) p[1] <- q[1]; p[n] <- 1-q[n-1] for (i in 2:(n-1)){ p[i]<-q[i]-q[i-1] } chisq.test(Y,p=p) # p小于0.05,拒绝原假设,不是泊松分布 # # 上述分组每组频数小于5,导致近似算法不准 ## K-S检验是否为泊松分布 mean(comment2012[ , 8]) ks.test(jitter(comment2012[ , 1]), "ppois", 10) # p小于0.05,拒绝原假设,不是泊松分布 # ks.test(jitter(comment2012[ , 8]/8), "ppois", 10) ks.test(jitter(comment2012[ , 8]), "ppois", 86) ## 随机数产生参数为25的泊松分布 set.seed(100) z1 <- rpois(50, 25) hist(z1) ## 随机数产生参数为10的泊松分布 set.seed(100) z2 <- rpois(50, 10) hist(z2) ## 查看50帖子8小时评论数图形 # range(comment2012[ , 8]) ## 画频数分布直方图 hist(comment2012[ , 8], freq = F, breaks = 20) ## 再画概率分布曲线 lines(density(comment2012[ , 8], bw=10), col="red", lwd=2) ## 查看50帖子1小时评论数图形 # range(comment2012[ , 1]) hist(comment2012[ , 1], freq = F, breaks = 30) lines(density(comment2012[ , 1], bw=1), col="red", lwd=2) ## 查看50条帖子时间点分布 dim(df2012) days <- strptime(df2012[ , 2006], "%Y/%m/%d %H:%M") - strptime(df2012[ , 6], "%Y/%m/%d %H:%M") days/24 sort(days)/24 ## 以下用的comment2012是未差分过的!!! ## 第2条帖子 df2012[2, 6] initial.time0109 <- strptime(df2012[2, 6], "%Y/%m/%d %H:%M") count0109 <- c(length=76) for(j in 1:76){ number <- 0 reply.time0109 <- initial.time0109 + j*3600 #从首发帖时间j小时后 for (k in 7:2006){ if ((strptime(df2012[2, k], "%Y/%m/%d %H:%M") <= reply.time0109) & !is.na(df2012[2, k])){ number <- number+1 }else{ break } } count0109[j] <- number } for(i in 76:2){ count0109[i] <- count0109[i] - count0109[i-1] } plot(count0109, type = "l") ## 第5条帖子 df2012[5, 6] days[5] initial.time0121 <- strptime(df2012[5, 6], "%Y/%m/%d %H:%M") count0121 <- c(length=87) for(j in 1:87){ number <- 0 reply.time0121 <- initial.time0121 + j*3600 #从首发帖时间j小时后 for (k in 7:2006){ if ((strptime(df2012[5, k], "%Y/%m/%d %H:%M") <= reply.time0121) & !is.na(df2012[5, k])){ number <- number+1 }else{ break } } count0121[j] <- number } for(i in 87:2){ count0121[i] <- count0121[i] - count0121[i-1] } plot(count0121, type = "l") ## 第39条帖子 df2012[39, 6] days[39] initial.time0601 <- strptime(df2012[39, 6], "%Y/%m/%d %H:%M") count0601 <- c(length=42) for(j in 1:42){ number <- 0 reply.time0601 <- initial.time0601 + j*3600 #从首发帖时间j小时后 for (k in 7:2006){ if ((strptime(df2012[39, k], "%Y/%m/%d %H:%M") <= reply.time0601) & !is.na(df2012[39, k])){ number <- number+1 }else{ break } } count0601[j] <- number } for(i in 42:2){ count0601[i] <- count0601[i] - count0601[i-1] } plot(count0601, type = "l") ## 拟合伽马分布 ## 第2个帖子第一小时评论时间间隔gamma分布拟合 comment2012[5, 1] df2012[5, 6] post20120121 <- strptime(df2012[5, 6:29], "%Y/%m/%d %H:%M") time.interval <- c(length=23) for(i in 2:length(post20120121)){ time.interval[i-1] <- post20120121[i]-post20120121[i-1] } # hist(time.interval, freq = F, breaks = 23) # lines(density(time.interval, bw=0.5), col="red", lwd=2) ## ks检验是否符合gamma分布 a20120121 <- mean(time.interval)^2/var(time.interval) b20120121 <- mean(time.interval)/var(time.interval) ks.test(jitter(time.interval), "pgamma", a20120121, b20120121) ## p值大于0.05,不能拒绝原假设,即符合gamma分布 ## 第39个帖子第一小时评论时间间隔gamma分布拟合 comment2012[39, 1] df2012[39, 6] post20120601 <- strptime(df2012[39, 6:15], "%Y/%m/%d %H:%M") time.interval2 <- c(length=9) for(i in 2:length(post20120601)){ time.interval2[i-1] <- post20120601[i]-post20120601[i-1] } ## ks检验是否符合gamma分布 a20120601 <- mean(time.interval2)^2/var(time.interval2) b20120601 <- mean(time.interval2)/var(time.interval2) ks.test(jitter(time.interval2), "pgamma", a20120601, b20120601) ## p值大于0.05,不能拒绝原假设,即符合gamma分布 ## gamma分布画图 plot(sort(time.interval), dgamma(sort(time.interval), a20120121, a20120121), main="the Gamma Density Distribution", col = "red", type="l", lwd=2) lines(sort(time.interval2), dgamma(sort(time.interval2), a20120601, a20120601), col="blue",lwd=2) x = seq(0,6,length.out=100) y = dgamma(x,2,1) plot(x, y, main="the Gamma Density Distribution", xlim = c(0,6),ylim = c(0,1) ,col = "red" , type="l",lwd=2)