练习3 循环读取、分布拟合

path <- "C:/Users/lujian/Desktop/input1" ##文件目录
fileNames <- dir(path)[1:366]  ##获取该路径下的文件名
filePath <- sapply(fileNames, function(x){ 
  paste(path,x,sep='/')})   ##生成读取文件路径
##读取数据,结果为list
data <- sapply(filePath, function(x){
  read.csv(x, header=FALSE, stringsAsFactors = FALSE)}) 

## 计算每天帖子矩阵列数
everyday.ncol <- c(length=366)
for(i in 1:366){
  everyday.ncol[i] <- ncol(data[[i]])
}

## 选出每天一条帖子评论数评论数大于5000,51天符合要求
table(everyday.ncol-6 > 5000)

useful.day <- which(everyday.ncol-6 > 5000)
index2012 <- array(dim=c(length(useful.day), 2))
for(i in useful.day){
  index2012[which(i == useful.day), ] <- 
    c(i, which(data[[i]][ , 5006] != "")[1])
}   # 某些天不止一条帖子评论数超过5000,仅选取那天第一条


## 摘出51条帖子数据放入数据框
df2012 <- data.frame(matrix(NA, 51, 16384))  # EXCEL最多显示16384列,也只能读取这
for(i in index2012[ , 1]){
  which.row <- which(index2012[ , 1] == i)
  df2012[which.row, ] <- data[[i]][index2012[which.row, 2], ]
  # 将循环填充的数据删去
  true.ncol <- ncol(data[[i]][index2012[which.row, 2], ])
  if(true.ncol < 16384){
    df2012[which.row, (true.ncol+1):16384] <- NA
  }  
}
## 第50行时间格式与其他行不同,不做复杂转换处理,直接手动删去
df2012 <- df2012[-50, ]


## 时间格式数据转化为每小时评论数(前8小时每小时累计)
comment2012 <- matrix(nrow=nrow(df2012), ncol=8)
for(i in 1:nrow(df2012)){
  initial.time <- strptime(df2012[i, 6], "%Y/%m/%d %H:%M")
  count <- c(length=8)
  for(j in 1:8){
    number <- 0
    reply.time <- initial.time + j*3600  #从首发帖时间j小时后
    for (k in 7:ncol(df2012)){
      if ((strptime(df2012[i, k], "%Y/%m/%d %H:%M") <= 
           reply.time)  & !is.na(df2012[i, k])){
        number <- number+1
      }else{
        break
      }
    }
    count[j] <- number
  }
  comment2012[i, ] <- count
}

## 差分得出每小时评论数
for(i in 8:2){
  comment2012[ , i] <- comment2012[ , i]-comment2012[ , i-1] 
}

## Pearson拟合优度卡方检验是否为泊松分布
table(comment2012[ , 1])
X <- as.numeric(names(table(comment2012[ , 1])))
Y <- as.numeric(table(comment2012[ , 1]))
q <- ppois(X, mean(rep(X,Y)))
p <- numeric(length(Y))
p[1] <- q[1]; p[n] <- 1-q[n-1]
for (i in 2:(n-1)){
  p[i]<-q[i]-q[i-1]
}
chisq.test(Y,p=p)  # p小于0.05,拒绝原假设,不是泊松分布
# # 上述分组每组频数小于5,导致近似算法不准


## K-S检验是否为泊松分布
mean(comment2012[ , 8])
ks.test(jitter(comment2012[ , 1]), "ppois", 10)
# p小于0.05,拒绝原假设,不是泊松分布
# ks.test(jitter(comment2012[ , 8]/8), "ppois", 10)
ks.test(jitter(comment2012[ , 8]), "ppois", 86)

## 随机数产生参数为25的泊松分布
set.seed(100)
z1 <- rpois(50, 25)
hist(z1)
## 随机数产生参数为10的泊松分布
set.seed(100)
z2 <- rpois(50, 10)
hist(z2)

## 查看50帖子8小时评论数图形
# range(comment2012[ , 8])
## 画频数分布直方图
hist(comment2012[ , 8], freq = F, breaks = 20) 
## 再画概率分布曲线
lines(density(comment2012[ , 8], bw=10), col="red", lwd=2)

## 查看50帖子1小时评论数图形
# range(comment2012[ , 1])
hist(comment2012[ , 1], freq = F, breaks = 30) 
lines(density(comment2012[ , 1], bw=1), col="red", lwd=2)

## 查看50条帖子时间点分布

dim(df2012)
days <- strptime(df2012[ , 2006], "%Y/%m/%d %H:%M") - 
           strptime(df2012[ , 6], "%Y/%m/%d %H:%M")
days/24
sort(days)/24

## 以下用的comment2012是未差分过的!!!
## 第2条帖子
df2012[2, 6]
initial.time0109 <- strptime(df2012[2, 6], "%Y/%m/%d %H:%M")
count0109 <- c(length=76)
for(j in 1:76){
  number <- 0
  reply.time0109 <- initial.time0109 + j*3600  #从首发帖时间j小时后
  for (k in 7:2006){
    if ((strptime(df2012[2, k], "%Y/%m/%d %H:%M") <= 
         reply.time0109)  & !is.na(df2012[2, k])){
      number <- number+1
    }else{
      break
    }
  }
  count0109[j] <- number
}
for(i in 76:2){
  count0109[i] <- count0109[i] - count0109[i-1]  
}
plot(count0109, type = "l")

## 第5条帖子
df2012[5, 6]
days[5]
initial.time0121 <- strptime(df2012[5, 6], "%Y/%m/%d %H:%M")
count0121 <- c(length=87)
for(j in 1:87){
  number <- 0
  reply.time0121 <- initial.time0121 + j*3600  #从首发帖时间j小时后
  for (k in 7:2006){
    if ((strptime(df2012[5, k], "%Y/%m/%d %H:%M") <= 
         reply.time0121)  & !is.na(df2012[5, k])){
      number <- number+1
    }else{
      break
    }
  }
  count0121[j] <- number
}
for(i in 87:2){
  count0121[i] <- count0121[i] - count0121[i-1]  
}
plot(count0121, type = "l")

## 第39条帖子
df2012[39, 6]
days[39]
initial.time0601 <- strptime(df2012[39, 6], "%Y/%m/%d %H:%M")
count0601 <- c(length=42)
for(j in 1:42){
  number <- 0
  reply.time0601 <- initial.time0601 + j*3600  #从首发帖时间j小时后
  for (k in 7:2006){
    if ((strptime(df2012[39, k], "%Y/%m/%d %H:%M") <= 
         reply.time0601)  & !is.na(df2012[39, k])){
      number <- number+1
    }else{
      break
    }
  }
  count0601[j] <- number
}
for(i in 42:2){
  count0601[i] <- count0601[i] - count0601[i-1]  
}
plot(count0601, type = "l")






## 拟合伽马分布
## 第2个帖子第一小时评论时间间隔gamma分布拟合
comment2012[5, 1]
df2012[5, 6]
post20120121 <- strptime(df2012[5, 6:29], "%Y/%m/%d %H:%M")
time.interval <- c(length=23)
for(i in 2:length(post20120121)){
  time.interval[i-1] <- post20120121[i]-post20120121[i-1]
}

# hist(time.interval, freq = F, breaks = 23) 
# lines(density(time.interval, bw=0.5), col="red", lwd=2)

## ks检验是否符合gamma分布
a20120121 <- mean(time.interval)^2/var(time.interval) 
b20120121 <- mean(time.interval)/var(time.interval)
ks.test(jitter(time.interval), "pgamma", a20120121, b20120121)
## p值大于0.05,不能拒绝原假设,即符合gamma分布

## 第39个帖子第一小时评论时间间隔gamma分布拟合
comment2012[39, 1]
df2012[39, 6]
post20120601 <- strptime(df2012[39, 6:15], "%Y/%m/%d %H:%M")
time.interval2 <- c(length=9)
for(i in 2:length(post20120601)){
  time.interval2[i-1] <- post20120601[i]-post20120601[i-1]
}
## ks检验是否符合gamma分布
a20120601 <- mean(time.interval2)^2/var(time.interval2) 
b20120601 <- mean(time.interval2)/var(time.interval2)
ks.test(jitter(time.interval2), "pgamma", a20120601, b20120601)
## p值大于0.05,不能拒绝原假设,即符合gamma分布

## gamma分布画图

plot(sort(time.interval), 
     dgamma(sort(time.interval), a20120121, a20120121),
     main="the Gamma Density Distribution",
     col = "red", type="l", lwd=2)
lines(sort(time.interval2), 
      dgamma(sort(time.interval2), a20120601, a20120601),
      col="blue",lwd=2)

x = seq(0,6,length.out=100)
y = dgamma(x,2,1)

plot(x, y, main="the Gamma Density Distribution",
     xlim = c(0,6),ylim = c(0,1) ,col = "red" ,   
     type="l",lwd=2)

  

posted @ 2017-10-23 11:02  坴戋  阅读(199)  评论(0编辑  收藏  举报