R语言实现统计plink格式数据基因频率
1、
dir() dat <- read.table("outcome.ped") dat <- dat[,-(1:6)] loci <- data.frame() loci[1:(nrow(dat) * 2), 1] <- 1 for (i in 1:(ncol(dat)/2)) { loci <- cbind(loci, c(dat[, 2*i -1], dat[,2 * i])) } loci <- loci[, -1] colnames(loci) <- paste0("c",1:8) result <- data.frame() temp <- vector() for (i in 1:ncol(loci)) { locidat <- as.data.frame(table(loci[,i])) if (nrow(locidat) == 1 & locidat[1, 1] == 0 ) { temp <- c(0, 0, 0, 1) }else if (nrow(locidat) == 1 & locidat[1, 1] != 0 ){ temp <- c(0, 0, as.character(locidat[1, 1]), 1) }else if(nrow(locidat) == 2) { locidat <- locidat[order(locidat$Var1),] if (locidat[1, 1] == 0) { temp <- c(0,0, as.character(locidat[2,1]), 1) }else { locidat <- locidat[order(locidat$Freq),] temp <- c(as.character(locidat[1,1]), locidat[1,2]/sum(locidat$Freq), as.character(locidat[2,1]), locidat[2,2]/sum(locidat$Freq)) } }else if (nrow(locidat) == 3){ locidat <- locidat[locidat$Var1 != 0, ] locidat <- locidat[order(locidat$Freq),] temp <- c(as.character(locidat[1,1]),locidat[1,2]/sum(locidat$Freq), as.character(locidat[2,1]), locidat[2,2]/sum(locidat$Freq) ) } result <- rbind(result, temp) } for (i in c(2,4)) { result[,i] = round(as.numeric(result[,i]),5) } map <- read.table("outcome.map") result <- cbind(map[,c(1:2,4)], result) colnames(result) <- c("chr", "snp", "pos", "minor", "freq", "major", "freq") result write.csv(result, "result.csv", row.names = F) dir()
2、plink软件验证
[root@centos79 test]# plink --file outcome --freq --sheep --out verify >/dev/null; rm *.log *.nosex [root@centos79 test]# ls outcome.map outcome.ped verify.frq [root@centos79 test]# cat verify.frq CHR SNP A1 A2 MAF NCHROBS 1 snp1 0 G 0 12 1 snp2 G C 0.2 10 1 snp3 0 0 NA 0 1 snp4 0 G 0 8 1 snp5 A G 0.3333 12 1 snp6 A T 0.5 12 1 snp7 A G 0.4167 12 1 snp8 G C 0.4167 12