R语言将没有空格的列数据拆分为空格分割
1、R实现
# Split compact two-character genotype columns (e.g. "GA") into two
# single-allele columns ("G", "A"), so an n-column table becomes a
# 2n-column table with the alleles of each genotype side by side.

dir()

# Read the PED-style genotype table. Every column is forced to character so
# that numeric-looking codes such as "00" keep both characters instead of
# being parsed as the number 0.
dat <- read.table("test.ped", colClasses = "character")
dat

# Each genotype must be exactly two allele characters; stop here rather than
# silently misaligning alleles in the steps below.
stopifnot(all(nchar(as.matrix(dat)) == 2L))

# Store each column of the table as one element of an unnamed list.
genoList <- unname(as.list(dat))
genoList
length(genoList)

# First allele of every genotype, taken per element so rows cannot shift.
a1 <- lapply(genoList, substr, start = 1L, stop = 1L)
a1

# Second allele of every genotype.
a2 <- lapply(genoList, substr, start = 2L, stop = 2L)
a2

# Convert the first alleles into a data frame (one column per input column).
a1 <- as.data.frame(matrix(unlist(a1), byrow = FALSE, ncol = ncol(dat)))
a1

# Convert the second alleles into a data frame.
a2 <- as.data.frame(matrix(unlist(a2), byrow = FALSE, ncol = ncol(dat)))
a2

# Interleave the allele columns: odd slots hold the first allele, even slots
# hold the second allele of the same input column.
n_col <- ncol(dat)
temp_list <- vector("list", 2L * n_col)
odd_slots <- 2L * seq_len(n_col) - 1L
temp_list[odd_slots] <- unname(as.list(a1))
temp_list[odd_slots + 1L] <- unname(as.list(a2))
temp_list

# Convert the interleaved list into the final data frame with 2 * n columns.
result <- as.data.frame(
  matrix(unlist(temp_list), byrow = FALSE, ncol = 2L * ncol(dat))
)
result
dat
> dir() [1] "test.ped" > dat <- read.table("test.ped") ## 测试数据 > dat V1 V2 V3 V4 V5 V6 1 GG CC GG GG GA AA 2 TT GC CC GG GG AA 3 TT GC CG GG GG TT 4 GG GC GG GG GG AA > genoList =list() > for ( i in 1:ncol(dat) ) { ## 保存为新列表 + genoList[[i]]<- dat[,i] + } > genoList [[1]] [1] "GG" "TT" "TT" "GG" [[2]] [1] "CC" "GC" "GC" "GC" [[3]] [1] "GG" "CC" "CG" "GG" [[4]] [1] "GG" "GG" "GG" "GG" [[5]] [1] "GA" "GG" "GG" "GG" [[6]] [1] "AA" "AA" "TT" "AA" > length(genoList) [1] 6 > a1 <- lapply(genoList,function(x){unlist(strsplit(x,""))[seq(1, 2 * dim(dat)[1], 2)]}) ## 拆分第一个等位基因 > a1 [[1]] [1] "G" "T" "T" "G" [[2]] [1] "C" "G" "G" "G" [[3]] [1] "G" "C" "C" "G" [[4]] [1] "G" "G" "G" "G" [[5]] [1] "G" "G" "G" "G" [[6]] [1] "A" "A" "T" "A" > a2 <- lapply(genoList,function(x){unlist(strsplit(x,""))[seq(2, 2 * dim(dat)[1], 2)]}) ## 拆分第二个等位基因 > a2 [[1]] [1] "G" "T" "T" "G" [[2]] [1] "C" "C" "C" "C" [[3]] [1] "G" "C" "G" "G" [[4]] [1] "G" "G" "G" "G" [[5]] [1] "A" "G" "G" "G" [[6]] [1] "A" "A" "T" "A" > a1 <- as.data.frame(matrix(unlist(a1), byrow = F, ncol = ncol(dat))) ## 转换为数据框 > a1 V1 V2 V3 V4 V5 V6 1 G C G G G A 2 T G C G G A 3 T G C G G T 4 G G G G G A > a2 <- as.data.frame(matrix(unlist(a2), byrow = F, ncol = ncol(dat))) ## 转换为数据框 > a2 V1 V2 V3 V4 V5 V6 1 G C G G A A 2 T C C G G A 3 T C G G G T 4 G C G G G A > temp_list <- list() > for (i in 1:ncol(dat)) { ## 合并在新列表中 + temp_list[[i * 2 - 1]] = a1[,i] + temp_list[[i * 2]] = a2[,i] + } > temp_list [[1]] [1] "G" "T" "T" "G" [[2]] [1] "G" "T" "T" "G" [[3]] [1] "C" "G" "G" "G" [[4]] [1] "C" "C" "C" "C" [[5]] [1] "G" "C" "C" "G" [[6]] [1] "G" "C" "G" "G" [[7]] [1] "G" "G" "G" "G" [[8]] [1] "G" "G" "G" "G" [[9]] [1] "G" "G" "G" "G" [[10]] [1] "A" "G" "G" "G" [[11]] [1] "A" "A" "T" "A" [[12]] [1] "A" "A" "T" "A" > result <- as.data.frame(matrix(unlist(temp_list), byrow = F, ncol = 2 * ncol(dat))) ## 转换为数据框 > result ## 查看结果 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 1 G G C C G G G G G A A A 2 T T G C C C G G G G A A 3 T T G C C G G G G G T 
T 4 G G G C G G G G G G A A > dat V1 V2 V3 V4 V5 V6 1 GG CC GG GG GA AA 2 TT GC CC GG GG AA 3 TT GC CG GG GG TT 4 GG GC GG GG GG AA
参考:https://zhuanlan.zhihu.com/p/378405836
2、shell实现
root@PC1:/home/test# ls test.ped root@PC1:/home/test# cat test.ped ## 测试数据 GG CC GG GG GA AA TT GC CC GG GG AA TT GC CG GG GG TT GG GC GG GG GG AA root@PC1:/home/test# sed 's/. / &/g' test.ped ## 使用sed将“字符+空格”替换为“空格+字符+空格” G G C C G G G G G A AA T T G C C C G G G G AA T T G C C G G G G G TT G G G C G G G G G G AA root@PC1:/home/test# sed 's/. / &/g' test.ped | sed 's/.$/ &/' ## 将每行的最后一个字符替换为“空格+字符” G G C C G G G G G A A A T T G C C C G G G G A A T T G C C G G G G G T T G G G C G G G G G G A A
分类:
R语言
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
2020-12-19 linux系统scp远程传输命令
2020-12-19 linux系统中ssh部署两台服务器远程免密登录
2020-12-19 linux系统中创建网络会话
2020-12-19 python中给列表元素排序
2020-12-19 python中返回列表中元素的索引