linux 中依据某列展开为多行

 

001、方法1

[root@pc1 test01]# ls
a.txt  test.sh
[root@pc1 test01]# cat a.txt     ## 测试数据, 依据最后一列展开为多行
chrY    2657879 2658063 CTCF    652     GM19239
chrY    2664424 2664734 CTCF    185     hL-hESC,HepG2,HUVEC
chrY    2668277 2668694 CTCF    766     Dnd41,H1-hESC,HepG2,HSMM,HSMMtube,HUVEC,NH-A,A549,ProgFib,AG10803,BJ,GM12864,HCM,HCPEpic,HEEpiC,HFF,HEF-Myc,HPF,HVMF,NHLF,RPTEC,SAEC
[root@pc1 test01]# cat test.sh     ## 展开程序
#!/bin/bash
# Expand the comma-separated last column of a.txt into one row per value.
# Rows are written tab-separated to result.txt.

# One awk pass over the file: split the last field on ",", then print the
# record once per element with the last field replaced by that element.
# Assigning to $NF makes awk rebuild the record with OFS, so every output
# row is tab-delimited whatever whitespace the input used.
# ">" (not ">>") so that running the script again replaces result.txt
# instead of appending a second copy of every row.
awk 'BEGIN {OFS = "\t"}
{
        n = split($NF, ay, ",")
        for (i = 1; i <= n; i++) {
                $NF = ay[i]
                print
        }
}' a.txt > result.txt
[root@pc1 test01]# bash test.sh      ## 执行程序
[root@pc1 test01]# ls
a.txt  result.txt  test.sh
[root@pc1 test01]# head result.txt -n 6      ## 运行结果
chrY    2657879 2658063 CTCF    652     GM19239
chrY    2664424 2664734 CTCF    185     hL-hESC
chrY    2664424 2664734 CTCF    185     HepG2
chrY    2664424 2664734 CTCF    185     HUVEC
chrY    2668277 2668694 CTCF    766     Dnd41
chrY    2668277 2668694 CTCF    766     H1-hESC

 

002、方法2

[root@pc1 test01]# ls
a.txt  test.sh
[root@pc1 test01]# cat a.txt    
chrY    2657879 2658063 CTCF    652     GM19239
chrY    2664424 2664734 CTCF    185     hL-hESC,HepG2,HUVEC
chrY    2668277 2668694 CTCF    766     Dnd41,H1-hESC,HepG2,HSMM,HSMMtube,HUVEC,NH-A,A549,ProgFib,AG10803,BJ,GM12864,HCM,HCPEpic,HEEpiC,HFF,HEF-Myc,HPF,HVMF,NHLF,RPTEC,SAEC
[root@pc1 test01]# cat test.sh      ## 转换程序
#!/bin/bash
# Expand the comma-separated last column of a.txt into one row per value.
# Rows are written to result.txt with columns joined by single spaces.

# The input is read once, line by line; "|| [[ -n ... ]]" keeps a final line
# that has no trailing newline. Redirecting the whole loop with ">" replaces
# result.txt on every run instead of appending duplicate rows to it.
while IFS= read -r line || [[ -n "$line" ]]; do
        # A blank line has no last column to expand.
        [[ "$line" =~ [^[:space:]] ]] || continue

        # All columns except the last; dropping a field makes awk rebuild the
        # record with its default output separator, a single space.
        prefix=$(awk '{NF -= 1} 1' <<< "$line")

        # Last column -> one value per line -> one output row per value.
        awk '{print $NF}' <<< "$line" | tr ',' '\n' | while IFS= read -r k; do
                printf '%s %s\n' "$prefix" "$k"
        done
done < a.txt > result.txt
[root@pc1 test01]# bash test.sh      ## 执行程序
[root@pc1 test01]# ls
a.txt  result.txt  test.sh
[root@pc1 test01]# head result.txt -n 6      ## 转换结果
chrY 2657879 2658063 CTCF 652 GM19239
chrY 2664424 2664734 CTCF 185 hL-hESC
chrY 2664424 2664734 CTCF 185 HepG2
chrY 2664424 2664734 CTCF 185 HUVEC
chrY 2668277 2668694 CTCF 766 Dnd41
chrY 2668277 2668694 CTCF 766 H1-hESC

 

003、方法3

[root@pc1 test01]# ls
test  test.sh
[root@pc1 test01]# cat test           ## 测试数据
chrY    2657879 2658063 CTCF    652     GM19239
chrY    2664424 2664734 CTCF    185     hL-hESC,HepG2,HUVEC
chrY    2668277 2668694 CTCF    766     Dnd41,H1-hESC,HepG2,HSMM,HSMMtube,HUVEC,NH-A,A549,ProgFib,AG10803,BJ,GM12864,HCM,HCPEpic,HEEpiC,HFF,HEF-Myc,HPF,HVMF,NHLF,RPTEC,SAEC
[root@pc1 test01]# cat test.sh        ## 转换程序
#!/bin/bash
# Expand column 6 (a comma-separated list) of the file "test" into one row
# per value. file1 receives columns 1-5 repeated once per value, file2
# receives the values themselves, and file3 is the two pasted side by side.

# The loop only appends to the intermediates, so empty them first; otherwise
# rows left over from an earlier run would be pasted into file3 again.
: > file1
: > file2

# "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
        # Columns 1-5, joined with tabs.
        need1=$(awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5}' <<< "$line")
        # Column 6 turned into one value per line.
        need2=$(awk '{print $6}' <<< "$line" | tr "," "\n")
        # Number of values on this row = number of times need1 is repeated.
        num=$(wc -l <<< "$need2")
        for ((c = 1; c <= num; c++)); do
                printf '%s\n' "$need1"
        done >> file1
        printf '%s\n' "$need2" >> file2
done < test

# file1 and file2 now have the same number of lines; join them with a tab.
paste file1 file2 > file3
[root@pc1 test01]# bash test.sh         ## 运行程序
[root@pc1 test01]# ls
file1  file2  file3  test  test.sh
[root@pc1 test01]# head -n 5 file3      ## 运行结果
chrY    2657879 2658063 CTCF    652     GM19239
chrY    2664424 2664734 CTCF    185     hL-hESC
chrY    2664424 2664734 CTCF    185     HepG2
chrY    2664424 2664734 CTCF    185     HUVEC
chrY    2668277 2668694 CTCF    766     Dnd41

 

004、补充

[root@pc1 test01]# ls
a.sh  a.txt
[root@pc1 test01]# cat a.sh
#!/bin/bash
# Expand the comma-separated last column of a.txt into one row per value.
# part1 receives the leading columns (space-joined) repeated once per value,
# part2 receives the values, and part3 is the two pasted together with a tab.

# The loop only appends to the intermediates, so empty them first; otherwise
# a second run would paste the previous run's rows into part3 again.
: > part1
: > part2

# "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
while IFS= read -r tmp1 || [[ -n "$tmp1" ]]; do
        # A blank line has no columns; skipping it keeps part1 and part2
        # at the same number of lines so paste stays aligned.
        [[ "$tmp1" =~ [^[:space:]] ]] || continue

        # Last column, one value per line. The line is passed quoted so
        # characters such as "*" are never expanded by the shell.
        tmp2=$(awk '{print $NF}' <<< "$tmp1" | tr ',' '\n')
        printf '%s\n' "$tmp2" >> part2

        # Leading columns, computed once per input line rather than once
        # per value; dropping the last field re-joins the rest with spaces.
        lead=$(awk '{NF -= 1}1' <<< "$tmp1")
        num=$(wc -l <<< "$tmp2")
        for ((j = 1; j <= num; j++)); do
                printf '%s\n' "$lead"
        done >> part1
done < a.txt

paste part1 part2 > part3
[root@pc1 test01]# cat a.txt
chrY    2657879 2658063 CTCF    652     GM19239
chrY    2664424 2664734 CTCF    185     hL-hESC,HepG2,HUVEC
chrY    2668277 2668694 CTCF    766     Dnd41,H1-hESC,HepG2,HSMM,HSMMtube,HUVEC,NH-A,A549,ProgFib,AG10803,BJ,GM12864,HCM,HCPEpic,HEEpiC,HFF,HEF-Myc,HPF,HVMF,NHLF,RPTEC,SAEC
[root@pc1 test01]# bash a.sh
[root@pc1 test01]# ls
a.sh  a.txt  part1  part2  part3
[root@pc1 test01]# head part3 -n 5
chrY 2657879 2658063 CTCF 652   GM19239
chrY 2664424 2664734 CTCF 185   hL-hESC
chrY 2664424 2664734 CTCF 185   HepG2
chrY 2664424 2664734 CTCF 185   HUVEC
chrY 2668277 2668694 CTCF 766   Dnd41

 

原文:

https://home.cnblogs.com/u/chenwenyan

 

posted @ 2023-09-12 22:14  小鲨鱼2018  阅读(19)  评论(0)  编辑  收藏  举报