Linux 中 awk 根据多列对数据进行去重复

 

001、

(base) [b20223040323@admin1 test2]$ ls
test.txt
(base) [b20223040323@admin1 test2]$ cat test.txt   ## 测试数据如下;根据第一列和第三列对数据进行去重复
ID=gene-RIN1    rna-XM_018043206.1      3615
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-LAT     rna-XM_018040784.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908

 

002、

(base) [b20223040323@admin1 test2]$ awk '{if(ay1[$1] == "") {ay1[$1] = "xxx"; tmp = $1$3; print $0; next}; if(tmp == $1$3) {print $0, "dup"}}' test.txt
ID=gene-RIN1    rna-XM_018043206.1      3615            ## 标记出重复
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917 dup
ID=gene-STRIP2  rna-XM_018046937.1      3917 dup
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-LAT     rna-XM_018040784.1      1790 dup
ID=gene-PSMD14  rna-XM_005676052.2      1908
(base) [b20223040323@admin1 test2]$ awk '{if(ay1[$1] == "") {ay1[$1] = "xxx"; tmp = $1$3; print $0; next}; if(tmp == $1$3) {next}}' test.txt
ID=gene-RIN1    rna-XM_018043206.1      3615               ## 去重复
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908

 

003、简洁做法

[root@pc1 test1]# ls
test.txt
[root@pc1 test1]# cat test.txt                       ## 测试文本
ID=gene-RIN1    rna-XM_018043206.1      3615
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-LAT     rna-XM_018040784.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908
[root@pc1 test1]# awk '!ay1[$1$3]++' test.txt        ## 根据1、3列去重复,并保持原来的顺序(注:$1$3 是直接拼接,不同的列组合可能拼出相同的键;更稳妥的写法是 ay1[$1,$3],awk 会用 SUBSEP 分隔两列)
ID=gene-RIN1    rna-XM_018043206.1      3615
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908

 。

 

posted @ 2024-02-18 10:08  小鲨鱼2018  阅读(66)  评论(0)  编辑  收藏  举报