linux 中 awk 根据多列数据进行去重复

 

001、

复制代码
(base) [b20223040323@admin1 test2]$ ls
test.txt
(base) [b20223040323@admin1 test2]$ cat test.txt   ## 测试数据如下;根据第一列和第三列对数据进行去重复
ID=gene-RIN1    rna-XM_018043206.1      3615
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-LAT     rna-XM_018040784.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908
复制代码

 

002、逐行判断的写法(注意:该写法只与每个第一列值首次出现时记录的"第一列+第三列"比较,第一列相同而第三列不同的行不会被输出)

复制代码
(base) [b20223040323@admin1 test2]$ awk '{if(ay1[$1] == "") {ay1[$1] = "xxx"; tmp = $1$3; print $0; next}; if(tmp == $1$3) {print $0, "dup"}}' test.txt
ID=gene-RIN1    rna-XM_018043206.1      3615            ## 标记出重复
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917 dup
ID=gene-STRIP2  rna-XM_018046937.1      3917 dup
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-LAT     rna-XM_018040784.1      1790 dup
ID=gene-PSMD14  rna-XM_005676052.2      1908
(base) [b20223040323@admin1 test2]$ awk '{if(ay1[$1] == "") {ay1[$1] = "xxx"; tmp = $1$3; print $0; next}; if(tmp == $1$3) {next}}' test.txt
ID=gene-RIN1    rna-XM_018043206.1      3615               ## 去重复
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908
复制代码

 

003、简洁做法

复制代码
[root@pc1 test1]# ls
test.txt
[root@pc1 test1]# cat test.txt                       ## 测试文本
ID=gene-RIN1    rna-XM_018043206.1      3615
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-STRIP2  rna-XM_018046937.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-LAT     rna-XM_018040784.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908
[root@pc1 test1]# awk '!ay1[$1$3]++' test.txt        ## 根据1、3列去重复,并保持原来的顺序(更严谨可写作 ay1[$1,$3],避免两列直接拼接后产生相同的键)
ID=gene-RIN1    rna-XM_018043206.1      3615
ID=gene-STRIP2  rna-XM_018046935.1      3917
ID=gene-SLC35D2 rna-XM_018052476.1      1603
ID=gene-DENND6B rna-XM_018048970.1      4265
ID=gene-PSMD13  rna-XM_005700085.3      1683
ID=gene-LAT     rna-XM_018040785.1      1790
ID=gene-PSMD14  rna-XM_005676052.2      1908
复制代码

 。

 

posted @   小鲨鱼2018  阅读(99)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· 上周热点回顾(3.3-3.9)
· winform 绘制太阳,地球,月球 运作规律
点击右上角即可分享
微信分享提示