提取最长转录本|多物种批量操作版

点击查看代码

#!/usr/bin/env python
# -*- coding=utf-8 -*-
 
'''
提取序列文件中最长的转录本ID
需要修改######位置的参数 以及 open的目录
'''
 
import sys
import re
 
Fasta=open("/Business/psn_company/sc01/Workbase_Wangcanbiao/reference/MF_new/pep.fa","r")
Sequence={}
 
## 1.创建ID和序列的字典
for line in Fasta.readlines():
  content=line.strip()
  if content.startswith(">"):
    nameS=content[1:]
    name=re.sub(" .*$","",nameS)       ######修改一下序列的header
    Sequence[name]=''
  else:
    Sequence[name]+=content
Fasta.close()
#print(Sequence.keys())
 
 
## 2.提取每个转录本的序列长度，形成三列
Out=open("./gene_id_len.txt","w")
for i in Sequence.keys():
    #print(i+"\t"+str(len(Sequence[i]))+"\t"+i.split(".")[0]+"\n")
    Out.write(i+"\t"+str(len(Sequence[i]))+"\t"+i.split(".")[0]+"\n")   #第一列是转录本ID，第二列是转录本长度，第三列是gene ID
Out.close()
 
 
## 3.这一步准备对out文件进行排序，找到每个基因最长的转录本
from operator import itemgetter
gene_id_len=open("./gene_id_len.txt","r")
table = []
for line in gene_id_len:
    col = line.strip().split("\t")
    col[0] = str(col[0])
    col[1] = int(col[1])
    col[2] = str(col[2])
    table.append(col)                                              #生成嵌套式列表
#print(table)
table_sorted = sorted(table, key=itemgetter(2, 1),reverse=True)                 #先后按列索引2,1排序
#print(table_sorted)
output_file = open("./gene_id_len_sorted.txt","w")
for row in table_sorted:                                           #遍历读取排序后的嵌套列表
    row = [str(x) for x in row]                                    #列表要转换为字符串才可以写入文本
    output_file.write("\t".join(row) + '\n')
output_file.close()
 
 
## 4.找到最长转录本
input_file=open("./gene_id_len_sorted.txt","r")
dict2={}
for line in input_file.readlines():
    col = line.strip().split("\t")
    col[0] = str(col[0])
    col[1] = int(col[1])
    col[2] = str(col[2])
    if col[2] not in dict2:
        dict2[col[2]]=col[0]
    else:
        dict2[col[2]]+= '\t'+col[0]
#print(dict2)                     
#print(dict2.values())
list_values=list(dict2.values())
#print(list_values)
result_file = open("./longest_pep_id.txt","w")
for line in list_values:
    col = line.strip().split("\t")
    col[0] = str(col[0])
    #print(col[0])
    result_file.write(col[0]+"\n")
result_file.close()

zcat hairpin.fa.gz | seqkit grep -f list > new.fa

点击查看代码

import sys
import re
def func(pep):
    Fasta=open(pep,"r")
    Sequence={}
    ## 1.创建ID和序列的字典
    for line in Fasta.readlines():
        content=line.strip()
        if content.startswith(">"):
            nameS=content[1:]
            name=re.sub(" .*$","",nameS)       ######修改一下序列的header
            Sequence[name]=''
        else:
            Sequence[name]+=content
            Fasta.close()

    #字符串拼接
    str2 = [pep[0:len(pep)-3] + '.txt']
    str1 = ''.join(str2)
    output_file2 = [pep[0:len(pep)-3] + '.sorted.txt']
    output_file1 = ''.join(output_file2)
    longest2 = [pep[0:len(pep)-3] + '.longest.txt']
    longest1 = ''.join(longest2)
    
    Out=open(str1,"w")
    for i in Sequence.keys():
        #print(i+"\t"+str(len(Sequence[i]))+"\t"+i.split(".")[0]+"\n")
        Out.write(i+"\t"+str(len(Sequence[i]))+"\t"+i.split(".")[0]+"\n")   #第一列是转录本ID，第二列是转录本长度，第三列是gene ID
    Out.close()
    
    from operator import itemgetter
    gene_id_len=open(str1,"r")
    table = []
    for line in gene_id_len:
        col = line.strip().split("\t")
        col[0] = str(col[0])
        col[1] = int(col[1])
        col[2] = str(col[2])
        table.append(col)                                              #生成嵌套式列表
    #print(table)
    table_sorted = sorted(table, key=itemgetter(2, 1),reverse=True)                 #先后按列索引2,1排序
    #print(table_sorted)
    output_file = open(output_file1,"w")
    for row in table_sorted:                                           #遍历读取排序后的嵌套列表
        row = [str(x) for x in row]                                    #列表要转换为字符串才可以写入文本
        output_file.write("\t".join(row) + '\n')
    output_file.close()    
    
    input_file=open(output_file1,"r")
    dict2={}
    for line in input_file.readlines():
        col = line.strip().split("\t")
        col[0] = str(col[0])
        col[1] = int(col[1])
        col[2] = str(col[2])
        if col[2] not in dict2:
            dict2[col[2]]=col[0]
        else:
            dict2[col[2]]+= '\t'+col[0]
    #print(dict2)                     
    #print(dict2.values())
    list_values=list(dict2.values())
    #print(list_values)
    result_file = open(longest1,"w")
    for line in list_values:
        col = line.strip().split("\t")
        col[0] = str(col[0])
        #print(col[0])
        result_file.write(col[0]+"\n")
    result_file.close()


import os
import sys
import re

dir = './'

filelist=[]
str2 = []
output_file2 = []
longest2 = []
for i in os.listdir(dir):  #遍历整个文件夹
    path = os.path.join(dir, i)
    if os.path.isfile(path):  #判断是否为一个文件，排除文件夹
        if os.path.splitext(path)[1]==".fa":  #判断文件扩展名是否为“.fa”
            filelist.append(i)
for i in filelist:
    func(i)


for id in *.fa 
	do
		#echo ${id%.*}
		nohup cat ${id} | seqkit grep -f ${id%.*}.longest.txt > ${id%.*}.result
	done &

posted on 2022-09-28 17:49 Bonjour_! 阅读(379) 评论(0) 收藏举报

刷新页面返回顶部

公告