003生信人必练
- gtf 文件
序列的编号 注释信息的来源 注释信息的类型 开始与结束的位置 得分 序列的方向 起始编码的位置,仅对CDS有效 注释信息描述 11 ensembl_havana gene 5422111 5423206 ”.”表示为空。 +表示正义链, -反义链 , ? 表示未知. 有效值为0、1、2 键+值
11 ensembl_havana gene 5422111 5423206 . + . gene_id "ENSG00000167360"; gene_version "4"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
11 ensembl_havana transcript 5422111 5423206 . + . gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381";
11 ensembl_havana exon 5422111 5423206 . + . gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; exon_number "1"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381"; exon_id "ENSE00001276439"; exon_version "4";
11 ensembl_havana CDS 5422201 5423151 . + 0 gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; exon_number "1"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381"; protein_id "ENSP00000300778"; protein_version "4"; - 探究内容
-
import sys import re args = sys.argv '''sys.argv 是命令行参数,是一个字符串列表,0代表其路径''' class Genome_info: #这是一个基类,所有类型都通用的 def __init__(self): self.chr = "" #染色体号 self.start = 0 self.end = 0 class Gene(Genome_info): #这个函数继承了基类 def __init__(self): Genome_info.__init__(self) self.orientation = "" self.id = "" class Transcript(Genome_info): def __init__(self): Genome_info.__init__(self) self.id = "" self.parent = "" class Exon(Genome_info): def __init__(self): Genome_info.__init__(self) self.parent = "" def main(args): """ 一个输入参数: 第一个参数为物种gtf文件 :return: """ list_chr = [] list_gene = {} #因为有 id 号 所以用dir list_transcript = {} list_exon = [] # l_n = 0 with open(args[1]) as fp_gtf: #打开文件遍历GTF for line in fp_gtf: if line.startswith("#"): #号开头的注释过滤掉,不统计这个 continue # print ("in %s" % l_n) # l_n += 1 lines = line.strip("\n").split("\t") chr = lines[0] type = lines[2] start = int(lines[3]) end = int(lines[4]) orientation = lines[6] attr = lines[8] if not re.search(r'protein_coding', attr): #取到的是蛋白的编码,如果没有的话就跳过,不做统计了 continue if not chr in list_chr: #把染色体的类型添加到列表里 list_chr.append(chr) if type == "gene": gene = Gene() #初始化一个基因对象 id = re.search(r'gene_id "([^;]+)";?', attr).group(1) # 0 返回所有列表,1取第一个 gene.chr = chr gene.start = start gene.end = end gene.id = id gene.orientation = orientation list_gene[id] = gene # print(id) elif type == "transcript": transcript = Transcript() id = re.search(r'transcript_id "([^;]+)";?', attr).group(1) parent = re.search(r'gene_id "([^;]+)";?', attr).group(1) if not parent in list_gene: continue transcript.chr = chr transcript.start = start transcript.end = end transcript.id = id transcript.parent = parent list_transcript[id] = transcript elif type == "exon": exon = Exon() parent = re.search(r'transcript_id "([^;]+)";?', attr).group(1) if not parent in list_transcript: continue exon.chr = chr exon.start = start exon.end = end exon.parent = parent list_exon.append(exon) chr_gene(list_gene) gene_len(list_gene) gene_transcript(list_transcript) transcript_exon(list_exon) exon_pos(list_exon) def chr_gene(list_gene): """ 染色体上基因数量分布 :param list_gene: :return: """ print("染色体上基因数量分布") count_gene = {} #这是一个计数器 for info in list_gene.values(): chr = info.chr if chr in count_gene: count_gene[info.chr] += 1 else: count_gene[info.chr] = 1 with open("chr_gene.txt", 'w') as fp_out: for chr, num in count_gene.items(): print("\t".join([chr, str(num)]) + "\n") fp_out.write("\t".join([chr, str(num)]) + "\n") def gene_len(list_gene): """ 基因长度分布情况 :param list_gene: :return: """ print("基因长度分布情况") with open("gene_len.txt", 'w') as fp_out: for gene_id, info in list_gene.items(): len = info.end - info.start + 1 fp_out.write("\t".join([gene_id, str(len)]) + "\n") print("\t".join([gene_id, str(len)]) + "\n") def gene_transcript(list_transcript): """ 基因的转录本数量分布 :param list_transcript: :return: """ print("基因的转录本数量分布") count_transcript = {} for info in list_transcript.values(): gene_id = info.parent if gene_id in count_transcript: count_transcript[gene_id] += 1 else: count_transcript[gene_id] = 1 with open("gene_transcript.txt", 'w') as fp_out: for gene_id, num in count_transcript.items(): print("\t".join([gene_id, str(num)]) + "\n") fp_out.write("\t".join([gene_id, str(num)]) + "\n") def transcript_exon(list_exon): """ 转录本的外显子数量统计 :param list_exon: :return: """ print("转录本的外显子数量统计") count_exon = {} for exon in list_exon: transcript_id = exon.parent if transcript_id in count_exon: count_exon[transcript_id] += 1 else: count_exon[transcript_id] = 1 with open("transcript_exon.txt", 'w') as fp_out: for transcript_id, num in count_exon.items(): print("\t".join([transcript_id, str(num)]) + "\n") fp_out.write("\t".join([transcript_id, str(num)]) + "\n") def exon_pos(list_exon): """ 外显子坐标统计 :param list_exon: :return: """ print("外显子坐标统计") count_exon = {} for exon in list_exon: transcript_id = exon.parent if transcript_id in count_exon: count_exon[transcript_id] += ",%s-%s" % (str(exon.start), str(exon.end)) else: count_exon[transcript_id] = "%s-%s" % (str(exon.start), str(exon.end)) with open("exon_pos.txt", 'w') as fp_out: for transcript_id, pos in count_exon.items(): print("\t".join([transcript_id, pos]) + "\n") fp_out.write("\t".join([transcript_id, pos]) + "\n") def gene_exon_pos(list_gene, list_transcript, list_exon): """ 根据exon的parent将所有exon对应到transcript 根据transcript的parent将所有transcript对应到gene 根据gene按chr分组得到chromosome列表 从chromosome中输出某个指定基因的所有外显子坐标信息并画图 生信编程直播第五题 :param list_gene: :param list_transcript: :param list_exon: :return: """ pass if __name__ == "__main__": main(args)