biopython Sequence相关(一)

参考:http://biopython.org/DIST/docs/tutorial/Tutorial.html

1.构建Seq()对象

from Bio.Seq import Seq

# Build a Seq object from a plain string of nucleotide letters.
myseq = Seq("AGTACACTCA")

# Printing a Seq shows its letters; its type is Bio.Seq.Seq rather than str.
print(myseq)  # AGTACACTCA
print(type(myseq))  # <class 'Bio.Seq.Seq'>

 注:Seq对象的用法与标准Python字符串类似,但它并不是str类型,并且额外提供了complement()、transcribe()、translate()等生物学相关的方法。

2.Seq对象支持的方法

2.1 Seq对象与标准Python字符串一样,支持字符遍历、长度计算、索引、切片、拼接,以及.count()统计子串出现次数、.join()、字母大小写转换

from Bio.Seq import Seq

myseq = Seq("AGTACACTCA")

# Iterating over a Seq yields one letter at a time, just like a str.
for i, letter in enumerate(myseq):
    print(i, letter)
# Output, one "index letter" pair per line:
# 0 A / 1 G / 2 T / 3 A / 4 C / 5 A / 6 C / 7 T / 8 C / 9 A

print(len(myseq))  # 10 -- number of letters
print(myseq[0])  # A -- single-letter indexing
print(myseq[::2])  # ATCCC -- every second letter, starting at index 0
print(myseq[::-1])  # ACTCACATGA -- reversed copy
print(myseq + Seq("AGTAA"))  # AGTACACTCAAGTAA -- concatenation
print(myseq.count("CA"))  # 2 -- occurrences of the substring "CA"
print(myseq.lower())  # agtacactca -- lower-case copy

 2.2 计算seq对象的GC含量

from Bio.Seq import Seq

try:
    # Legacy helper that returns the G+C content as a percentage (0-100).
    from Bio.SeqUtils import GC
except ImportError:
    # GC() was deprecated in Biopython 1.80 and is no longer importable in
    # newer releases, so rebuild the same percentage on top of gc_fraction().
    from Bio.SeqUtils import gc_fraction

    def GC(seq):
        """Return the G+C content of *seq* as a percentage between 0 and 100."""
        # ambiguous="ignore" keeps every letter in the denominator, which is
        # how the legacy GC() computed its percentage.
        return 100 * gc_fraction(seq, ambiguous="ignore")

myseq = Seq("AGTACACTCA")
# 1 G + 3 C out of 10 letters.
print(GC(myseq))  # 40.0

注:从biopython 1.80起,计算GC含量的函数GC被弃用(更新的版本中已被移除),应改用gc_fraction;注意gc_fraction返回的是0到1之间的小数,而不是百分数。

"""
从biopython 1.80起,计算GC含量的函数GC被gc_fraction取代:
Bio.SeqUtils.gc_fraction(seq, ambiguous='remove')
返回值是0到1之间的浮点数,表示序列中G+C所占的比例(不是百分数)
歧义(ambiguous)核苷酸指ATCGSW(S代表G或C,W代表A或T)以外的字符
若ambiguous='remove'(默认),只统计GCS,且序列长度只计入ATCGSW
若ambiguous='ignore',只统计GCS,但序列长度计入包括歧义核苷酸在内的所有字符
若ambiguous='weighted',统计歧义核苷酸时使用其平均值,例如G和C记为1,N和X记为0.5
"""
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction

myseq = Seq("AGTACACTCA")

# 4 of the 10 letters are G or C; the result is a fraction, not a percentage.
print(gc_fraction(myseq))  # 0.4

 2.3 将Seq对象转换为字符串

from Bio.Seq import Seq

myseq1 = Seq("CACTCA")

# str() returns the letters of the Seq as a plain Python string.
print(str(myseq1))  # CACTCA

# %-formatting converts the Seq to text implicitly; here it fills a
# FASTA-like record (header line followed by the sequence line).
print(">name\n%s\n" % myseq1)
# >name
# CACTCA

 2.4 获取核苷酸Seq对象的互补序列

from Bio.Seq import Seq

myseq2 = Seq("CGATAA")

# complement() swaps each base for its pairing partner and keeps the order.
print(myseq2.complement())  # GCTATT

# reverse_complement() gives the complement read in the opposite direction.
print(myseq2.reverse_complement())  # TTATCG

2.5 转录 

from Bio.Seq import Seq

coding_seq = Seq("GCAATCGAT")

# The template strand is the reverse complement of the coding strand.
template_seq = coding_seq.reverse_complement()
print(template_seq)  # ATCGATTGC

# Transcription: applied to the coding strand, it replaces T with U.
messenger_seq = coding_seq.transcribe()
print(messenger_seq)  # GCAAUCGAU

# Back-transcription: turns the mRNA back into DNA letters (U -> T).
back_messenger_seq = messenger_seq.back_transcribe()
print(back_messenger_seq)  # GCAATCGAT

2.6 翻译

from Bio.Seq import Seq

messenger_seq = Seq("GCAAUCGAUCCGCUGUGAAAAGGGUGA")

# By default, translation uses the standard genetic code (table=1);
# each stop codon shows up as "*" in the protein.
seq1 = messenger_seq.translate()
print(seq1)  # AIDPL*KG*

# Genetic code for vertebrate mitochondrial sequences; with this table
# the UGA codons above are read as W instead of a stop.
seq2 = messenger_seq.translate(table="Vertebrate Mitochondrial")
print(seq2)  # AIDPLWKGW

# Stop-codon options.
seq3 = messenger_seq.translate(to_stop=True)  # stop at the first stop codon
print(seq3)  # AIDPL
seq4 = messenger_seq.translate(stop_symbol="@")  # custom symbol for stop codons
print(seq4)  # AIDPL@KG@

2.7 翻译Tables(具体见官网教程)

2.8 MutableSeq objects:可构建可变序列

from Bio.Seq import MutableSeq, Seq

# Like an ordinary Python string, a Seq object is read-only (immutable).
myseq3 = Seq("GCCATTGTAATGGGCCGTA")
# Assigning to an index, e.g. myseq3[4] = "T", raises an error.

# MutableSeq provides an editable version of the same letters.
myseq4 = MutableSeq(myseq3)
print(myseq4)  # GCCATTGTAATGGGCCGTA

myseq4[4] = "C"  # overwrite the letter at index 4
print(myseq4)  # GCCACTGTAATGGGCCGTA

myseq4.remove("C")  # delete the first "C", scanning from left to right
print(myseq4)  # GCACTGTAATGGGCCGTA

myseq4.reverse()  # reverse in place, like list.reverse()
print(myseq4)  # ATGCCGGGTAATGTCACG

# Wrap the result in Seq again to get an immutable sequence back.
myseq5 = Seq(myseq4)
print(type(myseq5))  # <class 'Bio.Seq.Seq'>

 

posted @ 2023-03-19 23:19  yayagogogo  阅读(108)  评论(0编辑  收藏  举报