1、算GC含量
def validate_base_sequence(base_sequence, RNAflag=False):
    """Return True if base_sequence contains only valid nucleotide letters.

    The check is case-insensitive.  The allowed letters are A, C, G and T,
    or A, C, G and U when RNAflag is true.  An empty sequence is valid.
    """
    allowed = set('UCAG' if RNAflag else 'TCAG')
    return set(base_sequence.upper()) <= allowed


def gc_content(base_seq, RNAflag=False):
    """Return the fraction of bases in base_seq that are G or C.

    The computation is case-insensitive.  An empty sequence yields 0.0
    instead of dividing by zero.

    Raises:
        AssertionError: if base_seq contains characters that are not valid
            bases for the selected alphabet (DNA by default, RNA when
            RNAflag is true).
    """
    # Raise explicitly instead of using an assert statement so that the
    # validation is not stripped when Python runs with -O; the exception
    # type is the same one an assert statement would raise.
    if not validate_base_sequence(base_seq, RNAflag):
        raise AssertionError('argument has invalid characters')
    seq = base_seq.upper()
    if not seq:
        return 0.0
    # Count on the upper-cased copy so lowercase input is handled correctly.
    return (seq.count('G') + seq.count('C')) / len(seq)


def recognition_site(base_seq, recognition_seq):
    """Return the index of the first recognition_seq in base_seq, or -1."""
    return base_seq.find(recognition_seq)


def test():
    """Run the self-checks for the sequence helpers and report success."""
    assert validate_base_sequence('ACTG')
    assert validate_base_sequence('')
    assert not validate_base_sequence('ACUG')
    assert validate_base_sequence('ACUG', True)
    assert not validate_base_sequence('ACUG', False)
    assert validate_base_sequence('ACTG', False)
    assert .5 == gc_content('ACTG')
    assert 1.0 == gc_content('CCGG')
    assert .25 == gc_content('ACTT')
    assert .5 == gc_content('actg')
    assert 0.0 == gc_content('')
    print('All tests passed.')


test()
2、元组
>>> DNABases, RNABases = 'TCAG', 'UCAG'
>>> DNABases
'TCAG'
>>> RNABases
'UCAG'
>>> bases = 'TCAG', 'UCAG'  # a two-element tuple
>>> bases
('TCAG', 'UCAG')
3、切割
def recognition_site(base_seq, recognition_seq):
    """Return the index of the first recognition_seq in base_seq, or -1."""
    return base_seq.find(recognition_seq)


def restriction_cut(base_seq, recognition_seq, offset=0):
    """Split base_seq at the first occurrence of recognition_seq.

    The cut position is the start of the recognition site plus offset.
    Returns a (left, right) tuple of the two fragments.  When the
    recognition sequence does not occur, nothing is cut and the result is
    (base_seq, '').
    """
    site = recognition_site(base_seq, recognition_seq)
    if site == -1:
        # str.find reports "not found" as -1; slicing with that value would
        # silently cut near the end of the sequence instead.
        return base_seq, ''
    # Clamp so a negative offset cannot turn into a from-the-end index.
    cut = max(site + offset, 0)
    return base_seq[:cut], base_seq[cut:]


aseq1 = 'AAAAATCCCGAGGCGGCTATATAGGGCTCCGGAGGCGTAATATAAAA'
left, right = restriction_cut(aseq1, 'TCCGGA')
4、读取fasta序列,并取最长的序列
def read_FASTA(filename):
    """Read a FASTA file and return a list of (header, sequence) tuples.

    A record starts at a line beginning with '>'; the rest of that line
    (without the '>') is the header.  All following lines up to the next
    header line are concatenated, with newlines removed, to form the
    sequence.  Lines before the first header are ignored.
    """
    result = []
    header = None
    chunks = []
    with open(filename) as file:
        for line in file:
            line = line.rstrip('\n')
            # Only a '>' at the start of a line opens a new record, so a
            # '>' inside a header description does not split the entry.
            if line.startswith('>'):
                if header is not None:
                    result.append((header, ''.join(chunks)))
                header = line[1:]
                chunks = []
            elif header is not None:
                chunks.append(line)
    if header is not None:
        result.append((header, ''.join(chunks)))
    return result


def longest_sequence(filename):
    """Return the longest sequence in the FASTA file named filename.

    On ties the first of the longest sequences is returned; a file with no
    records yields the empty string.
    """
    sequences = (seq for _info, seq in read_FASTA(filename))
    return max(sequences, key=len, default='')