python 根据染色体起始终止点坐标来获取碱基序列
在工作当中,有时候我们知到染色体编号以及染色体起始终止坐标,我们想知道这段序列是什么样的碱基。
其一,我们一般用去UCSC的genome browser里面去查询 ,其实也可以从UCSC的接口去解析网页,然后在提取序列信息
比如chr17:7676091,7676196 ,那么我只需要构造下面一个网页地址 http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment=chr17:7676091,7676196
然后 hg38可以更换成hg19,dna?segment= 后面可以按照标准格式更换,就可以返回我们想要的序列了。现在对网页返回 一个xml格式的信息,用python解析一下
1 import requests
2 import re 3 from bs4 import BeautifulSoup 4 import xlwt 5 import xlrd 6 from xlutils.copy import copy 7 import os ,sys 8 #print(sys.path) 9 cwd=os.getcwd() 10 11 12 13 def getHTMLText(url): 14 print("111111") 15 try: 16 header = {'user-agent': 'Mozilla/5.0'} 17 r = requests.get(url,headers = header,timeout = 30 ) 18 r.raise_for_status() 19 r.encoding =r.apparent_encoding 20 print("get 222222222222222") 21 return r.text 22 except: 23 return "" 24 25 def fillDNAList(dnalist, html): 26 # 使用正则表达式获取dna 序列的头文件 27 match = re.search('SEQUENCE([\s\S]*?version="1.00")', html) 28 print("match ok") 29 if match: 30 dna_header = re.search('SEQUENCE([\s\S]*?version="1.00")', html) 31 #print('10====>', dna_header.group()) 32 #dna_header 存到列表 33 dnalist.append(dna_header.group()) 34 35 match = re.search('DNA.*?length="(\d*)"', html) 36 if match: 37 length_header= re.search('DNA.*?length="(\d*)"', html) 38 #print('11=====>', length_header.group()) 39 dnalist.append(length_header.group()) 40 41 #使用 BeautifulSoup 42 print("BeautifulSoup tag属性 获取dna标签属性的字符串部分") 43 soup = BeautifulSoup(html, 'html.parser') 44 tag = soup.dna 45 seq = soup.dna.string 46 seq = seq.replace('\n','').upper() #换行符删除掉,转换成大写 47 # seq 存到列表 48 dnalist.append(seq) 49 print("final======>",dnalist) 50 return dnalist 51 52 def write_excell(dnalist,chrnum,pos): 53 head = '>hg19' + ' ' + dnalist[0] + dnalist[1] 54 f = xlwt.Workbook(encoding='utf-8', style_compression=0) #创建新的Excel(新的workbook) 55 sheet = f.add_sheet('test8', cell_overwrite_ok=True) #创建新的表单 56 # 先写第一行的头文件 57 sheet.write(0,0,head) 58 #再从第二行开始写,每行写入50 个字符串 59 dna = dnalist[2] 60 print('=====',dna,type(dna)) 61 for i in range(0,len(dna),50): 62 sheet.write((int((i+1)/50))+1,0,dna[i:i+50]) 63 64 out_file = 'chrmosome%s_%s.xls'% (chrnum,pos) 65 f.save(out_file) 66 out_file_dir = os.path.join(cwd, out_file) 67 return out_file_dir 68 69 def modify_excell(out_file_dir,chrnum,pos): 70 ''' 71 改Excel表(xlutils模块) 72 :return: 73 ''' 74 rb = xlrd.open_workbook(out_file_dir) # 打开out_file.xls文件,创建工作簿实例对象 75 sheet = rb.sheet_by_index(0) 76 nrow11 = sheet.cell_value(10, 0) #修改第11行第一列,索引是10,0 77 # 根据需要截取原单元格里面的内容与需要添加的内容进行拼接 78 new_nrow11 = '[' + nrow11 79 # 同理操作nrow12 80 nrow12 = sheet.cell_value(11, 0) 81 new_nrow12 = nrow12 + ']' 82 83 wb = copy(rb) 84 ws = wb.get_sheet(0) 85 # 往单元格中写入拼接后的新字符串内容 86 ws.write(10,0,new_nrow11) 87 ws.write(11, 0, new_nrow12) 88 modify_file = 'new_chrmosome%s_%s.xls' % (chrnum,pos) 89 wb.save(modify_file) 90 91 def main(): 92 hg19 = "hg19" 93 chrnum = 17 94 pos = 7676091 95 start = pos - 500 96 end = pos + 500 97 position_DNA_list = [] 99 url = f"http://genome.ucsc.edu/cgi-bin/das/{hg19}/dna?segment=chr{chrnum}:{start},{end}" 100 101 print(url) 102 html = getHTMLText(url) 103 dnalist = fillDNAList(position_DNA_list,html) 104 out_file_dir = write_excell(dnalist,chrnum,pos) 105 modify_excell(out_file_dir,chrnum,pos) 106 107 main()
结果如下:
http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr17:7675591,7676591 match ok BeautifulSoup tag属性 获取dna标签属性的字符串部分 final======> ['SEQUENCE id="chr17" start="7675591" stop="7676591" version="1.00"', 'DNA length="1001"', 'CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC'] ===== CCCAAGAGCCTTCAGTATACACATCAATAAAAATAATTTTAATTATTCTGATAAAAGATAAACATGAAAAGTTATGGTATGCAAAGTTGAATGACAACAACTGATACTATTTGAAATAATTGACAGAATTATATTCCGTAACAATTTATAAGCAAAGCCAAAAAAACAATGATCCCTTTGTTGAATGCACAGAACAAATCCATCTTGTCCACGGCTACTGAGCATGCCTGTGATCTCCAGGGGTCACTCAGGTTTGACTCAAAGGATCCAACAGCCTGTAGACCCTGTGCTTGAAGGCATGAGGGTCACCTCTGAGTTCACACTCACTAGTGTCCCTCCTTTCTTCAGAAAGCTAGGAACTGGGAAGACAAGGGGAAAATCAATCAAGGCCTGAGGTATGGGGCTGTAGGCTGGGAGGAAACTAACATTATTGAGAAGCTACTGATGTGAATACATTTCAATTACTACTCACATTGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTTTTAAGACGGAGTTTTGCTCTCGTTGCCCAGGCTGGAGTGCAATGGAATGATCTAAGGTCACCACAACCTCCACCTCCCGGTTCAAGCAATTCTCCTGCCTCAGCCTCCCAAGTAGCTGGGACTACAGGCGTGTGCCACCACACCCAGCTAAGTTTGTATTTTTTTAGTAGAGACGGTGTTTCACCATGTTGGTCAGGCTGGTCTCGAACTCCTGACCTCAAGTGATCCACCCACCTCGGCCTCCCAAAGTGCTGGGATTATAGGCATGAGCCACCACACCCAGCCTCACGTTGGTTTTTGAGATGGATTTTATTGCCATTTTGTACACAAAAAGGTCAAAACTCAGTGAGGTGAATTGACATGACAGTAAGTGAAAGAACTACTATCTGATTGGGGGTCTTCTGCCGCCTGCTCTGGGACTCTTTCTGCTATGACATGAAGGACATTGGCAACCCCAGTCCTTGCAGATTTCTTTCACTGTGTGC <class 'str'>