读书笔记 Bioinformatics Algorithms Chapter5
Chapter5 HOW DO WE COMPARE DNA SEQUENCES
Bioinformatics Algorithms-An_Active Learning Approach
http://bioinformaticsalgorithms.com/
一、
1983年,Russell Doolitte 将血小板源生长因子[platelet derived growth factor(PDGF),一种刺激细胞增殖的物质]和其它已知基因比对,发现它的序列和原癌基因(oncogene)的序列有很高的相似度。这让科学家猜测某些癌症是因为基因在不合适的时机作用所致(scientists hypothesized that some forms of cancer might be caused by a good gene doing the right thing at the wrong time.)。
二、提出问题 序列比对:动态规划法
1.全局比对:
1 ''' 2 Code Challenge: Solve the Global Alignment Problem. 3 Input: Two protein strings written in the single-letter amino acid alphabet. 4 Output: The maximum alignment score of these strings followed by an alignment achieving this maximum score. Use the 5 BLOSUM62 scoring matrix for matches and mismatches as well as the indel penalty σ = 5. 6 ---------- 7 Sample Input: 8 PLEASANTLY 9 MEANLY 10 ---------- 11 Sample Output: 12 8 13 PLEASANTLY 14 -MEA--N-LY 15 ---------- 16 @ Lo Kowngho 2018.9 17 ''' 18 import numpy 19 from os.path import dirname 20 21 def Grade(Symb1,Symb2): 22 Indx1 = symbolList[Symb1] 23 Indx2 = symbolList[Symb2] 24 return matrix[Indx1][Indx2] 25 26 def Init_Graph_Global(l1,l2): 27 Graph = numpy.zeros([l2,l1]) 28 for x in range(1,l2): 29 Graph[x][0] = Graph[x-1][0]-5 30 for y in range(1,l1): 31 Graph[0][y] = Graph[0][y-1]-5 32 return Graph 33 34 def Init_Path(l1,l2): 35 Path = numpy.zeros([l2,l1]) 36 for x in range(1,l2): 37 Path[x][0] = 1 38 for y in range(1,l1): 39 Path[0][y] = 2 40 return Path 41 42 def buildGlobalAlignmentGraph(text1,text2): 43 44 l1 = len(text1) 45 l2 = len(text2) 46 Graph = Init_Graph_Global(l1, l2) 47 Path = Init_Path(l1, l2) 48 49 for x in range(1,l2): 50 for y in range(1,l1): 51 Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[y],text2[x])) 52 if Graph[x-1][y]-5==Graph[x][y]: 53 Path[x][y]=1 54 elif Graph[x][y-1]-5==Graph[x][y]: 55 Path[x][y]=2 56 else: 57 Path[x][y]=3 58 return [Graph,Path] 59 60 61 def OutputGlobalAligement(Path,Graph,text1,text2): 62 outT1 = '' 63 outT2 = '' 64 l1 = len(text1) 65 l2 = len(text2) 66 x = l2-1 67 y = l1-1 68 while(x!=0 or y!=0): 69 if Path[x][y]==1: 70 outT1 += '-' 71 outT2 += text2[x] 72 x -= 1 73 elif Path[x][y]==2: 74 outT1 += text1[y] 75 outT2 += '-' 76 y -= 1 77 else: 78 outT1 += text1[y] 79 outT2 += text2[x] 80 x -= 1 81 y -= 1 82 return [outT1[::-1],outT2[::-1]] 83 84 def ImportScoreMatrix(): 85 dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n') 86 symbolList = dataset[0].split() 87 for i in range(len(symbolList)): 88 symbolList[i]=[symbolList[i],i] 89 symbolList = dict(symbolList) 90 print(symbolList) 91 matrix = [] 92 for i in range(1,len(dataset)): 93 matrix.append(dataset[i].split()[1:]) 94 for l in range(len(matrix)): 95 for i in range(len(matrix[l])): 96 matrix[l][i]=int(matrix[l][i]) 97 return [matrix,symbolList] 98 99 if __name__ == '__main__': 100 101 [matrix,symbolList] = ImportScoreMatrix() 102 103 dataset = open(dirname(__file__)+'dataset.txt').read().strip().split() 104 text1 = '0'+dataset[0] 105 text2 = '0'+dataset[1] 106 107 [Graph,Path] = buildGlobalAlignmentGraph(text1, text2) 108 109 [outT1,outT2] = OutputGlobalAligement(Path,Graph,text1,text2) 110 111 print(int(Graph[-1][-1])) 112 print(outT1) 113 print(outT2)
2. 局部比对
可以把局部比对想象成下面的Free Taxi场景,在开始和结尾都不受罚分约束,只在中间的某一过程受罚分约束。
在全局比对的基础上,状态转移方程在加上一个0,表示每一个点,既可以由→、↓、↘经过罚分得到,也可以直接由起点,不经罚分得到(Free Taxi)。
1 ''' 2 Code Challenge: Solve the Local Alignment Problem. 3 Input: Two protein strings written in the single-letter amino acid alphabet. 4 Output: The maximum score of a local alignment of the strings, followed by a local alignment of these strings achieving the maximum 5 score. Use the PAM250 scoring matrix for matches and mismatches as well as the indel penalty σ = 5. 6 --------------- 7 Sample Input: 8 MEANLY 9 PENALTY 10 --------------- 11 Sample Output: 12 15 13 EANL-Y 14 ENALTY 15 --------------- 16 Lo Kwongho 2018.9 17 ''' 18 import numpy 19 from os.path import dirname 20 21 def Grade(Symb1,Symb2): 22 Indx1 = symbolList[Symb1] 23 Indx2 = symbolList[Symb2] 24 return matrix[Indx1][Indx2] 25 26 def Init_Graph_Local(l1,l2): 27 Graph = numpy.zeros([l1,l2]) 28 return Graph 29 30 def Init_Path(l1,l2): 31 Path = numpy.ones([l1,l2])*-1 32 for x in range(1,l1): 33 Path[x][0] = 1 34 for y in range(1,l2): 35 Path[0][y] = 2 36 return Path 37 38 def buildLocalAlignmentGraph(text1,text2): 39 l1 = len(text1) 40 l2 = len(text2) 41 Graph = Init_Graph_Local(l1, l2) 42 Path = Init_Path(l1, l2) 43 44 for x in range(1,l1): 45 for y in range(1,l2): 46 Graph[x][y] = max(Graph[x-1][y]-5, Graph[x][y-1]-5, Graph[x-1][y-1]+Grade(text1[x],text2[y]),0) 47 if Graph[x-1][y]-5 == Graph[x][y]: 48 Path[x][y] = 1 49 elif Graph[x][y-1]-5==Graph[x][y]: 50 Path[x][y] = 2 51 elif Graph[x][y] == 0: 52 Path[x][y] = 0 53 else: 54 Path[x][y] = 3 55 maxVal = 0 56 maxIndx = [-1,-1] 57 for x in range(1,l1): 58 for y in range(1,l2): 59 if Graph[x][y]>maxVal: 60 maxVal=Graph[x][y] 61 maxIndx=[x,y] 62 63 return [Graph,Path,maxIndx] 64 65 def OutputLocalAligement(Path,Graph,text1,text2,maxIndx): 66 outT1 = '' 67 outT2 = '' 68 l1 = len(text1) 69 l2 = len(text2) 70 x = maxIndx[0] 71 y = maxIndx[1] 72 while(x!=0 or y!=0): 73 if Path[x][y]==1: 74 outT1 += text1[x] 75 outT2 += '-' 76 x -= 1 77 elif Path[x][y]==2: 78 outT1 += '-' 79 outT2 += text2[y] 80 y -= 1 81 elif Path[x][y]==3: 82 outT1 += text1[x] 83 outT2 += text2[y] 84 x -= 1 85 y -= 1 86 else: 87 x=0 88 y=0 89 return [outT1[::-1],outT2[::-1]] 90 91 92 def ImportScoreMatrix(): 93 dataset = open(dirname(__file__)+'PAM250.txt').read().strip().split('\n') 94 symbolList = dataset[0].split() 95 for i in range(len(symbolList)): 96 symbolList[i]=[symbolList[i],i] 97 symbolList = dict(symbolList) 98 matrix = [] 99 for i in range(1,len(dataset)): 100 matrix.append(dataset[i].split()[1:]) 101 for l in range(len(matrix)): 102 for i in range(len(matrix[l])): 103 matrix[l][i]=int(matrix[l][i]) 104 return [matrix,symbolList] 105 106 if __name__ == '__main__': 107 [matrix,symbolList] = ImportScoreMatrix() 108 109 dataset = open(dirname(__file__)+'dataset.txt').read().strip().split() 110 text1 = '0'+dataset[0] 111 text2 = '0'+dataset[1] 112 113 [Graph,Path,maxIndx] = buildLocalAlignmentGraph(text1,text2) 114 115 [outT1,outT2]=OutputLocalAligement(Path,Graph,text1,text2,maxIndx) 116 print(int(Graph[maxIndx[0]][maxIndx[1]])) 117 print(outT1) 118 print(outT2)
3. Overlarp Alignment
1 ''' 2 Code Challenge: Solve the Overlap Alignment Problem. 3 >>Input: Two strings v and w, each of length at most 1000. 4 >>Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of v and a prefix w' of w. 5 achieving this maximum score. Use an alignment score in which matches count +1 and both the mismatch and indel penalties are 2. 6 ------------------- 7 Sample Input: 8 PAWHEAE 9 HEAGAWGHEE 10 ------------------- 11 Sample Output: 12 1 13 HEAE 14 HEAG 15 ------------------- 16 coder: Lo Kwongho 17 ''' 18 19 import numpy 20 from os.path import dirname 21 22 def Init_Graph_Overlap(l1,l2): 23 Graph = numpy.zeros([l1,l2]) 24 for y in range(1,l2): 25 Graph[0][y] = Graph[0][y-1]-1 26 return Graph 27 28 def Init_Path(l1,l2): 29 Path = numpy.ones([l1,l2])*-1 30 for x in range(1,l1): 31 Path[x][0] = 1 32 for y in range(1,l2): 33 Path[0][y] = 2 34 return Path 35 36 def buildOverlapAlignmentGraph(text1,text2): 37 l1 = len(text1) 38 l2 = len(text2) 39 Graph = Init_Graph_Overlap(l1, l2) 40 Path = Init_Path(l1,l2) 41 for x in range(1,l1): 42 for y in range(1,l2): 43 if text1[x]==text2[y]: 44 Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-2,Graph[x][y-1]-2) 45 else: 46 Graph[x][y]=max(Graph[x-1][y-1]-2,Graph[x-1][y]-2,Graph[x][y-1]-2) 47 if Graph[x][y]==Graph[x-1][y]-2: 48 Path[x][y]=1 49 elif Graph[x][y]==Graph[x][y-1]-2: 50 Path[x][y]=2 51 else: 52 Path[x][y]=3 53 54 maxVal = 0 55 maxIndx = -1 56 for i in range(l2): 57 if Graph[l1-1][i]>maxVal: 58 maxVal=Graph[l1-1][i] 59 maxIndx=i 60 61 return [Graph,Path,maxIndx,maxVal] 62 63 def OutputOverlapAligement(Path,Graph,text1,text2,maxIndx): 64 outT1 = '' 65 outT2 = '' 66 l1 = len(text1) 67 l2 = len(text2) 68 x = l1-1 69 y = maxIndx 70 while(y!=0): 71 if Path[x][y]==1: 72 outT1 += text1[x] 73 outT2 += '-' 74 x -= 1 75 elif Path[x][y]==2: 76 outT1 += '-' 77 outT2 += text2[y] 78 y -= 1 79 elif Path[x][y]==3: 80 outT1 += text1[x] 81 outT2 += text2[y] 82 x -= 1 83 y -= 1 84 else: 85 x=0 86 y=0 87 return [outT1[::-1],outT2[::-1]] 88 89 90 if __name__ == '__main__': 91 dataset = open(dirname(__file__)+'dataset.txt').read().strip().split() 92 text1 = '0'+dataset[0] 93 text2 = '0'+dataset[1] 94 l1 = len(text1) 95 l2 = len(text2) 96 [Graph,Path,maxIndx,maxVal] = buildOverlapAlignmentGraph(text1,text2) 97 #print(Graph) 98 99 [outText1,outText2]=OutputOverlapAligement(Path, Graph, text1, text2, maxIndx) 100 101 print(int(maxVal)) 102 print(outText1) 103 print(outText2)
4.Fitting Alignment
1 ''' 2 Fitting Alignment Problem: Construct a highest-scoring fitting alignment between two strings. 3 >>Input: Strings v and w as well as a matrix Score. 4 >>Output: A highest-scoring fitting alignment of v and w as defined by the scoring matrix Score. 5 ------------------- 6 Sample Input: 7 GTAGGCTTAAGGTTA 8 TAGATA 9 ------------------- 10 Sample Output: 11 2 12 TAGGCTTA 13 TAGA--TA 14 ------------------- 15 coder: Lo Kwongho 16 ''' 17 18 import numpy 19 from os.path import dirname 20 21 def Init_Graph_Fiting(l1,l2): 22 Graph = numpy.zeros([l1,l2]) 23 for y in range(1,l2): 24 Graph[0][y] = Graph[0][y-1]-1 25 return Graph 26 27 def Init_Path(l1,l2): 28 Path = numpy.ones([l1,l2])*-1 29 for x in range(1,l1): 30 Path[x][0] = 1 31 for y in range(1,l2): 32 Path[0][y] = 2 33 return Path 34 35 def buildFittingAlignmentGraph(text1,text2): 36 l1 = len(text1) 37 l2 = len(text2) 38 Graph = Init_Graph_Fiting(l1, l2) 39 Path = Init_Path(l1,l2) 40 for x in range(1,l1): 41 for y in range(1,l2): 42 if text1[x]==text2[y]: 43 Graph[x][y]=max(Graph[x-1][y-1]+1,Graph[x-1][y]-1,Graph[x][y-1]-1) 44 else: 45 Graph[x][y]=max(Graph[x-1][y-1]-1,Graph[x-1][y]-1,Graph[x][y-1]-1) 46 if Graph[x][y]==Graph[x-1][y]-1: 47 Path[x][y]=1 48 elif Graph[x][y]==Graph[x][y-1]-1: 49 Path[x][y]=2 50 else: 51 Path[x][y]=3 52 53 maxVal = 0 54 maxIndx = -1 55 for i in range(l1): 56 if Graph[i][l2-1]>maxVal: 57 maxVal=Graph[i][l2-1] 58 maxIndx=i 59 60 return [Graph,Path,maxIndx,maxVal] 61 62 def OutputFittingAligement(Path,Graph,text1,text2,maxIndx): 63 outT1 = '' 64 outT2 = '' 65 l1 = len(text1) 66 l2 = len(text2) 67 x = maxIndx 68 y = l2-1 69 while(y!=0): 70 if Path[x][y]==1: 71 outT1 += text1[x] 72 outT2 += '-' 73 x -= 1 74 elif Path[x][y]==2: 75 outT1 += '-' 76 outT2 += text2[y] 77 y -= 1 78 elif Path[x][y]==3: 79 outT1 += text1[x] 80 outT2 += text2[y] 81 x -= 1 82 y -= 1 83 else: 84 x=0 85 y=0 86 return [outT1[::-1],outT2[::-1]] 87 88 89 if __name__ == '__main__': 90 dataset = open(dirname(__file__)+'dataset.txt').read().strip().split() 91 text1 = '0'+dataset[0] 92 text2 = '0'+dataset[1] 93 l1 = len(text1) 94 l2 = len(text2) 95 [Graph,Path,maxIndx,maxVal] = buildFittingAlignmentGraph(text1,text2) 96 97 [outText1,outText2]=OutputFittingAligement(Path, Graph, text1, text2, maxIndx) 98 #print(Graph) 99 print(int(maxVal)) 100 print(outText1) 101 print(outText2)
这四种比对的关系如图:
全局比对 局部比对
Overlarp Alignment Fitting Alignment
5、基因的插入和删除,通常都是连续的一段,故在比对出现的连续空位,应该把它们当作一个整体看待。在比对的空位罚分中,生物学家认为,在每一条链上新开一个空位,应罚重分,而空位的延续,罚分应较少:
解决问题的方法是:开三个矩阵,每个矩阵代表一种方向。在→、↓方向上行走,代表产生空位。故每当从↘转移到→、↓,或者→、↓间转移,代表在某链上产生新空位,重罚,而在→、↓内转移,代表空位延续,轻罚。
1 ''' 2 Code Challenge: Solve the Alignment with Affine Gap Penalties Problem. 3 >>Input: Two amino acid strings v and w (each of length at most 100). 4 >>Output: The maximum alignment score between v and w, followed by an alignment of v and w achieving this maximum score. Use the 5 BLOSUM62 scoring matrix, a gap opening penalty of 11, and a gap extension penalty of 1. 6 --------------------- 7 Sample Input: 8 PRTEINS 9 PRTWPSEIN 10 --------------------- 11 Sample Output: 12 8 13 PRT---EINS 14 PRTWPSEIN- 15 --------------------- 16 coder: Lo Kwongho 17 ''' 18 import numpy 19 from os.path import dirname 20 negINFINITY = -999 21 #Penalties 22 gs = -10 #gap_Start 23 ge = -1 #gap_Extend 24 # 25 def Grade(Symb1,Symb2): 26 Indx1 = symbolList[Symb1] 27 Indx2 = symbolList[Symb2] 28 return matrix[Indx1][Indx2] 29 30 def initGraph(l1,l2): 31 Graph = [numpy.zeros([l1,l2] ,dtype=int) for i in range(3)] 32 33 Graph[1][0][0] = negINFINITY 34 Graph[2][0][0] = negINFINITY 35 for x in range(1,l1): 36 Graph[0][x][0]=negINFINITY 37 if x==1: 38 Graph[1][x][0]=ge+gs 39 else: 40 Graph[1][x][0]=Graph[1][x-1][0]+ge 41 Graph[2][x][0]=negINFINITY 42 for y in range(1,l2): 43 Graph[0][0][y]=negINFINITY 44 if y ==1: 45 Graph[2][0][y]=ge+gs 46 else: 47 Graph[2][0][y]=Graph[2][0][y-1]+ge 48 Graph[1][0][y]=negINFINITY 49 return Graph 50 51 def Init_Path(l1,l2): 52 Path = [numpy.ones([l1,l2])*-1 for i in range(3)] 53 '''for x in range(1,l1): 54 Path[x][0] = 1 55 for y in range(1,l2): 56 Path[0][y] = 2''' 57 return Path 58 59 def buildAlignmentGraph(text1,text2,l1,l2): 60 61 Graph = initGraph(l1,l2) 62 #Path = #Init_Path(l1,l2) 63 for x in range(1,l1): 64 for y in range(1,l2): 65 # X ###### 66 Graph[1][x][y]=max(gs+ge+Graph[0][x-1][y],gs+ge+Graph[2][x-1][y],ge+Graph[1][x-1][y]) 67 68 69 # Y ###### 70 Graph[2][x][y]=max(gs+ge+Graph[0][x][y-1],gs+ge+Graph[1][x][y-1],ge+Graph[2][x][y-1]) 71 72 # M ###### 73 Graph[0][x][y]=Grade(text1[x], text2[y])+max(Graph[0][x-1][y-1],Graph[1][x-1][y-1],Graph[2][x-1][y-1]) 74 75 maxVal = 0 76 maxIndx = -1 77 for i in range(3): 78 if Graph[i][l1-1][l2-1]>maxVal: 79 maxVal=Graph[i][l1-1][l2-1] 80 maxIndx=i 81 return [Graph,maxIndx,maxVal] 82 83 def trackBack(Graph,maxIndx,text1,text2): 84 x = len(text1)-1 85 y = len(text2)-1 86 otext1 = '' 87 otext2 = '' 88 Indx = maxIndx 89 while(1): 90 if Indx==0: 91 otext1 += text1[x] 92 otext2 += text2[y] 93 if x ==1: 94 break 95 if Graph[0][x][y]==Graph[1][x-1][y-1]+Grade(text1[x], text2[y]): 96 Indx = 1 97 elif Graph[0][x][y]==Graph[2][x-1][y-1]+Grade(text1[x], text2[y]): 98 Indx = 2 99 else: 100 Indx = 0 101 x -= 1 102 y -= 1 103 elif Indx==1: 104 otext1 += text1[x] 105 otext2 += '-' 106 if x == 1: 107 break 108 if Graph[1][x][y]==Graph[0][x-1][y]+ge+gs: 109 Indx = 0 110 elif Graph[1][x][y]==Graph[2][x-1][y]+ge+gs: 111 Indx = 2 112 else: 113 Indx = 1 114 x -= 1 115 else: 116 otext1 += '-' 117 otext2 += text2[y] 118 if y == 1: 119 break 120 if Graph[2][x][y]==Graph[0][x][y-1]+ge+gs: 121 Indx = 0 122 elif Graph[2][x][y]==Graph[1][x][y-1]+ge+gs: 123 Indx = 1 124 else: 125 Indx = 2 126 y -= 1 127 128 return [otext1[::-1],otext2[::-1]] 129 130 def ImportScoreMatrix(): 131 dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n') 132 symbolList = dataset[0].split() 133 for i in range(len(symbolList)): 134 symbolList[i]=[symbolList[i],i] 135 symbolList = dict(symbolList) 136 matrix = [] 137 for i in range(1,len(dataset)): 138 matrix.append(dataset[i].split()[1:]) 139 for l in range(len(matrix)): 140 for i in range(len(matrix[l])): 141 matrix[l][i]=int(matrix[l][i]) 142 return [matrix,symbolList] 143 144 145 if __name__ == '__main__': 146 [matrix,symbolList] = ImportScoreMatrix() # 打分矩阵 147 148 dataset = open(dirname(__file__)+'dataset.txt').read().strip().split() 149 text1 = '0'+dataset[0] 150 text2 = '0'+dataset[1] 151 l1 = len(text1) 152 l2 = len(text2) 153 [Graph,maxIndx,maxVal] = buildAlignmentGraph(text1, text2, l1, l2) 154 #print(Graph) 155 156 [output_text1,output_text2] = trackBack(Graph,maxIndx,text1,text2) 157 print(maxVal) 158 print(output_text1) 159 print(output_text2) 160
6 * 一种线性空间的比对方法 Space-Efficient Sequence Alignment(分治+动态规划)
https://www.cs.rit.edu/~rlaz/algorithms20082/slides/SpaceEfficientAlignment.pdf
1 ''' 2 Code Challenge: Implement LinearSpaceAlignment to solve the Global Alignment Problem for a large dataset. 3 >>>Input: Two long (10000 amino acid) protein strings written in the single-letter amino acid alphabet. 4 >>>Output: The maximum alignment score of these strings, followed by an alignment achieving this maximum score. Use the 5 6 BLOSUM62 scoring matrix and indel penalty σ = 5. 7 ------------ 8 Sample Input: 9 PLEASANTLY 10 MEANLY 11 ------------ 12 Sample Output: 13 8 14 PLEASANTLY 15 -MEA--N-LY 16 ------------ 17 coder: Lo Kwongho in 2018.9 18 ''' 19 from os.path import dirname 20 import numpy 21 # 22 indel = -5 23 negINF = -9999 24 # 25 # 26 def Grade(Symb1,Symb2): 27 Indx1 = symbolList[Symb1] 28 Indx2 = symbolList[Symb2] 29 return matrix[Indx1][Indx2] 30 31 def ImportScoreMatrix(): 32 dataset = open(dirname(__file__)+'BLOSUM62.txt').read().strip().split('\n') 33 symbolList = dataset[0].split() 34 for i in range(len(symbolList)): 35 symbolList[i]=[symbolList[i],i] 36 symbolList = dict(symbolList) 37 matrix = [] 38 for i in range(1,len(dataset)): 39 matrix.append(dataset[i].split()[1:]) 40 for l in range(len(matrix)): 41 for i in range(len(matrix[l])): 42 matrix[l][i]=int(matrix[l][i]) 43 return [matrix,symbolList] 44 # 45 def half_Alignment(v,w): 46 nv = len(v) 47 mw = len(w) 48 s = numpy.zeros(shape=(nv+1,2),dtype=int) 49 for i in range(nv+1): 50 s[i,0] = indel*i 51 if mw==0: 52 return s[:,0] # 53 for j in range(mw): 54 s[0,1]=s[0,0]+indel 55 for i in range(nv): 56 s[i+1,1]=max(s[i,1]+indel,s[i+1,0]+indel,s[i,0]+Grade(w[j],v[i])) 57 s[:,0]=s[:,1] 58 return s[:,1] 59 60 def midEdge(v,w): 61 nv = len(v) 62 mw = len(w) 63 mid = int((mw-1)/2) 64 wl = w[:mid] 65 wr = w[mid+1:] 66 pre = half_Alignment(v,wl) 67 suf = half_Alignment(v[::-1],wr[::-1])[::-1] 68 hs = [pre[i]+suf[i]+indel for i in range(nv+1)] 69 ds = [pre[i]+suf[i+1]+Grade(w[mid],v[i]) for i in range(nv)] 70 maxhs = max(hs) 71 maxds = max(ds) 72 if maxhs>maxds: 73 return ( (hs.index(maxhs),mid) ,(hs.index(maxhs),mid+1) ) 74 else: 75 return ( (ds.index(maxds),mid) ,(ds.index(maxds)+1,mid+1) ) 76 77 def build_Alignment_track(v,w): 78 vn = len(v) 79 wm = len(w) 80 if vn==0 and wm==0: 81 return [] 82 elif vn==0: 83 return ['-']*wm 84 elif wm==0: 85 return ['|']*vn 86 ((x1,y1),(x2,y2)) = midEdge(v,w) 87 if x1==x2: 88 edge = ['-'] 89 else: 90 edge = ['\\'] 91 wleft = w[:y1] 92 wright = w[y2:] 93 vupper = v[:x1] 94 vbotm = v[x2:] 95 96 upper_left_track = build_Alignment_track(vupper,wleft) 97 bottom_right_track = build_Alignment_track(vbotm,wright) 98 return upper_left_track+edge+bottom_right_track 99 100 def trackToString(v,w,track): 101 vi = 0 102 wj = 0 103 outv = '' 104 outw = '' 105 score = 0 106 for i in track: 107 if i == '|': 108 outv += v[vi] 109 outw += '-' 110 score += indel 111 vi += 1 112 elif i == '-': 113 outv += '-' 114 outw += w[wj] 115 score += indel 116 wj += 1 117 else: 118 outv += v[vi] 119 outw += w[wj] 120 score += Grade(v[vi], w[wj]) 121 vi += 1 122 wj += 1 123 124 return [score,outv,outw] 125 126 def LinearSpaceAlignment(v,w): 127 track = build_Alignment_track(v,w) 128 [score,outv, outw] = trackToString(v,w,track) 129 print(score) 130 print(outv) 131 print(outw) 132 133 if __name__ == '__main__': 134 dataset = open(dirname(__file__)+'dataset.txt').read().strip().split() 135 [matrix,symbolList] = ImportScoreMatrix() 136 v = dataset[0] 137 w = dataset[1] 138 LinearSpaceAlignment(v,w)
posted on 2018-09-21 18:55 iojafekniewg 阅读(400) 评论(0) 编辑 收藏 举报