DNA序列组装(贪婪算法)
生物信息学原理作业第四弹:DNA序列组装(贪婪算法)
原理:生物信息学(孙啸)
大致思想:
1. 找到权值最大的边;
2. 除去以最大权值边的起始顶点为起始顶点的边;
3. 除去以最大权值边为终点为终点的边;
4. 重复上述步骤,得到所有符合条件的边;
5. 拼接得到的边;
6. 加入孤立点(如果有)。
附上Python代码,如果有问题我会及时更正(确实不太熟算法)
转载请保留出处!
1 # -*- coding: utf-8 -*-
2 """
3 Created on Mon Dec 4 15:04:39 2017
4 @author: zxzhu
5 python3.6
6 """
7 from functools import reduce
8 def get_weight(s1,s2): #通过两条序列的overlap计算出权值
9 l = min(len(s1),len(s2))
10 while l>0:
11 if s2[:l] == s1[-l:]:
12 return l
13 else:
14 l-=1
15 return 0
16
17 def print_result(s1,s2): #将两条序列去除首尾overlap后合并
18 weight = get_weight(s1,s2)
19 s = s1 + s2[weight:]
20 #print(s)
21 return s
22
23 def dir_graph(l,t=3): #得到满足条件的有向图(权值大于等于t)
24 graph = {}
25 for i in l:
26 for j in l:
27 if i!=j:
28 weight = get_weight(i,j)
29 if weight >= t:
30 key = (i,j)
31 graph[key] = weight
32 return graph
33
34 def rm_path(graph,path): #贪婪算法加入一条边后应该去除与该边首尾顶点相同的边
35 key = graph.keys()
36 rm_key = []
37 for i in key:
38 if i[1] == path[1] or i[0] == path[0]:
39 rm_key.append(i)
40 for i in rm_key:
41 graph.pop(i)
42 return graph
43
44 def get_path(graph,path = []): #得到满足条件的所有边
45 while graph:
46 max_weight = 0
47 for i in graph.keys():
48 if graph[i] > max_weight:
49 max_weight = graph[i]
50 cur_path = i
51 path.append(cur_path)
52 graph = rm_path(graph,cur_path)
53 get_path(graph,path)
54 return path
55
56 def out_num(path,V): #计算某顶点的出度
57 count = 0
58 for i in path:
59 if i[0] == V:
60 count+=1
61 return count
62
63 def get_last_V(path,last_V = None): #得到最后一条边
64 index = 0
65 if last_V: #非随机寻找出度为0的顶点
66 for i in path:
67 if i[1] == last_V:
68 return i,index
69 else:
70 index+=1
71 return None #没有找到指向last_V的顶点(一条路径结束)
72 else: #随机寻找出度为0的顶点
73 for i in path:
74 if out_num(path,i[1]) == 0:
75 return i,index
76 else:
77 index+=1
78 return -1 #首尾相连
79
80
81 def assemble(cur_V,path,new_path = []): #给满足条件的边排序
82 while path:
83 path.pop(cur_V[1])
84 new_path.insert(0,cur_V[0])
85 cur_V = get_last_V(path,last_V = cur_V[0][0])
86 if cur_V:
87 assemble(cur_V,path,new_path)
88 else:
89 cur_V = get_last_V(path)
90 assemble(cur_V,path,new_path)
91 return new_path
92
93 def align_isolated(path,sequence): #加入孤立顶点
94 new_path = reduce(lambda x,y:x+y,path)
95 for i in sequence:
96 if i not in new_path:
97 new_path.append(i)
98 return new_path
99
100 x = 'CCTTTTGG'
101 y = 'TTGGCAATCACT'
102 w = 'AGTATTGGCAATC'
103 u = 'ATGCAAACCT'
104 z = 'AATCGATG'
105 v = 'TCACTCCTTTT'
106 a = w
107 b = y
108 c = 'TCACTAGTA'
109 sequence = [x,y,w,u,z]
110 sequence1 = [a,b,c]
111 graph = dir_graph(sequence1,t=3)
112 print(graph)
113 path = get_path(graph)
114 path = [list(i) for i in path] #将path中的tuple元素换成list
115 #print(path)
116 start = get_last_V(path) #起始出度为0的顶点所在的边
117 if start == -1: #序列首尾相连
118 new_path = reduce(lambda x,y:x+y, path)
119 new_path = new_path[:-1]
120 result = reduce(print_result,new_path)
121 else:
122 new_path = assemble(start,path) #排序后的边
123 new_path = align_isolated(new_path,sequence1) #加入孤立顶点
124 #print(new_path)
125 result = reduce(print_result,new_path) #组装
126 #print(new_path)
127 print(result)
Talk is cheap,show me your code!