关于结核分枝杆菌的进化中,最简单的判断菌株受到选择压力的方法就是看是否存在明显的平行进化特征。
因此,利用进化树和相关测序数据,可以进行相关判断。

import sys
import re
from collections import Iterable

#将进化树进行简化
tree=""
for row in open(sys.argv[1]).readlines():
string="tree tree"
if string in row:
tree=row.rstrip()
simplified_tree=re.compile('(\:\d+\.\d+|E\-\d)').sub('',tree).replace(';','').replace('-','').replace('_','')
strains_input=[]
for row in open(sys.argv[2]).readlines():
strains_input.append(row.rstrip().replace('-','').replace('_',''))
if simplified_tree.startswith('('):
strains_in_tree=simplified_tree.replace('(','').replace(')','').split(',')
else:
strains_in_tree=simplified_tree.replace('(','').replace(')','')[19:].split(',')

#判断是否存在平行进化特征
def ei(strains):
if not isinstance(strains,list):
return "Input error"
if len(strains)<2:
return "Input strains < 2"
name_len=0
name_sum={}
for row in strains:
name_len+=len(row)
name_sum[row]=simplified_tree.index(row)
sort_name=sorted(name_sum.items(),key=lambda e:e[1])
min_index=sort_name[0][1]-1
max_index=sort_name[-1][1]+len(sort_name[-1][0])
if simplified_tree[min_index] == "(" and simplified_tree[max_index] == ")" : #( )
substract=''
for i in range(min_index,max_index+1):
substract+=simplified_tree[i]
if substract.count("(") != substract.count(")"): #(...) (((,,)), 和 ,((...)
if substract.count("(") > substract.count(")") and simplified_tree[max_index+1] != ")" :
return "Evovle independently !"
if substract.count("(") < substract.count(")") and simplified_tree[min_index-1] != "(" :
return "Evovle independently !"
if len(substract.replace(',','').replace('(','').replace(')','')) == name_len: # (...)内没有其他菌株
return "Not evovle independently !"
else:
return "Evovle independently !"
else: #(A,(B,)) ((,B),C) (A,(,),D)
return "Evovle independently !"

print(sys.argv[2]+ " " +ei(strains_input))

def positive(adjacent_strains):
count=1
i=2
while i <= len(adjacent_strains):
if ei(adjacent_strains[:2]) != "Evovle independently !":
i+=1
else:
count+=1
adjacent_strains=adjacent_strains[i-1:]
i=2
return count

def negative(adjacent_strains):
count=1
j=len(adjacent_strains)
while j >=2 :
if ei(adjacent_strains[j-2:]) != "Evovle independently !":
j-=1
else:
count+=1
adjacent_strains=adjacent_strains[:j-1]
j=len(adjacent_strains)
return count

def adjacent_judge(adjacent_strains): #树展开按一个方向的顺序,对于一个簇可能是由里往外,又有可能由外往里,因此取两者最小值是正确选择
if ei(adjacent_strains) != "Evovle independently !":
return 1
else:
return min(negative(adjacent_strains),positive(adjacent_strains))

#取出input strain中相邻的进行上面判断
if ei(strains_input) == "Evovle independently !" :
count=0
sort_index=sorted([strains_in_tree.index(i) for i in strains_input])
adjacent_substract=[sort_index[i]-sort_index[i-1] for i in range(1,len(sort_index))]+[0]
separate=[]
adjacent_strains=[]
for i in range(len(adjacent_substract)):
if adjacent_substract[i]==1:
adjacent_strains.append(strains_in_tree[sort_index[i]])
elif adjacent_substract[i-1]==1:
adjacent_strains.append(strains_in_tree[sort_index[i]])
separate.append(adjacent_strains)
adjacent_strains=[]
else:
adjacent_strains.append(strains_in_tree[sort_index[i]])
separate.append(adjacent_strains)
adjacent_strains=[]
for item in separate:
if len(item) == 1:
count+=1
else:
count+=adjacent_judge(item)
print(sys.argv[2]+ " Evovle independently " + str(count) + " times !")

写的比较简陋,也很粗糙,之后可能会进行修改。