oma同源基因查找

oma查找同源基因【最终需要从HOGFasta文件夹中提取文件】

#同源基因的另一种查找方法oma (https://omabrowser.org/standalone)#####################################################
wget -O oma.tgz https://omabrowser.org/standalone/OMA.2.4.2.tgz
# on MacOS you can also use curl: 
# curl https://omabrowser.org/standalone/OMA.2.4.2.tgz -o oma.tgz
tar xvzf oma.tgz
cd OMA.2.4.2
./install.sh prefix=files  #下载安装

# copy FASTA files into DB directory
cp /home/you/fasta/yourFirstGenomeFile.fa DB/
cp /home/you/fasta/yourSecondGenomeFile.fa DB/
cp /home/you/fasta/yourContigFile.contig.fa DB/ #将以.fa结尾的所有蛋白文件放在DB文件夹中
# generate default parameter file
oma -p #生成.drw结尾的配置文件
# adjust parameters
vim parameters.drw
# run oma
oma -n 10 #运行命令

#【ly-2021/10/22】 注意文件中有个参数UseOnlyOneSplicingVariant，我将其默认的ture改为了false，因为是true时，发现OrthologousGroupsFasta文件夹下的序列文件，每个文件对于每个物种都至多包含一条序列，这样我就无法获得对于4倍体，可以包含两条序列的情况。改了之后具体有什么变化，出来后再看【结果：跟上面结果一样，每个文件对于每个物种照样至多包含一条序列】
# 从HOGFasta文件夹下提取特定的序列文件

特别注意：

1，放在DB路径里的序列文件必须都是.fa结尾，否则会出错，提示找不到基因组序列；

2，修改parameters.drw的必须参数，一般修改outgroup就行(将outgroup对应的文件名改了，注意不要后缀)

结果文件：主要根据OrthologousGroupsFasta文件夹下的序列文件【值得注意的是，每个文件对于每个物种至多只包含一条序列】，提取单拷贝直系同源基因。【发现在文件夹HOGFasta下的序列文件中，可能存在一个文件包含一个物种的多条序列的情况，因此感觉可以从该文件提取特定拷贝数的文件】

提取但拷贝基因保存到已有的Single_Copy文件夹中：

#!/usr/bin/python
#ly-2021/10/08
#Goals: 提取符合条件的序列文件，存到拧一个文件夹中
#version_01
#Usage: python /home/data/vip10t01/manscript/py/06_extrSingle-copy.py

#################################
import os
import sys 
import shutil
path = "/home/data/vip10t01/project/03_dark/oma/Output/OrthologousGroupsFasta/"  #文件路径【修改】
desPath = "/home/data/vip10t01/project/03_dark/oma/Output/single_copy2/"   #提取出来后存放的路径【修改】
fileList = os.listdir(path)  #获取路径下的文件，存为列表
#print(fileList)

def get_files(path, desPath):
    for filename in fileList:  #遍历列表
        tmp = path + filename  #获得文件的绝对路径
        with open(tmp, "r") as file:  #打开文件
            id=0  #用于统计该文件有多少个“>”
            a=[]  #用于统计同一物种，是否只包含一条序列
            for line in file.readlines():  #读取文件
                if ">" in line:  
                    id += 1 #统计该文件有多少个“>”
                    line_str=line[0:4] #这里，我们只要知道前4个字符相同，那么就可以确定这两条序列属于同一物种
            #       print(line_str)
                    a.append(line_str)
            #print(a)
            #print(id)
            if id == 9 and len(a)==len(set(a)): #满足两个条件：1、包含9个“>”；2、每个物种只有一条序列
                shutil.copy(tmp, desPath)  #满足条件的文件复制到指定文件夹中
            #   a=[]
            #   for line in file.readlines():
            #       line_str = line[0:2]
            #       print(line)
            #       if line_str not in a:
            #           a.append(line_str)
            #   if len(a) == 9:
            #       shutil.copy(tmp, desPath)

get_files(path, desPath)

#!/usr/bin/python
#ly2021/10/23 修改第一版，实用性更好
import os
import sys
import shutil
#path = "/home/data/vip10t01/project/03_dark/oma/Output/OrthologousGroupsFasta/"
path = "/home/data/vip10t01/project/03_dark/oma_2/Output/OrthologousGroupsFasta/"

#desPath = "/home/data/vip10t01/project/03_dark/oma/Output/single_copy3/"
#desPath = "/home/data/vip10t01/project/03_dark/oma_2/Output/single_copy/"
desPath = "/home/data/vip10t01/project/03_dark/oma_2/Output/single_copy_1/"

fileList = os.listdir(path)
#print(fileList)

def get_files(path, desPath):
    for filename in fileList:
        tmp = path + filename
        with open(tmp, "r") as file:
            id=0
            a=[]
            for line in file.readlines():
                if ">" in line:
                    id += 1
            #       line_str=line[0:8]
                    line_str=line
            #       print(line_str)
                    a.append(line_str)
#           print(a)
#           print(id)
            req1 = "A_mexicanus"
            req2 = "C_idella"
            req3 = "L_oculatus"
            req4 = "L_tanaka"
            req5 = "P_nattereri"
            req6 = "P_swirei"
            req7 = "P_transmontana"
            req8 = "S_anshuiensis"
            req9 = "T_subterraneus"

            if id == 9 and len(a)==len(set(a)) and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req8 in str(a) and req9 in str(a):
                shutil.copy(tmp, desPath)
            #   a=[]
            #   for line in file.readlines():
            #       line_str = line[0:2]
            #       print(line)
            #       if line_str not in a:
            #           a.append(line_str)
            #   if len(a) == 9:
            #       shutil.copy(tmp, desPath)

get_files(path, desPath)

#!/usr/bin/python
#ly-2021/10/22 修改第二版，可以使多倍体保留两条序列
import os
import sys
import shutil
#path = "/home/data/vip10t01/project/03_dark/oma/Output/OrthologousGroupsFasta/"
path = "/home/data/vip10t01/project/03_dark/oma_2/Output/OrthologousGroupsFasta/"

#desPath = "/home/data/vip10t01/project/03_dark/oma/Output/single_copy3/"
desPath = "/home/data/vip10t01/project/03_dark/oma_2/Output/single_copy/"
#desPath = "/home/data/vip10t01/project/03_dark/oma_2/Output/single_copy_1/"

fileList = os.listdir(path)
#print(fileList)

def get_files(path, desPath):
    for filename in fileList:
        tmp = path + filename
        with open(tmp, "r") as file:
            id=0
            a=[]
            for line in file.readlines():
                if ">" in line:
                    id += 1
            #       line_str=line[0:8]
                    line_str=line
            #       print(line_str)
                    a.append(line_str)
#           print(a)
            print(id)
            req1 = "A_mexicanus"
            req2 = "C_idella"
            req3 = "L_oculatus"
            req4 = "L_tanaka"
            req5 = "P_nattereri"
            req6 = "P_swirei"
            req7 = "P_transmontana"
            req8 = "S_anshuiensis"
            req9 = "T_subterraneus"

#           if id == 9 and len(a)==len(set(a)) and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req8 in str(a) and req9 in str(a) or id == 10 and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req8 in str(a) and req9 in str(a) and str(a).count(req8) == 2:
            if id == 10 and str(a).count(req8) == 2 and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req9 in str(a):
                shutil.copy(tmp, desPath)
            #   a=[]
            #   for line in file.readlines():
            #       line_str = line[0:2]
            #       print(line)
            #       if line_str not in a:
            #           a.append(line_str)
            #   if len(a) == 9:
            #       shutil.copy(tmp, desPath)

get_files(path, desPath)

#ly-2021/11/01 根据需要，提取文件

#!/usr/bin/python

import os
import sys
import shutil
#path = "/home/data/vip10t01/project/03_dark/oma/Output/OrthologousGroupsFasta/"
#path = "/home/data/vip10t01/project/03_dark/oma_2/Output/OrthologousGroupsFasta/"
path = "/home/data/vip10t01/project/03_dark/oma_2/Output/HOGFasta/"

#desPath = "/home/data/vip10t01/project/03_dark/oma/Output/single_copy3/"
#desPath = "/home/data/vip10t01/project/03_dark/oma_2/Output/single_copy3/"
desPath = "/home/data/vip10t01/project/03_dark/oma_2/Output/single_copy_1/"

fileList = os.listdir(path)
#print(fileList)

def get_files(path, desPath):
    for filename in fileList:
        tmp = path + filename
        with open(tmp, "r") as file:
            id=0
            a=[]
            for line in file.readlines():
                if ">" in line:
                    id += 1
            #       line_str=line[0:8]
                    line_str=line
            #       print(line_str)
                    a.append(line_str)
#           print(a)
            print(id)
            req1 = "A_mexicanus"
            req2 = "C_idella"
            req3 = "L_oculatus"
            req4 = "L_tanaka"
            req5 = "P_nattereri"
            req6 = "P_swirei"
            req7 = "P_transmontana"
            req8 = "S_anshuiensis"
            req9 = "T_subterraneus"

            if id == 9 and len(a)==len(set(a)) and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req8 in str(a) and req9 in str(a) or id == 10 and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req8 in str(a) and req9 in str(a) and str(a).count(req8) == 2:

#           if id == 9 and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req8 in str(a) and req9 in str(a):

#           if id == 10 and str(a).count(req8) == 2 and req1 in str(a) and req2 in str(a) and req3 in str(a) and  req4 in str(a) and req5 in str(a) and req6 in str(a) and req7 in str(a) and req9 in str(a):
                shutil.copy(tmp, desPath)
            #   a=[]
            #   for line in file.readlines():
            #       line_str = line[0:2]
            #       print(line)
            #       if line_str not in a:
            #           a.append(line_str)
            #   if len(a) == 9:
            #       shutil.copy(tmp, desPath)

get_files(path, desPath)