提取BioGRID中的基因symbol和得分所在列
使用多进程(multiprocessing.Pool)的方法,对BioGRID的数据进行提取,主要提取第8、9、19列
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import sys
import logging
import argparse
import click
from multiprocessing.pool import Pool
'''
@author: yueyao
@time: 2019/6/19
@file: formatdb.py
@mail: yueyaomail@gmail.com
'''
# Root click command group of this script. Its body is intentionally empty;
# subcommands are attached to it with main.add_command() further down the file.
# NOTE(review): documented with comments rather than a docstring because click
# is understood to display a command's docstring as its --help text -- confirm
# before adding one.
@click.group()
def main():
    pass
# Developer notes (kept out of the docstring, which click displays as --help):
#   indir  -- directory scanned (non-recursively) for files ending ".tab2.txt".
#   outdir -- directory receiving one "<name>.format.txt" per input file, where
#             <name> is the third "-"-separated field of the input file name.
#             It is created when missing and reused when it already exists.
#   thread -- number of worker processes in the multiprocessing pool.
# The command exits with status 1 when a required option is missing or when
# the conversion of any file fails.
@click.command()
@click.option("-i", "--indir", help="directory containing BioGRID *.tab2.txt files.")
@click.option("-o", "--outdir", help="directory to write the *.format.txt files to.")
@click.option("-p", "--thread", type=int, default=6, help="number of worker processes.")
def BioGRID(indir, outdir, thread):
    """Extract the gene symbol and score columns from BioGRID tab2 files."""
    if indir is None or outdir is None:
        click.echo('Usage:\n\tpython formatdb.py BioGRID --indir /path/ --outdir /path/ ')
        sys.exit(1)

    # Build the complete job list before any worker process is started, so a
    # failure while listing the input directory cannot leave a pool behind.
    jobs = []
    for tab2 in sorted(os.listdir(indir)):
        if not tab2.endswith(".tab2.txt"):
            continue
        parts = tab2.split('-')
        if len(parts) < 3:
            # The output name is taken from the third "-"-separated field;
            # without it there is nothing to name the output after.
            click.echo(
                "Skipping {}: file name has no third '-'-separated field".format(tab2),
                err=True,
            )
            continue
        jobs.append((
            os.path.join(indir, tab2),
            os.path.join(outdir, parts[2] + ".format.txt"),
        ))

    # Re-running the command into an existing output directory must not crash.
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    pool = Pool(int(thread))
    try:
        pending = [
            (src, pool.apply_async(changeformat, args=(src, dst)))
            for src, dst in jobs
        ]
        print("Waiting for all subprocess done...")
        pool.close()
        pool.join()
    finally:
        # Harmless after a normal close()/join(); on an error it makes sure
        # no worker process is left running.
        pool.terminate()

    # apply_async() stores a worker's exception in its AsyncResult; it only
    # becomes visible through get(), so check every result instead of
    # reporting success unconditionally.
    failures = 0
    for src, result in pending:
        try:
            result.get()
        except Exception as exc:  # CLI boundary: report every failed file
            failures += 1
            click.echo("Failed to convert {}: {}".format(src, exc), err=True)
    if failures:
        sys.exit(1)
    print("All subprocess done")
def changeformat(file1, file2):
    """Copy columns 8, 9 and 19 of a tab-separated file into a new file.

    Every non-blank line of ``file1`` is split on tabs and its 8th, 9th and
    19th fields (1-based) are written to ``file2`` as one tab-separated line.
    Blank lines are skipped.

    Args:
        file1: path of the tab-separated input file.
        file2: path of the output file; it is created or overwritten.

    Raises:
        IndexError: if a non-blank line has fewer than 19 columns.
    """
    # ``with`` guarantees both handles are closed even if a line is malformed.
    with open(file1, 'r') as src, open(file2, 'w') as dst:
        for lineno, raw in enumerate(src, 1):
            if not raw.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            # Remove only the line terminator: a plain strip() would also eat
            # trailing tabs and thereby drop empty trailing fields.
            fields = raw.rstrip("\r\n").split("\t")
            if len(fields) < 19:
                raise IndexError(
                    "{}: line {} has {} columns, expected at least 19".format(
                        file1, lineno, len(fields)
                    )
                )
            dst.write("\t".join((fields[7], fields[8], fields[18])) + "\n")
# Register the BioGRID command as a subcommand of the `main` group.
main.add_command(BioGRID)
# Start the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()