提取BioGRID中的基因symbol和得分所在列

使用多进程(multiprocessing 进程池)的方法,对BioGRID的 .tab2.txt 数据进行提取,主要提取第8、9、19列。

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import sys
import logging
import argparse
import click
from multiprocessing.pool import Pool
'''
@author: yueyao
@time: 2019/6/19
@file: formatdb.py
@mail: yueyaomail@gmail.com
'''
# Root click command group; sub-commands are attached to it with
# main.add_command(...). The callback itself intentionally does nothing.
@click.group()
def main():
    pass
@click.command()
@click.option("-i","--indir", help="input directory containing BioGRID *.tab2.txt files.")
@click.option("-o","--outdir", help="output directory for the *.format.txt files.")
@click.option("-p","--thread", type=int,default=6,help="number of worker processes.")
def BioGRID(indir,outdir,thread):
    '''
    fetch gene symbol as protein relationship.

    Every *.tab2.txt file in INDIR is converted by a pool of worker
    processes into OUTDIR/<name>.format.txt.
    '''
    if indir is None or outdir is None:
        click.echo('Usage:\n\tpython formatdb.py BioGRID --indir /path/ --outdir /path/ ')
        sys.exit(1)

    # List the inputs and prepare the output directory before spawning any
    # worker, so a bad path fails fast without leaving a pool behind.
    tab2_names = [name for name in os.listdir(indir) if name.endswith(".tab2.txt")]
    if not os.path.isdir(outdir):
        # Re-running into an existing output directory must not crash.
        os.makedirs(outdir)

    pool = Pool(int(thread))
    pending = []
    try:
        for tab2 in tab2_names:
            # The third '-'-separated field of the file name is used as the
            # output name; fall back to the name without its suffix when the
            # file name has fewer than three fields.
            parts = tab2.split('-')
            if len(parts) > 2:
                label = parts[2]
            else:
                label = tab2[:-len(".tab2.txt")]
            src = os.path.join(indir, tab2)
            dst = os.path.join(outdir, label + ".format.txt")
            pending.append((src, pool.apply_async(changeformat, args=(src, dst))))
        print("Waiting for all subprocess done...")
    finally:
        # Always stop accepting work and wait for the workers to finish.
        pool.close()
        pool.join()
    print("All subprocess done")

    # apply_async keeps worker exceptions inside the result object; fetch
    # every result so failures are reported instead of silently dropped.
    failed = 0
    for src, result in pending:
        try:
            result.get()
        except Exception as exc:
            failed += 1
            click.echo("Failed to convert %s: %s" % (src, exc), err=True)
    if failed:
        sys.exit(1)
def changeformat(file1,file2):
    '''
    Copy columns 8, 9 and 19 (1-based) of a tab-separated file to a new file.

    file1: path of the tab-separated input file.
    file2: path of the output file; it is created or overwritten.

    Each kept line is written as the three selected fields joined by tabs.
    Lines with fewer than 19 tab-separated fields (for example blank lines)
    are skipped instead of aborting the whole conversion.
    '''
    # Context managers close both files even if reading or writing fails.
    with open(file1,'r') as src, open(file2,'w') as dst:
        for line in src:
            # Remove only the line terminator: a plain strip() would also eat
            # leading/trailing tabs and shift or drop empty columns.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 19:
                continue
            dst.write("\t".join([fields[7],fields[8],fields[18]])+"\n")
# Attach the BioGRID sub-command to the root command group.
main.add_command(BioGRID)

# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()




posted @ 2021-08-26 15:47  raisok  阅读(87)  评论(0编辑  收藏  举报