提取BioGRID中的基因symbol和得分所在列

使用多进程(multiprocessing.Pool)的方法,对BioGRID的数据进行提取,主要提取第8、9、19列

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import sys
import logging
import argparse
import click
from multiprocessing.pool import Pool
'''
@author: yueyao
@time: 2019/6/19
@file: formatdb.py
@mail: yueyaomail@gmail.com
'''
# Top-level click command group; sub-commands are registered on it with
# main.add_command() further down in this file.
# NOTE: a docstring here would be shown by click as the group's --help text,
# so the explanation is kept in comments instead.
@click.group()
def main():
    # Intentionally empty: the group itself does no work, it only dispatches
    # to the sub-command selected on the command line.
    pass
# CLI sub-command: reformat every BioGRID "*.tab2.txt" file found in a directory.
# NOTE: click prints the function docstring and the option `help=` strings in
# the --help output, so those are kept short and user-facing; the detailed
# documentation lives in comments.
@click.command()
@click.option("-i","--indir", help="input directory containing BioGRID *.tab2.txt files.")
@click.option("-o","--outdir", help="output directory for the *.format.txt files.")
@click.option("-p","--thread", type=int,default=6,help="number of worker processes.")
def BioGRID(indir,outdir,thread):
    '''
    fetch gene symbol as protein relationship.
    '''
    # Parameters:
    #   indir  -- directory scanned (non-recursively) for names ending in ".tab2.txt"
    #   outdir -- directory that receives "<name>.format.txt"; created if missing
    #   thread -- size of the multiprocessing pool (worker processes)
    # Exits with status 1 when a required option is missing or when any file
    # failed to convert.
    if indir is None or outdir is None:
        # click derives the command name from the function name lower-cased,
        # hence "biogrid"; the options are spelled --indir / --outdir.
        click.echo('Usage:\n\tpython formatdb.py biogrid --indir /path/ --outdir /path/ ')
        sys.exit(1)

    # exist_ok: do not crash when the output directory is already there (re-run).
    os.makedirs(outdir, exist_ok=True)
    tab2_names = [name for name in os.listdir(indir) if name.endswith(".tab2.txt")]

    failed = False
    # The context manager guarantees the workers are torn down even if
    # submitting a job raises.
    with Pool(int(thread)) as pool:
        jobs = []
        for tab2 in tab2_names:
            # The third '-'-separated token of the file name is used as the
            # output name; skip files that do not have one instead of crashing.
            parts = tab2.split('-')
            if len(parts) < 3:
                click.echo("Skip %s: unexpected file name." % tab2, err=True)
                continue
            src = os.path.join(indir, tab2)
            dst = os.path.join(outdir, parts[2] + ".format.txt")
            jobs.append((tab2, pool.apply_async(changeformat, args=(src, dst))))
        print("Waiting for all subprocess done...")
        pool.close()
        pool.join()
        # apply_async swallows worker exceptions unless the result is fetched,
        # so fetch every result and report the failures.
        for tab2, job in jobs:
            try:
                job.get()
            except Exception as err:
                failed = True
                click.echo("Failed to convert %s: %s" % (tab2, err), err=True)
    print ("All subprocess done")
    if failed:
        sys.exit(1)
def changeformat(file1,file2):
    '''
    Copy columns 8, 9 and 19 (1-based) of a tab-separated file to a new file.

    Every input line that has at least 19 tab-separated fields produces one
    output line made of those three fields joined by tabs. Lines with fewer
    fields (for example blank lines) are skipped.

    file1: path of the tab-separated input file.
    file2: path of the output file; it is created or overwritten.
    '''
    # `with` closes both files even when reading or writing raises.
    with open(file1,'r') as src, open(file2,'w') as dst:
        for line in src:
            # Strip only the line terminator: a plain strip() would also eat
            # leading/trailing tabs and shift the column positions whenever
            # the first or last field is empty.
            fields=line.rstrip("\r\n").split("\t")
            if len(fields) < 19:
                continue
            dst.write("\t".join([fields[7],fields[8],fields[18]])+"\n")
# Attach the BioGRID command to the `main` group so it can be selected as a
# sub-command on the command line.
main.add_command(BioGRID)
# Start the click CLI only when this file is executed as a script.
if __name__ == '__main__':
    main()




原文地址:https://www.cnblogs.com/raisok/p/15190006.html