依据gff切fa并翻译为蛋白质

#!/usr/bin/python
import re
import sys
import gzip

change={'A':'T','T':'A','C':'G','G':'C','N':'N'}

CODE = {
    'GCA' : 'A', 'GCC' : 'A', 'GCG' : 'A', 'GCT' : 'A',                     
    'TGC' : 'C', 'TGT' : 'C',                                                           # Cysteine
    'GAC' : 'D', 'GAT' : 'D',                                                           # Aspartic Acid
    'GAA' : 'E', 'GAG' : 'E',                                                           # Glutamic Acid
    'TTC' : 'F', 'TTT' : 'F',                                                           # Phenylalanine
    'GGA' : 'G', 'GGC' : 'G', 'GGG' : 'G', 'GGT' : 'G',                               # Glycine
    'CAC' : 'H', 'CAT' : 'H',                                                           # Histidine
    'ATA' : 'I', 'ATC' : 'I', 'ATT' : 'I',                                             # Isoleucine
    'AAA' : 'K', 'AAG' : 'K',                                                           # Lysine
    'CTA' : 'L', 'CTC' : 'L', 'CTG' : 'L', 'CTT' : 'L', 'TTA' : 'L', 'TTG' : 'L',   # Leucine
    'ATG' : 'M',                                                                         # Methionine
    'AAC' : 'N', 'AAT' : 'N',                                                           # Asparagine
    'CCA' : 'P', 'CCC' : 'P', 'CCG' : 'P', 'CCT' : 'P',                               # Proline
    'CAA' : 'Q', 'CAG' : 'Q',                                                           # Glutamine
    'CGA' : 'R', 'CGC' : 'R', 'CGG' : 'R', 'CGT' : 'R', 'AGA' : 'R', 'AGG' : 'R',   # Arginine
    'TCA' : 'S', 'TCC' : 'S', 'TCG' : 'S', 'TCT' : 'S', 'AGC' : 'S', 'AGT' : 'S',   # Serine
    'ACA' : 'T', 'ACC' : 'T', 'ACG' : 'T', 'ACT' : 'T',                               # Threonine
    'GTA' : 'V', 'GTC' : 'V', 'GTG' : 'V', 'GTT' : 'V',                               # Valine
    'TGG' : 'W',                                                                         # Tryptophan
    'TAC' : 'Y', 'TAT' : 'Y',                                                           # Tyrosine
    'TAA' : '*', 'TAG' : '*', 'TGA' : '*'                                              # Stop
}




def readfa(l):
    col={}
    arr =[]
    sca =''
    li= gzip.open(l,'rb')
    for line in li:
        if '>' in line:
            arr =[]
            sca = line.split()[0].lstrip('>')
            col[sca]=arr
        else:
            without = re.sub(r'
',"",line)
            arr.append(without)
    return col



def readgff(l):
    col ={}
    arr =[]
    li= gzip.open(l,'rb')
    for line in li:
        sp = line.split( )
        if sp[2] == 'mRNA':
            gene = re.match(r'ID=(.*?);',sp[8]).group(1)
            arr=[]
            col[gene]=[arr,sp[0],sp[6]]
#            start=sp[3]
        elif sp[2] == 'CDS':
            gene = re.match(r'Parent=(.*?);',sp[8]).group(1)
            col[gene][0].append([int(sp[3])-1,int(sp[4])])
    return col
#main###


out= gzip.open(sys.argv[3],'wb')
gff=readgff(sys.argv[2])
c=gff
s=''
fa =readfa(sys.argv[1])


for k1,v1 in gff.items():
    if v1[1] in fa.keys():
        lon=s.join(fa[v1[1]])
        short=''
        for i in v1[0]:
            short +=lon[i[0]:i[1]]
        if v1[2] == '-':
            short=''.join(change[i] for i in short)[::-1]
        j=0
        AA=''
        while j <= (len(short)-3):
            sp = short[j:3+j]
            if 'N' in sp:
                j=j+3
                continue
            else:
                AA += CODE[sp]
                j=j+3
        print >>out,">",k1,"
",AA
    else:
        print "no fa"
原文地址:https://www.cnblogs.com/yuanjingnan/p/11172734.html