14 Finding a Shared Motif

Problem

A common substring of a collection of strings is a substring of every member of the collection. We say that a common substring is a longest common substring if there does not exist a longer common substring. For example, "CG" is a common substring of "ACGTACGT" and "AACCGTATA", but it is not as long as possible; in this case, "CGTA" is a longest common substring of "ACGTACGT" and "AACCGTATA".

Note that the longest common substring is not necessarily unique; for a simple example, "AA" and "CC" are both longest common substrings of "AACC" and "CCAA".

Given: A collection of $k$ ( $k \leq 100$ ) DNA strings of length at most 1 kbp each in FASTA format.

Return: A longest common substring of the collection. (If multiple solutions exist, you may return any single solution.)

Sample Dataset

>Rosalind_1
GATTACA
>Rosalind_2
TAGACCA
>Rosalind_3
ATACA

Sample Output

AC


# Method 1

# coding=utf-8
'''
>Rosalind_1
GATTACA
>Rosalind_2
TAGACCA
>Rosalind_3
ATACA
'''

def readfasta(filename, sample):
    """Parse a FASTA file and return its sequences, echoing them to ``sample``.

    Lines starting with ``>`` open a new record; every following line is
    appended to that record's sequence, so multi-line sequences are joined.
    Blank lines are ignored and trailing whitespace (including a stray
    carriage return) is removed from every line. If the same header line
    occurs twice, the later record replaces the earlier one.

    Args:
        filename: Path of the FASTA file to read.
        sample: Path of a text file that receives one sequence per line.

    Returns:
        The sequences as a list of strings, in order of first header
        appearance.

    Raises:
        KeyError: If sequence data appears before the first header line.
    """
    res = {}
    ID = ''
    with open(filename, 'r') as fa:
        for line in fa:
            line = line.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                ID = line
                res[ID] = ''
            else:
                res[ID] += line

    rres = list(res.values())
    with open(sample, 'w') as fo:
        fo.writelines(sequence + '\n' for sequence in rres)
    return rres


def fragement(seq_list):
    """Return every non-empty substring of the first sequence in ``seq_list``.

    Substrings are grouped by start position (left to right); within one
    start position they run from the longest down to the single character.
    Duplicates are kept.
    """
    first = seq_list[0]
    total = len(first)
    return [
        first[begin:stop]
        for begin in range(total)
        for stop in range(total, begin, -1)
    ]


def main(infile, sample):
    """Print one longest substring shared by every sequence read from ``infile``.

    The sequences come from ``readfasta(infile, sample)``. Candidate
    substrings are taken from the first sequence via ``fragement`` and tried
    from longest to shortest; the first candidate found in all sequences is
    printed. Nothing is printed when the input holds no sequences or when
    the sequences share no substring.

    Args:
        infile: Path of the FASTA input file.
        sample: Path handed to ``readfasta`` as its second argument.
    """
    seq_list = readfasta(infile, sample)
    if not seq_list:
        # No records: there is no first sequence to draw candidates from.
        return

    # sorted() is stable, so candidates of equal length keep their
    # left-to-right order from the first sequence.
    frags = sorted(fragement(seq_list), key=len, reverse=True)
    for frag in frags:
        # Membership is enough; the number of occurrences is irrelevant.
        if all(frag in sequence for sequence in seq_list):
            print(frag)
            break


# Run method 1 at import time: '14.txt' is the FASTA input and 'sample.txt'
# is forwarded by main() to readfasta() as its output path.
main('14.txt', 'sample.txt')

# Method 2 (the author did not fully understand this one)

# coding=utf-8
'''
A solution to a ROSALIND bioinformatics problem.
Problem Title: Finding a Shared Motif
Rosalind ID: LCSM
Rosalind #: 014
URL: http://rosalind.info/problems/lcsm/
'''


def LongestSubstring(string_list):
    """Return a longest substring common to every string in ``string_list``.

    Candidates are slices of the first string. For each start position the
    slices are tried from longest to shortest, and only slices strictly
    longer than the best match so far are considered. Among equally long
    answers, the one starting earliest in the first string is returned.

    Args:
        string_list: Sequence of strings to compare.

    Returns:
        A longest common substring, or ``''`` when ``string_list`` is empty
        or the strings share no substring.
    """
    if not string_list:
        return ''

    reference = string_list[0]
    longest = ''
    for start_index in range(len(reference)):
        # Stopping at start_index + len(longest) skips every slice that
        # could not beat the current best.
        for end_index in range(len(reference), start_index + len(longest), -1):
            candidate = reference[start_index:end_index]
            if CheckSubstring(candidate, string_list):
                longest = candidate
                # Remaining slices from this start are shorter.
                break

    return longest


def CheckSubstring(find_string, string_list):
    """Return True when ``find_string`` occurs in every string of ``string_list``.

    An empty ``string_list`` yields True. A candidate longer than one of the
    strings can never be contained in it, so the membership test alone
    covers the length comparison.
    """
    return all(find_string in member for member in string_list)


# Read '14.txt' as FASTA into seq: header line -> upper-cased sequence.
seq = {}
seq_name = ''
with open('14.txt') as f:
    for line in f:
        if not line.startswith('>'):
            # Sequence data: extend the record opened by the latest header.
            seq[seq_name] += line.rstrip().upper()
        else:
            # Header line: open a new record keyed by the header text.
            seq_name = line.rstrip()
            seq[seq_name] = ''

print(seq)

if __name__ == '__main__':
    # Gather the parsed sequences, find their longest shared substring,
    # print it and save it to '014_LCSM.txt'.
    dna = list(seq.values())

    lcsm = LongestSubstring(dna)
    print(lcsm)
    with open('014_LCSM.txt', 'w') as output_data:
        output_data.write(lcsm)