#!/usr/bin/python
"""Solution to the ORF Rosalind problem - 'Open Reading Frames'.

Reads a DNA sequence from ``rosalind_orf.txt``, translates every open
reading frame found on the sequence and on its reverse complement, and
prints each distinct protein string once.

Author: Peter Vlasveld
"""

# RNA codon -> one-letter amino acid code; "*" marks a stop codon.
# Built once at import time instead of on every orfGen() call.
_CODON_TABLE = {
    "AAA": "K", "AAG": "K",
    "GAA": "E", "GAG": "E",
    "AAC": "N", "AAU": "N",
    "GAC": "D", "GAU": "D",
    "ACA": "T", "ACC": "T", "ACG": "T", "ACU": "T",
    "GCA": "A", "GCC": "A", "GCG": "A", "GCU": "A",
    "GGA": "G", "GGC": "G", "GGG": "G", "GGU": "G",
    "GUA": "V", "GUC": "V", "GUG": "V", "GUU": "V",
    "AUG": "M",
    "UAA": "*", "UAG": "*", "UGA": "*",
    "AUC": "I", "AUU": "I", "AUA": "I",
    "UAC": "Y", "UAU": "Y",
    "CAA": "Q", "CAG": "Q",
    "AGC": "S", "AGU": "S",
    "UCA": "S", "UCC": "S", "UCG": "S", "UCU": "S",
    "CAC": "H", "CAU": "H",
    "UGC": "C", "UGU": "C",
    "CCA": "P", "CCC": "P", "CCG": "P", "CCU": "P",
    "UGG": "W",
    "AGA": "R", "AGG": "R",
    "CGA": "R", "CGC": "R", "CGG": "R", "CGU": "R",
    "UUA": "L", "UUG": "L",
    "CUA": "L", "CUC": "L", "CUG": "L", "CUU": "L",
    "UUC": "F", "UUU": "F",
}

# DNA base -> complementary base. Characters missing from this table are
# passed through unchanged by DNACompliment().
_DNA_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G"}


def orfGen(stri):
    """Return the protein strings of every open reading frame in ``stri``.

    Every position whose codon is the start codon ``AUG`` opens a
    candidate frame. The frame is translated codon by codon and is kept
    only if a stop codon is reached before the sequence runs out.

    Args:
        stri: RNA sequence made of the characters ``A``, ``C``, ``G``, ``U``.

    Returns:
        A list of protein strings (stop codon not included), ordered by
        the position of their start codon. Duplicates are not removed.

    Raises:
        KeyError: if a codon that gets translated contains a character
            other than ``A``, ``C``, ``G`` or ``U``.
    """
    proteins = []
    # A codon needs three characters, so valid codon starts are
    # 0 .. len(stri) - 3; this is the exclusive upper bound.
    codon_limit = len(stri) - 2

    for start in range(codon_limit):
        # Only a start codon opens a reading frame.
        if _CODON_TABLE[stri[start:start + 3]] != "M":
            continue

        residues = []
        for pos in range(start, codon_limit, 3):
            amino = _CODON_TABLE[stri[pos:pos + 3]]
            if amino == "*":
                # Stop codon reached: the frame is complete.
                proteins.append("".join(residues))
                break
            residues.append(amino)
        # If the loop ends without hitting a stop codon, the frame is
        # unterminated and is deliberately discarded.

    return proteins


def DNACompliment(stri):
    """Return the reverse complement of the DNA string ``stri``.

    The sequence is reversed and ``A``/``T`` and ``G``/``C`` are swapped.
    Any other character is kept as is. (The function name keeps its
    original spelling so existing callers continue to work.)

    Args:
        stri: DNA sequence.

    Returns:
        The reverse-complemented sequence as a string.
    """
    return "".join(_DNA_COMPLEMENT.get(base, base) for base in reversed(stri))


def main(path="rosalind_orf.txt"):
    """Print every distinct ORF protein string for the sequence in ``path``.

    The first line of the file is skipped (it is treated as the FASTA
    header line) and all remaining lines are joined into one sequence.

    Args:
        path: input file to read; defaults to ``rosalind_orf.txt``.
    """
    # The context manager closes the file even if reading fails.
    with open(path) as handle:
        lines = handle.readlines()

    # Drop the header line, then strip every kind of whitespace.
    dna = "".join("".join(lines[1:]).split())

    # Translate both strands: the sequence itself and its reverse
    # complement, each converted from DNA to RNA.
    rna = dna.replace("T", "U")
    rev_comp_rna = DNACompliment(dna).replace("T", "U")
    candidates = orfGen(rna) + orfGen(rev_comp_rna)

    # Print each protein once, in first-seen order, so the output is
    # deterministic from run to run.
    seen = set()
    for protein in candidates:
        if protein not in seen:
            seen.add(protein)
            print(protein)


if __name__ == "__main__":
    main()