rosalind-solutions/python/ORF.py

#!/usr/bin/python

# Solution to the ORF rosalind problem - 'Open Reading Frames'
# Author: Peter Vlasveld

# RNA codon -> one-letter amino acid code; "*" marks the three stop codons.
# Built once at import time instead of on every orfGen call.
_CODON_TABLE = {
    "UUU": "F", "UUC": "F", "UUA": "L", "UUG": "L",
    "UCU": "S", "UCC": "S", "UCA": "S", "UCG": "S",
    "UAU": "Y", "UAC": "Y", "UAA": "*", "UAG": "*",
    "UGU": "C", "UGC": "C", "UGA": "*", "UGG": "W",
    "CUU": "L", "CUC": "L", "CUA": "L", "CUG": "L",
    "CCU": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAU": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGU": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "AUU": "I", "AUC": "I", "AUA": "I", "AUG": "M",
    "ACU": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAU": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGU": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GUU": "V", "GUC": "V", "GUA": "V", "GUG": "V",
    "GCU": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAU": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGU": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}


# function to generate the orfs
def orfGen(stri):
    """Return the protein string of every open reading frame in an RNA string.

    An open reading frame starts at an ``AUG`` codon and runs, three bases at
    a time, up to the first stop codon in the same frame. Candidates that
    reach the end of the string without meeting a stop codon are discarded.

    Args:
        stri: RNA sequence made of the characters A, C, G and U.

    Returns:
        A list of protein strings (the stop codon is not included), ordered by
        the offset of their start codon. Duplicates are kept; an input shorter
        than three characters yields an empty list.

    Raises:
        KeyError: if a three-character window that gets examined is not one of
            the 64 codons in the table (for example lowercase or ``N`` bases).
    """
    table = _CODON_TABLE
    # index one past the last position at which a full codon still fits
    limit = len(stri) - 2

    # list to contain protein sequences
    proteins = []

    # try every nucleotide offset as a potential start codon
    for start in range(limit):
        if table[stri[start:start + 3]] != "M":
            continue

        # translate codon by codon until the first in-frame stop codon
        residues = []
        for pos in range(start, limit, 3):
            amino = table[stri[pos:pos + 3]]
            if amino == "*":
                # only a frame closed by a stop codon counts as an orf
                proteins.append("".join(residues))
                break
            residues.append(amino)

    # return the protein list
    return proteins


# function to return the reverse complement of a DNA string
def DNACompliment(stri):
    """Return the reverse complement of the DNA string ``stri``.

    The string is reversed and A<->T and G<->C are swapped. Any other
    character (lowercase letters, N, ...) is passed through unchanged.
    """
    pairing = str.maketrans("ATGC", "TACG")
    return stri[::-1].translate(pairing)


# read the DNA sequence stored in a single-record FASTA file
def readDNA(path):
    """Return the sequence held in the file at ``path`` as one string.

    The first line is treated as the FASTA header and skipped; all remaining
    lines are concatenated with every whitespace character removed.

    Args:
        path: location of the input file.

    Returns:
        The sequence as a single string ("" when only a header is present).
    """
    # the context manager closes the file even if reading fails
    with open(path) as handle:
        content = handle.readlines()

    # construct full DNA sequence and remove whitespace
    return "".join("".join(content[1:]).split())


# print every distinct protein found in rosalind_orf.txt
def main():
    """Read rosalind_orf.txt and print each distinct candidate protein."""
    # get file content
    noWhiteDNA = readDNA("rosalind_orf.txt")

    # convert DNA to RNA
    RNA = noWhiteDNA.replace("T", "U")

    # get DNA complement and convert that to RNA as well
    revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")

    # get orfs for both sequences
    protList = orfGen(RNA)
    protList.extend(orfGen(revCompRNA))

    # get rid of duplicates (a set does not preserve any particular order)
    finalList = list(set(protList))

    # print the list
    for protein in finalList:
        print(protein)


# only run the solution when executed as a script, not when imported
if __name__ == "__main__":
    main()
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`#!/usr/bin/python`

update ORF to python 3 2025-02-19 02:08:41 +00:00			`# Solution to the ORF rosalind problem - 'Open Reading Frames'`
			`# Author: Peter Vlasveld`
added solution to ORF problem 2019-01-06 18:25:22 +00:00
update ORF to python 3 2025-02-19 02:08:41 +00:00			`# function to generate the orfs`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`def orfGen(stri):`
update ORF to python 3 2025-02-19 02:08:41 +00:00			`# dictionary for translation`
			`translate = {`
			`"AAA": "K",`
			`"AAG": "K",`
			`"GAA": "E",`
			`"GAG": "E",`
			`"AAC": "N",`
			`"AAU": "N",`
			`"GAC": "D",`
			`"GAU": "D",`
			`"ACA": "T",`
			`"ACC": "T",`
			`"ACG": "T",`
			`"ACU": "T",`
			`"GCA": "A",`
			`"GCC": "A",`
			`"GCG": "A",`
			`"GCU": "A",`
			`"GGA": "G",`
			`"GGC": "G",`
			`"GGG": "G",`
			`"GGU": "G",`
			`"GUA": "V",`
			`"GUC": "V",`
			`"GUG": "V",`
			`"GUU": "V",`
			`"AUG": "M",`
			`"UAA": "*",`
			`"UAG": "*",`
			`"UGA": "*",`
			`"AUC": "I",`
			`"AUU": "I",`
			`"AUA": "I",`
			`"UAC": "Y",`
			`"UAU": "Y",`
			`"CAA": "Q",`
			`"CAG": "Q",`
			`"AGC": "S",`
			`"AGU": "S",`
			`"UCA": "S",`
			`"UCC": "S",`
			`"UCG": "S",`
			`"UCU": "S",`
			`"CAC": "H",`
			`"CAU": "H",`
			`"UGC": "C",`
			`"UGU": "C",`
			`"CCA": "P",`
			`"CCC": "P",`
			`"CCG": "P",`
			`"CCU": "P",`
			`"UGG": "W",`
			`"AGA": "R",`
			`"AGG": "R",`
			`"CGA": "R",`
			`"CGC": "R",`
			`"CGG": "R",`
			`"CGU": "R",`
			`"UUA": "L",`
			`"UUG": "L",`
			`"CUA": "L",`
			`"CUC": "L",`
			`"CUG": "L",`
			`"CUU": "L",`
			`"UUC": "F",`
			`"UUU": "F",`
			`}`

			`# list to contain protein sequences`
			`proteins = []`

			`# loop that runs through the sequences from each amino acid in the sequence`
			`for i in range(0, len(stri) - 2):`
			`tempStr = ""`
			`tempBool = False`
			`j = i`
			`# find an orf and break when it is finished`
			`while j < len(stri) - 2:`
			`if translate[stri[j: j + 3]] == "*":`
			`tempBool = False`
			`if translate[stri[j: j + 3]] == "M":`
			`tempBool = True`
			`if tempBool:`
			`tempStr += translate[stri[j: j + 3]]`
			`else:`
			`break`
			`j += 3`
			`# add the orf to proteins only if it ends with a stop codon`
			`if tempStr != "" and tempBool is False:`
			`proteins.extend([tempStr])`
			`# return the protein list`
			`return proteins`


			`# function to return the DNA compliment`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`def DNACompliment(stri):`
update ORF to python 3 2025-02-19 02:08:41 +00:00			`rev = stri[::-1]`
			`revList = list(rev)`
			`for i in range(0, len(revList)):`
			`if revList[i] == "A":`
			`revList[i] = "T"`
			`elif revList[i] == "T":`
			`revList[i] = "A"`
			`elif revList[i] == "G":`
			`revList[i] = "C"`
			`elif revList[i] == "C":`
			`revList[i] = "G"`
			`revCompStr = "".join(revList)`
			`return revCompStr`


			`# get file content`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`f = open("rosalind_orf.txt")`
			`content = f.readlines()`
			`f.close()`

			`DNAStr = ""`
update ORF to python 3 2025-02-19 02:08:41 +00:00			`# construct full DNA sequence`
			`for i in range(1, len(content)):`
			`DNAStr += content[i]`
added solution to ORF problem 2019-01-06 18:25:22 +00:00
update ORF to python 3 2025-02-19 02:08:41 +00:00			`# remove whitespace`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`noWhiteDNA = "".join(DNAStr.split())`

update ORF to python 3 2025-02-19 02:08:41 +00:00			`# convert DNA to RNA`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`RNA = noWhiteDNA.replace("T", "U")`

update ORF to python 3 2025-02-19 02:08:41 +00:00			`# get DNA compliment and convert that to RNA as well`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")`

update ORF to python 3 2025-02-19 02:08:41 +00:00			`# get orfs for both sequences`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`protList = orfGen(RNA)`
			`protList.extend(orfGen(revCompRNA))`

update ORF to python 3 2025-02-19 02:08:41 +00:00			`# get rid of duplicates`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`finalList = list(set(protList))`

update ORF to python 3 2025-02-19 02:08:41 +00:00			`# print the list`
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`for i in finalList:`
update ORF to python 3 2025-02-19 02:08:41 +00:00			`print(i)`