2019-01-06 18:25:22 +00:00
|
|
|
#!/usr/bin/python
|
|
|
|
|
2025-02-19 02:08:41 +00:00
|
|
|
# Solution to the ORF rosalind problem - 'Open Reading Frames'
|
|
|
|
# Author: Peter Vlasveld
|
2019-01-06 18:25:22 +00:00
|
|
|
|
2025-02-19 02:08:41 +00:00
|
|
|
# function to generate the orfs
|
2019-01-06 18:25:22 +00:00
|
|
|
# RNA codon -> one-letter amino acid; "*" marks the three stop codons.
# Built once at import time instead of on every orfGen() call.
_CODON_TABLE = {
    "AAA": "K",
    "AAG": "K",
    "GAA": "E",
    "GAG": "E",
    "AAC": "N",
    "AAU": "N",
    "GAC": "D",
    "GAU": "D",
    "ACA": "T",
    "ACC": "T",
    "ACG": "T",
    "ACU": "T",
    "GCA": "A",
    "GCC": "A",
    "GCG": "A",
    "GCU": "A",
    "GGA": "G",
    "GGC": "G",
    "GGG": "G",
    "GGU": "G",
    "GUA": "V",
    "GUC": "V",
    "GUG": "V",
    "GUU": "V",
    "AUG": "M",
    "UAA": "*",
    "UAG": "*",
    "UGA": "*",
    "AUC": "I",
    "AUU": "I",
    "AUA": "I",
    "UAC": "Y",
    "UAU": "Y",
    "CAA": "Q",
    "CAG": "Q",
    "AGC": "S",
    "AGU": "S",
    "UCA": "S",
    "UCC": "S",
    "UCG": "S",
    "UCU": "S",
    "CAC": "H",
    "CAU": "H",
    "UGC": "C",
    "UGU": "C",
    "CCA": "P",
    "CCC": "P",
    "CCG": "P",
    "CCU": "P",
    "UGG": "W",
    "AGA": "R",
    "AGG": "R",
    "CGA": "R",
    "CGC": "R",
    "CGG": "R",
    "CGU": "R",
    "UUA": "L",
    "UUG": "L",
    "CUA": "L",
    "CUC": "L",
    "CUG": "L",
    "CUU": "L",
    "UUC": "F",
    "UUU": "F",
}


def orfGen(stri):
    """Translate every open reading frame found in an RNA string.

    Each index of ``stri`` is tried as a potential start.  A reading frame
    begins at an ``AUG`` codon and is read three bases at a time until a
    stop codon (``UAA``, ``UAG`` or ``UGA``) is reached.  Frames that run
    off the end of the string without meeting a stop codon are discarded.

    Args:
        stri: RNA sequence made of the characters ``A``, ``C``, ``G``, ``U``.

    Returns:
        A list of protein strings (one-letter amino acid codes, starting
        with ``M`` and excluding the stop marker), ordered by the start
        index of their frame.  The same protein may appear more than once.

    Raises:
        KeyError: if a three-character window that gets examined is not one
            of the 64 codons in the table (e.g. it contains ``N`` or a
            lowercase letter).
    """
    proteins = []
    # Exclusive upper bound for codon start positions: the last full codon
    # begins at len(stri) - 3.  For strings shorter than 3 this is <= 0 and
    # both ranges below are empty.
    limit = len(stri) - 2

    for start in range(limit):
        # Every window is looked up, so an invalid codon anywhere in the
        # sequence raises KeyError even when it is not inside a frame.
        if _CODON_TABLE[stri[start:start + 3]] != "M":
            continue

        residues = []
        for pos in range(start, limit, 3):
            amino = _CODON_TABLE[stri[pos:pos + 3]]
            if amino == "*":
                # Frame closed by a stop codon: keep it (without the "*").
                proteins.append("".join(residues))
                break
            residues.append(amino)
        # Falling out of the inner loop without a break means no stop codon
        # was found, so the partial translation is dropped.

    return proteins
|
|
|
|
|
|
|
|
|
|
|
|
# function to return the reverse complement of a DNA string
|
2019-01-06 18:25:22 +00:00
|
|
|
def DNACompliment(stri):
    """Return the reverse complement of a DNA string.

    The input is reversed and each base is swapped with its pairing
    partner (``A`` <-> ``T``, ``G`` <-> ``C``).

    Args:
        stri: DNA sequence as a string.

    Returns:
        A new string of the same length.  Characters other than uppercase
        ``A``, ``T``, ``G`` and ``C`` are copied through unchanged.
    """
    pairs = {"A": "T", "T": "A", "G": "C", "C": "G"}
    # .get(base, base) leaves any unmapped character as it is.
    return "".join(pairs.get(base, base) for base in reversed(stri))
|
|
|
|
|
|
|
|
|
|
|
|
# get file content
|
2019-01-06 18:25:22 +00:00
|
|
|
# Read the whole input file; the with-block closes the handle even if
# reading raises.
with open("rosalind_orf.txt") as f:
    content = f.readlines()

# construct full DNA sequence from every line after the first
# (the first line is skipped -- presumably the FASTA header; verify input)
DNAStr = "".join(content[1:])

# remove whitespace (including the newlines kept by readlines)
noWhiteDNA = "".join(DNAStr.split())

# convert DNA to RNA
RNA = noWhiteDNA.replace("T", "U")

# get the reverse complement of the DNA and convert that to RNA as well
revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")

# get orfs for both sequences
protList = orfGen(RNA)
protList.extend(orfGen(revCompRNA))

# get rid of duplicates (set iteration order is arbitrary, so the output
# order is not guaranteed)
finalList = list(set(protList))

# print the list, one protein per line
for protein in finalList:
    print(protein)
|
|
|
|
|