rosalind-solutions/python/ORF.py

149 lines
3.3 KiB
Python
Raw Normal View History

#!/usr/bin/python
# Solution to the Rosalind problem ORF - 'Open Reading Frames'
# Author: Peter Vlasveld

# function to generate the ORFs (translated open reading frames)
def orfGen(stri):
    """Translate every open reading frame found in an RNA string.

    A reading frame opens at any ``AUG`` codon and is read three bases at
    a time up to (but not including) the first in-frame stop codon
    (``UAA``, ``UAG`` or ``UGA``).  A candidate that reaches the end of
    the string without meeting a stop codon is discarded.

    Args:
        stri: RNA sequence; every three-character window must be a key of
            the codon table below (i.e. consist of ``A``, ``C``, ``G``, ``U``).

    Returns:
        A list of protein strings, one per start codon that is followed
        in-frame by a stop codon, ordered by start position.  The list
        may contain duplicates; only this strand is scanned.

    Raises:
        KeyError: if any three-character window of ``stri`` is not a key
            of the codon table.
    """
    # dictionary for translation ("*" marks a stop codon)
    translate = {
        "AAA": "K",
        "AAG": "K",
        "GAA": "E",
        "GAG": "E",
        "AAC": "N",
        "AAU": "N",
        "GAC": "D",
        "GAU": "D",
        "ACA": "T",
        "ACC": "T",
        "ACG": "T",
        "ACU": "T",
        "GCA": "A",
        "GCC": "A",
        "GCG": "A",
        "GCU": "A",
        "GGA": "G",
        "GGC": "G",
        "GGG": "G",
        "GGU": "G",
        "GUA": "V",
        "GUC": "V",
        "GUG": "V",
        "GUU": "V",
        "AUG": "M",
        "UAA": "*",
        "UAG": "*",
        "UGA": "*",
        "AUC": "I",
        "AUU": "I",
        "AUA": "I",
        "UAC": "Y",
        "UAU": "Y",
        "CAA": "Q",
        "CAG": "Q",
        "AGC": "S",
        "AGU": "S",
        "UCA": "S",
        "UCC": "S",
        "UCG": "S",
        "UCU": "S",
        "CAC": "H",
        "CAU": "H",
        "UGC": "C",
        "UGU": "C",
        "CCA": "P",
        "CCC": "P",
        "CCG": "P",
        "CCU": "P",
        "UGG": "W",
        "AGA": "R",
        "AGG": "R",
        "CGA": "R",
        "CGC": "R",
        "CGG": "R",
        "CGU": "R",
        "UUA": "L",
        "UUG": "L",
        "CUA": "L",
        "CUC": "L",
        "CUG": "L",
        "CUU": "L",
        "UUC": "F",
        "UUU": "F",
    }

    # Translate every 3-base window exactly once.  residues[k] is the amino
    # acid (or "*") encoded by stri[k:k + 3]; stepping through this list by 3
    # walks a single reading frame.
    residues = [translate[stri[k:k + 3]] for k in range(len(stri) - 2)]

    # list to contain protein sequences
    proteins = []
    for start, residue in enumerate(residues):
        if residue != "M":
            continue  # only a start codon can open a reading frame
        # walk the frame codon by codon until the first stop codon
        for stop in range(start + 3, len(residues), 3):
            if residues[stop] == "*":
                proteins.append("".join(residues[start:stop:3]))
                break
        # no stop codon before the end of the string: candidate is dropped

    # return the protein list
    return proteins
# function to return the reverse complement of a DNA string
def DNACompliment(stri):
    """Return the reverse complement of a DNA string.

    The string is read back to front and each base is swapped with its
    pairing partner (A<->T, G<->C).  Any other character, including
    lowercase letters, is copied through unchanged.

    Args:
        stri: DNA sequence.

    Returns:
        The reverse-complemented sequence as a new string.
    """
    pairs = {"A": "T", "T": "A", "G": "C", "C": "G"}
    # .get(base, base) leaves characters without a pairing partner as they are
    return "".join(pairs.get(base, base) for base in reversed(stri))
# read the input file
with open("rosalind_orf.txt") as f:
    # the context manager closes the file even if reading fails
    content = f.readlines()

# construct full DNA sequence from every line after the first one
DNAStr = "".join(content[1:])

# remove whitespace (line breaks included)
noWhiteDNA = "".join(DNAStr.split())

# convert DNA to RNA
RNA = noWhiteDNA.replace("T", "U")

# get the reverse complement and convert that to RNA as well
revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")

# get orfs for both sequences
protList = orfGen(RNA)
protList.extend(orfGen(revCompRNA))

# get rid of duplicates; dict.fromkeys keeps first-seen order on Python 3.7+,
# so the output is the same from run to run (a set's order is not)
finalList = list(dict.fromkeys(protList))

# print the list, one protein per line
for i in finalList:
    print(i)