From 25bfaf8e87351e8d8d515c9ecf5b2974d92df047 Mon Sep 17 00:00:00 2001
From: Peter Vlasveld <pvlasveld@gmail.com>
Date: Sun, 6 Jan 2019 13:25:22 -0500
Subject: [PATCH] added solution to ORF problem

---
 ORF.py | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 ORF.py

diff --git a/ORF.py b/ORF.py
new file mode 100644
index 0000000..3653328
--- /dev/null
+++ b/ORF.py
@@ -0,0 +1,102 @@
+#!/usr/bin/python
+
+#Solution to the ORF rosalind problem - 'Open Reading Frames'
+#Author: Peter Vlasveld
+
+#function to generate the orfs
+def orfGen(stri):
+	#dictionary for translation
+	translate = { 
+		"AAA":'K', "AAG":'K',
+		"GAA":'E', "GAG":'E',
+		"AAC":'N', "AAU":'N',
+		"GAC":'D', "GAU":'D',
+		"ACA":'T', "ACC":'T', "ACG":'T', "ACU":'T',
+		"GCA":"A", "GCC":"A", "GCG":"A", "GCU":"A",
+		"GGA":"G", "GGC":"G", "GGG":"G", "GGU":"G",
+		"GUA":"V", "GUC":"V", "GUG":"V", "GUU":"V",
+		"AUG":"M",
+		"UAA":"*", "UAG":"*", "UGA":"*",
+		"AUC":"I", "AUU":"I", "AUA":"I",
+		"UAC":"Y", "UAU":"Y",
+		"CAA":"Q", "CAG":"Q",
+		"AGC":"S", "AGU":"S", "UCA":"S", "UCC":"S", "UCG":"S", "UCU":"S",
+		"CAC":"H", "CAU":"H",
+		"UGC":"C", "UGU":"C",
+		"CCA":"P", "CCC":"P", "CCG":"P", "CCU":"P",
+		"UGG":"W",
+		"AGA":"R", "AGG":"R", "CGA":"R", "CGC":"R", "CGG":"R", "CGU":"R",
+		"UUA":"L", "UUG":"L", "CUA":"L", "CUC":"L", "CUG":"L", "CUU":"L",
+		"UUC":"F", "UUU":"F"
+	}
+	
+	#list to contain protein sequences
+	proteins = []
+
+	#loop that runs through the sequences from each amino acid in the sequence
+	for i in xrange(0, len(stri)-2):
+		tempStr = ""
+		tempBool = False
+		j = i
+		#find an orf and break when it is finished
+		while j < len(stri)-2:
+			if translate[stri[j:j+3]] == "*":
+				tempBool = False
+			if translate[stri[j:j+3]] == "M":
+				tempBool = True
+			if tempBool:
+				tempStr += translate[stri[j:j+3]]
+			else:
+				break
+			j += 3
+		#add the orf to proteins only if it ends with a stop codon
+		if tempStr != "" and tempBool == False:
+			proteins.extend([tempStr])
+	#return the protein list			
+	return proteins
+
+#function to return the DNA compliment
+def DNACompliment(stri):
+	rev = stri[::-1]
+	revList = list(rev)
+	for i in xrange(0, len(revList)):
+		if revList[i] == "A":
+			revList[i] = "T"
+		elif revList[i] == "T":
+			revList[i] = "A"
+		elif revList[i] == "G":
+			revList[i] = "C"
+		elif revList[i] == "C":
+			revList[i] = "G"
+	revCompStr = "".join(revList)
+	return revCompStr
+
+#get file content
+f = open("rosalind_orf.txt")
+content = f.readlines()
+f.close()
+
+DNAStr = ""
+#construct full DNA sequence
+for i in xrange(1, len(content)):
+	DNAStr += content[i]
+
+#remove whitespace
+noWhiteDNA = "".join(DNAStr.split())
+
+#convert DNA to RNA
+RNA = noWhiteDNA.replace("T", "U")
+
+#get DNA compliment and convert that to RNA as well
+revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")
+
+#get orfs for both sequences
+protList = orfGen(RNA)
+protList.extend(orfGen(revCompRNA))
+
+#get rid of duplicates
+finalList = list(set(protList))
+
+#print the list
+for i in finalList:
+	print(i)
\ No newline at end of file