rosalind-solutions/python/ORF.py

#!/usr/bin/python

#Solution to the ORF rosalind problem - 'Open Reading Frames'
#Author: Peter Vlasveld

#function to generate the orfs
def orfGen(stri):
	#dictionary for translation
	translate = { 
		"AAA":'K', "AAG":'K',
		"GAA":'E', "GAG":'E',
		"AAC":'N', "AAU":'N',
		"GAC":'D', "GAU":'D',
		"ACA":'T', "ACC":'T', "ACG":'T', "ACU":'T',
		"GCA":"A", "GCC":"A", "GCG":"A", "GCU":"A",
		"GGA":"G", "GGC":"G", "GGG":"G", "GGU":"G",
		"GUA":"V", "GUC":"V", "GUG":"V", "GUU":"V",
		"AUG":"M",
		"UAA":"*", "UAG":"*", "UGA":"*",
		"AUC":"I", "AUU":"I", "AUA":"I",
		"UAC":"Y", "UAU":"Y",
		"CAA":"Q", "CAG":"Q",
		"AGC":"S", "AGU":"S", "UCA":"S", "UCC":"S", "UCG":"S", "UCU":"S",
		"CAC":"H", "CAU":"H",
		"UGC":"C", "UGU":"C",
		"CCA":"P", "CCC":"P", "CCG":"P", "CCU":"P",
		"UGG":"W",
		"AGA":"R", "AGG":"R", "CGA":"R", "CGC":"R", "CGG":"R", "CGU":"R",
		"UUA":"L", "UUG":"L", "CUA":"L", "CUC":"L", "CUG":"L", "CUU":"L",
		"UUC":"F", "UUU":"F"
	}
	
	#list to contain protein sequences
	proteins = []

	#loop that runs through the sequences from each amino acid in the sequence
	for i in xrange(0, len(stri)-2):
		tempStr = ""
		tempBool = False
		j = i
		#find an orf and break when it is finished
		while j < len(stri)-2:
			if translate[stri[j:j+3]] == "*":
				tempBool = False
			if translate[stri[j:j+3]] == "M":
				tempBool = True
			if tempBool:
				tempStr += translate[stri[j:j+3]]
			else:
				break
			j += 3
		#add the orf to proteins only if it ends with a stop codon
		if tempStr != "" and tempBool == False:
			proteins.extend([tempStr])
	#return the protein list			
	return proteins

#function to return the DNA compliment
def DNACompliment(stri):
	rev = stri[::-1]
	revList = list(rev)
	for i in xrange(0, len(revList)):
		if revList[i] == "A":
			revList[i] = "T"
		elif revList[i] == "T":
			revList[i] = "A"
		elif revList[i] == "G":
			revList[i] = "C"
		elif revList[i] == "C":
			revList[i] = "G"
	revCompStr = "".join(revList)
	return revCompStr

#get file content
f = open("rosalind_orf.txt")
content = f.readlines()
f.close()

DNAStr = ""
#construct full DNA sequence
for i in xrange(1, len(content)):
	DNAStr += content[i]

#remove whitespace
noWhiteDNA = "".join(DNAStr.split())

#convert DNA to RNA
RNA = noWhiteDNA.replace("T", "U")

#get DNA compliment and convert that to RNA as well
revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")

#get orfs for both sequences
protList = orfGen(RNA)
protList.extend(orfGen(revCompRNA))

#get rid of duplicates
finalList = list(set(protList))

#print the list
for i in finalList:
	print(i)
added solution to ORF problem 2019-01-06 18:25:22 +00:00			`#!/usr/bin/python`

			`#Solution to the ORF rosalind problem - 'Open Reading Frames'`
			`#Author: Peter Vlasveld`

			`#function to generate the orfs`
			`def orfGen(stri):`
			`#dictionary for translation`
			`translate = {`
			`"AAA":'K', "AAG":'K',`
			`"GAA":'E', "GAG":'E',`
			`"AAC":'N', "AAU":'N',`
			`"GAC":'D', "GAU":'D',`
			`"ACA":'T', "ACC":'T', "ACG":'T', "ACU":'T',`
			`"GCA":"A", "GCC":"A", "GCG":"A", "GCU":"A",`
			`"GGA":"G", "GGC":"G", "GGG":"G", "GGU":"G",`
			`"GUA":"V", "GUC":"V", "GUG":"V", "GUU":"V",`
			`"AUG":"M",`
			`"UAA":"", "UAG":"", "UGA":"*",`
			`"AUC":"I", "AUU":"I", "AUA":"I",`
			`"UAC":"Y", "UAU":"Y",`
			`"CAA":"Q", "CAG":"Q",`
			`"AGC":"S", "AGU":"S", "UCA":"S", "UCC":"S", "UCG":"S", "UCU":"S",`
			`"CAC":"H", "CAU":"H",`
			`"UGC":"C", "UGU":"C",`
			`"CCA":"P", "CCC":"P", "CCG":"P", "CCU":"P",`
			`"UGG":"W",`
			`"AGA":"R", "AGG":"R", "CGA":"R", "CGC":"R", "CGG":"R", "CGU":"R",`
			`"UUA":"L", "UUG":"L", "CUA":"L", "CUC":"L", "CUG":"L", "CUU":"L",`
			`"UUC":"F", "UUU":"F"`
			`}`

			`#list to contain protein sequences`
			`proteins = []`

			`#loop that runs through the sequences from each amino acid in the sequence`
			`for i in xrange(0, len(stri)-2):`
			`tempStr = ""`
			`tempBool = False`
			`j = i`
			`#find an orf and break when it is finished`
			`while j < len(stri)-2:`
			`if translate[stri[j:j+3]] == "*":`
			`tempBool = False`
			`if translate[stri[j:j+3]] == "M":`
			`tempBool = True`
			`if tempBool:`
			`tempStr += translate[stri[j:j+3]]`
			`else:`
			`break`
			`j += 3`
			`#add the orf to proteins only if it ends with a stop codon`
			`if tempStr != "" and tempBool == False:`
			`proteins.extend([tempStr])`
			`#return the protein list`
			`return proteins`

			`#function to return the DNA compliment`
			`def DNACompliment(stri):`
			`rev = stri[::-1]`
			`revList = list(rev)`
			`for i in xrange(0, len(revList)):`
			`if revList[i] == "A":`
			`revList[i] = "T"`
			`elif revList[i] == "T":`
			`revList[i] = "A"`
			`elif revList[i] == "G":`
			`revList[i] = "C"`
			`elif revList[i] == "C":`
			`revList[i] = "G"`
			`revCompStr = "".join(revList)`
			`return revCompStr`

			`#get file content`
			`f = open("rosalind_orf.txt")`
			`content = f.readlines()`
			`f.close()`

			`DNAStr = ""`
			`#construct full DNA sequence`
			`for i in xrange(1, len(content)):`
			`DNAStr += content[i]`

			`#remove whitespace`
			`noWhiteDNA = "".join(DNAStr.split())`

			`#convert DNA to RNA`
			`RNA = noWhiteDNA.replace("T", "U")`

			`#get DNA compliment and convert that to RNA as well`
			`revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")`

			`#get orfs for both sequences`
			`protList = orfGen(RNA)`
			`protList.extend(orfGen(revCompRNA))`

			`#get rid of duplicates`
			`finalList = list(set(protList))`

			`#print the list`
			`for i in finalList:`
			`print(i)`