From 222e3ee4473c45b94eaf07fff0e59a4dfa772420 Mon Sep 17 00:00:00 2001 From: Fizzizist <git@fizzizist.33mail.com> Date: Tue, 18 Feb 2025 21:08:41 -0500 Subject: [PATCH] update ORF to python 3 --- python/ORF.py | 198 +++++++++++++++++++++++++++++++------------------- 1 file changed, 122 insertions(+), 76 deletions(-) diff --git a/python/ORF.py b/python/ORF.py index 3653328..a203782 100644 --- a/python/ORF.py +++ b/python/ORF.py @@ -1,102 +1,148 @@ #!/usr/bin/python -#Solution to the ORF rosalind problem - 'Open Reading Frames' -#Author: Peter Vlasveld +# Solution to the ORF rosalind problem - 'Open Reading Frames' +# Author: Peter Vlasveld -#function to generate the orfs +# function to generate the orfs def orfGen(stri): - #dictionary for translation - translate = { - "AAA":'K', "AAG":'K', - "GAA":'E', "GAG":'E', - "AAC":'N', "AAU":'N', - "GAC":'D', "GAU":'D', - "ACA":'T', "ACC":'T', "ACG":'T', "ACU":'T', - "GCA":"A", "GCC":"A", "GCG":"A", "GCU":"A", - "GGA":"G", "GGC":"G", "GGG":"G", "GGU":"G", - "GUA":"V", "GUC":"V", "GUG":"V", "GUU":"V", - "AUG":"M", - "UAA":"*", "UAG":"*", "UGA":"*", - "AUC":"I", "AUU":"I", "AUA":"I", - "UAC":"Y", "UAU":"Y", - "CAA":"Q", "CAG":"Q", - "AGC":"S", "AGU":"S", "UCA":"S", "UCC":"S", "UCG":"S", "UCU":"S", - "CAC":"H", "CAU":"H", - "UGC":"C", "UGU":"C", - "CCA":"P", "CCC":"P", "CCG":"P", "CCU":"P", - "UGG":"W", - "AGA":"R", "AGG":"R", "CGA":"R", "CGC":"R", "CGG":"R", "CGU":"R", - "UUA":"L", "UUG":"L", "CUA":"L", "CUC":"L", "CUG":"L", "CUU":"L", - "UUC":"F", "UUU":"F" - } - - #list to contain protein sequences - proteins = [] + # dictionary for translation + translate = { + "AAA": "K", + "AAG": "K", + "GAA": "E", + "GAG": "E", + "AAC": "N", + "AAU": "N", + "GAC": "D", + "GAU": "D", + "ACA": "T", + "ACC": "T", + "ACG": "T", + "ACU": "T", + "GCA": "A", + "GCC": "A", + "GCG": "A", + "GCU": "A", + "GGA": "G", + "GGC": "G", + "GGG": "G", + "GGU": "G", + "GUA": "V", + "GUC": "V", + "GUG": "V", + "GUU": "V", + "AUG": "M", + "UAA": "*", + "UAG": "*", + "UGA": "*", + "AUC": "I", + "AUU": "I", + "AUA": "I", + "UAC": "Y", + "UAU": "Y", + "CAA": "Q", + "CAG": "Q", + "AGC": "S", + "AGU": "S", + "UCA": "S", + "UCC": "S", + "UCG": "S", + "UCU": "S", + "CAC": "H", + "CAU": "H", + "UGC": "C", + "UGU": "C", + "CCA": "P", + "CCC": "P", + "CCG": "P", + "CCU": "P", + "UGG": "W", + "AGA": "R", + "AGG": "R", + "CGA": "R", + "CGC": "R", + "CGG": "R", + "CGU": "R", + "UUA": "L", + "UUG": "L", + "CUA": "L", + "CUC": "L", + "CUG": "L", + "CUU": "L", + "UUC": "F", + "UUU": "F", + } - #loop that runs through the sequences from each amino acid in the sequence - for i in xrange(0, len(stri)-2): - tempStr = "" - tempBool = False - j = i - #find an orf and break when it is finished - while j < len(stri)-2: - if translate[stri[j:j+3]] == "*": - tempBool = False - if translate[stri[j:j+3]] == "M": - tempBool = True - if tempBool: - tempStr += translate[stri[j:j+3]] - else: - break - j += 3 - #add the orf to proteins only if it ends with a stop codon - if tempStr != "" and tempBool == False: - proteins.extend([tempStr]) - #return the protein list - return proteins + # list to contain protein sequences + proteins = [] -#function to return the DNA compliment + # loop that runs through the sequences from each amino acid in the sequence + for i in range(0, len(stri) - 2): + tempStr = "" + tempBool = False + j = i + # find an orf and break when it is finished + while j < len(stri) - 2: + if translate[stri[j: j + 3]] == "*": + tempBool = False + if translate[stri[j: j + 3]] == "M": + tempBool = True + if tempBool: + tempStr += translate[stri[j: j + 3]] + else: + break + j += 3 + # add the orf to proteins only if it ends with a stop codon + if tempStr != "" and tempBool is False: + proteins.extend([tempStr]) + # return the protein list + return proteins + + +# function to return the DNA compliment def DNACompliment(stri): - rev = stri[::-1] - revList = list(rev) - for i in xrange(0, len(revList)): - if revList[i] == "A": - revList[i] = "T" - elif revList[i] == "T": - revList[i] = "A" - elif revList[i] == "G": - revList[i] = "C" - elif revList[i] == "C": - revList[i] = "G" - revCompStr = "".join(revList) - return revCompStr + rev = stri[::-1] + revList = list(rev) + for i in range(0, len(revList)): + if revList[i] == "A": + revList[i] = "T" + elif revList[i] == "T": + revList[i] = "A" + elif revList[i] == "G": + revList[i] = "C" + elif revList[i] == "C": + revList[i] = "G" + revCompStr = "".join(revList) + return revCompStr -#get file content + +# get file content f = open("rosalind_orf.txt") content = f.readlines() f.close() DNAStr = "" -#construct full DNA sequence -for i in xrange(1, len(content)): - DNAStr += content[i] +# construct full DNA sequence +for i in range(1, len(content)): + DNAStr += content[i] -#remove whitespace +# remove whitespace noWhiteDNA = "".join(DNAStr.split()) -#convert DNA to RNA +# convert DNA to RNA RNA = noWhiteDNA.replace("T", "U") -#get DNA compliment and convert that to RNA as well +# get DNA compliment and convert that to RNA as well revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U") -#get orfs for both sequences +# get orfs for both sequences protList = orfGen(RNA) protList.extend(orfGen(revCompRNA)) -#get rid of duplicates +# get rid of duplicates finalList = list(set(protList)) -#print the list +# print the list for i in finalList: - print(i) \ No newline at end of file + print(i) +