update ORF to python 3

This commit is contained in:
Fizzizist 2025-02-18 21:08:41 -05:00
parent 43de339fc5
commit 222e3ee447

View File

@ -1,65 +1,109 @@
#!/usr/bin/python #!/usr/bin/python
#Solution to the ORF rosalind problem - 'Open Reading Frames' # Solution to the ORF rosalind problem - 'Open Reading Frames'
#Author: Peter Vlasveld # Author: Peter Vlasveld
# Standard genetic code, built once at import time instead of on every call.
# Codons are enumerated in U, C, A, G order for each of the three positions,
# which is the order the amino-acid string below is written in
# ("*" marks a stop codon).
_RNA_BASES = "UCAG"
_AMINO_ACIDS = (
    "FFLLSSSSYY**CC*W"  # U..
    "LLLLPPPPHHQQRRRR"  # C..
    "IIIMTTTTNNKKSSRR"  # A..
    "VVVVAAAADDEEGGGG"  # G..
)
_CODON_TABLE = dict(
    zip(
        (
            first + second + third
            for first in _RNA_BASES
            for second in _RNA_BASES
            for third in _RNA_BASES
        ),
        _AMINO_ACIDS,
    )
)


def orfGen(stri):
    """Return the proteins encoded by every open reading frame in *stri*.

    An open reading frame begins at a start codon (``AUG``, translated as
    ``M``) and runs in steps of three bases up to the first stop codon. The
    stop codon itself is not part of the returned protein. A start codon
    that is never followed by an in-frame stop codon yields nothing.

    Args:
        stri: RNA sequence made of the characters ``A``, ``C``, ``G``, ``U``.

    Returns:
        A list of protein strings, ordered by the position of their start
        codon. Duplicates are kept; callers de-duplicate if they need to.

    Raises:
        KeyError: if any three-character window of *stri* is not a valid
            RNA codon (for example lowercase input or an ambiguous base).
    """
    proteins = []
    # last offset at which a complete codon still fits is len(stri) - 3
    codon_limit = len(stri) - 2
    for start in range(codon_limit):
        # Every offset is looked up (not just compared with "AUG") so that
        # invalid symbols anywhere in the sequence are reported.
        if _CODON_TABLE[stri[start:start + 3]] != "M":
            continue
        residues = []
        for pos in range(start, codon_limit, 3):
            amino = _CODON_TABLE[stri[pos:pos + 3]]
            if amino == "*":
                # only frames closed by a stop codon count as an ORF
                proteins.append("".join(residues))
                break
            residues.append(amino)
    return proteins
# Base-pairing table: A <-> T and C <-> G.
_DNA_COMPLEMENT = str.maketrans("ATCG", "TAGC")


def DNACompliment(stri):
    """Return the reverse complement of the DNA string *stri*.

    The sequence is reversed and each base is swapped for its pairing
    partner (A <-> T, C <-> G). Any other character is copied through
    unchanged.

    Args:
        stri: DNA sequence written with uppercase ``A``, ``C``, ``G``, ``T``.

    Returns:
        The reverse-complemented sequence as a new string.
    """
    # translate() substitutes every base in one pass, so an already swapped
    # base can never be swapped back by a later rule.
    return stri[::-1].translate(_DNA_COMPLEMENT)
# get file content; the context manager closes the file even if reading fails
with open("rosalind_orf.txt") as f:
    content = f.readlines()

# construct full DNA sequence from every line after the first
# (the first line is presumably the FASTA header - confirm against the input)
DNAStr = "".join(content[1:])

# remove whitespace, including the line breaks kept by readlines()
noWhiteDNA = "".join(DNAStr.split())

# convert DNA to RNA
RNA = noWhiteDNA.replace("T", "U")

# get the reverse complement strand and convert that to RNA as well
revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")

# get orfs for both strands
protList = orfGen(RNA)
protList.extend(orfGen(revCompRNA))

# get rid of duplicates (output order is therefore unspecified)
finalList = list(set(protList))

# print one protein per line
for protein in finalList:
    print(protein)