moved perl and python scripts into their own directories

This commit is contained in:
Fizzizist
2025-01-04 15:58:59 -05:00
parent 821caa4618
commit af2ce045ec
13 changed files with 0 additions and 0 deletions

51
python/LEXF.py Normal file
View File

@@ -0,0 +1,51 @@
#!/usr/bin/python
"""
LEXF Rosalind.info Solution - Enumerating k-mers Lexicographically
Author: Peter Vlasveld
Date: 12/01/2019
"""
# Get dataset from file; the context manager closes it even if reading fails
with open("rosalind_lexf.txt") as f1:
    content = f1.readlines()
# The first line holds the whitespace-separated symbols, the second line the
# word length r. Duplicate the symbol list r times so that choosing r entries
# by increasing index can place any symbol at any position of a word.
firstLex = content[0].split()
lex = firstLex * int(content[1])
def wordCombo(arr, data, start, end, index, r):
    """
    Recursively build every r-element combination of arr[start..end].

    Entries are picked by strictly increasing position in arr; each finished
    combination is concatenated into one string and appended to the
    module-level list comboList, which must exist before the first call.

    @param arr - list of strings to choose from
    @param data - scratch list with at least r slots, overwritten in place
    @param start - first index of arr that may still be chosen
    @param end - last index of arr that may be chosen (inclusive)
    @param index - slot of data filled by this call
    @param r - number of entries per combination
    @return - None; results are collected in comboList
    """
    global comboList
    if index == r:
        # every slot is filled: record the finished word
        comboList.append("".join(data))
        return
    # stop early enough that the remaining slots can still be filled
    lastChoice = min(end, end - (r - index) + 1)
    for pos in range(start, lastChoice + 1):
        data[index] = arr[pos]
        wordCombo(arr, data, pos + 1, end, index + 1, r)
# word length requested on the second input line
wordLen = int(content[1])
# empty list to be passed into the function; one slot per character of a word
data = [0] * wordLen
# global list to be filled with combinations (wordCombo appends to it)
comboList = []
# run the function, remove duplicates, and sort it alphabetically
wordCombo(lex, data, 0, len(lex) - 1, 0, wordLen)
finalList = list(set(comboList))
sortedList = sorted(finalList)
# print the list to a file, one word per line; the context manager closes
# the file even if a write fails
with open("output.txt", "w") as f2:
    f2.writelines(word + "\n" for word in sortedList)

70
python/LGIS.py Normal file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/python
# Solution to the LGIS Rosalind Problem - Longest Increasing Subsequence
# Author: Peter Vlasveld
# Date: 18/01/2019
import copy
# get input; the context manager closes the file even if reading fails
with open("rosalind_lgis.txt") as f1:
    content = f1.readlines()
# get array out of content: the sequence is read from the second line and
# kept as a list of strings
arr = content[1].split()
# function to find longest increasing subsequence of arr
def inc(arr):
    """
    Return one longest strictly increasing subsequence of arr.

    Elements are compared numerically through int(), so digit strings such
    as "18" and "101" are ordered as numbers, not as text.

    @param arr - sequence of values accepted by int() (the script passes strings)
    @return - list holding the chosen elements of arr, unchanged and in their
              original order; an empty list when arr is empty
    """
    from bisect import bisect_left

    if not arr:
        return []
    # parse every element once instead of on every comparison
    keys = [int(value) for value in arr]
    # tailKeys[k] is the smallest value that ends an increasing subsequence of
    # length k + 1 seen so far; tailIdx[k] is the position of that value in arr
    tailKeys = []
    tailIdx = []
    # prev[i] is the position of the element that precedes arr[i] in the best
    # subsequence ending at arr[i], or -1 when arr[i] starts that subsequence
    prev = [-1] * len(keys)
    for i, key in enumerate(keys):
        # first slot whose tail is >= key; using >= rather than > keeps the
        # result strictly increasing when values repeat
        k = bisect_left(tailKeys, key)
        if k:
            prev[i] = tailIdx[k - 1]
        if k == len(tailKeys):
            tailKeys.append(key)
            tailIdx.append(i)
        else:
            tailKeys[k] = key
            tailIdx[k] = i
    # walk the predecessor links back from the tail of the longest subsequence
    result = []
    i = tailIdx[-1]
    while i != -1:
        result.append(arr[i])
        i = prev[i]
    result.reverse()
    return result
# find the longest decreasing subsequence
def dec(arr):
    """
    Return one longest strictly decreasing subsequence of arr.

    Elements are compared numerically through int(). The search runs on the
    negated values, so a decreasing run in arr is an increasing run of keys.

    @param arr - sequence of values accepted by int() (the script passes strings)
    @return - list holding the chosen elements of arr, unchanged and in their
              original order; an empty list when arr is empty
    """
    from bisect import bisect_left

    if not arr:
        return []
    # negate once so the increasing-subsequence search finds decreasing runs
    keys = [-int(value) for value in arr]
    # tailKeys[k] is the smallest key that ends a subsequence of length k + 1
    # seen so far; tailIdx[k] is the position of that element in arr
    tailKeys = []
    tailIdx = []
    # prev[i] is the position of the element that precedes arr[i] in the best
    # subsequence ending at arr[i], or -1 when arr[i] starts that subsequence
    prev = [-1] * len(keys)
    for i, key in enumerate(keys):
        # first slot whose tail key is >= key; this keeps the result strictly
        # decreasing when values repeat
        k = bisect_left(tailKeys, key)
        if k:
            prev[i] = tailIdx[k - 1]
        if k == len(tailKeys):
            tailKeys.append(key)
            tailIdx.append(i)
        else:
            tailKeys[k] = key
            tailIdx[k] = i
    # walk the predecessor links back from the tail of the longest subsequence
    result = []
    i = tailIdx[-1]
    while i != -1:
        result.append(arr[i])
        i = prev[i]
    result.reverse()
    return result
# run the functions; the results get their own names so that the functions
# inc and dec are not overwritten by their return values
incSeq = inc(arr)
decSeq = dec(arr)
# print the output to a file: increasing subsequence on the first line,
# decreasing subsequence on the second (no trailing newline)
with open("output.txt", "w") as f2:
    f2.write(" ".join(incSeq) + "\n")
    f2.write(" ".join(decSeq))

44
python/MPRT.py Normal file
View File

@@ -0,0 +1,44 @@
#!/usr/bin/python
#MPRT - Finding a Protein Motif Solution
#Problem can be found at http://rosalind.info/problems/mprt/
#Author: Peter Vlasveld
import urllib2
import re
# declare motif: N, then any residue except P, then S or T, then any residue
# except P; compiled once because it is tested at every position
motif = "N[^P][ST][^P]"
motifRe = re.compile(motif)
# number of residues one motif occurrence spans
motifLen = 4
# read in file, one accession ID per line
with open("rosalind_mprt.txt", "r") as f0:
    content = f0.read().splitlines()
# open output file; the context manager closes it even if a download fails
with open("output.txt", "w+") as f1:
    # loop through each accession ID
    for i in content:
        # skip blank lines so no request is made for an empty ID
        if not i.strip():
            continue
        # get fasta from url
        url = "http://www.uniprot.org/uniprot/" + i + ".fasta"
        response = urllib2.urlopen(url)
        try:
            fasta = response.read().splitlines()
        finally:
            response.close()
        # format protein string: every line except the '>' header line(s)
        protStr = "".join(j for j in fasta if not j.startswith('>'))
        # collect the 1-based start of every window matching the motif; the
        # last valid start is len(protStr) - motifLen, so the range must end
        # one past it (the previous bound skipped the final window)
        positions = [
            str(j + 1)
            for j in range(len(protStr) - motifLen + 1)
            if motifRe.match(protStr, j)
        ]
        # output the ID and its positions only when the motif was found
        if positions:
            outStr = " ".join(positions)
            print(i)
            f1.write(i + "\n")
            print(outStr)
            f1.write(outStr + "\n")

102
python/ORF.py Normal file
View File

@@ -0,0 +1,102 @@
#!/usr/bin/python
#Solution to the ORF rosalind problem - 'Open Reading Frames'
#Author: Peter Vlasveld
# function to generate the orfs
def orfGen(stri):
    """
    Translate every open reading frame found in an RNA string.

    A frame opens at any position whose codon is AUG and is read three bases
    at a time; it is kept only if a stop codon is reached before the string
    runs out. The stop codon itself is not part of the returned protein.

    @param stri - RNA string; every codon read must be a key of the table below
    @return - list of protein strings ordered by start position, duplicates kept
    """
    # dictionary for translation ("*" marks a stop codon)
    codonTable = {
        "AAA":'K', "AAG":'K',
        "GAA":'E', "GAG":'E',
        "AAC":'N', "AAU":'N',
        "GAC":'D', "GAU":'D',
        "ACA":'T', "ACC":'T', "ACG":'T', "ACU":'T',
        "GCA":"A", "GCC":"A", "GCG":"A", "GCU":"A",
        "GGA":"G", "GGC":"G", "GGG":"G", "GGU":"G",
        "GUA":"V", "GUC":"V", "GUG":"V", "GUU":"V",
        "AUG":"M",
        "UAA":"*", "UAG":"*", "UGA":"*",
        "AUC":"I", "AUU":"I", "AUA":"I",
        "UAC":"Y", "UAU":"Y",
        "CAA":"Q", "CAG":"Q",
        "AGC":"S", "AGU":"S", "UCA":"S", "UCC":"S", "UCG":"S", "UCU":"S",
        "CAC":"H", "CAU":"H",
        "UGC":"C", "UGU":"C",
        "CCA":"P", "CCC":"P", "CCG":"P", "CCU":"P",
        "UGG":"W",
        "AGA":"R", "AGG":"R", "CGA":"R", "CGC":"R", "CGG":"R", "CGU":"R",
        "UUA":"L", "UUG":"L", "CUA":"L", "CUC":"L", "CUG":"L", "CUU":"L",
        "UUC":"F", "UUU":"F"
    }
    # list to contain protein sequences
    proteins = []
    # one past the last index at which a full codon still fits
    limit = len(stri) - 2
    for start in range(limit):
        # a reading frame can only open on the start codon
        if codonTable[stri[start:start + 3]] != "M":
            continue
        residues = []
        sawStop = False
        # read codon by codon until a stop codon or the end of the string
        for pos in range(start, limit, 3):
            amino = codonTable[stri[pos:pos + 3]]
            if amino == "*":
                sawStop = True
                break
            residues.append(amino)
        # keep the orf only if it ends with a stop codon
        if sawStop:
            proteins.append("".join(residues))
    # return the protein list
    return proteins
# function to return the reverse complement of a DNA string
def DNACompliment(stri):
    """
    Return stri reversed with A<->T and G<->C swapped.

    Characters other than the uppercase letters A, T, G and C are copied
    unchanged.

    @param stri - DNA string
    @return - the reverse complement as a new string
    """
    pairs = {"A": "T", "T": "A", "G": "C", "C": "G"}
    return "".join(pairs.get(base, base) for base in reversed(stri))
# get file content; the context manager closes the file even if reading fails
with open("rosalind_orf.txt") as f:
    content = f.readlines()
# construct full DNA sequence from every line after the first
# (the first line is skipped; presumably it is the FASTA header)
DNAStr = "".join(content[1:])
# remove whitespace
noWhiteDNA = "".join(DNAStr.split())
# convert DNA to RNA
RNA = noWhiteDNA.replace("T", "U")
# get DNA reverse complement and convert that to RNA as well
revCompRNA = DNACompliment(noWhiteDNA).replace("T", "U")
# get orfs for both sequences
protList = orfGen(RNA)
protList.extend(orfGen(revCompRNA))
# get rid of duplicates; sorting makes the output order the same on every run
# instead of depending on set iteration order
finalList = sorted(set(protList))
# print the list
for i in finalList:
    print(i)

46
python/PROB.py Normal file
View File

@@ -0,0 +1,46 @@
#!/usr/bin/python3
"""
Solutions to the PROB Rosalind Problem - Introduction to Random Strings
Author: Peter Vlasveld
Date: 20190222
"""
import math
# Read in file content into list; the context manager closes the file even
# if reading fails
with open("rosalind_prob.txt") as f1:
    content = f1.readlines()
def calcProb(gc, seq):
    """
    Calculate the probability of the exact sequence given the GC-content.

    Each G or C contributes gc / 2 and each A or T contributes (1 - gc) / 2.
    Any other character (for example a trailing newline) is ignored.

    @param gc - the GC-content given
    @param seq - the sequence being studied
    @return - the raw probability value
    """
    # probability of drawing one specific nucleotide
    gcHalf = gc / 2.0
    atHalf = (1.0 - gc) / 2.0
    perBase = {'A': atHalf, 'T': atHalf, 'G': gcHalf, 'C': gcHalf}
    # multiply the per-letter probabilities in sequence order
    prob = 1.0
    for base in seq:
        if base in perBase:
            prob *= perBase[base]
    return prob
# the GC-content values are the whitespace-separated entries of the second line
gcCons = content[1].split()
# take the base-10 log of the probability calcProb returns for each
# GC-content (the sequence is the first line) and format it to 3 decimal places
finalProbs = [
    "%.3f" % math.log10(calcProb(float(gcText), content[0]))
    for gcText in gcCons
]
# print the probabilities with spaces in between
print(" ".join(finalProbs))

50
python/REVP.py Normal file
View File

@@ -0,0 +1,50 @@
#!/usr/bin/python
#Solution to REVP Rosalind problem - Locating Restriction Sites
#Author: Peter Vlasveld
# read in file and get contents; the context manager closes the file even if
# reading fails
with open("rosalind_revp.txt") as f1:
    content = f1.readlines()
# get DNA string from content: every line after the first, stripped of
# surrounding whitespace and joined in one pass
DNAStr = "".join(line.strip() for line in content[1:])
# function to return the reverse complement of a DNA string
def DNACompliment(stri):
    """
    Build the reverse complement of stri.

    The string is read back to front and each uppercase A, T, G or C is
    replaced by its partner (A<->T, G<->C); any other character is kept as is.

    @param stri - DNA string
    @return - the reverse complement as a new string
    """
    partner = {"A": "T", "T": "A", "G": "C", "C": "G"}
    flipped = [partner[base] if base in partner else base for base in stri[::-1]]
    return "".join(flipped)
# loop through DNA string
indexes = []
lengths = []
# shortest and longest reverse palindrome that is reported
minLen = 4
maxLen = 12
seqLen = len(DNAStr)
for j in range(seqLen - minLen + 1):
    # evaluate every length from minLen to maxLen that still fits inside the
    # string. All lengths are tested (no break after the first hit) so that a
    # longer reverse palindrome starting at the same position is also
    # recorded; capping the length at seqLen - j prevents a slice truncated
    # by the end of the string from being reported with the wrong length.
    for k in range(minLen, min(maxLen, seqLen - j) + 1):
        window = DNAStr[j:j + k]
        # a reverse palindrome equals its own reverse complement
        if DNACompliment(window) == window:
            # positions are reported 1-based
            indexes.append(j + 1)
            lengths.append(k)
            print(window)
# output results to a file, one "position length" pair per line
with open("output.txt", "w") as f2:
    for position, length in zip(indexes, lengths):
        f2.write(str(position) + " " + str(length) + "\n")