#!/usr/bin/python
#MPRT - Finding a Protein Motif Solution
#Problem can be found at http://rosalind.info/problems/mprt/
#Author: Peter Vlasveld

import urllib2
import re

#declare motif: N, then any residue except P, then S or T, then any residue except P
motif = "N[^P][ST][^P]"


def find_motif_positions(protein, pattern=motif):
    """Return the 1-based start positions of every match of pattern in protein.

    Overlapping occurrences are all reported. The pattern is wrapped in a
    zero-width lookahead so that each start index is tested independently,
    including the final window of the sequence (the previous loop bound of
    len(protein) - 4 skipped that last window).

    protein -- the sequence to scan, as a single string
    pattern -- regular expression source for the motif (defaults to motif)
    """
    lookahead = re.compile("(?=(?:" + pattern + "))")
    return [hit.start() + 1 for hit in lookahead.finditer(protein)]


def fetch_protein(accession):
    """Download the FASTA record for accession and return its sequence.

    Header lines (those starting with '>') are dropped and the remaining
    lines are concatenated into one string. Errors raised by
    urllib2.urlopen propagate to the caller.
    """
    url = "http://www.uniprot.org/uniprot/" + accession + ".fasta"
    response = urllib2.urlopen(url)
    try:
        fasta = response.read().splitlines()
    finally:
        #release the connection even if the read fails
        response.close()
    return "".join(line for line in fasta if not line.startswith('>'))


def main():
    """Read accession IDs, scan each protein, and report motif positions.

    IDs are read one per line from rosalind_mprt.txt. For every protein that
    contains the motif at least once, the ID and then the space-separated
    1-based positions are printed and written to output.txt.
    """
    #read in file
    with open("rosalind_mprt.txt", "r") as infile:
        accessions = [line.strip() for line in infile.read().splitlines()]

    #open output file; the with-block closes it even if a download fails
    with open("output.txt", "w+") as outfile:
        #loop through each accession ID
        for accession in accessions:
            if not accession:
                #a blank line would otherwise request ".../uniprot/.fasta"
                continue

            positions = find_motif_positions(fetch_protein(accession))
            if not positions:
                continue

            #each position is followed by a space, matching the existing output format
            out_str = "".join(str(pos) + " " for pos in positions)

            #output
            print(accession)
            outfile.write(accession + "\n")
            print(out_str)
            outfile.write(out_str + "\n")


if __name__ == "__main__":
    main()