Source code for GetProteinFromUniprot

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
################################################################################################

This module is used to download protein sequences from the UniProt website

(http://www.uniprot.org/). You only need to input a protein ID, or prepare a file (ID.txt)

containing protein IDs. You will obtain a .txt file (ProteinSequence.txt) storing the protein

sequences you need. You can freely use and distribute it. If you have any problems, please contact us!

Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.04

Email: gadsby@163.com

################################################################################################
"""

try:
    # Python 3
    from urllib.request import urlopen
except ImportError:
    # Python 2
    from urllib2 import urlopen
# Core Library modules
import string


##################################################################################################
def GetProteinSequence(ProteinID):
    """
    #########################################################################################
    Get the protein sequence from the uniprot website by ID.

    Usage:

    result=GetProteinSequence(ProteinID)

    Input: ProteinID is a string indicating ID such as "P48039".

    Output: result is a protein sequence, returned as a single string. The first
    line of the downloaded .fasta file (the header) is skipped and the remaining
    lines are stripped and joined together.

    Any error raised by urlopen (e.g. unknown ID, no network) is propagated to
    the caller.
    #########################################################################################
    """
    ID = str(ProteinID)
    localfile = urlopen("http://www.uniprot.org/uniprot/" + ID + ".fasta")
    try:
        temp = localfile.readlines()
    finally:
        # Always release the HTTP connection, even if reading fails.
        localfile.close()

    pieces = []
    # temp[0] is the FASTA header line; only the lines after it carry residues.
    for line in temp[1:]:
        # Under Python 3 urlopen yields bytes, which cannot be concatenated
        # with str; under Python 2 the lines are already str and are left as is.
        if not isinstance(line, str):
            line = line.decode("utf-8")
        pieces.append(line.strip())
    return "".join(pieces)
##################################################################################################
def GetProteinSequenceFromTxt(path, openfile, savefile):
    """
    #########################################################################################
    Get the protein sequence from the uniprot website by the file containing ID.

    Usage:

    result=GetProteinSequenceFromTxt(path,openfile,savefile)

    Input: path is a directory path containing the ID file such as "/home/orient/protein/".
    It is prepended to the file names as-is, so it must end with a path separator.

    openfile is the ID file such as "proteinID.txt" (one protein ID per line;
    blank lines are skipped).

    savefile is the file saving the obtained protein sequences such as "protein.txt"
    (one sequence per line, overwritten if it already exists).

    Output: result is always 0.
    #########################################################################################
    """
    # open() replaces the file() builtin, which no longer exists in Python 3.
    # The with-blocks guarantee both files are closed even if a download raises.
    with open(path + savefile, "wb") as f1:
        with open(path + openfile, "r") as f2:
            for index, i in enumerate(f2):
                itrim = i.strip()
                if itrim == "":
                    continue
                temp = GetProteinSequence(itrim)
                print("--------------------------------------------------------")
                # index counts every line of the ID file, blank ones included.
                print("The %d protein sequence has been downloaded!" % (index + 1))
                print(temp)
                # The output file is opened in binary mode, so encode explicitly;
                # writing str to a "wb" handle raises TypeError on Python 3.
                f1.write((temp + "\n").encode("utf-8"))
                print("--------------------------------------------------------")
    return 0
##################################################################################################
if __name__ == "__main__":
    # Small demonstration: download and print the sequence of each listed ID.
    protein_ids = ["P48039"]
    for position, raw_id in enumerate(protein_ids, start=1):
        protein_id = raw_id.strip()
        # Skip empty entries instead of requesting a meaningless URL.
        if not protein_id:
            continue
        sequence = GetProteinSequence(protein_id)
        print("--------------------------------------------------------")
        print("The %d protein sequence has been downloaded!" % position)
        print(sequence)
        print("--------------------------------------------------------")