# Source code for GetProteinFromUniprot

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
################################################################################################

This module is used to download protein sequences from the UniProt website

(http://www.uniprot.org/). You only need to input a protein ID, or prepare a file

(e.g. ID.txt) listing the protein IDs. You obtain a .txt file (e.g. ProteinSequence.txt)

containing the protein sequences you need. You can freely use and distribute it.

If you have any problems, please contact us!

Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.06.04

Email: gadsby@163.com

################################################################################################
"""

try:
    # Python 3
    from urllib.request import urlopen
except ImportError:
    # Python 2
    from urllib2 import urlopen
# Core Library modules
import string


##################################################################################################
def GetProteinSequence(ProteinID, base_url="http://www.uniprot.org/uniprot/"):
    """
    #########################################################################################
    Get the protein sequence from the uniprot website by ID.

    Usage:

    result=GetProteinSequence(ProteinID)

    Input: ProteinID is a string indicating ID such as "P48039".

    base_url (optional) is the URL prefix to which ProteinID + ".fasta" is appended.
    It defaults to the uniprot address and only needs to be given to fetch the FASTA
    record from another location.

    Output: result is a protein sequence (a string): the first line of the downloaded
    FASTA record (the header) is dropped and the remaining lines are stripped and
    joined together.

    Errors raised by urlopen (for example when the request fails) are not caught here.
    #########################################################################################
    """
    url = base_url + str(ProteinID).strip() + ".fasta"

    response = urlopen(url)
    try:
        lines = response.readlines()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    pieces = []
    # lines[0] is the FASTA header; the sequence starts on the second line.
    for line in lines[1:]:
        # Python 3 returns bytes from urlopen, Python 2 returns str; only decode
        # when needed so the result is a native str on both.
        if not isinstance(line, str):
            line = line.decode("utf-8")
        pieces.append(line.strip())
    return "".join(pieces)


##################################################################################################
def GetProteinSequenceFromTxt(path, openfile, savefile):
    """
    #########################################################################################
    Get the protein sequence from the uniprot website by the file containing ID.

    Usage:

    result=GetProteinSequenceFromTxt(path,openfile,savefile)

    Input: path is a directory path containing the ID file such as "/home/orient/protein/".
    It is prepended to the file names as-is, so it must end with a path separator.

    openfile is the ID file such as "proteinID.txt" (one protein ID per line; blank
    lines are skipped)

    savefile is the file saving the obtained protein sequences such as "protein.txt"
    (one sequence per line; an existing file is overwritten)

    Output: result is 0 once every ID has been processed. Progress and each downloaded
    sequence are printed along the way.
    #########################################################################################
    """
    separator = "--------------------------------------------------------"

    # Open the ID file first so a missing input does not leave behind an empty
    # output file; "with" closes both files even if a download raises.
    with open(path + openfile, "r") as id_file, open(path + savefile, "w") as out_file:
        for index, line in enumerate(id_file):
            protein_id = line.strip()
            if not protein_id:
                continue

            sequence = GetProteinSequence(protein_id)
            print(separator)
            # The reported number is the 1-based line number within the ID file.
            print("The %d protein sequence has been downloaded!" % (index + 1))
            print(sequence)
            out_file.write(sequence + "\n")
            print(separator)
    return 0


##################################################################################################
if __name__ == "__main__":
    # Small demonstration: download and print the sequence of each listed ID.
    SEPARATOR = "--------------------------------------------------------"
    demo_ids = ["P48039"]

    for position, raw_id in enumerate(demo_ids, start=1):
        protein_id = raw_id.strip()
        if not protein_id:
            # Nothing to look up for an empty entry.
            continue

        sequence = GetProteinSequence(protein_id)
        print(SEPARATOR)
        print("The %d protein sequence has been downloaded!" % position)
        print(sequence)
        print(SEPARATOR)