# Source code for GetDNA

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""

This module is used for downloading a DNA sequence from the web (the sequence is

requested from the EBI ENA server). You only need to input a DNA ID.


Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.11.04

Email: gadsby@163.com

"""


try:
    # Python 3
    from urllib.request import urlopen
except ImportError:
    # Python 2
    from urllib2 import urlopen
# Core Library modules
import sys

ALPHABET = "ACGT"


class Seq:
    """
    One sequence record: a name, the sequence text and a record number.

    Attributes set by the constructor:
      name   -- the record name, stored as given.
      seq    -- the sequence, stored upper-cased.
      no     -- the record number, stored as given.
      length -- length of the sequence as it was passed in.
    """

    def __init__(self, name, seq, no):
        self.name = name
        self.no = no
        # Upper-case first, then measure the sequence that was passed in.
        self.seq = seq.upper()
        self.length = len(seq)

    def __str__(self):
        """Return the record as a tab-separated header line followed by the sequence."""
        header = "%s\tNo:%s\tlength:%s" % (self.name, str(self.no), str(self.length))
        return "%s\n%s" % (header, self.seq)


def GetDNAFromUniGene(SeqID=""):
    """
    Download the FASTA record of one DNA sequence.

    The request is sent to the EBI ENA server (www.ebi.ac.uk/ena) with
    ``display=fasta``.

    :param SeqID: The sequence ID to look up; it is inserted into the URL
        as given.

    Return the response body exactly as returned by ``read()`` (on
    Python 3 this is a ``bytes`` object). Errors raised by ``urlopen``
    are not caught here.
    """
    # NOTE(review): this is the "data/view" style ENA address; confirm the
    # server still answers it before relying on this function.
    url = "http://www.ebi.ac.uk/ena/data/view/{0}&display=fasta".format(SeqID)
    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()


def IsUnderAlphabet(s, alphabet):
    """
    #################################################################
    Check that every character of a string belongs to an alphabet.

    :param s: The string to check.
    :param alphabet: The allowed characters.

    Return True when all characters are allowed (also for an empty
    string); otherwise return the first character that is not in the
    alphabet.
    #################################################################
    """
    offenders = (symbol for symbol in s if symbol not in alphabet)
    # The generator is lazy, so scanning stops at the first offender.
    return next(offenders, True)


def IsFasta(seq):
    """
    #################################################################
    Judge whether a Seq object is a valid FASTA record.
    Three situations are rejected (checked in this order):
    1. No seq name.
    2. Seq name is illegal (it contains a ">" character).
    3. No sequence (the length is 0).

    :param seq: Seq object; its name, no and length attributes are read.

    Return True for a valid record. Otherwise write an error message
    to stderr and return False.
    #################################################################
    """
    if not seq.name:
        problem = " has no sequence name."
        # Only this case also prints the offending record to stdout.
        print(seq)
    elif ">" in seq.name:
        problem = " name has > character."
    elif seq.length == 0:
        problem = " is null."
    else:
        return True

    sys.stderr.write("Error, sequence " + str(seq.no) + problem)
    return False


def _CheckedSequence(name, parts, number):
    """Join the pieces of one record, validate it with IsFasta and return the sequence string."""
    seq = "".join(parts)
    if not IsFasta(Seq(name, seq, number)):
        # IsFasta has already written the reason to stderr; leave with a
        # failure status so that callers and shells can see the error.
        sys.exit(1)
    return seq


def ReadFasta(f):
    """
    #################################################################
    Read a fasta file.

    Every record is checked with IsFasta as soon as it is complete.
    When a record is rejected, the process is terminated through
    sys.exit with a non-zero status (SystemExit is raised).

    Records are numbered from 1 in the order of their header lines;
    sequence data that appears before any header is reported as
    record 1.

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return a list with one sequence string per record, in file order.
    Each string is the concatenation of the stripped sequence lines as
    read from the input (the strings are not upper-cased).
    #################################################################
    """
    name = ""
    parts = []  # non-empty, stripped sequence lines of the current record
    count = 0  # number of header lines seen so far
    seq_list = []

    for line in f.readlines():
        # startswith() also copes with an empty string, unlike line[0].
        if line.startswith(">"):
            # A header closes the previous record, or header-less data
            # found at the very start of the input.
            if count != 0 or parts:
                seq_list.append(_CheckedSequence(name, parts, max(count, 1)))

            parts = []
            name = line[1:].strip()
            count += 1
        else:
            piece = line.strip()
            if piece:
                parts.append(piece)

    # The last record has no following header to close it. Its number is
    # the current header count, not count + 1.
    seq_list.append(_CheckedSequence(name, parts, max(count, 1)))

    return seq_list


if __name__ == "__main__":
    # Demo run: fetches one sequence over the network and parses the example
    # FASTA file. It needs an installed PyBioMed for the timelimited decorator.
    print("-" * 10 + "START" + "-" * 10)
    print("Only PyBioMed is successfully installed the code below can be run！")
    from PyBioMed.PyGetMol.GetProtein import timelimited

    # NOTE(review): timelimited(10) presumably bounds each call to 10 seconds;
    # confirm against PyBioMed.PyGetMol.GetProtein.
    @timelimited(10)
    def run_GetDNAFromUniGene():
        seqid = "AA954964"
        print(GetDNAFromUniGene(seqid))

    @timelimited(10)
    def run_ReadFasta():
        # Close the example file as soon as it has been parsed.
        with open("../test/test_data/example.fasta") as handle:
            dna = ReadFasta(handle)
        print(dna)

    run_GetDNAFromUniGene()
    print("-" * 25)
    run_ReadFasta()
    print("-" * 10 + "END" + "-" * 10)