# Source code for GetDNA

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""

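This module downloads a DNA sequence from the web by its sequence ID (the

request is sent to the EBI ENA service) and reads sequences from FASTA files.

To download a sequence you only need to provide its DNA sequence ID.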


Authors: Zhijiang Yao and Dongsheng Cao.

Date: 2016.11.04

Email: gadsby@163.com

"""


try:
    # Python 3
    from urllib.request import urlopen
except ImportError:
    # Python 2
    from urllib2 import urlopen
# Core Library modules
import sys

# The four DNA nucleotide letters. Not referenced by the code visible in this
# file; presumably intended as the `alphabet` argument of IsUnderAlphabet --
# TODO confirm against callers.
ALPHABET = "ACGT"


class Seq:
    """
    A single named sequence record.

    :param name: Sequence name.
    :param seq: Sequence text; it is stored upper-cased in ``self.seq``.
    :param no: Number of the sequence; it is echoed in the string form of the
        record.
    """

    def __init__(self, name, seq, no):
        self.name = name
        self.seq = seq.upper()
        self.no = no
        # Measure the stored (upper-cased) text so that ``length`` always
        # agrees with ``self.seq``, even for characters whose upper-case form
        # is longer than the original.
        self.length = len(self.seq)

    def __str__(self):
        """Output seq when 'print' method is called."""
        return "%s\tNo:%s\tlength:%s\n%s" % (
            self.name,
            str(self.no),
            str(self.length),
            self.seq,
        )
def GetDNAFromUniGene(SeqID=""):
    """
    Download the FASTA record of a DNA sequence from the EBI ENA web service.

    :param SeqID: ID of the DNA sequence, e.g. "AA954964".

    Return the response body exactly as read from the HTTP response.
    """
    # NOTE(review): this looks like ENA's legacy "data/view" endpoint; confirm
    # that it is still served before relying on it.
    url = "http://www.ebi.ac.uk/ena/data/view/{0}&display=fasta".format(SeqID)
    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even if reading fails. try/finally is
        # used instead of `with` because urllib2 responses on Python 2 are not
        # context managers.
        response.close()
def IsUnderAlphabet(s, alphabet):
    """
    Judge whether the string is within the scope of the alphabet or not.

    :param s: The string.
    :param alphabet: alphabet.

    Return True if every character of ``s`` is in ``alphabet`` (an empty
    string also gives True); otherwise return the first character of ``s``
    that is not in ``alphabet``.
    """
    for e in s:
        if e not in alphabet:
            # Report the offending character instead of a plain False.
            return e
    return True
def IsFasta(seq):
    """
    Judge whether the Seq object is in FASTA format.

    The record is rejected in three situations:
    1. No seq name.
    2. Seq name is illegal (it contains a ">" character).
    3. No sequence.

    :param seq: Seq object (its ``name``, ``no`` and ``length`` are read).

    Return True if the record is valid; otherwise write an error message to
    stderr and return False.
    """
    if not seq.name:
        print(seq)
        # Messages end with a newline so consecutive errors (and the shell
        # prompt after an exit) do not run together on one line.
        sys.stderr.write(
            "Error, sequence " + str(seq.no) + " has no sequence name.\n"
        )
        return False
    if ">" in seq.name:
        sys.stderr.write(
            "Error, sequence " + str(seq.no) + " name has > character.\n"
        )
        return False
    if 0 == seq.length:
        sys.stderr.write("Error, sequence " + str(seq.no) + " is null.\n")
        return False
    return True
def _StoreRecord(seq_list, name, seq, no):
    """Append ``seq`` to ``seq_list`` if the record passes IsFasta, else exit."""
    if not IsFasta(Seq(name, seq, no)):
        # NOTE(review): the process exits with status 0 even though the input
        # was rejected; kept as-is because callers may rely on it.
        sys.exit(0)
    seq_list.append(seq)


def ReadFasta(f):
    """
    Read a fasta file.

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return a list holding the sequence text (a string, not a Seq object) of
    every record, in input order. The text is the record's lines stripped and
    concatenated; it is returned as read, without upper-casing.

    If a record fails the IsFasta check, the process exits via sys.exit(0).
    """
    name = ""
    parts = []  # non-empty, stripped sequence lines of the current record
    count = 0  # number of header (">") lines seen so far
    seq_list = []

    for line in f.readlines():
        if not line:
            # Defensive: readlines() on a real file never yields "".
            break
        if line.startswith(">"):
            # A new header closes the previous record, if there is one: either
            # a header was already seen, or sequence text preceded the first
            # header.
            if count != 0 or parts:
                _StoreRecord(seq_list, name, "".join(parts), count)
            parts = []
            name = line[1:].strip()
            count += 1
        else:
            chunk = line.strip()
            if chunk:
                parts.append(chunk)

    # The trailing record has no following header to close it. It is numbered
    # count + 1, which only affects the number shown in error messages.
    _StoreRecord(seq_list, name, "".join(parts), count + 1)
    return seq_list
if __name__ == "__main__":
    # Manual smoke test: download one sequence and parse the bundled example
    # FASTA file. It needs an installed PyBioMed and network access.
    print("-" * 10 + "START" + "-" * 10)
    print("Only PyBioMed is successfully installed the code below can be run!")
    from PyBioMed.PyGetMol.GetProtein import timelimited

    # NOTE(review): timelimited(10) presumably aborts the call after 10
    # seconds -- confirm against PyBioMed.PyGetMol.GetProtein.
    @timelimited(10)
    def run_GetDNAFromUniGene():
        seqid = "AA954964"
        print(GetDNAFromUniGene(seqid))

    @timelimited(10)
    def run_ReadFasta():
        # Close the example file once it has been parsed.
        with open("../test/test_data/example.fasta") as fasta_file:
            dna = ReadFasta(fasta_file)
        print(dna)

    run_GetDNAFromUniGene()
    print("-" * 25)
    run_ReadFasta()
    print("-" * 10 + "END" + "-" * 10)