# Source code for GetDNA
# -*- coding: utf-8 -*-
# Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
# All rights reserved.
# This file is part of the PyBioMed.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the PyBioMed source tree.
"""
This module is used for downloading a DNA sequence from the EBI ENA web
service. You only need to input a DNA ID.
Authors: Zhijiang Yao and Dongsheng Cao.
Date: 2016.11.04
Email: gadsby@163.com
"""
try:
# Python 3
from urllib.request import urlopen
except ImportError:
# Python 2
from urllib2 import urlopen
# Core Library modules
import sys
# The four characters "A", "C", "G" and "T". Not referenced by any other code
# visible in this module; presumably meant to be passed as the ``alphabet``
# argument of IsUnderAlphabet -- TODO confirm against callers.
ALPHABET = "ACGT"
class Seq(object):
    """
    A single named sequence together with its number and length.

    :param name: Sequence name.
    :param seq: Sequence string; it is stored upper-cased in ``self.seq``.
    :param no: Number of the sequence, stored unchanged in ``self.no``.

    ``self.length`` holds the length of the stored (upper-cased) sequence.
    """

    def __init__(self, name, seq, no):
        self.name = name
        self.seq = seq.upper()
        self.no = no
        # Measure the stored text rather than the raw argument so that
        # ``length`` always agrees with ``self.seq``: upper-casing can change
        # the length of some non-ASCII strings.
        self.length = len(self.seq)

    def __str__(self):
        """Output seq when 'print' method is called."""
        return "%s\tNo:%s\tlength:%s\n%s" % (
            self.name,
            str(self.no),
            str(self.length),
            self.seq,
        )
def GetDNAFromUniGene(SeqID=""):
    """
    Download the FASTA record of a DNA sequence from the EBI ENA web service.

    :param SeqID: The DNA ID that is inserted into the request URL.
    Return the response body exactly as read from the HTTP response.
    Any error raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    # Imported here so the function brings its own helper into scope without
    # touching the module-level imports.
    from contextlib import closing

    # NOTE(review): this is the URL form the module has always used; confirm
    # that the ENA service still answers on this path.
    url = "http://www.ebi.ac.uk/ena/data/view/{0}&display=fasta".format(SeqID)
    # ``closing`` guarantees the connection is released even when ``read``
    # raises, and works with both the Python 2 and Python 3 response objects.
    with closing(urlopen(url)) as response:
        return response.read()
def IsUnderAlphabet(s, alphabet):
    """
    #################################################################
    Check whether every character of a string belongs to an alphabet.
    :param s: The string to scan.
    :param alphabet: Container of the permitted characters.
    Return True when all characters are permitted, otherwise the first
    character of the string that is not in the alphabet.
    #################################################################
    """
    # Lazily walk the string; the first offender (if any) is the answer,
    # and True is the fallback when the generator is exhausted.
    offenders = (char for char in s if char not in alphabet)
    return next(offenders, True)
def IsFasta(seq):
    """
    #################################################################
    Judge the Seq object is in FASTA format.
    Three situations are rejected:
    1. No seq name.
    2. Seq name is illegal (it contains a '>' character).
    3. No sequence.
    :param seq: Seq object (its ``name``, ``no`` and ``length`` are read).
    Return True when the record passes every check. Otherwise write the
    reason to stderr, one message per line, and return False.
    #################################################################
    """
    if not seq.name:
        error_info = "Error, sequence " + str(seq.no) + " has no sequence name."
        # The nameless record itself is shown on stdout, as before, so the
        # user can see which sequence the message refers to.
        print(seq)
    elif ">" in seq.name:
        error_info = "Error, sequence " + str(seq.no) + " name has > character."
    elif seq.length == 0:
        error_info = "Error, sequence " + str(seq.no) + " is null."
    else:
        return True

    # Terminate the message with a newline so consecutive diagnostics (or the
    # shell prompt after an exit) do not run together on one line.
    sys.stderr.write(error_info + "\n")
    return False
def ReadFasta(f):
    """
    #################################################################
    Read a fasta file.
    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
    Return the list of sequence strings, one per record, in file order.
    Each record is checked with IsFasta; when a record fails the check
    the interpreter exits with status 1 (SystemExit is raised).
    #################################################################
    """
    name = ""
    chunks = []  # pieces of the record currently being collected
    count = 0  # number of header lines seen so far
    seq_list = []

    for line in f.readlines():
        if not line:
            break
        if line[0] == ">":
            pending = "".join(chunks)
            # A header closes the previous record. Before the first header
            # there is only something to close if sequence text was seen.
            if count != 0 or pending != "":
                _append_checked_sequence(seq_list, name, pending, count)
                chunks = []
            name = line[1:].strip()
            count += 1
        else:
            chunks.append(line.strip())

    # The last record is number ``count`` (the number of headers seen), not
    # ``count + 1``. Input without any header is still reported as
    # sequence 1.
    _append_checked_sequence(seq_list, name, "".join(chunks), max(count, 1))
    return seq_list


def _append_checked_sequence(seq_list, name, seq, number):
    """Append ``seq`` to ``seq_list`` if it is a valid record, else exit."""
    if not IsFasta(Seq(name, seq, number)):
        # IsFasta has already written the reason to stderr; a failure must
        # not be reported to the shell as success.
        sys.exit(1)
    seq_list.append(seq)
if __name__ == "__main__":
    # Demo / smoke run: download one sequence, then parse a local FASTA file.
    print("-" * 10 + "START" + "-" * 10)
    print("Only PyBioMed is successfully installed the code below can be run!")
    from PyBioMed.PyGetMol.GetProtein import timelimited

    # NOTE(review): ``timelimited(10)`` presumably bounds each demo call to
    # 10 seconds -- confirm against PyBioMed.PyGetMol.GetProtein.
    @timelimited(10)
    def run_GetDNAFromUniGene():
        seqid = "AA954964"
        print(GetDNAFromUniGene(seqid))

    @timelimited(10)
    def run_ReadFasta():
        # Close the example file as soon as it has been parsed.
        with open("../test/test_data/example.fasta") as handle:
            dna = ReadFasta(handle)
        print(dna)

    run_GetDNAFromUniGene()
    print("-" * 25)
    run_ReadFasta()
    print("-" * 10 + "END" + "-" * 10)