# -*- coding: utf-8 -*-
# Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
# All rights reserved.
# This file is part of the PyBioMed.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the PyBioMed source tree.
"""
Used for process original data.
Created on Wed May 18 14:06:37 2016
@author: yzj
"""
# Core Library modules
import sys
ALPHABET = "ACGT"
[docs]class Seq:
def __init__(self, name, seq, no):
self.name = name
self.seq = seq.upper()
self.no = no
self.length = len(seq)
def __str__(self):
"""Output seq when 'print' method is called."""
return "%s\tNo:%s\tlength:%s\n%s" % (
self.name,
str(self.no),
str(self.length),
self.seq,
)
[docs]def IsUnderAlphabet(s, alphabet):
"""
#################################################################
Judge the string is within the scope of the alphabet or not.
:param s: The string.
:param alphabet: alphabet.
Return True or the error character.
#################################################################
"""
for e in s:
if e not in alphabet:
return e
return True
[docs]def IsFasta(seq):
"""
#################################################################
Judge the Seq object is in FASTA format.
Two situation:
1. No seq name.
2. Seq name is illegal.
3. No sequence.
:param seq: Seq object.
#################################################################
"""
if not seq.name:
error_info = "Error, sequence " + str(seq.no) + " has no sequence name."
print(seq)
sys.stderr.write(error_info)
return False
if -1 != seq.name.find(">"):
error_info = "Error, sequence " + str(seq.no) + " name has > character."
sys.stderr.write(error_info)
return False
if 0 == seq.length:
error_info = "Error, sequence " + str(seq.no) + " is null."
sys.stderr.write(error_info)
return False
return True
[docs]def ReadFasta(f):
"""
#################################################################
Read a fasta file.
:param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
Return Seq obj list.
#################################################################
"""
name, seq = "", ""
count = 0
seq_list = []
lines = f.readlines()
for line in lines:
if not line:
break
if ">" == line[0]:
if 0 != count or (0 == count and seq != ""):
if IsFasta(Seq(name, seq, count)):
seq_list.append(Seq(name, seq, count))
else:
sys.exit(0)
seq = ""
name = line[1:].strip()
count += 1
else:
seq += line.strip()
count += 1
if IsFasta(Seq(name, seq, count)):
seq_list.append(Seq(name, seq, count))
else:
sys.exit(0)
return seq_list
[docs]def ReadFastaYield(f):
"""
#################################################################
Yields a Seq object.
:param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
#################################################################
"""
name, seq = "", ""
count = 0
while True:
line = f.readline()
if not line:
break
if ">" == line[0]:
if 0 != count or (0 == count and seq != ""):
if IsFasta(Seq(name, seq, count)):
yield Seq(name, seq, count)
else:
sys.exit(0)
seq = ""
name = line[1:].strip()
count += 1
else:
seq += line.strip()
if IsFasta(Seq(name, seq, count)):
yield Seq(name, seq, count)
else:
sys.exit(0)
[docs]def ReadFastaCheckDna(f):
"""
#################################################################
Read the fasta file, and check its legality.
:param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
Return the seq list.
#################################################################
"""
seq_list = []
for e in ReadFastaYield(f):
# print e
res = IsUnderAlphabet(e.seq, ALPHABET)
if res:
seq_list.append(e)
else:
error_info = (
"Sorry, sequence "
+ str(e.no)
+ " has character "
+ str(res)
+ ".(The character must be A or C or G or T)"
)
sys.stderr(error_info)
sys.exit(0)
return seq_list
[docs]def GetSequenceCheckDna(f):
"""
#################################################################
Read the fasta file.
Input: f: HANDLE to input. e.g. sys.stdin, or open(<file>)
Return the sequence list.
#################################################################
"""
sequence_list = []
for e in ReadFastaYield(f):
# print e
res = IsUnderAlphabet(e.seq, ALPHABET)
if res is not True:
error_info = (
"Sorry, sequence "
+ str(e.no)
+ " has character "
+ str(res)
+ ".(The character must be A, C, G or T)"
)
sys.stderr.write(error_info)
sys.exit(0)
else:
sequence_list.append(e.seq)
return sequence_list
[docs]def IsSequenceList(sequence_list):
"""
#################################################################
Judge the sequence list is within the scope of alphabet and
change the lowercase to capital.
#################################################################
"""
count = 0
new_sequence_list = []
for e in sequence_list:
e = e.upper()
count += 1
res = IsUnderAlphabet(e, ALPHABET)
if res is not True:
error_info = (
"Sorry, sequence "
+ str(count)
+ " has illegal character "
+ str(res)
+ ".(The character must be A, C, G or T)"
)
sys.stderr.write(error_info)
return False
else:
new_sequence_list.append(e)
return new_sequence_list
[docs]def GetData(input_data, desc=False):
"""
#################################################################
Get sequence data from file or list with check.
:param input_data: type file or list
:param desc: with this option, the return value will be a Seq object list(it only works in file object).
:return: sequence data or shutdown.
#################################################################
"""
if hasattr(input_data, "read"):
if desc is False:
return GetSequenceCheckDna(input_data)
else:
return ReadFastaCheckDna(input_data)
elif isinstance(input_data, list):
input_data = IsSequenceList(input_data)
if input_data is not False:
return input_data
else:
sys.exit(0)
else:
error_info = (
"Sorry, the parameter in get_data method must be list or file type."
)
sys.stderr.write(error_info)
sys.exit(0)
# Some basic function for generate feature vector
[docs]def Frequency(tol_str, tar_str):
"""
#################################################################
Generate the frequency of tar_str in tol_str.
:param tol_str: mother string.
:param tar_str: substring.
#################################################################
"""
i, j, tar_count = 0, 0, 0
len_tol_str = len(tol_str)
len_tar_str = len(tar_str)
while i < len_tol_str and j < len_tar_str:
if tol_str[i] == tar_str[j]:
i += 1
j += 1
if j >= len_tar_str:
tar_count += 1
i = i - j + 1
j = 0
else:
i = i - j + 1
j = 0
return tar_count
[docs]def WriteLibsvm(vector_list, label_list, write_file):
"""
#################################################################
Write the vector into disk in libSVM format.
#################################################################
"""
len_vector_list = len(vector_list)
len_label_list = len(label_list)
if len_vector_list == 0:
sys.stderr.write("The vector is none.")
sys.exit(1)
if len_label_list == 0:
sys.stderr.write("The label is none.")
sys.exit(1)
if len_vector_list != len_label_list:
sys.stderr.write("The length of vector and label is different.")
sys.exit(1)
with open(write_file, "w") as f:
len_vector = len(vector_list[0])
for i in range(len_vector_list):
temp_write = str(label_list[i])
for j in range(0, len_vector):
temp_write += " " + str(j + 1) + ":" + str(vector_list[i][j])
f.write(temp_write)
f.write("\n")
[docs]def GeneratePhycheValue(
k, phyche_index=None, all_property=False, extra_phyche_index=None
):
"""
#################################################################
Combine the user selected phyche_list, is_all_property and
extra_phyche_index to a new standard phyche_value.
#################################################################
"""
if phyche_index is None:
phyche_index = []
if extra_phyche_index is None:
extra_phyche_index = {}
diphyche_list = [
"Base stacking",
"Protein induced deformability",
"B-DNA twist",
"Dinucleotide GC Content",
"A-philicity",
"Propeller twist",
"Duplex stability:(freeenergy)",
"Duplex tability(disruptenergy)",
"DNA denaturation",
"Bending stiffness",
"Protein DNA twist",
"Stabilising energy of Z-DNA",
"Aida_BA_transition",
"Breslauer_dG",
"Breslauer_dH",
"Breslauer_dS",
"Electron_interaction",
"Hartman_trans_free_energy",
"Helix-Coil_transition",
"Ivanov_BA_transition",
"Lisser_BZ_transition",
"Polar_interaction",
"SantaLucia_dG",
"SantaLucia_dH",
"SantaLucia_dS",
"Sarai_flexibility",
"Stability",
"Stacking_energy",
"Sugimoto_dG",
"Sugimoto_dH",
"Sugimoto_dS",
"Watson-Crick_interaction",
"Twist",
"Tilt",
"Roll",
"Shift",
"Slide",
"Rise",
]
triphyche_list = [
"Dnase I",
"Bendability (DNAse)",
"Bendability (consensus)",
"Trinucleotide GC Content",
"Nucleosome positioning",
"Consensus_roll",
"Consensus-Rigid",
"Dnase I-Rigid",
"MW-Daltons",
"MW-kg",
"Nucleosome",
"Nucleosome-Rigid",
]
# Set and check physicochemical properties.
if 2 == k:
if all_property is True:
phyche_index = diphyche_list
else:
for e in phyche_index:
if e not in diphyche_list:
error_info = (
"Sorry, the physicochemical properties " + e + " is not exit."
)
import sys
sys.stderr.write(error_info)
sys.exit(0)
elif 3 == k:
if all_property is True:
phyche_index = triphyche_list
else:
for e in phyche_index:
if e not in triphyche_list:
error_info = (
"Sorry, the physicochemical properties " + e + " is not exit."
)
import sys
sys.stderr.write(error_info)
sys.exit(0)
# Generate phyche_value.
from PyBioMed.PyDNA.PyDNApsenacutil import GetPhycheIndex, ExtendPhycheIndex
return ExtendPhycheIndex(GetPhycheIndex(k, phyche_index), extra_phyche_index)
[docs]def ConvertPhycheIndexToDict(phyche_index):
"""
#################################################################
Convert phyche index from list to dict.
#################################################################
"""
# for e in phyche_index:
# print e
len_index_value = len(phyche_index[0])
k = 0
for i in range(1, 10):
if len_index_value < 4 ** i:
error_infor = "Sorry, the number of each index value is must be 4^k."
sys.stdout.write(error_infor)
sys.exit(0)
if len_index_value == 4 ** i:
k = i
break
from PyBioMed.PyDNA.PyDNAnacutil import MakeKmerList
kmer_list = MakeKmerList(k, ALPHABET)
# print kmer_list
len_kmer = len(kmer_list)
phyche_index_dict = {}
for kmer in kmer_list:
phyche_index_dict[kmer] = []
# print phyche_index_dict
phyche_index = list(zip(*phyche_index))
for i in range(len_kmer):
phyche_index_dict[kmer_list[i]] = list(phyche_index[i])
return phyche_index_dict
[docs]def StandardDeviation(value_list):
"""
#################################################################
Return standard deviation.
#################################################################
"""
from math import sqrt
from math import pow
n = len(value_list)
average_value = sum(value_list) * 1.0 / n
return sqrt(sum([pow(e - average_value, 2) for e in value_list]) * 1.0 / (n - 1))
[docs]def NormalizeIndex(phyche_index, is_convert_dict=False):
"""
#################################################################
Normalize the physicochemical index.
#################################################################
"""
normalize_phyche_value = []
for phyche_value in phyche_index:
average_phyche_value = sum(phyche_value) * 1.0 / len(phyche_value)
sd_phyche = StandardDeviation(phyche_value)
normalize_phyche_value.append(
[round((e - average_phyche_value) / sd_phyche, 2) for e in phyche_value]
)
if is_convert_dict is True:
return ConvertPhycheIndexToDict(normalize_phyche_value)
return normalize_phyche_value
[docs]def DNAChecks(s):
for e in s:
if e not in ALPHABET:
return e
return True
if __name__ == "__main__":
re = ["GACTGAACTGCACTTTGGTTTCATATTATTTGCTC"]
phyche_index = [
[
1.019,
-0.918,
0.488,
0.567,
0.567,
-0.070,
-0.579,
0.488,
-0.654,
-2.455,
-0.070,
-0.918,
1.603,
-0.654,
0.567,
1.019,
]
]
print(NormalizeIndex(phyche_index, is_convert_dict=False)[0])