Source code for PyPretreatDNA

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
Used for process original data.

Created on Wed May 18 14:06:37 2016

@author: yzj
"""


# Core Library modules
import sys

ALPHABET = "ACGT"


[docs]class Seq:
    def __init__(self, name, seq, no):
        self.name = name
        self.seq = seq.upper()
        self.no = no
        self.length = len(seq)

    def __str__(self):
        """Output seq when 'print' method is called."""
        return "%s\tNo:%s\tlength:%s\n%s" % (
            self.name,
            str(self.no),
            str(self.length),
            self.seq,
        )


[docs]def IsUnderAlphabet(s, alphabet):
    """
    #################################################################
    Judge the string is within the scope of the alphabet or not.

    :param s: The string.
    :param alphabet: alphabet.

    Return True or the error character.
    #################################################################
    """
    for e in s:
        if e not in alphabet:
            return e

    return True


[docs]def IsFasta(seq):
    """
    #################################################################
    Judge the Seq object is in FASTA format.
    Two situation:
    1. No seq name.
    2. Seq name is illegal.
    3. No sequence.

    :param seq: Seq object.
    #################################################################
    """
    if not seq.name:
        error_info = "Error, sequence " + str(seq.no) + " has no sequence name."
        print(seq)
        sys.stderr.write(error_info)
        return False
    if -1 != seq.name.find(">"):
        error_info = "Error, sequence " + str(seq.no) + " name has > character."
        sys.stderr.write(error_info)
        return False
    if 0 == seq.length:
        error_info = "Error, sequence " + str(seq.no) + " is null."
        sys.stderr.write(error_info)
        return False

    return True


[docs]def ReadFasta(f):
    """
    #################################################################
    Read a fasta file.

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return Seq obj list.
    #################################################################
    """
    name, seq = "", ""
    count = 0
    seq_list = []
    lines = f.readlines()
    for line in lines:
        if not line:
            break

        if ">" == line[0]:
            if 0 != count or (0 == count and seq != ""):
                if IsFasta(Seq(name, seq, count)):
                    seq_list.append(Seq(name, seq, count))
                else:
                    sys.exit(0)

            seq = ""
            name = line[1:].strip()
            count += 1
        else:
            seq += line.strip()

    count += 1
    if IsFasta(Seq(name, seq, count)):
        seq_list.append(Seq(name, seq, count))
    else:
        sys.exit(0)

    return seq_list


[docs]def ReadFastaYield(f):
    """
    #################################################################
    Yields a Seq object.

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)
    #################################################################
    """
    name, seq = "", ""
    count = 0
    while True:
        line = f.readline()
        if not line:
            break

        if ">" == line[0]:
            if 0 != count or (0 == count and seq != ""):
                if IsFasta(Seq(name, seq, count)):
                    yield Seq(name, seq, count)
                else:
                    sys.exit(0)

            seq = ""
            name = line[1:].strip()
            count += 1
        else:
            seq += line.strip()

    if IsFasta(Seq(name, seq, count)):
        yield Seq(name, seq, count)
    else:
        sys.exit(0)


[docs]def ReadFastaCheckDna(f):
    """
    #################################################################
    Read the fasta file, and check its legality.

    :param f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return the seq list.
    #################################################################
    """
    seq_list = []
    for e in ReadFastaYield(f):
        # print e
        res = IsUnderAlphabet(e.seq, ALPHABET)
        if res:
            seq_list.append(e)
        else:
            error_info = (
                "Sorry, sequence "
                + str(e.no)
                + " has character "
                + str(res)
                + ".(The character must be A or C or G or T)"
            )
            sys.stderr(error_info)
            sys.exit(0)

    return seq_list


[docs]def GetSequenceCheckDna(f):
    """
    #################################################################
    Read the fasta file.

    Input: f: HANDLE to input. e.g. sys.stdin, or open(<file>)

    Return the sequence list.
    #################################################################
    """
    sequence_list = []
    for e in ReadFastaYield(f):
        # print e
        res = IsUnderAlphabet(e.seq, ALPHABET)
        if res is not True:
            error_info = (
                "Sorry, sequence "
                + str(e.no)
                + " has character "
                + str(res)
                + ".(The character must be A, C, G or T)"
            )
            sys.stderr.write(error_info)
            sys.exit(0)
        else:
            sequence_list.append(e.seq)

    return sequence_list


[docs]def IsSequenceList(sequence_list):
    """
    #################################################################
    Judge the sequence list is within the scope of alphabet and
    change the lowercase to capital.
    #################################################################
    """
    count = 0
    new_sequence_list = []

    for e in sequence_list:
        e = e.upper()
        count += 1
        res = IsUnderAlphabet(e, ALPHABET)
        if res is not True:
            error_info = (
                "Sorry, sequence "
                + str(count)
                + " has illegal character "
                + str(res)
                + ".(The character must be A, C, G or T)"
            )
            sys.stderr.write(error_info)
            return False
        else:
            new_sequence_list.append(e)

    return new_sequence_list


[docs]def GetData(input_data, desc=False):
    """
    #################################################################
    Get sequence data from file or list with check.

    :param input_data: type file or list
    :param desc: with this option, the return value will be a Seq object list(it only works in file object).
    :return: sequence data or shutdown.
    #################################################################
    """
    if hasattr(input_data, "read"):
        if desc is False:
            return GetSequenceCheckDna(input_data)
        else:
            return ReadFastaCheckDna(input_data)
    elif isinstance(input_data, list):
        input_data = IsSequenceList(input_data)
        if input_data is not False:
            return input_data
        else:
            sys.exit(0)
    else:
        error_info = (
            "Sorry, the parameter in get_data method must be list or file type."
        )
        sys.stderr.write(error_info)
        sys.exit(0)


# Some basic function for generate feature vector


[docs]def Frequency(tol_str, tar_str):
    """
    #################################################################
    Generate the frequency of tar_str in tol_str.

    :param tol_str: mother string.
    :param tar_str: substring.
    #################################################################
    """
    i, j, tar_count = 0, 0, 0
    len_tol_str = len(tol_str)
    len_tar_str = len(tar_str)
    while i < len_tol_str and j < len_tar_str:
        if tol_str[i] == tar_str[j]:
            i += 1
            j += 1
            if j >= len_tar_str:
                tar_count += 1
                i = i - j + 1
                j = 0
        else:
            i = i - j + 1
            j = 0

    return tar_count


[docs]def WriteLibsvm(vector_list, label_list, write_file):
    """
    #################################################################
    Write the vector into disk in libSVM format.
    #################################################################
    """
    len_vector_list = len(vector_list)
    len_label_list = len(label_list)
    if len_vector_list == 0:
        sys.stderr.write("The vector is none.")
        sys.exit(1)
    if len_label_list == 0:
        sys.stderr.write("The label is none.")
        sys.exit(1)
    if len_vector_list != len_label_list:
        sys.stderr.write("The length of vector and label is different.")
        sys.exit(1)

    with open(write_file, "w") as f:
        len_vector = len(vector_list[0])
        for i in range(len_vector_list):
            temp_write = str(label_list[i])
            for j in range(0, len_vector):
                temp_write += " " + str(j + 1) + ":" + str(vector_list[i][j])
            f.write(temp_write)
            f.write("\n")


[docs]def GeneratePhycheValue(
    k, phyche_index=None, all_property=False, extra_phyche_index=None
):
    """
    #################################################################
    Combine the user selected phyche_list, is_all_property and
    extra_phyche_index to a new standard phyche_value.
    #################################################################
    """
    if phyche_index is None:
        phyche_index = []
    if extra_phyche_index is None:
        extra_phyche_index = {}

    diphyche_list = [
        "Base stacking",
        "Protein induced deformability",
        "B-DNA twist",
        "Dinucleotide GC Content",
        "A-philicity",
        "Propeller twist",
        "Duplex stability:(freeenergy)",
        "Duplex tability(disruptenergy)",
        "DNA denaturation",
        "Bending stiffness",
        "Protein DNA twist",
        "Stabilising energy of Z-DNA",
        "Aida_BA_transition",
        "Breslauer_dG",
        "Breslauer_dH",
        "Breslauer_dS",
        "Electron_interaction",
        "Hartman_trans_free_energy",
        "Helix-Coil_transition",
        "Ivanov_BA_transition",
        "Lisser_BZ_transition",
        "Polar_interaction",
        "SantaLucia_dG",
        "SantaLucia_dH",
        "SantaLucia_dS",
        "Sarai_flexibility",
        "Stability",
        "Stacking_energy",
        "Sugimoto_dG",
        "Sugimoto_dH",
        "Sugimoto_dS",
        "Watson-Crick_interaction",
        "Twist",
        "Tilt",
        "Roll",
        "Shift",
        "Slide",
        "Rise",
    ]
    triphyche_list = [
        "Dnase I",
        "Bendability (DNAse)",
        "Bendability (consensus)",
        "Trinucleotide GC Content",
        "Nucleosome positioning",
        "Consensus_roll",
        "Consensus-Rigid",
        "Dnase I-Rigid",
        "MW-Daltons",
        "MW-kg",
        "Nucleosome",
        "Nucleosome-Rigid",
    ]

    # Set and check physicochemical properties.
    if 2 == k:
        if all_property is True:
            phyche_index = diphyche_list
        else:
            for e in phyche_index:
                if e not in diphyche_list:
                    error_info = (
                        "Sorry, the physicochemical properties " + e + " is not exit."
                    )
                    import sys

                    sys.stderr.write(error_info)
                    sys.exit(0)
    elif 3 == k:
        if all_property is True:
            phyche_index = triphyche_list
        else:
            for e in phyche_index:
                if e not in triphyche_list:
                    error_info = (
                        "Sorry, the physicochemical properties " + e + " is not exit."
                    )
                    import sys

                    sys.stderr.write(error_info)
                    sys.exit(0)

    # Generate phyche_value.
    from PyBioMed.PyDNA.PyDNApsenacutil import GetPhycheIndex, ExtendPhycheIndex

    return ExtendPhycheIndex(GetPhycheIndex(k, phyche_index), extra_phyche_index)


[docs]def ConvertPhycheIndexToDict(phyche_index):
    """
    #################################################################
    Convert phyche index from list to dict.
    #################################################################
    """
    # for e in phyche_index:
    #     print e
    len_index_value = len(phyche_index[0])
    k = 0
    for i in range(1, 10):
        if len_index_value < 4 ** i:
            error_infor = "Sorry, the number of each index value is must be 4^k."
            sys.stdout.write(error_infor)
            sys.exit(0)
        if len_index_value == 4 ** i:
            k = i
            break
    from PyBioMed.PyDNA.PyDNAnacutil import MakeKmerList

    kmer_list = MakeKmerList(k, ALPHABET)
    # print kmer_list
    len_kmer = len(kmer_list)
    phyche_index_dict = {}
    for kmer in kmer_list:
        phyche_index_dict[kmer] = []
    # print phyche_index_dict
    phyche_index = list(zip(*phyche_index))
    for i in range(len_kmer):
        phyche_index_dict[kmer_list[i]] = list(phyche_index[i])

    return phyche_index_dict


[docs]def StandardDeviation(value_list):
    """
    #################################################################
    Return standard deviation.
    #################################################################
    """
    from math import sqrt
    from math import pow

    n = len(value_list)
    average_value = sum(value_list) * 1.0 / n
    return sqrt(sum([pow(e - average_value, 2) for e in value_list]) * 1.0 / (n - 1))


[docs]def NormalizeIndex(phyche_index, is_convert_dict=False):
    """
    #################################################################
    Normalize the physicochemical index.
    #################################################################
    """
    normalize_phyche_value = []
    for phyche_value in phyche_index:
        average_phyche_value = sum(phyche_value) * 1.0 / len(phyche_value)
        sd_phyche = StandardDeviation(phyche_value)
        normalize_phyche_value.append(
            [round((e - average_phyche_value) / sd_phyche, 2) for e in phyche_value]
        )

    if is_convert_dict is True:
        return ConvertPhycheIndexToDict(normalize_phyche_value)

    return normalize_phyche_value


[docs]def DNAChecks(s):
    for e in s:
        if e not in ALPHABET:
            return e
    return True


if __name__ == "__main__":
    re = ["GACTGAACTGCACTTTGGTTTCATATTATTTGCTC"]

    phyche_index = [
        [
            1.019,
            -0.918,
            0.488,
            0.567,
            0.567,
            -0.070,
            -0.579,
            0.488,
            -0.654,
            -2.455,
            -0.070,
            -0.918,
            1.603,
            -0.654,
            0.567,
            1.019,
        ]
    ]
    print(NormalizeIndex(phyche_index, is_convert_dict=False)[0])