# Source code for AAComposition

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
The module is used for computing the composition of amino acids, dipeptides and
3-mers (tri-peptides) for a given protein sequence. You can get 8420 descriptors
for a given protein sequence. You can freely use and distribute it. If you have
any problems, please contact us.

References
----------
.. [1] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
   fold class predictions. Nucleic Acids Res, 22, 3616-3619.

.. [2] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
   subcellular localization prediction. Bioinformatics, 17, 721-728.

.. [3] Grassmann, J., Reczko, M., Suhai, S. and Edler, L. (1999) Protein fold
   class prediction: new methods of statistical classification. Proc Int Conf
   Intell Syst Mol Biol, 106-112.

Authors: Dongsheng Cao and Zhijiang Yao.

Date: 2016.7.27

Email: oriental-cds@163.com and gadsby@163.com
"""

# Core Library modules
import re

# One-letter codes of the 20 amino acids; the order fixes the key order of the
# descriptor dicts built from this list.
AALetter = list("ARNDCEQGHILKMFPSTWYV")


#############################################################################################
def CalculateAAComposition(ProteinSequence):
    """
    ########################################################################
    Calculate the composition of amino acids for a given protein sequence.

    Usage:

    result=CalculateAAComposition(protein)

    Input: protein is a pure protein sequence.

    Output: result is a dict mapping each of the 20 amino acid letters in
    AALetter to its percentage of the sequence length, rounded to 3 decimal
    places. For an empty sequence every value is 0.0.
    ########################################################################
    """
    LengthSequence = len(ProteinSequence)
    Result = {}
    if LengthSequence == 0:
        # No residues at all: report zero composition instead of dividing by 0.
        for AminoAcid in AALetter:
            Result[AminoAcid] = 0.0
        return Result
    for AminoAcid in AALetter:
        Result[AminoAcid] = round(
            float(ProteinSequence.count(AminoAcid)) / LengthSequence * 100, 3
        )
    return Result
def CalculateDipeptideComposition(ProteinSequence):
    """
    ########################################################################
    Calculate the composition of dipeptides for a given protein sequence.

    Usage:

    result=CalculateDipeptideComposition(protein)

    Input: protein is a pure protein sequence.

    Output: result is a dict mapping each of the 400 dipeptides (all ordered
    pairs of letters in AALetter) to its percentage among the
    len(protein) - 1 adjacent residue pairs, rounded to 2 decimal places.
    Pairs containing a letter outside AALetter are not reported. For a
    sequence shorter than two residues every value is 0.0.
    ########################################################################
    """
    Counts = {}
    for First in AALetter:
        for Second in AALetter:
            Counts[First + Second] = 0

    NumberOfPairs = len(ProteinSequence) - 1
    if NumberOfPairs < 1:
        # Fewer than two residues: there is no dipeptide, and the percentage
        # denominator would be 0 (or -1 for an empty sequence).
        return dict((Dipeptide, 0.0) for Dipeptide in Counts)

    # Slide a window of width 2 over the sequence so overlapping occurrences
    # are counted (e.g. "AAA" holds "AA" twice); str.count would skip them,
    # which is inconsistent with the len - 1 denominator.
    for Start in range(NumberOfPairs):
        Dipeptide = ProteinSequence[Start:Start + 2]
        if Dipeptide in Counts:
            Counts[Dipeptide] += 1

    Result = {}
    for Dipeptide in Counts:
        Result[Dipeptide] = round(
            float(Counts[Dipeptide]) / NumberOfPairs * 100, 2
        )
    return Result
#############################################################################################
def Getkmers():
    """
    ########################################################################
    Build the list of all 3-mers (tri-peptides) over the AALetter alphabet.

    Usage:

    result=Getkmers()

    Output: result is a list containing 8000 tri-peptides, ordered with the
    first letter varying slowest and the last letter varying fastest.
    ########################################################################
    """
    return [
        first + second + third
        for first in AALetter
        for second in AALetter
        for third in AALetter
    ]
def GetSpectrumDict(proteinsequence):
    """
    ########################################################################
    Calculate the spectrum descriptors of 3-mers for a given protein.

    Usage:

    result=GetSpectrumDict(protein)

    Input: protein is a pure protein sequence.

    Output: result is a dict mapping each of the 8000 3-mers returned by
    Getkmers() to the number of times it occurs in the sequence. Overlapping
    occurrences are counted. 3-mers containing a letter outside AALetter
    are not reported.
    ########################################################################
    """
    # Start every known 3-mer at zero so all 8000 keys are always present,
    # in the order produced by Getkmers().
    result = dict.fromkeys(Getkmers(), 0)

    # One pass with a width-3 window replaces 8000 separate regex scans and,
    # unlike re.findall, also counts overlapping matches ("AAAA" -> "AAA" x2).
    for start in range(len(proteinsequence) - 2):
        kmer = proteinsequence[start:start + 3]
        if kmer in result:
            result[kmer] += 1
    return result
#############################################################################################
def CalculateAADipeptideComposition(ProteinSequence):
    """
    ########################################################################
    Collect the amino acid, dipeptide and 3-mer descriptors of a protein
    sequence into a single dict.

    Usage:

    result=CalculateAADipeptideComposition(protein)

    Input: protein is a pure protein sequence.

    Output: result is a dict containing all values returned by
    CalculateAAComposition, CalculateDipeptideComposition and
    GetSpectrumDict (20 + 400 + 8000 = 8420 entries).
    ########################################################################
    """
    combined = {}
    # Merge in the same order as the descriptors are listed above.
    for descriptor in (
        CalculateAAComposition,
        CalculateDipeptideComposition,
        GetSpectrumDict,
    ):
        combined.update(descriptor(ProteinSequence))
    return combined
#############################################################################################
if __name__ == "__main__":
    # Demo run: print each descriptor dict for a sample sequence, then the
    # size of the combined descriptor dict.
    protein = "ADGCGVGEGTGQGPMCNCMCMKWVYADEDAADLESDSFADEDASLESDSFPWSNQRVFCSFADEDAS"
    for compute in (
        CalculateAAComposition,
        CalculateDipeptideComposition,
        GetSpectrumDict,
    ):
        print(compute(protein))
    print(len(CalculateAADipeptideComposition(protein)))