# Source code for rpg.sequence

# -*- coding: utf-8 -*-

########################################################################
# Author: Nicolas Maillet                                              #
# Copyright © 2018 Institut Pasteur, Paris.                            #
# See the COPYRIGHT file for details                                   #
#                                                                      #
# This file is part of Rapid Peptide Generator (RPG) software.         #
#                                                                      #
# RPG is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or    #
# any later version.                                                   #
#                                                                      #
# RPG is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
# GNU General Public License for more details.                         #
#                                                                      #
# You should have received a copy of the GNU General Public license    #
# along with RPG (LICENSE file).                                       #
# If not, see <http://www.gnu.org/licenses/>.                          #
########################################################################

"""Contains classes and function related to sequences"""
import decimal
from rpg import core


[docs]
class Peptide:
    """Definition of a peptide, containing the header of its original
    sequence, an amino acid sequence, the name of the enzyme used to
    produce it and more information.

    :param header: header of the peptide
    :param sequence: sequence in amino acids
    :param enzyme_name: name of the enzyme used
    :param aa_pka: pKa values (IPC / IPC2 / Stryer); must provide the keys
        "Nterm", "Cterm", "D", "E", "C", "Y", "H", "K" and "R"
    :param aa_mass: mass values (average / monoisotopic), one entry per
        amino acid present in `sequence`
    :param water_mass: mass value of water (average / monoisotopic)
    :param nb_peptide: number of this peptide (default: 0)
    :param position: position of cleavage on the original sequence (default: 0)
    :type header: str
    :type sequence: str
    :type enzyme_name: str
    :type aa_pka: dict()
    :type aa_mass: dict()
    :type water_mass: float
    :type nb_peptide: int
    :type position: int

    :var size: size of the peptide
    :var mass: mass of the peptide
    :var p_i: pI of the peptide
    :vartype size: int
    :vartype mass: float
    :vartype p_i: float

    :raises KeyError: if an amino acid of `sequence` is missing from
        `aa_mass` or if a required key is missing from `aa_pka`
    """
    def __init__(self, header, sequence, enzyme_name, aa_pka, aa_mass,
                 water_mass, nb_peptide=0, position=0):
        self.header = header  # header of this peptide
        self.sequence = sequence  # peptide sequence
        self.enzyme_name = enzyme_name  # name of the enzyme used
        self.aa_pka = aa_pka  # pKa values for pI calculation
        self.aa_mass = aa_mass  # mass value of all aa
        self.water_mass = water_mass  # mass value of water
        self.nb_peptide = nb_peptide  # number of this peptide
        self.position = position  # position of cleavage
        self.size = len(sequence)  # size of the peptide
        # Mass of the peptide: water mass plus the mass of each residue.
        # The explicit left-to-right accumulation is kept on purpose so
        # the floating point result does not depend on how sum() rounds.
        self.mass = water_mass
        for residue in sequence:
            self.mass += aa_mass[residue]
        self.p_i = self.get_isoelectric_point()

    # self representation for print
    def __repr__(self):
        # "IPC2" is reported unless the pKa table equals one of the two
        # other tables defined in rpg.core
        pka = "IPC2"
        if self.aa_pka == core.AA_PKA_S:
            pka = "Stryer"
        elif self.aa_pka == core.AA_PKA_IPC:
            pka = "IPC"
        return ("Original header: " + self.header +
                "\nNo. peptide: " + str(self.nb_peptide) +
                "\nEnzyme: " + self.enzyme_name +
                "\nCleav. pos: " + str(self.position) +
                "\nPep. size: " + str(self.size) +
                "\nPep. mass: " + str(self.mass) +
                "\npKa values from: " + pka +
                "\nPep. pI: " + str(self.p_i) +
                "\nSequence: " + self.sequence + "\n")

    # Equality between two Peptides
    def __eq__(self, other):
        # `other` must be checked against our class (not the opposite):
        # testing `isinstance(self, other.__class__)` is True for any plain
        # `object()` and then fails on the missing `__dict__`.
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    # Create a clean output according to fmt
    def __format__(self, fmt):
        """Format the peptide as "fasta" (">" prefix, "_" separated header
        line, sequence on its own line), "csv" (comma separated) or, for
        any other value of `fmt`, tab separated.

        :param fmt: requested output format
        :type fmt: str

        :return: the formatted peptide
        :rtype: str
        """
        ret = ""
        # Formatting the print according to format
        if fmt == "fasta":
            ret += ">"
            separator = "_"
        elif fmt == "csv":
            separator = ","
        else:
            separator = "\t"
        # Main values to print
        ret += separator.join([self.header, str(self.nb_peptide),
                               self.enzyme_name, str(self.position),
                               str(self.size), str(self.mass),
                               str(self.p_i)])
        # Last separator, \n for fasta format
        if fmt == "fasta":
            ret += "\n"
        else:
            ret += separator
        # End of the print
        ret += self.sequence
        return ret

    def get_isoelectric_point(self):
        """Compute isoelectric point (pI) of the peptide using
        binary search between pH 0 and pH 14.

        The net charge is the sum of the negative contributions of the
        C-terminus and of D, E, C and Y residues, and of the positive
        contributions of the N-terminus and of H, K and R residues.

        :return: computed pI, rounded to two decimals
        :rtype: float

        :raises KeyError: if a required key is missing from the pKa table

        :note: This function uses the pKa table given at construction
            (``self.aa_pka``)
        """
        pka = self.aa_pka
        # The composition does not change during the search: count once
        count = self.sequence.count
        nb_d = count('D')
        nb_e = count('E')
        nb_c = count('C')
        nb_y = count('Y')
        nb_h = count('H')
        nb_k = count('K')
        nb_r = count('R')

        ph_val = 7  # Neutral pH, starting point of binary search
        ph_min = 0.0  # Minimal pH
        ph_max = 14.0  # Maximal pH
        precision = 0.01
        # While the search interval is wider than the precision on one side
        while (ph_val - ph_min > precision) or (ph_max - ph_val > precision):
            # Negative charges at the current pH
            qn1 = -1.0 / (1.0 + pow(10, (pka["Cterm"] - ph_val)))
            qn2 = -nb_d / (1.0 + pow(10, (pka["D"] - ph_val)))
            qn3 = -nb_e / (1.0 + pow(10, (pka["E"] - ph_val)))
            qn4 = -nb_c / (1.0 + pow(10, (pka["C"] - ph_val)))
            qn5 = -nb_y / (1.0 + pow(10, (pka["Y"] - ph_val)))
            # Positive charges at the current pH
            qp1 = nb_h / (1.0 + pow(10, (ph_val - pka["H"])))
            qp2 = 1.0 / (1.0 + pow(10, (ph_val - pka["Nterm"])))
            qp3 = nb_k / (1.0 + pow(10, (ph_val - pka["K"])))
            qp4 = nb_r / (1.0 + pow(10, (ph_val - pka["R"])))
            # Summation order is kept as is to get the very same floats
            nq_final = qn1 + qn2 + qn3 + qn4 + qn5 + qp1 + qp2 + qp3 + qp4
            # Net charge is negative: the pI is at a smaller pH value
            if nq_final < 0.0:
                ph_max = ph_val
                ph_val -= (ph_max - ph_min) / 2
            # Net charge is positive or null: the pI is at a bigger pH value
            else:
                ph_min = ph_val
                ph_val += (ph_max - ph_min) / 2
        # We got a good enough pH value
        return round(ph_val, 2)




[docs]
class Sequence:
    """Definition of an amino acid sequence to digest.

    :param header: header of the sequence
    :param sequence: sequence itself
    :type header: str
    :type sequence: str
    """
    def __init__(self, header, sequence):
        self.header = header  # header of this sequence
        self.sequence = sequence  # the sequence itself

    # self representation for print
    def __repr__(self):
        return "Header: " + self.header + "\nSequence: " + self.sequence + "\n"

    # Equality between two Sequences
    def __eq__(self, other):
        # `other` must be checked against our class (not the opposite):
        # testing `isinstance(self, other.__class__)` is True for any plain
        # `object()` and then fails on the missing `__dict__`.
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False



[docs]
def check_sequence(seq):
    """Normalize a sequence and make sure it only contains known amino
    acids, i.e. characters present in :py:const:`~rpg.core.AMINOACIDS`.

    Surrounding whitespace is stripped and the sequence is converted to
    uppercase before the check.

    :param seq: the sequence to check
    :type seq: str

    :return: Sequence in UPPERCASE
    :rtype: str

    :raises ValueError: on the first unrecognized amino acid
    """
    cleaned = seq.strip().upper()
    # First character that is not a known amino acid, if any
    unknown = next((letter for letter in cleaned
                    if letter not in core.AMINOACIDS), None)
    if unknown is not None:
        raise ValueError("amino acid \"%s\" in %s not "\
                         "recognized." % (unknown, cleaned))
    return cleaned