Source code for rpg.RapidPeptidesGenerator

# -*- coding: utf-8 -*-
#!/usr/bin/env python3.11

########################################################################
# Rapid Peptide Generator (RPG) is a software dedicated to predict     #
# cleavage sites of proteases. User can create his own enzyme,         #
# following a simple grammar.                                          #
#                                                                      #
# Author: Nicolas Maillet                                              #
# Copyright © 2018 Institut Pasteur, Paris.                            #
# See the COPYRIGHT file for details                                   #
#                                                                      #
# RPG is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or    #
# any later version.                                                   #
#                                                                      #
# RPG is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
# GNU General Public License for more details.                         #
#                                                                      #
# You should have received a copy of the GNU General Public license    #
# along with RPG (LICENSE file).                                       #
# If not, see <http://www.gnu.org/licenses/>.                          #
########################################################################

"""Main file of RPG software, handle input/output and launch
necessary functions
"""

__version_info__ = ('2', '0', '5')
__version__ = '.'.join(__version_info__)
__revision_date__ = "2024-06-12"
__author__ = "Nicolas Maillet"

import argparse
import os
import sys
import uuid
from pathlib import Path
#from context import rpg
from rpg import core
from rpg import digest
from rpg import enzyme
from rpg.enzymes_definition import AVAILABLE_ENZYMES
sys.path.insert(0, str(Path.home())) # Home path
from rpg_user import AVAILABLE_ENZYMES_USER

ALL_ENZYMES = AVAILABLE_ENZYMES + AVAILABLE_ENZYMES_USER
"""All available enzymes in RPG."""


[docs]
def restricted_float(mc_val):
    """Restricts input miscleavage value to a float between 0 and 100.

    :param mc_val: value to test
    :type mc_val: float

    :return: the inputed value if correct
    :rtype: float

    :raises custom ValueError: if value is not between 0 and 100
    :raises custom TypeError: if value is not a float
    """
    try:
        mc_val = float(mc_val)
        if mc_val < 0 or mc_val > 100:
            core.handle_errors("miscleavage value should be between 0 and "\
                               "100.", 0, "Value ")
        return mc_val
    except ValueError:
        # Throw an error
        core.handle_errors("miscleavage value should be a float between 0 "\
                           "and 100.", 0, "Type ")



[docs]
def restricted_int(mc_val):
    """Restricts input miscleavage value to a int.

    :param mc_val: value to test
    :type mc_val: int

    :return: the inputed value if correct
    :rtype: int

    :raises custom TypeError: if value is not an int
    """
    try:
        mc_val = int(mc_val)
        if mc_val < 0 :
            core.handle_errors("miscleavage value should be greater then 0.", 0, "Value ")
        return mc_val
    except ValueError:
        # Throw an error
        core.handle_errors("miscleavage value should be an integer.", 0,
                           "Type ")



[docs]
def restricted_enzyme_id(enz_id):
    """Restrict input enzyme id to a str corresponding to an enzyme.

    :param mc_val: value to test
    :type mc_val: str

    :return: the inputed enzyme id
    :rtype: str
    """

    ids_available = []
    names_available = []
    for i in ALL_ENZYMES:
        ids_available.append(str(i.id_))
        names_available.append(i.name.casefold())
    if str(enz_id) not in ids_available and \
       str(enz_id).casefold() not in names_available:
        core.handle_errors(f"id {enz_id} does not correspond to any "\
                            "enzyme. Use -l to get enzyme ids.", 0,
                            "Input ")
    return str(enz_id)




[docs]
def list_enzyme():
    """Print all available enzymes"""
    for enz in ALL_ENZYMES:
        print("%i: %s" % (enz.id_, enz.name))



[docs]
def create_enzymes_to_use(enzymes, miscleavage):
    """Create the list of chosen :py:class:`~rpg.enzyme.Enzyme` to use.
    Each enzyme can be associated to a miscleavage value.

    :param enzymes: enzymes ids or names chosen by user
    :param miscleavage: associated miscleavage values
    :type enzymes: list(str)
    :type miscleavage: list(float)

    :return: list of enzyme's id with associated miscleavage values
    :rtype: list(int)
    """

    # Complete Enzymes to use (return)
    enzymes_to_use = []
    if enzymes:
        # Too much miscleavage values
        if len(miscleavage) > len(enzymes):
            core.handle_errors("Too much miscleavage values. Last values"
                               " will be ignored.")
            # Get only the first ones
            miscleavage = miscleavage[:len(enzymes)]
        cur_pos = -1
        # Get all enzymes with a given miscleavage
        for i, _ in enumerate(miscleavage):
            # In all available enzymes
            for enz in ALL_ENZYMES:
                # Get the current one
                if str(enz.id_) == str(enzymes[i]) or \
                   enz.name.casefold() == str(enzymes[i]).casefold():
                    # Change miscleavage ratio
                    enz.ratio_miscleavage = miscleavage[i]
                    # Add it
                    enzymes_to_use.append(enz)
            cur_pos = i
        # Get all enzymes without miscleavage value
        for i in enzymes[cur_pos + 1:]:
            # In all available enzymes
            for enz in ALL_ENZYMES:
                # Get the current one
                if str(enz.id_) == str(i) or enz.name.casefold() == str(i).casefold():
                    # Add it
                    enzymes_to_use.append(enz)
    # Return enzymes to use
    return enzymes_to_use

# Not tested

[docs]
def get_enzymes_to_use(mode, id_enz_selected, miscleavage):
    """Get the list of chosen :py:class:`~rpg.enzyme.Enzyme` to use.
    Each enzyme (and associated miscleavage value) are inputed by
    user. If there is a problem, user is interrogated again.

    :param mode: Digestion mode. If 'concurrent', no miscleavage values are used
    :param enzymes: enzyme's ids or names chosen by user
    :param miscleavage: associated miscleavage values
    :type mode: str
    :type enzymes: list(str)
    :type miscleavage: list(float)

    :return: list of enzyme's id with associated miscleavage values
    :rtype: list(int)
    """

    # Get the correct Enzymes inputed
    enzymes_to_use = create_enzymes_to_use(id_enz_selected, miscleavage)
    # No good Enzymes inputed, let user choose
    if not enzymes_to_use:
        id_enz_inputed = []
        # Print all available enzymes
        list_enzyme()
        # Ask user to give correct enzymes ids
        while not enzymes_to_use:
            id_enz_inp = input("Choose which enzyme(s) to use, separated by"
                               " space (example: 1 5 6). (q) to quit:\n")
            # Quit
            if "q" in id_enz_inp:
                sys.exit(0)
            # Get a list of ids
            for i in id_enz_inp.split(" "):
                id_enz_inputed.append(i)

            mc_enz_inputed = []
            if mode == "sequential":
                mc_enz_inp = input("Percentage of miscleavage per inputed"
                                   " enzyme (default 0), separated by sapce"
                                   " (example: 1.2 5 6):\n")
                if mc_enz_inp:
                    # Get a list of int
                    for i in mc_enz_inp.split(" "):
                        try:
                            # Convert it to int
                            i = float(i)
                            mc_enz_inputed.append(i)
                        # Not an int?
                        except ValueError:
                            # Throw an error
                            core.handle_errors("'%s' should be an floating"
                                               " value. This values will be"
                                               " ignored." % i)
            # Get the correct Enzyme if enzymes inputed
            enzymes_to_use = create_enzymes_to_use(id_enz_inputed,
                                                   mc_enz_inputed)
    # Return Enzymes to use
    return enzymes_to_use


# Not tested

[docs]
def main():
    """Launcher of RapidPeptidesGenerator"""
    parser = argparse.ArgumentParser(description="This software takes protein "
                                                 "sequences as input (-i optio"
                                                 "n). All sequences will be cl"
                                                 "eaved according to selected "
                                                 "enzymes (-e option) and give"
                                                 "n miscleavage percentage ("
                                                 "-m option). Cleaving can be "
                                                 "sequential or concurrent (-d"
                                                 " option). Resulting peptides"
                                                 " are outputted in a file (-o"
                                                 " option) in fasta, csv or ts"
                                                 "v format (-f option). Classi"
                                                 "cal enzymes are included (-l"
                                                 " option to print available e"
                                                 "nzymes) but it is possible t"
                                                 "o define other enzymes (-a o"
                                                 "ption). See https://gitlab.p"
                                                 "asteur.fr/nmaillet/rpg/ and "
                                                 "https://rapid-peptide-genera"
                                                 "tor.readthedocs.io for more "
                                                 "informations.")
    group_launch = parser.add_mutually_exclusive_group(required=True)
    group_launch.add_argument("-a", "--addenzyme", action="store_true",
                              help="Create a new enzyme. See https://rapid-pe"\
                              "ptide-generator.readthedocs.io for more inform"\
                              "ation")
    group_launch.add_argument("-b", "--delenzyme", metavar="str",
                              help="Delete a user-define enzyme. See "\
                              "https://rapid-peptide-generator.readthedocs.io "\
                              "for more information")
    parser.add_argument("-d", "--digest", metavar="str",
                        choices=['s', 'sequential', 'c', 'concurrent'],
                        default="s", help="Digestion mode. Either 's', 'seque"
                        "ntial', 'c' or 'concurrent' (default: s)")
    parser.add_argument("-e", "--enzymes", metavar="str", default=[],
                        nargs='+', type=restricted_enzyme_id,
                        help="Id or name of enzyme(s) to use (e.g. -e 1 Asp-N "
                             "10 to use enzymes 1, Asp-N and 10). Use -l first"
                             " to get enzyme ids or names")
    parser.add_argument("-f", "--fmt", metavar="str",
                        choices=['fasta', 'csv', 'tsv'], default="fasta",
                        help="Output file format. Either 'fasta', 'csv', or "
                             "'tsv' (default: fasta)")
    group_launch.add_argument("-i", "--inputdata", metavar="path",
                              help="Input file, in fasta / fastq format "\
                                   "(gzipped or not)")
    group_launch.add_argument("-l", "--list", action="store_true",
                              help="Display the list of available enzymes")
    group_launch.add_argument("-s", "--sequence", metavar="str",
                              help="Input a single protein sequence without "\
                                   "commentary")
    group_mc = parser.add_mutually_exclusive_group(required=False)
    group_mc.add_argument("-m", "--miscleavage", metavar="float", default=[],
                        nargs='+', type=restricted_float,
                        help="Percentage of miscleavage, between 0 and 100,"
                        " by enzyme(s). It should be in the same order than "
                        "-enzymes options (e.g. -m 15 5 10). Only for sequenti"
                        "al digestion (default: 0)")
    group_mc.add_argument("-t", "--theoretical", metavar="int", default=[],
                        nargs='+', type=restricted_int,
                        help="Maximum number of miscleavages, by enzyme(s). "
                        "RPG will produce all theoretical peptides allowing "
                        "at most -t miscleavages. It should be in the same "
                        "order than -enzymes options (e.g. -t 1 3 2). Only "
                        "for sequential digestion (default: 0)")
    parser.add_argument("-n", "--noninteractive", action='store_true',
                        help="Non-interactive mode. No standard output, only "
                        "error(s) (--quiet enable, overwrite -v). If output "
                        "filename already exists, output file will be "
                        "overwritten.")
    parser.add_argument("-x", "--cleave", type=str, metavar="str", default=[],
                        nargs='+',
                        help="Cleavage site(s) when adding enzyme in non-inte"\
                        "ractive (e.g. -x \"(D or E,)\" or -x \"(D,)\" \"(E,)\"")

    parser.add_argument("-z", "--exception", type=str, metavar="str", default=[],
                        nargs='+',
                        help="Exception(s) when adding enzyme in non-interact"\
                        "ive (e.g. -z \"(D,)(W)\" \"(D,)(A)\"")

    parser.add_argument("-y", "--enzname", type=str, metavar="str", default="",
                        help="Enzyme's name when adding enzyme in non-interac"\
                        "tive.")
    parser.add_argument("-p", "--pka", metavar="str", choices=['ipc2', 'stryer', 'ipc'],
                        default="ipc2", help="Define pKa values. Either 'ipc2' "
                        "'stryer' or 'ipc' (default: ipc2)")
    parser.add_argument("-w", "--mass", metavar="str", choices=['avg', 'mono'],
                        default="avg", help="Define atomic mass values. "\
                        "Either 'avg' for average mass or 'mono' for "\
                        "monoisotopic mass (default: avg)")
    group_output = parser.add_mutually_exclusive_group()
    group_output.add_argument("-o", "--outputfile", type=str, metavar="str",
                              default="", help="Optional result file "
                              "to output result peptides.")
    group_output.add_argument("-r", "--randomname", action="store_true",
                              help="Random (not used) output file name")
    parser.add_argument("-c", "--processes", type=int, metavar="int", default=1,
                       help="Number of parallel processes to use (default: 1)")
    group_verbose = parser.add_mutually_exclusive_group()
    group_verbose.add_argument("-q", "--quiet", action="store_true",
                               help="No standard output, only error(s)")
    group_verbose.add_argument("-v", "--verbose", action="count", default=0,
                               help="Increase output verbosity. -vv will "
                               "increase more than -v")
    parser.add_argument("--version", action="version",
                        version="%(prog)s " + __version__ + " from " +
                        __revision_date__)
    args = parser.parse_args()

    # --addenzyme option
    if args.addenzyme:
        if args.noninteractive:
            # We need a name
            if not args.enzname:
                parser.error("argument -y/--enzname is mandatory")
            # We need at least a rule
            if not args.enzname:
                parser.error("argument -x/--cleave is mandatory")
            # Create the enzyme
            enzyme.user_creation_enzyme_non_interactive(ALL_ENZYMES,
                                                        args.enzname,
                                                        args.cleave,
                                                        args.exception)
        else:
            enzyme.user_creation_enzyme(ALL_ENZYMES)
        sys.exit(0)

    # --delenzyme option
    if args.delenzyme:
        enzyme.delete_enzyme(ALL_ENZYMES, args.delenzyme)
        sys.exit(0)

    # --digest option
    mode = "sequential"
    if args.digest == "c" or args.digest == "concurrent":
        mode = "concurrent"
        args.miscleavage = []  # No miscleavage on concurrent, infinite time

    # --pka option
    aa_pka = core.AA_PKA_IPC_2
    if args.pka == "stryer":
        aa_pka = core.AA_PKA_S
    elif args.pka == "ipc":
        aa_pka = core.AA_PKA_IPC

    # --mass option
    aa_mass = core.AA_MASS_AVERAGE
    water_mass = core.WATER_MASS
    if args.mass == "mono":
        aa_mass = core.AA_MASS_MONO
        water_mass = core.WATER_MASS_MONO

    # --list option
    if args.list:
        list_enzyme()
        sys.exit(0)

    # --nointeractive option
    if args.noninteractive:
        args.quiet = 1
        args.verbose = 0

    # Be sure to have at least 1 process
    if args.processes <= 0:
        parser.error("argument -c/--processes should be greater than 0")

    # input data
    input_data = None
    input_type = None
    if args.inputdata:
        if os.path.isfile(args.inputdata):
            input_data = args.inputdata
            input_type = "file"
        else:
            core.handle_errors("file not found (%s)." % args.inputdata, 0, "I"\
                               "nput ")
    elif args.sequence:
        input_data = args.sequence
        input_type = "sequence"

    # --outputfile / --randomname options
    output_file = "" # No output file (default)
    if args.randomname:
        # Generate a random file name
        output_file = str(uuid.uuid4().hex) + "." + args.fmt
        # Ensure the name is unique
        while os.path.isfile(output_file):
            # Generate a random file name
            output_file = str(uuid.uuid4().hex) + "." + args.fmt
    # Chosen file name if exist
    elif args.outputfile:
        # Given name
        tmpname = str(args.outputfile)
        # No extension?
        if "." not in tmpname:
            # Add default extension for this file format
            output_file = tmpname + "." + args.fmt
        else:
            output_file = tmpname
        # If interactive mode
        if not args.noninteractive:
            # This file already exist?
            while os.path.isfile(output_file):
                core.handle_errors("File '%s' already exit!" % output_file)
                # Don't overwrite it
                if input("Overwrite it? (y/n)\n") != "y":
                    tmpname = input("Output filename?\n")
                    # No extension?
                    if "." not in tmpname:
                        # Add default extension for this file format
                        output_file = tmpname + "." + args.fmt
                    else:
                        output_file = tmpname
                # Overwrite it
                else:
                    break

    # More mis cleavage than enzyme
    if len(args.miscleavage) > len(args.enzymes):
        core.handle_errors("Too much miscleavage values. Last values will "
                           "be ignored.")
        args.miscleavage = args.miscleavage[:len(args.enzymes)]

    # Get all enzymes inputed
    enzymes_to_use = get_enzymes_to_use(mode, args.enzymes, args.miscleavage)

    # Output options
    if args.verbose:
        print("Input: " + input_data)
        print("Enzyme(s) used: " + str([enz.name for enz in enzymes_to_use]))
        print("Mode: " + mode)
        print("miscleavage ratio: " +
              str([enz.ratio_miscleavage for enz in enzymes_to_use]))
        if output_file:
            print("Output file: " + os.path.abspath(output_file))

    # Make the actual digestion of input data
    results_digestion = digest.digest_from_input(input_data, input_type,
                                                 enzymes_to_use, mode, aa_pka,
                                                 aa_mass, water_mass,
                                                 args.processes)

    # We need to compute all theoretical peptides for at least one enzyme
    if args.theoretical:
        # More mis cleavage than enzyme
        if len(args.theoretical) > len(args.enzymes):
            core.handle_errors("Too much miscleavage values. Last values will "
                               "be ignored.")
        # Not enough mis cleavage values, complete with 0
        elif len(args.theoretical) < len(args.enzymes):
            args.theoretical += [0]*(len(args.enzymes) - len(args.theoretical))
        # Dict of enzymes/max mis cleavage
        enzymes_to_use = {}
        # Link enzyme to mc value
        for i, j in enumerate(args.enzymes):
            enzymes_to_use[enzyme.get_enz_name(ALL_ENZYMES, j)] = args.theoretical[i]
        # Compute the results
        digest.theoretical_peptides(results_digestion, enzymes_to_use)
    # Output results
    core.output_results(output_file, results_digestion, args.fmt, args.quiet,
                        args.verbose)


### Let'z go ###
if __name__ == '__main__':
    main()
    # The end
    sys.exit(0)