Source code for rpg.core

# -*- coding: utf-8 -*-

########################################################################
# Author: Nicolas Maillet                                              #
# Copyright © 2018 Institut Pasteur, Paris.                            #
# See the COPYRIGHT file for details                                   #
#                                                                      #
# This file is part of Rapid Peptide Generator (RPG) software.         #
#                                                                      #
# RPG is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or    #
# any later version.                                                   #
#                                                                      #
# RPG is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
# GNU General Public License for more details.                         #
#                                                                      #
# You should have received a copy of the GNU General Public license    #
# along with RPG (LICENSE file).                                       #
# If not, see <http://www.gnu.org/licenses/>.                          #
########################################################################

"""Contains generic functions and global variables used by RPG"""
import sys
import gzip

AMINOACIDS = ["A", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
              "O", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "B", "X", "Z",
              "#", "*"]
"""All character accepted in a peptide."""

AA_MASS_AVERAGE = {"*" : 0.0,
                   "A" : 71.0788,
                   "C" : 103.1388,
                   "D" : 115.0886,
                   "E" : 129.1155,
                   "F" : 147.1766,
                   "G" : 57.0519,
                   "H" : 137.1411,
                   "I" : 113.1594,
                   "J" : 113.1594,
                   "K" : 128.1741,
                   "L" : 113.1594,
                   "M" : 131.1926,
                   "N" : 114.1038,
                   "O" : 237.3018,
                   "P" : 97.1167,
                   "Q" : 128.1307,
                   "R" : 156.1875,
                   "S" : 87.0782,
                   "T" : 101.1051,
                   "U" : 150.0388,
                   "V" : 99.1326,
                   "W" : 186.2132,
                   "Y" : 163.1760,
                   "B" : 0.0,
                   "X" : 0.0,
                   "Z" : 0.0,
                   "#" : 0.0}
""" Average mass of all amino acids."""

AA_MASS_MONO =  {"*" : 0.0,
                  "A" : 71.037113805,
                  "C" : 103.009184505,
                  "D" : 115.026943065,
                  "E" : 129.042593135,
                  "F" : 147.068413945,
                  "G" : 57.021463735,
                  "H" : 137.058911875,
                  "I" : 113.084064015,
                  "J" : 113.084064015,
                  "K" : 128.09496305,
                  "L" : 113.084064015,
                  "M" : 131.040484645,
                  "N" : 114.04292747,
                  "O" : 237.147726925,
                  "P" : 97.052763875,
                  "Q" : 128.05857754,
                  "R" : 156.10111105,
                  "S" : 87.032028435,
                  "T" : 101.047678505,
                  "U" : 150.953633405,
                  "V" : 99.068413945,
                  "W" : 186.07931298,
                  "Y" : 163.063328575,
                  "B" : 0.0,
                  "X" : 0.0,
                  "Z" : 0.0,
                  "#" : 0.0}
""" Monoisotopic mass of all amino acids.
From https://proteomicsresource.washington.edu/protocols06/masses.php
"""

WATER_MASS = 18.01528
"""Mass of a water molecule."""
WATER_MASS_MONO = 18.0105647
"""Monoisotopic mass of a water molecule."""

# Biochemistry Stryer 7th
AA_PKA_S = {"Nterm" : 8.0,
            "C" : 8.3,
            "D" : 4.1,
            "E" : 4.1,
            "H" : 6.0,
            "K" : 10.8,
            "R" : 12.5,
            "Y" : 10.9,
            "Cterm" : 3.1}
"""pKa of important amino acid to compute pI (from Stryer)."""
# IPC_peptide
AA_PKA_IPC = {"Nterm" : 9.564,
              "C" : 8.297,
              "D" : 3.887,
              "E" : 4.317,
              "H" : 6.018,
              "K" : 10.517,
              "R" : 12.503,
              "Y" : 10.071,
              "Cterm" : 2.383}
"""pKa of important amino acid to compute pI (from IPC_peptide. See http://isoelectric.org/theory.html for details)."""
# IPC_peptide2
AA_PKA_IPC_2 = {"Nterm" : 7.947,
              "C" : 9.454,
              "D" : 3.969,
              "E" : 4.507,
              "H" : 6.439,
              "K" : 8.165,
              "R" : 11.493,
              "Y" : 9.153,
              "Cterm" : 2.977}
"""pKa of important amino acid to compute pI (from IPC_peptide2. See http://www.ipc2-isoelectric-point.org/ for details)."""

[docs] def handle_errors(message="", err=1, error_type=""): """Custom handling of errors and warnings. :param message: error message to print :param err: Type of message :param error_type: header of error to print :type message: str :type err: int :type error_type: str *Type of message* is:\n - **0** for critical error (exit)\n - **1** for warning (no exit, default)\n - **2** for print in stderr """ if err == 0: print(error_type + "Error: " + message, file=sys.stderr) sys.exit(1) elif err == 2: print(error_type + message, file=sys.stderr) else: print(error_type + "Warning: " + message, file=sys.stderr)
[docs] def get_header(fmt="fasta"): """Construct a header for output file in `csv` or `tsv`. :param fmt: format of header :type fmt: str :return: formatted header :rtype: str or None Informations on the header are:\n Original_header No_pep Enzyme Cleav_pos Pep_size Pep_mass pI Seq\n No header for `fasta` or other format. """ ret = None if fmt == "csv": separator = "," elif fmt == "tsv": separator = "\t" if fmt == "csv" or fmt == "tsv": ret = "Original_header" + separator + "No_peptide" + separator + \ "Enzyme" + separator + "Cleaving_pos" + separator + \ "Peptide_size" + separator + "Peptide_mass" + separator + "pI" +\ separator + "Sequence" return ret
[docs] def output_results(output_file, all_seq_digested, fmt, quiet, verbose): """Output results of digestion in file and optionally in `stdout`. :param output_file: the file where to print results, if exist :param all_seq_digested: results of digestions :param fmt: output format (`csv`, `tsv` or `fasta`) :param quiet: quiet mode, no `stdout` message :param verbose: verbosity level :type output_file: str :type all_seq_digested: list(list(:py:class:`~rpg.digest.ResultOneDigestion`)) :type fmt: str :type quiet: bool :type verbose: int """ # Not output file if not output_file: # Header header = get_header(fmt) # If we have a header to print (csv/tsv) if header: # Stdout if small verbose if verbose < 2 and not quiet: print(header) # Print all peptides for one_seq in all_seq_digested: # list of list of ResultOneDig # For all ResultOneDigestion for one_enz_res in one_seq: # Print on stdout if verbose >= 2: # Big verbose print(one_enz_res.get_more_info()) if header: print(header) # Default stdout if not quiet: print(format(one_enz_res, fmt), end='') # Output file exist else: try: with open(output_file, 'w') as outfile: # Header header = get_header(fmt) # If we have a header to print (csv/tsv) if header: # Print it in file outfile.write(header + "\n") # Stdout if small verbose if verbose < 2 and not quiet: print(header) # Print all peptides for one_seq in all_seq_digested: # list of list of ResultOneDi # For all ResultOneDigestion for one_enz_res in one_seq: # Print results in file outfile.write(format(one_enz_res, fmt)) # Print on stdout if verbose >= 2: # Big verbose print(one_enz_res.get_more_info()) if header: print(header) # Default stdout if not quiet: print(format(one_enz_res, fmt), end='') except IOError: handle_errors(output_file + " can't be open in 'w' mode", 0, "File ")
[docs] def next_read(file, offset_start, offset_end): """ Return each sequence between offsets range of a file as a tuple (header, seq) using a generator. Can be fasta or fastq, gzipped or not. :param file: fasta/fastq file to read :param offset_start: offset in the file from where to read :param offset_end: offset in the file until where to read :type file: str :type offset_start: int :type offset_end: int """ # Is it a GZIP file? test_file = open(file, "rb") # Get the first values magic = test_file.read(2) # Close the file test_file.close() # Open the file, GZIP or not with (gzip.open(file, "rb") if magic == b"\x1f\x8b" else open(file, "rb")) as in_file: first_line = in_file.readline().decode('utf-8') # FASTQ file if first_line.startswith("@"): # Go to starting offset in_file.seek(offset_start) # Set current offset beg_line_offset = offset_start # Read each line from this point for line in in_file: # Consider this line as a header header = line.decode('utf-8').strip() # It is a proper fastq header if header.startswith("@"): # The beginning of header is in the offset range if beg_line_offset < offset_end: # Get the sequence sequence = in_file.readline().decode('utf-8').strip() # Skip the two next lines in_file.readline() in_file.readline() # Return header and sequence and wait for the next one yield (header, sequence.upper()) # Out of offset, stop this loop else: break # Current offset beg_line_offset = in_file.tell() # (multi?)FASTA file elif first_line.startswith(">"): # Go to starting offset in_file.seek(offset_start) # Set current offset beg_line_offset = offset_start # Read each line from this point for line in in_file: # Consider this line as a header header = line.decode('utf-8').strip() # It is a proper fasta header if header.startswith(">"): # The beginning of header is in the offset range if beg_line_offset < offset_end: # Get the sequence sequence = in_file.readline().decode('utf-8').strip() # Get current offset current_offset = in_file.tell() # Get next line next_l = in_file.readline().decode('utf-8').strip() # While this next line is not a fasta header... while next_l and not next_l.startswith(">"): # Add this to the Sequence sequence += next_l # Get current offset current_offset = in_file.tell() # Get next line next_l = in_file.readline().decode('utf-8').strip() # Next line is a fasta header, go back to its beginning in_file.seek(current_offset) # Return header and sequence and wait for the next one yield (header, sequence.upper()) # Out of offset, stop this loop else: break # Current offset beg_line_offset = in_file.tell() # Not a valid file else: # Stop the generator with the error to show raise ValueError("input file format not recognized (%s)"\ "." % first_line[0])