# -*- coding: utf-8 -*-
########################################################################
# Author: Nicolas Maillet #
# Copyright © 2018 Institut Pasteur, Paris. #
# See the COPYRIGHT file for details #
# #
# This file is part of Rapid Peptide Generator (RPG) software. #
# #
# RPG is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# any later version. #
# #
# RPG is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public license #
# along with RPG (LICENSE file). #
# If not, see <http://www.gnu.org/licenses/>. #
########################################################################
"""Contains generic functions and global variables used by RPG"""
import decimal
import sys
import gzip
AMINOACIDS = ["A", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
"O", "P", "Q", "R", "S", "T", "U", "V", "W", "Y", "B", "X", "Z",
"#", "*"]
"""All character accepted in a peptide."""
AA_MASS_AVERAGE = {"*" : decimal.Decimal("0.0"),
"A" : decimal.Decimal("71.0788"),
"C" : decimal.Decimal("103.1388"),
"D" : decimal.Decimal("115.0886"),
"E" : decimal.Decimal("129.1155"),
"F" : decimal.Decimal("147.1766"),
"G" : decimal.Decimal("57.0519"),
"H" : decimal.Decimal("137.1411"),
"I" : decimal.Decimal("113.1594"),
"J" : decimal.Decimal("113.1594"),
"K" : decimal.Decimal("128.1741"),
"L" : decimal.Decimal("113.1594"),
"M" : decimal.Decimal("131.1926"),
"N" : decimal.Decimal("114.1038"),
"O" : decimal.Decimal("237.3018"),
"P" : decimal.Decimal("97.1167"),
"Q" : decimal.Decimal("128.1307"),
"R" : decimal.Decimal("156.1875"),
"S" : decimal.Decimal("87.0782"),
"T" : decimal.Decimal("101.1051"),
"U" : decimal.Decimal("150.0388"),
"V" : decimal.Decimal("99.1326"),
"W" : decimal.Decimal("186.2132"),
"Y" : decimal.Decimal("163.1760"),
"B" : decimal.Decimal("0.0"),
"X" : decimal.Decimal("0.0"),
"Z" : decimal.Decimal("0.0"),
"#" : decimal.Decimal("0.0")}
""" Average mass of all amino acids."""
AA_MASS_MONO = {"*" : decimal.Decimal("0.0"),
"A" : decimal.Decimal("71.037113805"),
"C" : decimal.Decimal("103.009184505"),
"D" : decimal.Decimal("115.026943065"),
"E" : decimal.Decimal("129.042593135"),
"F" : decimal.Decimal("147.068413945"),
"G" : decimal.Decimal("57.021463735"),
"H" : decimal.Decimal("137.058911875"),
"I" : decimal.Decimal("113.084064015"),
"J" : decimal.Decimal("113.084064015"),
"K" : decimal.Decimal("128.09496305"),
"L" : decimal.Decimal("113.084064015"),
"M" : decimal.Decimal("131.040484645"),
"N" : decimal.Decimal("114.04292747"),
"O" : decimal.Decimal("237.147726925"),
"P" : decimal.Decimal("97.052763875"),
"Q" : decimal.Decimal("128.05857754"),
"R" : decimal.Decimal("156.10111105"),
"S" : decimal.Decimal("87.032028435"),
"T" : decimal.Decimal("101.047678505"),
"U" : decimal.Decimal("150.953633405"),
"V" : decimal.Decimal("99.068413945"),
"W" : decimal.Decimal("186.07931298"),
"Y" : decimal.Decimal("163.063328575"),
"B" : decimal.Decimal("0.0"),
"X" : decimal.Decimal("0.0"),
"Z" : decimal.Decimal("0.0"),
"#" : decimal.Decimal("0.0")}
""" Monoisotopic mass of all amino acids.
From https://proteomicsresource.washington.edu/protocols06/masses.php
"""
WATER_MASS = decimal.Decimal("18.01528")
"""Mass of a water molecule."""
WATER_MASS_MONO = decimal.Decimal("18.0105647")
"""Monoisotopic mass of a water molecule."""
# Biochemistry Stryer 7th
AA_PKA_S = {"Nterm" : 8.0,
"C" : 8.3,
"D" : 4.1,
"E" : 4.1,
"H" : 6.0,
"K" : 10.8,
"R" : 12.5,
"Y" : 10.9,
"Cterm" : 3.1}
"""pKa of important amino acid to compute pI (from Stryer)."""
# IPC_peptide
AA_PKA_IPC = {"Nterm" : 9.564,
"C" : 8.297,
"D" : 3.887,
"E" : 4.317,
"H" : 6.018,
"K" : 10.517,
"R" : 12.503,
"Y" : 10.071,
"Cterm" : 2.383}
"""pKa of important amino acid to compute pI (from IPC_peptide. See http://isoelectric.org/theory.html for details)."""
# IPC_peptide2
AA_PKA_IPC_2 = {"Nterm" : 7.947,
"C" : 9.454,
"D" : 3.969,
"E" : 4.507,
"H" : 6.439,
"K" : 8.165,
"R" : 11.493,
"Y" : 9.153,
"Cterm" : 2.977}
"""pKa of important amino acid to compute pI (from IPC_peptide2. See http://www.ipc2-isoelectric-point.org/ for details)."""
[docs]
def handle_errors(message="", err=1, error_type=""):
"""Custom handling of errors and warnings.
:param message: error message to print
:param err: Type of message
:param error_type: header of error to print
:type message: str
:type err: int
:type error_type: str
*Type of message* is:\n
- **0** for critical error (exit)\n
- **1** for warning (no exit, default)\n
- **2** for print in stderr
"""
if err == 0:
print(error_type + "Error: " + message, file=sys.stderr)
sys.exit(1)
elif err == 2:
print(error_type + message, file=sys.stderr)
else:
print(error_type + "Warning: " + message, file=sys.stderr)
[docs]
def output_results(output_file, all_seq_digested, fmt, quiet, verbose):
"""Output results of digestion in file and optionally in `stdout`.
:param output_file: the file where to print results, if exist
:param all_seq_digested: results of digestions
:param fmt: output format (`csv`, `tsv` or `fasta`)
:param quiet: quiet mode, no `stdout` message
:param verbose: verbosity level
:type output_file: str
:type all_seq_digested: list(list(:py:class:`~rpg.digest.ResultOneDigestion`))
:type fmt: str
:type quiet: bool
:type verbose: int
"""
# Not output file
if not output_file:
# Header
header = get_header(fmt)
# If we have a header to print (csv/tsv)
if header:
# Stdout if small verbose
if verbose < 2 and not quiet:
print(header)
# Print all peptides
for one_seq in all_seq_digested: # list of list of ResultOneDig
# For all ResultOneDigestion
for one_enz_res in one_seq:
# Print on stdout
if verbose >= 2:
# Big verbose
print(one_enz_res.get_more_info())
if header:
print(header)
# Default stdout
if not quiet:
print(format(one_enz_res, fmt), end='')
# Output file exist
else:
try:
with open(output_file, 'w') as outfile:
# Header
header = get_header(fmt)
# If we have a header to print (csv/tsv)
if header:
# Print it in file
outfile.write(header + "\n")
# Stdout if small verbose
if verbose < 2 and not quiet:
print(header)
# Print all peptides
for one_seq in all_seq_digested: # list of list of ResultOneDi
# For all ResultOneDigestion
for one_enz_res in one_seq:
# Print results in file
outfile.write(format(one_enz_res, fmt))
# Print on stdout
if verbose >= 2:
# Big verbose
print(one_enz_res.get_more_info())
if header:
print(header)
# Default stdout
if not quiet:
print(format(one_enz_res, fmt), end='')
except IOError:
handle_errors(output_file + " can't be open in 'w' mode", 0,
"File ")
[docs]
def next_read(file, offset_start, offset_end):
""" Return each sequence between offsets range of a file
as a tuple (header, seq) using a generator.
Can be fasta or fastq, gzipped or not.
:param file: fasta/fastq file to read
:param offset_start: offset in the file from where to read
:param offset_end: offset in the file until where to read
:type file: str
:type offset_start: int
:type offset_end: int
"""
# Is it a GZIP file?
test_file = open(file, "rb")
# Get the first values
magic = test_file.read(2)
# Close the file
test_file.close()
# Open the file, GZIP or not
with (gzip.open(file, "rb") if magic == b"\x1f\x8b"
else open(file, "rb")) as in_file:
first_line = in_file.readline().decode('utf-8')
# FASTQ file
if first_line.startswith("@"):
# Go to starting offset
in_file.seek(offset_start)
# Set current offset
beg_line_offset = offset_start
# Read each line from this point
for line in in_file:
# Consider this line as a header
header = line.decode('utf-8').strip()
# It is a proper fastq header
if header.startswith("@"):
# The beginning of header is in the offset range
if beg_line_offset < offset_end:
# Get the sequence
sequence = in_file.readline().decode('utf-8').strip()
# Skip the two next lines
in_file.readline()
in_file.readline()
# Return header and sequence and wait for the next one
yield (header, sequence.upper())
# Out of offset, stop this loop
else:
break
# Current offset
beg_line_offset = in_file.tell()
# (multi?)FASTA file
elif first_line.startswith(">"):
# Go to starting offset
in_file.seek(offset_start)
# Set current offset
beg_line_offset = offset_start
# Read each line from this point
for line in in_file:
# Consider this line as a header
header = line.decode('utf-8').strip()
# It is a proper fasta header
if header.startswith(">"):
# The beginning of header is in the offset range
if beg_line_offset < offset_end:
# Get the sequence
sequence = in_file.readline().decode('utf-8').strip()
# Get current offset
current_offset = in_file.tell()
# Get next line
next_l = in_file.readline().decode('utf-8').strip()
# While this next line is not a fasta header...
while next_l and not next_l.startswith(">"):
# Add this to the Sequence
sequence += next_l
# Get current offset
current_offset = in_file.tell()
# Get next line
next_l = in_file.readline().decode('utf-8').strip()
# Next line is a fasta header, go back to its beginning
in_file.seek(current_offset)
# Return header and sequence and wait for the next one
yield (header, sequence.upper())
# Out of offset, stop this loop
else:
break
# Current offset
beg_line_offset = in_file.tell()
# Not a valid file
else:
# Stop the generator with the error to show
raise ValueError("input file format not recognized (%s)"\
"." % first_line[0])