"""
SHEPHARD:
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder
Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (g.ginell@wustl.edu)
Holehouse Lab - Washington University in St. Louis
"""
from . import general_utilities
from .exceptions import ProteomeException
from .protein import Protein
from itertools import islice
import copy
[docs]
class Proteome:
"""
The Proteome object is the main unit for information storage in
SHEPHARD.
There are a few ways that new Proteomes can be generated:
* By reading in a FASTA file (using shephard.interfaces.apis.fasta)
The Proteome constructor takes a single argument, which is a
list of protein dictionaries or a list of Protein objects. This
means Proteome objects can be generated directly (see below for
a definition of protein dictionaries). However, it is often more
convenient to build Proteomes from FASTA files. For more information
on this see the ``api`` documentation.
Protein dictionaries are dictionaries that must contain four elements
(others are ignored).
* ``sequence`` : *str* - Amino acid sequence of the protein.
* ``name`` : *str* - Name of the protein (this can be anything; it is not used internally, so there are no constraints on what this is).
* ``unique_ID`` : *str* - This must be unique with respect to all other unique_IDs in the set of proteins in the input list.
* ``attributes`` : *dict* - Dictionary of one or more attributes to apply to this protein. Key/value pairs in this dictionary can be arbitrary and are user defined.
As an example:
>>> protein_dictionary_example = {'sequence':'ALAPSLLPAMPALSPALSP',
'name': 'my protein fragment',
'unique_ID':'UXX01', 'attributes':{}}
>>> dictionary_list = []
>>> dictionary_list.append(protein_dictionary_example)
>>> P = Proteome(dictionary_list)
Note that ``sequence``, ``name`` and ``unique_ID`` are cast to *str* by
the function, so if numerical values are passed for any these will be
converted to strings.
**Notes**
* NOTE that ALL FOUR of these are required for EACH protein, even if the
attributes dictionary is empty.
* The unique_ID is checked for uniqueness against all others in the Proteome
and an exception is raised if it is, in fact, not unique.
* Additional proteins can be added using the `.add_protein()` or
`.add_proteins()` functions.
"""
## ------------------------------------------------------------------------
##
def __init__(self, input_list = None, attributes = None, force_overwrite=False):
    """
    Constructor that generates a new Proteome object from an optional
    list of proteins (input_list) and an optional attributes dictionary
    for the Proteome itself.

    The input_list may be either a list of protein dictionaries or a
    list of Protein objects. It is handed to the .add_proteins()
    function, together with the force_overwrite flag.

    **Protein dictionaries**

    Protein dictionaries are dictionaries that possess the following
    key-value pairs ::

        'sequence'   : amino acid sequence (str)
        'name'       : protein name (str)
        'unique_ID'  : The unique identification number used for the
                       protein (str)
        'attributes' : A dictionary of arbitrary key-value pairs to
                       associate with the protein (dict or None)

    Additional key/value pairs are ignored and ALL four of these must
    be included. If any are missing for any protein entry a
    ProteomeException is raised.

    **Protein objects**

    A second mode is to pass a list of Protein objects. This is useful
    if you're creating a new Proteome based on a subset of proteins
    from an existing Proteome.

    In both cases proteins are added by value - i.e. new Protein
    objects are generated, so changes to Proteins in the new Proteome
    will not affect the original Proteome.

    Parameters
    -----------
    input_list : list (default = None)
        List of protein dictionaries or list of Protein objects. If
        None or empty, an empty Proteome is created.

    attributes : dict (default = None)
        An arbitrary set of key-value pairs to annotate the Proteome
        with metadata. The passed dictionary is stored by reference
        (it is not copied).

    force_overwrite : bool (default = False)
        If set to False and there are duplicate unique_IDs in the
        input_list this will trigger an exception. If set to True,
        the last entry with a given unique_ID overwrites any earlier
        one.
    """

    # initialize book-keeping instruments
    self._records = {}
    self._unique_domain_types = {}
    self._unique_site_types = {}         # maps a Site type to a count
    self._unique_track_names = {}        # maps a Track name to a count
    self._track_name_to_track_type = {}  # maps a Track name to its track type ('values' or 'symbols')

    # check the attributes dictionary (None is explicitly allowed via or_none)
    # NOTE(review): presumably raises ProteomeException on a non-dict - confirm in general_utilities
    general_utilities.variable_is_dictionary(attributes, ProteomeException, 'attributes argument passed to proteome is not a dictionary', or_none = True)

    self._attributes = {} if attributes is None else attributes

    # if no/empty input provided then we're done
    if input_list is None or len(input_list) == 0:
        return

    # forward force_overwrite so the documented constructor flag is honoured
    self.add_proteins(input_list, force_overwrite=force_overwrite)
## ------------------------------------------------------------------------
##
def check_unique_ID(self, unique_id):
"""
Function that checks if a given unique ID is found. Note that this
function is not needed for testing if a unique_ID is present if the
goal is to request Protein Objects (or not). Instead, one can use
the .protein(<unique_ID>, safe=False). By setting safe=False if the
unique_ID is not found then this function will simply return None.
Parameters
-----------
unique_id : string
String corresponding to a unique_ID associated with some protein
Returns
----------
bool
Returns True if the passed ID is present, or False if not.
"""
if unique_id in self._records.keys():
return True
else:
return False
## ------------------------------------------------------------------------
##
@property
def proteins(self):
"""
Returns a list of unique_IDs that correspond to the proteins in
this Proteome. NOTE this returns a list of the IDs, not the actual
Protein objects. To get the corresponding protein object one must
use the ``.protein(<unique_ID>)`` notation.
Returns
--------
``list`` of ``str``
Returns a list of unique IDs
"""
return list(self._records.keys())
## ------------------------------------------------------------------------
##
def protein(self, unique_ID, safe=True):
"""
Returns the ``Protein`` object associated with the passed unique_ID.
If there is no Protein associated with the provided unique_ID then
if ``safe=True`` (default) an exception is raised, while if
``safe=False`` then ``None`` is returned.
Parameters
-----------
unique_id : string
String corresponding to a unique_ID associated with some protein
safe : bool (default = True)
If set to True then a missing unique_ID will raise an exception.
If ``False`` then a missing unique_ID will simply return None
Returns
--------
Protein Object, None
Depending on if the passed unique_ID is found in the ``Proteome``,
a ``Protein`` object or None will be returned.
"""
# convert unique IDs to strings because this typing also happens
# when new proteins are added
unique_ID_str = str(unique_ID)
try:
return self._records[unique_ID_str]
except KeyError:
if safe:
raise ProteomeException("unique_ID '%s' not found in proteome" % (unique_ID))
else:
return None
## ------------------------------------------------------------------------
##
def add_protein(self, sequence, name, unique_ID, attributes=None, force_overwrite=False):
    """
    Adds a single new protein to the Proteome in an ad-hoc fashion.
    Most of the time it makes sense to add proteins all at once from
    some input source, but adding proteins one at a time is also
    useful.

    Parameters
    -----------
    sequence : string
        Amino acid sequence of the protein. No sanity check of the
        sequence is performed here.

    name : string
        String reflecting the protein name. This can be anything.

    unique_ID : string
        Unique_ID to store the protein under. It is cast to str
        before use.

    attributes : dict (default = None)
        Key-value pairing of arbitrary information to associate with
        the protein as a whole (e.g. gene names, identifiers, copy
        number). It is passed through to the Protein constructor.

    force_overwrite : bool (default = False)
        If False and the unique_ID is already present a
        ProteomeException is raised. If True the pre-existing entry
        is replaced.

    Returns
    --------
    None
        No return value, but a new Protein is stored in the Proteome.

    Raises
    --------
    ProteomeException
        If the unique_ID already exists and force_overwrite is False.
    """

    unique_ID_str = str(unique_ID)

    # refuse duplicates unless the caller explicitly asked to overwrite;
    # the message wording matches the one used by the bulk-add functions
    if unique_ID_str in self._records and force_overwrite is False:
        raise ProteomeException('Non-unique unique_ID passed [%s]' % (unique_ID_str))

    self._records[unique_ID_str] = Protein(sequence, name, self, unique_ID_str, attributes)
## ------------------------------------------------------------------------
##
def add_proteins(self, input_list, force_overwrite=False):
r"""
Function that allows the user to add a multiple new proteins using
either a list of protein dictionaries (described below) or a list
of Protein objects.
**Protein dictionaries**
One mode of adding multiple proteins is by passing a list of
Protein dictionaries.
Protein dictionaries are dictionaries that posses the following
key-value pairs ::
'sequence' : amino acid sequence (str)
'name' : protein name (str)
'unique_ID' : The unique identification number used for the
protein (str)
'attributes' : A dictionary of arbitrary key-value pairs to
associate with the protein (dict or None)
Additional keys/value pairs are ignored and ALL four of these must
be included. If any are missing for any protein entry this function
raises a ProteomeException.
**Protein objects**
A second mode of adding multiple proteins is by passing a list of
Protein objects
In both cases, the function automatically determines the type of
the passed list, and adds dictionaries accordingly. Note that
in both cases proteins are added by value - i.e. a new Protein
object is generated.
Parameters
-----------
input_list : list
List of Protein dictionaries or list of Protein objects
force_overwrite : bool (default = False)
If set to False and a unique_ID is included that already is
found then this function will raise an exception. However, if
set to True it will automatically overwrite the pre-existing
entry.
Returns
--------
None
No return status, but valid proteins included in the input_list
will be added to to the underlying proteome.
"""
# cycles over every element in the input list and builds a new
# list where each
type_list = list(set([type(i) for i in input_list]))
# checks if only one type of object is found here
if len(type_list) > 1:
raise ProteomeException(f'Trying to add Proteins to a Proteome and the input_list contains more than one type {type_list}')
# if we're using a list of protein dictionaries
if type_list[0] == dict:
self._add_proteins_dict(input_list, force_overwrite)
# if we're using a list of Protein objects
elif type_list[0] == Protein:
self._add_proteins_Protein(input_list, force_overwrite)
## ------------------------------------------------------------------------
##
def _add_proteins_dict(self, input_list, force_overwrite=False):
"""
Internal function that mirrors add_proteins() but operates if every
element in the input_list is a dictionary.
Importantly, this works by create a NEW proteins and populating
based on the key-value mapping in the protein dictionary.
Protein dictionaries are dictionaries that posses the following
key-value pairs:
'sequence' : amino acid sequence (str)
'name' : protein name (str)
'unique_ID' : The unique identification number used for the
protein (str)
'attributes' : A dictionary of arbitrary key-value pairs to
associate with the protein (dict or None)
This function is not be called directly, but instead used by
add_proteins() function
Parameters
--------------
input_list : list
List of dictionaries
force_overwrite : bool, default=False
Flag which if set to True will mean a Protein in the input_list
would overwrite an existing protein with the same unique_ID.
Returns
--------
None
No return type, but will add the protein dictoinaries in the
input_list into this Proteome.
"""
# for each protein entry in the input list
for entry in input_list:
try:
sequence = str(entry['sequence'])
name = str(entry['name'])
unique_ID = str(entry['unique_ID'])
attributes = entry['attributes']
except KeyError:
# if something goes wrong while extracting the four required attributes we build a
# diagnosis string and then print this as we raise an exception. The goal here is to
# try and provide the user with as much info as possible to diagnose the problem
diagnosis_string = self.__build_diagnosis_string_proteome_construction(entry)
raise ProteomeException('%s'%(diagnosis_string))
if unique_ID in self._records:
if force_overwrite is False:
raise ProteomeException('Non-unique unique_ID passed [%s]' % (unique_ID))
# add in a new protein
self._records[unique_ID] = Protein(sequence, name, self, unique_ID, attributes)
## ------------------------------------------------------------------------
##
def _add_proteins_Protein(self, input_list, force_overwrite=False):
"""
Internal function that mirrors add_proteins() but operates if every
element in the input_list is a Protein object.
Importantly, this works by create a NEW protein, and ensure all
complex datatypes associated with that new protein are copied.
This function is not be called directly, but instead used by
add_proteins() function
Parameters
--------------
input_list : list of shephard.protein.Protein objects
List of Protein objects to be copied into this Proteome
force_overwrite : bool, default=False
Flag which if set to True will mean a Protein in the input_list
would overwrite an existing protein with the same unique_ID.
Returns
--------
None
No return type, but will add the Protein objects in the input_list
into this Proteome.
"""
# for each protein entry in the input list
for entry in input_list:
# get the unique ID and, if this ID is already found in the Proteome
# raise an exception UNLESS force_overwrite is True
unique_ID = entry.unique_ID
if unique_ID in self._records:
if force_overwrite is False:
raise ProteomeException('Non-unique unique_ID passed [%s]' % (unique_ID))
##
## New protein is fully created and all complex data types are copied, so this
## new protein is a completely distinct entity to the protein in the original
## input list. This avoids any possible issues with cross-referencing back against
## old data structures and keeps things clean. Also ensures that the new proteome
## has updated unique sites and unique domains lists in the usual way. Basically
## this is the most appropritae way to add an existing protein to a new proteome
##
# create new protein
new_protein = Protein(entry.sequence, entry.name, self, entry.unique_ID)
# update attributes
for a in entry.attributes:
new_protein.add_attribute(a, copy.deepcopy(entry.attribute(a)))
# update domains
for d in entry.domains:
new_protein.add_domain(d.start, d.end, d.domain_type, copy.deepcopy(d._attributes))
# update sites
for s in entry.sites:
new_protein.add_site(s.position. s.site_type, s.symbol, s.value, copy.deepcopy(s._attributes))
# update tracks
for t in entry.tracks:
vals = t.values
# if vals present then we're creating a values track. Note we can get away with
# a shallow copy because we know these tracks will only have ints or chars in their
# elements, which are appropriately copied by a shallow copy
if vals:
new_protein.add_track(t.name, vals.copy(), None)
else:
new_protein.add_track(t.name, None, t.symbols.copy())
self._records[unique_ID] = new_protein
## ------------------------------------------------------------------------
##
def remove_protein(self, unique_ID, safe=True):
"""
Function that removes a given protein from the Proteome based on the
passed unique_ID. If the passed unique_ID does not exist then this
will trigger an exception unless safe=False.
Parameters
------------
unique_ID : str
Unique ID that will be used to retrieve a given protein
safe : bool (default = True)
Flag that if set to True means if a passed unique_ID is missing
from the underlying proteome object an exception wll be raised
(ProteomeException). If set to False, a missing unique_ID is
ignored.
Returns
-----------
None
No return type but will remove an entry from the Proteome.
"""
unique_ID_str = str(unique_ID)
if unique_ID_str in self._records:
del self._records[unique_ID_str]
else:
if safe:
raise ProteomeException('Passed unique_ID [%s] not found in this proteome' % (unique_ID_str))
## ------------------------------------------------------------------------
##
def remove_proteins(self, input_list, safe=True):
"""
Function that removes a given proteome from the Proteome based on
the passed unique_ID. If the passed unique_ID does not exist then
this will trigger an exception unless safe = False.
Parameters
------------
input_list : list of str
List that contains the unique IDs that will be used to select
proteins for deletion.
safe : bool (default = True)
Flag that if set to True means if a passed unique_ID is missing
from the underlying proteome object an exception wll be raised
(ProteomeException). If False a missing unique_ID is ignored.
Returns
-----------
None
No return type but will remove an entry from the proteome
"""
for unique_ID in input_list:
self.remove_protein(unique_ID, safe=safe)
###################################
## ##
## ATTRIBUTE FUNCTIONS ##
## ##
###################################
## ------------------------------------------------------------------------
##
@property
def attributes(self):
"""
Provides a list of the keys associated with every attribute associated
with this protein.
Returns
-------
list
returns a list of the attribute keys associated with the Proteome.
"""
return list(self._attributes.keys())
## ------------------------------------------------------------------------
##
def attribute(self, name, safe=True):
"""
Function that returns a specific attribute as defined by the name.
Recall that attributes are name : value pairs, where the 'value'
can be anything and is user defined. This function will return
the value associated with a given name.
Parameters
----------------
name : str
The attribute name. A list of valid names can be found by
calling the ``<Proteome>.attributes()`` (which returns a list
of the valid names).
safe : bool (default = True)
Flag which if true with throw an exception if an attribute with
the same name already exists.
Returns
---------
Unknown
Will either return whatever was associated with that attribute
(which could be anything) or None if that attribute is missing.
"""
# if name is in the _atributes dictionary the return
if name in self._attributes:
return self._attributes[name]
else:
# else if safe was passed raise an exception if that attribute was missing
if safe:
raise ProteomeException('Requesting attribute [%s] from Proteome [%s] but this attribute has not been assigned' % (name, str(self)))
# if safe not passed just return None
else:
return None
## ------------------------------------------------------------------------
##
def add_attribute(self, name, val, safe=True):
"""
Function that adds an attribute. Note that if safe is true,
this function will raise an exception if the attribute is
already present. If safe=False, then an exisiting value will
be overwritten.
Parameters
----------------
name : str
The parameter name that will be used to identify it
val : <anything>
An object or primitive we wish to associate with this
attribute
safe : bool (default = True)
Flag which if True with throw an exception if an attribute
with the same name already exists, otherwise the newly
introduced attribute will overwrite the previous one.
Returns
---------
None - but adds an attribute to the calling object
"""
if safe:
if name in self._attributes:
raise ProteomeException("Trying to add attribute [%s=%s] to Proteome [%s] but this attribute is already set.\nPossible options are: %s" %(name,val, str(self), str(self._attributes.keys())))
self._attributes[name] = val
## ------------------------------------------------------------------------
##
def remove_attribute(self, name, safe=True):
"""
Function that removes a given attribute from the Proteome based on the
passed attribute name. If the passed attribute does not exist or is not
associate with the protein then this will trigger an exception
unless safe=False.
Parameters
----------------
name : str
The parameter name that will be used to identify it
safe : bool (default = True)
Flag which if True with throw an exception if an
attribute this name does not exists. If set to
False then if an attribute is not found it is simply
ignored
Returns
---------
None
No return type but will remove an attribute from the
Proteome if present.
"""
if name not in self._attributes:
if safe:
raise ProteomeException(f'Passed attribute [{name}] not found in {self}')
else:
del self._attributes[name]
###################################
## ##
## DOMAIN FUNCTIONS ##
## ##
###################################
## ------------------------------------------------------------------------
##
@property
def domains(self):
"""
Function that returns a list of all domain objects associated with
the Proteome.
This function is useful if you wish to indiscriminately ask questions
of domains without considering the proteins they come from. However,
each Domain has a Protein object associated with it (via the .protein
operator), so one can always map a Domain back to a Protein.
Returns
--------------
list of Domains
A list of all the Domains from every protein in the Proteome
"""
all_domains = []
for prot in self:
all_domains.extend(prot.domains)
return all_domains
## ------------------------------------------------------------------------
##
@property
def unique_domain_types(self):
"""
Returns the list of unique Domain types associated with this Proteome.
Return
-------
list of str
Each element in the list is a string that corresponds to a Domain
type.
"""
# Some description of what's going on here is in order. Every time a new domain
# is added, the Domain constructor calls the function _Domain__update_domain_types
# which checks if the domain_type of the domain being added is already in the
# _unique_domain_types list. If yes, fine, if no, it gets added. This means
# _unique_domain_types keeps track of a count of the complete number of unique
# domains in the Proteome. An analogous setup holds true for the sites and tracks.
return list(self._unique_domain_types.keys())
## ------------------------------------------------------------------------
##
def get_domains_by_type(self, domain_type, perfect_match=True):
"""
Function that returns a list of domains from all proteins that matched against
a specific domain type name.
Parameters
------------
domain_type : string
String associated domain_type that you want to search for.
perfect_match : bool (default = True)
Flag that identifies if the domain names should be a perfect
match (=True) or if the string passed should just appear
somewhere in the domain_type string
Returns
-----------
list
Returns a list of Domain objects that match the requested type.
Objects are ordered by starting position in sequence.
"""
return_list = []
for p in self:
return_list.extend(p.get_domains_by_type(domain_type, perfect_match))
return return_list
###################################
## ##
## SITES FUNCTIONS ##
## ##
###################################
## ------------------------------------------------------------------------
##
@property
def sites(self):
"""
Function that returns a list of all Site objects associated with
the Proteome.
This function is useful if you wish to indiscriminately ask questions
of sites without considering the proteins they come from. However,
each Site has a Protein object associated with it (via .protein
operator), so one can always map a Site back to a Protein.
Returns
--------------
list of Sites
A list of all the Sites from every protein in the Proteome
"""
all_sites = []
for prot in self:
all_sites.extend(prot.sites)
return all_sites
## ------------------------------------------------------------------------
##
@property
def unique_site_types(self):
"""
Returns the list of unique Site types associated with this Proteome.
Return
-------
list of str
Each element in the list is a string that corresponds to a Site type
"""
# Some description of what's going on here is in order. Every time a new site
# is added, the Site constructor calls the function _Site__update_site_types
# which checks if the domain_type of the domain being added is already in the
# _unique_site_types list. If yes, fine, if no, it gets added. This means
# _unique_site_types keeps track of a count of the complete number of unique
# sites in the Proteome. An analogous setup holds true for domains and tracks.
return list(self._unique_site_types.keys())
## ------------------------------------------------------------------------
##
def get_sites_by_type(self, site_types):
"""
Function that returns a list of sites from all proteins that matched against
a specific site type name or set of site type names.
Parameters
------------
site_types : string or list of strings
One or more possible site_types that may be found in the protein.
Either a single string or a list of strings can be passed,
allowing for one or more sites to be grouped together
Returns
-----------
list
Returns a list of Domain objects that match the requested type.
Objects are ordered by starting position in sequence.
"""
return_list = []
for p in self:
return_list.extend(p.get_sites_by_type(site_types, return_list=True))
return return_list
###################################
## ##
## TRACK FUNCTIONS ##
## ##
###################################
## ------------------------------------------------------------------------
##
@property
def unique_track_names(self):
"""
Returns the list of unique Track names associated with this Proteome.
Return
-------
list of strings
Each element in the list is a string that corresponds to a
Track name found in one (or more) proteins
"""
# Some description of what's going on here is in order. Every time a new track
# is added, the Track constructor calls the function _Track__update_track_names
# which checks if the track_name of the domain being added is already in the
# _unique_track_types list. If yes, fine, if no, it gets added. This means
# _unique_track_names keeps track of a count of the complete number of unique
# domains in the Proteome. An analogous setup holds true for sites and domains
return list(self._unique_track_names.keys())
## ------------------------------------------------------------------------
##
@property
def track_names_to_track_type(self):
"""
Returns a (copy of a) dictionry that maps track name to track type.
We return a copy so there's no way we can accidentally break the
internal book-keeping of the Proteome object.
Return
-------
dict
A dictionary that contains the unique track names and maps
each name to either values or type.
"""
return dict(self._track_name_to_track_type)
####################################
## ##
## INTERNAL FUNCTIONS ##
## ##
####################################
## ------------------------------------------------------------------------
##
def __len__(self):
"""
The length of the Proteome is defined as the number of proteins
in it.
Returns
-------
int
Returns an integer that reflects the number of proteins
"""
# this function means when we call len(Proteome_Object) we get back
# the number of proteins in it
return len(self._records)
## ------------------------------------------------------------------------
##
def __repr__(self):
"""
Provides a nice representation of the Proteome
Returns
-------
string
Formatted description of the Proteome.
"""
# this function means when we print a Proteome object or cast it to
# a string we get a nice/informative representation, rather than the
# id of the object
return "[Proteome]: Sequence dataset with %i protein records" %(len(self))
## ------------------------------------------------------------------------
##
def __iter__(self):
"""
Allows a Proteome object to act as a generator that yields actual
proteins, so the syntax
.. code-block:: python
for protein in ProteomeObject:
print(protein.sequence)
is be valid and would iterate through the proteins in the Proteome.
This makes performing some analysis over all proteins quite easy.
"""
for i in self._records:
yield self._records[i]
## ------------------------------------------------------------------------
##
def __contains__(self, m):
"""
Enables the syntax X in Proteome to be used, where X can be
either a unique ID or a Proteome object.
.. code-block:: python
if protein.unique_ID in ProteomeObject:
print(f'The protein {protein} is in the Proteome!')
"""
if type(m) == str:
if m in self._records.keys():
return True
else:
return False
elif type(m) == Protein:
if m.unique_ID in self._records.keys():
return True
else:
return False
## ------------------------------------------------------------------------
##
def __getitem__(self, key):
"""
Allows slicing index into Proteome to retrieve subsets of protein
.. code-block:: python
first_protein = ProteomeObject[0]
print(f'The first protein is {first_protein}')
"""
if isinstance(key, int) and key >= 0:
return list(islice([self._records[i] for i in self._records], key, key+1))[0]
elif isinstance(key, slice):
return list(islice([self._records[i] for i in self._records], key.start, key.stop, key.step))
else:
raise KeyError("Key must be non-negative integer or slice, not {}"
.format(key))
## ------------------------------------------------------------------------
##
def _Domain__update_domain_types(self, domain_type):
"""
INTERNAL FUNCTION (not for public API use)
Note - we this function is named as __Domain_... so it can be
specifically and uniquely be called from a Domain object. This
function is ONLY called last thing in the Domain constructor
where it allows the Proteome object to keep track of the total
number of unique domain types in the Proteome.
The function is (by default) called by the Domain constructor.
Parameters
----------------
domain_type : string
String that defines a domain type
Returns
---------------
No return value, but will appropriately update the Proteome object
"""
# because _unique_track_names is a dictionary this scales O(1) with number
# of track names
if domain_type not in self._unique_domain_types:
self._unique_domain_types[domain_type] = 1
else:
self._unique_domain_types[domain_type] = self._unique_domain_types[domain_type] + 1
## ------------------------------------------------------------------------
##
def _Protein__decrement_domain_types(self, domain_type):
"""
INTERNAL FUNCTION (not for public API use)
Note - we this function is named as __Protein_... so it
can be specifically and uniquely be called from a Protein
object. This function is ONLY called last thing when a
Protein object deletes a domain
Parameters
----------------
track_name : string
String that defines a Domain type
Returns
---------------
No return value, but will appropriately update the Proteome
object
"""
# if we can't find the domain name in the unique domain types... this is bad!
if domain_type not in self._unique_domain_types:
raise ProteomeException("Tried to remove a Domain type [{domain_type}] from the Proteome.unique_domain_types dictionary but the Domain type could not be found. This is a bug. Please report as a GitHub Issue.")
# we are removing a unique domain_type name! Big event!
elif self._unique_domain_types[domain_type] == 1:
del self._unique_domain_types[domain_type]
# else decrement one
else:
self._unique_domain_types[domain_type] = self._unique_domain_types[domain_type] - 1
## ------------------------------------------------------------------------
##
def _Track__update_track_names(self, track_name, track_type):
    """
    INTERNAL FUNCTION (not for public API use)

    Register one more use of a Track name with the Proteome, and
    record (or verify) the track type associated with that name.

    Note - this function is named _Track__... so that it can be
    specifically and uniquely called from within a Track object
    (via Python's private name mangling). It is intended to be
    called as the last step of the Track constructor, where it allows
    the Proteome object to keep track of the total number of unique
    Track names in the Proteome.

    Parameters
    ----------------
    track_name : string
        String that defines the Track name

    track_type : string
        Type of the track. The first registration of a name fixes the
        type for that name; every later registration of the same name
        must pass an equal value.

    Returns
    ---------------
    None
        No return value, but will appropriately update the Proteome object

    Raises
    ---------------
    ProteomeException
        If ``track_name`` is already registered with a different
        ``track_type``. In that case no bookkeeping is modified.
    """

    # because _unique_track_names is a dictionary this scales O(1) with number
    # of track names
    if track_name not in self._unique_track_names:
        self._unique_track_names[track_name] = 1
        self._track_name_to_track_type[track_name] = track_type
        return

    # validate BEFORE touching the counter, so that a rejected registration
    # does not leave the usage count inflated
    existing_type = self._track_name_to_track_type[track_name]
    if existing_type != track_type:
        raise ProteomeException(f"Tried to assigned track name [{track_name}] as a [{track_type}] track, but this track was already assigned as a [{existing_type}] track. Cannot have two tracks with the same name but different types")

    self._unique_track_names[track_name] += 1
## ------------------------------------------------------------------------
##
def _Protein__decrement_track_names(self, track_name):
    """
    INTERNAL FUNCTION (not for public API use)

    Decrease the usage count of a Track name by one. When the count
    would reach zero the name is removed from both the usage-count
    dictionary and the name-to-type dictionary.

    Note - this function is named _Protein__... so that it can be
    specifically and uniquely called from within a Protein object
    (via Python's private name mangling). It is intended to be
    called as the last step when a Protein object deletes a Track.

    Parameters
    ----------------
    track_name : string
        String that defines the Track name

    Returns
    ---------------
    None
        No return value, but will appropriately update the Proteome
        object.

    Raises
    ---------------
    ProteomeException
        If ``track_name`` is not currently being tracked, which means
        the bookkeeping is out of sync with the Proteome contents.
    """

    # if we can't find the track name in the unique track names... this is bad!
    if track_name not in self._unique_track_names:
        # NOTE: the message must be an f-string, otherwise the offending
        # track name is never written into the error text
        raise ProteomeException(f"Tried to remove a Track name [{track_name}] from the Proteome.unique_track_names dictionary but the track could not be found. This is a bug. Please report as a GitHub Issue.")

    # this was the last Track with this name, so forget both the count
    # and the type that was associated with the name
    if self._unique_track_names[track_name] == 1:
        del self._unique_track_names[track_name]
        del self._track_name_to_track_type[track_name]

    # otherwise other Tracks with this name remain; just decrement
    else:
        self._unique_track_names[track_name] -= 1
## ------------------------------------------------------------------------
##
def _Site__update_site_types(self, site_type):
    """
    INTERNAL FUNCTION (not for public API use).

    Register one more use of a Site type with the Proteome.

    Note - this function is named _Site__... so that it can be
    specifically and uniquely called from within a Site object. It is
    intended to be called as the last step of the Site constructor,
    where it allows the Proteome object to keep track of the total
    number of unique Site types in the Proteome.

    Parameters
    ----------------
    site_type : string
        String that defines a site type

    Returns
    ---------------
    None
        No return value, but will appropriately update the Proteome
        object.
    """

    # a site type seen for the first time starts from an implicit count
    # of zero; either way the stored count goes up by exactly one
    site_counts = self._unique_site_types
    site_counts[site_type] = site_counts.get(site_type, 0) + 1
## ------------------------------------------------------------------------
##
def _Protein__decrement_site_types(self, site_type):
    """
    INTERNAL FUNCTION (not for public API use)

    Decrease the usage count of a Site type by one, removing the entry
    from the Proteome-level bookkeeping dictionary when the count
    would reach zero.

    Note - this function is named _Protein__... so that it can be
    specifically and uniquely called from within a Protein object
    (via Python's private name mangling). It is intended to be
    called as the last step when a Protein object deletes a Site.

    Parameters
    ----------------
    site_type : string
        String that defines a site type

    Returns
    ---------------
    None
        No return value, but will appropriately update the Proteome object

    Raises
    ---------------
    ProteomeException
        If ``site_type`` is not currently being tracked, which means
        the bookkeeping is out of sync with the Proteome contents.
    """

    # if we can't find the site type in the unique site types... this is bad!
    if site_type not in self._unique_site_types:
        # NOTE: the message must be an f-string, otherwise the offending
        # site type is never written into the error text
        raise ProteomeException(f"Tried to remove a Site type [{site_type}] from the Proteome.unique_site_types dictionary but the Site type could not be found. This is a bug. Please report as a GitHub Issue.")

    # this was the last Site of this type, so drop the entry entirely
    if self._unique_site_types[site_type] == 1:
        del self._unique_site_types[site_type]

    # otherwise other Sites of this type remain; just decrement
    else:
        self._unique_site_types[site_type] -= 1
## ------------------------------------------------------------------------
##
def __build_diagnosis_string_proteome_construction(self, entry):
    """
    INTERNAL FUNCTION (not for public API use)

    Build a human-readable, multi-line report describing a single
    protein entry, to help diagnose what went wrong during Proteome
    construction. Intended to be called by the Proteome constructor.

    The report starts with a fixed header line, followed by one
    ``<field>: <value>`` line for each of ``sequence``, ``name``,
    ``unique_ID`` and ``attributes`` (in that order). If a field
    cannot be read from ``entry`` or cannot be converted to a string,
    its value is reported as ``FAILED`` instead of raising.

    Parameters
    ----------------
    entry : object
        The entry being parsed. Fields are read with ``entry[<field>]``.

    Returns
    ---------------
    str
        The assembled diagnostic message.
    """

    report_lines = ["Error building proteome when parsing the following entry:\n"]

    for field in ('sequence', 'name', 'unique_ID', 'attributes'):

        # any failure at all (missing key, entry not subscriptable, broken
        # __str__ ...) must not mask the error this report is describing
        try:
            shown = str(entry[field])
        except Exception:
            shown = 'FAILED'

        report_lines.append(f"{field}: {shown}\n")

    return "".join(report_lines)