Source code for shephard.proteome

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""

from . import general_utilities
from .exceptions import ProteomeException
from .protein import Protein
from itertools import islice
import copy

[docs]class Proteome:
    """
    The Proteome object is the main unit for information storage in 
    SHEPHARD.

    There are a few ways that new Proteomes can be generated:

    * By reading in a FASTA file (using shephard.interfaces.apis.fasta)

    The Proteome constructor takes a single argument, which is a 
    list of protein dictionaries or a list of Protein objects. This 
    means Proteome objects can be generated directly (see below for 
    a definition of protein dictionaries). However, it is often more 
    convenient to build Proteomes from FASTA files. For more information 
    on this see the ``api`` documentation.
    
        
    Protein dictionaries are dictionaries that must contain four elements 
    (others are ignored).
    
    * ``sequence`` : *str* - Amino acid sequence of the protein.        

    * ``name`` : *str* - Name of the protein (this can be anything; it is not used internally, so there are no constraints on what this is).

    * ``unique_ID`` :  *str* - This must be unique with respect to all other unique_IDs in the set of proteins in the input list.
                                          
    * ``attributes`` : *dict* - Dictionary of one or more attributes to apply to this protein. Key/value pairs in this dictionary can be arbitrary and are user defined.
    
    

    As an example:

    >>> protein_dictionary_example = {'sequence':'ALAPSLLPAMPALSPALSP', 
                                      'name': 'my protein fragment', 
                                      'unique_ID':'UXX01', 'attributes':{}}
    >>> dictionary_list = []
    >>> dictionary_list.append(protein_dictionary_example)
    >>> P = Proteome(dictionary_list)
                                          
    Note that ``sequence``, ``name`` and ``unique_ID`` are cast to *str* by 
    the function, so if numerical values are passed for any these will be 
    converted to strings.
    
    **Notes**

    * NOTE that ALL FOUR of these are required for EACH protein, even if the 
      attributes dictionary is empty.

    * The unique_ID is checked for uniqueness against all others in the Proteome 
      and an exception is raised if it is, in fact, not unique.

    * Additional proteins can be added using the `.add_protein()` or 
      `.add_proteins()` functions.

    """

    ## ------------------------------------------------------------------------
    ##
    def __init__(self, input_list = None, attributes = None, force_overwrite=False):
        """
        Constructor that generates a new Proteome object. This includes 
        taking a list of Protein objects or protein dictionaries (input_list) 
        and, optionally, an attributes dictionary for the proteome itself.

        In addition, force_overwrite can be used to deal with duplicate 
        entries in the input_list.

        There are two types of lists that can be tolerated when passed to
        the Proteome constructor: a list of protein dictionaries and a 
        list of Protein objects. Note this simply uses the .add_proteins() 
        function to add the passed input_list.

        **Protein dictionaries**
        One mode of adding multiple proteins is by passing a list of 
        protein dictionaries.

        Protein dictionaries are dictionaries that possess the following 
        key-value pairs ::

            'sequence'   : amino acid sequence (str)
            'name'       : protein name (str)
            'unique_ID'  : The unique identification number used for the 
                           protein (str)
            'attributes' : A dictionary of arbitrary key-value pairs to 
                           associate with the protein (dict or None)

        Additional key/value pairs are ignored and ALL four of these must
        be included. If any are missing for any protein entry a 
        ProteomeException is raised.

        **Protein objects**
        A second mode of adding multiple proteins is by passing a list of
        Protein objects. This is useful if you're creating a new Proteome
        based on a subset of proteins from an existing Proteome.

        In both cases the type of the passed list is determined 
        automatically and proteins are added by value - i.e. new Protein 
        objects are generated, and changes to Proteins in the new Proteome 
        will not affect the original Proteome.

        Parameters
        -----------
        input_list : list (default = None)
            List of protein dictionaries or list of Protein objects. If 
            None or empty, an empty Proteome is created.

        attributes : dict (default = None)
            An arbitrary set of key-value pairs to annotate the proteome 
            with metadata. The passed dictionary is stored as-is (it is 
            not copied).

        force_overwrite : bool (default = False)
            If set to False and there are duplicate unique_IDs in the 
            input_list this will trigger an exception. However, if set to 
            True then a later entry overwrites an earlier one in the case 
            of duplicates.

        Raises
        -------
        ProteomeException
            Passed to the attributes check as the exception type to use 
            when attributes is neither a dictionary nor None, and raised 
            by add_proteins() for invalid or duplicate entries.

        """

        # initialize book-keeping instruments
        self._records = {}                   # dict that maps unique_ID (str) to Protein object
        self._unique_domain_types = {}       # presumably maps Domain type to a count, mirroring sites
        self._unique_site_types = {}         # dict that maps a Site type to a count
        self._unique_track_names = {}        # dict that maps Track name to count
        self._track_name_to_track_type = {}  # dict that maps Track name to track type ('values' or 'symbols')

        # check attributes dictionary
        general_utilities.variable_is_dictionary(attributes, ProteomeException, 'attributes argument passed to proteome is not a dictionary', or_none = True)

        self._attributes = {} if attributes is None else attributes

        # if no/empty input provided then we're done
        if input_list is None or len(input_list) == 0:
            return

        # forward force_overwrite so that the documented duplicate handling
        # actually takes effect (it was previously dropped here)
        self.add_proteins(input_list, force_overwrite=force_overwrite)



    ## ------------------------------------------------------------------------
    ##                
    def check_unique_ID(self, unique_id):
        """
        Function that checks if a given unique ID is found. Note that this 
        function is not needed for testing if a unique_ID is present if the 
        goal is to request Protein Objects (or not). Instead, one can use 
        the .protein(<unique_ID>, safe=False). By setting safe=False if the
        unique_ID is not found then this function will simply return None.

        Parameters
        -----------
        unique_id : string
            String corresponding to a unique_ID associated with some protein

        Returns
        ----------
        bool
            Returns True if the passed ID is present, or False if not.

        """
        if unique_id in self._records.keys():
            return True
        else:
            return False



    ## ------------------------------------------------------------------------
    ##
    @property
    def proteins(self):
        """
        Returns a list of unique_IDs that correspond to the proteins in 
        this Proteome. NOTE this returns a list of the IDs, not the actual 
        Protein objects. To get the corresponding protein object one must 
        use the ``.protein(<unique_ID>)`` notation.
                
        Returns
        --------

        ``list`` of ``str``
            Returns a list of unique IDs

        """

        return list(self._records.keys())



    ## ------------------------------------------------------------------------
    ##
    def protein(self, unique_ID, safe=True):
        """
        Returns the ``Protein`` object associated with the passed unique_ID. 
        If there is no Protein associated with the provided unique_ID then 
        if ``safe=True`` (default) an exception is raised, while if 
        ``safe=False`` then ``None`` is returned.

        Parameters
        -----------
        unique_id : string
            String corresponding to a unique_ID associated with some protein

        safe : bool (default = True)
            If set to True then a missing unique_ID  will raise an exception. 
            If ``False`` then a missing unique_ID will simply return None
            
        Returns
        --------
        
        Protein Object, None
            Depending on if the passed unique_ID is found in the ``Proteome``, 
            a ``Protein`` object or None will be returned.            

        """

        # convert unique IDs to strings because this typing also happens 
        # when new proteins are added
        unique_ID_str = str(unique_ID)

        try:
            return self._records[unique_ID_str]
        except KeyError:
            if safe:
                raise ProteomeException("unique_ID '%s' not found in proteome" % (unique_ID))
            else:
                return None



    ## ------------------------------------------------------------------------
    ##
    def add_protein(self, sequence, name, unique_ID, attributes=None, force_overwrite=False):
        """
        Function that allows the user to add a new protein to a Proteome 
        in an ad-hoc fashion. In general most of the time it will make 
        sense to add proteins all at once from some input source, but the 
        ability to add proteins one at a time is also useful. 

        If a duplicate unique_ID is passed an exception (ProteomeException) 
        is raised, unless force_overwrite is True.

        Parameters
        -----------
        sequence : string
            Amino acid sequence of the protein. Note - no sanity check of 
            the sequence is performed here.

        name : string
            String reflecting the protein name. Again this can be 
            anything.

        unique_ID : string
            String corresponding to a unique_ID associated with some 
            protein. Cast to *str* before use.

        attributes : dict (default = None)
            The attributes dictionary provides a key-value pairing for 
            arbitrary information. This could include gene names, different 
            types of identifiers, protein copy number, a set of protein 
            partners, or anything else one might wish to associate with the
            protein as a whole. Default is None.

        force_overwrite : bool (default = False)
            If set to False and a unique_ID is included that already is 
            found then this function will raise an exception. However, if 
            set to True it will automatically overwrite the pre-existing 
            entry. (Default = False).

        Returns
        --------
        None
            No return status, but the new protein will be added to the 
            underlying proteome.

        Raises
        -------
        ProteomeException
            If the unique_ID is already present and force_overwrite is 
            False.

        """

        unique_ID_str = str(unique_ID)

        if unique_ID_str in self._records:
            if force_overwrite is False:
                raise ProteomeException('Non-unique unique_ID passed [%s]' % (unique_ID_str))

        # the new Protein is handed a reference to this Proteome
        self._records[unique_ID_str] = Protein(sequence, name, self, unique_ID_str, attributes)



    ## ------------------------------------------------------------------------
    ##        
    def add_proteins(self, input_list, force_overwrite=False):
        r"""
        Function that allows the user to add a multiple new proteins using 
        either a list of protein dictionaries (described below) or a list 
        of Protein objects.        

        **Protein dictionaries**
        
        One mode of adding multiple proteins is by passing a list of 
        Protein dictionaries.

        Protein dictionaries are dictionaries that posses the following 
        key-value pairs ::

            'sequence'   : amino acid sequence (str)
            'name'       : protein name (str)
            'unique_ID'  : The unique identification number used for the 
                           protein (str)
            'attributes' : A dictionary of arbitrary key-value pairs to 
                           associate with the protein (dict or None)
        
        Additional keys/value pairs are ignored and ALL four of these must
        be included. If any are missing for any protein entry this function 
        raises a ProteomeException.

        **Protein objects**
        A second mode of adding multiple proteins is by passing a list of
        Protein objects

        In both cases, the function automatically determines the type of 
        the passed list, and adds dictionaries accordingly. Note that 
        in both cases proteins are added by value - i.e. a new Protein
        object is generated.
        
        Parameters
        -----------
        input_list : list
            List of Protein dictionaries or list of Protein objects
        
        force_overwrite : bool (default = False)
            If set to False and a unique_ID is included that already is 
            found then this function will raise an exception. However, if 
            set to True it will automatically overwrite the pre-existing 
            entry. 

        Returns
        --------
        
        None
            No return status, but valid proteins included in the input_list 
            will be added to to the underlying proteome.
            

        """

        # cycles over every element in the input list and builds a new
        # list where each 
        type_list = list(set([type(i) for i in input_list]))

        # checks if only one type of object is found here
        if len(type_list) > 1:
            raise ProteomeException(f'Trying to add Proteins to a Proteome and the input_list contains more than one type {type_list}')

        # if we're using a list of protein dictionaries
        if type_list[0] == dict:
            self._add_proteins_dict(input_list, force_overwrite)

        # if we're using a list of Protein objects
        elif type_list[0] == Protein:
            self._add_proteins_Protein(input_list, force_overwrite)

        

    ## ------------------------------------------------------------------------
    ##        
    def _add_proteins_dict(self, input_list, force_overwrite=False):
        """
        Internal function that mirrors add_proteins() but operates if every
        element in the input_list is a dictionary.

        Importantly, this works by create a NEW proteins and populating
        based on the key-value mapping in the protein dictionary. 

        Protein dictionaries are dictionaries that posses the following 
        key-value pairs:

            'sequence'   : amino acid sequence (str)
            'name'       : protein name (str)
            'unique_ID'  : The unique identification number used for the 
                           protein (str)
            'attributes' : A dictionary of arbitrary key-value pairs to 
                           associate with the protein (dict or None)


        This function is not be called directly, but instead used by 
        add_proteins() function

        Parameters
        --------------
        input_list : list 
            List of dictionaries  

        force_overwrite : bool, default=False
            Flag which if set to True will mean a Protein in the input_list
            would overwrite an existing protein with the same unique_ID.

        Returns
        --------
        None
            No return type, but will add the protein dictoinaries in the 
            input_list into this Proteome.

        
        """
        
        # for each protein entry in the input list
        for entry in input_list:
        
            try:
                sequence       = str(entry['sequence'])
                name           = str(entry['name'])
                unique_ID      = str(entry['unique_ID'])
                attributes     = entry['attributes']
            except KeyError:
                # if something goes wrong while extracting the four required attributes we build a 
                # diagnosis string and then print this as we raise an exception. The goal here is to
                # try and provide the user with as much info as possible to diagnose the problem
            
                diagnosis_string = self.__build_diagnosis_string_proteome_construction(entry)
                raise ProteomeException('%s'%(diagnosis_string))
            
            if unique_ID in self._records:
                if force_overwrite is False:
                    raise ProteomeException('Non-unique unique_ID passed [%s]' % (unique_ID))

            # add in a new protein
            self._records[unique_ID] = Protein(sequence, name, self, unique_ID, attributes)


    ## ------------------------------------------------------------------------
    ##        
    def _add_proteins_Protein(self, input_list, force_overwrite=False):
        """
        Internal function that mirrors add_proteins() but operates if every
        element in the input_list is a Protein object.

        Importantly, this works by create a NEW protein, and ensure all 
        complex datatypes associated with that new protein are copied. 

        This function is not be called directly, but instead used by 
        add_proteins() function

        Parameters
        --------------
        input_list : list of shephard.protein.Protein objects
            List of Protein objects to be copied into this Proteome

        force_overwrite : bool, default=False
            Flag which if set to True will mean a Protein in the input_list
            would overwrite an existing protein with the same unique_ID.

        Returns
        --------
        None
            No return type, but will add the Protein objects in the input_list
            into this Proteome.
        
        """
        
        # for each protein entry in the input list
        for entry in input_list:
            
            # get the unique ID and, if this ID is already found in the Proteome
            # raise an exception UNLESS force_overwrite is True
            unique_ID = entry.unique_ID
            if unique_ID in self._records:
                if force_overwrite is False:
                    raise ProteomeException('Non-unique unique_ID passed [%s]' % (unique_ID))


            ##
            ## New protein is fully created and all complex data types are copied, so this
            ## new protein is a completely distinct entity to the protein in the original
            ## input list. This avoids any possible issues with cross-referencing back against
            ## old data structures and keeps things clean. Also ensures that the new proteome
            ## has updated unique sites and unique domains lists in the usual way. Basically
            ## this is the most appropritae way to add an existing protein to a new proteome
            ##
            # create new protein
            new_protein = Protein(entry.sequence, entry.name, self, entry.unique_ID)

            # update attributes
            for a in entry.attributes:
                new_protein.add_attribute(a, copy.deepcopy(entry.attribute(a)))

            # update domains
            for d in entry.domains:
                new_protein.add_domain(d.start, d.end, d.domain_type, copy.deepcopy(d._attributes))

            # update sites
            for s in entry.sites:
                new_protein.add_site(s.position. s.site_type, s.symbol, s.value, copy.deepcopy(s._attributes))

            # update tracks
            for t in entry.tracks:
                vals = t.values

                # if vals present then we're creating a values track. Note we can get away with
                # a shallow copy because we know these tracks will only have ints or chars in their
                # elements, which are appropriately copied by a shallow copy
                if vals:                    
                    new_protein.add_track(t.name, vals.copy(), None)
                else:
                    new_protein.add_track(t.name, None, t.symbols.copy())

            self._records[unique_ID] = new_protein
           
            


    ## ------------------------------------------------------------------------
    ##        
    def remove_protein(self, unique_ID, safe=True):
        """
        Function that removes a given protein from the Proteome based on the
        passed unique_ID. If the passed unique_ID does not exist then this 
        will trigger an exception unless safe=False.

        
        Parameters
        ------------
        unique_ID : str
            Unique ID that will be used to retrieve a given protein

        safe : bool (default = True)
            Flag that if set to True means if a passed unique_ID is missing 
            from the underlying proteome object an exception wll be raised 
            (ProteomeException). If set to False, a missing unique_ID is 
            ignored.


        Returns
        -----------
        None
            No return type but will remove an entry from the Proteome.
           
        """

        unique_ID_str = str(unique_ID)

        if unique_ID_str in self._records:
            del self._records[unique_ID_str]
        else:
            if safe:
                raise ProteomeException('Passed unique_ID [%s] not found in this proteome' % (unique_ID_str))
            


    ## ------------------------------------------------------------------------
    ##        
    def remove_proteins(self, input_list, safe=True):
        """
        Function that removes a given proteome from the Proteome based on
        the passed unique_ID. If the passed unique_ID does not exist then 
        this will trigger an exception unless safe = False.

        Parameters
        ------------
        input_list : list of str
            List that contains the unique IDs that will be used to select 
            proteins for deletion.

        safe : bool (default = True)
            Flag that if set to True means if a passed unique_ID is missing 
            from the underlying proteome object an exception wll be raised 
            (ProteomeException). If False a missing unique_ID is ignored.

        Returns
        -----------
        None
            No return type but will remove an entry from the proteome           

        """

        for unique_ID in input_list:
            self.remove_protein(unique_ID, safe=safe)
            


     
    ###################################
    ##                               ##
    ##     ATTRIBUTE FUNCTIONS       ##
    ##                               ##
    ###################################


    ## ------------------------------------------------------------------------
    ##
    @property
    def attributes(self):
        """
        Provides a list of the keys associated with every attribute associated
        with this protein.

        Returns
        -------
        list
            returns a list of the attribute keys associated with the Proteome. 


        """
        return list(self._attributes.keys())
    
    
    ## ------------------------------------------------------------------------
    ##
    def attribute(self, name, safe=True):

        """
        Function that returns a specific attribute as defined by the name. 

        Recall that attributes are name : value pairs, where the 'value' 
        can be anything and is user defined. This function will return 
        the value associated with a given name.

        Parameters
        ----------------
        name : str
             The attribute name. A list of valid names can be found by 
             calling the ``<Proteome>.attributes()`` (which returns a list 
             of the valid names).
             
        safe : bool (default = True)
            Flag which if true with throw an exception if an attribute with 
            the same name already exists.
            
        Returns
        ---------
        Unknown 
            Will either return whatever was associated with that attribute 
            (which could be anything) or None if that attribute is missing.            
        
        """

        # if name is in the _atributes dictionary the  return
        if name in self._attributes:
            return self._attributes[name]
        else:

            # else if safe was passed raise an exception if that attribute was missing
            if safe:
                raise ProteomeException('Requesting attribute [%s] from Proteome [%s] but this attribute has not been assigned' % (name, str(self))) 

            # if safe not passed just return None
            else:
                return None
                


    ## ------------------------------------------------------------------------
    ##
    def add_attribute(self, name, val, safe=True):
        """
        Function that adds an attribute. Note that if safe is true, 
        this function will raise an exception if the attribute is 
        already present. If safe=False, then an exisiting value will 
        be overwritten.

        Parameters
        ----------------

        name : str
            The parameter name that will be used to identify it

        val : <anything>
            An object or primitive we wish to associate with this 
            attribute

        safe : bool (default = True)
            Flag which if True with throw an exception if an attribute 
            with the same name already exists, otherwise the newly 
            introduced attribute will overwrite the previous one.

        Returns
        ---------
            None - but adds an attribute to the calling object

        """

        if safe:
            if name in self._attributes:
                raise ProteomeException("Trying to add attribute [%s=%s] to Proteome [%s] but this attribute is already set.\nPossible options are: %s" %(name,val, str(self), str(self._attributes.keys())))
                
        self._attributes[name] = val        
        

    ## ------------------------------------------------------------------------
    ##
    def remove_attribute(self, name, safe=True):
        """
        Function that removes a given attribute from the Proteome based on the 
        passed attribute name. If the passed attribute does not exist or is not 
        associate with the protein then this will trigger an exception 
        unless safe=False.

        Parameters
        ----------------

        name : str
            The parameter name that will be used to identify it

        safe : bool (default = True)
            Flag which if True with throw an exception if an 
            attribute this name does not exists. If set to
            False then if an attribute is not found it is simply
            ignored
            
        Returns
        ---------
        None
            No return type but will remove an attribute from the 
            Proteome if present.
            
        """

        if name not in self._attributes:
            if safe:
                raise ProteomeException(f'Passed attribute [{name}] not found in {self}')
        else:
            del self._attributes[name]



    ###################################
    ##                               ##
    ##       DOMAIN FUNCTIONS        ##
    ##                               ##
    ###################################

    ## ------------------------------------------------------------------------
    ##
    @property
    def domains(self):
        """
        Flat list of the Domain objects of every protein in the Proteome.

        This is useful if you wish to indiscriminately ask questions of 
        domains without considering the proteins they come from. However, 
        each Domain has a Protein object associated with it (via the 
        .protein operator), so one can always map a Domain back to a 
        Protein.

        Returns
        --------------
        list of Domains
             A list of all the Domains from every protein in the Proteome, 
             gathered protein by protein in the Proteome's iteration order

        """

        # flatten the per-protein domain collections into a single list
        return [domain for protein in self for domain in protein.domains]


        
    ## ------------------------------------------------------------------------
    ##                
    @property
    def unique_domain_types(self):
        """
        Returns the list of unique Domain types associated with this Proteome.

        Return
        -------

        list of str
            Each element in the list is a string that corresponds to a Domain 
            type.

        """

        # Some description of what's going on here is in order. Every time a new domain
        # is added, the Domain constructor calls the function _Domain__update_domain_types
        # which checks if the domain_type of the domain being added is already in the
        # _unique_domain_types list. If yes, fine, if no, it gets added. This means
        # _unique_domain_types keeps track of a count of the complete number of unique
        # domains in the Proteome. An analogous setup holds true for the sites and tracks.
        
        return list(self._unique_domain_types.keys())


    ## ------------------------------------------------------------------------
    ##
    def get_domains_by_type(self, domain_type, perfect_match=True):
        """
        Collect, across all proteins in the Proteome, the domains that match 
        a specific domain type name. The matching itself is delegated to 
        each protein's own get_domains_by_type().

        Parameters
        ------------
        domain_type : string
            Domain type that you want to search for.

        perfect_match : bool (default = True)
            Flag that identifies if the domain names should be a perfect 
            match (=True) or if the string passed should just appear 
            somewhere in the domain_type string

        Returns
        -----------
        list
            A list of Domain objects that match the requested type, 
            gathered protein by protein in the Proteome's iteration order.

        """
        # flatten the per-protein matches into a single list
        return [
            hit
            for protein in self
            for hit in protein.get_domains_by_type(domain_type, perfect_match)
        ]



    ###################################
    ##                               ##
    ##        SITES FUNCTIONS        ##
    ##                               ##
    ###################################

    ## ------------------------------------------------------------------------
    ##                
    @property
    def sites(self):
        """
        Flat list of the Site objects of every protein in the Proteome. 

        This is useful if you wish to indiscriminately ask questions of 
        sites without considering the proteins they come from. However, 
        each Site has a Protein object associated with it (via the 
        .protein operator), so one can always map a Site back to a Protein.

        Returns
        --------------
        list of Sites
             A list of all the Sites from every protein in the Proteome, 
             gathered protein by protein in the Proteome's iteration order

        """

        # flatten the per-protein site collections into a single list
        return [site for protein in self for site in protein.sites]
                    

    ## ------------------------------------------------------------------------
    ##                
    @property
    def unique_site_types(self):
        """
        Returns the list of unique Site types associated with this Proteome.

        Return
        -------

        list of str
            Each element in the list is a string that corresponds to a Site type

        """

        # Some description of what's going on here is in order. Every time a new site
        # is added, the Site constructor calls the function _Site__update_site_types
        # which checks if the domain_type of the domain being added is already in the
        # _unique_site_types list. If yes, fine, if no, it gets added. This means
        # _unique_site_types keeps track of a count of the complete number of unique
        # sites in the Proteome. An analogous setup holds true for domains and tracks.

        return list(self._unique_site_types.keys())
    

    ## ------------------------------------------------------------------------
    ##
    def get_sites_by_type(self, site_types):
        """
        Function that returns a list of sites from all proteins that matched against
        a specific site type name or set of site type names.
        
        Parameters
        ------------

        site_types : string or list of strings
            One or more possible site_types that may be found in the protein. 
            Either a single string or a list of strings can be passed, 
            allowing for one or more sites to be grouped together

            
        Returns
        -----------
        list
            Returns a list of Domain objects that match the requested type. 
            Objects are ordered by starting position in sequence.
                                
        """
        return_list = []
        for p in self:
            return_list.extend(p.get_sites_by_type(site_types, return_list=True))

        return return_list



    ###################################
    ##                               ##
    ##        TRACK FUNCTIONS        ##
    ##                               ##
    ###################################

    ## ------------------------------------------------------------------------
    ##                
    @property
    def unique_track_names(self):
        """
        Returns the list of unique Track names associated with this Proteome.

        Return
        -------

        list of strings
            Each element in the list is a string that corresponds to a 
            Track name found in one (or more) proteins

        """

        # Some description of what's going on here is in order. Every time a new track
        # is added, the Track constructor calls the function _Track__update_track_names
        # which checks if the track_name of the domain being added is already in the
        # _unique_track_types list. If yes, fine, if no, it gets added. This means
        # _unique_track_names keeps track of a count of the complete number of unique
        # domains in the Proteome. An analogous setup holds true for sites and domains
        
        
        return list(self._unique_track_names.keys())

    ## ------------------------------------------------------------------------
    ##                
    @property
    def track_names_to_track_type(self):
        """
        Returns a (copy of a) dictionry that maps track name to track type. 
        We return a copy so there's no way we can accidentally break the 
        internal book-keeping of the Proteome object.

        Return
        -------
        dict
            A dictionary that contains the unique track names and maps 
            each name to either values or type.
        
        """

        return dict(self._track_name_to_track_type)


    ####################################
    ##                                ##
    ##       INTERNAL FUNCTIONS       ##
    ##                                ##
    ####################################

    ## ------------------------------------------------------------------------
    ##                
    def __len__(self):
        """
        The length of the Proteome is defined as the number of proteins 
        in it.
        
        Returns
        -------
        int
            Returns an integer that reflects the number of proteins
        """

        # this function means when we call len(Proteome_Object) we get back
        # the number of proteins in it

        return len(self._records)



    ## ------------------------------------------------------------------------
    ##                
    def __repr__(self):
        """
        Provides a nice representation of the Proteome
        
        Returns
        -------
        string
            Formatted description of the Proteome.

        """

        # this function means when we print a Proteome object or cast it to 
        # a string we get a nice/informative representation, rather than the
        # id of the object

        return "[Proteome]: Sequence dataset with %i protein records" %(len(self))



    ## ------------------------------------------------------------------------
    ##                
    def __iter__(self):
        """
        Allows a Proteome object to act as a generator that yields actual 
        proteins, so the syntax
        
        .. code-block:: python

           for protein in ProteomeObject:
               print(protein.sequence)
        
        is be valid and would iterate through the proteins in the Proteome. 
        
        This makes performing some analysis over all proteins quite easy.

        """

        for i in self._records:
            yield self._records[i]

    ## ------------------------------------------------------------------------
    ##                
    def __contains__(self, m):
        """
        Enables the syntax X in Proteome to be used, where X can be 
        either a unique ID or a Proteome object.

        .. code-block:: python

           if protein.unique_ID in ProteomeObject:
               print(f'The protein {protein} is in the Proteome!')


        """

        if type(m) == str:
            if m in self._records.keys():
                return True
            else:
                return False

        elif type(m) == Protein:
            if m.unique_ID in self._records.keys():
                return True
            else:
                return False

    ## ------------------------------------------------------------------------
    ##                        
    def __getitem__(self, key):
        """
        Allows slicing index into Proteome to retrieve subsets of protein

        .. code-block:: python

           first_protein = ProteomeObject[0]
           print(f'The first protein is {first_protein}')

        """


        if isinstance(key, int) and key >= 0:
            return list(islice([self._records[i] for i in self._records], key, key+1))[0]

        elif isinstance(key, slice):
            return list(islice([self._records[i] for i in self._records], key.start, key.stop, key.step))
        else:
            raise KeyError("Key must be non-negative integer or slice, not {}"
                           .format(key))


    ## ------------------------------------------------------------------------
    ##                
    def _Domain__update_domain_types(self, domain_type):
        """
        INTERNAL FUNCTION (not for public API use)

        Note - we this function is named as __Domain_... so it can be 
        specifically and uniquely be called from a Domain object. This 
        function is ONLY called last thing in the Domain constructor 
        where it allows the Proteome object to keep track of the total 
        number of unique domain types in the Proteome.

        The function is (by default) called by the Domain constructor.
        
        Parameters
        ----------------
        domain_type : string
            String that defines a domain type

        Returns
        ---------------

        No return value, but will appropriately update the Proteome object

        """
        # because _unique_track_names is a dictionary this scales O(1) with number 
        # of track names
        if domain_type not in self._unique_domain_types:
            self._unique_domain_types[domain_type] = 1
        else:
            self._unique_domain_types[domain_type] = self._unique_domain_types[domain_type] + 1

    ## ------------------------------------------------------------------------
    ##                
    def _Protein__decrement_domain_types(self, domain_type):
        """
        INTERNAL FUNCTION (not for public API use)
        
        Note - we this function is named as __Protein_... so it 
        can be specifically and uniquely be called from a Protein 
        object. This function is ONLY called last thing when a 
        Protein object deletes a domain

        Parameters
        ----------------
        track_name : string
            String that defines a Domain type

        Returns
        ---------------

        No return value, but will appropriately update the Proteome 
        object

        """

        # if we can't find the domain name in the unique domain types... this is bad!
        if domain_type not in self._unique_domain_types:
            raise ProteomeException("Tried to remove a Domain type [{domain_type}] from the Proteome.unique_domain_types dictionary but the Domain type could not be found. This is a bug. Please report as a GitHub Issue.")
            
        # we are removing a unique domain_type name! Big event!
        elif self._unique_domain_types[domain_type] == 1:
            del self._unique_domain_types[domain_type]
        # else decrement one
        else:
            self._unique_domain_types[domain_type] = self._unique_domain_types[domain_type] - 1


    ## ------------------------------------------------------------------------
    ##                
    def _Track__update_track_names(self, track_name, track_type):
        """
        INTERNAL FUNCTION (not for public API use)
        
        Note - we this function is named as __Track_... so it can be 
        specifically and uniquely be called from a Track object. This 
        function is ONLY called last thing in the Track constructor 
        where it allows the Proteome object to keep track of the total 
        number of unique Track types in the Proteome.

        The function is (by default) called by the Track constructor

        Parameters
        ----------------
        track_name : string
            String that defines the Track name 

        Returns
        ---------------

        No return value, but will appropriately update the Proteome object

        """

        # because _unique_track_names is a dictionary this scales O(1) with number 
        # of track names
        if track_name not in self._unique_track_names:
            self._unique_track_names[track_name] = 1
            self._track_name_to_track_type[track_name] = track_type

        else:
            self._unique_track_names[track_name] = self._unique_track_names[track_name] + 1
            if self._track_name_to_track_type[track_name] != track_type:
                raise ProteomeException(f"Tried to assigned track name [{track_name}] as a [{track_type}] track, but this track was already assigned as a [{self._track_name_to_track_type[track_name]}] track. Cannot have two tracks with the same name but different types")
            

    ## ------------------------------------------------------------------------
    ##                
    def _Protein__decrement_track_names(self, track_name):
        """
        INTERNAL FUNCTION (not for public API use)
        
        Note - we this function is named as __Protein_... so it can be
        specifically and uniquely be called from a Protein object. 
        This function is ONLY called last thing when a Protein object 
        deletes a Track
                

        Parameters
        ----------------
        track_name : string
            String that defines the Track name 

        Returns
        ---------------

        No return value, but will appropriately update the Proteome 
        object.

        """

        # if we can't find the track name in the unique track names... this is bad!
        if track_name not in self._unique_track_names:
            raise ProteomeException("Tried to remove a Track name [{track_name}] from the Proteome.unique_track_names dictionary but the track could not be found. This is a bug. Please report as a GitHub Issue.")
            
        # we are removing a unique track name! Big event!
        elif self._unique_track_names[track_name] == 1:
            del self._unique_track_names[track_name]
            del self._track_name_to_track_type[track_name]

        # else decrement one
        else:
            self._unique_track_names[track_name] = self._unique_track_names[track_name] - 1


    ## ------------------------------------------------------------------------
    ##                
    def _Site__update_site_types(self, site_type):
        """
        INTERNAL FUNCTION (not for public API use).

        Note - we this function is named as __SITE_... so it can be 
        specifically and uniquely be called from a Site object. This 
        function is ONLY called last thing in the Site constructor where
        it allows the Proteome object to keep track of the total number 
        of unique Site types in the Proteome.

       
        The function is (by default) called by the Site constructor.

        Parameters
        ----------------
        site_type : string
            String that defines a site type

        Returns
        ---------------

        No return value, but will appropriately update the Proteome object.

        """

        # because _unique_track_names is a dictionary this scales O(1) with number 
        # of track names
        if site_type not in self._unique_site_types:
            self._unique_site_types[site_type] = 1

        else:
            self._unique_site_types[site_type] = self._unique_site_types[site_type] + 1


    ## ------------------------------------------------------------------------
    ##                
    def _Protein__decrement_site_types(self, site_type):
        """
        INTERNAL FUNCTION (not for public API use)

        
        Note - we this function is named as __Protein_... so it can be specifically
        and uniquely be called from a Protein object. This function is ONLY called
        last thing when a Protein object deletes a Site

        Parameters
        ----------------
        track_name : string
            String that defines a site type

        Returns
        ---------------

        No return value, but will appropriately update the Proteome object

        """

        # if we can't find the track name in the unique track names... this is bad!
        if site_type not in self._unique_site_types:
            raise ProteomeException("Tried to remove a Site type [{site_type}] from the Proteome.unique_site_types dictionary but the Site type could not be found. This is a bug. Please report as a GitHub Issue.")
            
        # we are removing a unique site_type name! Big event!
        elif self._unique_site_types[site_type] == 1:
            del self._unique_site_types[site_type]
        # else decrement one
        else:
            self._unique_site_types[site_type] = self._unique_site_types[site_type] - 1



    ## ------------------------------------------------------------------------
    ##                
    def __build_diagnosis_string_proteome_construction(self, entry):
        """
        INTERNAL FUNCTION (not for public API use)
        
        Function that builds a string to help in diagnosing what might go 
        wrong during Proteome construction. Called by the Proteome constructor.
        
        
        """

        ds = "Error building proteome when parsing the following entry:\n"


        # check the sequence
        try:
            s = str(entry['sequence'])
        except Exception:
            s = 'FAILED'

        ds = ds + "sequence: %s\n" %(s)


        # check the name
        try:
            s = str(entry['name'])
        except Exception:
            s = 'FAILED'

        ds = ds +"name: %s\n" %(s)


        # check the unique_ID
        try:
            s = str(entry['unique_ID'])
        except Exception:
            s = 'FAILED'

        ds = ds +"unique_ID: %s\n" %(s)


        # 
        try:
            s = str(entry['attributes'])
        except Exception:
            s = 'FAILED'

        ds = ds +"attributes: %s\n" %(s)

        return ds