# Source code for shephard.protein

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""

import numpy as np
from . import exceptions
from . import sequence_utilities
from .domain import Domain 
from .site import Site
from .track import Track
from .exceptions import ProteinException
from .import general_utilities
from .interfaces.si_domains import add_domains_from_dictionary


# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# Class that defines a protein entry
#
class Protein:
    
    def __init__(self, seq, name, proteome, unique_ID, attributes = None):
        """
        Construct a Protein, the parent container for all sequence-based
        annotations. A Protein owns four kinds of annotation:

        * **tracks** - Per-residue vectorial information. A protein can
                       carry many tracks, each mapping one-to-one onto
                       the sequence.

        * **domains** - Information on a single contiguous region of the
                        protein. A protein can have multiple domains.

        * **sites** - Information tied to a single amino acid position.
                      A protein can have multiple sites.

        * **attributes** - Arbitrary protein-wide key/value information.

        Parameters
        ------------

        seq : str
            Amino acid sequence for the protein. No validation is
            performed.

        name : str
            Free-form name for the protein. It is stored and reported in
            messages but no constraints are placed on it.

        proteome : Proteome
            The Proteome object this protein is associated with. Stored
            as-is and exposed through the ``proteome`` property.

        unique_ID : str
            A short unique identifier. When the protein is added to a
            Proteome, that Proteome ensures the unique_ID is unique
            within it.

            We HIGHLY recommend using the UniProt accession number, as
            this satisfies uniqueness and allows cross-referencing with
            other databases.

        attributes : dict (default = None)
            Key-value pairs of arbitrary information (gene names,
            alternative identifiers, copy number, interaction partners,
            etc.). If None, an empty dictionary is used. A passed
            dictionary is stored by reference (not copied).

        Returns
        ---------
        Protein object (constructor)

        Raises
        ----------
        shephard.exceptions.ProteinException
            If ``attributes`` is neither a dictionary nor None.

        """

        # identity information, exposed read-only through properties
        self._unique_ID = unique_ID
        self._name      = name
        self._proteome  = proteome

        # the sequence is stored with a leading '-' pad so that biological
        # (1-based, inclusive) positions can index the string directly
        self._sequence = "-" + seq
        self._len      = len(seq)             # number of residues
        self._true_len = len(self._sequence)  # length of the padded string

        # attributes must be a dict (or None)
        general_utilities.variable_is_dictionary(attributes, ProteinException, 'attributes argument passed to protein %s is not a dictionary' %(self._name), or_none=True)
        self._attributes = {} if attributes is None else attributes

        # containers for the annotations attached to this protein
        self._tracks  = {}
        self._domains = {}
        self._sites   = {}

        # lazily-populated lookup caches: these are only filled when domains
        # or sites are requested by type, so repeated by-type queries within
        # a session need not re-scan every domain/site
        self._sites_by_type   = {}
        self._domains_by_type = {}



    ## ------------------------------------------------------------------------
    ##
    @property
    def unique_ID(self):
        """
        Returns the protein's unique_ID

        Returns
        ---------------
        str
            Returns the protein's unique_ID

        """
        return self._unique_ID



    ## ------------------------------------------------------------------------
    ##
    @property
    def name(self):
        """
        Returns the protein name.

        Returns
        ---------------
        str
            Returns a string that corresponds to the region of interest
        """

        return self._name



    ## ------------------------------------------------------------------------
    ##
    @property
    def proteome(self):
        """
        Returns the Proteome object this protein is associated with.

        Returns
        --------
        Proteome 
            Returns a Proteome object that contains this Protein.

        """
        return self._proteome




    ###################################
    ##                               ##
    ##      SEQUENCE FUNCTIONS       ##
    ##                               ##
    ###################################

    ## ------------------------------------------------------------------------
    ##
    def residue(self, position):
        """
        Return the residue found at a given position (1-based indexing).

        Parameters
        ----------
        position : int
            Position of interest.

        Returns
        ----------
        str
            The single character found at that position in the sequence.

        Raises
        ----------
        shephard.exceptions.ProteinException
            If the position falls outside the sequence.

        """

        # always validate; an out-of-range position raises here
        self._check_position_is_valid(position)

        # the stored string has a leading '-' pad, so a 1-based position
        # indexes it directly. Cast to int in case a non-int was passed.
        index = int(position)
        return self._sequence[index]



    ## ------------------------------------------------------------------------
    ##
    @property
    def sequence(self):
        """
        Returns the protein amino acid sequence as a Python string (str). 
        Recall that in strings indexing occurs from 0 and is non-inclusive. 
        For proteins/biology indexing is from 1 and is inclusive.
                
        i.e. for sequence 'MAPSTA...' real/biological indexing of region 
        1-3 would give you 'MAP' while Python's indexing would give you 'AP'.
        
        As a result BEWARE if using the raw sequence for analysis! The Protein 
        class provides a ``get_sequence_region()``, ``get_sequence_context()`` and 
        analogous functions for tracks that allow you to use normal indexing to 
        select ranges or regions around a specific point. We suggest this is a 
        safer way to extract vectorial information.

        Returns
        --------
        str 
            Amino acid sequence associated with the protein.

        """
        return self._sequence[1:]



    ## ------------------------------------------------------------------------
    ##
    def get_sequence_region(self, start, end):
        """
        Extract the sub-sequence between two positions (1-based,
        inclusive at both ends).

        Parameters
        ---------------
        start : int
            Start position for region

        end : int
            End position for region (inclusive)

        Returns
        ---------------
        str
            The sub-sequence spanning start to end.

        Raises
        ---------------
        shephard.exceptions.ProteinException
            If either start or end falls outside the sequence.

        """

        # check each boundary in turn, with a boundary-specific message
        bad_start = f'Invalid sequence start position [{start}]. Sequence runs between 1 and {self._len}'
        self._check_position_is_valid(start, helper_string=bad_start)

        bad_end = f'Invalid sequence end position [{end}]. Sequence runs between 1 and {self._len}'
        self._check_position_is_valid(end, helper_string=bad_end)

        # the internal string is padded so 1-based positions index directly;
        # the slice stop is end+1 because the end position is inclusive
        stop = end + 1
        return self._sequence[start:stop]



    ## ------------------------------------------------------------------------
    ##
    def get_sequence_context(self, position, offset=5, return_indices=False):
        """
        Extract the local sequence window centered on a position, i.e.
        the position +/- an offset. The window boundaries are computed
        by ``sequence_utilities.get_bounding_sites()`` from the position,
        the offset and the protein length.

        Parameters
        ---------------
        position : int
            Position around which the local sequence is extracted.

        offset : int (default = 5)
            Plus/minus offset defining the window around the position.
            The offset is applied symmetrically.

        return_indices : bool (default = False)
            If True, a TUPLE is returned where element 0 is the window
            sequence, element 1 is the window start position and element
            2 is the window end position (both in normal SHEPHARD
            indexing, i.e. starting from 1).

        Returns
        ---------------
        str, (str, int, int)
            If return_indices is False, the window sequence only.

            If return_indices is True, the window sequence together with
            the inclusive start and end positions of the window.

        Raises
        ---------------
        shephard.exceptions.ProteinException
            If position falls outside the sequence.
        """

        # reject positions that fall outside the protein
        out_of_range = 'Sequence position %i is outside of protein limits (1-%i)'%(position, len(self))
        self._check_position_is_valid(position, helper_string=out_of_range)

        # window boundaries for this position/offset
        (lower, upper) = sequence_utilities.get_bounding_sites(position, offset, self._len)

        # upper is inclusive, hence the +1 on the slice stop
        window = self._sequence[lower:upper + 1]

        if not return_indices:
            return window

        return (window, lower, upper)



    ## ------------------------------------------------------------------------
    ##
    def check_sequence_is_valid(self):
        """
        Check whether every residue in the protein sequence is one of
        the residues listed in ``general_utilities.STANDARD_AAs``.

        Returns
        ---------------
        bool
            True if all residues are standard amino acids, False as soon
            as a non-standard residue is found.

        """

        # skip index 0: it holds the '-' pad used for 1-based indexing
        residues = self._sequence[1:]
        return all(aa in general_utilities.STANDARD_AAs for aa in residues)


    ## ------------------------------------------------------------------------
    ##
    def convert_to_valid(self, copy=False, safe=True):
        """
        Function that converts non-standard amino acid residues to  
        standard ones and applies this version to the Protein's 
        sequence.

        Specifically:

        ``B -> N``

        ``U -> C``

        ``X -> G``

        ``Z -> Q``

        ``* -> <empty string>``

        ``- -> <empty string>``

        By default this alters the underlying sequence. If you wish to 
        return a copy of the altered sequence instead set copy=True. 
        Otherwise the underlying sequence is changed. Note that removing 
        the ``*`` and ``-`` characters will change the sequence length which
        could cause major issues as none of the internal position-specific
        references will automatically update. Note that if safe=True such 
        changes will trigger an exception.

        Parameters
        ---------------
        copy : bool (default = False)
            Boolean flag - if set to true a copy of the updated sequence is 
            returned, if False then the function returns None. In both cases 
            the associated protein's sequence is altered.            
        
        safe : bool (default = True)
            Boolean flag that defines how to respond if an update changes 
            the sequence length. If set to true, a change that alters the 
            sequence length will trigger an exception, if False it will 
            continue unannounced.

        Returns
        ---------------
        None, str
            If copy = False then no return value is provided. If copy = True 
            then the function returns a string.
        """

        if copy is True:
            # create a copy, such that within the protein the underlying sequence is
            # unaltered
            s = self._sequence[:]            
        else:
            # create a view, which means the protein sequence is altered
            s = self._sequence

        old_len=len(s)
        
        # systematically replace common  'non-canonical' one-letter codes
        # with acceptable codes. Code explanations from
        # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp
        s = s.replace('B','N') # B = N/E
        s = s.replace('U','C') # U = selenocysteine
        s = s.replace('X','G') # X = Any
        s = s.replace('Z','Q') # Z = Q/D
        s = s.replace('*','')  # * = stop
        s = s.replace('-','')  # - = gap

        if safe is True:
            if len(s) != old_len:
                raise ProteinException('When altering the sequence to remove invalid characters the sequence length changed. This will invalidate positional attributes, such as sites and domains, as these are not automatically updated!')

        if copy is True:
            return s
        else:
            return None


    ## ------------------------------------------------------------------------
    ##
    def _check_position_is_valid(self, position, helper_string=None):
        """
        Internal guard that raises if a position does not fall inside
        this protein's sequence (positions 1 to protein length, as judged
        by ``sequence_utilities.inside_region``).

        Parameters
        ---------------
        position : int
            The position to test.

        helper_string : str
            Optional custom message used for the exception. If not
            provided (or empty) a default message is used.

        Returns
        -----------
        None
            Returned when the position is inside the sequence.

        Raises
        -----------
        shephard.exceptions.ProteinException
            If the position is outside the sequence.

        """

        # positions live in a 1-indexed space: valid range is 1..self._len
        if sequence_utilities.inside_region(1, self._len, position):
            return None

        # caller-supplied message takes priority over the generic one
        if helper_string:
            raise ProteinException(helper_string)

        raise ProteinException('Position %i falls outside of sequence'%(position))


    
    ###################################
    ##                               ##
    ##     ATTRIBUTE FUNCTIONS       ##
    ##                               ##
    ###################################


    ## ------------------------------------------------------------------------
    ##
    @property
    def attributes(self):
        """
        Provides a list of the keys associated with every 
        attribute associated with this protein.
        

        Returns
        -------
        list
            returns a list of the attribute keys associated with the 
            protein. 


        """
        return list(self._attributes.keys())



    ## ------------------------------------------------------------------------
    ##
    def attribute(self, name, safe=True):

        """
        Function that returns a specific attribute as defined by the name. 

        Recall that attributes are name : value pairs, where the 'value' 
        can be anything and is user defined. This function will return 
        the value associated with a given name.
        
        Parameters
        ----------------
        name : str
             The attribute name. A list of valid names can be found by 
             calling the ``<Protein>.attributes()`` (which returns a 
             list of the valid names).
             

        safe : bool (default = True)
            Flag which if true with throw an exception if an attribute 
            with the same name already exists
                        
        Returns
        ---------
        Unknown 
            Will either return whatever was associated with that attribute 
            (which could be anything) or None if that attribute is missing.
                    
        """

        # if name is in the _attributes dictionary the  return
        if name in self._attributes:
            return self._attributes[name]
        else:

            # else if safe was passed raise an exception if that attribute was missing
            if safe:
                raise ProteinException('Requesting attribute [%s] from protein [%s] but this attribute has not been assigned' % (name, str(self))) 

            # if safe not passed just return None
            else:
                return None
                


    ## ------------------------------------------------------------------------
    ##
    def add_attribute(self, name, val, safe=True):
        """
        Function that adds an attribute. Note that if safe is true, this 
        function will raise an exception if the attribute is already 
        present. If safe=False, then an existing value will be overwritten.

        Parameters
        ----------------

        name : str
            The parameter name that will be used to identify it

        val : <anything>
            An object or primitive we wish to associate with this 
            attribute.

        safe : bool (default = True)
            Flag which if True with throw an exception if an 
            attribute with the same name already exists, otherwise the 
            newly introduced attribute will overwrite the previous 
            one.
            
        Returns
        ---------
            None - but adds an attribute to the calling object

        """

        if safe:
            if name in self._attributes:
                raise ProteinException("Trying to add attribute [%s=%s] to protein [%s] but this attribute is already set.\nPossible options are: %s" %(name,val, str(self), str(self._attributes.keys())))
                
        self._attributes[name] = val


    ## ------------------------------------------------------------------------
    ##
    def remove_attribute(self, name, safe=True):
        """
        Function that removes a given attribute from the Protein based on the 
        passed attribute name. If the passed attribute does not exist or is not 
        associate with the Protein then this will trigger an exception 
        unless safe=False.

        Parameters
        ----------------

        name : str
            The parameter name that will be used to identify it

        safe : bool (default = True)
            Flag which if True with throw an exception if an 
            attribute this name does not exists. If set to
            False then if an attribute is not found it is simply
            ignored
            
        Returns
        ---------
        None
            No return type but will remove an attribute from the 
            protein if present.
            
        """

        if name not in self._attributes:
            if safe:
                raise ProteinException(f'Passed attribute [{name}] not found in {self}')
        else:
            del self._attributes[name]




    ###############################
    ##                           ##
    ##     TRACK FUNCTIONS       ##
    ##                           ##
    ###############################

    ## ------------------------------------------------------------------------
    ##
    @property
    def tracks(self):
        """
        Provides a list of Track objects associated with this
        protein
        
        Returns
        -------
        list
            returns a list of the Tracks (order will be consistent but is not 
            sorted).


        """
        return [self._tracks[k] for k in self._tracks]


    ## ------------------------------------------------------------------------
    ##
    @property
    def track_names(self):
        """
        Provides a list of the keys associated with each track 
        associated with this protein.

        These keys can then be used to extract a specific track, or can be used
        to check if a Track is present.
        
        Returns
        -------
        list
            returns a list of the track keys associated with the protein. 


        """
        return list(self._tracks.keys())


    ## ------------------------------------------------------------------------
    ##
    def track(self, name, safe=True):

        """
        Function that returns a specific Track as defined by the name. 

        Recall that Tracks are defined by a name. If a Track by this name 
        exists this function returns the actual Track object, NOT the 
        **values** or **symbols** associated with the track. If a Track by 
        this name does *not* exist then if safe=True an exception will 
        be raised, otherwise the function returns None.
        
        For direct access to values and symbols, use the 
        ``<Protein>.get_track_values(<track_name>)`` and 
        ``<Protein>.get_track_symbols(<track_name>)``.

        Parameters
        ----------------
        name : str
            The track name. A list of valid names can be found by calling 
            the ``<Protein>.tracks()`` (which returns a list of the valid 
            track names).

        Returns
        ---------
        Unknown 
            Will either return the Track object associated with the name, OR
            will return None if safe=False and there was no Track object that
            matched the name.
        
        """

        if name in self._tracks:
            return self._tracks[name]

        elif safe:
            raise exceptions.ProteinException('No track named [%s] in protein %s\n\nAvailable options are: %s' %(name, self.unique_ID, str(self.track_names)))



    ## ------------------------------------------------------------------------
    ##
    def get_track_values(self, name, start=None, end=None, safe=True):
        """
        Function that returns the values associated with a specific track, as 
        defined by the name.

        Recall that tracks are defined by a name. If a track by this name 
        exists this function returns the values IF these are associated 
        with the track. If no values are associated then the function will 
        throw an exception unless safe is set to False, in which case it 
        will return None.
        
        Parameters
        ----------------
        name : string
            The track name. A list of valid names can be found by calling 
            the <Protein>.tracks (which returns a list of the valid track 
            names).

        start : int (default None)
            If provided defines the start position along the track. If not
            provided defaults to 1 (first residue in the protein).

        end : int (default None)
            If provided defines the end position along the track. If not
            provided defaults to the final residue in the protein.

        safe : bool (default = True)
            Flag which if true with throw an exception if a track that 
            matches the passed name does not already exist.
            
        Returns
        ---------
        Unknown 
            Will either return the values associated with the track, OR
            will return None if safe=False and there was no Track that
            matched the name.
        
        """

        (_start, _end) = self.__build_start_end(start,end)
        
        # call internal function
        return self.__get_track_info(name, safe, _start, _end, 'values')



    ## ------------------------------------------------------------------------
    ##
    def get_track_symbols(self, name, start=None, end=None, safe=True):
        """
        Function that returns the symbols associated with a specific track,
        as defined by the name.
        
        Recall that tracks are defined by a name. If a track by this name 
        exists this function returns the symbols IF these are associated 
        with the track. If no symbols are associated then the function will 
        throw an exception unless safe is set to False, in which case it 
        will return None.

        Parameters
        ----------------
        name : string
            The track name. A list of valid names can be found by calling 
            the <Protein>.tracks (which returns a list of the valid track 
            names).

        start : int (default = None)
            If provided defines the start position along the track. If not
            provided defaults to 1 (first residue in the protein).

        end : int (default = None)
            If provided defines the end position along the track. If not
            provided defaults to the final residue in the protein.

        safe : bool (default = True)
            Flag which if true with throw an exception if a track that 
            matches the passed name does not already exist.
            
        Returns
        ---------
        Unknown 
            Will either return the values associated with the track, OR
            will return None if safe=False and there was no Track that
            matched the name.
        
        """

        # build the start and end position
        (_start, _end) = self.__build_start_end(start,end)

        # call internal function
        return self.__get_track_info(name, safe, _start, _end, 'symbols')



    ## ------------------------------------------------------------------------
    ##
    def __build_start_end(self, start, end):
        """
        Internal function that sanity checks requested start end positions 
        and peforms type conversion
        
        Parameters
        -----------
        start : str or int or float or None:
            Start position, will be type converted to Int if not None

        end : str or int or float or None:
            End position, will be type converted to Int if not None

        Returns
        ----------
        tuple
           Returns a 2-position tuple where the first position is the 
           start po

        """

        # set to default values OR convert to int
        try:
            if start is None:
                _start = 1
            else:
                _start = int(start)

            if end is None:
                _end = len(self)
            else:
                _end = int(end)

        except ValueError:
            raise exceptions.ProteinException('When selecting sub-region for track values could not convert one of the start/end to an int: start=%s, end=%s'% (start,end))
            

        return (_start, _end)

            

    ## ------------------------------------------------------------------------
    ##
    def __get_track_info(self, name, safe, start, end, mode):
        """
        Internal function that follows the exact same logic as the 
        public-facing get_track_symbols or get_track_values.
        

        Note that the start and end values passed here have already
        been validated.

        Parameters
        -----------
        name : str
            Track name

        safe : bool 
            Flag which if true with throw an exception if a track that 
            matches the passed name does not already exist.
                    
        start : int 
            Start position 

        end : int 
            End position

        Returns
        ----------
        None or list
            Returns either a list of values or symbols, or None if no
            Track with the passed name is present and safe is False.

        """
        
        t = self.track(name, safe)

        if t is None:
            # note - technically as the code is written now we don't need this, (the safety is dealt in get_track())
            # but I'm including it for best practice to avoid implicit dependencies in the code
            if safe:
                raise exceptions.ProteinException('No track named [%s] in protein %s\n\nAvailable options are: %s' %(name, str(self), self.track_names))
            else:
                return None

        # try and get values
        if mode == 'values':
            v = t.values_region(start, end)
        elif mode == 'symbols':
            v = t.symbols_region(start, end)

        # if v is a value or safe is False just return v (will either be values
        # or None)
        if v is not None or not safe:
            return v

        # we only get here if v is None and safe is True
        else:
            raise exceptions.ProteinException('Requested track values for track [%s] in protein [%s] but no values available' %(t.name, self))
        


    ## ------------------------------------------------------------------------
    ##
    def add_track(self, name, values=None, symbols=None, safe=True):
        """
        Attach a new Track to this protein. See the Track documentation
        for full details; briefly:

        * A **values track** should be a list/array of numerical values

        * A **symbols track** should be a list or string of symbolic
          characters

        In either case the iterable should map 1:1 onto the sequence.
        A Track can hold both values and symbols, although in general it
        probably makes sense to use separate tracks.

        Parameters
        ---------------
        name : string
            Name for the track. This is the track's unique identifier
            within the protein.

        values : list or np.array (default None)
            A numerical iterable, where each value maps to a specific
            residue in the sequence.

        symbols : list or string (default None)
            A collection of symbolic characters, where each symbol maps
            to a specific residue in the sequence.

        safe : bool (default = True)
            If True, adding a track whose name is already in use raises
            an exception. If False, the existing track is replaced.

        Returns
        ----------
        None
            Nothing, but adds a track to the calling object.

        Raises
        ----------
        shephard.exceptions.ProteinException
            If safe is True and a track with this name already exists.

        """

        # refuse to replace an existing track unless safe mode is disabled
        name_in_use = name in self.track_names
        if name_in_use and safe is True:
            raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name))

        # the Track is built with a reference back to this protein
        new_track = Track(name, self, values, symbols)
        self._tracks[name] = new_track


    ## ------------------------------------------------------------------------
    ##
    def build_track_values_from_sequence(self, name, trackfunction, input_dictionary=None, safe=True):
        """
        Build a values track on the fly by running a user-supplied function
        over this protein's sequence, and attach the result to the protein.

        This is the numerical counterpart of
        build_track_symbols_from_sequence(): whatever trackfunction returns
        is handed to the new Track as its ``values`` (the ``symbols`` of the
        new track are set to None).

        The trackfunction is called in one of two ways:

        (1) ``trackfunction(sequence)`` if input_dictionary is None
        (2) ``trackfunction(sequence, input_dictionary)`` otherwise

        so the first positional argument it receives is always the amino
        acid sequence of this protein, and the optional second argument is
        the input_dictionary, passed through untouched.

        Parameters
        ------------

        name : string
            Name of the track to create. Track names are unique within a
            protein; see the ``safe`` keyword for what happens if the name
            is already in use.

        trackfunction : function
            User-defined function with the following properties:

            (1) The first argument is the amino acid sequence
            (2) The second argument (only used if input_dictionary is
                provided) is the input_dictionary

            It should return a list or np.array of numerical values that
            is usable as the values of a track.

        input_dictionary : dict (default = None)
            Optional dictionary that is passed as the second argument to
            trackfunction. This lets the caller hand an arbitrarily complex
            set of parameters to the trackfunction.

        safe : bool (default = True)
            If True, trying to reuse the name of an existing track raises
            an exception. Otherwise the existing track is overwritten.

        Returns
        ----------
        None
            Nothing, but adds a track to the calling object.

        Raises
        ----------
        ProteinException
            If a track called ``name`` already exists and safe is True.

        Example
        ----------

        Below is an example of a custom track-building function::

            # The function takes the sequence as its first argument and
            # (optionally) a dictionary as its second argument. The names
            # of the arguments do not matter, but the order does.

            def trackbuilder(seq, input_dictionary):

                positive = input_dictionary['v1']
                negative = input_dictionary['v2']

                newseq = []
                for residue in seq:
                    if residue in positive:
                        newseq.append(1)
                    elif residue in negative:
                        newseq.append(-1)
                    else:
                        newseq.append(0)

                return newseq

            input_dictionary = {'v1':['K','R'], 'v2':['E','D']}

            # assuming ProtOb is a Protein object, this adds a new track
            ProtOb.build_track_values_from_sequence('charge_vector',
                                                    trackbuilder,
                                                    input_dictionary=input_dictionary)

        Here positively charged residues map to +1, negatively charged
        residues to -1 and everything else to 0, and the result is stored
        as the 'charge_vector' track.

        This is analogous to running::

            newtrack = trackbuilder(ProtOb.sequence, input_dictionary)
            ProtOb.add_track('charge_vector', values=newtrack)

        **Some FAQs:**

        * Do I need to pass an input_dictionary to the custom function? No!
        * Does the name of the custom function matter? No!
        * Does the custom function have to accept the amino acid sequence as the first argument? Yes!

        """

        # refuse to clobber an existing track while safe mode is on
        if name in self.track_names and safe is True:
            raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name))

        # the dictionary is only forwarded when the caller actually supplied one
        extra_arguments = () if input_dictionary is None else (input_dictionary,)
        computed_values = trackfunction(self.sequence, *extra_arguments)

        # register the freshly built values-only track under the requested name
        self._tracks[name] = Track(name, self, values=computed_values, symbols=None)



    ## ------------------------------------------------------------------------
    ##
    def build_track_symbols_from_sequence(self, name, trackfunction, input_dictionary = None, safe = True):
        """
        Build a symbols track on the fly by running a user-supplied function
        over this protein's sequence, and attach the result to the protein.

        This is the symbolic counterpart of
        build_track_values_from_sequence(): whatever trackfunction returns
        is handed to the new Track as its ``symbols`` (the ``values`` of the
        new track are set to None).

        The trackfunction is called in one of two ways:

        (1) ``trackfunction(sequence)`` if input_dictionary is None
        (2) ``trackfunction(sequence, input_dictionary)`` otherwise

        so the first positional argument it receives is always the amino
        acid sequence of this protein, and the optional second argument is
        the input_dictionary, passed through untouched.

        Parameters
        ------------

        name : string
            Name of the track to create. Track names are unique within a
            protein; see the ``safe`` keyword for what happens if the name
            is already in use.

        trackfunction : function
            User-defined function with the following properties:

            (1) The first argument is the amino acid sequence
            (2) The second argument (only used if input_dictionary is
                provided) is the input_dictionary

            It should return a list or string of symbols that is usable
            as the symbols of a track.

        input_dictionary : dict (default = None)
            Optional dictionary that is passed as the second argument to
            trackfunction. This lets the caller hand an arbitrarily complex
            set of parameters to the trackfunction.

        safe : bool (default = True)
            If True, trying to reuse the name of an existing track raises
            an exception. Otherwise the existing track is overwritten.

        Returns
        ----------
        None
            Nothing, but adds a track to the calling object.

        Raises
        ----------
        ProteinException
            If a track called ``name`` already exists and safe is True.

        Example
        --------

        Below is an example of a custom track-building function::

            # The function takes the sequence as its first argument and
            # (optionally) a dictionary as its second argument. The names
            # of the arguments do not matter, but the order does.

            def trackbuilder(seq, input_dictionary):

                positive = input_dictionary['v1']
                negative = input_dictionary['v2']

                new_string_list = []
                for residue in seq:
                    if residue in positive:
                        new_string_list.append('+')
                    elif residue in negative:
                        new_string_list.append('-')
                    else:
                        new_string_list.append('0')

                # convert the list into a string
                return "".join(new_string_list)

            input_dictionary = {'v1':['K','R'], 'v2':['E','D']}

            # assuming ProtOb is a Protein object, this adds a new track
            ProtOb.build_track_symbols_from_sequence('charge_string',
                                                     trackbuilder,
                                                     input_dictionary=input_dictionary)

        Here the sequence is converted into a coarse-grained string in
        which positive residues are "+", negative residues are "-" and
        all other residues are "0".

        This is analogous to running::

            newtrack = trackbuilder(ProtOb.sequence, input_dictionary)
            ProtOb.add_track('charge_string', symbols=newtrack)

        **FAQs:**

        * Do I need to pass an input_dictionary to the custom function? No!
        * Does the name of the custom function matter? No!
        * Does the custom function have to accept the amino acid sequence as the first argument? Yes!

        """

        # refuse to clobber an existing track while safe mode is on
        if name in self.track_names and safe is True:
            raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name))

        # the dictionary is only forwarded when the caller actually supplied one
        extra_arguments = () if input_dictionary is None else (input_dictionary,)
        computed_symbols = trackfunction(self.sequence, *extra_arguments)

        # register the freshly built symbols-only track under the requested name
        self._tracks[name] = Track(name, self, values=None, symbols=computed_symbols)



    ## ------------------------------------------------------------------------
    ##
    def build_track(self, name, input_data, track_definition_function, safe=True):
        """
        Construct a track from arbitrary user data.

        The passed track_definition_function is called with input_data as
        its only argument and must return a dictionary-like object that
        can be indexed with the keys 'values' and 'symbols'. The entries
        found under those two keys are used as the values and symbols of
        a single new track, which is stored on the protein under ``name``.

        Parameters
        ------------

        name : string
            Name of the track to create. Track names are unique within a
            protein; see the ``safe`` keyword for what happens if the name
            is already in use.

        input_data : anything
            Data that is passed, untouched, to track_definition_function.

        track_definition_function : function
            Function that takes input_data and returns a dictionary with
            (at least) a 'values' and a 'symbols' key. A missing key leads
            to a KeyError when the result is read.

        safe : bool (default = True)
            If True, trying to reuse the name of an existing track raises
            an exception. Otherwise the existing track is overwritten.

        Returns
        ------------
        None
            No return type, but a new track is added to the Protein.

        Raises
        ------------
        ProteinException
            If a track called ``name`` already exists and safe is True.

        """

        # refuse to clobber an existing track while safe mode is on
        name_in_use = name in self.track_names
        if name_in_use and safe is True:
            raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name))

        # let the user function turn the raw input into track data
        definition = track_definition_function(input_data)

        # both entries are mandatory; indexing (rather than .get) keeps the KeyError on a missing key
        self._tracks[name] = Track(name,
                                   self,
                                   values=definition['values'],
                                   symbols=definition['symbols'])



    ## ------------------------------------------------------------------------
    ##        
    def remove_track(self, track_object, safe=True):
        """
        Function that removes a given Track from the Protein based on the
        passed Track object.

        The behaviour for the different kinds of input is:

        * If track_object is not a Track (e.g. None) the call is silently
          ignored when safe=False, and raises an exception otherwise.

        * If track_object is a Track that belongs to a different protein
          (its protein has a different unique_ID) an exception is always
          raised, regardless of safe.

        * If track_object belongs to this protein but no track with that
          name is currently stored, an exception is raised if safe is
          truthy and the call is silently ignored otherwise.

        Parameters
        ------------
        track_object : shephard.track.Track Object or None
            Track Object that should be removed from this protein.
            Note that remove_track() can tolerate None as the object if
            safe=False, which makes it possible to use a single for-loop
            to iterate over a proteome and remove all tracks of a specific
            type without worrying as to if the track is present or not.

        safe : bool (default = True)
            Flag that if set to True means a missing track (or an object
            that is not a Track) raises a ProteinException. If False such
            input is ignored.

        Returns
        -----------
        None
            No return type but will remove the track from the protein.

        Raises
        -----------
        ProteinException
            If track_object is not a Track and safe is not False, if the
            Track belongs to another protein, or if the Track is not
            stored in this protein and safe is truthy.

        """

        # Non-Track input (typically None) is tolerated when safe=False, which keeps
        # "remove this track if it exists" loops over a proteome syntactically simple.
        # isinstance (rather than an exact type match) also accepts Track subclasses.
        if not isinstance(track_object, Track):
            if safe is False:
                return
            raise ProteinException('track_object was not a Track, but Safe=True')

        # failsafe to ensure we can only delete tracks that truly come from this protein
        if track_object.protein.unique_ID != self.unique_ID:
            raise ProteinException(f'Passed Track [{track_object}] not found in this protein [{self}]')

        # if the passed track object name was found in this protein
        if track_object.name in self._tracks:

            # NOTE: because this call is written inside the Protein class, Python
            # name-mangles the attribute to _Protein__decrement_track_names.
            # Presumably the Proteome exposes its bookkeeping hook under that
            # mangled name - confirm before renaming either side.
            self.proteome.__decrement_track_names(track_object.name)
            del self._tracks[track_object.name]

        elif safe:
            raise ProteinException(f'Passed Track [{track_object}] not found in {self}')



    ###############################
    ##                           ##
    ##     DOMAIN FUNCTIONS      ##
    ##                           ##
    ###############################

    ## ------------------------------------------------------------------------
    ##
    @property
    def domains(self):
        """
        Returns a list of the Domain objects associated with this protein,
        sorted by first reside of the domain.
        """
    
        domain_list = [self._domains[k] for k in self._domains]
        domain_list.sort(key=lambda x: x.start, reverse=False)

        return domain_list


    ## ------------------------------------------------------------------------
    ##
    @property
    def domain_names(self):
        """
        Returns a list of the domain names associated with this protein
        """
        
        domain_list = self.domains
                
        return [d.domain_name for d in domain_list]


    ## ------------------------------------------------------------------------
    ##
    @property
    def domain_types(self):
        """
        Returns a list of the unique domain types associated with this protein. 
        There will be no duplicates here.
        
        """

        # define an empty set
        domain_types = set([])

        # cycle through the domains and add the domain type to the set
        for domain in self._domains:
            domain_types.add(self._domains[domain].domain_type)

        # convert the set to a list and return
        return list(domain_types)



    ## ------------------------------------------------------------------------
    ##
    def domain(self, name, safe=True):

        """
        Function that returns a specific domain as defined by the name. 
        Note it is often more useful to request a domain by type rather 
        than by the name, in which case get_domains_by_type(<domain_type>) 
        is the relevant syntax. Note domains can also be requested based 
        on position (get_domains_by_position).

        Parameters
        ----------------
        name : string
            The Domain name. A list of valid names can be found by calling 
            the <Protein>.domains (which returns a list of the valid track 
            names).
             
        safe : bool (default = True)
            Flag which if true with throw an exception if no domain exists 
            with this name. If false function will return None instead.

        Returns
        ---------
        Unknown 
            Will either return the Domain object associated with the name, 
            OR will return None if safe=False and there was no Domain object 
            that matched the name.
        """

        if name in self._domains:
            return self._domains[name]
        elif safe:
            raise exceptions.ProteinException('No domains named [%s] in protein %s\n\nAvailable domains are: %s' % (name, self.unique_ID, str(self.domain_names)))
      
      
    ## ------------------------------------------------------------------------
    ##
    def add_domains(self, list_of_domains, safe=True, autoname=False, verbose=False):
        """
        Function that takes a list of domain dictionaries and adds those
        domains to the protein.

        The work is delegated to
        interfaces.si_domains.add_domains_from_dictionary(), which is
        called with this protein's proteome and a dictionary that maps
        this protein's unique_ID to list_of_domains.

        Each domain dictionary within the list must have key-value pairs
        that define the following info:

        * **start** - domain start position (in real sequence, not i0 indexing)
        * **end** - domain end position (in real sequence, not i0 indexing)
        * **domain_type** - type of the domain (string)
        * **attributes** - a dictionary of attributes to associate with the domain (optional)

        Note that start, end, and domain_type are the only required
        key-value pairs in the dictionary.

        If you wish to add many domains to many proteins, see
        interfaces.si_domains.add_domains_from_dictionary()

        Parameters
        -------------

        list_of_domains : list
            A list of domain dictionaries. A "domain dictionary" is defined
            above, but in short is a dictionary with the following
            key-value pairs:

            * REQUIRED:
               * start - int (domain start position)
               * end - int (domain end position)
               * domain_type - string (domain type)

            * OPTIONAL:
               * attributes - dictionary of arbitrary key-value pairs that will be associated with the domain

        safe : bool (default = True)
            If set to True over-writing domains will raise an exception.
            If False, overwriting a domain will silently over-write.

        autoname : bool (default = False)
            If autoname is set to True, each domain ALWAYS gets a unique
            name - i.e. this allows multiple domains to be perfectly
            overlapping in position and type. In general we want to avoid
            this as it makes it easy to include duplicates, which by
            default are prevented when autoname=False.

        verbose : bool (default = False)
            Flag that defines how 'loud' output is. Will warn about errors
            on adding domains.

        Returns
        -------
        None
            No return value, but will add the passed domains to the
            protein or throw an exception if something goes wrong!

        """

        # the proteome-level helper expects a {unique_ID: [domain dictionaries]} mapping,
        # so wrap this protein's domains in a single-entry dictionary
        domains_by_protein = {self.unique_ID: list_of_domains}

        add_domains_from_dictionary(self.proteome,
                                    domains_by_protein,
                                    safe=safe,
                                    autoname=autoname,
                                    verbose=verbose)


        
    ## ------------------------------------------------------------------------
    ##
    def add_domain(self, start, end, domain_type, attributes=None, safe=True, autoname=False):
        """
        Function that adds a single domain to the protein, automatically
        generating the domain name from the domain type and position.

        The name has the form ``<domain_type>_<start>_<end>``. If that name
        is already in use, what happens depends on the keywords:

        * autoname=True : an integer suffix (1, 2, 3, ...) is appended,
          giving ``<domain_type>_<start>_<end>_<n>``, using the first n
          that yields an unused name.

        * autoname=False and safe=True : an exception is raised.

        * autoname=False and safe=False : the existing domain with that
          name is overwritten.

        Parameters
        -----------

        start : int
            Position of the start of the domain. Cast to int.

        end : int
            Position of the end of the domain. Cast to int and passed on
            unchanged to the Domain object. NOTE(review): earlier
            documentation described end as non-inclusive - confirm
            against the Domain class.

        domain_type : str
            Non-unique string that allows a type identifier to be
            associated with a domain. Cast to str.

        attributes : dict (default = None)
            Optional dictionary which allows an arbitrary set of
            attributes to be associated with a domain, in much the same
            way that they can be associated with a protein.

        safe : bool (default = True)
            If set to True over-writing a domain will raise an exception,
            otherwise the existing domain is simply over-written. Only
            consulted when autoname is False.

        autoname : bool (default = False)
            If autoname is set to True, this function ensures each domain
            ALWAYS has a unique name - i.e. this allows multiple domains
            to be perfectly overlapping in position and type. In general
            we want to avoid this as it makes it easy to include
            duplicates, which by default are prevented when
            autoname=False.

        Returns
        -----------
        None
            No return value, but a new Domain is added to the protein.

        Raises
        -----------
        ProteinException
            If the generated name is already in use, autoname is False
            and safe is truthy.
        """

        # cast input data
        start = int(start)
        end = int(end)
        domain_type = str(domain_type)

        # default name: domain type followed by start and end position
        full_name = "%s_%i_%i" % (domain_type, start, end)

        # nothing below adds or removes domains before the final assignment, so the
        # names can be collected once and reused for every membership test
        taken_names = self.domain_names

        if full_name in taken_names:

            if autoname:

                # walk the suffixes 1, 2, 3, ... until an unused name turns up
                suffix = 1
                candidate = "%s_%i_%i_%i" % (domain_type, start, end, suffix)
                while candidate in taken_names:
                    suffix += 1
                    candidate = "%s_%i_%i_%i" % (domain_type, start, end, suffix)

                full_name = candidate

            elif safe:
                raise exceptions.ProteinException('Domain [%s] already found in proteins %s' % (full_name, self.name))

        self._domains[full_name] = Domain(start, end, self, domain_type, full_name, attributes=attributes)



    ## ------------------------------------------------------------------------
    ##
    def build_domain(self, input_data, domain_definition_function, safe=True, autoname=False):
        """
        Function that is somewhat analogous to build_tracks, but allows the
        user to define a custom function (domain_definition_function) that 
        takes input_data and returns domain information, and then assigns        
        that domain information to a new domain


        Parameters
        ----------
        input_data : anything
            Any input that makes sense when passed to the 
            domain_definition_function().

        domain_definition_function : function that takes a single argument 
            (input_data) and returns a list of 0 or more dictionaries. Each 
            dictionary within the list has a key-value pair that 
            
            defines the following info:
                    
                start : domain start position
                end   : domain end position
                domain_type : type of the domain
                attributes : a dictionary of attributes to associated with 
                             the domain (optional)
            
            Note that in principle only start and end are required, although
            we highly recommend one/both of domain name and domain type.
            
            Some important requirements to consider:

            (1) domain_definition_function must return a list of zero or more 
                dictionaries.
            
        safe : bool (default = True)
            Flag which if true with throw an exception of a domain with the 
            same name already exists.

        autoname : bool
            If autoname is set to True, this function ensures each domain 
            ALWAYS has a unique name - i.e. the allows for multiple domains 
            to be perfectly overlapping in position and type. This is generally 
            not going to be required and/or make sense, but having this feature
            in place is useful. In general we want to avoid this as it makes it 
            easy to include duplicates which by default are prevented when 
            autoname=False.
        """

        # build our domain definitions. Should probably add code that ensyres
        # domain_definitions is a list
        domain_definitions = domain_definition_function(input_data)

        self.add_domains(domain_definitions)

    ## ------------------------------------------------------------------------
    ##        
    def remove_domain(self, domain_object, safe=True):
        """
        Function that removes a given domain from the protein based on the
        passed domain object. The domain is looked up by its domain_name.

        The behaviour for the different kinds of input is:

        * If domain_object is not a Domain (e.g. None) the call is
          silently ignored when safe=False, and raises an exception
          otherwise.

        * If no domain with the name of domain_object is stored in this
          protein, an exception is raised if safe is truthy and the call
          is silently ignored otherwise.

        Parameters
        ------------
        domain_object : Domain Object
            Domain Object that will be removed from the protein

        safe : bool (default = True)
            Flag that if set to True means an object that is not a Domain,
            or a Domain that is not found in this protein, raises a
            ProteinException. If False such input is silently skipped.

        Returns
        -----------
        None
            No return type but will remove a domain from the protein if
            present.

        Raises
        -----------
        ProteinException
            If domain_object is not a Domain and safe is not False, or if
            the Domain is not stored in this protein and safe is truthy.

        """

        # isinstance (rather than an exact type match) also accepts Domain subclasses
        if not isinstance(domain_object, Domain):
            if safe is False:
                return
            raise ProteinException(f'{domain_object} was not a Domain, but safe=True.')

        # if a domain with this name is stored in the protein
        if domain_object.domain_name in self._domains:

            # update proteome.
            # NOTE: because this call is written inside the Protein class, Python
            # name-mangles the attribute to _Protein__decrement_domain_types.
            # Presumably the Proteome exposes its bookkeeping hook under that
            # mangled name - confirm before renaming either side.
            self.proteome.__decrement_domain_types(domain_object.domain_type)

            # remove object
            del self._domains[domain_object.domain_name]

        elif safe:
            raise ProteinException(f'Passed Domain [{domain_object}] not found in {self}')
            


    ## ------------------------------------------------------------------------
    ##
    def get_domains_by_position(self, position, wiggle = 0):
        """
        Functions that allows all domains found at a position to be returned.

        Wiggle defines +/- residues that are allowed (default = 0) in the search
        operation.

        Parameters
        ----------------
        
        position : int
            Residue position of interest (position in sequence).

        wiggle : int (default = 0)
            Value +/- the position (i.e. lets you look at sites around a 
            specific position).

        Returns
        --------------
        list
            Returns a list of Domain objects in the order they appear
            in the protein.

        """

        return self.get_domains_by_range(position, position, wiggle=wiggle, mode='overlap')



    ## ------------------------------------------------------------------------
    ##
    def get_domains_by_position_and_type(self, position, domain_type, wiggle = 0):
        """
        Return every domain that covers a given residue position AND whose
        domain_type is exactly equal to the requested type.

        The positional search is delegated to get_domains_by_range() (with
        start == end == position and mode='overlap'); the resulting domains
        are then filtered on their ``domain_type`` attribute.

        Parameters
        ----------------

        position : int
            Residue position of interest (position in sequence).

        domain_type : str
            Domain type to keep. Compared against each domain's
            ``domain_type`` using equality.

        wiggle : int (default = 0)
            Number of residues either side of ``position`` that are also
            included in the search.

        Returns
        --------------
        list
            List of matching Domain objects, in the same relative order as
            returned by get_domains_by_range().

        """

        # positional filter first...
        candidates = self.get_domains_by_range(position, position, wiggle=wiggle, mode='overlap')

        # ...then keep only the domains of the requested type
        return [candidate for candidate in candidates if candidate.domain_type == domain_type]



    ## ------------------------------------------------------------------------
    ##
    def get_domains_by_range(self, start, end, wiggle = 0, mode='overlap-strict'):

        """
        Function that allows all domains in a protein that are found within
        a given range to be returned. Three possible modes can be used here;
        'internal', 'overlap-strict' and 'overlap' (default =
        'overlap-strict').

        'internal' means that the range defined by start and end is 100%
        within the domains identified. For example, if a domain was
        between positions 50 and 100 then a range of 60 to 80 would
        identify that domain but a range of (say) 40 to 120 would not.
        This is the least permissive mode.

        'overlap-strict' means that the range defined by start and end
        overlaps with the entire domain, but extra residues at the start
        and the end of the domain are not penalized. For example, if a
        domain was between positions 50 and 100 then a range of 40 to 120
        would be identified because the domain fully overlaps. However a
        range of 40 to 70 would not. This is the second least permissive
        mode, and all domains identified by 'internal' are also identified
        by 'overlap-strict'.

        'overlap' means that the range can also straddle domain boundaries.
        For example, if a domain was between positions 50 and 100 and the
        range was between 40 and 70 this would count - essentially this
        means any domain that shares at least one residue with the passed
        range is included. Positions are inclusive, so a range of 40 to 50
        and a range of 100 to 120 both overlap a domain spanning 50 to 100.
        This is the most permissive mode, and all domains identified by
        'internal' and 'overlap-strict' are also identified by 'overlap'.

        Parameters
        ---------------
        start : int
            Start of region of interest (position in sequence)

        end : int
            End of region of interest (position in sequence)

        wiggle : int (default = 0)
            Value +/- at the edges that are included. After the wiggle is
            applied the range is clamped to lie between 1 and the protein
            length.

        mode : str (default = 'overlap-strict')
            Selector that allows the mode to be used for domain overlap
            to be defined. Must be one of 'internal', 'overlap-strict',
            or 'overlap'. Definitions and meaning described above.

        Returns
        --------------
        list
            Returns a list of Domain objects, in the order in which they
            are provided by self.domains.

        Raises
        --------------
        ProteinException
            If wiggle is less than 0. The mode keyword and the start/end
            positions are additionally validated by helper functions.

        """

        # check the mode keyword is valid
        general_utilities.valid_keyword('mode', mode, ['internal','overlap-strict','overlap'])

        # check the start and end values are valid
        self._check_position_is_valid(start,  helper_string='Sequence region cannot start below 1 [%i]'%(start))
        self._check_position_is_valid(end, 'Sequence region cannot end after the sequence length (%i) [%i]'%(self._len, end))

        # check the wiggle passed is valid
        if wiggle < 0:
            raise ProteinException('Passed a wiggle value less than 0')

        # apply the wiggle, but never step outside of the protein
        p1 = max(1, start - wiggle)
        p2 = min(end + wiggle, self._len)

        return_list = []

        # cycle through each domain in the protein and build a list of those domains defined by the range
        for domain in self.domains:

            # range sits entirely inside the domain - valid in every mode
            valid = p1 >= domain.start and p2 <= domain.end

            # domain sits entirely inside the range - valid for both overlap modes
            if not valid and mode in ('overlap-strict', 'overlap'):
                valid = p1 <= domain.start and p2 >= domain.end

            # any shared residue counts in 'overlap' mode. Both comparisons are
            # inclusive because positions are inclusive, so a range that ends
            # exactly on the first residue of the domain (or starts exactly on
            # its last residue) is treated as overlapping.
            if not valid and mode == 'overlap':
                valid = p1 <= domain.end and p2 >= domain.start

            if valid:
                return_list.append(domain)

        return return_list


    ## ------------------------------------------------------------------------
    ##
    def get_domains_by_type(self, domain_type, perfect_match=True):
        """
        Return the domains whose domain_type matches a query string.

        Parameters
        ------------
        domain_type : string
            Domain type to search for.

        perfect_match : bool (default = True)
            If True a domain is selected only when its domain_type is equal
            to the passed string. Otherwise a domain is selected when the
            passed string appears anywhere inside its domain_type.

        Returns
        -----------
        list
            List of matching Domain objects, sorted by their start position
            (ascending).

        """

        all_domains = self.domains

        if perfect_match:
            # exact comparison against the requested type
            matching = [d for d in all_domains if d.domain_type == domain_type]
        else:
            # substring search - str.find() returns -1 when nothing is found
            matching = [d for d in all_domains if d.domain_type.find(domain_type) >= 0]

        # order N-to-C terminal by where each domain begins
        return sorted(matching, key=lambda d: d.start)
            
        

    ###############################
    ##                           ##
    ##     SITE FUNCTIONS        ##
    ##                           ##
    ###############################


    ## ------------------------------------------------------------------------
    ##
    @property
    def sites(self):
        """
        Provides a list of the sites associated with 
        every site on the protein. Sorted N to C terminal.
        """

        # this means we will always
        all_sites = []
        for k in self._sites:
            for s in self._sites[k]:
                all_sites.append(s)

        return all_sites


    ## ------------------------------------------------------------------------
    ##
    @property
    def site_positions(self):
        """
         Provides a list of the sorted positions where 
        a site is found the protein. Sorted N to C terminal.
        """

        site_keys = list(self._sites.keys())
        site_keys.sort()
        return site_keys


    ## ------------------------------------------------------------------------
    ##
    @property
    def site_types(self):
        """
        Returns a list of the unique site types associated 
        with this protein. There will be no duplicates here.        
        """

        # define an empty set
        site_types = set([])

        # cycle through the domains and add the domain type to the set
        for position in self._sites:
            site_types.update(set(site.site_type for site in self._sites[position]))

        # convert the set to a list and return
        return list(site_types)


    ## ------------------------------------------------------------------------
    ##
    def site(self, position, safe=True):
        """
        Returns the list of sites that are found at a given position. 
        Note that - in general site() should be used to retrieve sites 
        you know exist while `get_sites_by_position()` offers a way to more 
        safely get sites at a position. Site will throw an exception 
        if the position passed does not exist (while `get_sites_by_position()` 
        will not).

        Parameters
        -------------
        position : int
            Defines the position in the sequence we want to interrogate 

        Returns
        ---------
        list
            Returns a list with between 0 and n sites. Will raise an exception
            if the passed position cannot be found in the codebase unless safe=False,
            in which case an empty list is returned.

        """

        if int(position) in self._sites:
            return self._sites[int(position)]


        if safe:
            raise exceptions.ProteinException('No sites at position %i in protein %s\n\nAvailable sites are: %s' % (position, self.unique_ID, str(self.site_positions)))
        else:
            return []
            


    ## ------------------------------------------------------------------------
    ##
    def add_site(self, position, site_type, symbol=None, value=None, attributes=None):
        """
        Adds a new Site to the protein at the requested position.

        Sites are indexed by residue position and any number of sites can
        share the same position, so (unlike Proteins, Tracks or Domains) no
        unique name is required. ``site_type`` is a non-unique identifier
        that allows sites to be identified/selected later.

        A site can carry a symbol, a numerical value, both or neither, and
        can also carry a dictionary of attributes.

        If you wish to add many sites to many proteins, see:

            interfaces.si_sites.add_sites_from_dictionary()

        Parameters
        -----------

        position : int
            Position of the site. Positions index from 1 (i.e. the first
            residue in a protein is 1, not 0). The position must fall
            between 1 and the protein length; once that check has passed
            the value is cast to an int.

        site_type : string
            Non-unique string that allows a type identifier to be
            associated with the site.

        symbol : string (default = None)
            Symbol passed through to the new Site.

        value : float64 (default = None)
            Numerical value passed through to the new Site.

        attributes : dict (default = None)
            Optional dictionary of attributes passed through to the new
            Site.

        Raises
        -----------
        ProteinException
            If the position falls outside of the protein.

        """

        # the site must sit on a residue that actually exists in the protein
        within_protein = sequence_utilities.inside_region(1, self._len, position)
        if not within_protein:
            raise ProteinException("Trying to add site to protein [%s] at positions [%i] - this falls outside the protein's dimensions [%i-%i]" %(str(self), position, 1, self._len))

        residue_index = int(position)

        # setdefault() creates the per-position list on first use; the list is
        # fetched (or created) before the Site itself is constructed
        self._sites.setdefault(residue_index, []).append(
            Site(residue_index, site_type, self, symbol, value, attributes))


    ## ------------------------------------------------------------------------
    ##        
    def remove_site(self, site_object, safe=True):
        """
        Function that removes a given site from the protein based on the
        passed Site object. If the passed object is not a Site, or the site
        is not associated with this protein, an exception is raised unless
        safe=False, in which case the call silently does nothing.

        Parameters
        ------------
        site_object : Site
            The Site object to be removed. Note that remove_site() can
            tolerate None (or any other non-Site object) as the
            site_object if safe=False. This enables a single for-loop to
            iterate over a proteome and remove all sites of a specific type
            without worrying as to whether the site is present or not.

        safe : bool (default = True)
            If True, a ProteinException is raised when the passed object is
            not a Site, when the protein has no sites at the site's
            position, or when the site is not among the sites stored at
            that position. If False these situations are ignored.

        Returns
        -----------
        None
            No return value, but the site is removed from the protein.

        Raises
        -----------
        ProteinException
            See the description of ``safe``.

        """

        # isinstance (rather than an exact type comparison) so that objects
        # derived from Site are also accepted
        if not isinstance(site_object, Site):
            if safe is False:
                return
            raise ProteinException('site_object was not a Site, but safe=True.')

        # excise the site position
        site_position = site_object.position

        if site_position not in self._sites:
            if safe is False:
                return
            raise ProteinException(f'Site object is at position {site_position} but no sites were found in protein {self.unique_ID} at this position')

        sites_at_position = self._sites[site_position]

        # if the passed object is found at the excised position
        if site_object in sites_at_position:

            # let the parent proteome update its site-type bookkeeping before
            # the site itself is dropped
            self.proteome.__decrement_site_types(site_object.site_type)

            # remove object
            sites_at_position.remove(site_object)

            # remove position entry if no other sites at that location
            if len(sites_at_position) == 0:
                del self._sites[site_position]

        elif safe:
            raise ProteinException(f'Passed Site [{site_object}] not found in {self}')


    ## ------------------------------------------------------------------------
    ##
    def get_sites_by_position(self, position, wiggle = 0, return_list=False):
        """
        Get all sites at (or around) a specific position.

        This is a convenience wrapper around get_sites_by_range() in which
        the start and end of the range are both set to ``position``.

        Parameters
        ---------------
        position : int
            Residue position of interest (position in sequence)

        wiggle : int (default = 0)
            Number of residues either side of ``position`` that are also
            included in the search.

        return_list : bool (default = False)
            Passed straight through to get_sites_by_range(); selects
            between a dictionary and a flat list as the return type.

        Returns
        -----------
        dict
            By default, a dictionary where each key is a position and each
            value is the list of sites at that position.

        list
            If return_list is set to True, a list of Site objects is
            returned instead.

        """

        # a single position is simply a range whose start and end coincide
        return self.get_sites_by_range(start=position,
                                       end=position,
                                       wiggle=wiggle,
                                       return_list=return_list)


    ## ------------------------------------------------------------------------
    ##
    def get_sites_by_range(self, start, end, wiggle = 0, return_list=False):
        """
        Get all sites that fall within an (inclusive) range of positions.

        Parameters
        ---------------
        start : int
            Start of region of interest (position in sequence)

        end : int
            End of region of interest (position in sequence)

        wiggle : int (default = 0)
            Number of residues by which the range is extended at each
            edge. The extended range is clamped so that it never goes
            below 1 or above the protein length.

        return_list : bool (default = False)
            By default a dictionary is returned, which makes it easy to
            index into the sites at a specific position. If set to True a
            flat list of sites is returned instead.

        Returns
        -----------

        dict
            Dictionary where each key is a position and each value is the
            list of sites stored at that position. Only positions that
            carry sites are included.

        list
            If return_list is set to True, a list of Site objects is
            returned instead.

        Raises
        -----------
        ProteinException
            If wiggle is less than 0. The start and end positions are
            additionally validated by a helper function.

        """

        self._check_position_is_valid(start,  helper_string='Sequence region cannot start below 1 [%i]'%(start))
        self._check_position_is_valid(end, 'Sequence region cannot end after the sequence length (%i) [%i]'%(self._len, end))

        # check the wiggle passed is valid
        if wiggle < 0:
            raise ProteinException('Passed a wiggle value less than 0')

        # real-world (1-indexed) boundaries after applying the wiggle
        first = max(1, start - wiggle)
        last = min(end + wiggle, self._len)

        # ranges are inclusive when talking about proteins, hence last + 1
        sites_in_range = {
            pos: self._sites[pos]
            for pos in range(first, last + 1)
            if pos in self._sites
        }

        if return_list is True:
            # flatten the per-position lists into a single list
            return [s for sites_here in sites_in_range.values() for s in sites_here]

        return sites_in_range


    ## ------------------------------------------------------------------------
    ##
    def get_sites_by_type(self, site_types, return_list=False):
        """
        Get the sites on the protein that match one or more site types.

        Parameters
        ------------------

        site_types : string or list of strings
            One or more site types to select. Either a single string or a
            list of strings can be passed, allowing sites of several types
            to be grouped together.

        return_list : bool (default = False)
            By default a dictionary is returned, which makes it easy to
            index into the sites at a specific position. If set to True a
            flat list of sites is returned instead.

        Returns
        ----------
        dict
            Dictionary where each key is a position and each value is the
            list of sites at that position that match the site type(s) of
            interest.

        list
            If return_list is set to True, a list of Site objects is
            returned instead.

        """

        # the internal selector already knows how to produce either a
        # dictionary or a flattened list, so simply filter every site on the
        # protein and let it build the requested return type
        return self.__site_by_type_internal(self._sites, site_types, return_list=return_list)
        
        

    ## ------------------------------------------------------------------------
    ##
    def get_sites_by_type_and_range(self, site_types, start, end, wiggle=0, return_list=False):
        """
        Returns the sites that both match a type of interest and are found
        within the range provided.

        Parameters
        ------------------

        site_types : string or list of strings
            One or more site types to select. Either a single string or a
            list of strings can be passed, allowing sites of several types
            to be grouped together.

        start : int
            Start residue that defines the start of the region examined

        end : int
            End residue that defines the end of the region examined

        wiggle : int (default = 0)
            Value that adds slack symmetrically around the start and end
            positions.

        return_list : bool (default = False)
            By default a dictionary is returned, which makes it easy to
            index into the sites at a specific position. If set to True a
            flat list of sites is returned instead.

        Returns
        ----------
        dict
            Dictionary where each key is a position and each value is the
            list of sites at that position that match the site type(s) of
            interest.

        list
            If return_list is set to True, a list of Site objects is
            returned instead.

        """

        # step 1 - positional filter (always as a position-keyed dictionary)
        sites_in_range = self.get_sites_by_range(start, end, wiggle)

        # step 2 - type filter, which also builds the requested return type
        selected = self.__site_by_type_internal(sites_in_range, site_types, return_list=return_list)
        return selected



    ## ------------------------------------------------------------------------
    ##
    def __site_by_type_internal(self, indict, site_types, return_list=False):
        """
        Internal function that allows a subset of sites to be selected
        based on the passed site_type(s).

        Parameters
        ------------------
        indict : dict
            Dictionary to be filtered, where each key is a position and
            each value is a list of Site objects at that position (i.e. the
            same structure as self._sites).

        site_types : string or list of strings
            One or more site types to select. Either a single string or a
            list of strings can be passed, allowing sites of several types
            to be grouped together. Each matching site is returned once,
            even if the same site type is listed more than once.

        return_list : bool (default = False)
            By default a dictionary is returned, which makes it easy to
            index into the sites at a specific position. If set to True a
            flat list of sites is returned instead.

        Returns
        -----------
        dict
            Dictionary where each key is a position and each value is the
            list of sites at that position that match the site type(s) of
            interest. This is exactly the same structure as the passed
            dictionary, just filtered for specific site types. Positions
            with no matching sites are omitted.

        list
            If return_list is set to True, a list of Site objects is
            returned instead.

        """

        # function that allows site_types to be either a string or a list
        # of strings so one or more site types can be passed
        site_types = general_utilities.string_to_list_of_strings(site_types)

        return_dict = {}

        # for each position in the passed dictionary
        for position, sites_at_position in indict.items():

            # membership test (rather than looping over the requested types)
            # so a site is selected at most once even when the requested
            # types contain duplicates
            matching = [s for s in sites_at_position if s.site_type in site_types]

            # only positions with at least one matching site get an entry
            if matching:
                return_dict[position] = matching

        if return_list is True:
            # the list comprehension here flattens the returned list
            return [s for sublist in return_dict.values() for s in sublist]

        return return_dict
        

    ## ------------------------------------------------------------------------
    ##
    def __repr__(self):
        """
        Short one-line summary of the protein: its unique ID, its length
        (L) and the number of tracks (#t), domains (#d), sites (#s) and
        attributes (#a) associated with it.
        """

        identifier = self.unique_ID
        length = self._len
        n_tracks = len(self.tracks)
        n_domains = len(self.domains)
        n_sites = len(self.sites)
        n_attributes = len(self.attributes)

        return "| Protein: %s - L=%i, #t=%i, #d=%i, #s=%i, #a=%i |" %(identifier, length, n_tracks, n_domains, n_sites, n_attributes)
        


    ## ------------------------------------------------------------------------
    ##
    def __len__(self):             
        return self._len