Source code for shephard.protein

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""

import numpy as np
from . import exceptions
from . import sequence_utilities
from .domain import Domain 
from .site import Site
from .track import Track
from .exceptions import ProteinException
from .import general_utilities
from .interfaces.si_domains import add_domains_from_dictionary


# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# Class that defines a protein entry
#
[docs]class Protein: def __init__(self, seq, name, proteome, unique_ID, attributes = None): """ Protein objects are the parent object to all sequence-based information. Protein objects are explicitly associated with several different types of objects: * **tracks** - Vectorial information that maps to each residue and contains some set of information. A protein can have multiple tracks, but there must be a one-to-one mapping for sequence to track. * **domains** - Information on a single contiguous region in the protein. A protein can have multiple domains. * **sites** - Information associated with a single amino acid site. A protein can have multiple sites. * **attributes** - Protein-specific information associated. Parameters ------------ seq : str Amino acid sequence for the protein. No validation is performed. name : str Some sort of name-based identifier for the protein. Can be anything - is not used internally so no restraints, but could be used by other bits of analysis. unique_ID : str The unique_ID should be a short unique identifier. When added to a Proteome the Proteome object ensures the unique_ID is unique with respect to that Proteome. We HIGHLY recommend using the uniprot accession number, as this meets the requirement of a unique ID as well as allowing effective cross-refering from other databases. attributes : dict (default = None) The attributes provides a key-value pairing for arbitrary information. This could include gene names, different types of identifies, protein copy number, a set of protein partners, or anything else one might wish to associated with the protein as a whole. Returns --------- Proteine object (constructor) Raises ---------- shephard.exceptions.ProteinException """ # define internal attributes that are then accessed via @properties self._name = name self._sequence = "-" + seq # the '-' at the start fixes our indexing woes self._unique_ID = unique_ID self._proteome = proteome self._len = len(seq) # protein length self._true_len = len(self._sequence) # length of string the protein is in general_utilities.variable_is_dictionary(attributes, ProteinException, 'attributes argument passed to protein %s is not a dictionary' %(self._name), or_none=True) if attributes is None: self._attributes = {} else: self._attributes = attributes # initialize the empty dictionaries for the set of sites, domains and tracks self._sites = {} self._domains = {} self._tracks = {} # the domains by type and sites by type dictionaries are only built IF we request # domains or sites by type. This provides a mode of conditional memoization, so at # least within a single session we do not have to search through domains and sites # multiple times to find a specific type self._domains_by_type = {} self._sites_by_type = {} ## ------------------------------------------------------------------------ ## @property def unique_ID(self): """ Returns the protein's unique_ID Returns --------------- str Returns the protein's unique_ID """ return self._unique_ID ## ------------------------------------------------------------------------ ## @property def name(self): """ Returns the protein name. Returns --------------- str Returns a string that corresponds to the region of interest """ return self._name ## ------------------------------------------------------------------------ ## @property def proteome(self): """ Returns the Proteome object this protein is associated with. Returns -------- Proteome Returns a Proteome object that contains this Protein. """ return self._proteome ################################### ## ## ## SEQUENCE FUNCTIONS ## ## ## ################################### ## ------------------------------------------------------------------------ ## def residue(self, position): """ Function that returns the natural residue found at a given position. Parameters ---------- position : int Position of interest. Returns ---------- str Returns a single character that corresponds to the string of interest. """ # only check if safe is true self._check_position_is_valid(position) # cast to integer incase... return self._sequence[int(position)] ## ------------------------------------------------------------------------ ## @property def sequence(self): """ Returns the protein amino acid sequence as a Python string (str). Recall that in strings indexing occurs from 0 and is non-inclusive. For proteins/biology indexing is from 1 and is inclusive. i.e. for sequence 'MAPSTA...' real/biological indexing of region 1-3 would give you 'MAP' while Python's indexing would give you 'AP'. As a result BEWARE if using the raw sequence for analysis! The Protein class provides a ``get_sequence_region()``, ``get_sequence_context()`` and analogous functions for tracks that allow you to use normal indexing to select ranges or regions around a specific point. We suggest this is a safer way to extract vectorial information. Returns -------- str Amino acid sequence associated with the protein. """ return self._sequence[1:] ## ------------------------------------------------------------------------ ## def get_sequence_region(self, start, end): """ Function that allows a region of the sequence to be extracted out. Parameters --------------- start : int Start position for region end : int End position for region (note this is inclusive) Returns --------------- str Returns a string that corresponds to the region of interest """ # validate passed range self._check_position_is_valid(start, helper_string=f'Invalid sequence start position [{start}]. Sequence runs between 1 and {self._len}') self._check_position_is_valid(end, f'Invalid sequence end position [{end}]. Sequence runs between 1 and {self._len}') # note +1 because we're inclusive with positions return self._sequence[start:end + 1] ## ------------------------------------------------------------------------ ## def get_sequence_context(self, position, offset=5, return_indices=False): """ Function that allows a local region of the sequence centered on a specific position to be extracted, including +/- an offset border that intelligently truncates if the offset would extend outside the sequence region. Parameters --------------- position : int Position for which we'll interrogate the local sequence offset : int (default = 5) Plus/Minus offset used to investigate the region around the position. Note that an offset is symmetrical around the position. return_indices : bool (default = False) Flag which, if set to true, means this function returns a TUPLE where position 0 is the string corresponding to the region of interest, position 2 is the start index (in normal SHEPHARD indexing, i.e. starting from 1) and position 3 is the end index (in normal SHEPHARD indexing). Returns --------------- str, (str, int, int) If return_indices is set to False, this just returns a string that corresponds to the region of interest. If return_indices is set to True, this just returns a string that corresponds to the region of interest, as well as the start and end positions that are inclusive in the sequence indexing from 1. """ # sanity check position input self._check_position_is_valid(position, helper_string='Sequence position %i is outside of protein limits (1-%i)'%(position, len(self))) # compute start/end of context according to the offset (p1, p2) = sequence_utilities.get_bounding_sites(position, offset, self._len) # note +1 because we're inclusive with positioning here (and index from 1) if return_indices: return (self._sequence[p1:p2 + 1], p1, p2) else: return self._sequence[p1:p2 + 1] ## ------------------------------------------------------------------------ ## def check_sequence_is_valid(self): """ Function that checks if the current protein sequence is valid (i.e. consists of only the standard 20 amino acids). Returns --------------- bool Returns True if all residues are in the standard 20 amino acids, and False if not. """ # recal we start at +1 to discard the leading '-' used to ensure we can use # real-world indexing for i in self._sequence[1:]: if i not in general_utilities.STANDARD_AAs: return False return True ## ------------------------------------------------------------------------ ## def convert_to_valid(self, copy=False, safe=True): """ Function that converts non-standard amino acid residues to standard ones and applies this version to the Protein's sequence. Specifically: ``B -> N`` ``U -> C`` ``X -> G`` ``Z -> Q`` ``* -> <empty string>`` ``- -> <empty string>`` By default this alters the underlying sequence. If you wish to return a copy of the altered sequence instead set copy=True. Otherwise the underlying sequence is changed. Note that removing the ``*`` and ``-`` characters will change the sequence length which could cause major issues as none of the internal position-specific references will automatically update. Note that if safe=True such changes will trigger an exception. Parameters --------------- copy : bool (default = False) Boolean flag - if set to true a copy of the updated sequence is returned, if False then the function returns None. In both cases the associated protein's sequence is altered. safe : bool (default = True) Boolean flag that defines how to respond if an update changes the sequence length. If set to true, a change that alters the sequence length will trigger an exception, if False it will continue unannounced. Returns --------------- None, str If copy = False then no return value is provided. If copy = True then the function returns a string. """ if copy is True: # create a copy, such that within the protein the underlying sequence is # unaltered s = self._sequence[:] else: # create a view, which means the protein sequence is altered s = self._sequence old_len=len(s) # systematically replace common 'non-canonical' one-letter codes # with acceptable codes. Code explanations from # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp s = s.replace('B','N') # B = N/E s = s.replace('U','C') # U = selenocysteine s = s.replace('X','G') # X = Any s = s.replace('Z','Q') # Z = Q/D s = s.replace('*','') # * = stop s = s.replace('-','') # - = gap if safe is True: if len(s) != old_len: raise ProteinException('When altering the sequence to remove invalid characters the sequence length changed. This will invalidate positional attributes, such as sites and domains, as these are not automatically updated!') if copy is True: return s else: return None ## ------------------------------------------------------------------------ ## def _check_position_is_valid(self, position, helper_string=None): """ Internal function that tests that a passed position is valid in the given protein. The helper-string allows the exception raised to be customized. Parameters --------------- position : int A position in question. helper_string : str A customizable string that can be passed should an error be raised. Returns ----------- None If the position falls within the protein sequence None is returned, otherwise an exception is raised. """ # recal we're operating in a 1-indexed space if sequence_utilities.inside_region(1, self._len, position): return None else: if helper_string: raise ProteinException(helper_string) else: raise ProteinException('Position %i falls outside of sequence'%(position)) ################################### ## ## ## ATTRIBUTE FUNCTIONS ## ## ## ################################### ## ------------------------------------------------------------------------ ## @property def attributes(self): """ Provides a list of the keys associated with every attribute associated with this protein. Returns ------- list returns a list of the attribute keys associated with the protein. """ return list(self._attributes.keys()) ## ------------------------------------------------------------------------ ## def attribute(self, name, safe=True): """ Function that returns a specific attribute as defined by the name. Recall that attributes are name : value pairs, where the 'value' can be anything and is user defined. This function will return the value associated with a given name. Parameters ---------------- name : str The attribute name. A list of valid names can be found by calling the ``<Protein>.attributes()`` (which returns a list of the valid names). safe : bool (default = True) Flag which if true with throw an exception if an attribute with the same name already exists Returns --------- Unknown Will either return whatever was associated with that attribute (which could be anything) or None if that attribute is missing. """ # if name is in the _attributes dictionary the return if name in self._attributes: return self._attributes[name] else: # else if safe was passed raise an exception if that attribute was missing if safe: raise ProteinException('Requesting attribute [%s] from protein [%s] but this attribute has not been assigned' % (name, str(self))) # if safe not passed just return None else: return None ## ------------------------------------------------------------------------ ## def add_attribute(self, name, val, safe=True): """ Function that adds an attribute. Note that if safe is true, this function will raise an exception if the attribute is already present. If safe=False, then an existing value will be overwritten. Parameters ---------------- name : str The parameter name that will be used to identify it val : <anything> An object or primitive we wish to associate with this attribute. safe : bool (default = True) Flag which if True with throw an exception if an attribute with the same name already exists, otherwise the newly introduced attribute will overwrite the previous one. Returns --------- None - but adds an attribute to the calling object """ if safe: if name in self._attributes: raise ProteinException("Trying to add attribute [%s=%s] to protein [%s] but this attribute is already set.\nPossible options are: %s" %(name,val, str(self), str(self._attributes.keys()))) self._attributes[name] = val ## ------------------------------------------------------------------------ ## def remove_attribute(self, name, safe=True): """ Function that removes a given attribute from the Protein based on the passed attribute name. If the passed attribute does not exist or is not associate with the Protein then this will trigger an exception unless safe=False. Parameters ---------------- name : str The parameter name that will be used to identify it safe : bool (default = True) Flag which if True with throw an exception if an attribute this name does not exists. If set to False then if an attribute is not found it is simply ignored Returns --------- None No return type but will remove an attribute from the protein if present. """ if name not in self._attributes: if safe: raise ProteinException(f'Passed attribute [{name}] not found in {self}') else: del self._attributes[name] ############################### ## ## ## TRACK FUNCTIONS ## ## ## ############################### ## ------------------------------------------------------------------------ ## @property def tracks(self): """ Provides a list of Track objects associated with this protein Returns ------- list returns a list of the Tracks (order will be consistent but is not sorted). """ return [self._tracks[k] for k in self._tracks] ## ------------------------------------------------------------------------ ## @property def track_names(self): """ Provides a list of the keys associated with each track associated with this protein. These keys can then be used to extract a specific track, or can be used to check if a Track is present. Returns ------- list returns a list of the track keys associated with the protein. """ return list(self._tracks.keys()) ## ------------------------------------------------------------------------ ## def track(self, name, safe=True): """ Function that returns a specific Track as defined by the name. Recall that Tracks are defined by a name. If a Track by this name exists this function returns the actual Track object, NOT the **values** or **symbols** associated with the track. If a Track by this name does *not* exist then if safe=True an exception will be raised, otherwise the function returns None. For direct access to values and symbols, use the ``<Protein>.get_track_values(<track_name>)`` and ``<Protein>.get_track_symbols(<track_name>)``. Parameters ---------------- name : str The track name. A list of valid names can be found by calling the ``<Protein>.tracks()`` (which returns a list of the valid track names). Returns --------- Unknown Will either return the Track object associated with the name, OR will return None if safe=False and there was no Track object that matched the name. """ if name in self._tracks: return self._tracks[name] elif safe: raise exceptions.ProteinException('No track named [%s] in protein %s\n\nAvailable options are: %s' %(name, self.unique_ID, str(self.track_names))) ## ------------------------------------------------------------------------ ## def get_track_values(self, name, start=None, end=None, safe=True): """ Function that returns the values associated with a specific track, as defined by the name. Recall that tracks are defined by a name. If a track by this name exists this function returns the values IF these are associated with the track. If no values are associated then the function will throw an exception unless safe is set to False, in which case it will return None. Parameters ---------------- name : string The track name. A list of valid names can be found by calling the <Protein>.tracks (which returns a list of the valid track names). start : int (default None) If provided defines the start position along the track. If not provided defaults to 1 (first residue in the protein). end : int (default None) If provided defines the end position along the track. If not provided defaults to the final residue in the protein. safe : bool (default = True) Flag which if true with throw an exception if a track that matches the passed name does not already exist. Returns --------- Unknown Will either return the values associated with the track, OR will return None if safe=False and there was no Track that matched the name. """ (_start, _end) = self.__build_start_end(start,end) # call internal function return self.__get_track_info(name, safe, _start, _end, 'values') ## ------------------------------------------------------------------------ ## def get_track_symbols(self, name, start=None, end=None, safe=True): """ Function that returns the symbols associated with a specific track, as defined by the name. Recall that tracks are defined by a name. If a track by this name exists this function returns the symbols IF these are associated with the track. If no symbols are associated then the function will throw an exception unless safe is set to False, in which case it will return None. Parameters ---------------- name : string The track name. A list of valid names can be found by calling the <Protein>.tracks (which returns a list of the valid track names). start : int (default = None) If provided defines the start position along the track. If not provided defaults to 1 (first residue in the protein). end : int (default = None) If provided defines the end position along the track. If not provided defaults to the final residue in the protein. safe : bool (default = True) Flag which if true with throw an exception if a track that matches the passed name does not already exist. Returns --------- Unknown Will either return the values associated with the track, OR will return None if safe=False and there was no Track that matched the name. """ # build the start and end position (_start, _end) = self.__build_start_end(start,end) # call internal function return self.__get_track_info(name, safe, _start, _end, 'symbols') ## ------------------------------------------------------------------------ ## def __build_start_end(self, start, end): """ Internal function that sanity checks requested start end positions and peforms type conversion Parameters ----------- start : str or int or float or None: Start position, will be type converted to Int if not None end : str or int or float or None: End position, will be type converted to Int if not None Returns ---------- tuple Returns a 2-position tuple where the first position is the start po """ # set to default values OR convert to int try: if start is None: _start = 1 else: _start = int(start) if end is None: _end = len(self) else: _end = int(end) except ValueError: raise exceptions.ProteinException('When selecting sub-region for track values could not convert one of the start/end to an int: start=%s, end=%s'% (start,end)) return (_start, _end) ## ------------------------------------------------------------------------ ## def __get_track_info(self, name, safe, start, end, mode): """ Internal function that follows the exact same logic as the public-facing get_track_symbols or get_track_values. Note that the start and end values passed here have already been validated. Parameters ----------- name : str Track name safe : bool Flag which if true with throw an exception if a track that matches the passed name does not already exist. start : int Start position end : int End position Returns ---------- None or list Returns either a list of values or symbols, or None if no Track with the passed name is present and safe is False. """ t = self.track(name, safe) if t is None: # note - technically as the code is written now we don't need this, (the safety is dealt in get_track()) # but I'm including it for best practice to avoid implicit dependencies in the code if safe: raise exceptions.ProteinException('No track named [%s] in protein %s\n\nAvailable options are: %s' %(name, str(self), self.track_names)) else: return None # try and get values if mode == 'values': v = t.values_region(start, end) elif mode == 'symbols': v = t.symbols_region(start, end) # if v is a value or safe is False just return v (will either be values # or None) if v is not None or not safe: return v # we only get here if v is None and safe is True else: raise exceptions.ProteinException('Requested track values for track [%s] in protein [%s] but no values available' %(t.name, self)) ## ------------------------------------------------------------------------ ## def add_track(self, name, values=None, symbols=None, safe=True): """ Function that adds a track to this protein. For more information on Tracks see the relevant documentation. However, some general guidelines are provided below for convenience. * A **values track** should be a list/array of numerical values * A **symbols track** should be a list or string of symbolic characters In either case, the iterable should have a 1:1 mapping with the sequence Finally, Tracks can have both a value and a symbol, although in general it probably makes sense to use multiple tracks. Parameters --------------- name : string Name for track. NOTE that this is a unique identifier, and each track within a given protein should must have a unique name. values : list or np.array (default None) A numerical iterable collection of values, where each value maps to a specific residue in the sequence. symbols : list or string (default None) A symbolic collection of characters, where each symbol maps to a specific residue in the sequence. safe : bool (default = True) If set to True over-writing tracks will raise an exception, otherwise overwriting a track will simply over-write it. Returns ---------- None Nothing, but adds a track to the calling object. """ if name in self.track_names: if safe is True: raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name)) self._tracks[name] = Track(name, self, values, symbols) ## ------------------------------------------------------------------------ ## def build_track_values_from_sequence(self, name, trackfunction, input_dictionary=None, safe=True): """ Tracks can be added as pre-loaded values. However, sometimes you want to build a track based on some analysis of the sequence on the fly. This function allows you to pass in your own function (with keyword arguments in the keywords dictionary) that will take in the protein sequence, generate a new track, and add that track to the protein. build_track_values allows you to define a function that converts amino acid sequence into a numerical list or np.array, which gets written as a values track. If you want a symbols track, use build_track_symbols(). Specifically, the argument trackfunction must be a user-defined function. This function can be defined anywhere, but should take either one or two arguments: (1) The first/only argument should be an amino acid sequence. (2) The second argument a dictionary of key-value pairs. When build_track_values_from_sequence is called, the sequence of the protein is passed as the first argument into the trackfunction, and - if present - the input_dictionary is passed as the second argument. In this way a new track is defined internally, with the track function using the proteins sequence and any/all pass input_dictionary to convert the sequence into some numerical representation. Parameters ------------ name : string Name of the track to be used. Should be unique and will always overwrite an existing track with the same name (no safe keyword provided here). trackfunction : function A user define function that has the following properties: (1) First argument is expected to be amino acid sequence (2) Second argument (if provided) should be a dictionary which is passed (untouched) THROUGH build_track_values from sequence to the trackfunction at runtime function_keywords : dictionary This is a dictionary that will be passed to the trackfunction as the second argument IF it is provided. In this way, the user can pass an arbitrarily complex set of arguments to the track function each time the build_track_values_from_sequence is called. safe : bool (default = True) If set to True over-writing tracks will raise an exception, otherwise overwriting a track will simply over-write it. Example ---------- Below we offer an example for how one might defined a custom track-building function:: # define a function that takes in a sequence and converts it # into some other numerical list. Note this is INLINE with the # code, or could be elsewhere. This function MUST take either # ONE argument (sequence) or TWO arguments (sequence and # input_dictionary). Also the names of these arguments does # not matter, but the order does (i.e. first argument will # always get the sequence). def trackbuilder(seq, input_dictionary): ''' This function takes in a sequence (seq) as first argument, and the v1 and v2 as additional arguments. See below for what it's doing (pretty simple). ''' newseq=[] # we are extracting out the 'values' from the input dictionary # for the sake of code clarity v1 = input_dictionary['v1'] v2 = input_dictionary['v2'] # for each residue in the sequence for i in seq: # is that residue in v1 (append 1) or v2 (append -1)? If # neither append 0 if i in v1: newseq.append(1) elif i in v2: newseq.append(-1) else: newseq.append(0) return newseq # define the input_dictionary (note again that the variable names # here do not matter) input_dictionary = {'v1':['K','R'], 'v2':['E','D']} # now assuming ProtOb is a Protein object, this will add a new # track ProtOb.build_track_values('charge_vector', trackbuilder, function_dictionary=input_dictionary) In this example we defined a function that converts an amino acid string into a numerical list where positively charged residues = +1 and negatively charged residues = -1. We applied this function to generate a 'charge_vector' track. Note this is analagous to defining our function and then running:: s = ProtOb.sequence newtrack = trackbuilder(s, ['K','R'], ['E',D']) ProbOb.add_track('charge_vector', values=newtrack) **Some FAQs:** * Do I need to pass an input_dictionary to the custom function? No! * Does the name of the custom function matter? No! * Does the custom function have to accepted the amino acid sequence as the first argument? Yes! """ # if this will overwrite an existing track and safe is on... if name in self.track_names: if safe is True: raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name)) # build the new track with the trackfunction, correctly handling between 0 and n additional # arguments to be passed to the trackfunction if input_dictionary is None: built_track = trackfunction(self.sequence) else: built_track = trackfunction(self.sequence, input_dictionary) # finally add the track self._tracks[name] = Track(name, self, values=built_track, symbols=None) ## ------------------------------------------------------------------------ ## def build_track_symbols_from_sequence(self, name, trackfunction, input_dictionary = None, safe = True): """ Tracks can be added as pre-loaded values. However, sometimes you want to build a track based on some analysis of the sequence on the fly. This function allows you to pass in your own function (with keyword arguments) that will take in the protein sequence, generate a new track, and add that track to the Protein. build_track_symbols allows you to define a function that converts amino acid sequence into a symbolic list or string, which gets written as a symbols track. If you want a values track, use build_track_values(). Specifically, the argument trackfunction must be a user-defined function. This function can be defined anywhere, but should take either one or two arguments: (1) The first/only argument should be an amino acid sequence. (2) The second argument a dictionary of key-value pairs. When build_track_symbols_from_sequence is called, the sequence of the protein is passed as the first argument into the trackfunction, and - if present - the input_dictionary is passed as the second argument. In this way a new track is defined internally, with the track function using the proteins sequence and any/all pass input_dictionary to convert the sequence into some other symbolic representation. Parameters ------------ name : string Name of the track to be used. Should be unique and will always overwrite an existing track with the same name (no safe keyword provided here). trackfunction : funct A user define function that has the following properties: (1) First argument is expected to be amino acid sequence (2) Second argument (if provided) should be a dictionary which is passed (untouched) THROUGH build_track_values from sequence to the trackfunction at runtime function_keywords : dict (default None) This is a dictionary that will be passed to the trackfunction as the second argument IF it is provided. In this way, the user can pass an arbitrarily complex set of arguments to the trackfunction each time the build_track_symbols_from_sequence is called. safe : bool (default = True) If set to True over-writing tracks will raise an exception, otherwise overwriting a track will simply over-write it. Example -------- Below we offer an example for how one might defined a custom track-building function:: # define a function that takes in a sequence and converts it into some # other symbolic representation as a string. Note this is INLINE with # the code, or could be elsewhere. This function MUST take either ONE # argument (sequence) or TWO arguments (sequence and input_dictionary). # # Also the names of these arguments does not matter, but the order does # (i.e. first argument will always get the sequence). def trackbuilder(seq, input_dictionary): ''' This function takes in a sequence (seq) as first argument, and the v1 and v2 as additional arguments. See below for what it's doing (pretty simple). ''' new_string_list=[] # we are extracting out the 'values' from the input dictionary # for the sake of code clarity v1 = input_dictionary['v1'] v2 = input_dictionary['v2'] # for each residue in the sequence for i in seq: # is that residue in v1 (append 1) or v2 (append -1)? If neither # append 0 if i in v1: new_string_list.append('+') elif i in v2: new_string_list.append('-') else: new_string_list.append('0') # convert the list into a string newstring = "".join(new_string_list) return newstring # define the input_dictionary (note again that the variable names # here do not matter) input_dictionary = {'v1':['K','R'], 'v2':['E','D']} # now assuming ProtOb is a Protein object, this will add a new track ProtOb.build_track_values('charge_string', trackbuilder, function_dictionary=input_dictionary) In this example we defined a function that converts an amino acid string into a coarse-grained string representation where positive residues are "+", negative are "-" and neutral are "0". Note this is analagous to defining our function and then running:: s = ProtOb.sequence newtrack = trackbuilder(s, ['K','R'], ['E',D']) ProbOb.add_track('charge_vector', values=newtrack) **FAQs:** * Do I need to pass an input_dictionary to the custom function? No * Does the name of the custom function matter? No! * Does the custom function have to accepted the amino acid sequence as the first argument? Yes! """ # if this will overwrite an existing track and safe is on... if name in self.track_names: if safe is True: raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name)) # build the new track with the trackfunction, correctly handling between 0 and n additional # arguments to be passed to the trackfunction if input_dictionary is None: built_track = trackfunction(self.sequence) else: built_track = trackfunction(self.sequence, input_dictionary) # finally add the track self._tracks[name] = Track(name, self, values=None, symbols=built_track) ## ------------------------------------------------------------------------ ## def build_track(self, name, input_data, track_definition_function, safe=True): """ Function that constructs a track using a given track_definition_function and a user provided input_data object. Very little constraint is set here, other than the fact the name should be a string and track_definition function should return a dictionary with (at least) two key:value pairings: `symbols` and `values`, where the corresponding value for each is bona-fide track input data. Parameters ------------ name : string Name of the track to be used. Should be unique and will always overwrite an existing track with the same name (no safe keyword provided here). input_data : ? Some kind of data that will be passed to the track_definition_function track_definition_function : function Function that takes in `input_data` and returns a dictionary with a 'values' and a 'symbols' key and value pairing. The values that map to 'values' and 'symbols' will be added as a single new track defined by name. safe : bool (default = True) If set to True over-writing tracks will raise an exception, otherwise overwriting a track will simply over-write it. Returns ------------ None No return type, but a new track is added to the Protein. """ if name in self.track_names: if safe is True: raise exceptions.ProteinException('Trying to add Track [%s] in protein [%s] but Track already exists' % (name, self.name)) track_out = track_definition_function(input_data) values = track_out['values'] symbols = track_out['symbols'] self._tracks[name] = Track(name, self, values=values, symbols=symbols) ## ------------------------------------------------------------------------ ## def remove_track(self, track_object, safe=True): """ Function that removes a given Track from the Protein based on the passed Track object. If the passed Track does not exist or is not associate with the protein then this will trigger an exception unless safe=False. Parameters ------------ track_object : shephard.track.Track Object or None Track Object that will be used to retrieve a given protein. Note that remove_track() can tollerate None as the object if Safe=False to enable a single for-loop to iterate over a proteome and remove all tracks of a specific type without worrying as to if the track is present or not. safe : bool (default = True) Flag that if set to True means if a passed track is missing from the underlying protein object an exception wll be raised (ProteinException). If False a missing track is ignored. Returns ----------- None No return type but will remove track from the protein """ # this means we can pass a None into the remove tracks function and it doesnt kill things - makes # it syntactically simple to search over a proteome to remove tracks of a specific type using if type(track_object) != Track: if safe is False: return else: raise ProteinException(f'track_object was not a Track, but Safe=True') # failsafe to ensure we can only delete tracks that truly come from the protein we're passing # into if track_object.protein.unique_ID != self.unique_ID: raise ProteinException(f'Passed Track [{track_object}] not found in this protein [{self.protein}]') # if the passed track object name was found in this protein if track_object.name in self._tracks: self.proteome.__decrement_track_names(track_object.name) del self._tracks[track_object.name] else: if safe: raise ProteinException(f'Passed Track [{track_object}] not found in {self}') ############################### ## ## ## DOMAIN FUNCTIONS ## ## ## ############################### ## ------------------------------------------------------------------------ ## @property def domains(self): """ Returns a list of the Domain objects associated with this protein, sorted by first reside of the domain. """ domain_list = [self._domains[k] for k in self._domains] domain_list.sort(key=lambda x: x.start, reverse=False) return domain_list ## ------------------------------------------------------------------------ ## @property def domain_names(self): """ Returns a list of the domain names associated with this protein """ domain_list = self.domains return [d.domain_name for d in domain_list] ## ------------------------------------------------------------------------ ## @property def domain_types(self): """ Returns a list of the unique domain types associated with this protein. There will be no duplicates here. """ # define an empty set domain_types = set([]) # cycle through the domains and add the domain type to the set for domain in self._domains: domain_types.add(self._domains[domain].domain_type) # convert the set to a list and return return list(domain_types) ## ------------------------------------------------------------------------ ## def domain(self, name, safe=True): """ Function that returns a specific domain as defined by the name. Note it is often more useful to request a domain by type rather than by the name, in which case get_domains_by_type(<domain_type>) is the relevant syntax. Note domains can also be requested based on position (get_domains_by_position). Parameters ---------------- name : string The Domain name. A list of valid names can be found by calling the <Protein>.domains (which returns a list of the valid track names). safe : bool (default = True) Flag which if true with throw an exception if no domain exists with this name. If false function will return None instead. Returns --------- Unknown Will either return the Domain object associated with the name, OR will return None if safe=False and there was no Domain object that matched the name. """ if name in self._domains: return self._domains[name] elif safe: raise exceptions.ProteinException('No domains named [%s] in protein %s\n\nAvailable domains are: %s' % (name, self.unique_ID, str(self.domain_names))) ## ------------------------------------------------------------------------ ## def add_domains(self, list_of_domains, safe=True, autoname=False, verbose=False): """ Function that takes a list of domain dictionaries and adds those domains to the protein. Each domain dictionary within the list must have a key-value pair that defines the following info: * **start** - domain start position (in real sequence, not i0 indexing) * **end** - domain end position (in real sequence, not i0 indexing) * **domain_type** - type of the domain (string) * **attributes** - a dictionary of attributes to associated with the domain (optional) Note that in start, end, and domain_type are the only required key-value pairs required in the dictionary. If you wish to add many domains to main proteins, see interfaces.si_domains.add_domains_from_dictionary() Parameters ------------- list_of_domains : list A list of domain dictionaries. A "domain dictionary" is defined above, but in short is a dictionary with the following key-value pairs: * REQUIRED: * start - int (domain start position) * end - int (domain end position) * domain_type - string (domain type) * OPTIONAL: * attributes - dictionary of arbitrary key-value pairs that will be associated with the domain safe : bool (default = True) If set to True over-writing domains will raise an exception. If False, overwriting a domain will silently over-write. autoname : bool (default = False) If autoname is set to true, this function ensures each domain ALWAYS has a unique name - i.e. the allows for multiple domains to be perfectly overlapping in position and type. This is generally not going to be required and/or make sense, but having this feature in place is useful. In general we want to avoid this as it makes it easy to include duplicates which by default are prevented when autoname=False. verbose : bool (default = True) Flag that defines how 'loud' output is. Will warn about errors on adding domains. Returns ------- None No return value, but will add the passed domains to the protein or throw an exception if something goes wrong! """ # create the input dictionary in_dict = {self.unique_ID:list_of_domains} add_domains_from_dictionary(self.proteome, in_dict, autoname=autoname, safe=safe, verbose=verbose) ## ------------------------------------------------------------------------ ## def add_domain(self, start, end, domain_type, attributes=None, safe=True, autoname=False): """ Function that adds a domain, automatically generating a unique name if none is provided. Domain type can be used to assign a specific type if we want to retrieve domains of a specific type at some point. Position indexing is done for 1 - i.e. the first residue in a protein is 1, not 0. Allows a domain at a specific position to be Parameters ----------- start : int Position of the start of the domain, inclusive. end : int Position of the end of the domain (not inclusive). i.e. if we had a domain that ran from start=10 end=20, it would be 10 residues long and include residues [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]. domain_type : str None unique string that allows a type identifier to be associated with a domain. attributes : dict (default = None) Optional dictionary which allows an arbitrary set of attributes to be associated with a domain, in much the same way that they can be associated with a protein. safe : bool (default = True) If set to True over-writing tracks will raise an exception, otherwise overwriting a track will simply over-write it. autoname : bool (default = False) If autoname is set to true, this function ensures each domain ALWAYS has a unique name - i.e. the allows for multiple domains to be perfectly overlapping in position and type. This is generally not going to be required and/or make sense, but having this feature in place is useful. In general we want to avoid this as it makes it easy to include duplicates which by default are prevented when autoname=False. """ # cast input data start = int(start) end = int(end) domain_type = str(domain_type) # append start and end position to name. full_name = "%s_%i_%i"%(domain_type, start, end) # if this domain name was already found... if full_name in self.domain_names: # if we're in autoname mode create a new unique name. This acts to add an incrementer to the # end and cycles through until a unique domaintype_star_end_incrementer name is found, where # incremementer keeps being incremented if autoname: increment = 0 found = False while found is False: increment = increment + 1 newname = "%s_%i_%i_%i"%(domain_type, start, end, increment) if newname not in self.domain_names: found = True full_name = newname elif safe: raise exceptions.ProteinException('Domain [%s] already found in proteins %s' % (full_name, self.name)) self._domains[full_name] = Domain(start, end, self, domain_type, full_name, attributes=attributes) ## ------------------------------------------------------------------------ ## def build_domain(self, input_data, domain_definition_function, safe=True, autoname=False): """ Function that is somewhat analogous to build_tracks, but allows the user to define a custom function (domain_definition_function) that takes input_data and returns domain information, and then assigns that domain information to a new domain Parameters ---------- input_data : anything Any input that makes sense when passed to the domain_definition_function(). domain_definition_function : function that takes a single argument (input_data) and returns a list of 0 or more dictionaries. Each dictionary within the list has a key-value pair that defines the following info: start : domain start position end : domain end position domain_type : type of the domain attributes : a dictionary of attributes to associated with the domain (optional) Note that in principle only start and end are required, although we highly recommend one/both of domain name and domain type. Some important requirements to consider: (1) domain_definition_function must return a list of zero or more dictionaries. safe : bool (default = True) Flag which if true with throw an exception of a domain with the same name already exists. autoname : bool If autoname is set to True, this function ensures each domain ALWAYS has a unique name - i.e. the allows for multiple domains to be perfectly overlapping in position and type. This is generally not going to be required and/or make sense, but having this feature in place is useful. In general we want to avoid this as it makes it easy to include duplicates which by default are prevented when autoname=False. """ # build our domain definitions. Should probably add code that ensyres # domain_definitions is a list domain_definitions = domain_definition_function(input_data) self.add_domains(domain_definitions) ## ------------------------------------------------------------------------ ## def remove_domain(self, domain_object, safe=True): """ Function that removes a given domain from the protein based on the passed domain object. If the passed domain does not exist or is not associate with the protein then this will trigger an exception unless safe=False. Parameters ------------ domain_object : Domain Object Domain Object that will be removed from the protein safe : bool (default = True) Flag that if set to True means the function is robust to the type of domain_object, and if no such domain exist it is silently skipped. Returns ----------- None No return type but will remove a domain from the protein if present. """ if type(domain_object) != Domain: if safe is False: return else: raise ProteinException(f'{domain_object} was not a Domain, but safe=True.') # if the passed object is found at the excised position if domain_object.domain_name in self._domains: # update proteome self.proteome.__decrement_domain_types(domain_object.domain_type) # remove object del self._domains[domain_object.domain_name] else: if safe: raise ProteinException(f'Passed Domain [{domain_object}] not found in {self}') ## ------------------------------------------------------------------------ ## def get_domains_by_position(self, position, wiggle = 0): """ Functions that allows all domains found at a position to be returned. Wiggle defines +/- residues that are allowed (default = 0) in the search operation. Parameters ---------------- position : int Residue position of interest (position in sequence). wiggle : int (default = 0) Value +/- the position (i.e. lets you look at sites around a specific position). Returns -------------- list Returns a list of Domain objects in the order they appear in the protein. """ return self.get_domains_by_range(position, position, wiggle=wiggle, mode='overlap') ## ------------------------------------------------------------------------ ## def get_domains_by_position_and_type(self, position, domain_type, wiggle = 0): """ Functions that allows all domains found at a position and of a specific type to be returned. Wiggle defines +/- residues that are allowed (default = 0) in the search operation. Parameters ---------------- position : int Residue position of interest (position in sequence). domain_type : str String used to match the against the domain types wiggle : int (default = 0) Value +/- the position (i.e. lets you look at sites around a specific position). Returns -------------- list Returns a list of Domain objects in the order they appear in the protein. """ # get all domains at the position local_domains = self.get_domains_by_range(position, position, wiggle=wiggle, mode='overlap') return_domains = [] for d in local_domains: if d.domain_type == domain_type: return_domains.append(d) return return_domains ## ------------------------------------------------------------------------ ## def get_domains_by_range(self, start, end, wiggle = 0, mode='overlap-strict'): """ Function that allows all domains in a protein that are found within a given range to be returned. Three possible modes can be used here; 'internal', 'overlap-strict' and 'overlap' (default = 'overlap-strict'). 'internal' means that the range defined by start and end is 100% within the domains identified. For example, if a domain was between positions 50 and 100 then a range of 60 to 80 would identify that domain but a range of (say) 40 to 120 would not. This is the least permissive mode. 'overlap-strict' means that the range defined by start and end overlaps with the entire domain, but extra residues on at the start and the end of domain are not penalized. For example, if a domain was between positions 50 and 100 then a range of 40 to 120 would be identified because the domain fully overlaps. However a range of 40 to 70 would not. This is the second least permissive mmode, and all domains defined by 'internal' are also identified by overlap-strict. 'overlap' means that the range can also straddle domain boundaires. for example if a domain was between position 50 and 100 and the range was between 40 and 70 this would count - essentially this means any domains that overlap with the passed range in any way are included. This is the most permissive mode, and all domains identified by 'internal' and 'overlap-strict' are also identified by 'overlap'. Parameters --------------- start : int Start of region of interest (position in sequence) end : int End of region of interest (position in sequence) wiggle : int (default = 0) Value +/- at the edges that are included. mode : str (default = 'overlap-strict') Selector that allows the mode to be used for domain overlap to be defined. Must be one of 'internal', 'overlap-strict', or 'overlap'. Definitions and meaning described above. Returns -------------- list Returns a list of Domain objects in the order they appear in the protein. """ # check the mode keyword is valid general_utilities.valid_keyword('mode', mode, ['internal','overlap-strict','overlap']) # check the start and end values are valid self._check_position_is_valid(start, helper_string='Sequence region cannot start below 1 [%i]'%(start)) self._check_position_is_valid(end, 'Sequence region cannot end after the sequence length (%i) [%i]'%(self._len, end)) # check the wiggle passed is valid if wiggle < 0: raise ProteinException('Passed a wiggle value less than 0') return_list = [] p1 = max(1, start - wiggle) p2 = min(end + wiggle, self._len) # cycle through each domain in the protein and build a list of those domains defined by the range for domain in self.domains: valid = False # this scenario always means overlap if p1 >= domain.start and p2 <= domain.end: valid = True # if we're using overlap strict or overlap then have a second criterion if valid is False and (mode == 'overlap-strict' or mode == 'overlap'): if p1 <= domain.start and p2 >= domain.end: valid = True # if we haven't already found an domain and the mode is overlap if valid is False and mode == 'overlap': if p1 <= domain.start and p2 > domain.start: valid = True if p1 <= domain.end and p2 > domain.end: valid = True if valid is True: return_list.append(domain) return return_list ## ------------------------------------------------------------------------ ## def get_domains_by_type(self, domain_type, perfect_match=True): """ Function that returns a list of domains as matched against a specific domain type name. Parameters ------------ domain_type : string String associated domain_type that you want to search for. perfect_match : bool (default = True) Flag that identifies if the domain names should be a perfect match (=True) or if the string passed should just appear somewhere in the domain_type . Returns ----------- list Returns a list of Domain objects that match the requested type. Objects are ordered by starting position in sequence. """ if perfect_match: def selection(t): if t == domain_type: return True else: return False else: def selection(t): if t.find(domain_type) > -1: return True else: return False domain_list = [] for d in self.domains: if selection(d.domain_type): domain_list.append(d) domain_list.sort(key=lambda x: x.start, reverse=False) return domain_list ############################### ## ## ## SITE FUNCTIONS ## ## ## ############################### ## ------------------------------------------------------------------------ ## @property def sites(self): """ Provides a list of the sites associated with every site on the protein. Sorted N to C terminal. """ # this means we will always all_sites = [] for k in self._sites: for s in self._sites[k]: all_sites.append(s) return all_sites ## ------------------------------------------------------------------------ ## @property def site_positions(self): """ Provides a list of the sorted positions where a site is found the protein. Sorted N to C terminal. """ site_keys = list(self._sites.keys()) site_keys.sort() return site_keys ## ------------------------------------------------------------------------ ## @property def site_types(self): """ Returns a list of the unique site types associated with this protein. There will be no duplicates here. """ # define an empty set site_types = set([]) # cycle through the domains and add the domain type to the set for position in self._sites: site_types.update(set(site.site_type for site in self._sites[position])) # convert the set to a list and return return list(site_types) ## ------------------------------------------------------------------------ ## def site(self, position, safe=True): """ Returns the list of sites that are found at a given position. Note that - in general site() should be used to retrieve sites you know exist while `get_sites_by_position()` offers a way to more safely get sites at a position. Site will throw an exception if the position passed does not exist (while `get_sites_by_position()` will not). Parameters ------------- position : int Defines the position in the sequence we want to interrogate Returns --------- list Returns a list with between 0 and n sites. Will raise an exception if the passed position cannot be found in the codebase unless safe=False, in which case an empty list is returned. """ if int(position) in self._sites: return self._sites[int(position)] if safe: raise exceptions.ProteinException('No sites at position %i in protein %s\n\nAvailable sites are: %s' % (position, self.unique_ID, str(self.site_positions))) else: return [] ## ------------------------------------------------------------------------ ## def add_site(self, position, site_type, symbol=None, value=None, attributes=None): """ Function that adds a site to a specific position in the sequence. Sites are indexed by residue position, and multiple sites can co-exist on the same site, so no name is required (unlike Proteins, Tracks or Domains). site_type is a non-unique identifier that allows sites to be specifically identified/selected. Sites can be associated with a numerical value, a symbol, or both. Sites can also have attributes associated with them. If you wish to add many sites to many proteins, see: interfaces.si_sites.add_sites_from_dictionary() Parameters ----------- position : int Position of site (recall we index from 1 - i.e. the first residue in a protein = 1, not 0. Note that this value is cast to int. site_type : string Non-unique string that allows a type identifier to be associated with a site. symbol : string (default = None) Symbol associated with a site. Symbols are string-based - will often be a single character but could be multiple characters. value : float64 (default = None) Numerical value associated with a site. Note that the value is cast to a float64. attributes : dict (default = None) Optional dictionary which allows an arbitrary set of attributes to be associated with a domain, in much the same way that they can be associated with a protein. """ # recal inside_regions is inclusive if not sequence_utilities.inside_region(1, self._len, position): raise ProteinException("Trying to add site to protein [%s] at positions [%i] - this falls outside the protein's dimensions [%i-%i]" %(str(self), position, 1, self._len)) # cast the position to an int and if there are no sites at that position create an empty list there position = int(position) if position not in self._sites: self._sites[position] = [] # add the site! self._sites[position].append(Site(position, site_type, self, symbol, value, attributes)) ## ------------------------------------------------------------------------ ## def remove_site(self, site_object, safe=True): """ Function that removes a given site from the protein based on the passed site object. If the passed site does not exist or is not associate with the protein then this will trigger an exception unless safe=False. Parameters ------------ site : Site Object Unique ID that will be used to retrieve a given protein. Note that remove_site() can tollerate None as the site_object if Safe=False to enable a single for-loop to iterate over a proteome and remove all sites of a specific type without worrying as to if the site is present or not. safe : bool Flag that if set to True means if a passed unique_ID is missing from the underlying proteome object an exception wll be raised (ProteomeException). If False a missing unique_ID is ignored. Returns ----------- None No return type but will remove site from the protein """ if type(site_object) != Site: if safe is False: return else: raise ProteinException(f'site_object was not a Site, but safe=True.') # excise the site positions site_position = site_object.position if site_position not in self._sites: if safe is False: return else: raise ProteinException(f'Site object is at position {site_position} but no sites were found in protein {self.unique_ID} at this position') # if the passed object is found at the excised position if site_object in self._sites[site_position]: self.proteome.__decrement_site_types(site_object.site_type) # remove object self._sites[site_position].remove(site_object) # remove position entry if no other sites at that location if len(self._sites[site_position]) == 0: del self._sites[site_position] else: if safe: raise ProteinException(f'Passed Site [{site_object}] not found in {self}') ## ------------------------------------------------------------------------ ## def get_sites_by_position(self, position, wiggle = 0, return_list=False): """ Get all sites at a specific position Parameters --------------- position : int Residue position of interest (position in sequence) wiggle : int (default = 0) Value +/- the position (i.e. lets you look at sites around a specific position) return_list : bool By default, the flag returns a dictionary, which is conveninet as it makes it easy to index into one or more sites at a specific position in the sequence. However, you may instead want a list of sites, in which case setting return_list will have the function simply return a list of sites. As of right now we do not guarentee the order of these returned sites. Returns ----------- dict Returns a dictionary where the key is a position (location) and the value is a list of one or more sites at that position. list If return_list is set to True, then a list of Site objects is returned instead. """ return self.get_sites_by_range(position, position, wiggle, return_list) ## ------------------------------------------------------------------------ ## def get_sites_by_range(self, start, end, wiggle = 0, return_list=False): """ Get all sites within a certain range. Parameters --------------- start : int Start of region of interest (position in sequence) end : int End of region of interest (position in sequence) wiggle : int (default = 0) Value +/- at the edges that are included. return_list : bool By default, the flag returns a dictionary, which is conveninet as it makes it easy to index into one or more sites at a specific position in the sequence. However, you may instead want a list of sites, in which case setting return_list will have the function simply return a list of sites. As of right now we do not guarentee the order of these returned sites. Returns ----------- dict Returns a dictionary where the key is a position (location) and the value is a list of one or more sites at that position. list If return_list is set to True, then a list of Site objects is returned instead. """ return_dict = {} self._check_position_is_valid(start, helper_string='Sequence region cannot start below 1 [%i]'%(start)) self._check_position_is_valid(end, 'Sequence region cannot end after the sequence length (%i) [%i]'%(self._len, end)) # check the wiggle passed is valid if wiggle < 0: raise ProteinException('Passed a wiggle value less than 0') # recal p1 and p2 should be in real-world indices p1 = max(1, start - wiggle) p2 = min(end + wiggle, self._len) # recall we need +1 offset so we go to the end - positions/ranges are inclusive # when talking about proteins for j in range(p1, p2+1): if j in self._sites: return_dict[j] = self._sites[j] if return_list is True: # the list comprehension here flattens the returned list return [i for sublist in list(return_dict.values()) for i in sublist] else: return return_dict ## ------------------------------------------------------------------------ ## def get_sites_by_type(self, site_types, return_list=False): """ Get a set of sites that match a specified site-type. Parameters ------------------ site_types : string or list of strings One or more possible site_types that may be found in the protein. Either a single string or a list of strings can be passed, allowing for one or more sites to be grouped together return_list : bool By default, the flag returns a dictionary, which is conveninet as it makes it easy to index into one or more sites at a specific position in the sequence. However, you may instead want a list of sites, in which case setting return_list will have the function simply return a list of sites. As of right now we do not guarentee the order of these returned sites. Returns ---------- dict Returns a dictionary where the key is a position (location) and the value is a list of one or more sites at that position that match the site type of interest. list If return_list is set to True, then a list of Site objects is returned instead. """ return_dict = self.__site_by_type_internal(self._sites, site_types) if return_list is True: # the list comprehension here flattens the returned list return [i for sublist in list(return_dict.values()) for i in sublist] else: return return_dict ## ------------------------------------------------------------------------ ## def get_sites_by_type_and_range(self, site_types, start, end, wiggle=0, return_list=False): """ Returns a set of sites that match both a type of interest and are found in the range provided. Parameters ------------------ site_types : string or list of strings One or more possible site_types that may be found in the protein. Either a single string or a list of strings can be passed, allowing for one or more sites to be grouped together. start : int Start residue that defines start of region to be examined end : int End reidue that defines end of region to be examined wiggle : int (default = 0) Value that adds slack to the start/end positions symmetrically around the start and end positions. return_list : bool By default, the flag returns a dictionary, which is conveninet as it makes it easy to index into one or more sites at a specific position in the sequence. However, you may instead want a list of sites, in which case setting return_list will have the function simply return a list of sites. As of right now we do not guarentee the order of these returned sites. Returns ---------- dict Returns a dictionary where the key is a position (location) and the value is a list of one or more sites at that position that match the site type of interest. list If return_list is set to True, then a list of Site objects is returned instead. """ # first get sites within the range initial_dict = self.get_sites_by_range(start, end, wiggle) # and then subselect sites of the right type return self.__site_by_type_internal(initial_dict, site_types, return_list=return_list) ## ------------------------------------------------------------------------ ## def __site_by_type_internal(self, indict, site_types, return_list=False): """ Internal function that allows a subset of sites to be selected based on the passed site_type(s). Parameters ------------------ site_types : string or list of strings One or more possible site_types that may be found in the protein. Either a single string or a list of strings can be passed, allowing for one or more sites to be grouped together. return_list : bool By default, the flag returns a dictionary, which is conveninet as it makes it easy to index into one or more sites at a specific position in the sequence. However, you may instead want a list of sites, in which case setting return_list will have the function simply return a list of sites. As of right now we do not guarentee the order of these returned sites. Returns ----------- dict Returns a dictionary where the key is a position (location) and the value is a list of one or more sites at that position that match the site type of interest. This is exactly the same structure as the self._sites dictionary, just filtered for a specific site_type. list If return_list is set to True, then a list of Site objects is returned instead. """ # function that allows site_types to be either a string or a list # of strings so one or more sity_types can be passed site_types = general_utilities.string_to_list_of_strings(site_types) return_dict = {} # for each key (which reflects a site position in the passed dictionary) for i in indict: # for each site found in the list associated with that position for site_object in indict[i]: # for the one or more site types in the site_types list for ST in site_types: # if that site type matches the target type if site_object.site_type == ST: # add that site to a new dictionary if i in return_dict: return_dict[i].append(site_object) else: return_dict[i] = [site_object] if return_list is True: # the list comprehension here flattens the returned list return [i for sublist in list(return_dict.values()) for i in sublist] else: return return_dict ## ------------------------------------------------------------------------ ## def __repr__(self): return "| Protein: %s - L=%i, #t=%i, #d=%i, #s=%i, #a=%i |" %(self.unique_ID, self._len, len(self.tracks), len(self.domains), len(self.sites), len(self.attributes)) ## ------------------------------------------------------------------------ ## def __len__(self): return self._len