Source code for shephard.domain

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
 

"""

from . import sequence_utilities
from .exceptions import DomainException
from . import general_utilities
from .tools import domain_tools

# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# Class that defines a sequence region 
#
[docs]class Domain: ## ------------------------------------------------------------------------ ## def __init__(self, start, end, protein, domain_type, domain_name, attributes=None): """ Domains are defined sub-regions within a protein. Proteins contain a list of 0 or more domains, and each domain is associated with the protein it originates from via the linking protein object. Domains are indexed using the same indexing as the overall protein sequence (i.e. a protein does not automatically start from 1), and as such the 'native' sequence indexing should be used for working with Proteins. This is a long-winded way of saying position X refers to the same residue regardless of if it's taken from the Protein or Domain or Track object. Parameters ------------- start : int Start position in sequence (recall we index from 1) end : int End position in sequence (recall we index from 1) protein : Protein Protein object for which this Domain is part of domain_type : str Name of the domain type - can be any free-form string domain_name : str The name used as an index by the associated protein to identify this domain. This is ONLY used such that you can re-reference a domain back to the protein if needed. attributes : dict (default = None) Dictionary where key/value pairs allow a Domain to have arbitrary metadata associated with it. """ if start > end: raise DomainException("Trying to a domain to protein [%s] where start site is bigger than the end site (positions: %i-%i - this does not work!" %(str(protein), start, end)) # check the domain falls within the region helper_string="Trying to add domain to protein [%s] at positions [%i-%i] - this falls outside the protein's dimensions [%i-%i]" %(protein, start, end, 1, protein._len) protein._check_position_is_valid(start, helper_string) protein._check_position_is_valid(end, helper_string) # assign if all OK self._start = int(start) self._end = int(end) self._protein = protein self._domain_type = domain_type self._domain_name = domain_name general_utilities.variable_is_dictionary(attributes, DomainException, 'attributes argument passed to domain %s [%i-%i] in protein %s is not a dictionary' %(self._domain_type, self._start, self._end, self._protein), or_none=True) if attributes is None: self._attributes = {} else: self._attributes = attributes # update unique domain types protein.proteome.__update_domain_types(self._domain_type) ## ------------------------------------------------------------------------ ## @property def attributes(self): """ Provides a list of the keys associated with every attribute associated with this domain. Returns ------- list returns a list of the attribute keys associated with the domain. """ return list(self._attributes.keys()) ## ------------------------------------------------------------------------ ## def attribute(self, name, safe=True): """ Function that returns a specific attribute as defined by the name. Recall that attributes are name : value pairs, where the 'value' can be anything and is user defined. This function will return the value associated with a given name. Parameters ---------------- name : str The attribute name. A list of valid names can be found by calling the ``<domain>.attributes()`` (which returns a list of the valid names) safe : bool (default = True) Flag which if true with throw an exception if an attribute with the same name already exists. Returns --------- Unknown Will either return whatever was associated with that attribute (which could be anything) or None if that attribute is missing. """ # if name is in the _atributes dictionary the return if name in self._attributes: return self._attributes[name] else: # else if safe was passed raise an exception if that attribute was missing if safe: raise DomainException('Requesting attribute [%s] from domain [%s] but this attribute has not been assigned' % (name, str(self))) # if safe not passed just return None else: return None ## ------------------------------------------------------------------------ ## def add_attribute(self, name, val, safe=True): """ Function that adds an attribute. Note that if safe is true, this function will raise an exception if the attribute is already present. If safe=False, then an exisiting value will be overwritten. Parameters ---------------- name : str Name that will be used to identify the attribute val : <anything> An object or primitive we wish to associate with this attribute safe : bool (default = True) Flag which if True with throw an exception if an attribute with the same name already exists, otherwise the newly introduced attribute will overwrite the previous one. Returns --------- None - but adds an attribute to the calling object """ if safe: if name in self._attributes: raise DomainException("Trying to add attribute [%s=%s] to domain [%s] but this attribute is already set.\nPossible options are: %s" %(name,val, str(self), str(self._attributes.keys()))) self._attributes[name] = val ## ------------------------------------------------------------------------ ## def remove_attribute(self, name, safe=True): """ Function that removes a given attribute from the Domain based on the passed attribute name. If the passed attribute does not exist or is not associate with the Domain then this will trigger an exception unless safe=False. Parameters ---------------- name : str The attribute name that will be used to identify it safe : bool (default = True) Flag which if True with throw an exception if an attribute this name does not exists. If set to False then if an attribute is not found it is simply ignored Returns --------- None No return type but will remove an attribute from the protein if present. """ if name not in self._attributes: if safe: raise DomainException(f'Passed attribute [{name}] not found in {self}') else: del self._attributes[name] ## ------------------------------------------------------------------------ ## @property def start(self): """ **[Property]**: Returns the start position that defines this domain :getter: Returns the start of the domain (indexed from 1) :setter: None :type: int """ return self._start ## ------------------------------------------------------------------------ ## @property def end(self): """ **[Property]**: Returns the end position that defines this domain """ return self._end ## ------------------------------------------------------------------------ ## @property def protein(self): """ **[Property]**: Returns the Protein that this Domain is associated with """ return self._protein ## ------------------------------------------------------------------------ ## @property def sequence(self): """ **[Property]**: Returns the amino acid sequence associated with this domain """ return self._protein.get_sequence_region(self._start, self._end) ## ------------------------------------------------------------------------ ## @property def domain_type(self): """ Returns the domain type as a string """ return self._domain_type ## ------------------------------------------------------------------------ ## @property def domain_name(self): """ Returns the domain name as generated when added to the protein """ return self._domain_name ## ------------------------------------------------------------------------ ## def update_domain_name(self, new_name): """ Function that updates the domains name Parameters ----------- new_name : str String passed as new name Returns ----------- None Nothing but sets the name to be the new name """ self._domain_name = new_name ###################################### ## ## ## DOMAIN FUNCTIONS # ## ## ###################################### ## ------------------------------------------------------------------------ ## def inside_domain(self, position): """ Function that returns True/False depending on if the provided position lies inside the domain. Parameters ------------ position : int Position in the sequence Returns ----------- bool Returns True if position is inside the domain region, else False """ return sequence_utilities.inside_region(self.start, self.end, position) ## ------------------------------------------------------------------------ ## def domain_overlap(self, domain2): """ Function that takes in a second domain and calculates if those two domains overlap at all. This is a binary check and does not compute the extent of overlap. Parameters ------------ domain2 : Domain The Domain object of interest Returns ----------- bool Returns True if the domains overlap, False if not. Note this will throw an exception if the domains are from different proteins. """ return domain_tools.domain_overlap(self, domain2) ###################################### ## ## ## DOMAIN SITE FUNCTIONS ## ## ## ###################################### ## ------------------------------------------------------------------------ ## @property def sites(self): """ Get list of all sites inside the domain. Returns -------- list Returns a list of all the sites """ all_sites = [] sites_dict = self._protein.get_sites_by_range(self.start, self.end) for k in sites_dict: for local_site in sites_dict[k]: all_sites.append(local_site) return all_sites ## ------------------------------------------------------------------------ ## @property def site_positions(self): """ Get list of all sites inside the domain. Returns -------- list Returns a list of all the site positions """ return list(self._protein.get_sites_by_range(self.start, self.end).keys()) ## ------------------------------------------------------------------------ ## def site(self, position): """ Returns the list of sites that are found at a given position. Note that - in generalsite() should be used to retrieve sites you know exist while get_sites_by_position() offers a way to more safely get sites at a position. Site will throw an exception if the position passed does not exist (while get_sites_by_position() will not). Parameters ------------- position : int Defines the position in the sequence we want to interrogate Returns --------- list Returns a list with between 1 and n sites. Will raise an exception if the passed position cannot be found in the codebase. """ ipos = int(position) if sequence_utilities.inside_region(self.start, self.end, ipos): return self._protein._sites[int(position)] else: raise DomainException('Passed position [%i] is outside of the domain boundaries [%i-%i]' %(ipos, domain.start, domain.end)) ## ------------------------------------------------------------------------ ## def get_sites_by_type(self, site_type, return_list=False): """ Get dictionary of list of sites inside the domain Parameters ------------ site_type : string The site type identifier for which the function will search for matching sites return_list : bool By default, the flag returns a dictionary, which is convenient as it makes it easy to index into one or more sites at a specific position in the sequence. However, you may instead want a list of sites, in which case setting return_list will have the function simply return a list of sites. As of right now we do not guarentee the order of these returned sites. Returns -------- list Returns a dictionary, where each key-value pair is: key - site position (integer) value - list of one or more site object """ return self._protein.get_sites_by_type_and_range(site_type, self.start, self.end) ####################################### ## ## ## DOMAIN TRACK FUNCTIONS ## ## ## ####################################### ## ------------------------------------------------------------------------ ## def get_track_values(self, name, safe=True): """ Function that returns the region of a protein's values- track associated with this domain. If the track name is not found in this protein and safe is True, this will throw an exception, otherwise (if safe=False) then if the track is missing the function will return None. Parameters -------------- name : str Track name safe : bool (default = True) If set to True, missing tracks trigger an exception, else they just return None Returns ---------- list Returns a list of floats that corresponds to the set of residues associated with the domain of interest, or None if the track does not exist and safe=False. """ t = self._protein.track(name, safe) if t is not None: try: return t.values_region(self._start, self._end) except TypeError: if t.values == None: raise DomainException('Passed associated track has no values - try get_track_symbols') else: raise DomainException('Error with passed associated track') else: return None ## ------------------------------------------------------------------------ ## def get_track_symbols(self, name, safe=True): """ Function that returns the region of a protein's symbols track associated with this domain. If the track name is missing and safe is True, this will throw an exception, otherwise (if safe=False) then if the track is missing the function returns None Parameters -------------- name : str Track name safe : bool (default = True) If set to True, missing tracks trigger an exception, else they just return None Returns ---------- list Returns a list of strings that corresponds to the set of residues associated with the domain of interest. """ t = self._protein.track(name, safe) if t is not None: try: return t.symbols_region(self._start, self._end) except TypeError: if t.symbols == None: raise DomainException('Passed associated track has no symbols - try get_track_values') else: raise DomainException('Error with passed associated track') else: return None ## ------------------------------------------------------------------------ ## def __repr__(self): return "|Domain: %s (%i-%i, len=%i) in protein %s" % (self._domain_type, self.start, self.end, len(self), self.protein.unique_ID) ## ------------------------------------------------------------------------ ## def __len__(self): return len(self.sequence)