Source code for shephard.interfaces.si_domains

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""


from . import interface_tools 
from shephard.exceptions import InterfaceException, ProteinException, DomainException
import shephard.exceptions as shephard_exceptions

MAX_BAD_COUNT  = 10

class _DomainsInterface:

    """
    Class whose sole purpose is to encapsulate and then store
    parsed Domains files. This is a hidden class and is not 
    accessible outside of this file
    
    """

    def __init__(self, filename, delimiter='\t', skip_bad=True, preauthorized_uids=None):
        r"""
        Expect files of the following format:

        Unique_ID, start, stop, domain_type, key1:value1, key2:value2, ..., keyn:valuen

        Note that the first four arguments are required, while all of the 
        key:value pairs are optional. Key value must be separated by a ':', 
        but any delimiter (other than ':') 
        
        is allowed.

        When created, this constructor parses the keyfile to generate a .data 
        class object, 
        which itself maps a uniqueID to a list of domain dictionaries.

        Domain dictionaries have the following key-value pairs

        REQUIRED::
            start                : int (domain start position)
            end                  : int (domain end position)
            domain_type          : string (domain type)

            OPTIONAL:
            attributes           : dictionary of arbitrary key-value pairs 
                                   that will be associated with the domain

        Parameters
        ----------------
        
        filename : str
            Name of the shephard domains file to read

        delimiter : str (default = '\\t')
            String used as a delimiter on the input file. 

        skip_bad : bool (default = True)
            Flag that means if bad lines (lines that trigger an exception) 
            are encountered the code will just skip them. By default this is 
            true, which adds a certain robustness to file parsing, but could 
            also hide errors. Note that if lines are skipped a warning will be 
            printed (regardless of verbose flag). 

        preauthorized_uids : list of str (default = None)
            List of unique_IDs that are allowed to be added to the domains
            dictionary. If None then all domains are allowed. Avoids parsing
            lines that are not needed into the interface objects


        """

        bad_count = 0

        if delimiter == ':':
            raise InterfaceException('When parsing domain file cannot use ":" as a delimiter because this is used to delimit key/value pairs (if provided)')

        with open(filename,'r') as fh:
            content = fh.readlines()


        # convert the preauthorized uids to a set for faster lookup.         
        if preauthorized_uids is not None:
            preauthorized_uids = set(preauthorized_uids)
            
        ID2domain = {}

        linecount = 0
        for line in content:

            linecount = linecount + 1

            # skip comment lines
            if interface_tools.is_comment_line(line):
                continue

            sline = line.strip().split(delimiter)
            
            try:
                unique_ID = sline[0].strip()

                # check if UID associated with this line is found in the
                # preauthorized list. If not, skip this line 
                if preauthorized_uids is not None and unique_ID not in preauthorized_uids:
                    continue
                
                start = int(sline[1].strip())
                end = int(sline[2].strip())
                domain_type = sline[3].strip()
                attributes = {}
                
            except Exception as e:

                msg = f'Failed parsing file [{filename}] on line [{linecount}].\n\nException raised: {str(e)}\n\nline printed below:\n{line}'

                # if we're skipping bad things then...
                if skip_bad and bad_count < MAX_BAD_COUNT:
                    bad_count = bad_count + 1
                    shephard_exceptions.print_warning(msg + f"\nSkipping this line (count {bad_count} of {MAX_BAD_COUNT} ...)")
                    continue
                else:
                    raise InterfaceException(msg)
            
            # if some key/value pairs were included then parse these out one at a time
            if len(sline) > 4:
                attributes = interface_tools.parse_key_value_pairs(sline[4:], filename, linecount, line)
                                          
            if unique_ID in ID2domain:
                ID2domain[unique_ID].append({'start':start, 'end':end, 'domain_type':domain_type, 'attributes':attributes})
            else:
                ID2domain[unique_ID] =[{'start':start, 'end':end, 'domain_type':domain_type, 'attributes':attributes}]

        self.data = ID2domain



##############################################
##                                          ##
##     PUBLIC FACING FUNCTIONS BELOW        ##
##                                          ##
##############################################


## ------------------------------------------------------------------------
##
[docs]def add_domains_from_file(proteome, filename, delimiter='\t', autoname=False, return_dictionary=False, safe=True, skip_bad=True, verbose=True): r""" Function that takes a correctly formatted shephard 'domains' file and reads all domains into the passed Proteome. Expect Domain files to have the following format: One domain per line where with the format:: 1 2 3 4 5 6 n Unique_ID start stop domain_type key_1:value_1 key_2:value_2 ... key_n:value_n A couple of key points here: * The default delimiter is tabs ('\\t') but this can be changed with the delimiter argument. * The first four elements in the each line are required, while all of the key:value pairs are optional * Attribute key-value pairs must be separated by a ``:`` character. As a result any column delimiter (other than ``:``) can be used, but ``:`` is reserved for this role Parameters ---------- proteome : shephard.proteome.Proteome Proteome object to which domains will be added filename : str Name of the shephard domains file to read delimiter : str (default = '\\t') String used as a delimiter on the input file. autoname : bool (default = False) If autoname is set to True, this function ensures each domain ALWAYS has a unique name - i.e. the allows for multiple domains to be perfectly overlapping in position and type. This is generally not going to be required and/or make sense, but having this feature in place is useful. In general we want to avoid this as it makes it easy to include duplicates which by default are prevented when autoname = False. return_dictionary : bool, default=False If set to true, this function will return the domains dictionary and will NOT add that dictionary to the proteome - i.e. the function basically becomes a parser for SHEPHARD-compliant domains files. safe : bool (default = True) If set to True then any exceptions raised during the domain-adding process (i.e. after file parsing) are acted on. If set to false, exceptions simply mean the domain in question is skipped. Note if set to False, pre-existing domains with the same name would be silently overwritten (although this is not consider an error), while overwriting will trigger an exception in safe=True. There are various reasons domain addition could fail (start/end position outside of the protein limits etc) and so if verbose=True then the cause of an exception is also printed to screen. It is highly recommend that if you choose to use safe=False you also set verbose=True. skip_bad : bool (default = True) Flag that means if bad lines (lines that trigger an exception) are encountered the code will just skip them. By default this is true, which adds a certain robustness to file parsing, but could also hide errors. Note that if lines are skipped a warning will be printed (regardless of verbose flag). verbose : bool (default = True) Flag that defines how 'loud' output is. Will warn about errors on adding domains. Returns ----------- None or dict If return_dictionary is set to False (default) then this function has no return value, but the domains are added to the Proteome object passed as the first argument. If return_dictionary is set to True the function returns the parsed domains dictionary without adding the newly-read domains to the proteome. """ # check first argument is a proteome interface_tools.check_proteome(proteome, 'add_domains_from_file (si_domains)') # next read in the file domains_interface = _DomainsInterface(filename, delimiter = delimiter, skip_bad=skip_bad, preauthorized_uids = proteome.proteins) if return_dictionary: return domains_interface.data # finally add the domains from the dictionary generated by the # DomainsInterface parser add_domains_from_dictionary(proteome, domains_interface.data, autoname=autoname, safe=safe, verbose=verbose)
## ------------------------------------------------------------------------ ##
[docs]def add_domains_from_dictionary(proteome, domain_dictionary, autoname=False, safe=True, verbose=True): """ Function that takes a correctly formatted Domains dictionary and will add those domains to the proteins in the Proteome. Domains dictionaries are key-value pairs, where the key is a unique_ID associated with a given protein, and the value is a list of dictionaries. Each subdictionary has four key-value pairs:: * 'start' = start position (int showing start of the domain, starting at 1) * 'end' = end position (int showing end of the domain, inclusive) * 'domain_type' = domain type (string that names the domain) * 'attributes' = dictionary of arbitrary key:value pairings (optional) The start and end positions should be locations within the sequence defined by the unique_ID, and if they are out of the sequence bounds this will throw an exception. Domain type is a string that names the type of domain. The attributes dictionary is an arbitrary key-value pair dictionary where key-values map an arbitrary key to an arbitrary value (read in as strings). In this way, each domain that maps to a give unique_ID will be added. Note the attribute is optional. Parameters ---------- proteome : Proteome object Proteome object to which domains will be added domain_dictionary : dict Dictionary that maps unique_IDs to a list of one or more domain dictionaries autoname : bool (default = False) If autoname is set to true, this function ensures each domain ALWAYS has a unique name - i.e. the allows for multiple domains to be perfecly overlapping in position and type. This is generally not going to be required and/or make sense, but having this feature in place is useful. In general we want to avoid this as it makes it easy to include duplicates which by default are prevented when autoname = False. safe : bool (default = True) If set to True then any exceptions raised during the Domain-adding process are acted on. If set to False, exceptions simply mean the domain in question is skipped. Note if set to False, pre-existing Domains with the same name would be silently overwritten (although this is not consider an error), while overwriting will trigger an exception in safe=True There are various reasons Domain addition could fail (start/end position outside of the protein limits etc.) and so if verbose=True then the cause of an exception is also printed to screen. It is highly recommend that if you choose to use safe=False you also set verbose=True. verbose : bool (default = True) Flag that defines how 'loud' output is. Will warn about errors on adding domains. Returns ----------- None No return value, but domains are added to the Proteome object passed as the first argument. """ # Note - the safe keyword is actually dealt with in this function in conjunction with the Verbose # keyword, so we pass safe=False to the add_domain function and then catch the exception in this # function. # check first argument is a proteome interface_tools.check_proteome(proteome, 'add_domains (si_domains)') for protein in proteome: if protein.unique_ID in domain_dictionary: for domain in domain_dictionary[protein.unique_ID]: start = domain['start'] end = domain['end'] domain_type = domain['domain_type'] try: ad = domain['attributes'] except: ad = {} # try and add the domain... try: protein.add_domain(start, end, domain_type, attributes=ad, safe=safe, autoname=autoname) except (ProteinException, DomainException) as e: msg='- skipping domain at %i-%i on %s' %(start, end, protein) if safe: shephard_exceptions.print_and_raise_error(msg, e) else: if verbose: shephard_exceptions.print_warning(msg) continue
## ------------------------------------------------------------------------ ##
[docs]def add_domain_attributes_from_file(proteome, filename, delimiter='\t', safe=True, add_new=True, skip_bad=True, verbose=True): r""" Function that takes a correctly formatted 'domain' files and reads all domain attributes adding them to domains in the passed proteome, if new domains are inclued the add_new flag determins if new domains are added. The function expects domain attribute files to have the following format: One domain defined per line (although the same protein can appear multiple times):: Unique_ID, domain_name, key1:value1, key2:value2, ..., keyn:valuen A couple of key points here: * The default delimiter is tabs ('\\t') but this can be changed with the delimiter argument * Key value must be separated by a ':', as a result, any delimiter (other than ':') can be used, but ':' is reserved for this role. Parameters ------------ proteome : Proteome Object Proteome object to which attributes will be added filename : str Name of the shephard protein attributes file to read delimiter : str (default = '\t') String used as a delimiter on the input file. add_new : boolean (default = True) If set to True then any new found domains are added to their associated protein. If False any unfound domains are not added and are skipped over. If a new domain is passed that does not have an associated protein in the passed proteome an exception will always be raised regardless of the status of this parameter. safe : bool (default = True) If set to True then any exceptions raised during the protein_attribute-adding process are acted on. If set to False, exceptions simply mean the protein_attribute in question is skipped. Note if set to False, pre-existing protein_attributes with the same name would be silently overwritten (although this is not consider an error), while overwriting will trigger an exception in safe=True. The only reason protein attribute addition could fail is if the attribute already exists, so this is effectively a flag to define if pre-existing attributes should be overwritten (False) or not (True). skip_bad : bool (default = True) Flag that means if bad lines (lines that trigger an exception) are encountered the code will just skip them. By default this is true, which adds a certain robustness to file parsing, but could also hide errors. Note that if lines are skipped a warning will be printed (regardless of verbose flag). verbose : bool (default = True) Flag that defines how 'loud' output is. Will warn about errors on adding attributes. Returns ----------- None or dict If return_dictionary is set to False (default) then this function has no return value, but the protein_attributes are added to the Proteome object passed as the first argument. If return_dictionary is set to True the function returns the parsed domains_dictionary without adding the newly-read protein_attributes to the proteome. """ # check first argument is a proteome interface_tools.check_proteome(proteome, 'add_attributes_from_file (si_protein_attributes)') # next read in the domain file domains_interface = _DomainsInterface(filename, delimiter, skip_bad=skip_bad) # finally add the domains from the dictionary generated by the DomainsInterface parser add_domain_attributes_from_dictionary(proteome, domains_interface.data, add_new=add_new, safe=safe, verbose=verbose)
## ------------------------------------------------------------------------ ##
[docs]def add_domain_attributes_from_dictionary(proteome, domain_dictionary, add_new=True, safe=True, verbose=True): """ Function that takes a correctly formatted Domains dictionary and will add those associated attributes domains to the proteins in the Proteome. Domains dictionaries are key-value pairs, where the key is a unique_ID associated with a given protein, and the value is a list of dictionaries. Each subdictionary has four key-value pairs: * 'protein' the unique_ID of the protein for which to domain is associated with * 'domain_name' = domain type (string that names the domain) * 'attributes' = dictionary of arbitrary key:value pairings (optional) The start and end positions should be locations within the sequence defined by the unique_ID, and if they are out of the sequence bounds this will throw an exception. Domain type is a string that names the type of domain. The attributes dictionary is an arbitrary key-value pair dictionary where key-values map an arbitrary key to an arbitrary value (read in as strings). In this way, each domain that maps to a give unique_ID will be added. Note the attribute is optional. Parameters ---------- proteome : Proteome object Proteome object to which domains will be added domain_dictionary : dict Dictionary that maps unique_IDs to a list of one or more domain dictionaries. add_new : boolean (default = True) If set to True then any new found domains are added to their associated protein. If False any unfound domains are not added and are skipped over. If a new domain is passed that does not have an associated protein in the passed proteome an exception will always be raised regardless of the status of this parameter. safe : bool (default = True) If set to True then any exceptions raised during the Domain-adding process are acted on. If set to False, exceptions simply mean the domain in question is skipped. Note if set to False, pre-existing Domains with the same name would be silently overwritten (although this is not consider an error), while overwriting will trigger an exception in safe=True There are various reasons Domain addition could fail (start/end position outside of the protein limits etc.) and so if verbose=True then the cause of an exception is also printed to screen. It is highly recommend that if you choose to use safe=False you also set verbose=True. verbose : bool (default = True) Flag that defines how 'loud' output is. Will warn about errors on adding domains. Returns ----------- None No return value, but domains are added to the Proteome object passed as the first argument. """ # Note - the safe keyword is actually dealt with in this function in conjunction with the Verbose # keyword, so we pass safe=False to the add_domain function and then catch the exception in this # function. # check first argument is a proteome interface_tools.check_proteome(proteome, 'add_domains (si_domains)') # iterate proteins with new domains for unique_ID in domain_dictionary: # check if protein in proteome if unique_ID in proteome: # build dict of local domains with domain IDs as keys local_domain_dict = {"%s_%i_%i" % (d.domain_type, d.start, d.end): d for d in proteome.protein(unique_ID).domains} # iterate new domains for new_domain in domain_dictionary[unique_ID]: new_domain_ID = "%s_%i_%i" % (new_domain['domain_type'], new_domain['start'], new_domain['end']) # check if domain is in local domain by ID if new_domain_ID in local_domain_dict: local_domain = local_domain_dict[new_domain_ID] try: ad = new_domain['attributes'] except: ad = {} # merge attributes for k, v in ad.items(): try: local_domain.add_attribute(k, v, safe=safe) except (ProteinException, DomainException) as e: msg = f"- skipping attribute being added to {unique_ID} for domain type {new_domain['domain_type']}, with start={new_domain['start']} and end={new_domain['end']})" if safe: shephard_exceptions.print_and_raise_error(msg, e) else: if verbose: shephard_exceptions.print_warning(msg) continue # move on to next new domain continue # add new domain if new domain not found and flag is true if add_new: try: ad = new_domain['attributes'] except: ad = {} # try and add the domain... try: proteome.protein(unique_ID).add_domain(new_domain['start'], new_domain['end'], new_domain['domain_type'], attributes=ad, safe=safe) except (ProteinException, DomainException) as e: msg='- skipping domain at %i-%i on %s' %(new_domain['start'], new_domain['end'], proteome.protein(unique_ID)) if safe: shephard_exceptions.print_and_raise_error(msg, e) else: if verbose: shephard_exceptions.print_warning(msg) continue
## ------------------------------------------------------------------------ ##
[docs]def write_domains(proteome, filename, delimiter='\t', domain_types=None): r""" Function that writes out domains to a SHEPHARD domains file. Note that attributes are converted to a string, which for simple attributes is reasonable but is not really a viable stratergy for complex objects, although this will not yeild and error. Parameters ----------- proteome : Proteome object Proteome object from which the domains will be extracted from filename : str Filename that will be used to write the new domains file delimiter : str (default = '\\t') Character (or characters) used to separate between fields. Default is '\t' Which is recommended to maintain compliance with default `add_domains_from_file()` function. domain_types : list (default None) Lets you define a list of one or more domain types that will be written out. Domain types are passed as strings which should map to named domain types in the Proteome. Returns -------- None No return type, but generates a new file with the complete set of domains from this proteome written to disk. """ # added so that we ensure domain_types is a list if passed if domain_types is not None: if type(domain_types) is not list: raise InterfaceException('When passing a domain_type this must be a list') with open(filename, 'w') as fh: for protein in proteome: for d in protein.domains: # if domain_types is passed check if each domain # is found in the list if domain_types is not None: if d.domain_type not in domain_types: continue line = __build_domain_line(d, delimiter) fh.write(line)
## ------------------------------------------------------------------------ ##
[docs]def write_domains_from_list(domain_list, filename, delimiter='\t'): r""" Function that writes out domains to a SHEPHARD domains file from a list of Domain objects. Note that attributes are converted to a string, which for simple attributes is reasonable but is not really a viable stratergy for complex objects, although this will not yeild and error. Parameters ----------- domain_list : List of Domain objects List of domain objects which will be written filename : str Filename that will be used to write the new domains file delimiter : str (default = '\\t') Character (or characters) used to separate between fields. Default is '\\t' which is recommended to maintain compliance with default `add_domains_from_file()` function Returns -------- None No return type, but generates a new file with the complete set of domains from this proteome written to disk. """ # first check if items in the list are Domain objects for d in domain_list: interface_tools.check_domain(d, 'write_domains_from_list') with open(filename, 'w') as fh: for d in domain_list: line = __build_domain_line(d, delimiter) fh.write(line)
## ------------------------------------------------------------------------ ## def __build_domain_line(d, delimiter): """ Internal function that takes a Domain object and returns a line that can be written to a Domains file. This is called internally by functions that write Domains. Parameters ---------------------- d : shephard.Domain Domain object being converted to a string delimiter : str (default = '\\t') Character (or characters) used to separate between fields. Default is the tab character ('\\t'), which is recommended to maintain compliance with default SHEPHARD file-reading functions. Returns -------------- str Returns a string that is ready to be written to file """ # systematically construct each line in the file line = '' line = line + str(d.protein.unique_ID) + delimiter start = d.start line = line + str(start) + delimiter end = d.end line = line + str(end) + delimiter domain_type = d.domain_type # note last required element has no trailing delimiter line = line + str(domain_type) if d.attributes: for k in d.attributes: # atrbt = interface_tools.full_clean_string(d.attribute(k)) line = line + delimiter + f"{k}:{atrbt}" line = line + "\n" return line