Source code for shephard.interfaces.si_sites

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""

from . import interface_tools 
import shephard.exceptions as shephard_exceptions
from shephard.exceptions import InterfaceException, ProteinException, SiteException

MAX_BAD_COUNT  = 10

class _SitesInterface:

    def __init__(self, filename, delimiter='\t', skip_bad=True, preauthorized_uids=None):
        """
        Expect files of the following format:
        
        A SHEPHARD sites file is a tab (or other) delineated file where each 
        line has the following convention::
    
               1        2          3       4      5   [      6            7        ...     n         ] 
            Unique_ID position site_type symbol value [key_1:value_1 key_2:value_2 ... key_n:value_n ]
    
        Each line has six required values and then can have as many key:value pairs as may be
        desired.

        Note that the first four arguments are required, while all of the 
        key:value pairs are optional. Key value must be separated by a ':', 
        but any delimiter (other than ':') is allowed. 

        Parameters
        ----------------
        
        filename : str
            Name of the shephard domains file to read

        delimiter : str (default = \t)
            String used as a delimiter on the input file. 

        skip_bad : bool (default = True)
            Flag that means if bad lines (lines that trigger an exception) 
            are encountered the code will just skip them. By default this is 
            true, which adds a certain robustness to file parsing, but could 
            also hide errors. Note that if lines are skipped a warning will be 
            printed (regardless of verbose flag). 

        preauthorized_ids : list of str (default = None)
            List of unique_IDs that are allowed to be added to the sites
            dictionary. If None then all sites are allowed. Avoids parsing
            lines that are not needed into the interface objects

        """

        bad_count = 0

        if delimiter == ':':
            raise InterfaceException('When parsing site file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)')


        with open(filename,'r') as fh:
            content = fh.readlines()

        # convert the preauthorized uids to a set for faster lookup
        if preauthorized_uids is not None:
            preauthorized_uids = set(preauthorized_uids)

        ID2site = {}
        
        linecount=0
        for line in content:

            linecount = linecount + 1

            # skip comment lines
            if interface_tools.is_comment_line(line):
                continue

            sline = line.strip().split(delimiter)

            try:
                unique_ID = sline[0].strip()

                # check if UID associated with this line is found in the
                # preauthorized list. If  not then skip this line
                if preauthorized_uids is not None and unique_ID not in preauthorized_uids:
                    continue
                
                position = int(sline[1].strip())
                site_type = sline[2].strip()
                symbol = sline[3].strip()

                # this enables the value to be None if you
                # write a symbol where there's no value associated
                # with a site
                tmp = sline[4].strip()
                if tmp == 'None':
                    value = None
                else:
                    value = float(tmp)

                attributes = {}
                
            except Exception as e:
                msg = f'Failed parsing file [{filename}] on line [{linecount}].\n\nException raised: {str(e)}\n\nline printed below:\n{line}'

                # should update this to also display the actual error...
                if skip_bad and bad_count < MAX_BAD_COUNT:
                    bad_count = bad_count + 1
                    shephard_exceptions.print_warning(msg + f"\nSkipping this line (count {bad_count} of {MAX_BAD_COUNT} ...)")
                    continue
                else:
                    raise InterfaceException(msg)

            # if there's more parse attribute dictionary entries
            if len(sline) > 5:
                attributes = interface_tools.parse_key_value_pairs(sline[5:], filename, linecount, line)

            if unique_ID in ID2site:
                ID2site[unique_ID].append({'position':position, 'site_type':site_type, 'symbol':symbol, 'value':value, 'attributes':attributes})
            else:
                ID2site[unique_ID] =[{'position':position, 'site_type':site_type, 'symbol':symbol, 'value':value, 'attributes':attributes}]

        self.data = ID2site



##############################################
##                                          ##
##     PUBLIC FACING FUNCTIONS BELOW        ##
##                                          ##
##############################################


## ------------------------------------------------------------------------
##
[docs]def add_sites_from_file(proteome, filename, delimiter='\t', return_dictionary=False, safe=True, skip_bad=True, verbose=True): r""" Function that provides the user-facing interface for reading correctly configured SHEPHARD sites files and adding those sites to the proteins of interest. A SHEPHARD sites file is a tab (or other) delineated file where each line has the following convention:: 1 2 3 4 5 [ 6 7 ... n ] Unique_ID position site_type symbol value [key_1:value_1 key_2:value_2 ... key_n:value_n ] Each line has six required values and then can have as many key:value pairs as may be desired. Parameters ---------- proteome : Proteome Proteome object to which we're adding sites. Note that ONLY sites for which a protein is found will be used. Protein-Site cross-referencing is done using the protein's unique_ID which should be the key used in the sites_dictionary filename : str Name of the shephard site file to be read delimiter : str (default = '\\t') String used as a delimiter on the input file. return_dictionary : bool, default=False If set to true, this function will return the sites dictionary and will NOT add that dictionary to the proteome - i.e. the function basically becomes a parser for SHEPHARD-compliant sites files. safe : bool (default = True) If set to True then any exceptions raised during the site-adding process (i.e. after file parsing) are acted on. If set to False, exceptions simply mean the site in question is skipped. There are various reasons site addition could fail (e.g. site falls outside of protein position so if verbose=True then the cause of an exception is also printed to screen. It is highly recommend that if you choose to use safe=False you also set verbose=True. Default = True. skip_bad : bool (default = True) Flag that means if bad lines (lines that trigger an exception) are encountered the code will just skip them. By default this is true, which adds a certain robustness to file parsing, but could also hide errors. Note that if lines are skipped a warning will be printed (regardless of verbose flag). verbose : bool (default = True) Flag that defines how 'loud' output is. Will warn about errors on adding sites. Returns --------- None or dict If return_dictionary is set to False (default) then this function has no return value, but the sites are added to the Proteome object passed as the first argument. If return_dictionary is set to True the function returns the parsed sites dictionary without adding the newly-read sites to the proteome. """ # check first argument is a proteome interface_tools.check_proteome(proteome, 'add_sites_from_file (si_sites)') # build the SitesInterface object sites_interface = _SitesInterface(filename, delimiter=delimiter, skip_bad=skip_bad, preauthorized_uids = proteome.proteins) if return_dictionary: return sites_interface.data # finally add the site from the dictionary generated by the # SitesInterface parser add_sites_from_dictionary(proteome, sites_interface.data, safe, verbose)
## ------------------------------------------------------------------------ ##
[docs]def add_sites_from_dictionary(proteome, sites_dictionary, safe=True, verbose=False): """ Function that takes a correctly formatted Sites dictionary and will add those Sites to the proteins in the Proteome. Sites dictionaries are key-value pairs, where the key is a unique_ID associated with a given Protein, and the value is a list of dictionaries. Each subdirectionay has the following elements:: 'position' = site position 'site_type' = site type 'symbol' = site symbol 'value' = site value 'attributes' = site attribute dictionary In this way, each site that maps to a give unique_ID will be added to the associated protein. The use of a list of dictionaries (as opposed to a simple unique_ID:site_dictionary pairing) means multiple sites for a single protein can be added at once. Parameters ------------- proteome : Proteome Proteome object to which we're adding sites. Note that ONLY sites for which a protein is found will be used. Protein:Site cross-referencing is done using the protein's unique_ID which should be the key used in the sites_dictionary sites_dictionary : dict A sites dictionary (defined above) is dictionary that maps a unique_ID back to a list of dictionaries, where each subdictionay has five elements, desribed above. Recall the only type-specific values (position and value) are cast automatically when a site is added by the Protein object, so there is no need to do that in this function too. Extra key-value paris in each sub-dictionary are ignored safe : bool (default = True) If set to True then any exceptions raised during the site-adding process are acted on. If set to false, exceptions simply mean the site in question is skipped. There are various reasons site addition could fail (notably position of the site is outside of the protein limits) and so if verbose=True then the cause of an exception is also printed to screen. It is highly recommend that if you choose to use safe=False you also set verbose=True verbose : bool (default = False) Flag that defines how 'loud' output is. Will warn about errors on adding sites. Returns --------- None No return value, but adds all of the passed sites to the protein """ for protein in proteome: if protein.unique_ID in sites_dictionary: for site in sites_dictionary[protein.unique_ID]: try: position = site['position'] site_type = site['site_type'] symbol = site['symbol'] value = site['value'] try: ad = site['attributes'] except: ad = {} except Exception: raise InterfaceException('When sites dictionary for key [%s] was unable to extract five distinct parametes. Entry is:\n%s\n'% (protein.unique_ID, site)) # assuming we can read all five params try and add the site try: protein.add_site(position, site_type, symbol, value, attributes = ad) except ProteinException as e: msg='- skipping site %s at %i on %s' %(site_type, position, protein) if safe: shephard_exceptions.print_and_raise_error(msg, e) else: if verbose: shephard_exceptions.print_warning(msg) continue
## ------------------------------------------------------------------------ ##
[docs]def write_sites(proteome, filename, delimiter='\t', site_types=None): r""" Function that writes out sites to file in a standardized format. Note that attributes are converted to a string, which for simple attributes is reasonable but is not really a viable stratergy for complex objects, although this will not yeild and error. If a site_types list is provided, only site_types that match to strings in this list are written out. Parameters ----------- proteome : Proteome Proteome object from which the sites will be extracted from filename : str Filename that will be used to write the new sites file site_type : str (default = None) If provided, this is an identifier that allows you to specificy a specific site type to write out. delimiter : str (default = '\\t') Character (or characters) used to separate between fields. Default is the tab character ('\\t'), which is recommended to maintain compliance with default SHEPHARD file-reading functions. Returns -------- None No return type, but generates a new file with the complete set of sites from this proteome written to disk. """ # added so that we ensure site_types is a list if passed if site_types is not None: if type(site_types) is not list: raise InterfaceException('When passing a site_type this must be a list') with open(filename, 'w') as fh: for protein in proteome: for s in protein.sites: # if we're using site_types and the current sites if site_types is not None: if s.site_type not in site_types: continue # build a line # if the passed parameter site_types is being # used line = __build_site_line(s, delimiter) fh.write(f"{line}")
## ------------------------------------------------------------------------ ##
[docs]def write_sites_from_list(site_list, filename, delimiter='\t'): r""" Function that writes out sites to a SHEPHARD sites file from a list of Site objects. Note that attributes are converted to a string, which for simple attributes is reasonable but is not really a viable stratergy for complex objects, although this will not yeild and error. Parameters ----------- site_list : List of Site objects List of site objects which will be written filename : str Filename that will be used to write the new sites file delimiter : str (default = '\\t') Character (or characters) used to separate between fields. Default is '\\t' which is recommended to maintain compliance with default `add_sites_from_file()` function Returns -------- None No return type, but generates a new file with the complete set of sites from this proteome written to disk. """ # first check if items in the list are site objects for s in site_list: interface_tools.check_site(s, 'write_sites_from_list') with open(filename, 'w') as fh: # for each site in the list for s in site_list: # build a line # if the passed parameter site_types is being # used line = __build_site_line(s, delimiter) fh.write(f"{line}")
## ------------------------------------------------------------------------ ## def __build_site_line(s, delimiter): """ Internal function that takes a Site object and returns a line that can be written to a Sites file. This is called internally by functions that write Sites. Parameters ---------------------- s : shephard.Site Site object being converted to a string delimiter : str (default = '\\t') Character (or characters) used to separate between fields. Default is the tab character ('\\t'), which is recommended to maintain compliance with default SHEPHARD file-reading functions. Returns -------------- str Returns a string that is ready to be written to file """ # systematically construct each line in the file line = '' line = line + str(s.protein.unique_ID) + delimiter line = line + str(s.position) + delimiter line = line + str(s.site_type) + delimiter line = line + str(s.symbol) + delimiter # note last required element has no trailing delimiter line = line + str(s.value) if s.attributes: for k in s.attributes: atrbt = interface_tools.full_clean_string(s.attribute(k)) line = line + delimiter + f"{k}:{atrbt}" line = line + "\n" return line