# Source code for shephard.interfaces.si_protein_attributes

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""

from shephard.exceptions import InterfaceException
from . import interface_tools 
from shephard.exceptions import ProteinException
import shephard.exceptions as shephard_exceptions

# Upper limit on the number of unparseable lines that the file parser in this
# module will skip (printing a warning for each) when called with
# skip_bad=True. Once this many lines have been skipped, the next bad line
# raises an InterfaceException instead.
MAX_BAD_COUNT  = 10

class _ProteinAttributesInterface:

    """
    Parser and container for a SHEPHARD protein attributes file.

    The file is read once, at construction time, and the parsed contents
    are stored in ``self.data``. The leading underscore marks this class
    as private to this module.

    """

    def __init__(self, filename, delimiter='\t', skip_bad=True, preauthorized_uids=None):
        r"""
        Read and parse a protein attributes file.

        Expects files of the following format, with one entry per line and
        fields separated by ``delimiter``:

        Unique_ID, key1:value1, key2:value2, ..., keyn:valuen

        Comment lines (as identified by ``interface_tools.is_comment_line``)
        and lines that hold a unique ID but no key/value pairs are ignored.
        The same unique ID may appear on several lines; every such line
        contributes one entry to that ID's list.

        After construction ``self.data`` is a dictionary that maps each
        unique ID to a list holding, for every parsed line with that ID, the
        object returned by ``interface_tools.parse_key_value_pairs``
        (expected to be a dictionary of attribute key/value pairs).


        Parameters
        ----------------

        filename : str
            Name of the shephard protein attributes file to read


        Other Parameters
        ----------------

        delimiter : str (default = '\\t')
            String used as a delimiter on the input file. Cannot be ':'
            because that character separates keys from values.

        skip_bad : bool (default = True)
            Flag that means if bad lines (lines that trigger an exception
            while their key/value pairs are parsed) are encountered the code
            will just skip them. By default this is true, which adds a
            certain robustness to file parsing, but could also hide errors.
            Note that if lines are skipped a warning will be printed. At most
            MAX_BAD_COUNT lines are skipped; the next bad line raises.
            Default = True

        preauthorized_uids : list of str (default = None)
            List of unique_IDs that are expected to have relevant protein
            attributes. If None then all protein attributes are parsed.
            Avoids parsing lines that are not needed into the interface
            objects.


        Raises
        ----------------

        InterfaceException
            If ``delimiter`` is ':', or if a line cannot be parsed and either
            ``skip_bad`` is False or MAX_BAD_COUNT lines have already been
            skipped.

        """

        if delimiter == ':':
            raise InterfaceException('When parsing protein attributes file cannot use ":" as a delimiter because this is used to delimit key/value pairs (if provided)')

        # convert the preauthorized uids to a set for faster lookup
        if preauthorized_uids is not None:
            preauthorized_uids = set(preauthorized_uids)

        bad_count = 0
        ID2ADs = {}

        with open(filename, 'r') as fh:

            # line numbers are 1-based so they match what an editor shows
            for linecount, line in enumerate(fh, start=1):

                # skip comment lines
                if interface_tools.is_comment_line(line):
                    continue

                sline = line.strip().split(delimiter)
                unique_ID = sline[0].strip()

                # check if UID associated with this line is found in the
                # preauthorized list. If not then skip this line
                if preauthorized_uids is not None and unique_ID not in preauthorized_uids:
                    continue

                # skip over entries that carry no key/value pairs
                if len(sline) < 2:
                    continue

                # the key/value parsing is the step that can fail on a
                # malformed line, so it must sit inside the guarded region
                # for skip_bad to have any effect
                try:
                    attributes = interface_tools.parse_key_value_pairs(sline[1:], filename, linecount, line)
                except Exception as e:

                    msg = f'Failed parsing file [{filename}] on line [{linecount}].\n\nException raised: {str(e)}\n\nline printed below:\n{line}'

                    if skip_bad and bad_count < MAX_BAD_COUNT:
                        bad_count = bad_count + 1
                        shephard_exceptions.print_warning(msg + f"\nSkipping this line (count {bad_count} of {MAX_BAD_COUNT} ...)")
                        continue

                    raise InterfaceException(msg) from e

                # one list entry per parsed line for this unique ID
                ID2ADs.setdefault(unique_ID, []).append(attributes)

        self.data = ID2ADs



##############################################
##                                          ##
##     PUBLIC FACING FUNCTIONS BELOW        ##
##                                          ##
##############################################

## ------------------------------------------------------------------------
##
def add_protein_attributes_from_file(proteome, filename, delimiter='\t', return_dictionary=False, safe=True, skip_bad=True, verbose=True):
    r"""
    Function that takes a correctly formatted 'protein attributes' file and
    reads all attributes into the proteins in the passed proteome.

    The function expects protein attribute files to have the following
    format, with one protein defined per line (although the same protein can
    appear multiple times):

    Unique_ID, key1:value1, key2:value2, ..., keyn:valuen

    A couple of key points here:

    - The default delimiter is tabs ('\\t') but this can be changed with
      the delimiter argument

    - Key value must be separated by a ':', as a result any delimiter
      (other than ':') can be used, but ':' is reserved for this role

    Only lines whose unique ID is listed in ``proteome.proteins`` are parsed;
    all other lines in the file are ignored.

    Parameters
    ----------
    proteome : Proteome Object
        Proteome object to which attributes will be added.

    filename : str
        Name of the shephard protein attributes file to read.

    delimiter : str (default = '\\t')
        String used as a delimiter on the input file.

    return_dictionary : bool (default = False)
        If set to True, this function will return the protein_attributes
        dictionary and will NOT add that dictionary to the proteome - i.e.
        the function basically becomes a parser for SHEPHARD-compliant
        protein_attributes files.

    safe : bool (default = True)
        If set to True then any exceptions raised during the
        protein_attribute-adding process are acted on. If set to False,
        exceptions simply mean the protein_attribute in question is skipped.
        Note if set to False, pre-existing protein_attributes with the same
        name would be silently overwritten (although this is not considered
        an error), while overwriting will trigger an exception if safe=True.

        The only reason protein attribute addition could fail is if the
        attribute already exists, so this is effectively a flag to define if
        pre-existing attributes should be overwritten (False) or not (True).

    skip_bad : bool (default = True)
        Flag that means if bad lines (lines that trigger an exception) are
        encountered the code will just skip them. By default this is true,
        which adds a certain robustness to file parsing, but could also hide
        errors. Note that if lines are skipped a warning will be printed
        (regardless of verbose flag). skip_bad exclusively influences the
        file-reading part of the process.

    verbose : bool (default = True)
        Flag that defines how 'loud' output is. Will warn about errors on
        adding attributes.

    Returns
    -----------
    None or dict
        If return_dictionary is set to False (default) then this function
        has no return value, but the protein_attributes are added to the
        Proteome object passed as the first argument. If return_dictionary
        is set to True the function returns the parsed protein attributes
        dictionary (unique ID mapped to a list with one entry per parsed
        line) without adding the newly-read protein_attributes to the
        proteome.

    """

    # check first argument is a proteome; the label is what identifies this
    # function in any resulting error, so it must match the function name
    interface_tools.check_proteome(proteome, 'add_protein_attributes_from_file (si_protein_attributes)')

    # next read in the file, restricting parsing to IDs found in the proteome
    protein_attribute_interface = _ProteinAttributesInterface(filename, delimiter=delimiter, skip_bad=skip_bad, preauthorized_uids=proteome.proteins)

    if return_dictionary:
        return protein_attribute_interface.data

    # finally add the attributes from the dictionary generated by the
    # _ProteinAttributesInterface parser
    add_protein_attributes_from_dictionary(proteome, protein_attribute_interface.data, safe=safe, verbose=verbose)
## ------------------------------------------------------------------------ ##
def add_protein_attributes_from_dictionary(proteome, protein_attribute_dictionary, safe=True, verbose=True):
    r"""
    Function that takes a correctly formatted protein_attribute dictionary
    and will add those attributes to the proteins in the Proteome.

    Protein attribute dictionaries are key-value pairs, where the key is a
    unique ID and the value is a list of dictionaries. For each
    sub-dictionary, the key-value pair reflects the attribute key-value
    pairing.

    Unique IDs in the dictionary that do not match a protein in the proteome
    are ignored.

    Parameters
    ----------
    proteome : Proteome Object
        Proteome object to which attributes will be added

    protein_attribute_dictionary : dict
        Dictionary that defines protein attributes. This is slightly
        confusing, but the keys for this dictionary are unique protein IDs
        and the values are lists of dictionaries. Each of THOSE
        sub-dictionaries has one (or more) key:value pairs that will be
        associated with the protein of interest.

    safe : boolean (default = True)
        If set to True then any exceptions raised during the process of
        adding a protein_attribute are further raised. If set to False,
        exceptions simply mean the protein_attribute in question is skipped.
        Note if set to False, pre-existing protein_attributes with the same
        name would be silently overwritten (although this is not considered
        an error), while overwriting will trigger an exception if safe=True.
        Default = True

        The only reason protein attribute addition could fail is if the
        attribute already exists, so this is effectively a flag to define if
        pre-existing attributes should be overwritten (False) or not (True).

    verbose : bool (default = True)
        Flag that defines how 'loud' output is. Will warn about errors on
        adding attributes.

    Returns
    -----------
    None
        No return value, but attributes are added to proteins in the
        Proteome object passed as the first argument

    """

    # check first argument is a Proteome; the label is what identifies this
    # function in any resulting error, so it must match the function name
    interface_tools.check_proteome(proteome, 'add_protein_attributes_from_dictionary (si_protein_attributes)')

    for protein in proteome:

        # proteins without an entry in the dictionary are left untouched
        if protein.unique_ID not in protein_attribute_dictionary:
            continue

        # note here each AD is its own dictionary
        for AD in protein_attribute_dictionary[protein.unique_ID]:

            # for each attribute key/value pair
            for k, v in AD.items():
                try:
                    protein.add_attribute(k, v, safe=safe)
                except ProteinException as e:

                    msg = '- skipping attribute entry on protein %s (key: %s) ' % (protein.unique_ID, k)

                    # safe mode escalates the failure, otherwise the entry
                    # is dropped (with a warning if verbose)
                    if safe:
                        shephard_exceptions.print_and_raise_error(msg, e)
                    elif verbose:
                        shephard_exceptions.print_warning(msg)
## ------------------------------------------------------------------------ ##
def write_protein_attributes(proteome, filename, delimiter='\t'):
    r"""
    Write the attributes of every protein in a proteome to file in the
    standardized protein attributes format.

    Each protein that has at least one attribute yields a single line made
    of its unique ID followed by one ``key:value`` field per attribute, with
    fields separated by ``delimiter``. Proteins without attributes are not
    written.

    Note that attribute values are converted to strings, which for simple
    attributes is reasonable but is not really a viable strategy for complex
    objects, although this will not yield an error.

    Parameters
    -----------
    proteome : Proteome object
        Proteome object from which the protein attributes will be extracted

    filename : str
        Filename that will be used to write the new protein attributes file

    delimiter : str (default = '\\t')
        Character (or characters) used to separate between fields. Default
        is '\\t', which is recommended to maintain compliance with default
        `add_protein_attributes_from_file()` function.

    Returns
    --------
    None
        No return type, but generates a new file with the complete set of
        protein attributes from this proteome written to disk.

    """

    with open(filename, 'w') as out_handle:
        for protein in proteome:

            # nothing to report for proteins that carry no attributes
            if len(protein.attributes) == 0:
                continue

            # first field is the unique ID, then one key:value field per attribute
            fields = [protein.unique_ID]
            for name in protein.attributes:
                cleaned = interface_tools.full_clean_string(protein.attribute(name))
                fields.append("%s:%s" % (name, cleaned))

            out_handle.write(delimiter.join(fields) + "\n")
## ------------------------------------------------------------------------ ##
def write_protein_attributes_from_dictionary(protein_attribute_dictionary, filename, delimiter='\t'):
    r"""
    Function that writes out protein attributes held in a dictionary to file
    in a standardized format.

    Two layouts are accepted for the value stored under each unique ID:

    - a single dictionary of attribute key:value pairs, which is written as
      one line, or

    - a list of such dictionaries (the layout used by
      `add_protein_attributes_from_dictionary()` and returned by
      `add_protein_attributes_from_file()` when return_dictionary=True),
      in which case one line is written per dictionary.

    Empty dictionaries produce no output line.

    Note that attributes are converted to a string, which for simple
    attributes is reasonable but is not really a viable strategy for complex
    objects, although this will not yield an error.

    Parameters
    -----------
    protein_attribute_dictionary : dictionary
        Dictionary for which the protein IDs are keys and the values are
        either a dictionary of attribute key:value pairs or a list of such
        dictionaries.

    filename : str
        Filename that will be used to write the new protein attributes file

    delimiter : str (default = '\\t')
        Character (or characters) used to separate between fields. Default
        is '\\t', which is recommended to maintain compliance with default
        `add_protein_attributes_from_file()` function.

    Returns
    --------
    None
        No return type, but generates a new file with the complete set of
        protein attributes from this dictionary written to disk.

    """

    with open(filename, 'w') as fh:
        for unique_ID, entry in protein_attribute_dictionary.items():

            # a mapping is treated as a single entry; anything else is taken
            # to be a sequence of mappings (one per line to be written)
            attribute_dicts = [entry] if hasattr(entry, 'items') else entry

            for local_attributes in attribute_dicts:

                # skip over empty entries
                if len(local_attributes) == 0:
                    continue

                # first field is the unique ID, then one key:value field per attribute
                fields = [unique_ID]
                for k, v in local_attributes.items():
                    atrbt = interface_tools.full_clean_string(v)
                    fields.append("%s:%s" % (k, atrbt))

                fh.write(delimiter.join(fields) + "\n")