# Source code for shephard.interfaces.si_proteins

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""

from shephard.exceptions import InterfaceException
from . import interface_tools 
from shephard.exceptions import ProteinException, ProteomeException
import shephard.exceptions as shephard_exceptions

# Maximum number of malformed lines that _ProteinsInterface will skip (with a
# warning) when skip_bad is True. Once this many lines have been skipped, the
# next malformed line raises an InterfaceException instead.
MAX_BAD_COUNT  = 10

class _ProteinsInterface:
    """
    Parser for SHEPHARD proteins files.

    The file is read once when the object is constructed and the parsed
    result is stored in ``self.data``: a dictionary that maps each unique ID
    to a dictionary with the keys ``'name'``, ``'sequence'`` and
    ``'attributes'``. This is a hidden class and is not meant to be used
    outside of this file.

    """

    def __init__(self, filename, delimiter='\t', skip_bad=True):
        r"""
        Expect files of the following format:

        Unique_ID, name, sequence, key1:value1, key2:value2, ..., keyn:valuen

        NOTE that each unique_ID can ONLY appear once!

        Parameters
        ----------------

        filename : str
            Name of the shephard proteins file to read

        delimiter : str (default = '\\t')
            String used as a delimiter on the input file. Cannot be ':'
            because that character separates keys from values.

        skip_bad : bool (default = True)
            Flag that means if bad lines (lines missing one of the three
            mandatory fields) are encountered the code will just skip them.
            By default this is true, which adds a certain robustness to file
            parsing, but could also hide errors. A warning is printed for
            every skipped line, and at most MAX_BAD_COUNT lines are skipped
            before an exception is raised instead.

        Raises
        ----------------
        InterfaceException
            If the delimiter is ':', if a malformed line is found and cannot
            be skipped, or if the same unique ID appears more than once (the
            latter is never skippable).

        """

        if delimiter == ':':
            raise InterfaceException('When parsing protein file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)')

        bad_count = 0
        ID2protein = {}

        with open(filename, 'r') as fh:

            # line numbers are 1-based so they match what an editor shows
            for linecount, line in enumerate(fh, start=1):

                # skip comment lines
                if interface_tools.is_comment_line(line):
                    continue

                sline = line.strip().split(delimiter)

                # the three mandatory fields; a short line is the only way
                # this can fail, hence IndexError
                try:
                    unique_ID = sline[0].strip()
                    name = sline[1].strip()
                    sequence = sline[2].strip()
                except IndexError as e:
                    msg = f'Failed parsing file [{filename}] on line [{linecount}].\n\nException raised: {str(e)}\n\nline printed below:\n{line}'

                    if skip_bad and bad_count < MAX_BAD_COUNT:
                        bad_count = bad_count + 1
                        shephard_exceptions.print_warning(msg + f"\nSkipping this line (count {bad_count} of {MAX_BAD_COUNT} ...)")
                        continue

                    raise InterfaceException(msg) from e

                # if some key/value pairs were included then parse these out,
                # otherwise the protein simply has no attributes
                if len(sline) > 3:
                    attributes = interface_tools.parse_key_value_pairs(sline[3:], filename, linecount, line)
                else:
                    attributes = {}

                # duplicates are a hard error regardless of skip_bad
                if unique_ID in ID2protein:
                    raise InterfaceException("Duplicate protein found in the file %s (offending UID=%s). This cannot be skipped" % (filename, unique_ID))

                ID2protein[unique_ID] = {'name':name, 'sequence':sequence, 'attributes':attributes}

        self.data = ID2protein



##############################################
##                                          ##
##     PUBLIC FACING FUNCTIONS BELOW        ##
##                                          ##
##############################################


## ------------------------------------------------------------------------
##
def add_proteins_from_file(proteome, filename, delimiter='\t', return_dictionary=False, safe=True, skip_bad=True, verbose=True):
    r"""
    Function that takes a correctly formatted 'protein' file and reads
    every protein into the passed proteome.

    The function expects protein files to have the following format:

    >>> Unique_ID    name    sequence    key_1:value_1    key_2:value_2 ... key_n:value_n

    One protein is defined per line (with NO duplicates allowed - duplicate
    entries in the file will trigger an un-rescuable error). The key:value
    pairs are optional, and there can be between 0 and n of them.

    **A couple of key points here**:

    * The default delimiter is tabs ('\\t') but this can be changed with
      the delimiter argument.

    * Keys and values must be separated by a ':'. As a result any
      delimiter other than ':' can be used, but ':' is reserved for this role.

    * If a protein with the UID from the file exists in the passed proteome
      then this will throw an exception unless safe=False.

    Parameters
    ----------
    proteome : Proteome
        Proteome object to which proteins will be added

    filename : str
        Name of the shephard proteins file to read

    Other Parameters
    ----------------
    delimiter : str (default = '\\t')
        String used as a delimiter on the input file.

    return_dictionary : bool (default = False)
        If set to true, this function will return the protein dictionary
        and will NOT add that dictionary to the proteome - i.e. the function
        basically becomes a parser for SHEPHARD-compliant protein files.
        Note the proteome argument is still checked in this mode.

    safe : bool (default = True)
        Passed through to add_proteins_from_dictionary(), where it controls
        whether problems adding a protein (e.g. a duplicate) raise an
        exception (True) or are tolerated (False).

    skip_bad : bool (default = True)
        Flag that means if bad lines (lines that trigger an exception) are
        encountered the code will just skip them. By default this is true,
        which adds a certain robustness to file parsing, but could also hide
        errors. Note that if lines are skipped a warning will be printed
        (regardless of verbose flag). skip_bad exclusively influences the
        file-reading part of the process.

    verbose : bool (default = True)
        Flag that defines how 'loud' output is. Passed through to
        add_proteins_from_dictionary().

    Returns
    -----------
    None or dict
        If return_dictionary is set to False (default) then this function
        has no return value, but the proteins are added to the Proteome
        object passed as the first argument. If return_dictionary is set to
        True the function returns the parsed proteins dictionary without
        adding the newly-read proteins to the proteome.

    """

    # the first argument must be a proteome, even when we only parse
    interface_tools.check_proteome(proteome, 'add_proteins_from_file (si_protein)')

    # parse the file into a {UID: {...}} dictionary
    parsed_proteins = _ProteinsInterface(filename, delimiter=delimiter, skip_bad=skip_bad).data

    if not return_dictionary:
        # default mode: push the parsed proteins into the proteome
        add_proteins_from_dictionary(proteome, parsed_proteins, safe=safe, verbose=verbose)
        return None

    # parser-only mode: hand back the dictionary and leave the proteome alone
    return parsed_proteins
## ------------------------------------------------------------------------ ##
def add_proteins_from_dictionary(proteome, protein_dictionary, safe=True, verbose=True):
    """
    Function that takes a correctly formatted protein dictionary and will
    add those proteins to the Proteome.

    Protein dictionaries are key-value pairs, where the key is a unique ID
    and the value is itself a dictionary which has the following keys:

    * **name** - Protein name (uncontrolled vocabulary, but should be a string)

    * **sequence** - Amino acid sequence for the protein (note that no
      sanity checking is done here)

    * **attributes** - Dictionary of arbitrary key:value pairings
      (optional; if the key is missing, None is passed as the attributes)

    Parameters
    ----------
    proteome : Proteome
        Proteome object to which proteins will be added

    protein_dictionary : dict
        Dictionary that defines proteins. The keys are unique protein IDs
        and each value is a dictionary with the keys described above.

    safe : bool (default = True)
        If set to True, a ProteinException or ProteomeException raised while
        adding a protein is handed to
        shephard_exceptions.print_and_raise_error(). If set to False,
        proteins are added with force_overwrite=True, and any such exception
        just means the protein in question is skipped.

    verbose : bool (default = True)
        Flag that defines how 'loud' output is. When safe is False, a
        warning is printed for every skipped protein only if verbose is True.

    Returns
    -----------
    None
        No return value, but proteins are added to the Proteome object
        passed as the first argument.

    """

    # check first argument is a Proteome
    interface_tools.check_proteome(proteome, 'add_protein_from_dictionary (si_proteins)')

    # only an explicit safe=False switches on overwriting
    force_overwrite = safe is False

    # for each entry in the overall dictionary
    for UID in protein_dictionary:

        entry = protein_dictionary[UID]

        # attributes are optional; note we expect ats to be a dictionary
        try:
            ats = entry['attributes']
        except KeyError:
            ats = None

        s = entry['sequence']

        # note we use the clean_string to remove tab characters from
        # the name should they exist
        n = interface_tools.clean_string(entry['name'])

        try:
            proteome.add_protein(s, n, UID, attributes=ats, force_overwrite=force_overwrite)

        except (ProteinException, ProteomeException) as e:

            msg = '- skipping protein %s (name = %s, len=%i)' % (UID, n, len(s))

            if safe:
                shephard_exceptions.print_and_raise_error(msg, e)
            elif verbose:
                shephard_exceptions.print_warning(msg)
## ------------------------------------------------------------------------ ##
def write_proteins(proteome, filename, delimiter='\t'):
    """
    Function that writes out proteins to file in a standardized format.
    Note that attributes are converted to a string, which for simple
    attributes is reasonable but is not really a viable strategy for complex
    objects, although this will not yield an error.

    Writes out files with the format:

    >>> Unique_ID    name    sequence    key_1:value_1    key_2:value_2 ... key_n:value_n

    Parameters
    -----------
    proteome : Proteome
        Proteome object from which the proteins will be extracted

    filename : str
        Filename that will be used to write the new proteins file

    Other Parameters
    ----------------
    delimiter : str (default = '\\t')
        Character (or characters) used to separate between fields.
        Default is '\\t', which is recommended to maintain compliance
        with the default `add_proteins_from_file()` function

    Returns
    --------
    None
        No return type, but generates a new file with the complete set of
        proteins (and their attributes) from this proteome written to disk.

    """

    with open(filename, 'w') as fh:
        for protein in proteome:

            # the three mandatory fields; the name is cleaned so it cannot
            # contain the delimiter
            fields = [
                protein.unique_ID,
                interface_tools.clean_string(protein.name, delimiter=delimiter),
                protein.sequence,
            ]

            # one extra key:value field per attribute (none if there are
            # no attributes)
            for key in protein.attributes:
                value = interface_tools.full_clean_string(protein.attribute(key))
                fields.append("%s:%s" % (key, value))

            fh.write(delimiter.join(fields) + "\n")