# Source code for shephard.interfaces.si_tracks

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""


from shephard.exceptions import InterfaceException, ProteinException, TrackException
import shephard.exceptions as shephard_exceptions
from . import interface_tools 
from shephard import general_utilities
import os

MAX_BAD_COUNT  = 10


class _TracksInterface:

    def __init__(self, filename, delimiter='\t', mode='values', skip_bad=True, preauthorized_uids=None):
        """
        Class for reading in correctly formatted tracks files for parsing 
        into a Proteome object.

        Tracks files must adhere to the following specification

            unique_ID, track_name, val_1, val_2, ...., val_n 

        where n = length of protein.

        This class allows a tracks file to be read in and defined as either 
        a values track file, or a symbols track file. The parsed result is 
        stored in ``self.data``, a dictionary that maps each unique_ID to a 
        list of dictionaries of the form 
        ``{'track_name': <str>, 'track_data': <list>}`` (one per parsed line).

        Parameters
        ----------------
        
        filename : str
            Name of the SHEPHARD Tracks file to read.


        Other Parameters
        ----------------

        delimiter : str (default = '\\t')
            String used as a delimiter on the input file. 

        mode : str (default = 'values')
            A selector that defines the type of track file to be read. 
            Must be either 'symbols' or 'values'. In 'values' mode every 
            entry after the track name is converted to a float; in 
            'symbols' mode entries are kept as whitespace-stripped strings.

        skip_bad : bool (default = True)
            Flag that means if bad lines (lines that trigger an exception) 
            are encountered the code will just skip them. By default this is 
            true, which adds a certain robustness to file parsing, but could 
            also hide errors. Note that if lines are skipped a warning will 
            be printed (regardless of verbose flag). At most MAX_BAD_COUNT 
            lines are skipped; the next bad line raises an exception.

        preauthorized_uids : list of str (default = None)
            List of unique_IDs that are allowed to be added to the track
            dictionary. If None then all tracks are allowed. Avoids parsing
            lines that are not needed into the interface objects


        Raises
        ----------------
        InterfaceException
            If mode is neither 'symbols' nor 'values', or if a line cannot 
            be parsed and either skip_bad is False or MAX_BAD_COUNT bad 
            lines have already been skipped.

        """

        # An invalid mode is a caller error rather than a property of any
        # single line. Validate it once, before parsing, so that skip_bad
        # cannot downgrade it to a handful of 'bad line' warnings and an
        # empty result.
        if mode not in ('values', 'symbols'):
            raise InterfaceException(f"Error: mode='{mode}' passed, yet this does not match 'symbols' or 'values'")

        with open(filename, 'r') as fh:
            content = fh.readlines()

        # convert the preauthorized uids to a set for faster lookup
        if preauthorized_uids is not None:
            preauthorized_uids = set(preauthorized_uids)

        ID2track = {}
        bad_count = 0

        # cycle over every line in the file (line numbers are 1-indexed so
        # they match what a text editor would show)
        for linecount, line in enumerate(content, start=1):

            # skip comment lines
            if interface_tools.is_comment_line(line):
                continue

            # chop off flanking whitespace and divide up using the delimiter
            sline = line.strip().split(delimiter)

            try:

                unique_ID = sline[0].strip()

                # check if UID associated with this line is found in the
                # preauthorized list. If not then skip this line before
                # spending time converting its values
                if preauthorized_uids is not None and unique_ID not in preauthorized_uids:
                    continue

                track_name = sline[1].strip()

                # everything after the track name is per-residue data
                if mode == 'values':
                    track_data = [float(i.strip()) for i in sline[2:]]
                else:
                    track_data = [i.strip() for i in sline[2:]]

                # one protein can carry several tracks, so collect them in a list
                ID2track.setdefault(unique_ID, []).append({'track_name':track_name, 'track_data':track_data})

            except Exception as e:

                msg = f'Failed parsing file [{filename}] on line [{linecount}].\n\nException raised: {str(e)}\n\nline printed below:\n{line}'

                # tolerate a bounded number of bad lines, then give up
                if skip_bad and bad_count < MAX_BAD_COUNT:
                    bad_count = bad_count + 1
                    shephard_exceptions.print_warning(msg + f"\nSkipping this line (count {bad_count} of {MAX_BAD_COUNT} ...)")
                    continue

                # keep the underlying error attached for debugging
                raise InterfaceException(msg) from e

        self.data = ID2track


## ------------------------------------------------------------------------
##
def __write_all_tracks_single_file(proteome, 
                                   outfile, 
                                   track_type,
                                   value_fmt = "%.3f", 
                                   delimiter='\t'):
    r"""
    Internal function that writes all tracks of one type associated with 
    a Proteome out to a single file. 

    See also:

        write_all_values_tracks_single_file() 
        write_all_symbols_tracks_single_file() 
    

    Parameters
    -----------

    proteome : Proteome object
        Proteome object from which the tracks will be extracted. Its 
        ``track_names_to_track_type`` mapping is used to select tracks.

    outfile : str
        String that defines the name of the output file.

    track_type : str
        Only track names whose entry in 
        ``proteome.track_names_to_track_type`` equals this value are 
        written. The public wrappers pass 'values' or 'symbols'.


    Other Parameters
    ----------------

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. 
        
    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. 
        Default is '\t' which is recommended to maintain compliance 
        with default `add_tracks_from_file()` function.

    
    Returns
    --------
    None
        No return type, but generates a new file with the selected 
        set of tracks from this Proteome written to disk.
        

    """

    # the context manager guarantees the handle is closed even if one of
    # the write_tracks() calls raises part-way through
    with open(outfile, 'w') as fh:

        # build a list of track names that match the requested track type
        tn2tt = proteome.track_names_to_track_type
        valid_names = [name for name in tn2tt if tn2tt[name] == track_type]

        # cycle through each track name we designated as valid; the filename
        # argument is None because output goes to the shared handle
        for t_name in valid_names:
            write_tracks(proteome, None, t_name, value_fmt, delimiter, file_handle=fh)


##############################################
##                                          ##
##     PUBLIC FACING FUNCTIONS BELOW        ##
##                                          ##
##############################################

## ------------------------------------------------------------------------
##
def add_tracks_from_file(proteome, filename, mode, delimiter='\t', return_dictionary=False, safe=True, skip_bad=True, verbose=True):
    r"""
    Read a SHEPHARD tracks file and add every Track it describes to the
    passed Proteome (or, optionally, just return the parsed dictionary).

    Tracks files hold one track per line in the following format:

    >>> Unique_ID    track_name    res1    res2    res3 .... resn

    where ``res1`` ... ``resn`` are the values (or symbols) for the 1st to
    nth residue of the protein, so each line should carry one entry per
    residue of the associated protein.

    Only lines whose unique ID is present in ``proteome.proteins`` are
    parsed; this filter is applied regardless of ``return_dictionary``.

    Parameters
    ----------
    proteome : Proteome
        Proteome object the tracks relate to.

    filename : str
        Name of the SHEPHARD tracks file to read.

    mode : str
        Selector that defines the type of tracks file being read. Must be
        either 'symbols' or 'values'.

    delimiter : str (default = '\t')
        String used as a delimiter on the input file.

    return_dictionary : bool (default = False)
        If True the parsed tracks dictionary is returned and nothing is
        added to the Proteome, i.e. the function acts as a parser.

    safe : bool (default = True)
        Forwarded to add_tracks_from_dictionary(), where it controls
        whether errors while adding a track are raised (True) or cause
        that track to be skipped (False).

    skip_bad : bool (default = True)
        If True, lines that fail to parse are skipped with a warning
        (up to a fixed limit) instead of raising immediately.

    verbose : bool (default = True)
        Forwarded to add_tracks_from_dictionary(); controls whether
        skipped tracks are reported when safe=False.

    Returns
    -----------
    None or dict
        None when the tracks were added to the Proteome, or the parsed
        tracks dictionary when return_dictionary is True.

    """

    # sanity check the inputs before touching the file
    interface_tools.check_proteome(proteome, 'add_tracks_from_file (si_tracks)')
    general_utilities.valid_keyword('mode', mode, ['symbols','values'])

    # parse the file, keeping only lines that belong to proteins we know about
    parsed_tracks = _TracksInterface(filename,
                                     delimiter=delimiter,
                                     mode=mode,
                                     skip_bad=skip_bad,
                                     preauthorized_uids=proteome.proteins).data

    # parser-only usage: hand the dictionary back untouched
    if return_dictionary:
        return parsed_tracks

    # otherwise push the parsed tracks into the proteome
    add_tracks_from_dictionary(proteome, parsed_tracks, mode, safe=safe, verbose=verbose)
## ------------------------------------------------------------------------ ##
def add_tracks_from_dictionary(proteome, tracks_dictionary, mode, safe=True, verbose=True):
    """
    Add the Tracks described by a tracks dictionary to the matching
    proteins in a Proteome.

    A tracks dictionary maps a protein unique ID to a list of
    sub-dictionaries, each holding two key-value pairs:

    * 'track_name' : name of the track (str)
    * 'track_data' : list of floats (values tracks) or strings (symbols
      tracks), expected to match the length of the associated protein.

    Entries whose unique ID is not in the Proteome are ignored.

    Parameters
    ------------
    proteome : Proteome Object
        Proteome object which tracks will be added to.

    tracks_dictionary : dict
        Dictionary in the format described above.

    mode : str
        Must be either 'symbols' or 'values'. With 'values' the track
        data is passed to ``protein.add_track`` as ``values``; otherwise
        it is passed as ``symbols``.

    safe : bool (default = True)
        If True, a ProteinException or TrackException raised while adding
        a track is handed to ``shephard_exceptions.print_and_raise_error``.
        If False, the offending track is skipped. The flag is also
        forwarded to ``protein.add_track``.

    verbose : boolean (default = True)
        When safe=False, controls whether a warning is printed for each
        skipped track.

    Returns
    -----------
    None
        No return value, but tracks are added to the Proteome object
        passed as the first argument.

    """

    # sanity check the inputs
    interface_tools.check_proteome(proteome, 'add_tracks_from_dictionary (si_tracks)')
    general_utilities.valid_keyword('mode', mode, ['symbols','values'])

    # the keyword used for add_track depends only on mode, so pick it once
    data_keyword = 'values' if mode == 'values' else 'symbols'

    for protein in proteome:

        # nothing to add for proteins that have no entry in the dictionary
        if protein.unique_ID not in tracks_dictionary:
            continue

        for entry in tracks_dictionary[protein.unique_ID]:

            name = entry['track_name']
            data = entry['track_data']

            try:
                protein.add_track(name, safe=safe, **{data_keyword: data})

            # an anticipated error occurred while adding this track
            except (ProteinException, TrackException) as e:

                msg='- skipping track at %s on %s' %(name, protein)

                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
## ------------------------------------------------------------------------ ##
def write_all_tracks_separate_files(proteome, outdirectory='.', value_fmt = "%.3f", delimiter='\t'):
    r"""
    Write every track in a Proteome to disk, one file per track name.

    Each unique track name in the Proteome is written with write_tracks()
    to a file called

        shephard_track_<trackname>.tsv

    inside ``outdirectory``. To write a single track use write_tracks();
    to collect tracks in one file see 
    write_all_values_tracks_single_file() and 
    write_all_symbols_tracks_single_file().

    Parameters
    -----------
    proteome : Proteome object
        Proteome object from which the tracks will be extracted.

    outdirectory : str (default = '.')
        Directory the files are written to. Defaults to the present
        working directory.

    value_fmt : str (default = "%.3f")
        Format string that will be used for values.

    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields.
        Default is '\t', which is recommended to maintain compliance
        with the default `add_tracks_from_file()` function.

    Returns
    --------
    None
        No return type, but generates one new file per track name.

    """

    for track_name in proteome.unique_track_names:

        # one output file per track name, placed in the requested directory
        basename = "shephard_track_%s.tsv" %( track_name)
        target = os.path.join(outdirectory, basename)

        write_tracks(proteome, target, track_name, value_fmt, delimiter)
## ------------------------------------------------------------------------ ##
def write_all_values_tracks_single_file(proteome, outfile, value_fmt = "%.3f", delimiter='\t'):
    r"""
    Write all values tracks associated with a Proteome to a single file.

    Thin wrapper that selects track names registered with type 'values'.
    If only one track is needed use write_tracks(); if one file per track
    is preferred use write_all_tracks_separate_files().

    Parameters
    -----------
    proteome : Proteome object
        Proteome object from which the tracks will be extracted.

    outfile : str
        String that defines the name of the output file.

    value_fmt : str (default = "%.3f")
        Format string that will be used for values.

    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields.
        Default is '\t', which is recommended to maintain compliance
        with the default `add_tracks_from_file()` function.

    Returns
    --------
    None
        No return type, but generates a new file with the values tracks
        from this Proteome written to disk.

    """

    return __write_all_tracks_single_file(proteome,
                                          outfile,
                                          track_type='values',
                                          value_fmt=value_fmt,
                                          delimiter=delimiter)
## ------------------------------------------------------------------------ ##
def write_all_symbols_tracks_single_file(proteome, outfile, value_fmt = "%.3f", delimiter='\t'):
    r"""
    Write all symbols tracks associated with a Proteome to a single file.

    Thin wrapper that selects track names registered with type 'symbols'.
    If only one track is needed use write_tracks(); if one file per track
    is preferred use write_all_tracks_separate_files().

    Parameters
    -----------
    proteome : Proteome object
        Proteome object from which the tracks will be extracted.

    outfile : str
        String that defines the name of the output file.

    value_fmt : str (default = "%.3f")
        Format string passed through to the underlying writer. It is
        still validated there even though symbols are written as-is.

    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields.
        Default is '\t', which is recommended to maintain compliance
        with the default `add_tracks_from_file()` function.

    Returns
    --------
    None
        No return type, but generates a new file with the symbols tracks
        from this Proteome written to disk.

    """

    return __write_all_tracks_single_file(proteome,
                                          outfile,
                                          track_type='symbols',
                                          value_fmt=value_fmt,
                                          delimiter=delimiter)
## ------------------------------------------------------------------------ ##
def write_tracks(proteome, filename, track_name, value_fmt = "%.3f", delimiter='\t', file_handle=None):
    r"""
    Function that writes out a specific track to file in a standardized
    format. 

    Because track files are inevitably quite big, only a single track 
    (defined by track_name) is written per call. To write every track see 
    write_all_tracks_separate_files(), write_all_values_tracks_single_file() 
    or write_all_symbols_tracks_single_file().

    Proteins that do not have a track called track_name are skipped.

    Parameters
    -----------
    proteome : Proteome object
        Proteome object from which the tracks will be extracted.

    filename : str
        Filename that will be used to write the new tracks file. Ignored 
        if file_handle is passed.

    track_name : str
        Name of the track to be written out.

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. Note that this is not 
        a smart value so if the actual value used means that %.3f loses 
        all meaning this will not trigger a warning, so, be careful!

    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. 
        Default is '\t' which is recommended to maintain compliance with 
        default `add_tracks_from_file()` function.

    file_handle : filehandle (_io.TextIOWrapper) or None
        If passed, output is written to this handle rather than to a new 
        file, and the handle is left open for the caller to manage. 

    Returns
    --------
    None
        No return type, but writes the requested track to disk (or to the 
        passed file handle).

    Raises
    --------
    InterfaceException
        If value_fmt cannot format the float 1.5 into a string that 
        parses back to exactly 1.5.

    """

    # Test the passed value_fmt string works. This is not foolproof but at
    # least validates that the string can round-trip a float, which is
    # required for values to be read back in by shephard. Formatting can
    # fail with TypeError (e.g. no conversion specifier) or ValueError
    # (e.g. unsupported conversion character), and float() raises
    # ValueError when the format adds non-numeric text; all of these mean
    # the format is unusable.
    try:
        fmt_ok = float(value_fmt %( 1.5 )) == 1.5
    except (TypeError, ValueError):
        fmt_ok = False

    if not fmt_ok:
        raise InterfaceException('Invalid value_fmt passed [%s]'%(str(value_fmt)))

    # only close the handle if we opened it; a caller-supplied handle stays
    # under the caller's control
    owns_handle = file_handle is None
    fh = open(filename, 'w') if owns_handle else file_handle

    try:
        for protein in proteome:

            # try and extract out the track in question (None if absent)
            t = protein.track(track_name, safe=False)

            if t is not None:
                fh.write(__build_track_line(t, delimiter, value_fmt))

    finally:
        # make sure our own file is closed even if a write fails
        if owns_handle:
            fh.close()
## ------------------------------------------------------------------------
##
def __build_track_line(t, delimiter, value_fmt):
    """
    Internal function that takes a Track object and returns a line that 
    can be written to a Track file. This is called internally by functions
    that write Tracks.

    The line has the form

        unique_ID<delimiter>track_name<delimiter>v1<delimiter>...vn<delimiter>

    followed by a newline. Note that every field, including the last one, 
    is followed by the delimiter.

    Parameters
    ----------------------
    t : shephard.Track
        Track object being converted to a string. If ``t.values`` is not 
        None the values are written, otherwise ``t.symbols`` are written.

    delimiter : str
        Character (or characters) used to separate between fields. The 
        tab character ('\\t') is recommended to maintain compliance with 
        default SHEPHARD file-reading functions.

    value_fmt : str
        printf-style format string applied to each entry of ``t.values``. 
        Not used for symbols tracks.

    Returns
    --------------
    str
        Returns a string that is ready to be written to file

    """

    unique_ID = t.protein.unique_ID

    # leading fields: protein ID and track name
    header = f"{unique_ID}{delimiter}{t.name}{delimiter}"

    # per-residue fields, formatted for values or taken as-is for symbols
    if t.values is not None:
        fields = [value_fmt %(v) for v in t.values]
    else:
        fields = t.symbols

    # join once rather than growing the string inside a loop; each field
    # keeps its trailing delimiter so the output matches existing files
    body = "".join("%s%s" % (field, delimiter) for field in fields)

    return header + body + "\n"


## ------------------------------------------------------------------------
##
def write_tracks_from_list(track_list, filename, value_fmt = "%.3f", delimiter='\t'):
    r"""
    Function that writes out tracks to a SHEPHARD tracks file from a list 
    of Track objects. 

    A single tracks file cannot have both values and symbols tracks, so 
    the list is checked for consistency before anything is written. An 
    empty list produces an empty file.

    Parameters
    -----------
    track_list : List of Track objects
        List of track objects which will be written. All tracks must be 
        of the same type (all values tracks or all symbols tracks).

    filename : str
        Filename that will be used to write the new tracks file.

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. Note that this is not 
        a smart value so if the actual value used means that %.3f loses 
        all meaning this will not trigger a warning, so, be careful!

    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. 
        Default is '\t' which is recommended to maintain compliance with 
        default `add_tracks_from_file()` function.

    Returns
    --------
    None
        No return type, but generates a new file with the passed tracks 
        written to disk.

    Raises
    --------
    InterfaceException
        If the list mixes values and symbols tracks, or if value_fmt 
        cannot format the float 1.5 into a string that parses back to 
        exactly 1.5.

    """

    # first check if items in the list are track objects
    for t in track_list:
        interface_tools.check_track(t, 'write_tracks_from_list')

    ## START OF SANITY CHECKING FOR CONSISTENT TRACK TYPES
    ##
    ## The first track defines the expected type and every track is then
    ## compared against it. An empty list has no first track and nothing
    ## to compare, so the check is skipped rather than failing on [0].
    ##
    if len(track_list) > 0:

        first = track_list[0]

        if first.values is None:
            # first track is a symbols track, so no track may carry values
            for t in track_list:
                if t.values is not None:
                    raise InterfaceException(f'First track [{first}] was a symbols track, yet track {t} is a values track. All tracks in the list must be the same type')
        else:
            # first track is a values track, so no track may carry symbols
            for t in track_list:
                if t.symbols is not None:
                    raise InterfaceException(f'First track [{first}] was a values track, yet track {t} is a symbols track. All tracks in the list must be the same type')

    ## END OF SANITY CHECKING FOR CONSISTENT TRACK TYPES

    # Test the passed value_fmt string works. This is not foolproof but at
    # least validates that the string can round-trip a float, which is
    # required for values to be read back in by shephard. Both TypeError
    # and ValueError can come out of the formatting / float() steps for an
    # unusable format string.
    try:
        fmt_ok = float(value_fmt %( 1.5 )) == 1.5
    except (TypeError, ValueError):
        fmt_ok = False

    if not fmt_ok:
        raise InterfaceException(f'Invalid value_fmt passed [{value_fmt}]')

    with open(filename, 'w') as fh:

        # one line per track in the list
        for t in track_list:
            fh.write(__build_track_line(t, delimiter, value_fmt))