# Source code for shephard.interfaces.si_tracks

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""


from shephard.exceptions import InterfaceException, ProteinException, TrackException
import shephard.exceptions as shephard_exceptions
from . import interface_tools 
from shephard import general_utilities
import os

MAX_BAD_COUNT  = 10


class _TracksInterface:
    """
    Parser for SHEPHARD tracks files.

    After construction the parsed content is available as ``self.data``,
    a dictionary that maps each unique_ID to a list of dictionaries, each
    of which has the two keys 'track_name' and 'track_data'.
    """

    def __init__(self, filename, delimiter='\t', mode='values', skip_bad=True, preauthorized_uids=None):
        """
        Class for reading in correctly formatted tracks files for parsing
        into a Proteome object.

        Tracks files must adhere to the following specification

            unique_ID, track_name, val_1, val_2, ...., val_n

        where n = length of protein.

        This class allows a tracks file to be read in and defined as either
        a values track file, or a symbols track file, building a tracks
        dictionary that is stored in ``self.data``.

        Parameters
        ----------------

        filename : str
            Name of the SHEPHARD Tracks file to read.


        Other Parameters
        ----------------

        delimiter : str (default = '\\t')
            String used as a delimiter on the input file.

        mode : str (default = 'values')
            A selector that defines the type of track file to be read.
            Must be either 'symbols' or 'values'. In 'values' mode every
            entry after the track name is converted to a float; in
            'symbols' mode entries are kept as whitespace-stripped strings.

        skip_bad : bool (default = True)
            Flag that means if bad lines (lines that trigger an exception)
            are encountered the code will just skip them. By default this is
            true, which adds a certain robustness to file parsing, but could
            also hide errors. Note that if lines are skipped a warning will
            be printed. At most MAX_BAD_COUNT lines are skipped; the next
            bad line after that raises an exception.

        preauthorized_uids : list of str (default = None)
            List of unique_IDs that are allowed to be added to the track
            dictionary. If None then all tracks are allowed. Avoids parsing
            lines that are not needed into the interface objects.

        Raises
        ----------------
        InterfaceException
            If mode is not 'symbols' or 'values', or if a line cannot be
            parsed and either skip_bad is False or MAX_BAD_COUNT lines
            have already been skipped.

        """

        # Validate the mode once, before touching the file. Doing this
        # per-line inside the parsing try-block would let skip_bad swallow
        # the error as if it were a malformed line.
        if mode not in ('values', 'symbols'):
            raise InterfaceException(f"Error: mode='{mode}' passed, yet this does not match 'symbols' or 'values'")

        # convert the preauthorized uids to a set for faster lookup
        if preauthorized_uids is not None:
            preauthorized_uids = set(preauthorized_uids)

        bad_count = 0
        ID2track = {}

        with open(filename, 'r') as fh:

            # cycle over every line in the file (line numbers are 1-based
            # so they match what an editor would show)
            for linecount, line in enumerate(fh, start=1):

                # skip comment lines
                if interface_tools.is_comment_line(line):
                    continue

                # chop off leading/lagging whitespace and divide up using
                # the delimiter
                sline = line.strip().split(delimiter)

                try:
                    unique_ID = sline[0].strip()

                    # if a preauthorized set was provided, ignore lines
                    # for any unique_ID not in that set
                    if preauthorized_uids is not None and unique_ID not in preauthorized_uids:
                        continue

                    # IndexError here means the line has fewer than two fields
                    track_name = sline[1].strip()

                    if mode == 'values':
                        # ValueError here means an entry is not a valid float
                        track_data = [float(i.strip()) for i in sline[2:]]
                    else:
                        track_data = [i.strip() for i in sline[2:]]

                except (IndexError, ValueError) as e:

                    msg = f'Failed parsing file [{filename}] on line [{linecount}].\n\nException raised: {str(e)}\n\nline printed below:\n{line}'

                    if skip_bad and bad_count < MAX_BAD_COUNT:
                        bad_count = bad_count + 1
                        shephard_exceptions.print_warning(msg + f"\nSkipping this line (count {bad_count} of {MAX_BAD_COUNT} ...)")
                        continue

                    raise InterfaceException(msg) from e

                # a unique_ID may appear on several lines (one per track)
                ID2track.setdefault(unique_ID, []).append({'track_name':track_name, 'track_data':track_data})

        self.data = ID2track


## ------------------------------------------------------------------------
##
def __write_all_tracks_single_file(proteome, 
                                   outfile, 
                                   track_type,
                                   value_fmt = "%.3f", 
                                   delimiter='\t'):
    r"""
    Internal function that writes all tracks of one type associated with
    a Proteome out to a single file.

    See also:

        write_all_values_tracks_single_file() 
        write_all_symbols_tracks_single_file() 
    

    Parameters
    -----------

    proteome : Proteome object
        Proteome object from which the tracks will be extracted.

    outfile : str
        String that defines the name of the output file.

    track_type : str
        Only track names whose entry in 
        ``proteome.track_names_to_track_type`` equals this value are 
        written. The public wrappers in this module pass 'values' or 
        'symbols'.


    Other Parameters
    ----------------

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. 
        
    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. 
        Default is '\t' which is recommended to maintain compliance 
        with the default `add_tracks_from_file()` function.

    
    Returns
    --------
    None
        No return type, but generates a new file with the complete 
        set of matching tracks from this Proteome written to disk.
        

    """

    # build the list of track names that match the requested type
    tn2tt = proteome.track_names_to_track_type
    valid_names = [name for name in tn2tt if tn2tt[name] == track_type]

    # the context manager guarantees the handle is closed even if one of
    # the write_tracks() calls raises
    with open(outfile, 'w') as fh:

        # cycle through each track name we designated as valid; filename
        # is None because write_tracks() ignores it when given a handle
        for t_name in valid_names:
            write_tracks(proteome, None, t_name, value_fmt, delimiter, file_handle=fh)


##############################################
##                                          ##
##     PUBLIC FACING FUNCTIONS BELOW        ##
##                                          ##
##############################################

## ------------------------------------------------------------------------
##
def add_tracks_from_file(proteome, filename, mode, delimiter='\t', return_dictionary=False, safe=True, skip_bad=True, verbose=True):
    r"""
    Function that takes a correctly formatted shephard 'tracks' file and reads 
    all Tracks into the passed Proteome.

    Expect Track files to have the following format:

    One track per line, where each line has the following information:
    
    >>> Unique_ID    track_name    res1    res2    res3 .... resn

    Where ``res1``, ``res2``, ``resn`` are symbols or values to be mapped to 
    the 1st, 2nd, or nth residue. There should be the same number of res1, 2, 
    ...n entries as there are residues in the associated protein.
    
    A couple of key points here:

    - The default delimiter is tabs ('\t') but this can be changed with 
      the delimiter argument

    - Each track must assign a value or a symbol to EVERY residue in the 
      protein

    - Only lines whose unique_ID is found in ``proteome.proteins`` are 
      parsed; all other lines are ignored.
    
    Parameters
    ----------

    proteome : Proteome
        Proteome object 

    filename : str
        Name of the shephard Tracks file to read

    mode : str
        A selector that defines the type of track file to be read. 
        Must be either 'symbols' or 'values'. This argument is required.

    delimiter : str (default = '\t')
        String used as a delimiter on the input file. 

    return_dictionary : bool (default = False)
        If set to true, this function will return the tracks dictionary 
        and will NOT add that dictionary to the Proteome - i.e. the function 
        basically becomes a parser for SHEPHARD-compliant tracks files. 
        
    safe : bool (default = True)
        If set to True then any exceptions raised during the Track-adding 
        process (i.e. after file parsing) are acted on. If set to False, 
        exceptions simply mean the track in question is skipped. Note if set 
        to False pre-existing tracks with the same name would be silently 
        overwritten (although this is not considered an error), while 
        overwriting will trigger an exception in safe=True. There are various 
        reasons track addition could fail (e.g. track does not match length 
        of protein). It is highly recommended that if you choose to use 
        safe=False you also set verbose=True. 
        
    skip_bad : bool (default = True)
        Flag that means if bad lines (lines that trigger an exception) are 
        encountered the code will just skip them. By default this is true, 
        which adds a certain robustness to file parsing, but could also hide 
        errors. Note that if lines are skipped a warning will be printed 
        (regardless of verbose flag). 

    verbose : bool (default = True)
        Flag that defines how 'loud' output is. Will warn about errors on 
        adding tracks.

    Returns
    -----------
    None or dict
        If return_dictionary is set to False (default) then this function 
        has no return value, but the tracks are added to the Proteome object
        passed as the first argument. If return_dictionary is set to True the 
        function returns the parsed tracks dictionary without adding the 
        newly-read tracks to the proteome.
        
    """        

    # check first argument is a Proteome
    interface_tools.check_proteome(proteome, 'add_tracks_from_file (si_tracks)')

    # check mode is valid
    general_utilities.valid_keyword('mode', mode, ['symbols','values'])
    
    # next read in the file; passing the proteome's unique IDs means lines
    # for proteins that are not in the proteome are never parsed
    tracks_interface = _TracksInterface(filename,
                                        delimiter=delimiter,
                                        mode=mode,
                                        skip_bad=skip_bad,
                                        preauthorized_uids=proteome.proteins)

    # parser-only mode: hand back the dictionary and leave the proteome alone
    if return_dictionary:
        return tracks_interface.data

    # finally add the tracks from the dictionary generated by the 
    # _TracksInterface parser
    add_tracks_from_dictionary(proteome, tracks_interface.data, mode, safe=safe, verbose=verbose)



## ------------------------------------------------------------------------
##
def add_tracks_from_dictionary(proteome, tracks_dictionary, mode, safe=True, verbose=True):
    """
    Function that takes a correctly formatted Tracks dictionary and 
    will add those Tracks to the proteins in the Proteome.
    
    Track dictionaries are key-value pairs, where the key is a unique 
    ID and the value is a list of dictionaries. For each sub-dictionary, 
    there are two key-value pairs that reflect:

       * 'track_name'  : name of the track (str)

       * 'track_data'  : parsed list of floats (if expecting values) or strings (if expecting symbols) that should equal the length of the associated protein.

    Entries in the dictionary whose unique ID does not match a protein 
    in the Proteome are ignored.
                       
    Parameters
    ------------

    proteome : Proteome Object
        Proteome object which tracks will be added to

    tracks_dictionary : dict
        Dictionary in which keys are unique IDs for proteins and the value 
        is a list of dictionaries, where each subdictionary has the two 
        key-value pairs:
        
        * **track_name**  : name of the track (str)
        * **track_data**  : parsed list of floats (if expecting values) or strings (if expecting symbols) that should equal the length of the associated protein.
                            
    mode : str
        A selector that defines how track_data is interpreted. Must be 
        either 'symbols' or 'values'. This argument is required.

    safe : bool (default = True)
        If set to True then any exceptions raised during the track-adding 
        process are acted on. If set to False, exceptions simply mean the 
        Track in question is skipped. Note if set to False, pre-existing 
        Tracks with the same name would be silently overwritten (although 
        this is not considered an error), while overwriting will trigger an 
        exception in safe=True. There are various reasons Track addition 
        could fail (length does not match the protein etc) and so if 
        verbose=True then the cause of an exception is also printed to 
        screen. It is highly recommended that if you choose to use 
        safe=False you also set verbose=True. 

    verbose : boolean (default = True)
        Flag that defines how 'loud' output is. Will warn about errors on 
        adding tracks.
        
    Returns
    -----------
    None
        No return value, but tracks are added to the Proteome object 
        passed as the first argument.
        
    """
        
    # check first argument is a proteome
    interface_tools.check_proteome(proteome, 'add_tracks_from_dictionary (si_tracks)')

    # check mode is valid
    general_utilities.valid_keyword('mode', mode, ['symbols','values'])
    
    # cycle through each protein in the proteome...
    for protein in proteome:

        # ...and ignore proteins that have no entry in the dictionary
        if protein.unique_ID not in tracks_dictionary:
            continue

        for track in tracks_dictionary[protein.unique_ID]:

            # get the track name and vector info
            track_name = track['track_name']
            track_data = track['track_data']

            # add the track as either values or symbols depending 
            # on the mode
            try:
                if mode == 'values':
                    protein.add_track(track_name, values=track_data, safe=safe)
                else:
                    protein.add_track(track_name, symbols=track_data, safe=safe)

            # a ProteinException or TrackException means some anticipated
            # error occurred when trying to add the track
            except (ProteinException, TrackException) as e:

                msg='- skipping track at %s on %s' %(track_name, protein)

                if safe:
                    # presumably prints msg and re-raises e; see 
                    # shephard.exceptions for the exact behaviour
                    shephard_exceptions.print_and_raise_error(msg, e)

                elif verbose:
                    # include the cause so the user can see WHY the track
                    # was skipped, as the docstring promises
                    shephard_exceptions.print_warning(f'{msg}: {e}')





## ------------------------------------------------------------------------
##
def write_all_tracks_separate_files(proteome, 
                     outdirectory='.', 
                     value_fmt = "%.3f", 
                     delimiter='\t'):

    r"""
    Function that writes all tracks associated with a proteome out to separate
    files. This may be preferable in some situations, but in others maybe only
    a subset of tracks are requested, for which write_tracks() would be good, 
    or alternatively you want all tracks in a single file, in which case
    write_all_values_tracks_single_file() or 
    write_all_symbols_tracks_single_file() would be the way to go.

    The output filenames are defined as:
        
    > `shephard_track_<trackname>.tsv`
    
    and are written to the outdirectory.

    This function is equivalent to cycling through each unique track name 
    and writing it out sequentially with write_tracks(). 
    
    Parameters
    -----------

    proteome :  Proteome object
        Proteome object from which the tracks will be extracted.

    outdirectory : str (default = '.')
        String that defines the output directory. By default sets to the 
        present working directory.

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. Default = "%.3f"
        
    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. 
        Default is '\t' which is recommended to maintain compliance with 
        the default `add_tracks_from_file()` function.
    
    Returns
    --------
    None
        No return type, but generates one new file per unique track 
        name in this Proteome.

    """

    # one output file per track name
    for t_name in proteome.unique_track_names:
        outname = os.path.join(outdirectory, f"shephard_track_{t_name}.tsv")
        write_tracks(proteome, outname, t_name, value_fmt, delimiter)


## ------------------------------------------------------------------------
##
def write_all_values_tracks_single_file(proteome, 
                                 outfile, 
                                 value_fmt = "%.3f", 
                                 delimiter='\t'):
    r"""
    Function that writes all values tracks associated with a Proteome out to 
    a single file. This may be preferable in some situations, but in others 
    maybe only a subset of tracks are requested, for which write_tracks() 
    would be good, or alternatively you want all tracks in separate files, 
    in which case write_all_tracks_separate_files() would be the way to go.
        
    Parameters
    -----------

    proteome :  Proteome object
        Proteome object from which the tracks will be extracted.

    outfile : str
        String that defines the name of the output file.

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. Default = "%.3f"
        
    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. Default 
        is '\t' which is recommended to maintain compliance with the default 
        `add_tracks_from_file()` function.
    
    Returns
    --------
    None
        No return type, but generates a new file with the complete set of 
        values tracks from this Proteome written to disk.


    """

    # delegate to the shared writer, selecting tracks of type 'values'
    return __write_all_tracks_single_file(proteome, outfile, 'values', value_fmt, delimiter)

## ------------------------------------------------------------------------
##
def write_all_symbols_tracks_single_file(proteome, 
                                 outfile, 
                                 value_fmt = "%.3f", 
                                 delimiter='\t'):
    r"""
    Function that writes all symbols tracks associated with a Proteome out 
    to a single file. This may be preferable in some situations, but in 
    others maybe only a subset of tracks are requested, for which 
    write_tracks() would be good, or alternatively you want all tracks in 
    separate files, in which case write_all_tracks_separate_files() would 
    be the way to go.
        
    Parameters
    -----------

    proteome :  Proteome object
        Proteome object from which the tracks will be extracted.

    outfile : str
        String that defines the name of the output file.

    value_fmt : str (default = "%.3f")
        Format string that is passed through unchanged to the shared 
        writer. 
            
    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. Default
        is '\t' which is recommended to maintain compliance with the default 
        `add_tracks_from_file()` function.
    
    Returns
    --------
    None
        No return type, but generates a new file with the complete set of 
        symbols tracks from this proteome written to disk.


    """

    # delegate to the shared writer, selecting tracks of type 'symbols'
    return __write_all_tracks_single_file(proteome, outfile, 'symbols', value_fmt, delimiter)



## ------------------------------------------------------------------------
##
def write_tracks(proteome, filename, track_name, value_fmt = "%.3f", delimiter='\t', file_handle=None):
    r"""
    Function that writes out a specific track to file in a standardized 
    format. Note that because track files are inevitably quite big default 
    behaviour is to only write out a single track file at a time (i.e. 
    unlike write_domains or write_sites where ALL domains or all sites are
    - by default - written out, here ONLY a single type of track, defined
    by track_name, can be written).

    To write ALL the tracks from a Proteome, see 
    write_all_tracks_separate_files(), 
    write_all_values_tracks_single_file() and 
    write_all_symbols_tracks_single_file().
    
    Parameters
    -----------
    proteome :  Proteome object
        Proteome object from which the tracks will be extracted. Proteins 
        that do not have a track called track_name are skipped.

    filename : str
        Filename that will be used to write the new tracks file. Ignored 
        if file_handle is provided.

    track_name : str
        Name of the track to be written out.

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. Default = "%.3f". Note 
        that this is not a smart value so if the actual value used means 
        that %.3f loses all meaning this will not trigger a warning, so, 
        be careful!
        
    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. Default 
        is '\t' which is recommended to maintain compliance with the default 
        `add_tracks_from_file()` function.

    file_handle : filehandle (_io.TextIOWrapper) or None
        If passed, output is written to this handle rather than to a new 
        file, and the handle is left open for the caller to close. The 
        filename variable is ignored in this case.
        
    
    Returns
    --------
    None
        No return type, but writes one line per protein that has the 
        requested track.

    Raises
    --------
    InterfaceException
        If value_fmt cannot format the float 1.5 into a string that 
        parses back to 1.5.

    """

    # test the passed value_fmt string works. This is not foolproof but at
    # least validates that the string can format a float that can be read
    # back in again by shephard. TypeError covers a malformed/unsuitable
    # format; ValueError covers an unsupported format character or output
    # that float() cannot parse (e.g. '%.3f units').
    try:
        roundtrip_ok = (float(value_fmt %( 1.5 )) == 1.5)
    except (TypeError, ValueError):
        roundtrip_ok = False

    if not roundtrip_ok:
        raise InterfaceException('Invalid value_fmt passed [%s]'%(str(value_fmt)))

    # only close the handle if this function opened it
    owns_handle = file_handle is None
    if owns_handle:
        fh = open(filename, 'w')
    else:
        fh = file_handle

    try:
        for protein in proteome:

            # try and extract out the track in question; with safe=False a
            # missing track gives None rather than an exception
            t = protein.track(track_name, safe=False)
            if t is not None:

                # construct an outstring from the track
                fh.write(__build_track_line(t, delimiter, value_fmt))

    finally:
        # guarantee a self-opened file is closed even if writing fails
        if owns_handle:
            fh.close()
                    
## ------------------------------------------------------------------------
##
def __build_track_line(t, delimiter, value_fmt):
    """
    Internal function that takes a Track object and returns a line that can
    be written to a Track file. This is called internally by functions that
    write Tracks.

    The line has the form

        unique_ID <delimiter> track_name <delimiter> v1 <delimiter> ... vn

    followed by a newline. No delimiter is written after the final entry, 
    so the line can be split on the delimiter and read back in even when 
    the delimiter is not whitespace.

    Parameters
    ----------------------
    t : shephard.Track
        Track object being converted to a string. If ``t.values`` is not 
        None the values are written, otherwise ``t.symbols`` are written.

    delimiter : str
        Character (or characters) used to separate between fields. 
        The tab character ('\\t') is recommended to maintain compliance 
        with default SHEPHARD file-reading functions.     

    value_fmt : str
        printf-style format string applied to each entry of a values 
        track (e.g. "%.3f"). Not used for symbols tracks.

    Returns
    --------------
    str
        Returns a string that is ready to be written to file

    """

    # the two leading fields identify the protein and the track
    fields = [f"{t.protein.unique_ID}", f"{t.name}"]

    # then one field per residue
    if t.values is not None:
        fields.extend(value_fmt %(v) for v in t.values)
    else:
        fields.extend(str(v) for v in t.symbols)

    # a single join avoids repeated string concatenation and a dangling
    # trailing delimiter
    return delimiter.join(fields) + "\n"
                    

## ------------------------------------------------------------------------
##
def write_tracks_from_list(track_list, filename, value_fmt = "%.3f", delimiter='\t'):
    r"""
    Function that writes out tracks to a SHEPHARD tracks file from a list
    of Track objects. 

    Note that a single track file cannot have both values and symbols 
    tracks, and this is checked first: the type of the first track in the 
    list defines the type every other track must have.

    If the list is empty an empty file is written.
            
    Parameters
    -----------

    track_list : List of Track objects
        List of track objects which will be written

    filename : str
        Filename that will be used to write the new tracks file

    value_fmt : str (default = "%.3f")
        Format string that will be used for values. Default = "%.3f". Note 
        that this is not a smart value so if the actual value used means 
        that %.3f loses all meaning this will not trigger a warning, so, 
        be careful!

    delimiter : str (default = '\t')
        Character (or characters) used to separate between fields. Default is 
        '\t' which is recommended to maintain compliance with the default 
        `add_tracks_from_file()` function
       
    Returns
    --------
    None
        No return type, but generates a new file with one line per track 
        in the list.

    Raises
    --------
    InterfaceException
        If the list mixes values tracks and symbols tracks, or if 
        value_fmt cannot format the float 1.5 into a string that parses 
        back to 1.5.

    """

    # first check if items in the list are track objects
    for t in track_list:
        interface_tools.check_track(t, 'write_tracks_from_list')

    ## START OF SANITY CHECKING FOR CONSISTENT TRACK TYPES
    ##
    ## The code below checks all tracks in the list are consistent, i.e.
    ## that they are ALL symbol or ALL value tracks. An empty list is
    ## trivially consistent (and has no first element to inspect).
    ##
    if len(track_list) > 0:

        # if the first track has no values it must be a symbols track,
        # otherwise it is a values track
        first_is_symbols = track_list[0].values is None

        # make sure all tracks are consistent with the first one!
        for t in track_list:
            if first_is_symbols:
                if t.values is not None:
                    raise InterfaceException(f'First track [{track_list[0]}] was a symbols track, yet track {t} is a values track. All tracks in the list must be the same type')

            elif t.symbols is not None:
                raise InterfaceException(f'First track [{track_list[0]}] was a values track, yet track {t} is a symbols track. All tracks in the list must be the same type')

    ## END OF SANITY CHECKING FOR CONSISTENT TRACK TYPES


    # test the passed value_fmt string works. This is not foolproof but at
    # least validates that the string can format a float that can be read
    # back in again by shephard. TypeError covers a malformed/unsuitable
    # format; ValueError covers an unsupported format character or output
    # that float() cannot parse (e.g. '%.3f units').
    try:
        roundtrip_ok = (float(value_fmt %( 1.5 )) == 1.5)
    except (TypeError, ValueError):
        roundtrip_ok = False

    if not roundtrip_ok:
        raise InterfaceException(f'Invalid value_fmt passed [{value_fmt}]')

            
    with open(filename, 'w') as fh:

        # build and write one line per track in the list
        for t in track_list:
            fh.write(__build_track_line(t, delimiter, value_fmt))