Source code for shephard.tools.domain_tools

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""

import numpy as np
import random
from . import site_tools
from shephard import general_utilities
from shephard import exceptions
from shephard.interfaces.interface_tools import check_protein

## ------------------------------------------------------------------------
##
[docs]def domain_overlap(domain_1, domain_2, check_origin=True):
    """
    Given two domains asks if their boundaries overlap. By default
    this expects the two domains in question to be from the same protein
    and checks this. If we dont want to enforce this assumption set
    check_origin to False.

    Parameters
    -----------
    domain_1 : shephard.domain.Domain
        The first domain object of interest

    domain_2 : shephard.domain.Domain
        The first domain object of interest

    check_origin : bool
        Flag that if set to True will cause an exception if domain_1 and
        domain_2 are from different proteins. If set to false, no such
        sanity checks are performed.

    Returns
    ----------
    boolean
        Returns true if the two domains overlap, else returns false
      
    """

    if check_origin:
        if domain_1.protein.unique_ID != domain_2.protein.unique_ID:
            raise exceptions.DomainException('Examining overlap of %s and %s but these are from different proteins' % (str(domain_1), str(domain_2)))
            
    return domain_overlap_by_position(domain_1.start, domain_1.end, domain_2.start, domain_2.end)



## ------------------------------------------------------------------------
##
[docs]def domain_overlap_fraction(domain_1, domain_2, check_origin=True):
    """
    Given two domains asks what fraction the shorter domain 
    overlaps the longer one with.
    Parameters
    -----------
    domain_1 : shephard.domain.Domain
        The first domain object of interest

    domain_2 : shephard.domain.Domain
        The first domain object of interest

    check_origin : bool (default = True)
        Flag that if set to True will cause an exception if domain_1 and
        domain_2 are from different proteins. If set to false, no such
        sanity checks are performed.

    Returns
    ----------
    float
        Returns a float between 0 and 1 that corresponds to what 
        fraction of the shorter domain overlaps with the longer domain.
      
    """

    if check_origin:
        if domain_1.protein.unique_ID != domain_2.protein.unique_ID:
            raise exceptions.DomainException('Examining overlap of %s and %s but these are from different proteins' % (str(domain_1), str(domain_2)))


    if len(domain_1) < len(domain_2):
        d_short = domain_1
        d_long = domain_2
    else:
        d_short = domain_2
        d_long = domain_1

    
    # ......OOOOOOOOOOOOOOOOOOOOOOO............       long
    #...XXXXXXXXXXXX?????                             short
    #
    if d_short.start < d_long.start:
        if d_short.end < d_long.start:
            return 0.0
        else:
            return ((d_short.end - d_long.start)+1)/len(d_short)

    # ......OOOOOOOOOOOOOOOOOOOOOOO.............     end
    #.                   ?????XXXXXXXXXXXX......     short
    #
    if d_short.end > d_long.end:
        if d_short.start > d_long.end:
            return 0.0
        else:

            # note - the +1 accounts for the fact we're using 
            # an inclusive counting 
            return ((d_long.end - d_short.start)+1)/len(d_short)

    # if we get here d_short.start equal to or larger than d_long start
    # and d_short end equal to or smaller than d_long end, so 100% overlap
    return 1.0
    


## ------------------------------------------------------------------------
##
[docs]def domain_overlap_by_position(boundary_start1, boundary_end1, boundary_start2, boundary_end2):
    """
    Given four sets of starting/ending positions, this function asks if 
    their boundaries overlap. 

    Parameters
    -----------
    boundary_start1 : int
        Position of domain 1 start

    boundary_end : int
        Position of domain 1 end

    boundary_start2 : int
        Position of domain 2 start

    boundary_end : int
        Position of domain 2 end

    Returns
    ----------
    boolean
        Returns true if the two domains overlap, else returns false
    """

    # note we do this swapping around which of the two domains passed as boundaries is 'a' and 'b' in 
    # the visual schematic below
    for x in [[boundary_start1, boundary_end1, boundary_start2, boundary_end2], [boundary_start2, boundary_end2,boundary_start1, boundary_end1]]:
        
        a_start = x[0]
        a_end   = x[1]
        b_start = x[2]
        b_end   = x[3]


        # scenario 1
        # AAAAAAAA
        #     BBBBBBBBB
        if a_start <= b_start and a_end >= b_start:
            return True

        # scenario 2
        #           AAAAAAAA
        #     BBBBBBBBB
        if a_start >= b_start and a_start <= b_end:
            return True

    return False
    



## ------------------------------------------------------------------------
##
[docs]def build_missing_domains(protein, new_domain_type = 'missing'):
    """
    Function which takes a protein and builds a set of domains that represent 
    the "empty spaces". Domains are returned as a list of domain dictionaries
    which can be added to a protein via the add_domains() function.

    This tool is stateless - i.e. it does not alter the passed protein
    but instead only generates a numerical list which could be added as 
    a track.

    One could always combine this directly with the add_domains() function
    into a single line - e.g.
    
    # this line will automatically add all the missing regions as 
    # 'missing' proteins to the domain
    protein.add_domains(build_missing_domains(protein))

    Parameters
    -----------
    protein : shephard.protein.Protein object
        Protein object over which sites are identified

    new_domain_type : str (default = 'missing')
        Name to assign to the 'empty' domains.

    Returns
    -------------
    list of domain dictionaries

        Returns a list of domain dictionaries which can be then parsed or
        added to a protein via the add_domains() function.
    
    """

    check_protein(protein, 'build_missing_domains()')

    ##
    ## NOTE -this function uses i0 indexing to keep things simple and then
    ## corrects at the end
    ##
    
    # first constrct an empty vector of 0s. We're going to build '1s' into
    # the positions in the sequence occupied by domains
    all_res = [0]*len(protein)
    
    # for each domain in the protein
    for d in protein.domains:

        # for each position in each domain (note this will overwrite
        # which is fine). NOTE that we cyle from d.start-1 to d.end because
        # start and end index from real-world positions, but range is a non
        # inclusive function (while the domain boundaries are inclusive). 
        for i in range(d.start-1, d.end):
            all_res[i] = 1
                
    # we are now left with an all_res list where the 0s represent regions
    # not assigned to any domain

    # if we start inside an empty domain set inside to true
    if all_res[0] == 0:
        start = 0
        inside=True

    # else set start to -1  and inside to false
    else:
        start = -1
        inside=False

    all_domains=[]

    # for each position in the sequence (starting at 1 because 
    # we already assessed 0 above. 
    for i in range(1,len(all_res)):

        # if we find ourselves in an empty space
        if all_res[i] == 0:
            if not inside:
                inside=True
                start = i
        # if we fine ourselves in a domain of some kind
        else:

            # if we were previously in an empty space
            if inside:
                inside = False

                # not here we're adding in i0 indexing
                all_domains.append([start, i-1])

    # if we ended and were inside when we ended then the last
    # empty domain spans to the final residue in the sequence
    if inside:
        all_domains.append([start,len(all_res)-1])

    new_domain_list =[]
    for d in all_domains:

        # note the +1 here is so we insert at residue positions
        # that correctly map to real-space positions (i.e.i1 indexing)

        # construct a domain dictionary 
        local_domain={}        
        local_domain['start'] = d[0]+1
        local_domain['end'] = d[1]+1
        local_domain['domain_type'] = new_domain_type

        new_domain_list.append(local_domain)

    return new_domain_list
            
        

## ------------------------------------------------------------------------
##
[docs]def build_domains_from_track_values(proteome, 
                                    track_name, 
                                    binerize_function, 
                                    domain_type, 
                                    gap_closure = 3, 
                                    minimum_region_size = 20, 
                                    extend_ends = None, 
                                    verbose = True):

    
    """
    Function which takes a Proteome and builds a set of domains based on
    values tracks in each Protein in that Proteome. This effectively 
    allows you to discretize some continous variable into distinct local 
    domains, which can often facilitate specific types of analysis. This
    conversion is done using a custom-passed binerize function which 
    converts a normal track into a track of 0s and 1s. Residues that
    are assigned a value of 1 will be included in a domain assuming they
    fall within a contigous region of sufficient size, as defined by
    the parameters gap_closure and minimum_region_size, as discussed 
    below.

    This function operates on an entire Proteome-level, and is stateless
    (i.e. does not directly alter the passed proteome). Instead, the 
    function dictionary where keys are unique_IDs of proteins and values 
    is a list of one or more Domain dictioinaries (with a start, end,
    and domain_type key:value pair).

    The domains dictionary can be added to a proteome using the 
    si_domains.add_domains_from_dictionary(). As an example, as possible 
    workflow is as follows:

    >>> d = build_domains_from_track_values(proteome, 'cool_track', trackfx)
    >>> si_domains.add_domains_from_dictionary(proteome, d)
    
    Under the hood, the function works by cycling through each protein, 
    extracting the track, and converting into domains.
        
    If a protein is too short or it lacks a given track, the protein is 
    skipped.
    
    Parameters
    -----------
    proteome : shephard.proteome.Proteome
        The Proteome which is going to be scanned for each track. Note that
        the underlying Proteome is not altered by this function

    track_name : string
        Name of the track to convert. If the track name does not exist in
        a given protein that protein is skipped. In this way, a Proteome
        where only a subset of Proteins have tracks can be parsed without
        issue. The track must be a values track - symbols tracks should be
        converted to a values track first to avoid issue.

    binerize_function : function
        A function which takes a track and converts it to 0 or 1 (binerize, 
        as in, make binary). This enables a complex and continous 
        track to be converted into a binary classification, which is 
        practically what a domain-assigment needs (yes/no inside domain).
        This function must take in a single variable (the track values) 
        and return a new list or numpy array that is the same length as 
        the track values but possesses only 0 and 1 in each element.

    domain_type : str
        String that defines the name of the new domains to create. Can in
        principle be anything.

    gap_closure : int (default = 3)
        Defines spacing between 1s or 0s that will be filled in to generate
        contigous stretches of 0s or 1s. This helps avoid a scenario where
        breaks in contigous stretches impede the definition of a domain 
        above a certain size, as defined by minimum_region_size. In general
        a value of 3 works reasonably well in most scenarios.

    minimum_region_size : int (default = 20)
        Defines the smallest size for a domain allowed. This can be varied
        depending on the question or data, and it may make sense to have 
        corresponding changes in gap_closure if this value becomes 
        substantially larger than a gap_closure of 3.

    extend_ends : int (default = None)
        This is a somewhat niche feature which, if set to a number, means 
        that we check the extend_ends-th value at the N- and C-terminus 
        of the binarized track, and if 1 set all values from that position
        to the N and/or C terminus to 1. This is provided because sometimes
        binerize functions will inherently struggle with the very ends of 
        sequences, so this provides a way to cast the first and last 
        extend_ends values to be 1. This is fairly specific and probably
        only worth using in a scenario where there is a clear issue

    verbose : bool (default = True)
        This flag enables the function to print statues every 500 proteins.
        If the binerize function is expensive this can be good to ensure 
        progress is proceeding.

    Returns
    -------------
    dict
        Returns a dictionary of key-value pairs, where each key is a unique
        ID and each value is a list of 1 or more domain dictionaries. This
        return dictionary can be directly added to a Proteome using the 
        Proteome.add_domains_from_dictonary() function.

    """
    
    new_domains = {}

    c = 0
    for protein in proteome:

        # if our protein is too short do not try and generate a domain
        if len(protein) < 3*gap_closure + 1:
            continue
        
        # this is the counter of proteins we've actually scanned - if
        # we hit a 500-protein milestone print status if verbose is true
        c = c + 1
        if verbose and c % 500 == 0:
            print('On %i of %i' %(c, len(proteome)))
        
        # safe = false so will return None if no track of that name
        # found, and if so we continue to next protein
        t = protein.track(track_name, safe=False)
        if t is None:
            continue

        # extract out the values
        t = t.values
        
        # and apply the binerize function to the values
        B = binerize_function(t)

        # add to help debugging
        if len(B) != len(protein):
            raise exceptions.DomainException('Binerize function failed to return an array or list that matches the protein sequence length')
            

        ## Part 1 - remove gapes
        for g in range(1, gap_closure+1):
            # first fill in 


            # for each position
            i = 0
            finished = False
            while not finished:

                p1 = i
                p2 = i + g
                p3 = i + 2*g
                p4 = i + 3*g

                # if the complete set of smaller regions ahead is empty or 
                # fully assigned skip ahead because nothing to do here...
                if np.sum(B[p1:p4]) == 0:
                    i = p4

                elif np.sum(B[p1:p4]) == 3*g:

                    # we jump to the p3 position (and NOT p4) as this allows us to skip along without
                    # discarding positions we need for filling. Note if we know everything is empty
                    # it doesn't matter and we can jump to p4
                    i = p3

                else:
                    # if we have gapsize number of hits
                    if np.sum(B[p1:p2])  == g:

                        # and if a gap away there is another gapsize 
                        if general_utilities.numerical_sum(B[p3:p4])  == g:                        
                            B[p2:p3] = [1]*g
                    
                    i = i + 1

                if i + 3*g >= len(B):
                    finished = True


        ## Part 2 - remove domains that are too small - we adde the '-' caps so we can use
        # replace and distinguish c/n terminal values
        B_string = '-'
        for i in B:
            if i == 1:
                B_string = B_string + "1"
            else:
                B_string = B_string + "0"

        B_string=B_string+'-'
        
        # for sizes of contigous stretches that are up minimum_region_size + 1
        # replace with empty ('0') strings
        for i in range(1, minimum_region_size + 1):      

            # 011110 -> 000000
            B_string = B_string.replace('0' + i*'1' + '0', '0' + i*'0' + '0')

            # -11110 -> -00000
            B_string = B_string.replace('-'+i*'1' + '0', '-'+i*'0' + '0')

            # 01111- -> 00000-
            B_string = B_string.replace('0' + i*'1'+'-' , '0'+ i*'0'+'-' )

            # -1111- -> -0000-
            B_string = B_string.replace('-' + i*'1'+'-' , '-'+ i*'0'+'-' )


        # 1 to -1 to cut off the artifical caps we added
        for i in range(1, len(B_string)-1):
            B[i-1] = int(B_string[i])

        ## Part 3 - extend ends if required
        if extend_ends:
                
            # note we don't check length of extend ends initially, so if they're too big for the sequence
            # dynamicaly resize them so they're 1/2 of the sequence length
            if extend_ends + 1 >= len(B):
                extend_ends_val = len(B)/2
            else:
                extend_ends_val = extend_ends
                
            if B[extend_ends_val + 1] == 1:
                B[0:extend_ends_val] = [1]*len(B[0:extend_ends_val])

            if B[:-(extend_ends_val + 1)] == 1:
                B[-extend_ends_val:] = [1]*len(B[0:extend_ends_val])
            
        ## Part 4 - extract domain boundaires
        local_domains=[]
        if B[0] == 1:
            inside = True
            start = 0
        else:
            inside = False
            
        # for each position
        for idx in range(0,len(B)):

            i = B[idx]
                
            if i == 1:
                if inside:
                    continue
                else:
                    inside = True
                    start = idx

            if i == 0:
                if inside:
                    inside = False
                    end = idx-1
                    local_domains.append({'start':start+1,'end':end+1,'domain_type':domain_type})

        # if we finished inside
        if inside:
            local_domains.append({'start':start+1,'end':len(B),'domain_type':domain_type})

        # if we found any local domains...
        if len(local_domains) > 0:
            new_domains[protein.unique_ID] = local_domains

    return new_domains


   
# used for test example - kept in case we want to expand or update this in          

# def calculate_domain_length(domain):
#    """
#    Function that returns a domain's length
#    
#    Parameters
#    -------------
#    domain : Domain
#    Domain object in question
#
#    Returns
#    ------------
#    int
#    Returns the length of the domain
#    """
#    
#    return len(domain)