Source code for shephard.apis.metapredict_api



##
## API into metapredict
##
## For all APIS we do do not make hard dependencies, safe informative
## import checking should be done.
##


## Check metapredict is installed
try:
    import metapredict as meta


    # run this AFTER checking metapredict is installed
    if not hasattr(meta, 'predict_disorder_batch'):
        print('Unable to import metapredict batch mode')
        print('This means you probably have an earlier version of metapredict installed')    
        print('This can be fixed by updating metapredict as follows:')
        print('pip install --upgrade metapredict')
    
except ModuleNotFoundError:
    print('Unable to import metapredict')
    print('To use the metapredict API, make sure metapredict is installed')
    print('This can be done as follows:')
    print('pip install metapredict')

## then check batch mode is available

    
## ------------------------------------------------------------------------
##
[docs]def annotate_proteome_with_disorder_track(proteome, name='disorder', gpuid=00, show_progress_bar=True, batch_mode=None, safe=True): """ Function that annotates a proteome with disorder Tracks for every protein. By default, disorder Tracks are named 'disorder', although this can be changed by setting the `track_name` parameter. Disorder prediction uses the batch mode in metapredict, which leverages parallel predictions automatically on GPUs or CPUs. However, if a specific device is requested, this can be passed Parameters ----------------- proteome : shephard.proteome.Proteome Proteome object to be annotated. track_name : str Name of the Track added to each Protein. Default = 'disorder' gpuid : int Identifier for the GPU being requested. Note that if this is left unset the code will use the first GPU available and if none is available will default back to CPU; in general, it is recommended not to try and set this unless there's a specific reason why a specific GPU should be used. Default = 0. show_progress_bar : bool Flag which, if set to True, means a progress bar is printed as predictions are made, while if False no progress bar is printed. Default = True safe : bool Flag which, if set to False, means the function overwrites existing tracks and domains if present. If True, overwriting will trigger an exception. Default = True. Returns ----------------- None No return type, but the Protein objects in the Proteome will be annotated with per-residue disorder Tracks. """ uid2seq = {} for p in proteome: uid2seq[p.unique_ID] = p.sequence # batch predict disorder D = meta.predict_disorder_batch(uid2seq, gpuid=gpuid, show_progress_bar=show_progress_bar) for k in uid2seq: proteome.protein(k).add_track(name, values=D[k][1], safe=safe)
## ------------------------------------------------------------------------ ##
[docs]def annotate_proteome_with_disordered_domains(proteome, name='IDR', disorder_threshold=0.5, annotate_folded_domains=False, folded_domain_name = 'FD', gpuid=00, show_progress_bar=True, safe=True): """ Function that annotates a proteome with disordered Domains (IDRs) for every protein. By default, disordered Domains are named as 'IDR's, although this can be changed by setting the `name` parameter. In addition, if requested, folded domains can also be annotated as those domains which are not IDRs. These folded domains are named 'FD's by default, although this can be changed by setting the `folded_domain_name` parameter. Disorder prediction uses the batch mode in metapredict, which leverages parallel predictions automatically on GPUs or CPUs. However, if a specific device is requested this can be passed Parameters ----------------- proteome : shephard.proteome.Proteome Proteome object to be annotated. disorder_threshold : float Threshold to be used to define IDRs by the metapredict domain decomposition algorithm. The default is 0.5, and we strongly recommend sticking with this value. track_name : str Name of the Track added to each Protein. Default = 'disorder' domain_name : str Name of the Domain added to each Protein. Default = 'IDR' annotate_folded_domains : bool Flag which, if included, means we ALSO annotate the regions that are not IDRs as 'FD' (folded domains), where the name can be changed using the folded_domain_name variable. Default = False folded_domain_name : str String used to name Folded Domains. Only relevant if annotate_folded_domains is set to True. Default = 'FD' gpuid : int Identifier for the GPU being requested. Note that if this is left unset the code will use the first GPU available and if none is available will default back to CPU; in general, it is recommended not to try and set this unless there's a specific reason why a specific GPU should be used. Default = 0. show_progress_bar : bool Flag which, if set to True, means a progress bar is printed as predictions are made, while if False no progress bar is printed. Default = True safe : bool Flag which, if set to False, means the function overwrites existing tracks and domains if present. If True, overwriting will trigger an exception. Default = True. Returns ----------------- None No return type, but the Protein objects in the Proteome will be annotated with disordered Domain annotations. """ uid2seq = {} for p in proteome: uid2seq[p.unique_ID] = p.sequence # batch predict disorder D = meta.predict_disorder_batch(uid2seq, return_domains=True, gpuid=gpuid, show_progress_bar=show_progress_bar) for k in uid2seq: X = D[k] for boundaries in X.disordered_domain_boundaries: proteome.protein(k).add_domain(boundaries[0]+1, boundaries[1], name, safe=safe) if annotate_folded_domains: for boundaries in X.folded_domain_boundaries: proteome.protein(k).add_domain(boundaries[0]+1, boundaries[1], folded_domain_name, safe=safe)
## ------------------------------------------------------------------------ ##
[docs]def annotate_proteome_with_disorder_tracks_and_disordered_domains(proteome, disorder_threshold=0.5, track_name='disorder', domain_name='IDR', annotate_folded_domains=False, folded_domain_name = 'FD', gpuid=00, show_progress_bar=True, safe=True): """ Function that annotates a proteome with disorder Tracks and disorder Domains for every protein. By default, disorder Tracks are named 'disoder', although this can be changed by setting the `track_name` parameter. By default, disordered Domains are named as 'IDR's, although this can be changed by setting the `name` parameter. In addition, if requested, folded domains can also be annotated as those domains which are not IDRs. These folded domains are named 'FD's by default, although this can be changed by setting the `folded_domain_name` parameter. Disorder prediction uses the batch mode in metapredict, which leverages parallel predictions automatically on GPUs or CPUs. However, if a specific device is requested this can be passed Parameters ----------------- proteome : shephard.proteome.Proteome Proteome object to be annotated. disorder_threshold : float Threshold to be used to define IDRs by the metapredict domain decomposition algorithm. Default is 0.5 and strongly recommend sticking with this value. track_name : str Name of the Track added to each Protein. Default = 'disorder' domain_name : str Name of the Domain added to each Protein. Default = 'IDR' annotate_folded_domains : bool Flag which, if included, means we ALSO annotate the regions that are not IDRs as 'FD' (folded domains), where the name can be changed using the folded_domain_name variable. Default = False folded_domain_name : str String used to name Folded Domains. Only relevant if annotate_folded_domains is set to True. Default = 'FD' gpuid : int Identifier for the GPU being requested. Note that if this is left unset the code will use the first GPU available and if none is available will default back to CPU; in general, it is recommended not to try and set this unless there's a specific reason why a specific GPU should be used. Default = 0. show_progress_bar : bool Flag which, if set to True, means a progress bar is printed as predictions are made, while if False no progress bar is printed. Default = True safe : bool Flag which, if set to False, means the function overwrites existing tracks and domains if present. If True, overwriting will trigger an exception. Default = True. Returns ----------------- None No return type, but the Protein objects in the Proteome will be annotated with per-residue disorder Tracks and disordered Domain annotations. """ uid2seq = {} for p in proteome: uid2seq[p.unique_ID] = p.sequence # batch predict disorder annotations/scores D = meta.predict_disorder_batch(uid2seq, return_domains=True, gpuid=gpuid, show_progress_bar=show_progress_bar) # for each unique ID for k in uid2seq: # X = DisorderObject X = D[k] # cycle through IDR boundaries for boundaries in X.disordered_domain_boundaries: proteome.protein(k).add_domain(boundaries[0]+1, boundaries[1], domain_name, safe=safe) if annotate_folded_domains: for boundaries in X.folded_domain_boundaries: proteome.protein(k).add_domain(boundaries[0]+1, boundaries[1], folded_domain_name, safe=safe) proteome.protein(k).add_track(track_name, values=X.disorder, safe=safe)