##
## API into metapredict
##
## For all APIs we do not make hard dependencies; safe, informative
## import checking should be done instead.
##
## Check metapredict is installed
# Import metapredict defensively: a missing or outdated installation is
# reported with printed guidance instead of an unhandled exception here.
try:
    import metapredict as meta
    try:
        # Probe call using the `version` keyword. A TypeError from this call
        # is treated as a sign that the installed metapredict predates V3.
        meta.predict_disorder('AAPAPA',version=3)
    except TypeError:
        print('shephard requires metapredict V3 or higher. Please upgrade:')
        print('pip install --upgrade metapredict')
# This handler wraps the probe call as well as the import itself, so a
# ModuleNotFoundError raised by either one is reported the same way.
# NOTE(review): when the import fails, `meta` is never bound, so any later
# use of `meta` in this module will raise NameError.
except ModuleNotFoundError:
    print('Unable to import metapredict')
    print('To use the metapredict API, make sure metapredict is installed')
    print('This can be done as follows:')
    print('pip install metapredict')
## then check batch mode is available
## ------------------------------------------------------------------------
##
[docs]
def annotate_proteome_with_disorder_track(proteome,
                                          name='disorder',
                                          device='cpu',
                                          version=3,
                                          show_progress_bar=True,
                                          safe=True):
    """
    Function that annotates a proteome with disorder Tracks
    for every protein.

    By default, disorder Tracks are named 'disorder', although
    this can be changed by setting the `name` parameter.

    Disorder prediction uses the batch mode in metapredict, which
    leverages parallel predictions automatically on GPUs or CPUs.
    However, if a specific device is requested, this can be passed
    using the `device` parameter.

    Parameters
    -----------------
    proteome : shephard.proteome.Proteome
        Proteome object to be annotated.

    name : str
        Name of the Track added to each Protein.
        Default = 'disorder'

    device : str
        Define the device to use, either 'cpu', 'mps', 'cuda'
        or the integer index of a specific GPU device to use.
        Default = 'cpu'.

    version : int
        Defines the metapredict version to use (must be one of 1, 2
        or 3).
        Default = 3.

    show_progress_bar : bool
        Flag which, if set to True, means a progress bar is printed as
        predictions are made, while if False no progress bar is printed.
        Default = True

    safe : bool
        Flag which, if set to False, means the function overwrites
        existing tracks and domains if present. If True, overwriting
        will trigger an exception.
        Default = True.

    Returns
    -----------------
    None
        No return type, but the Protein objects in the Proteome
        will be annotated with per-residue disorder Tracks.

    Raises
    -----------------
    Exception
        Any exception raised by metapredict during prediction, or by
        `add_track` when annotating a Protein, is propagated unchanged
        to the caller.
    """

    # map each protein's unique ID to its sequence; this dictionary is the
    # input handed to metapredict in a single batch call
    uid2seq = {p.unique_ID: p.sequence for p in proteome}

    # batch predict disorder. The call is made exactly once: repeating an
    # identical call after a failure cannot help, and catching everything
    # around it would also swallow KeyboardInterrupt and restart the
    # prediction when the user tries to cancel.
    D = meta.predict_disorder(uid2seq,
                              device=device,
                              show_progress_bar=show_progress_bar,
                              version=version)

    for k in uid2seq:
        # element [1] of each result is what gets stored on the Track
        # (presumably the per-residue disorder scores, with element [0]
        # being the sequence - confirm against the metapredict docs)
        proteome.protein(k).add_track(name, values=D[k][1], safe=safe)
## ------------------------------------------------------------------------
##
[docs]
def annotate_proteome_with_disordered_domains(proteome,
                                              name='IDR',
                                              disorder_threshold=0.5,
                                              annotate_folded_domains=False,
                                              folded_domain_name = 'FD',
                                              device='cpu',
                                              version=3,
                                              show_progress_bar=True,
                                              safe=True):
    """
    Function that annotates a proteome with disordered
    Domains (IDRs) for every protein.

    By default, disordered Domains are named as 'IDR's, although
    this can be changed by setting the `name` parameter.

    In addition, if requested, folded domains can also be annotated
    as those domains which are not IDRs. These folded domains are
    named 'FD's by default, although this can be changed by setting
    the `folded_domain_name` parameter.

    Disorder prediction uses the batch mode in metapredict, which
    leverages parallel predictions automatically on GPUs or CPUs.
    However, if a specific device is requested this can be passed
    using the `device` parameter.

    Parameters
    -----------------
    proteome : shephard.proteome.Proteome
        Proteome object to be annotated.

    name : str
        Name to give IDR domains.
        Default = 'IDR'

    disorder_threshold : float
        Threshold to be used to define IDRs by the metapredict
        domain decomposition algorithm. The default is 0.5,
        and we strongly recommend sticking with this value.
        The value is forwarded to metapredict as given.

    annotate_folded_domains : bool
        Flag which, if included, means we ALSO annotate
        the regions that are not IDRs as 'FD' (folded
        domains), where the name can be changed using
        the folded_domain_name variable.
        Default = False

    folded_domain_name : str
        String used to name Folded Domains. Only relevant
        if annotate_folded_domains is set to True.
        Default = 'FD'

    device : str
        Define the device to use, either 'cpu', 'mps', 'cuda'
        or the integer index of a specific GPU device to use.
        Default = 'cpu'.

    version : int
        Defines the metapredict version to use (must be one of 1, 2
        or 3).
        Default = 3.

    show_progress_bar : bool
        Flag which, if set to True, means a progress bar is printed as
        predictions are made, while if False no progress bar is printed.
        Default = True

    safe : bool
        Flag which, if set to False, means the function overwrites
        existing tracks and domains if present. If True, overwriting
        will trigger an exception.
        Default = True.

    Returns
    -----------------
    None
        No return type, but the Protein objects in the Proteome
        will be annotated with disordered Domain annotations.
    """

    # map each protein's unique ID to its sequence for the batch call
    uid2seq = {p.unique_ID: p.sequence for p in proteome}

    # batch predict disorder; disorder_threshold is passed through so the
    # documented parameter actually controls how IDRs are delineated
    D = meta.predict_disorder(uid2seq,
                              device=device,
                              show_progress_bar=show_progress_bar,
                              version=version,
                              return_domains=True,
                              disorder_threshold=disorder_threshold)

    for k in uid2seq:

        # per-protein prediction result, exposing the domain boundaries
        X = D[k]

        # look the protein up once instead of once per domain
        protein = proteome.protein(k)

        # the start position is shifted by one and the end is used as-is
        # (presumably converting 0-indexed, end-exclusive boundaries into
        # 1-indexed inclusive positions - confirm against metapredict docs)
        for boundaries in X.disordered_domain_boundaries:
            protein.add_domain(boundaries[0]+1, boundaries[1], name, safe=safe)

        if annotate_folded_domains:
            for boundaries in X.folded_domain_boundaries:
                protein.add_domain(boundaries[0]+1, boundaries[1], folded_domain_name, safe=safe)
## ------------------------------------------------------------------------
##
[docs]
def annotate_proteome_with_disorder_tracks_and_disordered_domains(proteome,
                                                                  track_name='disorder',
                                                                  domain_name='IDR',
                                                                  disorder_threshold=0.5,
                                                                  annotate_folded_domains=False,
                                                                  folded_domain_name = 'FD',
                                                                  device='cpu',
                                                                  version=3,
                                                                  show_progress_bar=True,
                                                                  safe=True):
    """
    Function that annotates a proteome with disorder Tracks and
    disorder Domains for every protein.

    By default, disorder Tracks are named 'disorder', although
    this can be changed by setting the `track_name` parameter.

    By default, disordered Domains are named as 'IDR's, although
    this can be changed by setting the `domain_name` parameter.

    In addition, if requested, folded domains can also be annotated
    as those domains which are not IDRs. These folded domains are
    named 'FD's by default, although this can be changed by setting
    the `folded_domain_name` parameter.

    Disorder prediction uses the batch mode in metapredict, which
    leverages parallel predictions automatically on GPUs or CPUs.
    However, if a specific device is requested this can be passed
    using the `device` parameter.

    Parameters
    -----------------
    proteome : shephard.proteome.Proteome
        Proteome object to be annotated.

    track_name : str
        Name of the Track added to each Protein.
        Default = 'disorder'

    domain_name : str
        Name of the Domain added to each Protein.
        Default = 'IDR'

    disorder_threshold : float
        Threshold to be used to define IDRs by the metapredict
        domain decomposition algorithm. The default is 0.5, and
        we strongly recommend sticking with this value.
        The value is forwarded to metapredict as given.

    annotate_folded_domains : bool
        Flag which, if included, means we ALSO annotate
        the regions that are not IDRs as 'FD' (folded
        domains), where the name can be changed using
        the folded_domain_name variable.
        Default = False

    folded_domain_name : str
        String used to name Folded Domains. Only relevant
        if annotate_folded_domains is set to True.
        Default = 'FD'

    device : str
        Define the device to use, either 'cpu', 'mps', 'cuda'
        or the integer index of a specific GPU device to use.
        Default = 'cpu'.

    version : int
        Defines the metapredict version to use (must be one of 1, 2
        or 3).
        Default = 3.

    show_progress_bar : bool
        Flag which, if set to True, means a progress bar is printed as
        predictions are made, while if False no progress bar is printed.
        Default = True

    safe : bool
        Flag which, if set to False, means the function overwrites
        existing tracks and domains if present. If True, overwriting
        will trigger an exception.
        Default = True.

    Returns
    -----------------
    None
        No return type, but the Protein objects in the Proteome
        will be annotated with per-residue disorder Tracks and
        disordered Domain annotations.
    """

    # map each protein's unique ID to its sequence for the batch call
    uid2seq = {p.unique_ID: p.sequence for p in proteome}

    # batch predict disorder annotations/scores; disorder_threshold is
    # passed through so the documented parameter actually takes effect
    D = meta.predict_disorder(uid2seq,
                              device=device,
                              show_progress_bar=show_progress_bar,
                              version=version,
                              return_domains=True,
                              disorder_threshold=disorder_threshold)

    # for each unique ID
    for k in uid2seq:

        # X = DisorderObject
        X = D[k]

        # look the protein up once instead of once per annotation
        protein = proteome.protein(k)

        # cycle through IDR boundaries. The start position is shifted by one
        # and the end is used as-is (presumably converting 0-indexed,
        # end-exclusive boundaries into 1-indexed inclusive positions -
        # confirm against metapredict docs)
        for boundaries in X.disordered_domain_boundaries:
            protein.add_domain(boundaries[0]+1, boundaries[1], domain_name, safe=safe)

        if annotate_folded_domains:
            for boundaries in X.folded_domain_boundaries:
                protein.add_domain(boundaries[0]+1, boundaries[1], folded_domain_name, safe=safe)

        # domains are added before the track, so a failure while adding
        # domains leaves this protein without the track
        protein.add_track(track_name, values=X.disorder, safe=safe)