Source code for shephard.apis.albatross_api

##
## API into ALBATROSS
##
## For all APIs we do not make hard dependencies; instead, safe and
## informative import checking should be done.
##


## Check sparrow (the package that provides ALBATROSS) is installed
try:
    from sparrow.predictors.batch_predict import batch_predict
    
except ModuleNotFoundError:
    print('Unable to import sparrow (the package where ALBATROSS is implemeted)')
    print('To use the ALBATROSS, make sure sparrow is installed')
    print('This can be done as follows:')
    print('pip install git+https://git@github.com/idptools/sparrow.git')

    
## ------------------------------------------------------------------------
##
def annotate_proteome_with_dimensions(proteome,
                                      rg_name='rg',
                                      re_name='re',
                                      gpuid=0,
                                      show_progress_bar=True,
                                      batch_mode=None,
                                      safe=True):
    """
    Function that annotates a proteome with its predicted radius of
    gyration (rg) and end-to-end distance (re) for every protein.

    By default, rg and re are added as attributes to each Protein, with
    the names 'rg' and 're' respectively. However, this can be changed
    by setting the `rg_name` and `re_name` parameters.

    Dimension prediction uses the batch mode in sparrow, which leverages
    parallel predictions automatically on GPUs or CPUs. However, if a
    specific device is requested, this can be passed via the `gpuid`
    parameter.

    If the proteome contains no proteins the function returns
    immediately without invoking the predictor.

    Parameters
    -----------------
    proteome : shephard.proteome.Proteome
        Proteome object to be annotated.

    rg_name : str
        Name of the rg attribute added to each Protein.
        Default = 'rg'.

    re_name : str
        Name of the re attribute added to each Protein.
        Default = 're'.

    gpuid : int
        Identifier for the GPU being requested. Note that if this is
        left unset the code will use the first GPU available and if
        none is available will default back to CPU; in general, it is
        recommended not to try and set this unless there's a specific
        reason why a specific GPU should be used.
        Default = 0.

    show_progress_bar : bool
        Flag which, if set to True, means a progress bar is printed as
        predictions are made, while if False no progress bar is printed.
        Default = True.

    batch_mode : None
        Currently unused by this function; retained in the signature so
        existing callers that pass it keep working.
        Default = None.

    safe : bool
        Passed through to ``add_attribute``. If set to False, existing
        attributes with the same name are overwritten. If True,
        overwriting will trigger an exception.
        Default = True.

    Returns
    -----------------
    None
        No return type, but the Protein objects in the Proteome will be
        annotated with the rg and re attributes.

    """

    # map every protein's unique ID to its sequence
    uid2seq = {p.unique_ID: p.sequence for p in proteome}

    # nothing to annotate, so avoid running the predictor on an
    # empty input
    if not uid2seq:
        return

    # batch predict dimensions for all proteins
    rg = batch_predict(uid2seq,
                       network='scaled_rg',
                       gpuid=gpuid,
                       show_progress_bar=show_progress_bar)

    re = batch_predict(uid2seq,
                       network='scaled_re',
                       gpuid=gpuid,
                       show_progress_bar=show_progress_bar)

    # add as an attribute to the proteins. Element [1] of each returned
    # entry is what gets stored (presumably the predicted value, with
    # element [0] being the sequence - confirm against sparrow).
    for k in rg:
        proteome.protein(k).add_attribute(rg_name, rg[k][1], safe=safe)

    for k in re:
        proteome.protein(k).add_attribute(re_name, re[k][1], safe=safe)
## ------------------------------------------------------------------------ ##
def annotate_domains_with_dimensions(proteome,
                                     domain_type,
                                     rg_name='rg',
                                     re_name='re',
                                     gpuid=0,
                                     show_progress_bar=True,
                                     batch_mode=None,
                                     safe=True):
    """
    Function that annotates every domain matching the domain_type in a
    proteome with its predicted radius of gyration (rg) and end-to-end
    distance (re).

    By default, rg and re are added as attributes to each Domain, with
    the names 'rg' and 're' respectively. However, this can be changed
    by setting the `rg_name` and `re_name` parameters.

    Dimension prediction uses the batch mode in sparrow, which leverages
    parallel predictions automatically on GPUs or CPUs. However, if a
    specific device is requested, this can be passed via the `gpuid`
    parameter.

    If no domain in the proteome matches `domain_type` the function
    returns immediately without invoking the predictor.

    Parameters
    -----------------
    proteome : shephard.proteome.Proteome
        Proteome object to be annotated.

    domain_type : str
        Type of the domain to be annotated.

    rg_name : str
        Name of the rg attribute added to each Domain.
        Default = 'rg'.

    re_name : str
        Name of the re attribute added to each Domain.
        Default = 're'.

    gpuid : int
        Identifier for the GPU being requested. Note that if this is
        left unset the code will use the first GPU available and if
        none is available will default back to CPU; in general, it is
        recommended not to try and set this unless there's a specific
        reason why a specific GPU should be used.
        Default = 0.

    show_progress_bar : bool
        Flag which, if set to True, means a progress bar is printed as
        predictions are made, while if False no progress bar is printed.
        Default = True.

    batch_mode : None
        Currently unused by this function; retained in the signature so
        existing callers that pass it keep working.
        Default = None.

    safe : bool
        Passed through to ``add_attribute``. If set to False, existing
        attributes with the same name are overwritten. If True,
        overwriting will trigger an exception.
        Default = True.

    Returns
    -----------------
    None
        No return type, but the matching Domain objects in the Proteome
        will be annotated with the rg and re attributes.

    """

    # collect the matching domains once, pairing each with the key used
    # to identify it in the batch prediction. Keeping the (key, domain)
    # pairs means the key is built in exactly one place.
    selected = []
    for d in proteome.domains:
        if d.domain_type == domain_type:
            unique_name = d.protein.unique_ID + '_' + d.domain_name
            selected.append((unique_name, d))

    # nothing to annotate, so avoid running the predictor on an
    # empty input
    if not selected:
        return

    # build the dictionary of unique names to domain sequences
    uid2seq = {unique_name: d.sequence for unique_name, d in selected}

    # batch predict dimensions for all selected domains
    rg = batch_predict(uid2seq,
                       network='scaled_rg',
                       gpuid=gpuid,
                       show_progress_bar=show_progress_bar)

    re = batch_predict(uid2seq,
                       network='scaled_re',
                       gpuid=gpuid,
                       show_progress_bar=show_progress_bar)

    # add as attributes to the domains. Element [1] of each returned
    # entry is what gets stored (presumably the predicted value, with
    # element [0] being the sequence - confirm against sparrow).
    for unique_name, d in selected:

        # look both values up before mutating the domain
        rg_val = rg[unique_name][1]
        re_val = re[unique_name][1]

        d.add_attribute(rg_name, rg_val, safe=safe)
        d.add_attribute(re_name, re_val, safe=safe)