# Source code for shephard.apis.uniprot

"""
uniprot.py

From the SHEPHARD package
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder
Ginell & Holehouse, 2020-2022

Handles all I/O associated with uniprot-derived files.

"""

from shephard.exceptions import UtilitiesException
import protfasta
from . import fasta

## ------------------------------------------------------------------------
##
def uniprot_accession_from_line(line):
    """
    Function that extracts the uniprot ID (accession) from the header
    line of a uniprot fasta file. This is an example of the type of
    function that can be passed to quickstart using the extract_unique_id
    argument.

    This function assumes the uniprot-standard format for the header
    has been maintained - i.e. ::

        >xx|ACCESSION|xxxx

    where ACCESSION is the uniprot accession. The header is split on the
    'pipe' character ('|') and the second field is returned with any
    flanking whitespace removed.

    Parameters
    -----------

    line : string
        String where we expect the uniprot ID to be contained within two
        'pipe' characters ('|').

    Returns
    -----------
    string
        Returns the uniprot ID, although this is not formally validated.
        However, assuming the string follows standard uniprot fasta header
        conventions this should be true!

    Raises
    -----------
    UtilitiesException
        If ``line`` contains no pipe character, or is not a string that
        can be split on one.

    """
    try:
        return line.split('|')[1].strip()
    except (AttributeError, IndexError, TypeError) as err:
        # IndexError     -> no '|' in the line, so there is no second field
        # AttributeError -> line has no .split() (e.g. None or a number)
        # TypeError      -> line is bytes-like and cannot be split on a str
        #
        # Only these are caught so that unrelated signals such as
        # KeyboardInterrupt are not converted into a parsing error. An
        # f-string is used because '%' formatting itself fails when the
        # offending value is a tuple, hiding the real problem.
        raise UtilitiesException(f'Unable to parse string [{line}] to identify uniprot ID') from err

        
        
## ------------------------------------------------------------------------
##
def uniprot_fasta_to_proteome(filename, proteome=None, force_overwrite=False, invalid_sequence_action='fail'):
    """
    Stand alone function that builds a Proteome from a standard FASTA
    file downloaded from UniProt.

    This is a thin wrapper around ``fasta.fasta_to_proteome`` that passes
    ``uniprot_accession_from_line`` as the function used to build each
    unique ID. It therefore assumes the uniprot-standard format for the
    header has been maintained - i.e. ::

        >xx|ACCESSION|xxxx

    where ACCESSION is the uniprot accession, which is used as the
    unique_ID.

    Parameters
    ------------
    filename : string
        Name of the FASTA file we're going to parse in. Note the protein
        name will be defined as the full FASTA header for each entry.

    proteome : Proteome
        If a Proteome object is provided the FASTA file will be read and
        added to the existing proteome, whereas if set to None a new
        Proteome will be generated.

    force_overwrite : bool (default = False)
        If this flag is set to True and we encounter a unique_ID that is
        already in the proteome, the newer value overwrites the older one.
        This is mostly useful if you are adding in a file with known
        duplicate entries OR combining multiple FASTA files where you know
        there are some duplications.

    invalid_sequence_action : str (default = 'fail')
        Selector which defines the behaviour if a sequence with a
        non-standard amino acid is encountered. Valid options and their
        meaning are listed below:

        * ``ignore`` - invalid sequences are completely ignored

        * ``fail`` - invalid sequence cause parsing to fail and throw an
          exception

        * ``remove`` - invalid sequences are removed

        * ``convert`` - invalid residues are converted to valid residues

        * ``convert-ignore`` - invalid sequences are converted to valid
          sequences and any remaining invalid residues are ignored.

    Returns
    --------
    Proteome
        Returns an initialized Proteome object

    """
    # The only UniProt-specific behaviour is how the unique ID is derived
    # from each header; everything else is forwarded unchanged.
    parser_options = {
        'proteome': proteome,
        'build_unique_ID': uniprot_accession_from_line,
        'force_overwrite': force_overwrite,
        'invalid_sequence_action': invalid_sequence_action,
    }

    return fasta.fasta_to_proteome(filename, **parser_options)

## ------------------------------------------------------------------------ ##
def uniprot_proteome_to_fasta(filename, proteome):
    """
    Stand alone function that writes a FASTA file from a Proteome under
    the assumption that the Proteome was built from a uniprot FASTA.

    Practically, this just means that the Protein.name variable is used
    as the FASTA header for each entry. The function fails if two
    proteins share the same name, as the headers would then be ambiguous.

    Parameters
    ------------
    filename : string
        Name of the FASTA file we're going to write sequences to

    proteome : Proteome
        The Proteome object from which the FASTA file will be generated

    Returns
    --------
    None
        No return variable but will write to file

    Raises
    --------
    UtilitiesException
        If two proteins in the Proteome have the same name.

    """
    # header -> sequence mapping handed to protfasta for writing
    sequences_by_header = {}

    for protein in proteome:
        header = protein.name

        # first time we see this header: record it and move on
        if header not in sequences_by_header:
            sequences_by_header[header] = protein.sequence
            continue

        # otherwise this header was already used by an earlier protein
        raise UtilitiesException(f'Duplicate name entries found in Proteome ({header}). Should not happen for UniProt headers')

    protfasta.write_fasta(sequences_by_header, filename, linelength=80)