"""
SHEPHARD:
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder
Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (alex.holehouse@wustl.edu, g.ginell@wustl.edu)
Holehouse Lab - Washington University in St. Louis
"""
from . import interface_tools
import shephard.exceptions as shephard_exceptions
from shephard.exceptions import InterfaceException, ProteinException, SiteException
# Upper bound on the number of malformed lines that _SitesInterface will skip
# (printing a warning for each) when skip_bad=True. Once this many lines have
# been skipped, the next malformed line raises an InterfaceException.
MAX_BAD_COUNT = 10
class _SitesInterface:
    """
    Internal parser for SHEPHARD sites files.

    After construction, ``self.data`` holds a dictionary that maps each
    unique_ID found in the file to a list of site dictionaries with the keys
    'position', 'site_type', 'symbol', 'value' and 'attributes'.
    """

    def __init__(self, filename, delimiter='\t', skip_bad=True, preauthorized_uids=None):
        """
        Read a SHEPHARD sites file and populate ``self.data``.

        A SHEPHARD sites file is a tab (or other) delineated file where each
        line has the following convention::

            1          2         3          4       5      [ 6              7           ...  n            ]
            Unique_ID  position  site_type  symbol  value  [ key_1:value_1  key_2:value_2 ... key_n:value_n ]

        The first five columns are required; any number of optional
        key:value pairs may follow. Keys and values must be separated by a
        ':', which is why ':' cannot be used as the column delimiter.

        The position column is cast to int. The value column is cast to
        float, unless it is the literal string 'None', in which case the
        value is stored as None.

        Parameters
        ----------------
        filename : str
            Name of the shephard sites file to read

        delimiter : str (default = '\\t')
            String used as a delimiter on the input file.

        skip_bad : bool (default = True)
            Flag that means if bad lines (lines whose required columns
            trigger an exception while being parsed) are encountered the
            code will just skip them and print a warning. At most
            MAX_BAD_COUNT lines are skipped in this way; any further bad
            line raises an InterfaceException. If False, the first bad line
            raises. Note that the optional key:value columns are parsed
            outside of this skip logic, so exceptions raised while parsing
            them are never skipped.

        preauthorized_uids : list of str (default = None)
            List of unique_IDs that are allowed to be added to the sites
            dictionary. If None then all sites are allowed. Lines with any
            other unique_ID are ignored before their remaining columns are
            parsed.

        Raises
        ----------------
        InterfaceException
            If ':' is passed as the delimiter, or if a bad line is
            encountered and cannot be skipped (see skip_bad).
        """

        if delimiter == ':':
            raise InterfaceException('When parsing site file cannot use ":" as a delimeter because this is used to delimit key/value pairs (if provided)')

        # convert the preauthorized uids to a set for faster lookup
        if preauthorized_uids is not None:
            preauthorized_uids = set(preauthorized_uids)

        bad_count = 0
        ID2site = {}

        with open(filename, 'r') as fh:

            # line numbers are 1-indexed so that error messages match what
            # a text editor would show
            for linecount, line in enumerate(fh, start=1):

                # skip comment lines
                if interface_tools.is_comment_line(line):
                    continue

                sline = line.strip().split(delimiter)

                try:
                    unique_ID = sline[0].strip()

                    # check if UID associated with this line is found in the
                    # preauthorized set. If not then skip this line
                    if preauthorized_uids is not None and unique_ID not in preauthorized_uids:
                        continue

                    position = int(sline[1].strip())
                    site_type = sline[2].strip()
                    symbol = sline[3].strip()

                    # the literal string 'None' lets a file describe a site
                    # that has a symbol but no associated numerical value
                    raw_value = sline[4].strip()
                    value = None if raw_value == 'None' else float(raw_value)

                except Exception as e:

                    msg = f'Failed parsing file [{filename}] on line [{linecount}].\n\nException raised: {str(e)}\n\nline printed below:\n{line}'

                    if skip_bad and bad_count < MAX_BAD_COUNT:
                        bad_count += 1
                        shephard_exceptions.print_warning(msg + f"\nSkipping this line (count {bad_count} of {MAX_BAD_COUNT} ...)")
                        continue

                    # keep the original exception attached as the cause
                    raise InterfaceException(msg) from e

                # any columns beyond the five required ones are key:value
                # attribute pairs
                attributes = {}
                if len(sline) > 5:
                    attributes = interface_tools.parse_key_value_pairs(sline[5:], filename, linecount, line)

                ID2site.setdefault(unique_ID, []).append({'position':position, 'site_type':site_type, 'symbol':symbol, 'value':value, 'attributes':attributes})

        self.data = ID2site
##############################################
## ##
## PUBLIC FACING FUNCTIONS BELOW ##
## ##
##############################################
## ------------------------------------------------------------------------
##
def add_sites_from_file(proteome, filename, delimiter='\t', return_dictionary=False, safe=True, skip_bad=True, verbose=True):
    r"""
    User-facing function that reads a correctly configured SHEPHARD sites
    file and adds the sites it describes to the matching proteins in a
    Proteome.

    A SHEPHARD sites file is a tab (or other) delineated file where each
    line has the following convention::

        1          2         3          4       5      [ 6              7           ...  n            ]
        Unique_ID  position  site_type  symbol  value  [ key_1:value_1  key_2:value_2 ... key_n:value_n ]

    Each line has five required values and can then have as many key:value
    pairs as may be desired.

    Parameters
    ----------
    proteome : Proteome
        Proteome object to which we're adding sites. Only lines whose
        unique_ID appears in ``proteome.proteins`` are parsed, so ONLY
        sites for which a protein is found will be used.

    filename : str
        Name of the shephard site file to be read

    delimiter : str (default = '\\t')
        String used as a delimiter on the input file.

    return_dictionary : bool, default=False
        If set to True, this function returns the parsed sites dictionary
        and does NOT add the sites to the proteome - i.e. the function
        becomes a parser for SHEPHARD-compliant sites files.

    safe : bool (default = True)
        Passed through to add_sites_from_dictionary(). If True, exceptions
        raised while adding a site (i.e. after file parsing) are acted on.
        If False, the offending site is skipped. Site addition can fail
        for various reasons (e.g. the site falls outside of the protein),
        so it is highly recommended to set verbose=True when using
        safe=False so that the skipped sites are reported.

    skip_bad : bool (default = True)
        Flag that means if bad lines (lines that trigger an exception) are
        encountered while parsing the file the code will just skip them.
        This adds a certain robustness to file parsing, but could also
        hide errors. A warning is printed for every skipped line
        (regardless of the verbose flag).

    verbose : bool (default = True)
        Flag that defines how 'loud' output is. Will warn about errors
        on adding sites.

    Returns
    ---------
    None or dict
        If return_dictionary is False (default) there is no return value
        and the sites are added to the Proteome passed as the first
        argument. If return_dictionary is True the parsed sites dictionary
        is returned and the proteome is left untouched.
    """

    # fail early if the first argument is not a proteome
    interface_tools.check_proteome(proteome, 'add_sites_from_file (si_sites)')

    # parse the file, restricting parsing to proteins found in the proteome
    parsed_sites = _SitesInterface(
        filename,
        delimiter=delimiter,
        skip_bad=skip_bad,
        preauthorized_uids=proteome.proteins,
    ).data

    if not return_dictionary:
        # default path: push the parsed sites into the proteome
        add_sites_from_dictionary(proteome, parsed_sites, safe, verbose)
        return None

    # parser-only mode
    return parsed_sites
## ------------------------------------------------------------------------
##
def add_sites_from_dictionary(proteome, sites_dictionary, safe=True, verbose=False):
    """
    Function that takes a correctly formatted Sites dictionary and will add
    those Sites to the proteins in the Proteome.

    Sites dictionaries are key-value pairs, where the key is a unique_ID
    associated with a given Protein, and the value is a list of dictionaries.
    Each subdictionary has the following elements::

        'position'   = site position
        'site_type'  = site type
        'symbol'     = site symbol
        'value'      = site value
        'attributes' = site attribute dictionary (optional)

    In this way, each site that maps to a given unique_ID will be added to
    the associated protein. The use of a list of dictionaries (as opposed
    to a simple unique_ID:site_dictionary pairing) means multiple sites
    for a single protein can be added at once.

    Parameters
    -------------
    proteome : Proteome
        Proteome object to which we're adding sites. Note that ONLY sites
        for which a protein is found will be used. Protein:Site
        cross-referencing is done using the protein's unique_ID
        which should be the key used in the sites_dictionary

    sites_dictionary : dict
        A sites dictionary (defined above) maps a unique_ID to a list of
        dictionaries, where each subdictionary has the elements described
        above. 'position', 'site_type', 'symbol' and 'value' are required;
        if 'attributes' is missing an empty dictionary is used. Position
        and value are passed to the Protein object as-is (no casting is
        done in this function). Extra key-value pairs in each
        subdictionary are ignored.

    safe : bool (default = True)
        If set to True then a ProteinException raised while adding a site
        is acted on by handing it to
        shephard_exceptions.print_and_raise_error(). If set to False, the
        site in question is skipped. There are various reasons site
        addition could fail (notably the position of the site being
        outside of the protein limits), so it is highly recommended that
        if you choose to use safe=False you also set verbose=True so the
        skipped sites are reported.

    verbose : bool (default = False)
        Flag that defines how 'loud' output is. If True (and safe=False) a
        warning is printed for every site that is skipped.

    Returns
    ---------
    None
        No return value, but adds all of the passed sites to the proteins

    Raises
    ---------
    InterfaceException
        If any of the four required keys cannot be read from a site entry.
    """

    for protein in proteome:

        # proteins without an entry in the dictionary are left untouched
        if protein.unique_ID not in sites_dictionary:
            continue

        for site in sites_dictionary[protein.unique_ID]:

            try:
                position = site['position']
                site_type = site['site_type']
                symbol = site['symbol']
                value = site['value']
            except Exception as e:
                raise InterfaceException('When sites dictionary for key [%s] was unable to extract five distinct parametes. Entry is:\n%s\n'% (protein.unique_ID, site)) from e

            # attributes are optional; only a missing key is treated as
            # 'no attributes'
            try:
                ad = site['attributes']
            except KeyError:
                ad = {}

            # assuming we can read all params try and add the site
            try:
                protein.add_site(position, site_type, symbol, value, attributes = ad)

            except ProteinException as e:

                # %s (rather than %i) is used for the position because it
                # has not necessarily been cast to an int at this point, and
                # formatting must not fail while reporting the real error
                msg = '- skipping site %s at %s on %s' % (site_type, position, protein)

                if safe:
                    shephard_exceptions.print_and_raise_error(msg, e)
                elif verbose:
                    shephard_exceptions.print_warning(msg)
## ------------------------------------------------------------------------
##
def write_sites(proteome, filename, delimiter='\t', site_types=None):
    r"""
    Function that writes out sites to file in a standardized format. Note
    that attributes are converted to a string, which for simple attributes
    is reasonable but is not really a viable strategy for complex objects,
    although this will not yield an error.

    If a site_types list is provided, only sites whose site_type matches
    one of the strings in this list are written out.

    Parameters
    -----------
    proteome : Proteome
        Proteome object from which the sites will be extracted

    filename : str
        Filename that will be used to write the new sites file

    delimiter : str (default = '\\t')
        Character (or characters) used to separate between fields.
        Default is the tab character ('\\t'), which is recommended to
        maintain compliance with default SHEPHARD file-reading functions.

    site_types : list of str (default = None)
        If provided, only sites whose site_type is found in this list are
        written. Must be a list (or a subclass of list). If None, all
        sites are written.

    Returns
    --------
    None
        No return type, but generates a new file with the selected sites
        from this proteome written to disk.

    Raises
    --------
    InterfaceException
        If site_types is provided but is not a list. This is checked
        before the output file is opened.
    """

    # ensure site_types is a list if passed; done before opening the file
    # so that an invalid argument does not create or truncate the output
    if site_types is not None and not isinstance(site_types, list):
        raise InterfaceException('When passing a site_type this must be a list')

    with open(filename, 'w') as fh:
        for protein in proteome:
            for s in protein.sites:

                # when filtering, skip sites of a type that was not requested
                if site_types is not None and s.site_type not in site_types:
                    continue

                fh.write(__build_site_line(s, delimiter))
## ------------------------------------------------------------------------
##
def write_sites_from_list(site_list, filename, delimiter='\t'):
    r"""
    Function that writes out sites to a SHEPHARD sites file from a list
    of Site objects.

    Note that attributes are converted to a string, which for simple
    attributes is reasonable but is not really a viable strategy for
    complex objects, although this will not yield an error.

    Parameters
    -----------
    site_list : List of Site objects
        Site objects which will be written. Any iterable is accepted; it
        is consumed exactly once.

    filename : str
        Filename that will be used to write the new sites file

    delimiter : str (default = '\\t')
        Character (or characters) used to separate between fields. Default is
        '\\t' which is recommended to maintain compliance with default
        `add_sites_from_file()` function

    Returns
    --------
    None
        No return type, but generates a new file with one line per site
        in site_list written to disk.
    """

    # materialise once: the sites are walked twice (validate, then write),
    # so a one-shot iterator would otherwise be exhausted by the validation
    # pass and an empty file would be written
    sites = list(site_list)

    # first check every item is a site object, before the file is opened
    for s in sites:
        interface_tools.check_site(s, 'write_sites_from_list')

    with open(filename, 'w') as fh:
        fh.writelines(__build_site_line(s, delimiter) for s in sites)
## ------------------------------------------------------------------------
##
def __build_site_line(s, delimiter):
    """
    Internal function that takes a Site object and returns a line that can
    be written to a Sites file. This is called internally by functions that
    write Sites.

    The line consists of the five required fields (unique_ID of the site's
    protein, position, site_type, symbol, value), each converted with
    str(), followed by one ``key:value`` field per attribute, all joined by
    the delimiter and terminated with a newline.

    Parameters
    ----------------------
    s : shephard.Site
        Site object being converted to a string

    delimiter : str
        Character (or characters) used to separate between fields. This
        argument is required here; the public writing functions pass their
        own delimiter through.

    Returns
    --------------
    str
        Returns a string that is ready to be written to file
    """

    # the five required fields, in file order
    fields = [
        str(s.protein.unique_ID),
        str(s.position),
        str(s.site_type),
        str(s.symbol),
        str(s.value),
    ]

    # optional key:value fields; each attribute value is passed through
    # interface_tools.full_clean_string before being written
    if s.attributes:
        for k in s.attributes:
            atrbt = interface_tools.full_clean_string(s.attribute(k))
            fields.append(f"{k}:{atrbt}")

    # joining means there is no trailing delimiter after the last field
    return delimiter.join(fields) + "\n"