# Source code for shephard.tools.sequence_tools

"""
SHEPHARD: 
Sequence-based Hierarchical and Extendable Platform for High-throughput Analysis of Region of Disorder

Authors: Garrett M. Ginell & Alex S. Holehouse
Contact: (g.ginell@wustl.edu)

Holehouse Lab - Washington University in St. Louis
"""
import re

## ------------------------------------------------------------------------
##
def build_mega_string(object_list, return_as_list=False):
    """
    Combine the sequences of a collection of objects into one long string
    (a "megastring"), or optionally return them as a list.

    Every element of ``object_list`` must expose a ``.sequence`` attribute
    (for example Protein or Domain objects). The megastring is useful for
    simple statistical analysis of composition.

    Parameters
    ----------
    object_list : list
        Objects that each have a ``.sequence`` attribute, for example a
        list of Domains or a list of Proteins.

    return_as_list : bool
        If True, return a list holding each object's sequence (in input
        order) instead of a single concatenated string. Default is False.

    Returns
    ----------
    str or list
        By default, a single str made by concatenating every
        ``obj.sequence`` in order (an empty str for an empty input).
        If ``return_as_list`` is set, a list of the individual sequences.
    """

    sequences = [obj.sequence for obj in object_list]

    # Honour the documented flag: hand back the individual sequences
    # rather than the concatenated megastring.
    if return_as_list:
        return sequences

    # str.join builds the result in one pass instead of repeatedly
    # re-allocating a growing string inside a loop.
    return ''.join(sequences)
## ------------------------------------------------------------------------ ##
def find_string_positions(query, target, protein_indexing=True):
    """
    Locate every start position at which ``query`` occurs in ``target``,
    including occurrences that overlap one another.

    Positions are reported with 1-based indexing by default so they line
    up with protein sequence numbering. Pass ``protein_indexing=False`` to
    get ordinary 0-based Python string indices instead.

    The query is interpolated into a regular expression and matched with
    the ``re`` module, scanning the target left to right, so a regular
    expression can be passed in place of a plain string.

    Examples
    ----------------
    Convenient regular expression syntax includes:

    1. ``'.'`` as a wildcard (e.g. ``'L.P'`` matches an L and a P
       separated by any single character).

    2. A character class such as ``'[AC]'`` to match any one residue from
       a set (here A or C).

    The Python ``re`` module supports considerably richer patterns.

    Parameters
    --------------
    query : str
        The search query (interpreted as a regular expression).

    target : str
        The string searched for one or more occurrences of the query.

    protein_indexing : bool
        When exactly ``True`` the first character of ``target`` is
        position 1; for any other value positions start at 0.

    Returns
    ----------
    list
        Start positions of every match, in left-to-right order. Empty if
        the query does not occur.
    """

    # A zero-width lookahead consumes no characters, which is what lets
    # overlapping occurrences each be reported.
    pattern = '(?=%s)' % (query)

    # Identity comparison on purpose: only the literal True selects
    # 1-based numbering.
    shift = 1 if protein_indexing is True else 0

    positions = []
    for match in re.finditer(pattern, target):
        positions.append(match.start() + shift)

    return positions