Source code for dnachisel.builtin_specifications.AvoidChanges

"""Implementation of AvoidChanges."""

import numpy as np

from ..Specification import Specification, SpecEvaluation

# from .VoidSpecification import VoidSpecification
from ..biotools import (
    sequences_differences_array,
    group_nearby_indices,
)
from ..Location import Location


[docs]class AvoidChanges(Specification): """Specify that some locations of the sequence should not be changed. Shorthand for annotations: "change". Parameters ---------- location Location object indicating the position of the segment that must be left unchanged. Alternatively, indices can be provided. If neither is provided, the assumed location is the whole sequence. indices List of indices that must be left unchanged. target_sequence At the moment, this is rather an internal variable. Do not use unless you're not afraid of side effects. """ localization_interval_length = 6 # used when optimizing the minimize_diffs best_possible_score = 0 enforced_by_nucleotide_restrictions = True shorthand_name = "keep" priority = -1000 def __init__( self, max_edits=0, max_edits_percent=None, location=None, indices=None, target_sequence=None, boost=1.0, ): """Initialize.""" if location is None and (indices is not None): location = (min(indices), max(indices) + 1) self.location = Location.from_data(location) if (self.location is not None) and self.location.strand == -1: self.location.strand = 1 self.indices = np.array(indices) if (indices is not None) else None self.target_sequence = target_sequence self.max_edits = max_edits self.max_edits_percent = max_edits_percent self.boost = boost def extract_subsequence(self, sequence): """Extract a subsequence from the location or indices. Used to initialize the function when the sequence is provided. """ if (self.location is None) and (self.indices is None): return sequence elif self.indices is not None: return "".join(np.array(list(sequence))[self.indices]) else: # self.location is not None: return self.location.extract_sequence(sequence) def initialized_on_problem(self, problem, role=None): """Find out what sequence it is that we are supposed to conserve.""" result = self._copy_with_full_span_if_no_location(problem) L = len(result.location if result.indices is None else result.indices) if result.max_edits_percent is not None: result.max_edits = np.floor(result.max_edits_percent * L / 100.0) result.enforced_by_nucleotide_restrictions = result.max_edits == 0 # Initialize the "target_sequence" in two cases: # - Always at the very beginning # - When the new sequence is bigger than the previous one # (used in CircularDnaOptimizationProblem) if result.target_sequence is None or ( len(result.target_sequence) < len(self.location) ): result = result.copy_with_changes() result.target_sequence = self.extract_subsequence(problem.sequence) return result def evaluate(self, problem): """Return a score equal to -number_of modifications. Locations are "binned" modifications regions. Each bin has a length in nucleotides equal to ``localization_interval_length`.` """ target = self.target_sequence sequence = self.extract_subsequence(problem.sequence) differing_indices = np.nonzero( sequences_differences_array(sequence, target) )[0] if self.indices is not None: differing_indices = self.indices[differing_indices] elif self.location is not None: if self.location.strand == -1: differing_indices = self.location.end - differing_indices else: differing_indices = differing_indices + self.location.start intervals = [ (r[0], r[-1] + 1) for r in group_nearby_indices( differing_indices, max_group_spread=self.localization_interval_length, ) ] locations = [Location(start, end, 1) for start, end in intervals] score = self.max_edits - len(differing_indices) return SpecEvaluation(self, problem, score=score, locations=locations) def localized(self, location, problem=None, with_righthand=False): """Localize the spec to the overlap of its location and the new. """ if self.max_edits != 0: return self start, end = location.start, location.end if self.indices is not None: pos = ((start <= self.indices) & (self.indices < end)).nonzero()[0] new_indices = self.indices[pos] new_target = "".join(np.array(list(self.target_sequence))[pos]) return self.copy_with_changes( indices=new_indices, target_sequence=new_target ) else: new_location = self.location.overlap_region(location) if new_location is None: return None else: new_constraint = self.copy_with_changes(location=new_location) relative_location = new_location + (-self.location.start) new_constraint.target_sequence = relative_location.extract_sequence( self.target_sequence ) return new_constraint def restrict_nucleotides(self, sequence, location=None): """When localizing, forbid any nucleotide but the one already there.""" if self.max_edits or self.max_edits_percent: return [] if location is not None: start = max(location.start, self.location.start) end = min(location.end, self.location.end) else: start, end = self.location.start, self.location.end if self.indices is not None: return [ ((i, i + 1), set([sequence[i : i + 1]])) for i in self.indices if start <= i < end ] else: return [((start, end), set([sequence[start:end]]))] def short_label(self): return "keep" def breach_label(self): return "edits"