Source code for dnachisel.builtin_specifications.AvoidChanges
"""Implementation of AvoidChanges."""
import numpy as np
from ..Specification import Specification, SpecEvaluation
# from .VoidSpecification import VoidSpecification
from ..biotools import (
sequences_differences_array,
group_nearby_indices,
)
from ..Location import Location
[docs]class AvoidChanges(Specification):
"""Specify that some locations of the sequence should not be changed.
Shorthand for annotations: "change".
Parameters
----------
location
Location object indicating the position of the segment that must be
left unchanged. Alternatively,
indices can be provided. If neither is provided, the assumed location
is the whole sequence.
indices
List of indices that must be left unchanged.
target_sequence
At the moment, this is rather an internal variable. Do not use unless
you're not afraid of side effects.
"""
localization_interval_length = 6 # used when optimizing the minimize_diffs
best_possible_score = 0
enforced_by_nucleotide_restrictions = True
shorthand_name = "keep"
priority = -1000
def __init__(
self,
max_edits=0,
max_edits_percent=None,
location=None,
indices=None,
target_sequence=None,
boost=1.0,
):
"""Initialize."""
if location is None and (indices is not None):
location = (min(indices), max(indices) + 1)
self.location = Location.from_data(location)
if (self.location is not None) and self.location.strand == -1:
self.location.strand = 1
self.indices = np.array(indices) if (indices is not None) else None
self.target_sequence = target_sequence
self.max_edits = max_edits
self.max_edits_percent = max_edits_percent
self.boost = boost
def extract_subsequence(self, sequence):
"""Extract a subsequence from the location or indices.
Used to initialize the function when the sequence is provided.
"""
if (self.location is None) and (self.indices is None):
return sequence
elif self.indices is not None:
return "".join(np.array(list(sequence))[self.indices])
else: # self.location is not None:
return self.location.extract_sequence(sequence)
def initialized_on_problem(self, problem, role=None):
"""Find out what sequence it is that we are supposed to conserve."""
result = self._copy_with_full_span_if_no_location(problem)
L = len(result.location if result.indices is None else result.indices)
if result.max_edits_percent is not None:
result.max_edits = np.floor(result.max_edits_percent * L / 100.0)
result.enforced_by_nucleotide_restrictions = result.max_edits == 0
# Initialize the "target_sequence" in two cases:
# - Always at the very beginning
# - When the new sequence is bigger than the previous one
# (used in CircularDnaOptimizationProblem)
if result.target_sequence is None or (
len(result.target_sequence) < len(self.location)
):
result = result.copy_with_changes()
result.target_sequence = self.extract_subsequence(problem.sequence)
return result
def evaluate(self, problem):
"""Return a score equal to -number_of modifications.
Locations are "binned" modifications regions. Each bin has a length
in nucleotides equal to ``localization_interval_length`.`
"""
target = self.target_sequence
sequence = self.extract_subsequence(problem.sequence)
differing_indices = np.nonzero(
sequences_differences_array(sequence, target)
)[0]
if self.indices is not None:
differing_indices = self.indices[differing_indices]
elif self.location is not None:
if self.location.strand == -1:
differing_indices = self.location.end - differing_indices
else:
differing_indices = differing_indices + self.location.start
intervals = [
(r[0], r[-1] + 1)
for r in group_nearby_indices(
differing_indices,
max_group_spread=self.localization_interval_length,
)
]
locations = [Location(start, end, 1) for start, end in intervals]
score = self.max_edits - len(differing_indices)
return SpecEvaluation(self, problem, score=score, locations=locations)
def localized(self, location, problem=None, with_righthand=False):
"""Localize the spec to the overlap of its location and the new.
"""
if self.max_edits != 0:
return self
start, end = location.start, location.end
if self.indices is not None:
pos = ((start <= self.indices) & (self.indices < end)).nonzero()[0]
new_indices = self.indices[pos]
new_target = "".join(np.array(list(self.target_sequence))[pos])
return self.copy_with_changes(
indices=new_indices, target_sequence=new_target
)
else:
new_location = self.location.overlap_region(location)
if new_location is None:
return None
else:
new_constraint = self.copy_with_changes(location=new_location)
relative_location = new_location + (-self.location.start)
new_constraint.target_sequence = relative_location.extract_sequence(
self.target_sequence
)
return new_constraint
def restrict_nucleotides(self, sequence, location=None):
"""When localizing, forbid any nucleotide but the one already there."""
if self.max_edits or self.max_edits_percent:
return []
if location is not None:
start = max(location.start, self.location.start)
end = min(location.end, self.location.end)
else:
start, end = self.location.start, self.location.end
if self.indices is not None:
return [
((i, i + 1), set([sequence[i : i + 1]]))
for i in self.indices
if start <= i < end
]
else:
return [((start, end), set([sequence[start:end]]))]
def short_label(self):
return "keep"
def breach_label(self):
return "edits"