Source code for dnachisel.builtin_specifications.AvoidBlastMatches

"""Implementation of AvoidBlastMatches."""

from ..Specification import Specification, SpecEvaluation

# from .VoidSpecification import VoidSpecification
from ..biotools import blast_sequence
from ..Location import Location


[docs]class AvoidBlastMatches(Specification): """Enforce that the sequence has no BLAST matches with a given database. WARNING: try using AvoidMatches instead, it is much better!! Uses NCBI Blast+. Only local BLAST is supported/tested as for now Parameters ---------- blast_db Path to a local BLAST database. These databases can be obtained with NCBI's `makeblastdb`. Omit the extension, e.g. `ecoli_db/ecoli_db`. word_size Word size used by the BLAST algorithm perc_identity Minimal percentage of identity for BLAST matches. 100 means that only perfect matches are considered. num_alignments Number alignments num_threads Number of threads/CPU cores to use for the BLAST algorithm. min_align_length Minimal length that an alignment should have to be considered. """ priority = -2 best_possible_score = 0 blasts_paths = {} def __init__( self, blast_db=None, sequences=None, word_size=4, perc_identity=100, num_alignments=100000, num_threads=3, min_align_length=20, ungapped=True, e_value=1e80, culling_limit=1, location=None, ): """Initialize.""" self.blast_db = blast_db self.sequences = sequences self.word_size = word_size self.perc_identity = perc_identity self.num_alignments = num_alignments self.num_threads = num_threads self.min_align_length = min_align_length self.location = Location.from_data(location) self.e_value = e_value self.ungapped = ungapped self.culling_limit = culling_limit def initialized_on_problem(self, problem, role=None): return self._copy_with_full_span_if_no_location(problem) def evaluate(self, problem): """Score as (-total number of blast identities in matches).""" location = self.location if location is None: location = Location(0, len(problem.sequence)) sequence = location.extract_sequence(problem.sequence) blast_record = blast_sequence( sequence, blast_db=self.blast_db, subject_sequences=self.sequences, word_size=self.word_size, perc_identity=self.perc_identity, num_alignments=self.num_alignments, num_threads=self.num_threads, ungapped=self.ungapped, e_value=self.e_value, culling_limit=self.culling_limit, task="megablast" ) if isinstance(blast_record, list): alignments = [ alignment for rec in blast_record for alignment in rec.alignments ] else: alignments = blast_record.alignments query_hits = [ ( min(hit.query_start, hit.query_end) + location.start - 1, max(hit.query_start, hit.query_end) + location.start, 1 - 2 * (hit.query_start > hit.query_end), hit.identities, ) for alignment in alignments for hit in alignment.hsps ] locations = sorted( [ (start, end, ids) for (start, end, strand, ids) in query_hits if (end - start) >= self.min_align_length ] ) score = -sum([ids for start, end, ids in locations]) locations = [Location(start, end) for start, end, ids in locations] if locations == []: return SpecEvaluation( self, problem, score=1, message="Passed: no BLAST match found" ) return SpecEvaluation( self, problem, score=score, locations=locations, message="Failed - %s matches at %s" % (len(locations), locations), ) def localized(self, location, problem=None, with_righthand=True): """Localize the evaluation.""" new_location = self.location.overlap_region(location) if new_location is None: return None new_location = location.extended( self.min_align_length - 1, right=with_righthand ) return self.copy_with_changes(location=new_location) def feature_label_parameters(self): return [self.blast_db]