Source code for dnachisel.builtin_specifications.codon_optimization.MatchTargetCodonUsage

import numpy as np
from ...Specification.SpecEvaluation import SpecEvaluation
from ...biotools import dict_to_pretty_string

from .BaseCodonOptimizationClass import BaseCodonOptimizationClass


class MatchTargetCodonUsage(BaseCodonOptimizationClass):
    """Codon-optimize a sequence so it has the same codon usage as a target.

    The objective minimized here is the sum of the discrepancies, over every
    possible triplet ATG, CCG, etc. between the codon frequency of this triplet
    in the sequence, and its frequency in the target organism.

    This method has had several names through the ages. It may have been first
    proposed by Hale and Thompson, 1998. It is called Individual Codon Usage
    Optimization in Chung 2012, Global CAI Harmonization in Mignon 2018, and
    Codon Harmonization in Jayaraj 2005. We didn't call it "harmonization"
    in DNA Chisel to avoid any confusion with the now more common
    host-to-target codon harmonization. See DnaChisel's HarmonizeRCA class
    for Codon Harmonization.

    Parameters
    ----------
    species
      Species for which the sequence will be codon-optimized.
      Either a TaxID (this requires a web connection as the corresponding table
      will be downloaded from the internet) or the name of the species to
      codon-optimize for (the name must be supported by ``python_codon_tables``
      e.g. ``e_coli``, ``s_cerevisiae``, ``h_sapiens``, ``c_elegans``,
      ``b_subtilis``, ``d_melanogaster``).
      Note that a ``codon_usage_table`` can be provided instead, or even in
      addition, for species whose codon usage table cannot be auto-imported.

    location
      Either a DnaChisel Location or a tuple of the form (start, end, strand)
      or just (start, end), with strand defaulting to +1, indicating the
      position of the gene to codon-optimize. If not provided, the whole
      sequence is considered as the gene. The location should have a length
      that is a multiple of 3. The location strand is either 1 if the gene is
      encoded on the (+) strand, or -1 for antisense.

    codon_usage_table
      A dict of the form ``{'*': {"TGA": 0.112, "TAA": 0.68}, 'K': ...}``
      giving the RSCU table (relative usage of each codon). Only provide if
      no ``species`` parameter was provided.

    boost
      Score multiplicator (=weight) for when the specification is used as an
      optimization objective alongside competing objectives.

    References
    ----------
    Hale and Thompson, Codon Optimization of the Gene Encoding a Domain from
    Human Type 1 Neurofibromin Protein... Protein Expression and Purification
    1998.

    Jayaraj et. al. GeMS: an advanced software package for designing synthetic
    genes, Nucleic Acids Research, 2005

    Mignon et. al. Codon harmonization – going beyond the speed limit for
    protein expression. FEBS Lett, 2018

    Chung BK, Lee DY. Computational codon optimization of synthetic gene for
    protein expression. BMC Syst Biol. 2012
    """

    shorthand_name = "match_codon_usage"

    def __init__(
        self, species=None, location=None, codon_usage_table=None, boost=1.0
    ):
        BaseCodonOptimizationClass.__init__(
            self,
            species=species,
            location=location,
            codon_usage_table=codon_usage_table,
            boost=boost,
        )
        # Codon -> amino-acid lookup, used by ``compare_frequencies`` to group
        # codon counts per amino acid. Built by the base class helper.
        self.codons_translations = self.get_codons_translations()

    def codon_usage_matching_stats(self, problem):
        """Return a codon-usage matching score and over-represented codons.

        Parameters
        ----------
        problem
          The problem to evaluate. It is handed to ``self.get_codons`` to
          obtain the list of codons that gets compared to the usage table.

        Returns
        -------
        score, nonoptimal_aa_indices
          ``score`` is a non-positive number equal to
          ``-sum(n_aa * abs(f_codon - e_codon))``, summed over every codon of
          every amino acid present in the sequence, where ``n_aa`` is the
          number of occurrences of the amino acid in the sequence,
          ``f_codon`` is the frequency of the codon among the codons of that
          amino acid in the sequence, and ``e_codon`` is the frequency given
          by the codon usage table. A score of 0 is a perfect match.

          ``nonoptimal_aa_indices`` is a list of the form ``[1, 4, 5, 6...]``.
          A number k in that list indicates that the k-th codon of the
          sequence is over-represented (its frequency in the sequence is
          strictly greater than in the table), so that a synonymous mutation
          of this codon may improve the score.
        """
        codons = self.get_codons(problem)
        codons_positions, aa_comparisons = self.compare_frequencies(codons)
        score = 0
        nonoptimal_aa_indices = []
        for data in aa_comparisons.values():
            # Read (rather than pop) the count so the comparison dicts
            # returned by compare_frequencies are left untouched.
            total = data["total"]
            for codon, codon_freq in data.items():
                if codon == "total":
                    continue
                frequency_diff = codon_freq["sequence"] - codon_freq["table"]
                # Weight by the amino-acid count so that frequent amino acids
                # contribute proportionally more to the score.
                score -= total * abs(frequency_diff)
                if codon_freq["sequence"] > codon_freq["table"]:
                    nonoptimal_aa_indices += codons_positions[codon]
        return score, nonoptimal_aa_indices

    def evaluate(self, problem):
        """Evaluate on a problem.

        Returns a ``SpecEvaluation`` carrying the matching score and the
        locations of the over-represented codons, in random order.
        """
        score, nonoptimal_indices = self.codon_usage_matching_stats(problem)
        locations = self.codons_indices_to_locations(nonoptimal_indices)
        # In-place shuffle: the returned locations are in random order rather
        # than grouped codon by codon.
        np.random.shuffle(locations)
        return SpecEvaluation(
            self,
            problem,
            score=score,
            locations=locations,
            message="Codon opt. on window %s scored %.02E"
            % (self.location, score),
        )

    def localized_on_window(self, new_location, start_codon, end_codon):
        """Relocate without changing much.

        The arguments are ignored and the specification itself is returned,
        so the codon usage is always evaluated on the full location.
        """
        return self

    def label_parameters(self):
        """Return the species name, or "(custom table)" if no species is set."""
        return ["(custom table)" if self.species is None else self.species]

    def compare_frequencies(self, codons, text_mode=False):
        """Return a dict indicating differences between codons frequencies in
        the sequence and in this specifications's codons usage table.

        Parameters
        ----------
        codons
          Iterable of codons (triplets). Every codon must be a key of
          ``self.codons_translations``, otherwise a ``KeyError`` is raised.

        text_mode
          If True, a formatted print-ready string of the comparisons is
          returned instead of the ``(positions, comparisons)`` tuple.

        Returns
        -------
        positions, comparisons
          (if text_mode = False) ``positions`` maps every known codon to the
          list of indices at which it appears in ``codons``. ``comparisons``
          has one entry per amino acid appearing at least once in ``codons``,
          of the form shown in the example below.

        a formatted print-ready string
          (if text_mode = True)

        Examples
        --------
        >>> codons = spec.get_codons(problem)
        >>> print(spec.compare_frequencies(codons, text_mode=True))

        The comparisons have the following structure::

            {
              "K": {
                "total": 6,
                "AAA": {
                    "sequence": 1.0,
                    "table": 0.7
                },
                ...
              },
              "D": ...
            }
        """
        codons_positions = {cod: [] for cod in self.codons_translations}
        for i, codon in enumerate(codons):
            codons_positions[codon].append(i)

        # aa: amino-acid
        # First pass: raw counts per codon, and total count per amino acid.
        codons_frequencies = {
            aa: {"total": 0} for aa in self.codon_usage_table
        }
        for codon, positions in codons_positions.items():
            count = len(positions)
            aa = self.codons_translations[codon]
            codons_frequencies[aa][codon] = count
            codons_frequencies[aa]["total"] += count

        # Second pass: turn counts into frequencies among synonymous codons.
        # max(1, ...) avoids a division by zero for absent amino acids.
        for data in codons_frequencies.values():
            total = max(1, data["total"])
            for codon, value in data.items():
                if codon != "total":
                    data[codon] = value / total

        # Only keep the amino acids that actually occur in the sequence.
        codons_frequencies = {
            aa: data for aa, data in codons_frequencies.items() if data["total"]
        }
        comparisons = {
            aa: {
                "total": seq_data["total"],
                **{
                    codon: {"sequence": seq_data[codon], "table": table_data}
                    for codon, table_data in self.codon_usage_table[aa].items()
                },
            }
            for aa, seq_data in codons_frequencies.items()
        }
        if text_mode:
            return dict_to_pretty_string(comparisons)
        return codons_positions, comparisons

    def short_label(self):
        """Return a short label, suffixed with the species name when set."""
        result = "match-codon-usage"
        if self.species is not None:
            result += " (%s)" % self.species
        return result