Source code for dnachisel.biotools.random_sequences
"""Methods to generate random sequences.
Also see utils.random_compatible_sequence() for random sequences verifying
user-provided constraints.
"""
import numpy as np
def random_dna_sequence(length, gc_share=None, probas=None, seed=None):
    """Return a random DNA sequence ("ATGGCGT...") with the specified length.

    Parameters
    ----------
    length
      Length of the DNA sequence.

    gc_share
      Overall frequency of G and C nucleotides, between 0 and 1 (for instance
      ``gc_share=0.6`` gives G and C a probability of 0.3 each and A and T a
      probability of 0.2 each). When provided, it takes precedence over
      ``probas``.

    probas
      Frequencies for the different nucleotides, for instance
      ``probas={"A":0.2, "T":0.3, "G":0.3, "C":0.2}``.
      If neither ``probas`` nor ``gc_share`` is specified, all nucleotides
      are equiprobable (p=0.25).

    seed
      The seed to feed to the random number generator. When a seed is provided
      the random results depend deterministically on the seed, thus enabling
      reproducibility. Note that this re-seeds Numpy's global random number
      generator.

    Returns
    -------
    sequence
      A string of ``length`` nucleotides.

    Raises
    ------
    ValueError
      If ``gc_share`` is not between 0 and 1.
    """
    if seed is not None:
        np.random.seed(seed)
    if gc_share is not None:
        # Fail early with an explicit message rather than letting Numpy
        # complain about negative probabilities further down.
        if not 0 <= gc_share <= 1:
            raise ValueError(
                "gc_share must be between 0 and 1, got %r" % (gc_share,)
            )
        g_or_c = gc_share / 2.0
        not_g_or_c = (1 - gc_share) / 2.0
        # gc_share overrides any user-provided ``probas``.
        probas = {"G": g_or_c, "C": g_or_c, "A": not_g_or_c, "T": not_g_or_c}
    if probas is None:
        sequence = np.random.choice(list("ATCG"), length)
    else:
        # Split the mapping into two aligned tuples (letters, frequencies).
        bases, frequencies = zip(*probas.items())
        sequence = np.random.choice(bases, length, p=frequencies)
    return "".join(sequence)
def random_protein_sequence(length, seed=None):
    """Return a random protein sequence "MNQTW...YL*" of the specified length.

    Parameters
    ----------
    length
      Length of the protein sequence (in number of amino-acids). Note that the
      sequence will always start with ``"M"`` and end with a stop codon ``"*"``
      with (length-2) random amino-acids in the middle. Must therefore be at
      least 2.

    seed
      The seed to feed to the random number generator. When a seed is provided
      the random results depend deterministically on the seed, thus enabling
      reproducibility. Note that this re-seeds Numpy's global random number
      generator.

    Returns
    -------
    sequence
      A string of ``length`` characters: ``"M"``, then (length-2) random
      amino-acid letters, then ``"*"``.

    Raises
    ------
    ValueError
      If ``length`` is smaller than 2.
    """
    if seed is not None:
        np.random.seed(seed)
    # The leading "M" and trailing "*" already take two positions; without
    # this check Numpy fails with an obscure "negative dimensions" error.
    if length < 2:
        raise ValueError(
            "length must be at least 2 (start 'M' + stop '*'), got %r"
            % (length,)
        )
    # Alphabet for the inner positions. It has 19 letters and no "M", so
    # methionine only appears at the start.
    # NOTE(review): presumably intentional - confirm.
    aa_list = list("ACEDGFIHKLNQPSRTWVY")
    aa_choices = np.random.choice(aa_list, length - 2)
    return "M" + "".join(aa_choices) + "*"