utils

Core utility functions shared across the DGRec package. Includes sequence alignment, genotype string parsing and validation, protein-level mutation analysis, file I/O helpers, and codon/amino acid constants. The is_dgrec() function tests for the adenine bias signature characteristic of DGR mutagenesis (≥70% A mutations, ≥2 mutations).

Alignment and mutation detection

Functions to align sequences and extract mutations.


source

align2mut


def align2mut(
    align
):

Converts a sequence alignment result from Bio.pairwise2.Align.globalms into a list of mutations. Positions are those of the alignment.

seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
align2mut(align)
[('-', 4, 'A'), ('-', 8, 'T')]

source

mut_rix


def mut_rix(
    mutations
):

Reindexes the positions of the mutations to go from their position in the sequence alignment to their position in the original sequence.

seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))

mutations=align2mut(align) 
print("Output of align2mut:")
print(mutations)

print("Output of mut_rix:")
print(mut_rix(mutations))
ATCC-CGG-CAGC
|||| ||| ||||
ATCCACGGTCAGC
  Score=20

Output of align2mut:
[('-', 4, 'A'), ('-', 8, 'T')]
Output of mut_rix:
[('-', 4, 'A'), ('-', 7, 'T')]

source

get_mutations


def get_mutations(
    seqA, seqB, match:int=2, mismatch:int=-1, gap_open:int=-1, gap_extend:float=-0.5
):

Aligns two sequences and returns a list of mutations. Each mutation is a tuple of (reference_base, position, query_base).

seqA = "ATCCGGCAGCAGGTCGTGAGC"
seqB = "ATCCACGGTCAGCACGTCGTGGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))

get_mutations(seqA,seqB)
ATC--CGG-CAGCAGGTCGTGAGC
|||  ||| |||||.|||||| ||
ATCCACGGTCAGCACGTCGTG-GC
  Score=33.5
[('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')]

source

get_mutations_noalign


def get_mutations_noalign(
    seqA, seqB
):

Compares two equal-length sequences and returns a list of mutations without alignment. Each mutation is a tuple of (reference_base, position_str, query_base).

get_mutations_noalign("AGCTATGG","AGCTCTGG")
[('A', '4', 'C')]

Genotype string manipulation

Convert between mutation lists and genotype string format (e.g., A50T,G75C).


source

mut_to_str


def mut_to_str(
    mutations:list
):

Converts list of mutations to a comma separated string

mut_to_str([('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')])
'-3C,-3A,-6T,G11C,A18-'

source

str_to_mut


def str_to_mut(
    gen:str
):

Converts genotype string to a list of mutations

assert(str_to_mut('')==[])
str_to_mut('-4A,-7T,G12C')
[['-', 4, 'A'], ['-', 7, 'T'], ['G', 12, 'C']]

source

is_dgrec


def is_dgrec(
    genotype:str, min_length:int=2, min_A_proportion:float=0.7
)->bool:

Checks if a genotype string is a valid dgrec genotype.

assert is_dgrec('-3C,-3A,-6T,G11C,A18-') == False
assert is_dgrec('A56T,A68G') == True
assert is_dgrec('A56T,A68G,C89G', min_A_proportion=0.6) == True
assert is_dgrec('A56T,A68G,C89G', min_A_proportion=0.7) == False
assert is_dgrec('A56T,A68G,C89G,A198C', min_A_proportion=0.7) == True

Sequence reconstruction

Reconstruct full sequences from reference and genotype strings.


source

genstr_to_seq


def genstr_to_seq(
    genstr, refseq
):

Reconstructs a mutated sequence by applying mutations from a genotype string to a reference sequence. Handles substitutions, insertions, and deletions.

genstr='-3C,-3A,-6T,G11C,A18-'
genstr_to_seq(genstr,seqA)
'ATCCACGGTCAGCACGTCGTGGC'
s="AGCGAGC"
print(len(s)%3)
s[:-(len(s)%3)]
1
'AGCGAG'

source

reverse_complement


def reverse_complement(
    dna
):

Returns the reverse complement of a DNA sequence.

reverse_complement('ATCGATCG')
'CGATCGAT'

Protein-level analysis

Translate DNA-level mutations to amino acid changes.


source

get_prot_mut


def get_prot_mut(
    genstr, refseq, frame:int=0, ori:int=1
):

Translates DNA-level mutations into protein-level mutations. Applies the genotype to the reference, translates both in the given reading frame and orientation, and returns a genotype string of amino acid changes.

seqA = "AGCGCTATGCTGCGCGCGTACTGCCGCTAGCTATGCTCAGGCCGATATATGCGAGCA"
seqB = "AGCGCTATGCTGCGCGCGAAAACCCGCTAGCTATGCTCAGGTCGATATATGCGAGCA"
genstr=mut_to_str(get_mutations(seqA,seqB,gap_open=-5))
print(genstr)
get_prot_mut(genstr,seqA,frame=1)
T18A,C20A,T21A,G22C,C41T
'T6K,A7P,A13V'
genstr='A61G,-63T,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)
'D19E,D25E,K26Q,G27R,H28P,K29Q,Y30I,V32S,F33I,E34*,A35S,N36K,T37H,G38W,T39N,E40*,D41R,G42W,Y43L,Q44P,G45R'
genstr='A61G,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)
'D19E'

File I/O

Read and write genotype data and FASTQ files.


source

parse_genotypes


def parse_genotypes(
    genotypes_file
):

Reads a tab-separated genotypes file and returns a list of (genotype_string, count) tuples.

from dgrec.example_data import get_example_data_dir
data_path=get_example_data_dir()
gen_list=parse_genotypes(os.path.join(data_path,"sacB_genotypes.csv"))
for g,n in itertools.islice(gen_list,30,40):
    print(n,"\t",g)
20   A72G,A79G
19   A72G,A79T,A91G
17   T67G,A91G
17   A76G,A79T
17   A68C,A72G
17   A111G
16   A68G,A91G
16   A86G,A91T
15   A72G,A91T
15   A79G,A86G

source

get_aa_mut_list


def get_aa_mut_list(
    gen_list, refseq, frame:int=0, ori:int=1
):

Converts a DNA genotype list to an amino acid genotype list. Excludes genotypes with indels or Ns. Returns a sorted list of (aa_genotype_string, count) tuples.

read_ref_file="sacB_ref.fasta"
refseq=next(SeqIO.parse(os.path.join(data_path,read_ref_file),"fasta"))
refseq=str(refseq.seq)
aa_mut_list=get_aa_mut_list(gen_list,refseq,ori=-1)
aa_mut_list[:10]
[('', 43341),
 ('Y22H', 351),
 ('H15Q', 277),
 ('D19E', 246),
 ('L17P', 200),
 ('V23A', 162),
 ('S11P', 117),
 ('D25E', 113),
 ('D19E,Y22H', 75),
 ('T16P', 61)]

source

downsample_fastq_gz


def downsample_fastq_gz(
    input_file, output_file, num_reads:int=10000
):

Downsamples a compressed FASTQ file to the specified number of reads.

Args: input_file (str): Path to the input FASTQ.gz file. output_file (str): Path to the output FASTQ.gz file. num_reads (int, optional): Number of reads to keep. Defaults to 10000.

input_file=os.path.join(data_path,"sacB_example.fastq.gz")
output_file="sacB_example_downsampled.fastq.gz"
downsample_fastq_gz(input_file, output_file, num_reads=100)

source

get_basename_without_extension


def get_basename_without_extension(
    file_path
):

Extracts the basename of a file without the extension.

Args: file_path (str): The path to the file.

Returns: str: The basename of the file without the extension.

# Example usage
file_path = "C:/Users/John/Documents/my_file.txt"
basename_without_extension = get_basename_without_extension(file_path)
print(basename_without_extension)  # Output: my_file
my_file

source

pickle_save


def pickle_save(
    data_in, file_name_out
):

Saves data to a file using pickle.


source

pickle_load


def pickle_load(
    file_name_in
):

Loads data from a pickle file.

test_data = {'key': [1, 2, 3], 'value': 'test'}
with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
    tmp_path = f.name
pickle_save(test_data, tmp_path)
loaded = pickle_load(tmp_path)
print(loaded)
os.remove(tmp_path)
{'key': [1, 2, 3], 'value': 'test'}

source

make_dgr_oligos


def make_dgr_oligos(
    target:str, # TR DNA
    split_number:int, # Number of desired splits
):

Split the TR target into the input number and then generates the oligos to order

target='CCTCAGATACAAGCCGGCATAAATAATAACATATTCTATGACCATGATAATAGTGTAGGTGCAAACGCCAACGCTAAAAACACTGGAACCATGAACGGTAATACTGCAGGGACGAATATAGCCAAAACTTCT'
make_dgr_oligos(target,4)
['ATAACCTCAGATACAAGCCGGCATAAATAATAACA',
 'AATATGTTATTATTTATGCCGGCTTGTATCTGAGG',
 'TATTCTATGACCATGATAATAGTGTAGGTGCAA',
 'GCGTTTGCACCTACACTATTATCATGGTCATAG',
 'ACGCCAACGCTAAAAACACTGGAACCATGAACG',
 'TTACCGTTCATGGTTCCAGTGTTTTTAGCGTTG',
 'GTAATACTGCAGGGACGAATATAGCCAAAACTTCT',
 'CAGAAGAAGTTTTGGCTATATTCGTCCCTGCAGTA']

Genotype filtering

Filter mutations by position.


source

reverse_comp_geno_list


def reverse_comp_geno_list(
    geno_list:list, # List of genotypes
    ref_seq:str, # string of the template sequence
):

Converts a genotype list to its reverse complement. Mirrors mutation positions and complements bases relative to the reference sequence length.


source

remove_position


def remove_position(
    geno, pos_list
):

Removes mutations at the specified positions from a genotype string.

remove_position('A50T,G75C,T150A', [75])
'A50T,T150A'

source

remove_position_list


def remove_position_list(
    geno_list, pos_list
):

Removes mutations at specified positions from all genotypes in a list.

ref_genome='AACGTATACGGCGGAATATTTGCCGAATGCCGTGTGGACGTAAGCGTGAACGTCAGGATCACGTTTCCCCGACCCGCTGGCATGTCAACAATACGGGAGAACACCTGTACCGCCTCGTTCGCCGCGC'
geno_list_test=[('T19C', 176012),
 ('T19C,T64A,T65A', 169),
 ('T19C,G36T', 40),
 ('T19C,T58G,T63A,T64G', 4),
 ('T19C,A42C', 14),
 ('T19C,T63G', 13),
 ('T19C,T52A,T64A,T65A', 19),
 ('T19C,A41C,A57C,T58C', 1),
 ('T19C,T52A', 94),
 ('T19C,T64A,T65G', 214),
 ('T19C,T63A,T64C,T65A', 2),
 ('T19C,A49C,T64C,T91C', 2),
 ('T19C,T32C,T52C,T64A,T65G,T84C,T91A', 1),
 ('T19C,T82C,T84A', 8),
 ('T19C,T64C', 308),
 ('T19C,T40C,G43A,T58A,A71C,T91C', 1),
 ('T52A,T77C', 1),
 ('T19C,T52C,T58A', 9),
 ('T20C,-52C,T58C,A71C,T77C', 1),
 ('T19C,G70T', 110),
 ('T19C,T32C,T65C,T82C', 1),
 ('T19C,T32C,T46C', 4),
 ('T19C,T32C,A41C,A49C,T64A,T65G', 1),
 ('T19C,T64A', 115),
 ('T19C,T58C', 68)]

geno_list_test=remove_position_list(geno_list_test,[19])
reverse_comp_geno_list(geno_list_test,ref_genome)
[('', 176012),
 ('A61T,A62T', 169),
 ('C90A', 40),
 ('A62C,A63T,A68C', 4),
 ('T84G', 14),
 ('A63C', 13),
 ('A61T,A62T,A74T', 19),
 ('A68G,T69G,T85G', 1),
 ('A74T', 94),
 ('A61C,A62T', 214),
 ('A61T,A62G,A63T', 2),
 ('A35G,A62G,T77G', 2),
 ('A35T,A42G,A61C,A62T,A74G,A94G', 1),
 ('A42T,A44G', 8),
 ('A62G', 308),
 ('A35G,T55G,A68T,C83T,A86G', 1),
 ('A49G,A74T', 1),
 ('A68T,A74G', 9),
 ('A49G,T55G,A68G,-74G,A106G', 1),
 ('C56A', 110),
 ('A44G,A61G,A94G', 1),
 ('A80G,A94G', 4),
 ('A61C,A62T,T77G,T85G,A94G', 1),
 ('A62T', 115),
 ('A68G', 68)]

Codon and amino acid constants

Codon conversion

Convert between nucleotide sequences, codons, and amino acids.


source

nt_prot


def nt_prot(
    nt
):

Converts a nucleotide sequence into a sequence of amino acids.


source

codons_to_aa


def codons_to_aa(
    codons
):

Converts a list of codons into a sequence of amino acids.


source

nt_to_codons


def nt_to_codons(
    nt_sequence:str
):

Converts a nucleotide sequence into a list of codons.

# Demo: codon functions
nt_to_codons('ATGGCATGA')
['ATG', 'GCA', 'TGA']