utils


source

align2mut

 align2mut (align)

Converts a sequence alignment result from Bio.pairwise2.Align.globalms into a list of mutations. Positions are those of the alignment.

seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
align2mut(align)
[('-', 4, 'A'), ('-', 8, 'T')]

source

mut_rix

 mut_rix (mutations)

Reindexes the positions of the mutations to go from their position in the sequence alignment to their position in the original sequence.

seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))

mutations=align2mut(align) 
print("Output of align2mut:")
print(mutations)

print("Output of mut_rix:")
print(mut_rix(mutations))
ATCC-CGG-CAGC
|||| ||| ||||
ATCCACGGTCAGC
  Score=20

Output of align2mut:
[('-', 4, 'A'), ('-', 8, 'T')]
Output of mut_rix:
[('-', 4, 'A'), ('-', 7, 'T')]

source

get_mutations

 get_mutations (seqA, seqB, match=2, mismatch=-1, gap_open=-1,
                gap_extend=-0.5)

Aligns two sequences and returns a genotype string. The string is a comma separated list of mutations.

seqA = "ATCCGGCAGCAGGTCGTGAGC"
seqB = "ATCCACGGTCAGCACGTCGTGGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))

get_mutations(seqA,seqB)
ATC--CGG-CAGCAGGTCGTGAGC
|||  ||| |||||.|||||| ||
ATCCACGGTCAGCACGTCGTG-GC
  Score=33.5
[('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')]

source

get_mutations_noalign

 get_mutations_noalign (seqA, seqB)

Returns a genotype string. The string is a comma separated list of mutations.

get_mutations_noalign("AGCTATGG","AGCTCTGG")
[('A', '4', 'C')]

source

mut_to_str

 mut_to_str (mutations:list)

Converts list of mutations to a comma separated string

mut_to_str([('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')])
'-3C,-3A,-6T,G11C,A18-'

source

str_to_mut

 str_to_mut (gen:str)

Converts genotype string to a list of mutations

assert(str_to_mut('')==[])
str_to_mut('-4A,-7T,G12C')
[['-', 4, 'A'], ['-', 7, 'T'], ['G', 12, 'C']]

source

genstr_to_seq

 genstr_to_seq (genstr, refseq)
genstr='-3C,-3A,-6T,G11C,A18-'
genstr_to_seq(genstr,seqA)
'ATCCACGGTCAGCACGTCGTGGC'
s="AGCGAGC"
print(len(s)%3)
s[:-(len(s)%3)]
1
'AGCGAG'

source

reverse_complement

 reverse_complement (dna)

source

get_prot_mut

 get_prot_mut (genstr, refseq, frame=0, ori=1)
seqA = "AGCGCTATGCTGCGCGCGTACTGCCGCTAGCTATGCTCAGGCCGATATATGCGAGCA"
seqB = "AGCGCTATGCTGCGCGCGAAAACCCGCTAGCTATGCTCAGGTCGATATATGCGAGCA"
genstr=mut_to_str(get_mutations(seqA,seqB,gap_open=-5))
print(genstr)
get_prot_mut(genstr,seqA,frame=1)
T18A,C20A,T21A,G22C,C41T
'T6K,A7P,A13V'
genstr='A61G,-63T,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)
'D19E,D25E,K26Q,G27R,H28P,K29Q,Y30I,V32S,F33I,E34*,A35S,N36K,T37H,G38W,T39N,E40*,D41R,G42W,Y43L,Q44P,G45R'
genstr='A61G,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)
'D19E'

source

parse_genotypes

 parse_genotypes (genotypes_file)
from dgrec.example_data import get_example_data_dir
data_path=get_example_data_dir()
gen_list=parse_genotypes(os.path.join(data_path,"sacB_genotypes.csv"))
for g,n in itertools.islice(gen_list,30,40):
    print(n,"\t",g)
20   A72G,A79G
19   A72G,A79T,A91G
17   T67G,A91G
17   A76G,A79T
17   A68C,A72G
17   A111G
16   A68G,A91G
16   A86G,A91T
15   A72G,A91T
15   A79G,A86G

source

get_aa_mut_list

 get_aa_mut_list (gen_list, refseq, frame=0, ori=1)
read_ref_file="sacB_ref.fasta"
refseq=next(SeqIO.parse(os.path.join(data_path,read_ref_file),"fasta"))
refseq=str(refseq.seq)
aa_mut_list=get_aa_mut_list(gen_list,refseq,ori=-1)
aa_mut_list[:10]
[('', 43341),
 ('Y22H', 351),
 ('H15Q', 277),
 ('D19E', 246),
 ('L17P', 200),
 ('V23A', 162),
 ('S11P', 117),
 ('D25E', 113),
 ('D19E,Y22H', 75),
 ('T16P', 61)]

source

downsample_fastq_gz

 downsample_fastq_gz (input_file, output_file, num_reads=10000)

*Downsamples a compressed FASTQ file to the specified number of reads.

Args: input_file (str): Path to the input FASTQ.gz file. output_file (str): Path to the output FASTQ.gz file. num_reads (int, optional): Number of reads to keep. Defaults to 10000.*

input_file=os.path.join(data_path,"sacB_example.fastq.gz")
output_file="sacB_example_downsampled.fastq.gz"
downsample_fastq_gz(input_file, output_file, num_reads=100)

source

get_basename_without_extension

 get_basename_without_extension (file_path)

*Extracts the basename of a file without the extension.

Args: file_path (str): The path to the file.

Returns: str: The basename of the file without the extension.*

# Example usage
file_path = "C:/Users/John/Documents/my_file.txt"
basename_without_extension = get_basename_without_extension(file_path)
print(basename_without_extension)  # Output: my_file
my_file

source

pickle_save

 pickle_save (data_in, file_name_out)

source

pickle_load

 pickle_load (file_name_in)

source

make_dgr_oligos

 make_dgr_oligos (target:str, split_number:int)

Split the TR target into the input number and then generates the oligos to order

Type Details
target str TR DNA
split_number int Number of desired splits
target='CCTCAGATACAAGCCGGCATAAATAATAACATATTCTATGACCATGATAATAGTGTAGGTGCAAACGCCAACGCTAAAAACACTGGAACCATGAACGGTAATACTGCAGGGACGAATATAGCCAAAACTTCT'
make_dgr_oligos(target,4)
['ATAACCTCAGATACAAGCCGGCATAAATAATAACA',
 'AATATGTTATTATTTATGCCGGCTTGTATCTGAGG',
 'TATTCTATGACCATGATAATAGTGTAGGTGCAA',
 'GCGTTTGCACCTACACTATTATCATGGTCATAG',
 'ACGCCAACGCTAAAAACACTGGAACCATGAACG',
 'TTACCGTTCATGGTTCCAGTGTTTTTAGCGTTG',
 'GTAATACTGCAGGGACGAATATAGCCAAAACTTCT',
 'CAGAAGAAGTTTTGGCTATATTCGTCCCTGCAGTA']

source

reverse_comp_geno_list

 reverse_comp_geno_list (geno_list:list, ref_seq:str)
Type Details
geno_list list List of genotypes
ref_seq str string of the template sequence

source

remove_position

 remove_position (geno, pos_list)

source

remove_position_list

 remove_position_list (geno_list, pos_list)
ref_genome='AACGTATACGGCGGAATATTTGCCGAATGCCGTGTGGACGTAAGCGTGAACGTCAGGATCACGTTTCCCCGACCCGCTGGCATGTCAACAATACGGGAGAACACCTGTACCGCCTCGTTCGCCGCGC'
geno_list_test=[('T19C', 176012),
 ('T19C,T64A,T65A', 169),
 ('T19C,G36T', 40),
 ('T19C,T58G,T63A,T64G', 4),
 ('T19C,A42C', 14),
 ('T19C,T63G', 13),
 ('T19C,T52A,T64A,T65A', 19),
 ('T19C,A41C,A57C,T58C', 1),
 ('T19C,T52A', 94),
 ('T19C,T64A,T65G', 214),
 ('T19C,T63A,T64C,T65A', 2),
 ('T19C,A49C,T64C,T91C', 2),
 ('T19C,T32C,T52C,T64A,T65G,T84C,T91A', 1),
 ('T19C,T82C,T84A', 8),
 ('T19C,T64C', 308),
 ('T19C,T40C,G43A,T58A,A71C,T91C', 1),
 ('T52A,T77C', 1),
 ('T19C,T52C,T58A', 9),
 ('T20C,-52C,T58C,A71C,T77C', 1),
 ('T19C,G70T', 110),
 ('T19C,T32C,T65C,T82C', 1),
 ('T19C,T32C,T46C', 4),
 ('T19C,T32C,A41C,A49C,T64A,T65G', 1),
 ('T19C,T64A', 115),
 ('T19C,T58C', 68)]

geno_list_test=remove_position_list(geno_list_test,[19])
reverse_comp_geno_list(geno_list_test,ref_genome)
[('', 176012),
 ('A61T,A62T', 169),
 ('C90A', 40),
 ('A62C,A63T,A68C', 4),
 ('T84G', 14),
 ('A63C', 13),
 ('A61T,A62T,A74T', 19),
 ('A68G,T69G,T85G', 1),
 ('A74T', 94),
 ('A61C,A62T', 214),
 ('A61T,A62G,A63T', 2),
 ('A35G,A62G,T77G', 2),
 ('A35T,A42G,A61C,A62T,A74G,A94G', 1),
 ('A42T,A44G', 8),
 ('A62G', 308),
 ('A35G,T55G,A68T,C83T,A86G', 1),
 ('A49G,A74T', 1),
 ('A68T,A74G', 9),
 ('A49G,T55G,A68G,-74G,A106G', 1),
 ('C56A', 110),
 ('A44G,A61G,A94G', 1),
 ('A80G,A94G', 4),
 ('A61C,A62T,T77G,T85G,A94G', 1),
 ('A62T', 115),
 ('A68G', 68)]