= "ATCCCGGCAGC"
seqA = "ATCCACGGTCAGC"
seqB =pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
align align2mut(align)
[('-', 4, 'A'), ('-', 8, 'T')]
align2mut (align)
Converts a sequence alignment result from Bio.pairwise2.Align.globalms into a list of mutations. Positions are those of the alignment.
seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
align2mut(align)
[('-', 4, 'A'), ('-', 8, 'T')]
mut_rix (mutations)
Reindexes the positions of the mutations to go from their position in the sequence alignment to their position in the original sequence.
seqA = "ATCCCGGCAGC"
seqB = "ATCCACGGTCAGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))
mutations=align2mut(align)
print("Output of align2mut:")
print(mutations)
print("Output of mut_rix:")
print(mut_rix(mutations))
ATCC-CGG-CAGC
|||| ||| ||||
ATCCACGGTCAGC
Score=20
Output of align2mut:
[('-', 4, 'A'), ('-', 8, 'T')]
Output of mut_rix:
[('-', 4, 'A'), ('-', 7, 'T')]
get_mutations (seqA, seqB, match=2, mismatch=-1, gap_open=-1, gap_extend=-0.5)
Aligns two sequences and returns a genotype string. The string is a comma separated list of mutations.
seqA = "ATCCGGCAGCAGGTCGTGAGC"
seqB = "ATCCACGGTCAGCACGTCGTGGC"
align=pairwise2.align.globalms(seqA,seqB, 2, -1, -1, -.5, one_alignment_only=True)[0]
print(format_alignment(*align))
get_mutations(seqA,seqB)
ATC--CGG-CAGCAGGTCGTGAGC
||| ||| |||||.|||||| ||
ATCCACGGTCAGCACGTCGTG-GC
Score=33.5
[('-', 3, 'C'), ('-', 3, 'A'), ('-', 6, 'T'), ('G', 11, 'C'), ('A', 18, '-')]
get_mutations_noalign (seqA, seqB)
Returns a genotype string. The string is a comma separated list of mutations.
mut_to_str (mutations:list)
Converts list of mutations to a comma separated string
'-3C,-3A,-6T,G11C,A18-'
str_to_mut (gen:str)
Converts genotype string to a list of mutations
[['-', 4, 'A'], ['-', 7, 'T'], ['G', 12, 'C']]
genstr_to_seq (genstr, refseq)
reverse_complement (dna)
get_prot_mut (genstr, refseq, frame=0, ori=1)
seqA = "AGCGCTATGCTGCGCGCGTACTGCCGCTAGCTATGCTCAGGCCGATATATGCGAGCA"
seqB = "AGCGCTATGCTGCGCGCGAAAACCCGCTAGCTATGCTCAGGTCGATATATGCGAGCA"
genstr=mut_to_str(get_mutations(seqA,seqB,gap_open=-5))
print(genstr)
get_prot_mut(genstr,seqA,frame=1)
T18A,C20A,T21A,G22C,C41T
'T6K,A7P,A13V'
genstr='A61G,-63T,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)
'D19E,D25E,K26Q,G27R,H28P,K29Q,Y30I,V32S,F33I,E34*,A35S,N36K,T37H,G38W,T39N,E40*,D41R,G42W,Y43L,Q44P,G45R'
genstr='A61G,A79T'
refseq='CGCCTTGGTAGCCATCTTCAGTTCCAGTGTTTGCTTCAAATACTAAGTATTTGTGGCCTTTATCTTCTACGTAGTGAGGATCTCTCAGCGTATGGTTGTCGCCTGAGCTGTAGTTGCCTTCATCGATGAACTGCTGTAC'
ori=-1
frame=0
get_prot_mut(genstr,refseq,frame=frame,ori=ori)
'D19E'
parse_genotypes (genotypes_file)
data_path=get_example_data_dir()
gen_list=parse_genotypes(os.path.join(data_path,"sacB_genotypes.csv"))
for g,n in itertools.islice(gen_list,30,40):
print(n,"\t",g)
20 A72G,A79G
19 A72G,A79T,A91G
17 T67G,A91G
17 A76G,A79T
17 A68C,A72G
17 A111G
16 A68G,A91G
16 A86G,A91T
15 A72G,A91T
15 A79G,A86G
get_aa_mut_list (gen_list, refseq, frame=0, ori=1)
read_ref_file="sacB_ref.fasta"
refseq=next(SeqIO.parse(os.path.join(data_path,read_ref_file),"fasta"))
refseq=str(refseq.seq)
aa_mut_list=get_aa_mut_list(gen_list,refseq,ori=-1)
aa_mut_list[:10]
[('', 43341),
('Y22H', 351),
('H15Q', 277),
('D19E', 246),
('L17P', 200),
('V23A', 162),
('S11P', 117),
('D25E', 113),
('D19E,Y22H', 75),
('T16P', 61)]
downsample_fastq_gz (input_file, output_file, num_reads=10000)
*Downsamples a compressed FASTQ file to the specified number of reads.
Args: input_file (str): Path to the input FASTQ.gz file. output_file (str): Path to the output FASTQ.gz file. num_reads (int, optional): Number of reads to keep. Defaults to 10000.*
get_basename_without_extension (file_path)
*Extracts the basename of a file without the extension.
Args: file_path (str): The path to the file.
Returns: str: The basename of the file without the extension.*
# Example usage
file_path = "C:/Users/John/Documents/my_file.txt"
basename_without_extension = get_basename_without_extension(file_path)
print(basename_without_extension) # Output: my_file
my_file
pickle_save (data_in, file_name_out)
pickle_load (file_name_in)
make_dgr_oligos (target:str, split_number:int)
Split the TR target into the input number and then generates the oligos to order
Type | Details | |
---|---|---|
target | str | TR DNA |
split_number | int | Number of desired splits |
target='CCTCAGATACAAGCCGGCATAAATAATAACATATTCTATGACCATGATAATAGTGTAGGTGCAAACGCCAACGCTAAAAACACTGGAACCATGAACGGTAATACTGCAGGGACGAATATAGCCAAAACTTCT'
make_dgr_oligos(target,4)
['ATAACCTCAGATACAAGCCGGCATAAATAATAACA',
'AATATGTTATTATTTATGCCGGCTTGTATCTGAGG',
'TATTCTATGACCATGATAATAGTGTAGGTGCAA',
'GCGTTTGCACCTACACTATTATCATGGTCATAG',
'ACGCCAACGCTAAAAACACTGGAACCATGAACG',
'TTACCGTTCATGGTTCCAGTGTTTTTAGCGTTG',
'GTAATACTGCAGGGACGAATATAGCCAAAACTTCT',
'CAGAAGAAGTTTTGGCTATATTCGTCCCTGCAGTA']
reverse_comp_geno_list (geno_list:list, ref_seq:str)
Type | Details | |
---|---|---|
geno_list | list | List of genotypes |
ref_seq | str | string of the template sequence |
remove_position (geno, pos_list)
remove_position_list (geno_list, pos_list)
ref_genome='AACGTATACGGCGGAATATTTGCCGAATGCCGTGTGGACGTAAGCGTGAACGTCAGGATCACGTTTCCCCGACCCGCTGGCATGTCAACAATACGGGAGAACACCTGTACCGCCTCGTTCGCCGCGC'
geno_list_test=[('T19C', 176012),
('T19C,T64A,T65A', 169),
('T19C,G36T', 40),
('T19C,T58G,T63A,T64G', 4),
('T19C,A42C', 14),
('T19C,T63G', 13),
('T19C,T52A,T64A,T65A', 19),
('T19C,A41C,A57C,T58C', 1),
('T19C,T52A', 94),
('T19C,T64A,T65G', 214),
('T19C,T63A,T64C,T65A', 2),
('T19C,A49C,T64C,T91C', 2),
('T19C,T32C,T52C,T64A,T65G,T84C,T91A', 1),
('T19C,T82C,T84A', 8),
('T19C,T64C', 308),
('T19C,T40C,G43A,T58A,A71C,T91C', 1),
('T52A,T77C', 1),
('T19C,T52C,T58A', 9),
('T20C,-52C,T58C,A71C,T77C', 1),
('T19C,G70T', 110),
('T19C,T32C,T65C,T82C', 1),
('T19C,T32C,T46C', 4),
('T19C,T32C,A41C,A49C,T64A,T65G', 1),
('T19C,T64A', 115),
('T19C,T58C', 68)]
geno_list_test=remove_position_list(geno_list_test,[19])
reverse_comp_geno_list(geno_list_test,ref_genome)
[('', 176012),
('A61T,A62T', 169),
('C90A', 40),
('A62C,A63T,A68C', 4),
('T84G', 14),
('A63C', 13),
('A61T,A62T,A74T', 19),
('A68G,T69G,T85G', 1),
('A74T', 94),
('A61C,A62T', 214),
('A61T,A62G,A63T', 2),
('A35G,A62G,T77G', 2),
('A35T,A42G,A61C,A62T,A74G,A94G', 1),
('A42T,A44G', 8),
('A62G', 308),
('A35G,T55G,A68T,C83T,A86G', 1),
('A49G,A74T', 1),
('A68T,A74G', 9),
('A49G,T55G,A68G,-74G,A106G', 1),
('C56A', 110),
('A44G,A61G,A94G', 1),
('A80G,A94G', 4),
('A61C,A62T,T77G,T85G,A94G', 1),
('A62T', 115),
('A68G', 68)]