genotypes_paired

Functions to go a fastq file to a list of genotypes

get_UMI_genotype_paired

 get_UMI_genotype_paired (fastq_path_fwd:str, fastq_path_rev:str,
                          ref_seq:str, fwd_span:tuple, rev_span:tuple,
                          fwd_ref_read_size:int=None,
                          rev_ref_read_size:int=None,
                          require_perfect_pair_agreement:bool=True,
                          umi_size_fwd:int=10, umi_size_rev:int=0,
                          quality_threshold:int=30, ignore_pos:list=[],
                          N=None, **kwargs)

	Type	Default	Details
fastq_path_fwd	str		path to the input fastq file reading the ref_seq in the forward orientation
fastq_path_rev	str		path to the input fastq file reading the ref_seq in the reverse orientation
ref_seq	str		sequence of the reference amplicon
fwd_span	tuple		span of the ref_seq that is read in the forward orientation format: (start, end)
rev_span	tuple		span of the ref_seq that is read in the reverse orientation format: (start, end)
fwd_ref_read_size	int	None	number of nucleotides in the fwd read expected to align to the ref_seq. If None the whole read will be used.
rev_ref_read_size	int	None	number of nucleotides in the rev read expected to align to the ref_seq. If None the whole read will be used.
require_perfect_pair_agreement	bool	True	if True only pairs of reads that perfectly agree on the sequence within the common span will be used. If False the fwd sequence will be used. Will be set to False by default if there is no overlap.
umi_size_fwd	int	10	number of nucleotides at the begining of the fwd read that will be used as the UMI
umi_size_rev	int	0	number of nucleotides at the begining of the rev read that will be used as the UMI (if both are provided the umi will be the concatenation of both)
quality_threshold	int	30	threshold value used to filter out reads of poor average quality. Both reads have to pass the threshold.
ignore_pos	list	[]	list of positions that are ignored in the genotype
N	NoneType	None	number of reads to consider (useful to get a quick view of the data without going through the whole fastq files). If None the whole data will be used.
kwargs
Returns	dict		alignment parameters can be passed here (match, mismatch, gap_open, gap_extend)

fastq_path_fwd=os.path.join(data_path,"paired_example1_R2.fastq.gz")
fastq_path_rev=os.path.join(data_path,"paired_example1_R1.fastq.gz")


read_ref_file="sacB_ref.fasta"
ref=next(SeqIO.parse(os.path.join(data_path,read_ref_file),"fasta"))
ref_seq="GAGGAGACGGTGACCTGGGTCCCCTGGCCCCAGTTGTTGAAGTCATACCCTTAGGCACCATAGTATCCAGACGCACAGTAGTACAAGGCAGTGTCCTCAGGTTTCAGGCTGTTCATTTGCAGATACACCGTGTTCTTGGCGTTGTCTTGGG"

UMI_gencounter = get_UMI_genotype_paired(fastq_path_fwd, 
                                         fastq_path_rev, 
                                         ref_seq, 
                                         fwd_span=(0,150), 
                                         rev_span=(0,0),
                                         umi_size_fwd=0,
                                         umi_size_rev=10,
                                         N=10,
                                         ignore_pos=[0,1,2,150,151])

for umi in itertools.islice(UMI_gencounter,20):
    print(umi, list(UMI_gencounter[umi].items()))

n reads:    10
n_reads pass filter:    10
n_reads aligned:    10
n_pairs agree:  0
Number of UMIs: 10
Median number of reads per UMI: 1.0
GGACGCGATA [('T50A,T51G', 1)]
GCTGTATGTT [('T50A,T51G', 1)]
GGGAAGGCGT [('', 1)]
GGCGACAGTG [('', 1)]
CCATGTCGGG [('', 1)]
CNGATGTTTG [('T33G,T34A,T51G', 1)]
GTCACACCAC [('T33C,T34C,T36C,T50A', 1)]
CCAGACTGTT [('', 1)]
CAGGGTTTAT [('T50A,T51G', 1)]
ATGGGTACGG [('T130A', 1)]

source

get_genotypes_paired

 get_genotypes_paired (fastq_path_fwd:str, fastq_path_rev:str,
                       ref_seq:str, fwd_span:tuple, rev_span:tuple,
                       fwd_ref_read_size:int=None,
                       rev_ref_read_size:int=None,
                       require_perfect_pair_agreement:bool=True,
                       umi_size_fwd:int=10, umi_size_rev:int=0,
                       quality_threshold:int=30, ignore_pos:list=[],
                       reads_per_umi_thr:int=0, save_umi_data:str=None,
                       N=None, **kwargs)

Putting things together in a single wrapper function that takes the fastq as input and returns the list of genotypes.

	Type	Default	Details
fastq_path_fwd	str		path to the input fastq file reading the ref_seq in the forward orientation
fastq_path_rev	str		path to the input fastq file reading the ref_seq in the reverse orientation
ref_seq	str		sequence of the reference amplicon
fwd_span	tuple		span of the ref_seq that is read in the forward orientation format: (start, end)
rev_span	tuple		span of the ref_seq that is read in the reverse orientation format: (start, end)
fwd_ref_read_size	int	None	number of nucleotides in the fwd read expected to align to the ref_seq. If None the whole read will be used.
rev_ref_read_size	int	None	number of nucleotides in the rev read expected to align to the ref_seq. If None the whole read will be used.
require_perfect_pair_agreement	bool	True	if True only pairs of reads that perfectly agree on the sequence within the common span will be used. If False the fwd sequence will be used. Will be set to False by default if there is no overlap.
umi_size_fwd	int	10	number of nucleotides at the begining of the fwd read that will be used as the UMI
umi_size_rev	int	0	number of nucleotides at the begining of the rev read that will be used as the UMI (if both are provided the umi will be the concatenation of both)
quality_threshold	int	30	threshold value used to filter out reads of poor average quality
ignore_pos	list	[]	list of positions that are ignored in the genotype
reads_per_umi_thr	int	0	minimum number of reads required to take a UMI into account. Using a number >2 enables to perform error correction for UMIs with multiple reads.
save_umi_data	str	None	path to the csv file where to save the details of the genotypes reads for each UMI. If None the data isn’t saved.
N	NoneType	None	number of reads to consider (useful to get a quick view of the data without going through the whole fastq files). If None the whole data will be used.
kwargs

gen_list = get_genotypes_paired(fastq_path_fwd, 
                        fastq_path_rev, 
                        ref_seq, 
                        fwd_span=(0,150), 
                        rev_span=(0,0),
                        umi_size_fwd=0,
                        umi_size_rev=10,
                        ignore_pos=[0,1,150,151],
                        N=100,
                        save_umi_data="test.csv")
for g in gen_list[:20]:
    print(f"{g[1]}\t{g[0]}")

n reads:    100
n_reads pass filter:    96
n_reads aligned:    94
n_pairs agree:  0
Number of UMIs: 85
Median number of reads per UMI: 1.0
Number of genotypes: 12
49  
23  T50A,T51G
2   T33G,T34A,T51G
2   T33C,T34C,T36C,T50A
2   T37C,T42C,T45A,T50C,T51C
1   T130A
1   A52C
1   G8A
1   T51C
1   A52C,G131C
1   G69A
1   T37C,T42C,T45A,T50C,T51C,T146A