utils

This module contains utility functions for downloading files and for parsing GFF annotation files.

download_file

 download_file (url, save_path)

Checks whether a file with the same name already exists at save_path. If not, downloads it from url.

source

is_gzipped_file

 is_gzipped_file (file_path)

source

default_open_gz

 default_open_gz (gff_path)

If the file is gzipped, opens it with gzip.open; otherwise, opens it with open.

# Example usage
file_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz'
human_genome_gff = 'GRCh38_latest_genomic.gff.gz'

download_file(file_url, human_genome_gff)
is_gzipped_file(human_genome_gff)

File already exists: GRCh38_latest_genomic.gff.gz

True

source

extract_attribute

 extract_attribute (input_str:str, attr_name:str)

Extracts the attribute called attr_name from the GFF attributes string

	Type	Details
input_str	str	attribute string to parse
attr_name	str	name of the attribute to extract
Returns	str

input_str = 'ID=cds-ATV02827.1;Parent=gene-SaO11_00001;Dbxref=NCBI_GP:ATV02827.1;Name=ATV02827.1;gbkey=CDS;gene=dnaA;locus_tag=SaO11_00001;product=Chromosomal replication initiator protein DnaA;protein_id=ATV02827.1;transl_table=11'
extract_attribute(input_str,"gene")

'dnaA'

source

extract_all_attributes

 extract_all_attributes (input_str:str)

Extracts all attributes from the GFF attributes column

extract_all_attributes(input_str)

defaultdict(None,
            {'locus_tag': 'SaO11_00001',
             'product': 'Chromosomal replication initiator protein DnaA',
             'protein_id': 'ATV02827.1',
             'transl_table': '11'})

source

attributes_to_columns

 attributes_to_columns (features:pandas.core.frame.DataFrame)

source

set_positions

 set_positions (annotation:pandas.core.frame.DataFrame)

Sets left and right as the positions of the feature on the sequence; left is always lower than right. start and end represent the beginning and end of the feature, where start can be greater than end depending on the feature strand.

	Type	Details
annotation	DataFrame	an annotation DataFrame extracted from a gff file

source

parse_gff

 parse_gff (gff_path:str, seq_id:str=None, bounds:tuple=None,
            feature_types:list=None)

	Type	Default	Details
gff_path	str		path to the gff file
seq_id	str	None	sequence id (first column of the gff)
bounds	tuple	None	(left limit, right limit)
feature_types	list	None	list of feature types to extract
Returns	DataFrame

df=parse_gff(human_genome_gff, 
             seq_id="NC_000001.11",
             bounds=(10000,50000))
df.head()

	seq_id	source	type	start	end	score	strand	phase	attributes	matchable_bases	...	mol_type	pseudo	assembly_bases_seq	pct_coverage	bit_score	genome	pct_identity_gapopen_only	left	right	middle
0	NC_000001.11	RefSeq	region	1	248956422	.	+	.	ID=NC_000001.11:1..248956422;Dbxref=taxon:9606...	None	...	genomic DNA	None	None	None	None	chromosome	None	1	248956422	124478211.5
1	NC_000001.11	BestRefSeq	pseudogene	11874	14409	.	+	.	ID=gene-DDX11L1;Dbxref=GeneID:100287102,HGNC:H...	None	...	None	true	None	None	None	None	None	11874	14409	13141.5
2	NC_000001.11	BestRefSeq	transcript	11874	14409	.	+	.	ID=rna-NR_046018.2;Parent=gene-DDX11L1;Dbxref=...	None	...	None	true	None	None	None	None	None	11874	14409	13141.5
3	NC_000001.11	BestRefSeq	exon	11874	12227	.	+	.	ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;D...	None	...	None	true	None	None	None	None	None	11874	12227	12050.5
4	NC_000001.11	BestRefSeq	exon	12613	12721	.	+	.	ID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;D...	None	...	None	true	None	None	None	None	None	12613	12721	12667.0

5 rows × 56 columns

source

available_feature_types

 available_feature_types (gff_path)

from genomenotebook.data import get_example_data_dir
import os

data_path = get_example_data_dir()
gff_path = os.path.join(data_path, "MG1655_U00096.gff3")
available_feature_types(gff_path)

{'CDS',
 'exon',
 'gene',
 'mobile_genetic_element',
 'ncRNA',
 'origin_of_replication',
 'pseudogene',
 'rRNA',
 'recombination_feature',
 'region',
 'repeat_region',
 'sequence_feature',
 'tRNA'}

source

available_attributes

 available_attributes (gff_path)

available_attributes(gff_path)

Index(['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
       'attributes', 'protein_id', 'Parent', 'rpt_type', 'gene', 'Dbxref',
       'gene_synonym', 'orig_protein_id', 'exception', 'recombination_class',
       'mobile_element_type', 'transl_except', 'gene_biotype', 'gbkey',
       'Is_circular', 'strain', 'substrain', 'Name', 'product', 'ID', 'genome',
       'Note', 'mol_type', 'pseudo', 'locus_tag', 'orig_transcript_id', 'part',
       'transl_table', 'left', 'right', 'middle'],
      dtype='object')