source
download_file
download_file (url, save_path)
Checks if a file with the same name is already in the save_path. If not download it.
source
is_gzipped_file
is_gzipped_file (file_path)
source
default_open_gz
default_open_gz (gff_path)
If file is gzipped then opens it with gzip.open
, otherwise opens it with open
# Example usage
file_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz'
human_genome_gff = 'GRCh38_latest_genomic.gff.gz'
download_file(file_url, human_genome_gff)
is_gzipped_file(human_genome_gff)
File already exists: GRCh38_latest_genomic.gff.gz
source
get_attributes
get_attributes (df:pandas.core.frame.DataFrame,
attributes:Optional[Dict[str,List]]=None)
Iterates over each row of the df and extracts the attributes specified in the attributes dictionary for each feature type
df
DataFrame
a features DataFrame with at least a “type” column and an “attributes_str” column
attributes
Optional
None
a dictionary with feature types as keys and a list of attributes to extract as values
Returns
List
source
attributes_to_columns
attributes_to_columns (features:pandas.core.frame.DataFrame)
source
set_positions
set_positions (annotation:pandas.core.frame.DataFrame)
Sets left and right as the position of the feature on the sequence, left is always lower than right. start and end represent the begining and end of the feature where start can be greater than end depending on the feature strand.
annotation
DataFrame
an annotation DataFrame extracted from a gff file
Returns
DataFrame
source
parse_gff
parse_gff (gff_path:str, seq_id:Optional[str]=None, first:bool=True,
bounds:Optional[tuple]=None,
feature_types:Optional[list]=None,
attributes:Optional[Dict[str,List]]=None)
Parses a GFF3 file and returns a list of Pandas DataFrames with the data for a specific contig. If seq_id is None then only the first contig is parsed. If feature_types is None then all feature types are extracted.
gff_path
str
path to the gff file
seq_id
Optional
None
sequence id (first column of the gff), if not None, then return only the annotations for the seq_id with this name
first
bool
True
if True then return only the annotations for the first sequence (or the first with seq_id)
bounds
Optional
None
(left limit, right limit)
feature_types
Optional
None
list of feature types to extract
attributes
Optional
None
a dictionary with feature types as keys and a list of attributes to extract as values
Returns
List
from genomenotebook.data import get_example_data_dir
import os
data_path = get_example_data_dir()
gff_path = os.path.join(data_path, "jmh43.gff" )
df= parse_gff(gff_path,
bounds= (10000 ,50000 ))[0 ]
df.head()
0
NZ_JAGURL010000100.1
RefSeq
region
1
16949
.
+
.
{'ID': 'NZ_JAGURL010000100.1:1..16949', 'Dbxre...
1
16949
8475.0
1
NZ_JAGURL010000100.1
RefSeq
gene
10059
8311
.
-
.
{'ID': 'gene-KFX61_RS20985', 'Name': 'KFX61_RS...
8311
10059
9185.0
2
NZ_JAGURL010000100.1
Protein Homology
CDS
10059
8311
.
-
0
{'ID': 'cds-WP_225638104.1', 'Parent': 'gene-K...
8311
10059
9185.0
3
NZ_JAGURL010000100.1
RefSeq
gene
10541
10092
.
-
.
{'ID': 'gene-KFX61_RS20990', 'Name': 'KFX61_RS...
10092
10541
10316.5
4
NZ_JAGURL010000100.1
Protein Homology
CDS
10541
10092
.
-
0
{'ID': 'cds-WP_016268480.1', 'Parent': 'gene-K...
10092
10541
10316.5
df= parse_gff(gff_path,
seq_id= "NZ_JAGURL010000013.1" ,
bounds= (10000 ,50000 ))[0 ]
df.head()
0
NZ_JAGURL010000013.1
RefSeq
region
1
98838
.
+
.
{'ID': 'NZ_JAGURL010000013.1:1..98838', 'Dbxre...
1
98838
49419.5
1
NZ_JAGURL010000013.1
RefSeq
gene
10195
11175
.
+
.
{'ID': 'gene-KFX61_RS05880', 'Name': 'KFX61_RS...
10195
11175
10685.0
2
NZ_JAGURL010000013.1
Protein Homology
CDS
10195
11175
.
+
0
{'ID': 'cds-WP_048697200.1', 'Parent': 'gene-K...
10195
11175
10685.0
3
NZ_JAGURL010000013.1
RefSeq
gene
11234
11824
.
+
.
{'ID': 'gene-KFX61_RS05885', 'Name': 'KFX61_RS...
11234
11824
11529.0
4
NZ_JAGURL010000013.1
Protein Homology
CDS
11234
11824
.
+
0
{'ID': 'cds-WP_048697198.1', 'Parent': 'gene-K...
11234
11824
11529.0
genome_path = os.path.join(data_path, "MG1655_U00096.fasta" )
gff_path = os.path.join(data_path, "MG1655_U00096.gff3" )
parse_gff(gff_path)[0 ].head()
0
U00096.3
Genbank
region
1
4641652
.
+
.
{'ID': 'U00096.3:1..4641652', 'Dbxref': 'taxon...
1
4641652
2320826.5
1
U00096.3
Genbank
gene
190
255
.
+
.
{'ID': 'gene-b0001', 'Dbxref': 'ASAP:ABE-00000...
190
255
222.5
2
U00096.3
Genbank
CDS
190
255
.
+
0
{'ID': 'cds-AAC73112.1', 'Parent': 'gene-b0001...
190
255
222.5
3
U00096.3
Genbank
gene
337
2799
.
+
.
{'ID': 'gene-b0002', 'Dbxref': 'ASAP:ABE-00000...
337
2799
1568.0
4
U00096.3
Genbank
CDS
337
2799
.
+
0
{'ID': 'cds-AAC73113.1', 'Parent': 'gene-b0002...
337
2799
1568.0
source
available_feature_types
available_feature_types (gff_path)
data_path = get_example_data_dir()
gff_path = os.path.join(data_path, "MG1655_U00096.gff3" )
available_feature_types(gff_path)
{'CDS',
'exon',
'gene',
'mobile_genetic_element',
'ncRNA',
'origin_of_replication',
'pseudogene',
'rRNA',
'recombination_feature',
'region',
'repeat_region',
'sequence_feature',
'tRNA'}
source
available_attributes
available_attributes (gff_path)
available_attributes(gff_path)
Index(['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
'attributes', 'left', 'right', 'middle'],
dtype='object')
source
parse_fasta
parse_fasta (genome_path, seq_id)
Retrieves the Biopython SeqRecord object that matches the seq_id in a fasta file
source
regions_overlap
regions_overlap (region1, region2, min_overlap_fraction=0.0)
regions are tuples of start and stop coordinates returns true if a fraction of region2 >= min_overlap_fraction overlaps with region1 coordinates within regions must be sorted low to high
source
add_z_order
add_z_order (features, prescedence=['source', 'CDS', 'repeat_region',
'ncRNA', 'rRNA', 'tRNA', 'exon'])
features is a dataframe of features prescedence is a list of feature types in order of prescedence, e.g. [“CDS”, “repeat_region”, “ncRNA”, “rRNA”, “tRNA”] will put “CDS” features closer to the bottom of the plot than “repeat_region” features. returns features with a z_order column added
source
get_cds_name
get_cds_name (feature)
feature
(contig_id, feature):
source
get_cds_unique_name
get_cds_unique_name (feature)
If the feature already has a cds_id, then keep it, otherwise generate one based on the position on the contig.
source
seqRecord_to_df
seqRecord_to_df (rec:<module'Bio.SeqRecord'from'/opt/hostedtoolcache/Pyth
on/3.10.15/x64/lib/python3.10/site-
packages/Bio/SeqRecord.py'>,
feature_types:Optional[List[str]]=None,
attributes:Optional[Dict[str,List]]=None)
rec
Bio.SeqRecord
feature_types
Optional
None
if None then get all features, otherwise only those with type in FeatureTypes.
attributes
Optional
None
Returns
DataFrame
if None, then get all attributes of all feature types. If dict, then only get attributes of feature types keys. If value is None, get all
gb_path= os.path.join(data_path, "colored_genbank.gb" )
recs= SeqIO.parse(gb_path, "genbank" )
rec= next (recs)
df= seqRecord_to_df(rec, feature_types= ["CDS" ])
df.loc[1 ]["attributes" ]
OrderedDict([('gene_id', 'pDONR201_2'),
('gene_type', 'bacteria'),
('complete', 'true'),
('gc', '50'),
('length', '306'),
('source', 'GeneMark.hmm2'),
('score', '20.71'),
('phase', '0'),
('name', 'pDONR201_2'),
('cds_id', '1264_-1_959'),
('domainator_pdonr_hmms', 'CcdB (CcdB protein, 1.1e-32, 103.1)'),
('Color', '#FF0000')])
df= seqRecord_to_df(rec, feature_types= ["CDS" ], attributes= {"CDS" :["gene_type" ]})
df.head()
0
pDONR201_1
Genbank
CDS
2
106
.
+
.
{'gene_type': 'bacteria'}
1
pDONR201_1
Genbank
CDS
959
1264
.
-
.
{'gene_type': 'bacteria'}
2
pDONR201_1
Genbank
CDS
1266
1391
.
-
.
{'gene_type': 'bacteria'}
3
pDONR201_1
Genbank
CDS
1606
2265
.
-
.
{'gene_type': 'bacteria'}
4
pDONR201_1
Genbank
CDS
2916
3677
.
+
.
{'gene_type': 'bacteria'}
source
parse_recs
parse_recs (recs, seq_id:Optional[str]=None, first=True,
bounds:Optional[tuple]=None,
feature_types:Optional[list]=None,
attributes:Optional[Dict[str,List]]=None)
recs
iterator over Bio.SeqRecord.SeqRecord
seq_id
Optional
None
sequence id (first column of the gff), if not None, then return only the annotations for the seq_id with this name
first
bool
True
if True then return only the annotations for the first sequence (or the first with seq_id)
bounds
Optional
None
(left limit, right limit)
feature_types
Optional
None
list of feature types to extract
attributes
Optional
None
a dictionary with feature types as keys and a list of attributes to extract as values
Returns
Tuple
source
parse_genbank
parse_genbank (gb_path, seq_id:Optional[str]=None, first=True,
bounds:Optional[tuple]=None,
feature_types:Optional[list]=None,
attributes:Optional[Dict[str,List]]=None)
gb_path
path to the genbank file
seq_id
Optional
None
sequence id (first column of the gff), if not None, then return only the annotations for the seq_id with this name
first
bool
True
if True then return only the annotations for the first sequence (or the first with seq_id)
bounds
Optional
None
(left limit, right limit)
feature_types
Optional
None
list of feature types to extract
attributes
Optional
None
a dictionary with feature types as keys and a list of attributes to extract as values
Returns
Tuple
gb_path= os.path.join(data_path, "colored_genbank.gb" )
recs= SeqIO.parse(gb_path, "genbank" )
rec= next (recs)
testid= rec.id
seqs, dfs= parse_genbank(gb_path, seq_id = testid,
feature_types= ["CDS" , "Domainator" ]
)
# dfs[0].head()
assert len (dfs) == len (seqs)
assert len (dfs) == 1
assert dfs[0 ].loc[0 , "seq_id" ] == "pDONR201_1"
gb_path= os.path.join(data_path, "colored_genbank.gb" )
seqs, dfs= parse_genbank(gb_path,
seq_id= None ,
first= False ,
feature_types= ["CDS" , "Domainator" ]
)
assert len (dfs) == len (seqs)
assert len (dfs) == 4
assert dfs[0 ].loc[0 , "seq_id" ] == "pDONR201_1"
assert dfs[1 ].loc[0 , "seq_id" ] == "pDONR201_2"
assert dfs[2 ].loc[0 , "seq_id" ] == "pDONR201_3"
assert dfs[3 ].loc[0 , "seq_id" ] == "pDONR201_4"
source
inspect_feature_types
inspect_feature_types (file_path:str, frmt:str)
Outputs a table that recapitulates the feature types and attributes available in the file.
file_path
str
frmt
str
gff or genbank
inspect_feature_types(gff_path, "gff" )
exon
ID
Parent
anticodon
gbkey
inference
locus_tag
product
CDS
ID
Parent
Dbxref
Name
gbkey
inference
locus_tag
product
protein_id
transl_table
tRNA
ID
Parent
anticodon
gbkey
inference
locus_tag
product
region
ID
Dbxref
country
gbkey
genome
mol_type
nat-host
strain
gene
ID
Name
gbkey
gene_biotype
locus_tag
old_locus_tag