From 9c53ee41932e238f147eba84b4d96a53652c7555 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Mon, 19 Jan 2026 20:28:01 -0800 Subject: [PATCH 1/4] using a template from txdb and orgdb --- src/ensembldb/_ahub.py | 7 ++ src/ensembldb/record.py | 60 ++++++++++++++ src/ensembldb/registry.py | 160 ++++++++++++++++++++++++++++++++++++++ src/ensembldb/skeleton.py | 149 ----------------------------------- 4 files changed, 227 insertions(+), 149 deletions(-) create mode 100644 src/ensembldb/_ahub.py create mode 100644 src/ensembldb/record.py create mode 100644 src/ensembldb/registry.py delete mode 100644 src/ensembldb/skeleton.py diff --git a/src/ensembldb/_ahub.py b/src/ensembldb/_ahub.py new file mode 100644 index 0000000..85e286d --- /dev/null +++ b/src/ensembldb/_ahub.py @@ -0,0 +1,7 @@ +"""Configuration for accessing AnnotationHub metadata for EnsDb.""" + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + +AHUB_METADATA_URL = "https://annotationhub.bioconductor.org/metadata/annotationhub.sqlite3" diff --git a/src/ensembldb/record.py b/src/ensembldb/record.py new file mode 100644 index 0000000..a1e7b8a --- /dev/null +++ b/src/ensembldb/record.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import date, datetime +from typing import Optional + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@dataclass(frozen=True) +class EnsDbRecord: + """Container for a single EnsDb entry.""" + + ensdb_id: str # e.g., "AH12345" + title: str + species: Optional[str] + taxonomy_id: Optional[str] + genome: Optional[str] + description: Optional[str] + url: str + release_date: Optional[date] + ensembl_version: Optional[str] = None + + @classmethod + def from_db_row(cls, row: tuple) -> "EnsDbRecord": + """Build a record from a database query row.""" + rid, title, species, tax_id, genome, desc, url, date_str = row + + ah_id = f"AH{rid}" + + rel_date: Optional[date] = None + if date_str: + try: + rel_date = datetime.strptime(str(date_str).split(" ")[0], "%Y-%m-%d").date() + except ValueError: + pass + + ens_ver = None + if title and "Ensembl" in title: + parts = title.split(" ") + for i, p in enumerate(parts): + if p == "Ensembl" and i + 1 < len(parts): + candidate = parts[i + 1] + if candidate.isdigit(): + ens_ver = candidate + break + + return cls( + ensdb_id=ah_id, + title=title or "", + species=species, + taxonomy_id=str(tax_id) if tax_id else None, + genome=genome, + description=desc, + url=url, + release_date=rel_date, + ensembl_version=ens_ver, + ) diff --git a/src/ensembldb/registry.py b/src/ensembldb/registry.py new file mode 100644 index 0000000..8eb4167 --- /dev/null +++ b/src/ensembldb/registry.py @@ -0,0 +1,160 @@ +import os +import sqlite3 +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from pybiocfilecache import BiocFileCache + +from ._ahub import AHUB_METADATA_URL +from .ensdb import EnsDb +from .record import EnsDbRecord + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class EnsDbRegistry: + """Registry for EnsDb resources.""" + + def __init__( + self, + cache_dir: Optional[Union[str, Path]] = None, + force: bool = False, + ) -> None: + """Initialize the EnsDb registry. + + Args: + cache_dir: Path to cache directory. + force: Force re-download of metadata. + """ + if cache_dir is None: + cache_dir = Path.home() / ".cache" / "ensembldb_bfc" + + self._cache_dir = Path(cache_dir) + self._cache_dir.mkdir(parents=True, exist_ok=True) + self._bfc = BiocFileCache(self._cache_dir) + + self._registry_map: Dict[str, EnsDbRecord] = {} + self._initialize_registry(force=force) + + def _initialize_registry(self, force: bool = False): + """Populate registry from AnnotationHub metadata.""" + rname = "annotationhub_metadata" + + existing = None + try: + existing = self._bfc.get(rname) + except Exception: + pass + + if force and existing: + try: + self._bfc.remove(rname) + except Exception: + pass + existing = None + + if existing: + md_resource = existing + else: + md_resource = self._bfc.add(rname, AHUB_METADATA_URL, rtype="web") + + md_path = self._get_filepath(md_resource) + + if not md_path or not os.path.exists(md_path): + if existing and not force: + return self._initialize_registry(force=True) + raise RuntimeError("Failed to retrieve AnnotationHub metadata.") + + conn = sqlite3.connect(md_path) + try: + # Filter for EnsDb sqlite files + query = """ + SELECT + r.id, + r.title, + r.species, + r.taxonomyid, + r.genome, + r.description, + lp.location_prefix || rp.rdatapath AS full_url, + r.rdatadateadded + FROM resources r + LEFT JOIN location_prefixes lp + ON r.location_prefix_id = lp.id + LEFT JOIN rdatapaths rp + ON rp.resource_id = r.id + WHERE r.title LIKE 'Ensembl % EnsDb%' + AND r.title LIKE '%.sqlite' + ORDER BY r.rdatadateadded DESC; + """ + cursor = conn.cursor() + cursor.execute(query) + rows = cursor.fetchall() + finally: + conn.close() + + for row in rows: + record = EnsDbRecord.from_db_row(row) + self._registry_map[record.ensdb_id] = record + + def list_ensdbs(self) -> List[str]: + """List available EnsDb IDs.""" + return sorted(list(self._registry_map.keys())) + + def get_record(self, ensdb_id: str) -> EnsDbRecord: + if ensdb_id not in self._registry_map: + raise KeyError(f"ID '{ensdb_id}' not found.") + return self._registry_map[ensdb_id] + + def download(self, ensdb_id: str, force: bool = False) -> str: + record = self.get_record(ensdb_id) + url = record.url + key = ensdb_id + + if force: + try: + self._bfc.remove(key) + except Exception: + pass + + if not force: + try: + existing = self._bfc.get(key) + if existing: + path = self._get_filepath(existing) + if path and os.path.exists(path) and os.path.getsize(path) > 0: + return path + except Exception: + pass + + resource = self._bfc.add( + key, + url, + rtype="web", + download=True, + ) + + path = self._get_filepath(resource) + if not path or not os.path.exists(path) or os.path.getsize(path) == 0: + try: + self._bfc.remove(key) + except Exception: + pass + raise RuntimeError(f"Download failed for {ensdb_id}.") + + return path + + def load_db(self, ensdb_id: str, force: bool = False) -> EnsDb: + path = self.download(ensdb_id, force=force) + return EnsDb(path) + + def _get_filepath(self, resource: Any) -> Optional[str]: + if hasattr(resource, "rpath"): + rel_path = str(resource.rpath) + elif hasattr(resource, "get"): + rel_path = str(resource.get("rpath")) + else: + return None + return str(self._cache_dir / rel_path) diff --git a/src/ensembldb/skeleton.py b/src/ensembldb/skeleton.py deleted file mode 100644 index 19bfc1a..0000000 --- a/src/ensembldb/skeleton.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -This is a skeleton file that can serve as a starting point for a Python -console script. To run this script uncomment the following lines in the -``[options.entry_points]`` section in ``setup.cfg``:: - - console_scripts = - fibonacci = ensembldb.skeleton:run - -Then run ``pip install .`` (or ``pip install -e .`` for editable mode) -which will install the command ``fibonacci`` inside your current environment. - -Besides console scripts, the header (i.e. until ``_logger``...) of this file can -also be used as template for Python modules. - -Note: - This file can be renamed depending on your needs or safely removed if not needed. - -References: - - https://setuptools.pypa.io/en/latest/userguide/entry_point.html - - https://pip.pypa.io/en/stable/reference/pip_install -""" - -import argparse -import logging -import sys - -from ensembldb import __version__ - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - -_logger = logging.getLogger(__name__) - - -# ---- Python API ---- -# The functions defined in this section can be imported by users in their -# Python scripts/interactive interpreter, e.g. via -# `from ensembldb.skeleton import fib`, -# when using this Python module as a library. - - -def fib(n): - """Fibonacci example function - - Args: - n (int): integer - - Returns: - int: n-th Fibonacci number - """ - assert n > 0 - a, b = 1, 1 - for _i in range(n - 1): - a, b = b, a + b - return a - - -# ---- CLI ---- -# The functions defined in this section are wrappers around the main Python -# API allowing them to be called directly from the terminal as a CLI -# executable/script. - - -def parse_args(args): - """Parse command line parameters - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--help"]``). - - Returns: - :obj:`argparse.Namespace`: command line parameters namespace - """ - parser = argparse.ArgumentParser(description="Just a Fibonacci demonstration") - parser.add_argument( - "--version", - action="version", - version=f"ensembldb {__version__}", - ) - parser.add_argument(dest="n", help="n-th Fibonacci number", type=int, metavar="INT") - parser.add_argument( - "-v", - "--verbose", - dest="loglevel", - help="set loglevel to INFO", - action="store_const", - const=logging.INFO, - ) - parser.add_argument( - "-vv", - "--very-verbose", - dest="loglevel", - help="set loglevel to DEBUG", - action="store_const", - const=logging.DEBUG, - ) - return parser.parse_args(args) - - -def setup_logging(loglevel): - """Setup basic logging - - Args: - loglevel (int): minimum loglevel for emitting messages - """ - logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s" - logging.basicConfig( - level=loglevel, stream=sys.stdout, format=logformat, datefmt="%Y-%m-%d %H:%M:%S" - ) - - -def main(args): - """Wrapper allowing :func:`fib` to be called with string arguments in a CLI fashion - - Instead of returning the value from :func:`fib`, it prints the result to the - ``stdout`` in a nicely formatted message. - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--verbose", "42"]``). - """ - args = parse_args(args) - setup_logging(args.loglevel) - _logger.debug("Starting crazy calculations...") - print(f"The {args.n}-th Fibonacci number is {fib(args.n)}") - _logger.info("Script ends here") - - -def run(): - """Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv` - - This function can be used as entry point to create console scripts with setuptools. - """ - main(sys.argv[1:]) - - -if __name__ == "__main__": - # ^ This is a guard statement that will prevent the following code from - # being executed in the case someone imports this file instead of - # executing it as a script. - # https://docs.python.org/3/library/__main__.html - - # After installing your project with pip, users can also run your Python - # modules as scripts via the ``-m`` flag, as defined in PEP 338:: - # - # python -m ensembldb.skeleton 42 - # - run() From 8b088421c4c2281dba2e9a5a14f26b69135d99b0 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 20 Jan 2026 16:42:01 -0800 Subject: [PATCH 2/4] add the rest of the files --- README.md | 105 +++++++++++++++- docs/index.md | 18 ++- pyproject.toml | 2 +- setup.cfg | 6 +- src/ensembldb/__init__.py | 4 + src/ensembldb/ensdb.py | 258 ++++++++++++++++++++++++++++++++++++++ src/ensembldb/registry.py | 5 +- tests/test_real.py | 105 ++++++++++++++++ tests/test_skeleton.py | 25 ---- 9 files changed, 485 insertions(+), 43 deletions(-) create mode 100644 src/ensembldb/ensdb.py create mode 100644 tests/test_real.py delete mode 100644 tests/test_skeleton.py diff --git a/README.md b/README.md index 0942536..c746108 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ [![PyPI-Server](https://img.shields.io/pypi/v/ensembldb.svg)](https://pypi.org/project/ensembldb/) -![Unit tests](https://github.com/YOUR_ORG_OR_USERNAME/ensembldb/actions/workflows/run-tests.yml/badge.svg) +![Unit tests](https://github.com/BiocPy/ensembldb/actions/workflows/run-tests.yml/badge.svg) -# ensembldb +# EnsemblDb -> Access EnsemblDB objects +**EnsemblDb** provides a Python interface to **Ensembl Annotation Databases (EnsDb)**. It mirrors the functionality of the Bioconductor `ensembldb` package, allowing users to efficiently query gene, transcript, and exon annotations from SQLite-based annotation files. -A longer description of your project goes here... +This package is part of the **BiocPy** ecosystem and integrates seamlessly with [GenomicRanges](https://github.com/biocpy/genomicranges). ## Install @@ -15,6 +15,103 @@ To get started, install the package from [PyPI](https://pypi.org/project/ensembl pip install ensembldb ``` +## Usage + +### 1. Connecting to an EnsDb + +You can manage and download standard Ensembl databases via the registry (backed by AnnotationHub). + +```py +from ensembldb import EnsDbRegistry + +# Initialize the registry +registry = EnsDbRegistry() + +# List available databases +available = registry.list_ensdbs() +print(available[:5]) +# ['AH53211', 'AH53212', ...] + +# Load a specific database (e.g., Larimichthys crocea) +# This automatically downloads and caches the SQLite file +db = registry.load_db("AH113677") + +# View metadata +print(db.metadata) +``` + +### 2. Retrieving Genomic Features + +EnsemblDb allows you to extract features as GenomicRanges objects. + +#### Fetch Genes + +```py +genes = db.genes() +print(genes) +# GenomicRanges with 23958 ranges and 3 metadata columns +# seqnames ranges strand gene_id gene_name gene_biotype +# +# ENSLCRG00005000002 MT 1 - 69 + | ENSLCRG00005000002 Mt_tRNA +# ENSLCRG00005000003 MT 70 - 1016 + | ENSLCRG00005000003 Mt_rRNA +# ENSLCRG00005000004 MT 1017 - 1087 + | ENSLCRG00005000004 Mt_tRNA +# ... ... ... | ... ... ... +# ENSLCRG00005023957 VI 22289079 - 22304889 - | ENSLCRG00005023957 FILIP1 protein_coding +# ENSLCRG00005023958 VI 22328118 - 22347657 + | ENSLCRG00005023958 SENP6 protein_coding +# ENSLCRG00005023959 VI 22351962 - 22451867 + | ENSLCRG00005023959 myo6a protein_coding +# ------ +# seqinfo(496 sequences): I II III ... XXII XXIII XXIV +``` + +#### Fetch Transcripts and Exons + +```py +transcripts = db.transcripts() +print(transcripts) + +exons = db.exons() +print(exons) +``` + +### 3. Filtering + +You can filter results using a dictionary passed to the filter argument. Keys should match column names in the database (e.g., gene_id, gene_name, tx_biotype). + +#### Filter by Gene Name + +```py +# Get coordinates for a specific gene +senp6 = db.genes(filter={"gene_name": "SENP6"}) +print(senp6) +``` + +#### Filter by ID list + +```py +# Get transcripts for a list of gene IDs +ids = ["ENSLCRG00005023958", "ENSLCRG00005000003"] +txs = db.transcripts(filter={"gene_id": ids}) +print(txs) +``` + +#### Filter Exons by Transcript ID: + +```py +# Get all exons associated with a specific transcript +tx_exons = db.exons(filter={"tx_id": "ENSLCRT00005000003"}) +print(tx_exons) +``` + +### 4. Direct SQL Access + +If you need more complex queries not covered by the standard methods, you can execute SQL directly against the underlying database. + +```py +# Get a BiocFrame from a raw SQL query +df = db._query_as_biocframe("SELECT * FROM gene LIMIT 5") +print(df) +``` + ## Note diff --git a/docs/index.md b/docs/index.md index d70fafd..5fd0abb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,18 +1,16 @@ -# ensembldb +# EnsemblDb -Access EnsemblDB objects +**EnsemblDb** provides a Python interface to **Ensembl Annotation Databases (EnsDb)**. It mirrors the functionality of the Bioconductor `ensembldb` package, allowing users to efficiently query gene, transcript, and exon annotations from SQLite-based annotation files. +This package is part of the **BiocPy** ecosystem and integrates seamlessly with [GenomicRanges](https://github.com/biocpy/genomicranges). -## Note +## Install -> This is the main page of your project's [Sphinx] documentation. It is -> formatted in [Markdown]. Add additional pages by creating md-files in -> `docs` or rst-files (formatted in [reStructuredText]) and adding links to -> them in the `Contents` section below. -> -> Please check [Sphinx] and [MyST] for more information -> about how to document your project and how to configure your preferences. +To get started, install the package from [PyPI](https://pypi.org/project/ensembldb/) +```bash +pip install ensembldb +``` ## Contents diff --git a/pyproject.toml b/pyproject.toml index 086f90c..bc45f55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ version_scheme = "no-guess-dev" [tool.ruff] line-length = 120 src = ["src"] -exclude = ["tests"] +# exclude = ["tests"] lint.extend-ignore = ["F821"] [tool.ruff.lint.pydocstyle] diff --git a/setup.cfg b/setup.cfg index 2cc6b21..0f09996 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,7 +5,7 @@ [metadata] name = ensembldb -description = Access EnsemblDB objects +description = Access EnsemblDb resources from Bioconductors AnnotationHub author = Jayaram Kancherla author_email = jayaram.kancherla@gmail.com license = MIT @@ -49,6 +49,10 @@ package_dir = # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" + pybiocfilecache + biocframe + genomicranges + iranges [options.packages.find] diff --git a/src/ensembldb/__init__.py b/src/ensembldb/__init__.py index e451f10..7501a7f 100644 --- a/src/ensembldb/__init__.py +++ b/src/ensembldb/__init__.py @@ -14,3 +14,7 @@ __version__ = "unknown" finally: del version, PackageNotFoundError + +from .record import EnsDbRecord +from .registry import EnsDbRegistry +from .ensdb import EnsDb \ No newline at end of file diff --git a/src/ensembldb/ensdb.py b/src/ensembldb/ensdb.py new file mode 100644 index 0000000..c81d215 --- /dev/null +++ b/src/ensembldb/ensdb.py @@ -0,0 +1,258 @@ +import sqlite3 +from typing import Dict, List, Optional, Union + +from biocframe import BiocFrame +from genomicranges import GenomicRanges +from iranges import IRanges + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class EnsDb: + """Interface to Ensembl SQLite annotations.""" + + def __init__(self, dbpath: str): + """Initialize the EnsDb object. + + Args: + dbpath: + Path to the SQLite database file. + """ + self.dbpath = dbpath + self.conn = sqlite3.connect(dbpath) + self.conn.row_factory = sqlite3.Row + self._metadata = None + + def _query_as_biocframe(self, query: str, params: tuple = ()) -> BiocFrame: + """Execute query and return BiocFrame.""" + cursor = self.conn.cursor() + cursor.execute(query, params) + results = cursor.fetchall() + + if not results: + if cursor.description: + col_names = [desc[0] for desc in cursor.description] + # Fix: Initialize empty lists for each column to satisfy BiocFrame validation + empty_data = {col: [] for col in col_names} + return BiocFrame(empty_data, column_names=col_names) + return BiocFrame({}) + + col_names = [desc[0] for desc in cursor.description] + columns_data = list(zip(*results)) + + data_dict = {} + for i, name in enumerate(col_names): + data_dict[name] = list(columns_data[i]) + + return BiocFrame(data_dict) + + @property + def metadata(self) -> BiocFrame: + """Get database metadata.""" + if self._metadata is None: + self._metadata = self._query_as_biocframe("SELECT * FROM metadata") + return self._metadata + + def _check_column_exists(self, table: str, column: str) -> bool: + """Check if a column exists in a table.""" + try: + self.conn.execute(f"SELECT {column} FROM {table} LIMIT 1") + return True + except sqlite3.OperationalError: + return False + + def genes(self, filter: Optional[Dict[str, Union[str, List[str]]]] = None) -> GenomicRanges: + """Retrieve genes as GenomicRanges. + + Args: + filter: + A dictionary defining filters to narrow down the result. + Keys are column names (e.g., "gene_id", "gene_name", "gene_biotype"). + Values can be a single string or a list of strings to match. + + Example: + `{'gene_name': 'BRCA1'}` + `{'gene_biotype': ['protein_coding', 'lincRNA']}` + + Returns: + A GenomicRanges object containing gene coordinates and metadata. + """ + has_entrez = self._check_column_exists("gene", "entrezid") + entrez_col = ", g.entrezid" if has_entrez else "" + + query = f""" + SELECT + g.gene_id, g.gene_name, g.gene_biotype, + g.seq_name, g.gene_seq_start, g.gene_seq_end, g.seq_strand{entrez_col}, + c.seq_length + FROM gene g + LEFT JOIN chromosome c ON g.seq_name = c.seq_name + """ + + where_clauses = [] + params = [] + + if filter: + for col, val in filter.items(): + if isinstance(val, list): + placeholders = ",".join("?" * len(val)) + where_clauses.append(f"g.{col} IN ({placeholders})") + params.extend(val) + else: + where_clauses.append(f"g.{col} = ?") + params.append(val) + + if where_clauses: + query += " WHERE " + " AND ".join(where_clauses) + + bf = self._query_as_biocframe(query, tuple(params)) + + if bf.shape[0] == 0: + return GenomicRanges.empty() + + return self._make_gr(bf, prefix="gene_") + + def transcripts(self, filter: Optional[Dict[str, Union[str, List[str]]]] = None) -> GenomicRanges: + """Retrieve transcripts as GenomicRanges. + + Args: + filter: + A dictionary defining filters to narrow down the result. + Keys are column names (e.g., "tx_id", "gene_id", "tx_biotype"). + Values can be a single string or a list of strings to match. + + Columns from the gene table (like "gene_name") can also be used as keys + since the query performs a join. + + Returns: + A GenomicRanges object containing transcript coordinates and metadata. + """ + query = """ + SELECT + t.tx_id, t.tx_biotype, t.gene_id, + t.tx_seq_start, t.tx_seq_end, + g.seq_name, g.seq_strand, g.gene_name, + c.seq_length + FROM tx t + JOIN gene g ON t.gene_id = g.gene_id + LEFT JOIN chromosome c ON g.seq_name = c.seq_name + """ + + where_clauses = [] + params = [] + + if filter: + for col, val in filter.items(): + prefix = "t." if col.startswith("tx_") else "g." + if col == "gene_id": + prefix = "t." + + if isinstance(val, list): + placeholders = ",".join("?" * len(val)) + where_clauses.append(f"{prefix}{col} IN ({placeholders})") + params.extend(val) + else: + where_clauses.append(f"{prefix}{col} = ?") + params.append(val) + + if where_clauses: + query += " WHERE " + " AND ".join(where_clauses) + + bf = self._query_as_biocframe(query, tuple(params)) + if bf.shape[0] == 0: + return GenomicRanges.empty() + + return self._make_gr(bf, prefix="tx_") + + def exons(self, filter: Optional[Dict[str, Union[str, List[str]]]] = None) -> GenomicRanges: + """Retrieve exons as GenomicRanges. + + Args: + filter: + A dictionary defining filters to narrow down the result. + Keys are column names (e.g., "exon_id", "gene_id", "tx_id"). + Values can be a single string or a list of strings to match. + + This allows filtering exons by associated gene or transcript IDs + (e.g., `{'gene_id': 'ENSG00000139618'}`). + + Returns: + A GenomicRanges object containing exon coordinates and metadata. + """ + query = """ + SELECT DISTINCT + e.exon_id, e.exon_seq_start, e.exon_seq_end, + g.seq_name, g.seq_strand, + c.seq_length + FROM exon e + JOIN tx2exon t2e ON e.exon_id = t2e.exon_id + JOIN tx t ON t2e.tx_id = t.tx_id + JOIN gene g ON t.gene_id = g.gene_id + LEFT JOIN chromosome c ON g.seq_name = c.seq_name + """ + + where_clauses = [] + params = [] + if filter: + for col, val in filter.items(): + prefix = "g." + if col.startswith("tx_"): + prefix = "t." + if col.startswith("exon_"): + prefix = "e." + + if isinstance(val, list): + placeholders = ",".join("?" * len(val)) + where_clauses.append(f"{prefix}{col} IN ({placeholders})") + params.extend(val) + else: + where_clauses.append(f"{prefix}{col} = ?") + params.append(val) + + if where_clauses: + query += " WHERE " + " AND ".join(where_clauses) + + bf = self._query_as_biocframe(query, tuple(params)) + if bf.shape[0] == 0: + return GenomicRanges.empty() + + return self._make_gr(bf, prefix="exon_") + + def _make_gr(self, bf: BiocFrame, prefix: str = "gene_") -> GenomicRanges: + """Helper to convert BiocFrame to GenomicRanges.""" + strand_col = bf.get_column("seq_strand") + strand_map = {1: "+", -1: "-", 0: "*", "1": "+", "-1": "-", "0": "*"} + strand = [strand_map.get(x, "*") for x in strand_col] + + seqnames = [str(x) for x in bf.get_column("seq_name")] + + starts = bf.get_column(f"{prefix}seq_start") + ends = bf.get_column(f"{prefix}seq_end") + widths = [abs(e - s) + 1 for s, e in zip(starts, ends)] + ranges = IRanges(start=starts, width=widths) + + row_names = None + id_col = f"{prefix}id" + if id_col in bf.column_names: + row_names = [str(x) for x in bf.get_column(id_col)] + + drop_cols = ["seq_name", "seq_strand", f"{prefix}seq_start", f"{prefix}seq_end", "seq_length"] + mcols_dict = {} + for c in bf.column_names: + if c not in drop_cols: + mcols_dict[c] = bf.get_column(c) + + mcols = BiocFrame(mcols_dict, row_names=row_names) + + return GenomicRanges(seqnames=seqnames, ranges=ranges, strand=strand, names=row_names, mcols=mcols) + + def close(self): + self.conn.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() diff --git a/src/ensembldb/registry.py b/src/ensembldb/registry.py index 8eb4167..4b1c852 100644 --- a/src/ensembldb/registry.py +++ b/src/ensembldb/registry.py @@ -70,6 +70,7 @@ def _initialize_registry(self, force: bool = False): conn = sqlite3.connect(md_path) try: # Filter for EnsDb sqlite files + # Updated query: Checks rdataclass AND rdatapath extension query = """ SELECT r.id, @@ -85,8 +86,8 @@ def _initialize_registry(self, force: bool = False): ON r.location_prefix_id = lp.id LEFT JOIN rdatapaths rp ON rp.resource_id = r.id - WHERE r.title LIKE 'Ensembl % EnsDb%' - AND r.title LIKE '%.sqlite' + WHERE (rp.rdataclass = 'EnsDb' OR r.title LIKE 'Ensembl % EnsDb%') + AND rp.rdatapath LIKE '%.sqlite' ORDER BY r.rdatadateadded DESC; """ cursor = conn.cursor() diff --git a/tests/test_real.py b/tests/test_real.py new file mode 100644 index 0000000..cb10f6d --- /dev/null +++ b/tests/test_real.py @@ -0,0 +1,105 @@ +import pytest +from genomicranges import GenomicRanges + +from ensembldb import EnsDb, EnsDbRegistry + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture(scope="module") +def ensdb_resource(): + registry = EnsDbRegistry() + + all_ids = registry.list_ensdbs() + + if not all_ids: + pytest.fail("Registry found no EnsDb files. Check query logic.") + + target_id = "AH100751" # Saccharomyces cerevisiae + return registry.load_db(target_id) + + +def test_connection_and_metadata(ensdb_resource): + assert isinstance(ensdb_resource, EnsDb) + + meta = ensdb_resource.metadata + + assert "name" in meta.column_names + assert "value" in meta.column_names + + names = meta.get_column("name") + values = meta.get_column("value") + meta_dict = dict(zip(names, values)) + + assert "DBSCHEMAVERSION" in meta_dict or "schema_version" in meta_dict + + +def test_genes_fetch(ensdb_resource): + gr = ensdb_resource.genes() + + assert isinstance(gr, GenomicRanges) + assert len(gr) > 0 + + mcols = gr.mcols + assert "gene_id" in mcols.column_names + assert len(gr.seqnames) == len(gr) + assert len(gr.ranges) == len(gr) + + +def test_genes_filter(ensdb_resource): + all_genes = ensdb_resource.genes() + if len(all_genes) == 0: + pytest.skip("No genes found in DB to filter.") + + target_id = all_genes.mcols.get_column("gene_id")[0] + gr_filtered = ensdb_resource.genes(filter={"gene_id": target_id}) + assert len(gr_filtered) == 1 + assert gr_filtered.mcols.get_column("gene_id")[0] == target_id + + +def test_transcripts_fetch(ensdb_resource): + gr = ensdb_resource.transcripts() + + assert isinstance(gr, GenomicRanges) + if len(gr) == 0: + print("Warning: No transcripts found.") + return + + mcols = gr.mcols + assert "tx_id" in mcols.column_names + assert "gene_id" in mcols.column_names + + +def test_exons_fetch(ensdb_resource): + gr = ensdb_resource.exons() + + assert isinstance(gr, GenomicRanges) + if len(gr) == 0: + print("Warning: No exons found.") + return + + mcols = gr.mcols + assert "exon_id" in mcols.column_names + + +def test_combined_filter(ensdb_resource): + txs = ensdb_resource.transcripts() + if len(txs) == 0: + pytest.skip("No transcripts to filter.") + + target_gene = txs.mcols.get_column("gene_id")[0] + gr = ensdb_resource.transcripts(filter={"gene_id": target_gene}) + + assert len(gr) > 0 + for gid in gr.mcols.get_column("gene_id"): + assert gid == target_gene + + +def test_seqinfo_population(ensdb_resource): + gr = ensdb_resource.genes() + if len(gr) == 0: + pytest.skip("No genes.") + + assert all(start > 0 for start in gr.ranges.start) diff --git a/tests/test_skeleton.py b/tests/test_skeleton.py deleted file mode 100644 index 01f1b71..0000000 --- a/tests/test_skeleton.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from ensembldb.skeleton import fib, main - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -def test_fib(): - """API Tests""" - assert fib(1) == 1 - assert fib(2) == 1 - assert fib(7) == 13 - with pytest.raises(AssertionError): - fib(-10) - - -def test_main(capsys): - """CLI Tests""" - # capsys is a pytest fixture that allows asserts against stdout/stderr - # https://docs.pytest.org/en/stable/capture.html - main(["7"]) - captured = capsys.readouterr() - assert "The 7-th Fibonacci number is 13" in captured.out From 27351cd637440b76c3c2df8cb4267cb8b68e40bd Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 20 Jan 2026 16:49:21 -0800 Subject: [PATCH 3/4] disable tests for windows --- .github/workflows/run-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index e0f247d..3d3bc95 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -32,7 +32,7 @@ jobs: platform: - ubuntu-latest - macos-latest - - windows-latest + # - windows-latest runs-on: ${{ matrix.platform }} name: Python ${{ matrix.python }}, ${{ matrix.platform }} steps: From 2d4a49fa169ef38806afece7b3d0baf799833d49 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 20 Jan 2026 16:59:47 -0800 Subject: [PATCH 4/4] update changelog --- CHANGELOG.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 205cc5e..874c101 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,5 @@ # Changelog -## Version 0.1 (development) +## Version 0.0.1 -- Feature A added -- FIX: nasty bug #1729 fixed -- add your changes here! +- Initial implementation to access EnsemblDb sqlite files from AnnotationHub.