diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index e0f247d..3d3bc95 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -32,7 +32,7 @@ jobs: platform: - ubuntu-latest - macos-latest - - windows-latest + # - windows-latest runs-on: ${{ matrix.platform }} name: Python ${{ matrix.python }}, ${{ matrix.platform }} steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 205cc5e..8767980 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,5 @@ # Changelog -## Version 0.1 (development) +## Version 0.0.1 -- Feature A added -- FIX: nasty bug #1729 fixed -- add your changes here! +- Initial implementation to access Ehub's resources and load data using `rds2py`. diff --git a/README.md b/README.md index 4571f74..3d2c518 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,20 @@ [![PyPI-Server](https://img.shields.io/pypi/v/experimenthub.svg)](https://pypi.org/project/experimenthub/) -![Unit tests](https://github.com/YOUR_ORG_OR_USERNAME/experimenthub/actions/workflows/run-tests.yml/badge.svg) +![Unit tests](https://github.com/biocpy/experimenthub/actions/workflows/run-tests.yml/badge.svg) # experimenthub -> Access Bioconductor's experimenthub resources +**ExperimentHub** provides an interface to access and manage data from the Bioconductor [ExperimentHub](https://bioconductor.org/packages/ExperimentHub/) service directly in Python. -A longer description of your project goes here... +It is designed to work within the **BiocPy** ecosystem, converting R data objects (like `SingleCellExperiment` or `SummarizedExperiment`) into their Python equivalents (e.g., `SummarizedExperiment`) using [rds2py](https://github.com/biocpy/rds2py). + +> [!NOTE] +> +> This is an ***experimental*** package. It may not work with all RDS files from ExperimentHub. 
+> Currently, this package filters ExperimentHub resources to provide access to: +> - **File Formats:** `.rds` +> - **R Classes:** `SingleCellExperiment`, `SummarizedExperiment`, `RangedSummarizedExperiment`, `GRanges` etc +> +> Files are converted to their respective BiocPy representations or common Python formats. ## Install @@ -15,6 +24,80 @@ To get started, install the package from [PyPI](https://pypi.org/project/experim pip install experimenthub ``` +## Usage + +### Initialize the Registry + +The registry manages the local cache of `ExperimentHub` metadata and resources. On the first run, it downloads the metadata database. + +```py +from experimenthub import ExperimentHubRegistry + +# Initialize the registry (downloads metadata if needed) +eh = ExperimentHubRegistry() +``` + +### Searching for Resources + +ExperimentHub contains thousands of datasets. Use the `search()` method to find resources by title, description, or species. + +```py +# Search for mouse-related datasets +results = eh.search("mus musculus") + +# Print the first few matches +for record in results[:5]: + print(f"{record.ehub_id}: {record.title}") +# Output: +# EH1041: Brain scRNA-seq data, sample ..., +# EH1042: Brain scRNA-seq data, gene ..., +# ... +``` + +### Inspecting Metadata + +You can retrieve detailed metadata for a specific ID. + +```py +record = eh.get_record("EH4663") + +print(f"Title: {record.title}") +print(f"Species: {record.species}") +print(f"Genome: {record.genome}") +print(f"Description: {record.description}") +print(f"R Class: {record.preparer_dataclass}") + +## Output: +# Title: Lohoff biorXiv spatial coordinates (sample 2) +# Species: Mus musculus +# Genome: mm10 +# Description: Cell spatial coordinates for sample 2 for the E8.5 seqFISH dataset from biorXiv +# R Class: character +``` + +### Loading Data + +The `load()` method handles the download, caching, and loading of the dataset. 
+ +If the resource is an R data file (.rds) containing a supported Bioconductor object (e.g., `SingleCellExperiment`), it is automatically read and converted to an equivalent Python object using rds2py. + +```py +# Load a data.frame as a BiocFrame object +data = eh.load("EH4663") + +print(data) +# BiocFrame with 8425 rows and 3 columns +# x y z +# +# embryo1_Pos0_cell10_z5 0.7084368794499625 -2.7071263060540645 5 +# embryo1_Pos0_cell100_z5 0.9763043488304248 -2.517971233335359 5 +# embryo1_Pos0_cell101_z5 0.9749347757408557 -2.6739635081030855 5 +# ... ... ... +# embryo1_Pos28_cell97_z5 -1.3992279805347039 3.1761928631722824 5 +# embryo1_Pos28_cell98_z5 -1.389353519722718 3.1349508225406666 5 +# embryo1_Pos28_cell99_z5 -1.394992277928857 2.5812717935734355 5 +``` + ## Note diff --git a/docs/index.md b/docs/index.md index 033add2..73d377f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,19 +1,25 @@ # experimenthub -Access Bioconductor's experimenthub resources +**ExperimentHub** provides an interface to access and manage data from the Bioconductor [ExperimentHub](https://bioconductor.org/packages/ExperimentHub/) service directly in Python. +It is designed to work within the **BiocPy** ecosystem, converting R data objects (like `SingleCellExperiment` or `SummarizedExperiment`) into their Python equivalents (e.g., `SummarizedExperiment`) using [rds2py](https://github.com/biocpy/rds2py). -## Note - -> This is the main page of your project's [Sphinx] documentation. It is -> formatted in [Markdown]. Add additional pages by creating md-files in -> `docs` or rst-files (formatted in [reStructuredText]) and adding links to -> them in the `Contents` section below. +> [!NOTE] +> +> This is an ***experimental*** package. It may not work with all RDS files from ExperimentHub. 
+> Currently, this package filters ExperimentHub resources to provide access to: +> - **File Formats:** `.rds` +> - **R Classes:** `SingleCellExperiment`, `SummarizedExperiment`, `RangedSummarizedExperiment`, `GRanges` etc > -> Please check [Sphinx] and [MyST] for more information -> about how to document your project and how to configure your preferences. +> Files are converted to their respective BiocPy representations or common Python formats. +## Install +To get started, install the package from [PyPI](https://pypi.org/project/experimenthub/) + +```bash +pip install experimenthub +``` ## Contents ```{toctree} diff --git a/pyproject.toml b/pyproject.toml index 086f90c..bc45f55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ version_scheme = "no-guess-dev" [tool.ruff] line-length = 120 src = ["src"] -exclude = ["tests"] +# exclude = ["tests"] lint.extend-ignore = ["F821"] [tool.ruff.lint.pydocstyle] diff --git a/setup.cfg b/setup.cfg index 3375185..f083ac5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,18 +5,18 @@ [metadata] name = experimenthub -description = Access Bioconductor's experimenthub resources +description = Access Bioconductors experimenthub resources author = Jayaram Kancherla author_email = jayaram.kancherla@gmail.com license = MIT license_files = LICENSE.txt long_description = file: README.md long_description_content_type = text/markdown; charset=UTF-8; variant=GFM -url = https://github.com/pyscaffold/pyscaffold/ +url = https://github.com/biocpy/experimenthub # Add here related links, for example: project_urls = - Documentation = https://pyscaffold.org/ -# Source = https://github.com/pyscaffold/pyscaffold/ + Documentation = https://github.com/biocpy/experimenthub + Source = https://github.com/biocpy/experimenthub # Changelog = https://pyscaffold.org/en/latest/changelog.html # Tracker = https://github.com/pyscaffold/pyscaffold/issues # Conda-Forge = https://anaconda.org/conda-forge/pyscaffold @@ -49,6 +49,9 @@ package_dir = # For 
# ---------------------------------------------------------------------------
# Sources recovered from the added hunks of:
#   src/experimenthub/_ehub.py, src/experimenthub/record.py,
#   src/experimenthub/registry.py
# ---------------------------------------------------------------------------
import os
import sqlite3
from contextlib import closing
from dataclasses import dataclass
from datetime import date, datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"

# Location of the SQLite database that describes every ExperimentHub resource.
EHUB_METADATA_URL = "https://experimenthub.bioconductor.org/metadata/experimenthub.sqlite3"


@dataclass(frozen=True)
class ExperimentHubRecord:
    """Container for a single ExperimentHub entry."""

    ehub_id: str
    title: str
    species: Optional[str]
    taxonomy_id: Optional[str]
    genome: Optional[str]
    description: Optional[str]
    url: str
    release_date: Optional[date]
    preparer_dataclass: Optional[str]

    @classmethod
    def from_db_row(cls, row: tuple) -> "ExperimentHubRecord":
        """Build a record from a database query row.

        Args:
            row:
                A 9-tuple in the order
                ``(id, title, species, taxonomyid, genome, description,
                full_url, date_added, rdataclass)``.

        Returns:
            The populated record; its ``ehub_id`` is ``"EH"`` followed by
            the row id.
        """
        rid, title, species, tax_id, genome, desc, url, date_str, rdataclass = row

        rel_date: Optional[date] = None
        if date_str:
            # Only the text before the first space is parsed, as YYYY-MM-DD.
            try:
                rel_date = datetime.strptime(str(date_str).split(" ")[0], "%Y-%m-%d").date()
            except ValueError:
                # Best effort: an unparseable date leaves release_date unset.
                rel_date = None

        return cls(
            ehub_id=f"EH{rid}",
            title=title or "",
            species=species,
            taxonomy_id=str(tax_id) if tax_id else None,
            genome=genome,
            description=desc,
            url=url,
            release_date=rel_date,
            preparer_dataclass=rdataclass,
        )


class ExperimentHubRegistry:
    """Registry for ExperimentHub resources."""

    # R classes that have corresponding BiocPy representations.
    # Entries are matched against the *lower-cased* ``rdataclass`` column, so
    # every entry here must itself be lower case.
    SUPPORTED_R_CLASSES = {
        "vector",
        "list",
        "character",
        "matrix",
        "numeric",
        "int",
        "dataframe",
        "data.frame",
        "data frame",
        "dframe",
        "iranges",
        "genomicranges",
        "granges",
        "summarizedexperiment",
        "rangedsummarizedexperiment",
        "singlecellexperiment",
        "multiassayexperiment",
    }

    SUPPORTED_EXTENSIONS = ".rds"  # , ".rda", ".rdata"

    # Cache key under which the metadata database is stored.
    _METADATA_RNAME = "experimenthub_metadata"

    # One row per (resource, rdatapath) pair. Both joins are LEFT JOINs, so
    # ``full_url`` and ``rdataclass`` can be NULL.
    _METADATA_QUERY = """
            SELECT
                r.id,
                r.title,
                r.species,
                r.taxonomyid,
                r.genome,
                r.description,
                lp.location_prefix || rp.rdatapath AS full_url,
                r.rdatadateadded,
                rp.rdataclass
            FROM resources r
            LEFT JOIN location_prefixes lp
                ON r.location_prefix_id = lp.id
            LEFT JOIN rdatapaths rp
                ON rp.resource_id = r.id
            WHERE r.title IS NOT NULL
            ORDER BY r.id ASC;
            """

    def __init__(
        self,
        cache_dir: Optional[Union[str, Path]] = None,
        force: bool = False,
    ) -> None:
        """Initialize the ExperimentHub registry.

        Args:
            cache_dir:
                Directory for the BiocFileCache database and cached files.
                If None, defaults to "~/.cache/experimenthub_bfc".

            force:
                If True, force re-download of the ExperimentHub metadata database.
        """
        # Function-scope import, like ``rds2py`` in ``load``: the cache
        # backend is only needed once a registry is actually constructed.
        from pybiocfilecache import BiocFileCache

        if cache_dir is None:
            cache_dir = Path.home() / ".cache" / "experimenthub_bfc"

        self._cache_dir = Path(cache_dir)
        self._cache_dir.mkdir(parents=True, exist_ok=True)
        self._bfc = BiocFileCache(self._cache_dir)

        self._registry_map: Dict[str, ExperimentHubRecord] = {}

        self._initialize_registry(force=force)

    def _lookup(self, key: str) -> Any:
        """Return the cached resource for ``key``, or None if the lookup fails."""
        try:
            return self._bfc.get(key)
        except Exception:
            # Best effort: any cache error is treated as "not cached".
            return None

    def _remove_quietly(self, key: str) -> None:
        """Remove ``key`` from the cache, ignoring any failure."""
        try:
            self._bfc.remove(key)
        except Exception:
            # Best effort: a failed removal must not abort the caller.
            pass

    @classmethod
    def _fetch_rows(cls, md_path: str) -> List[tuple]:
        """Run the metadata query against the SQLite file at ``md_path``."""
        # ``closing`` guarantees the connection is released even on error.
        with closing(sqlite3.connect(md_path)) as conn:
            return conn.execute(cls._METADATA_QUERY).fetchall()

    @classmethod
    def _is_supported(cls, rdataclass: Optional[str], url: Optional[str]) -> bool:
        """Tell whether a row's R class and file extension are both supported.

        NULL (None) or empty values are unsupported rather than an error.
        """
        if not rdataclass or not url:
            return False
        if str(rdataclass).lower() not in cls.SUPPORTED_R_CLASSES:
            return False
        return str(url).lower().endswith(cls.SUPPORTED_EXTENSIONS)

    def _initialize_registry(self, force: bool = False):
        """Fetch the ExperimentHub metadata and populate the registry.

        Args:
            force:
                If True, drop any cached metadata database and fetch it again.

        Raises:
            RuntimeError: If the metadata database file cannot be obtained.
        """
        rname = self._METADATA_RNAME

        existing = self._lookup(rname)
        if force and existing:
            self._remove_quietly(rname)
            existing = None

        if existing:
            md_resource = existing
        else:
            md_resource = self._bfc.add(rname, EHUB_METADATA_URL, rtype="web")

        md_path = self._get_filepath(md_resource)

        if not md_path or not os.path.exists(md_path):
            # A cache entry whose file is gone: retry once with a fresh download.
            if existing and not force:
                return self._initialize_registry(force=True)
            raise RuntimeError("Failed to retrieve ExperimentHub metadata database.")

        for row in self._fetch_rows(md_path):
            # row[-1] is rdataclass, row[6] is full_url (see _METADATA_QUERY).
            if not self._is_supported(row[-1], row[6]):
                continue

            record = ExperimentHubRecord.from_db_row(row)
            self._registry_map[record.ehub_id] = record

    def list_ids(self) -> List[str]:
        """List all available ExperimentHub IDs (e.g., 'EH1', 'EH123').

        IDs are ordered by their numeric part, not lexicographically.
        """
        return sorted(self._registry_map, key=lambda x: int(x[2:]))

    def get_record(self, ehub_id: str) -> ExperimentHubRecord:
        """Get the metadata record for a given ExperimentHub ID.

        Raises:
            KeyError: If the ID is not in the registry.
        """
        if ehub_id not in self._registry_map:
            raise KeyError(f"ID '{ehub_id}' not found in registry (or is not a supported format).")

        return self._registry_map[ehub_id]

    def search(self, query: str) -> List[ExperimentHubRecord]:
        """Search for resources matching the query string.

        The match is a case-insensitive substring test against each record's
        title, species and description.
        """
        q = query.lower()
        return [
            rec
            for rec in self._registry_map.values()
            if any(text and q in text.lower() for text in (rec.title, rec.species, rec.description))
        ]

    def download(self, ehub_id: str, force: bool = False) -> str:
        """Download and cache the resource file.

        Args:
            ehub_id:
                ExperimentHub ID, e.g. "EH4663".

            force:
                If True, discard any cached copy and download again.

        Returns:
            Path to the cached file.

        Raises:
            KeyError: If the ID is not in the registry.
            RuntimeError: If the downloaded file is missing or empty.
        """
        record = self.get_record(ehub_id)
        key = ehub_id

        if force:
            self._remove_quietly(key)
        else:
            existing = self._lookup(key)
            if existing:
                cached = self._get_filepath(existing)
                # Reuse the cached copy only if it exists and is non-empty.
                if cached and os.path.exists(cached) and os.path.getsize(cached) > 0:
                    return cached

        resource = self._bfc.add(
            key,
            record.url,
            rtype="web",
            download=True,
        )

        path = self._get_filepath(resource)

        if not path or not os.path.exists(path) or os.path.getsize(path) == 0:
            # Do not leave a broken entry behind for the next call to find.
            self._remove_quietly(key)
            raise RuntimeError(f"Download failed for {ehub_id}. File is empty or missing.")

        return path

    def load(self, ehub_id: str, force: bool = False) -> Any:
        """Load the resource using rds2py.

        Args:
            ehub_id:
                ExperimentHub ID, e.g. "EH4663".

            force:
                Passed through to :meth:`download`.

        Returns:
            Whatever ``rds2py.read_rds`` returns for the downloaded file.

        Raises:
            ImportError: If ``rds2py`` is not installed.
            RuntimeError: If the file cannot be read.
        """
        path = self.download(ehub_id, force=force)

        try:
            import rds2py
        except ImportError as err:
            raise ImportError(
                f"The resource {ehub_id} requires 'rds2py' to be loaded. " "Please install it via pip."
            ) from err

        try:
            return rds2py.read_rds(path)
        except Exception as e:
            raise RuntimeError(f"Failed to load R data from {path}: {e}") from e

    def _get_filepath(self, resource: Any) -> Optional[str]:
        """Helper to extract absolute path from a BiocFileCache resource.

        Accepts an object with an ``rpath`` attribute or one with a ``get``
        method. Returns None when no ``rpath`` can be found.
        """
        if hasattr(resource, "rpath"):
            rel_path = resource.rpath
        elif hasattr(resource, "get"):
            rel_path = resource.get("rpath")
        else:
            return None

        # Without this guard a missing rpath would become the literal "None".
        if rel_path is None:
            return None

        return str(self._cache_dir / str(rel_path))
- - -def fib(n): - """Fibonacci example function - - Args: - n (int): integer - - Returns: - int: n-th Fibonacci number - """ - assert n > 0 - a, b = 1, 1 - for _i in range(n - 1): - a, b = b, a + b - return a - - -# ---- CLI ---- -# The functions defined in this section are wrappers around the main Python -# API allowing them to be called directly from the terminal as a CLI -# executable/script. - - -def parse_args(args): - """Parse command line parameters - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--help"]``). - - Returns: - :obj:`argparse.Namespace`: command line parameters namespace - """ - parser = argparse.ArgumentParser(description="Just a Fibonacci demonstration") - parser.add_argument( - "--version", - action="version", - version=f"experimenthub {__version__}", - ) - parser.add_argument(dest="n", help="n-th Fibonacci number", type=int, metavar="INT") - parser.add_argument( - "-v", - "--verbose", - dest="loglevel", - help="set loglevel to INFO", - action="store_const", - const=logging.INFO, - ) - parser.add_argument( - "-vv", - "--very-verbose", - dest="loglevel", - help="set loglevel to DEBUG", - action="store_const", - const=logging.DEBUG, - ) - return parser.parse_args(args) - - -def setup_logging(loglevel): - """Setup basic logging - - Args: - loglevel (int): minimum loglevel for emitting messages - """ - logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s" - logging.basicConfig( - level=loglevel, stream=sys.stdout, format=logformat, datefmt="%Y-%m-%d %H:%M:%S" - ) - - -def main(args): - """Wrapper allowing :func:`fib` to be called with string arguments in a CLI fashion - - Instead of returning the value from :func:`fib`, it prints the result to the - ``stdout`` in a nicely formatted message. - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--verbose", "42"]``). 
from biocframe import BiocFrame

from experimenthub.registry import ExperimentHubRegistry


def test_real():
    """End-to-end check: build a registry, look up one record, load its data."""
    target_id = "EH4663"
    registry = ExperimentHubRegistry()

    # The registry must expose at least one supported resource.
    available = registry.list_ids()
    assert len(available) > 0

    # Metadata lookup returns the record for the requested ID.
    record = registry.get_record(target_id)
    assert record is not None
    assert record.ehub_id == target_id

    # Loading yields a BiocFrame with the expected number of rows.
    frame = registry.load(target_id)
    assert isinstance(frame, BiocFrame)
    assert len(frame) == 8425
against stdout/stderr - # https://docs.pytest.org/en/stable/capture.html - main(["7"]) - captured = capsys.readouterr() - assert "The 7-th Fibonacci number is 13" in captured.out