Source code for celldex.list_references

import json
import sqlite3
from functools import lru_cache

from biocframe import BiocFrame
from gypsum_client import (
    cache_directory,
    fetch_metadata_database,
)

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


@lru_cache
def list_references(
    cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True
) -> BiocFrame:
    """List all available reference datasets.

    Results are memoized with :py:func:`functools.lru_cache`, so repeated
    calls with the same arguments return the cached
    :py:class:`~biocframe.BiocFrame` without touching the database again.

    Example:

        .. code-block:: python

            refs = list_references()

    Args:
        cache_dir:
            Path to cache directory.

        overwrite:
            Whether to overwrite the database in cache.
            Defaults to False.

        latest:
            Whether to only fetch the latest version of each reference.
            When falsy, all versions are returned and an extra ``latest``
            column flags the most recent one.
            Defaults to True.

    Returns:
        A :py:class:`~biocframe.BiocFrame` where each row corresponds to a
        reference dataset. Each row contains title and description for each
        reference, the number of rows and columns, the organisms and genome
        builds involved, whether the dataset has any pre-computed reduced
        dimensions, and so on. More details can be found in the
        `Bioconductor metadata schema <https://github.com/ArtifactDB/bioconductor-metadata-index>`_.
    """
    db_path = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite)

    # Keep the selected columns and the result keys side by side so that they
    # cannot drift apart; _format_query_results relies on identical ordering.
    select_columns = [
        "json_extract(metadata, '$') AS meta",
        "versions.asset AS asset",
        "versions.version AS version",
        "path",
    ]
    key_names = ["meta", "asset", "version", "path"]

    # Use plain truthiness here, matching the `if not latest` check in
    # _sanitize_query_to_output.
    if not latest:
        select_columns.append("versions.latest AS latest")
        key_names.append("latest")

    # Joining with ", " guarantees a separator before the optional column.
    stmt = (
        f"SELECT {', '.join(select_columns)}"
        " FROM paths LEFT JOIN versions ON paths.vid = versions.vid"
        " WHERE versions.project = 'celldex'"
    )
    if latest:
        stmt = f"{stmt} AND versions.latest = 1"

    conn = sqlite3.connect(db_path, check_same_thread=False)
    try:
        _qresults = conn.execute(stmt).fetchall()
    finally:
        # Close the connection even if the query fails.
        conn.close()

    results = _format_query_results(_qresults, key_names)
    return _sanitize_query_to_output(results, latest)
def _format_query_results(results: list, key_names: list) -> dict:
    """Pivot sqlite result rows into a dictionary of columns.

    Args:
        results:
            Rows returned by the query; each row is indexed by position.

        key_names:
            Column names, in the exact same order as the columns selected
            by the query.

    Returns:
        A dictionary mapping each key name to the list of values found at
        that position across all rows.
    """
    _out = {k: [] for k in key_names}
    for row in results:
        for idx, k in enumerate(key_names):
            _out[k].append(row[idx])
    return _out


def _get_nested(meta, *keys):
    """Follow ``keys`` through nested dictionaries.

    Returns None as soon as a level is missing or is not a dictionary
    (e.g. an explicit JSON ``null``), instead of raising.
    """
    current = meta
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def _sanitize_query_to_output(results: dict, latest: bool, meta_name: str = "meta"):
    """Convert the column-wise query results into the output ``BiocFrame``.

    Args:
        results:
            Dictionary of columns as produced by ``_format_query_results``.
            Must contain ``"path"``, ``"asset"``, ``"version"``, the
            ``meta_name`` column and, when ``latest`` is falsy, ``"latest"``.

        latest:
            Whether the query was restricted to the latest versions. When
            falsy, a boolean ``latest`` column is added to the output.

        meta_name:
            Name of the column in ``results`` holding the JSON-encoded
            metadata strings.

    Returns:
        A ``BiocFrame`` with one row per query result.
    """
    # Strip the final path component; paths without a "/" have no parent.
    _all_paths = [None if "/" not in p else p.rsplit("/", 1)[0] for p in results["path"]]

    df = BiocFrame(
        {
            "name": results["asset"],
            "version": results["version"],
            "path": _all_paths,
        }
    )

    if not latest:
        df["latest"] = [s == 1 for s in results["latest"]]

    _all_metas = [json.loads(s) for s in results[meta_name]]

    def _takane(*keys):
        # Extractor for a field nested under applications -> takane.
        return lambda meta: _get_nested(meta, "applications", "takane", *keys)

    df["object"] = _extract_atomic_from_json(_all_metas, _takane("type"))
    df["title"] = _extract_atomic_from_json(_all_metas, lambda x: x.get("title"))
    # Read the "description" field here, not "title".
    df["description"] = _extract_atomic_from_json(_all_metas, lambda x: x.get("description"))
    df["taxonomy_id"] = _extract_charlist_from_json(_all_metas, lambda x: x.get("taxonomy_id"))
    df["genome"] = _extract_charlist_from_json(_all_metas, lambda x: x.get("genome"))

    df["rows"] = _extract_atomic_from_json(_all_metas, _takane("summarized_experiment", "rows"))
    df["columns"] = _extract_atomic_from_json(_all_metas, _takane("summarized_experiment", "columns"))
    df["assays"] = _extract_charlist_from_json(_all_metas, _takane("summarized_experiment", "assays"))
    df["column_annotations"] = _extract_charlist_from_json(
        _all_metas, _takane("summarized_experiment", "column_annotations")
    )
    df["reduced_dimensions"] = _extract_charlist_from_json(
        _all_metas, _takane("single_cell_experiment", "reduced_dimensions")
    )
    df["alternative_experiments"] = _extract_charlist_from_json(
        _all_metas, _takane("single_cell_experiment", "alternative_experiments")
    )

    df["bioconductor_version"] = _extract_atomic_from_json(_all_metas, lambda x: x.get("bioconductor_version"))
    df["maintainer_name"] = _extract_atomic_from_json(_all_metas, lambda x: x.get("maintainer_name"))
    df["maintainer_email"] = _extract_atomic_from_json(_all_metas, lambda x: x.get("maintainer_email"))

    sources = []
    for meta in _all_metas:
        cursources = meta.get("sources")
        if cursources is None:
            # Zero-row frame with the same columns as the populated case,
            # built from a dict of empty lists like the branch below.
            cursources = []
        sources.append(
            BiocFrame(
                {
                    "provider": [s.get("provider") for s in cursources],
                    "id": [s.get("id") for s in cursources],
                    "version": [s.get("version") for s in cursources],
                }
            )
        )

    df["sources"] = sources
    return df


def _extract_atomic_from_json(metadata, extract):
    """Apply ``extract`` to every metadata record.

    Missing values stay as None. ``extract`` is called exactly once per record.
    """
    return [extract(_meta) for _meta in metadata]


def _extract_charlist_from_json(metadata, extract):
    """Apply ``extract`` to every metadata record, replacing None with ``[]``.

    ``extract`` is called exactly once per record.
    """
    output = []
    for _meta in metadata:
        value = extract(_meta)
        output.append([] if value is None else value)
    return output