# Source code for celldex.save_reference

import json
import os
import shutil
from functools import singledispatch
from typing import Any, List

import dolomite_base as dl
import numpy
from gypsum_client import fetch_metadata_schema, validate_metadata
from summarizedexperiment import SummarizedExperiment

from .utils import format_object_metadata

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"



# [docs]
@singledispatch
def save_reference(x: Any, labels: List[str], path: str, metadata: dict):
    """Save a reference dataset to disk.

    This is the generic entry point: the implementation that runs is chosen
    from the type of ``x``. This fallback body is only reached when no
    implementation is registered for ``type(x)``.

    Args:
        x:
            An object containing reference data.
            May be a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
            containing an assay matrix called `logcounts` of
            log-normalized (floating-point, ``NaN``-free) expression values,
            with unique row names.

            Each row of ``column_data`` corresponds to a column of ``x`` and
            contains the label(s) for that column.
            Each column of ``column_data`` represents a different label type;
            typically, the column name has a ``label.`` prefix to distinguish
            between, e.g., ``label.fine``, ``label.broad`` and so on.
            Every label must be a string.

            At least one column should be present.

        labels:
            Only part of this generic fallback signature. The registered
            ``SummarizedExperiment`` implementation takes
            ``(x, path, metadata)`` and reads the labels from
            ``column_data`` instead.

        path:
            Path to a new directory to save the dataset.
            The ``SummarizedExperiment`` implementation removes any existing
            directory at this path before saving.

        metadata:
            Dictionary containing the metadata for this dataset.
            See the schema returned by
            :py:func:`~gypsum_client.fetch_metadata_schema.fetch_metadata_schema`.

            Note that the ``applications.takane`` property will be automatically
            added by this function and does not have to be supplied.

    Raises:
        NotImplementedError:
            If no implementation is registered for the type of ``x``.

    See Also:
        `metadata index <https://github.com/ArtifactDB/bioconductor-metadata-index>`_,
        on the expected schema for the metadata.

        :py:func:`~celldex.upload_reference.upload_reference`, to upload the saved contents.

    Example:

        .. code-block:: python

            import numpy as np
            from biocframe import BiocFrame
            from summarizedexperiment import SummarizedExperiment

            import celldex

            # create a summarized experiment object; 'logcounts' must hold
            # floating-point values, so log-transform the simulated counts
            mat = np.log1p(np.random.poisson(1, (100, 10)))
            row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
            col_names = list("ABCDEFGHIJ")
            sce = SummarizedExperiment(
                assays={"logcounts": mat},
                row_data=BiocFrame(row_names=row_names),
                column_data=BiocFrame({
                  "label.fine": col_names
                }),
            )

            # Provide metadata for search and findability
            meta = {
                "title": "New reference dataset",
                "description": "This is a new reference dataset",
                "taxonomy_id": ["10090"], # NCBI ID
                "genome": ["GRCm38"], # genome build
                "sources": [{"provider": "GEO", "id": "GSE12345"}],
                "maintainer_name": "Jayaram kancherla",
                "maintainer_email": "jayaram.kancherla@gmail.com",
            }

            import shutil
            import tempfile

            cache_dir = tempfile.mkdtemp()

            # Make sure the directory is clean
            shutil.rmtree(cache_dir)

            # Save the reference
            celldex.save_reference(sce, cache_dir, meta)
    """
    # Name the function the caller actually invoked, so the error is traceable.
    raise NotImplementedError(f"'save_reference' is not supported for objects of class: {type(x)}")



def _save_se(x: SummarizedExperiment, path, metadata):
    """Validate a ``SummarizedExperiment`` reference and save it to ``path``.

    The metadata is validated against the schema from
    ``fetch_metadata_schema`` both before saving and again after the
    ``applications.takane`` entry has been added. The final metadata is
    written to ``_bioconductor.json`` inside ``path``.

    Args:
        x:
            Reference dataset. Must have at least one ``column_data`` column,
            only string labels in every column, a floating-point ``logcounts``
            assay without ``NaN`` values, and unique row names.

        path:
            Directory to save into. If it already exists it is removed first.

        metadata:
            Dataset metadata. The caller's dictionary is not modified; a copy
            is filled in with ``bioconductor_version`` (if missing) and
            ``applications.takane``.

    Raises:
        ValueError:
            If any of the checks on ``x`` listed above fails.

        Any error raised by ``validate_metadata`` for metadata that does not
        satisfy the schema is propagated unchanged.
    """
    schema = fetch_metadata_schema()

    # Work on a copy so the caller's dictionary is left untouched.
    metadata = dict(metadata)
    metadata.setdefault("bioconductor_version", "3.19")  # current release

    validate_metadata(metadata, schema)

    # checks if columns exist
    column_data = x.get_column_data()
    column_names = column_data.get_column_names()
    if len(column_names) == 0:
        raise ValueError("'SummarizedExperiment' must contain atleast one column.")

    for _cn in column_names:
        if not all(isinstance(y, str) for y in column_data.get_column(_cn)):
            raise ValueError(f"All labels in 'column_data' must be a list of strings; column {_cn} does not.")

    if "logcounts" not in x.get_assay_names():
        raise ValueError("Assay 'logcounts' does not exist.")

    logcounts = x.assay("logcounts")
    if not numpy.issubdtype(logcounts.dtype, numpy.floating):
        raise ValueError("Assay 'logcounts' must be log-normalized values (floats).")

    if numpy.any(numpy.isnan(logcounts)):
        raise ValueError("Assay 'logcounts' cannot contain 'NaN' values.")

    # Missing row names would otherwise surface as a TypeError from set(None).
    row_names = x.get_row_names()
    if row_names is None or len(set(row_names)) != len(row_names):
        raise ValueError("'row_data' must contain unique row names.")

    # All validation passed; only now discard whatever is at the destination.
    if os.path.exists(path):
        shutil.rmtree(path)

    dl.save_object(x, path, reloaded_array_reuse_mode="symlink")

    takane = format_object_metadata(x)
    takane["type"] = dl.read_object_file(path)["type"]

    # Copy the nested dictionary too, so a caller-supplied 'applications'
    # mapping is not mutated through the shallow copy above.
    applications = dict(metadata.get("applications", {}))
    applications["takane"] = takane
    metadata["applications"] = applications

    # Second validation with the takane metadata; the JSON round trip checks
    # exactly what will be written to disk.
    contents = json.dumps(metadata, indent=4)
    validate_metadata(json.loads(contents), schema=schema)
    with open(os.path.join(path, "_bioconductor.json"), "w", encoding="utf-8") as f:
        f.write(contents)



# [docs]
@save_reference.register
def save_reference_se(x: SummarizedExperiment, path: str, metadata: dict):
    """Save a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` reference to disk.

    Registered with :py:func:`save_reference` for ``SummarizedExperiment``
    inputs; validation and writing are delegated to ``_save_se``.

    Args:
        x:
            Reference dataset to save.

        path:
            Directory to save the dataset into.

        metadata:
            Dictionary containing the metadata for this dataset.

    Returns:
        Whatever ``_save_se`` returns.
    """
    outcome = _save_se(x, path=path, metadata=metadata)
    return outcome