@singledispatch
def save_reference(x: Any, labels: List[str], path: str, metadata: dict):
    """Save a reference dataset to disk.

    This is the generic fallback of a ``singledispatch`` function: it is only
    reached when no implementation is registered for ``type(x)`` and always
    raises :py:class:`NotImplementedError`.

    Args:
        x:
            An object containing reference data. May be a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
            containing an assay matrix called `logcounts` of log-normalized
            expression values.

            Each row of ``column_data`` corresponds to a column of ``x`` and
            contains the label(s) for that column. Each column of
            ``column_data`` represents a different label type; typically,
            the column name has a ``label.`` prefix to distinguish between,
            e.g., ``label.fine``, ``label.broad`` and so on. At least one
            column should be present.

        labels:
            Not used by this fallback.

        path:
            Path to a new directory to save the dataset.

        metadata:
            Dictionary containing the metadata for this dataset.
            See the schema returned by
            :py:func:`~gypsum_client.fetch_metadata_schema.fetch_metadata_schema`.

            Note that the ``applications.takane`` property will be automatically
            added by this function and does not have to be supplied.

    Raises:
        NotImplementedError:
            Always, because no implementation is registered for the class
            of ``x``.

    See Also:
        `metadata index <https://github.com/ArtifactDB/bioconductor-metadata-index>`_,
        on the expected schema for the metadata.

        :py:func:`~celldex.upload_reference.upload_reference`,
        to upload the saved contents.

    Example:

        .. code-block:: python

            # create a summarized experiment object
            mat = np.random.poisson(1, (100, 10))
            row_names = [f"GENE_{i}" for i in range(mat.shape[0])]
            col_names = list("ABCDEFGHIJ")
            sce = SummarizedExperiment(
                assays={"logcounts": mat},
                row_data=BiocFrame(row_names=row_names),
                column_data=BiocFrame({
                    "label.fine": col_names
                }),
            )

            # Provide metadata for search and findability
            meta = {
                "title": "New reference dataset",
                "description": "This is a new reference dataset",
                "taxonomy_id": ["10090"],  # NCBI ID
                "genome": ["GRCm38"],  # genome build
                "sources": [{"provider": "GEO", "id": "GSE12345"}],
                "maintainer_name": "Jayaram kancherla",
                "maintainer_email": "jayaram.kancherla@gmail.com",
            }

            import shutil
            import tempfile

            cache_dir = tempfile.mkdtemp()

            # Make sure the directory is clean
            shutil.rmtree(cache_dir)

            # Save the reference
            celldex.save_reference(sce, cache_dir, meta)
    """
    # NOTE(review): the registered SummarizedExperiment implementation and the
    # example above use ``(x, path, metadata)``, without ``labels``. A
    # three-argument call with an unsupported type therefore fails with a
    # TypeError (missing argument) before reaching this line. The signature is
    # kept as-is for compatibility -- confirm whether ``labels`` can be dropped.
    raise NotImplementedError(
        f"'save_reference' is not supported for objects of class: {type(x)}"
    )
def _save_se(x: SummarizedExperiment, path, metadata):
    """Validate a reference ``SummarizedExperiment`` and save it to ``path``.

    The steps are, in order:

    1. Validate ``metadata`` against the schema from
       ``fetch_metadata_schema()``, defaulting ``bioconductor_version`` to
       ``"3.19"`` when it is not supplied.
    2. Check that ``column_data`` has at least one column and that every
       value in every column is a string.
    3. Check that a floating-point ``logcounts`` assay without NaN values
       exists.
    4. Check that row names are present and unique.
    5. Remove ``path`` if it already exists, save ``x`` there, and write the
       metadata (with ``applications.takane`` filled in) to
       ``_bioconductor.json`` inside ``path``.

    Args:
        x:
            The reference dataset to save.

        path:
            Directory to save to. Anything already at this path is removed
            with ``shutil.rmtree`` before saving.

        metadata:
            Dictionary of metadata for the dataset. It is not modified;
            the additions are made on a copy.

    Raises:
        ValueError:
            If ``column_data`` has no columns, a label column contains
            non-string values, the ``logcounts`` assay is missing, is not
            floating-point or contains NaN, or row names are missing or
            not unique.
    """
    schema = fetch_metadata_schema()

    # Work on a copy so the caller's dictionary is left untouched.
    metadata = dict(metadata)
    if "bioconductor_version" not in metadata:
        metadata["bioconductor_version"] = "3.19"  # current release

    validate_metadata(metadata, schema)

    # Each column of column_data is one label type; at least one is required.
    column_data = x.get_column_data()
    label_names = column_data.get_column_names()
    if len(label_names) == 0:
        raise ValueError("'SummarizedExperiment' must contain atleast one column.")

    for _cn in label_names:
        labels = column_data.get_column(_cn)
        if not all(isinstance(y, str) for y in labels):
            raise ValueError(
                f"All labels in 'column_data' must be a list of strings; column {_cn} does not."
            )

    if "logcounts" not in list(x.get_assay_names()):
        raise ValueError("Assay 'logcounts' does not exist.")

    logcounts = x.assay("logcounts")
    if not numpy.issubdtype(logcounts.dtype, numpy.floating):
        raise ValueError("Assay 'logcounts' must be log-normalized values (floats).")

    if numpy.any(numpy.isnan(logcounts)):
        raise ValueError("Assay 'logcounts' cannot contain 'NaN' values.")

    # If no row names are set (None), set() would fail with an opaque
    # TypeError; report it as the same validation error instead.
    row_names = x.get_row_names()
    if row_names is None or len(set(row_names)) != len(row_names):
        raise ValueError("'row_data' must contain unique row names.")

    # Start from a clean directory.
    if os.path.exists(path):
        shutil.rmtree(path)

    dl.save_object(x, path, reloaded_array_reuse_mode="symlink")

    takane = format_object_metadata(x)
    takane["type"] = dl.read_object_file(path)["type"]

    # Copy the nested dictionary too, so a caller-supplied 'applications'
    # entry is not modified either.
    applications = dict(metadata.get("applications", {}))
    applications["takane"] = takane
    metadata["applications"] = applications

    # Second validation with the takane metadata; the JSON round trip checks
    # exactly what will be written to disk.
    contents = json.dumps(metadata, indent=4)
    validate_metadata(json.loads(contents), schema=schema)

    with open(os.path.join(path, "_bioconductor.json"), "w", encoding="utf-8") as f:
        f.write(contents)
@save_reference.register
def save_reference_se(x: SummarizedExperiment, path: str, metadata: dict):
    """Save a
    :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
    reference dataset to disk.

    Registered as the ``save_reference`` implementation for
    ``SummarizedExperiment`` objects; all of the work is delegated to
    ``_save_se``.

    Args:
        x:
            The reference dataset to save.

        path:
            Path to the directory to save the dataset to.

        metadata:
            Dictionary containing the metadata for this dataset.

    Returns:
        Whatever ``_save_se`` returns.
    """
    return _save_se(x, path=path, metadata=metadata)