Register files from Census release 2023-07-25
import lamindb as ln
import lnschema_bionty as lb
# import cellxgene_census
import pandas as pd
💡 lamindb instance: laminlabs/cellxgene
ln.track()
💡 notebook imports: lamindb==0.67.2 lnschema_bionty==0.39.0 pandas==2.1.4 requests==2.31.0
💡 loaded: Transform(uid='pNa7RdI26sp45zKv', name='Register files from Census release 2023-07-25', short_name='census-release-2023-07-25', version='1', type='notebook', updated_at=2024-01-27 05:27:26 UTC, created_by_id=1)
💡 loaded: Run(uid='dJ9t75LeOeqYWA4B0WbA', run_at=2024-01-30 09:03:47 UTC, transform_id=18, created_by_id=1)
census_version = "2023-07-25" # LTS release of Census
Register collections (updated 2024-01-27)
# Select every artifact registered under this Census release version.
artifacts = ln.Artifact.filter(version=census_version).all()
# Last expression of the cell: shows how many artifacts matched.
artifacts.count()
850
# Bundle the selected artifacts into one named, versioned collection and persist it.
collection = ln.Collection(artifacts, name="cellxgene-census", version=census_version)
collection.save()
# Query all collections carrying this release version; the last expression
# of the cell shows how many there are.
collections = ln.Collection.filter(version=census_version).all()
collections.count()
80
Register datasets
Get the h5ad files directory on s3 from Census:
# Look up the URI of the h5ad directory for the "stable" Census release.
# NOTE(review): `cellxgene_census` is only imported in a commented-out line at
# the top of this notebook, so this cell raises NameError unless that import
# is restored.
h5ad_dir = (
cellxgene_census.get_census_version_directory()
.get("stable")
.get("h5ads")
.get("uri")
)
# Last expression of the cell: displays the resolved URI.
h5ad_dir
's3://cellxgene-data-public/cell-census/2023-07-25/h5ads/'
ln.UPath(h5ad_dir).view_tree()
(0 sub-directories & 850 files with suffixes '.h5ad'):
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
...
# Build the h5ad directory from `census_version` so the release date is
# stated in exactly one place (it was previously repeated inside the path).
h5ads_uri = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
# Create one File record per object found under the directory and save them in bulk.
files = ln.File.from_dir(h5ads_uri)
ln.save(files)
# Group all files into a single versioned dataset.
dataset = ln.Dataset(files, name="cellxgene-census", version=census_version)
dataset.save()
# Reload the dataset and its files from the database so that later cells work
# with the stored records.
dataset = ln.Dataset.filter(name="cellxgene-census", version=census_version).one()
files = dataset.files.all()
Register metadata
Get all datasets and associated metadata using cellxgene REST API:
import requests
def get_metadata_from_cxg(timeout: float = 60.0):
    """Fetch dataset metadata from the CELLxGENE curation REST API.

    Sends a single GET request to the ``/curation/v1/datasets`` endpoint.

    Args:
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout ``requests`` waits indefinitely, which can hang
            the notebook on an unresponsive server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error payload.
    res.raise_for_status()
    return res.json()
cellxgene_meta = get_metadata_from_cxg()
len(cellxgene_meta)
1132
cellxgene_meta[0].keys()
dict_keys(['assay', 'assets', 'cell_count', 'cell_type', 'collection_doi', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'title', 'tombstone', 'x_approximate_distribution'])
features
# Metadata fields linked under the "obs" slot, each mapped to the registry
# name passed to its Feature.
obs_features = {
    "assay": "bionty.ExperimentalFactor",
    "cell_type": "bionty.CellType",
    "development_stage": "bionty.DevelopmentalStage",
    "disease": "bionty.Disease",
    "donor_id": "core.ULabel",
    "self_reported_ethnicity": "bionty.Ethnicity",
    "sex": "bionty.Phenotype",
    "suspension_type": "core.ULabel",
    "tissue": "bionty.Tissue",
}
# One categorical Feature per field, saved in bulk.
obs_features_records = [
    ln.Feature(name=field_name, type="category", registries=registry_name)
    for field_name, registry_name in obs_features.items()
]
ln.save(obs_features_records)
# Group the features into a feature set and link it to all files.
obs_feature_set = ln.FeatureSet(features=obs_features_records, name="obs features")
obs_feature_set.save()
obs_feature_set.files.set(files, through_defaults={"slot": "obs"})
# Fields linked under the "external" slot, each mapped to the registry name
# passed to its Feature.
ext_features = {"organism": "bionty.Organism", "collection": "core.ULabel"}
# One categorical Feature per field, saved in bulk.
ext_features_records = [
    ln.Feature(name=field_name, type="category", registries=registry_name)
    for field_name, registry_name in ext_features.items()
]
ln.save(ext_features_records)
# Group the features into a feature set and link it to all files.
ext_feature_set = ln.FeatureSet(features=ext_features_records, name="external features")
ext_feature_set.save()
ext_feature_set.files.set(files, through_defaults={"slot": "external"})
collections, organisms
Register collections:
# Parent label under which all collection labels are grouped.
is_collection = ln.ULabel(name="is_collection")
is_collection.save()
# Distinct (name, doi, id) triples across all datasets; a set removes the
# repeats from datasets that share a collection.
collections_meta = {
    (meta["collection_name"], meta["collection_doi"], meta["collection_id"])
    for meta in cellxgene_meta
}
# One ULabel per collection: the DOI goes into the description and the
# collection id into the reference.
collections_records = [
    ln.ULabel(
        name=collection_name,
        description=collection_doi,
        reference=collection_id,
        reference_type="collection_id",
    )
    for collection_name, collection_doi, collection_id in collections_meta
]
ln.save(collections_records)
is_collection.children.add(*collections_records)
Register organisms:
ncbitaxon_source = lb.BiontySource.filter(source="ncbitaxon").one()
# Distinct organism ontology ids referenced by any dataset.
organisms_meta = {
    organism["ontology_term_id"]
    for dataset_meta in cellxgene_meta
    for organism in dataset_meta["organism"]
}
organisms_records = lb.Organism.from_values(
    organisms_meta, field=lb.Organism.ontology_id, bionty_source=ncbitaxon_source
)
# rename house mouse to mouse
for organism_record in organisms_records:
    if organism_record.name == "house mouse":
        organism_record.name = "mouse"
ln.save(organisms_records, parents=False)
Annotate files with collections and organisms:
# Lookup object giving attribute access to the external features by name.
ext_features = ext_feature_set.members.lookup()
files = dataset.files.all()
collections = is_collection.children.all()
organisms = lb.Organism.filter().all()
for dataset_meta in cellxgene_meta:
    # get registered file record based on dataset_id
    file = files.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is not None:
        # register collection
        collection = ln.ULabel.filter(reference=dataset_meta["collection_id"]).one()
        file.labels.add(collection, feature=ext_features.collection)
        # register organism
        organism_ontology_ids = [
            organism["ontology_term_id"] for organism in dataset_meta["organism"]
        ]
        organism_records = lb.Organism.filter(
            ontology_id__in=organism_ontology_ids
        ).list()
        file.labels.add(organism_records, feature=ext_features.organism)
ontologies
Register all ontology ids:
from typing import Optional
from lnschema_bionty.models import Registry
from lamindb.dev._feature_manager import get_accessor_by_orm
# Lookup object giving attribute access to the obs features by name.
obs_features_records = obs_feature_set.members.lookup()
ACCESSORS = get_accessor_by_orm(ln.File)
# Maps each obs feature name to (accessor name on File, registry class).
FEATURE_TO_ACCESSOR = {}
for name in obs_features:
    registries = getattr(obs_features_records, name).registries
    accessor = ACCESSORS.get(registries)
    relation_field = getattr(ln.File, accessor).field
    orm = relation_field.model
    # TODO: ulabels are defined in the File model, improve this in LaminDB
    if orm == ln.File:
        orm = relation_field.related_model
    FEATURE_TO_ACCESSOR[name] = (accessor, orm)
def create_ontology_record_from_source(
    ontology_id: str,
    from_orm: Registry,
    target_orm: Registry,
    bionty_source: Optional[lb.BiontySource] = None,
):
    """Build a ``target_orm`` record from a term looked up through ``from_orm``.

    The term is fetched with ``from_orm.from_bionty`` and its name,
    description, ontology id and source id are copied into a new
    ``target_orm`` instance. The instance is returned without being saved.

    Args:
        ontology_id: Ontology id of the term to look up.
        from_orm: Registry whose ``from_bionty`` is used to find the term.
        target_orm: Registry class to instantiate with the copied fields.
        bionty_source: Optional source passed through to ``from_bionty``.

    Returns:
        The new ``target_orm`` record, or ``None`` when the record could not
        be built. Failures are reported on stdout instead of being swallowed
        silently, so callers should drop ``None`` results before saving.
    """
    from_record = from_orm.from_bionty(
        ontology_id=ontology_id, bionty_source=bionty_source
    )
    if from_record is None:
        # Nothing to copy from; report it rather than failing on attribute access.
        print(f"no source record found for {ontology_id}, skipping")
        return None
    try:
        return target_orm(
            name=from_record.name,
            description=from_record.description,
            ontology_id=from_record.ontology_id,
            bionty_source_id=from_record.bionty_source_id,
        )
    except Exception as error:
        # Kept best-effort like the original, but the reason is now visible.
        print(f"could not create record for {ontology_id}: {error!r}")
        return None
# NOTE(review): presumably turns off the name search performed when records
# are created — confirm against the LaminDB settings documentation.
ln.settings.upon_create_search_names = False
# For each ontology-backed obs feature, collect the distinct
# (label, ontology_term_id) pairs seen across all datasets.
ontology_ids = {}
for name in obs_features:
    # donor_id and suspension_type are skipped here.
    if name in ("donor_id", "suspension_type"):
        continue
    ontology_ids[name] = {
        (term["label"], term["ontology_term_id"])
        for dataset_meta in cellxgene_meta
        if name in dataset_meta
        for term in dataset_meta[name]
    }
# Sources used below for terms that the first registration pass does not validate.
bionty_source_ds_mouse = lb.BiontySource.filter(
    entity="DevelopmentalStage", organism="mouse"
).one()
bionty_source_pato = lb.BiontySource.filter(source="pato").one()
# register all ontology ids
for name, terms in ontology_ids.items():
    print(f"registering {name}")
    accessor, orm = FEATURE_TO_ACCESSOR.get(name)
    terms_ids = [term[1] for term in terms]
    # First pass: create and save whatever from_values returns for these ids.
    records = orm.from_values(terms_ids, field="ontology_id")
    if len(records) > 0:
        ln.save(records)
    # Second pass: handle the ids that are still not validated.
    inspect_result = orm.inspect(terms_ids, field="ontology_id", mute=True)
    non_validated = inspect_result.non_validated
    if len(non_validated) == 0:
        continue
    if name == "development_stage":
        # Retry with the mouse developmental-stage source.
        records = list(
            orm.from_values(
                non_validated,
                field="ontology_id",
                bionty_source=bionty_source_ds_mouse,
            )
        )
        # UBERON terms are looked up through the Tissue registry.
        records += [
            create_ontology_record_from_source(
                ontology_id=term_id, from_orm=lb.Tissue, target_orm=orm
            )
            for term_id in non_validated
            if term_id.startswith("UBERON:")
        ]
        # "unknown" is registered as a plain record named after itself.
        records += [
            orm(name=term_id, ontology_id=term_id)
            for term_id in non_validated
            if term_id == "unknown"
        ]
    else:
        # Non-PATO terms are registered from the label in the metadata.
        records = [
            orm(name=label, ontology_id=term_id)
            for label, term_id in terms
            if (not term_id.startswith("PATO:")) and (term_id in non_validated)
        ]
        # PATO terms are looked up through the Phenotype registry.
        records += [
            create_ontology_record_from_source(
                ontology_id=term_id,
                from_orm=lb.Phenotype,
                target_orm=orm,
                bionty_source=bionty_source_pato,
            )
            for term_id in non_validated
            if term_id.startswith("PATO:")
        ]
    # create_ontology_record_from_source returns None when it cannot build a
    # record; drop those entries so ln.save only receives real records.
    records = [record for record in records if record is not None]
    if records:
        print(f"registered {len(records)} records: {records}")
        ln.save(records)
Show code cell output
registering assay
β did not create ExperimentalFactor record for 1 non-validated ontology_id: 'EFO:0700016'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 1 records: [ExperimentalFactor(uid='gWUGSA9l', name='Smart-seq v4', ontology_id='EFO:0700016', created_by_id=1)]
registering cell_type
β now recursing through parents: this only happens once, but is much slower than bulk saving
registering development_stage
β did not create DevelopmentalStage records for 6 non-validated ontology_ids: 'UBERON:0018241', 'UBERON:0000113', 'UBERON:0034919', 'UBERON:0007220', 'UBERON:0007222', 'unknown'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 6 records: [DevelopmentalStage(uid='wksJWjer', name='prime adult stage', ontology_id='UBERON:0018241', description='A Life Cycle Stage That Starts At Completion Of Development And Growth Of The Sexually Mature Adult Animal, And Ends Before Senescence.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='GDaE3j6Z', name='post-juvenile adult stage', ontology_id='UBERON:0000113', description='The Stage Of Being A Sexually Mature Adult Animal.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='l00DTC4g', name='juvenile stage', ontology_id='UBERON:0034919', description='The Stage Of Being No More Dependent Of The Nest And/Or From Caregivers For Subsistence While Having Not Reach Sexual Maturity.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='lNh8U4YZ', name='late embryonic stage', ontology_id='UBERON:0007220', description='An Embryo Stage That Covers Late Steps Of The Embryogenesis With A Fully Formed Embryo Still Developing Before Birth Or Egg Hatching.', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='hqyIKjfF', name='late adult stage', ontology_id='UBERON:0007222', bionty_source_id=47, created_by_id=1), DevelopmentalStage(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=1)]
registering disease
β did not create Disease record for 1 non-validated ontology_id: 'PATO:0000461'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 1 records: [Disease(uid='4r2nqggf', name='normal', ontology_id='PATO:0000461', description='A Quality Inhering In A Bearer By Virtue Of The Bearer'S Exhibiting No Deviation From Normal Or Average.', bionty_source_id=38, created_by_id=1)]
registering self_reported_ethnicity
β did not create Ethnicity records for 3 non-validated ontology_ids: 'multiethnic', 'na', 'unknown'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 3 records: [Ethnicity(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=1), Ethnicity(uid='UY1fNAFT', name='na', ontology_id='na', created_by_id=1), Ethnicity(uid='8lAgy5Ej', name='multiethnic', ontology_id='multiethnic', created_by_id=1)]
registering sex
β did not create Phenotype records for 3 non-validated ontology_ids: 'PATO:0000384', 'unknown', 'PATO:0000383'
registered 3 records: [Phenotype(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=1), Phenotype(uid='Pl1UiuS0', name='male', ontology_id='PATO:0000384', description='A Biological Sex Quality Inhering In An Individual Or A Population Whose Sex Organs Contain Only Male Gametes.', bionty_source_id=38, created_by_id=1), Phenotype(uid='hSl0sSF0', name='female', ontology_id='PATO:0000383', description='A Biological Sex Quality Inhering In An Individual Or A Population That Only Produces Gametes That Can Be Fertilised By Male Gametes.', bionty_source_id=38, created_by_id=1)]
registering tissue
β did not create Tissue records for 18 non-validated ontology_ids: 'CL:0000010 (cell culture)', 'CL:0000082 (cell culture)', 'CL:0000084 (cell culture)', 'CL:0000115 (cell culture)', 'CL:0000351 (cell culture)', 'CL:0002322 (cell culture)', 'CL:0002327 (cell culture)', 'CL:0002328 (cell culture)', 'CL:0002334 (cell culture)', 'CL:0002335 (cell culture)', 'CL:0002633 (cell culture)', 'CL:0010003 (cell culture)', 'UBERON:0000088 (organoid)', 'UBERON:0000310 (organoid)', 'UBERON:0000966 (organoid)', 'UBERON:0001295 (organoid)', 'UBERON:0002048 (organoid)', 'UBERON:0002370 (organoid)'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 18 records: [Tissue(uid='x3tRcugV', name='trophoblast (organoid)', ontology_id='UBERON:0000088 (organoid)', created_by_id=1), Tissue(uid='UoElNxsj', name='endothelial cell (cell culture)', ontology_id='CL:0000115 (cell culture)', created_by_id=1), Tissue(uid='9YB5clqY', name='cultured cell (cell culture)', ontology_id='CL:0000010 (cell culture)', created_by_id=1), Tissue(uid='WSs6UA9e', name='lung (organoid)', ontology_id='UBERON:0002048 (organoid)', created_by_id=1), Tissue(uid='CevFMDqD', name='preadipocyte (cell culture)', ontology_id='CL:0002334 (cell culture)', created_by_id=1), Tissue(uid='RkE6D8y1', name='endometrium (organoid)', ontology_id='UBERON:0001295 (organoid)', created_by_id=1), Tissue(uid='rIPA0OEl', name='T cell (cell culture)', ontology_id='CL:0000084 (cell culture)', created_by_id=1), Tissue(uid='dwdBlCNp', name='breast (organoid)', ontology_id='UBERON:0000310 (organoid)', created_by_id=1), Tissue(uid='Ash8pGf8', name='trophoblast cell (cell culture)', ontology_id='CL:0000351 (cell culture)', created_by_id=1), Tissue(uid='uS0Cw8zN', name='retina (organoid)', ontology_id='UBERON:0000966 (organoid)', created_by_id=1), Tissue(uid='vg9s890t', name='respiratory basal cell (cell culture)', ontology_id='CL:0002633 (cell culture)', created_by_id=1), Tissue(uid='lfIFQFR5', name='epithelial cell of lung (cell culture)', ontology_id='CL:0000082 (cell culture)', created_by_id=1), Tissue(uid='w6gzNa8D', name='mammary gland epithelial cell (cell culture)', ontology_id='CL:0002327 (cell culture)', created_by_id=1), Tissue(uid='yPk6E1V8', name='epithelial cell of alveolus of lung (cell culture)', ontology_id='CL:0010003 (cell culture)', created_by_id=1), Tissue(uid='K4RSNRBc', name='thymus (organoid)', ontology_id='UBERON:0002370 (organoid)', created_by_id=1), Tissue(uid='9ICArUMH', name='embryonic stem cell (cell culture)', ontology_id='CL:0002322 (cell culture)', created_by_id=1), Tissue(uid='7MzqN14b', name='bronchial epithelial cell (cell 
culture)', ontology_id='CL:0002328 (cell culture)', created_by_id=1), Tissue(uid='kWD0kb5x', name='brown preadipocyte (cell culture)', ontology_id='CL:0002335 (cell culture)', created_by_id=1)]
donors and suspension_types
# Collect the distinct donor ids and suspension types across all datasets;
# datasets lacking a field contribute nothing.
donor_ids = set()
suspension_types = set()
for dataset_meta in cellxgene_meta:
    donor_ids.update(dataset_meta.get("donor_id", ()))
    suspension_types.update(dataset_meta.get("suspension_type", ()))

# Parent labels that group the donor and suspension-type labels.
is_donor = ln.ULabel(name="is_donor", description="parent of donor ids")
is_donor.save()
is_suspension_type = ln.ULabel(
    name="is_suspension_type", description="parent of suspension types"
)
is_suspension_type.save()

# Register only the donors that are not yet children of is_donor.
is_donor = ln.ULabel.filter(name="is_donor").one()
donors = is_donor.children.all()
result = donors.inspect(donor_ids, mute=True)
new_donors = [ln.ULabel(name=donor_name) for donor_name in result.non_validated]
ln.save(new_donors)
is_donor.children.add(*new_donors)

# Same procedure for suspension types.
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = is_suspension_type.children.all()
result = stypes.inspect(suspension_types, mute=True)
new_stypes = [ln.ULabel(name=stype_name) for stype_name in result.non_validated]
ln.save(new_stypes)
is_suspension_type.children.add(*new_stypes)
Annotate files with metadata
# Lookup object giving attribute access to all registered features by name.
features = ln.Feature.lookup()
for idx, dataset_meta in enumerate(cellxgene_meta):
    # Progress message every 100 datasets.
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(cellxgene_meta)}")
    # Datasets without a registered file are skipped.
    file = files.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        continue
    for field, terms in dataset_meta.items():
        # Only fields that correspond to an obs feature are annotated.
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]
        if field in ("donor_id", "suspension_type"):
            # These two fields are matched by name.
            records = orm.from_values(terms, field="name")
            if records:
                # stratify by feature so that link tables records are written
                file.labels.add(records, feature=getattr(features, field))
        else:
            # All other fields are matched by ontology id.
            term_ids = [term["ontology_term_id"] for term in terms]
            records = orm.from_values(term_ids, field="ontology_id")
            if records:
                getattr(file, accessor).add(*records)
Show code cell output
annotating dataset 0 of 1132
annotating dataset 100 of 1132
annotating dataset 200 of 1132
annotating dataset 300 of 1132
annotating dataset 400 of 1132
annotating dataset 500 of 1132
annotating dataset 600 of 1132
annotating dataset 700 of 1132
annotating dataset 800 of 1132
annotating dataset 900 of 1132
annotating dataset 1000 of 1132
annotating dataset 1100 of 1132
Validate and register genes
# register synthetic constructs and sars_cov_2 as new organisms
for taxon_id in ("NCBITaxon:32630", "NCBITaxon:2697049"):
    lb.Organism.from_bionty(
        ontology_id=taxon_id, bionty_source=ncbitaxon_source
    ).save(parents=False)
# genes files
# Organisms are looked up by scientific name; the keys of `genes_files` are
# read as attributes of this lookup in the registration loop.
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)
genes_files = {
    "homo_sapiens": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_homo_sapiens.csv.gz",
    "mus_musculus": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_mus_musculus.csv.gz",
    "synthetic_construct": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_ercc.csv.gz",
    "severe_acute_respiratory_syndrome_coronavirus_2": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_sars_cov_2.csv.gz",
}
Register all genes for each organism:
for organism_name, genes_file in genes_files.items():
    print(f"registering {organism_name} genes")
    # The gene table has no header row; its first column becomes the index.
    df = pd.read_csv(genes_file, header=None, index_col=0)
    organism_record = getattr(organisms, organism_name)
    gene_ids = df.index
    gene_records = lb.Gene.from_values(
        gene_ids, field=lb.Gene.ensembl_gene_id, organism=organism_record
    )
    ln.save(gene_records)
    # Boolean mask of the ids that validate after the bulk registration.
    validated = lb.Gene.validate(
        gene_ids, field=lb.Gene.ensembl_gene_id, organism=organism_record
    )
    # register legacy genes manually
    new_records = [
        lb.Gene(
            ensembl_gene_id=gene_id,
            symbol=df.loc[gene_id][1],
            organism=organism_record,
        )
        for gene_id in gene_ids[~validated]
    ]
    ln.save(new_records)
    # One feature set per organism holding all of its genes.
    all_gene_records = gene_records + new_records
    genes_feature_set = ln.FeatureSet(
        features=all_gene_records, name=f"all {organism_record.name} genes"
    )
    genes_feature_set.save()
Show code cell output
registering homo_sapiens genes
β did not create Gene records for 147 non-validated ensembl_gene_ids: 'ENSG00000112096', 'ENSG00000137808', 'ENSG00000161149', 'ENSG00000182230', 'ENSG00000203812', 'ENSG00000204092', 'ENSG00000205485', 'ENSG00000212951', 'ENSG00000215271', 'ENSG00000221995', 'ENSG00000224739', 'ENSG00000224745', 'ENSG00000225178', 'ENSG00000225932', 'ENSG00000226377', 'ENSG00000226380', 'ENSG00000226403', 'ENSG00000227021', 'ENSG00000227220', 'ENSG00000227902', ...
β 147 terms (0.20%) are not validated for ensembl_gene_id: ENSG00000269933, ENSG00000261737, ENSG00000259834, ENSG00000256374, ENSG00000263464, ENSG00000203812, ENSG00000272196, ENSG00000272880, ENSG00000284299, ENSG00000270188, ENSG00000287116, ENSG00000237133, ENSG00000224739, ENSG00000227902, ENSG00000239467, ENSG00000272551, ENSG00000280374, ENSG00000284741, ENSG00000236886, ENSG00000229352, ...
registering mus_musculus genes
β did not create Gene records for 135 non-validated ensembl_gene_ids: 'ENSMUSG00000022591', 'ENSMUSG00000045506', 'ENSMUSG00000053706', 'ENSMUSG00000053861', 'ENSMUSG00000066378', 'ENSMUSG00000066810', 'ENSMUSG00000066936', 'ENSMUSG00000067085', 'ENSMUSG00000067122', 'ENSMUSG00000067292', 'ENSMUSG00000067627', 'ENSMUSG00000067929', 'ENSMUSG00000068181', 'ENSMUSG00000069518', 'ENSMUSG00000072693', 'ENSMUSG00000073290', 'ENSMUSG00000073291', 'ENSMUSG00000073682', 'ENSMUSG00000074210', 'ENSMUSG00000074302', ...
β 135 terms (0.20%) are not validated for ensembl_gene_id: ENSMUSG00000022591, ENSMUSG00000094127, ENSMUSG00000066936, ENSMUSG00000116275, ENSMUSG00000091312, ENSMUSG00000098794, ENSMUSG00000079353, ENSMUSG00000096240, ENSMUSG00000079286, ENSMUSG00000085431, ENSMUSG00000075015, ENSMUSG00000075014, ENSMUSG00000078091, ENSMUSG00000075006, ENSMUSG00000079175, ENSMUSG00000079171, ENSMUSG00000079170, ENSMUSG00000079169, ENSMUSG00000090353, ENSMUSG00000100963, ...
registering synthetic_construct genes
β loading non-default source inside a LaminDB instance
β no Bionty source found, skipping Bionty validation
β loading non-default source inside a LaminDB instance
β did not create Gene records for 92 non-validated ensembl_gene_ids: 'ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009', 'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016', 'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024', 'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033', 'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040', ...
β 92 terms (100.00%) are not validated for ensembl_gene_id: ERCC-00002, ERCC-00003, ERCC-00004, ERCC-00009, ERCC-00012, ERCC-00013, ERCC-00014, ERCC-00016, ERCC-00017, ERCC-00019, ERCC-00022, ERCC-00024, ERCC-00025, ERCC-00028, ERCC-00031, ERCC-00033, ERCC-00034, ERCC-00035, ERCC-00039, ERCC-00040, ...
registering severe_acute_respiratory_syndrome_coronavirus_2 genes
β loading non-default source inside a LaminDB instance
β no Bionty source found, skipping Bionty validation
β loading non-default source inside a LaminDB instance
β did not create Gene records for 12 non-validated ensembl_gene_ids: 'ENSSASG00005000002', 'ENSSASG00005000003', 'ENSSASG00005000004', 'ENSSASG00005000006', 'ENSSASG00005000010', 'ENSSASG00005000007', 'ENSSASG00005000011', 'ENSSASG00005000009', 'ENSSASG00005000012', 'ENSSASG00005000008', 'ENSSASG00005000005', 'ENSSASG00005000013'
β 12 terms (100.00%) are not validated for ensembl_gene_id: ENSSASG00005000002, ENSSASG00005000003, ENSSASG00005000004, ENSSASG00005000006, ENSSASG00005000010, ENSSASG00005000007, ENSSASG00005000011, ENSSASG00005000009, ENSSASG00005000012, ENSSASG00005000008, ENSSASG00005000005, ENSSASG00005000013
Link metadata to individual files
Annotate with genes measured in each file:
for idx, file in enumerate(files):
    # Progress message every 100 files.
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(files)}")
    # Open the file in backed mode and read its variable names.
    adata_backed = file.backed()
    var_names = adata_backed.var_names
    organism_record = file.organism.first()
    if organism_record is None:
        print(f"No organism found for file: {file}")
        continue
    genes = lb.Gene.from_values(
        var_names, field=lb.Gene.ensembl_gene_id, organism=organism_record
    )
    # ERCC and SARS-CoV-2 genes are registered under their own organisms.
    # Only the identifiers carrying the matching prefix are looked up there:
    # passing every var_name made each lookup report all genes of the file's
    # own organism as non-validated.
    for prefix, extra_organism in (
        ("ERCC", organisms.synthetic_construct),
        ("ENSSASG", organisms.severe_acute_respiratory_syndrome_coronavirus_2),
    ):
        prefixed_names = var_names[var_names.str.startswith(prefix)]
        if len(prefixed_names) > 0:
            genes += lb.Gene.from_values(
                prefixed_names,
                field=lb.Gene.ensembl_gene_id,
                organism=extra_organism,
            )
    # Link the measured genes to the file under the "var" slot.
    var_feature_set_file = ln.FeatureSet(genes, type="number")
    var_feature_set_file.save()
    file.feature_sets.add(var_feature_set_file, through_defaults={"slot": "var"})
Show code cell output
β did not create Gene records for 10 non-validated ensembl_gene_ids: 'ENSSASG00005000004', 'ENSSASG00005000005', 'ENSSASG00005000006', 'ENSSASG00005000007', 'ENSSASG00005000008', 'ENSSASG00005000009', 'ENSSASG00005000010', 'ENSSASG00005000011', 'ENSSASG00005000012', 'ENSSASG00005000013'
β loading non-default source inside a LaminDB instance
β no Bionty source found, skipping Bionty validation
β loading non-default source inside a LaminDB instance
β did not create Gene records for 33234 non-validated ensembl_gene_ids: 'ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938', 'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084', 'ENSG00000001167', 'ENSG00000001460', 'ENSG00000001461', 'ENSG00000001497', 'ENSG00000001561', 'ENSG00000001617', 'ENSG00000001626', 'ENSG00000001629', 'ENSG00000001630', 'ENSG00000001631', 'ENSG00000002016', ...
β did not create Gene records for 10 non-validated ensembl_gene_ids: 'ENSSASG00005000004', 'ENSSASG00005000005', 'ENSSASG00005000006', 'ENSSASG00005000007', 'ENSSASG00005000008', 'ENSSASG00005000009', 'ENSSASG00005000010', 'ENSSASG00005000011', 'ENSSASG00005000012', 'ENSSASG00005000013'
β loading non-default source inside a LaminDB instance
β no Bionty source found, skipping Bionty validation
β loading non-default source inside a LaminDB instance
β did not create Gene records for 33234 non-validated ensembl_gene_ids: 'ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938', 'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084', 'ENSG00000001167', 'ENSG00000001460', 'ENSG00000001461', 'ENSG00000001497', 'ENSG00000001561', 'ENSG00000001617', 'ENSG00000001626', 'ENSG00000001629', 'ENSG00000001630', 'ENSG00000001631', 'ENSG00000002016', ...
These files are annotated as rhesus or pig but use human genes:
# Files whose variables are validated against human genes.
human_gene_file_uids = ("Np1PSgWwIIYPWz0USN8z", "PuqnmUwzXQ56VPATgy9b")
for uid in human_gene_file_uids:
    file = ln.File.filter(uid=uid).one()
    adata_backed = file.backed()
    var_names = adata_backed.var_names
    genes = lb.Gene.from_values(
        var_names, field=lb.Gene.ensembl_gene_id, organism="human"
    )
    # Link the human gene set to the file under the "var" slot.
    var_feature_set_file = ln.FeatureSet(genes, type="number")
    var_feature_set_file.save()
    file.feature_sets.add(var_feature_set_file, through_defaults={"slot": "var"})
# Show the annotations of the last processed file.
file.describe()
File(uid='PuqnmUwzXQ56VPATgy9b', key='cell-census/2023-07-25/h5ads/db4a9ed2-e994-40c1-b7ec-4091fdf7b6c1.h5ad', suffix='.h5ad', accessor='AnnData', description='A transcriptional cross species map of pancreatic islet cells', size=286688588, hash='HXRDjbTdQSYFOXtU9q09qQ-35', hash_type='md5-n', visibility=1, key_is_virtual=False, updated_at=2023-11-28 22:52:09 UTC)
Provenance:
ποΈ storage: Storage(uid='oIYGbD74', root='s3://cellxgene-data-public', type='s3', region='us-west-2', updated_at=2023-10-16 15:04:08 UTC, created_by_id=1)
π transform: Transform(uid='pNa7RdI26sp4z8', name='Register files from Census release 2023-07-25', short_name='census-release-2023-07-25', version='0', type='notebook', updated_at=2023-11-28 21:30:25 UTC, created_by_id=1)
π£ run: Run(uid='ZYgsnqK5v2hPmFlS0kfG', run_at=2023-11-29 10:04:46 UTC, transform_id=11, created_by_id=1)
π€ created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-11-28 21:14:48 UTC)
Features:
obs: FeatureSet(uid='kwKICViF5O3QjHdg0nov', name='obs features', n=9, type='category', registry='core.Feature', hash='Bx10EzvDxdlAVjqVKdKC', updated_at=2023-11-29 09:28:28 UTC, created_by_id=1)
π assay (1, bionty.ExperimentalFactor): '10x 3' v2'
π cell_type (4, bionty.CellType): 'pancreatic PP cell', 'type B pancreatic cell', 'pancreatic A cell', 'pancreatic D cell'
π development_stage (1, bionty.DevelopmentalStage): 'prime adult stage'
π disease (1, bionty.Disease): 'normal'
π donor_id (1, core.ULabel): 'pig_donor'
π self_reported_ethnicity (1, bionty.Ethnicity): 'na'
π sex (1, bionty.Phenotype): 'female'
π suspension_type (1, core.ULabel): 'cell'
π tissue (1, bionty.Tissue): 'islet of Langerhans'
external: FeatureSet(uid='zIgncie4AywRKgLmKHUW', name='external features', n=2, type='category', registry='core.Feature', hash='5E4xD6tOhDB5EOnLx3tv', updated_at=2023-11-29 09:28:20 UTC, created_by_id=1)
π organism (1, bionty.Organism): 'domestic pig'
π collection (1, core.ULabel): 'A transcriptional cross species map of pancreatic islet cells'
var: FeatureSet(uid='nxOy4SXpndR819ksIxDx', n=15824, type='number', registry='bionty.Gene', hash='gfxllJBvAvyJBu8S2gIF', updated_at=2023-11-29 13:46:53 UTC, created_by_id=1)
'SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1', 'HES4', 'ISG15', 'AGRN', 'TTLL10', 'TNFRSF18', 'TNFRSF4', 'SDF4', 'B3GALT6', 'C1QTNF12', 'UBE2J2', 'SCNN1D', 'ACAP3', 'PUSL1', 'INTS11', 'TAS1R3', ...
Labels:
π·οΈ organism (1, bionty.Organism): 'domestic pig'
π·οΈ tissues (1, bionty.Tissue): 'islet of Langerhans'
π·οΈ cell_types (4, bionty.CellType): 'pancreatic PP cell', 'type B pancreatic cell', 'pancreatic A cell', 'pancreatic D cell'
π·οΈ diseases (1, bionty.Disease): 'normal'
π·οΈ phenotypes (1, bionty.Phenotype): 'female'
π·οΈ experimental_factors (1, bionty.ExperimentalFactor): '10x 3' v2'
π·οΈ developmental_stages (1, bionty.DevelopmentalStage): 'prime adult stage'
π·οΈ ethnicities (1, bionty.Ethnicity): 'na'
π·οΈ ulabels (3, core.ULabel): 'A transcriptional cross species map of pancreatic islet cells', 'pig_donor', 'cell'
Link metadata to dataset
Feature sets:
# Link each feature set to the dataset: the set is found by a fragment of its
# name and attached under the given slot.
# NOTE(review): each lookup uses .one(), so exactly one feature set must match
# every fragment (including "sars-2") — confirm against the registered names.
for name_fragment, slot in (
    ("obs", "obs"),
    ("ext", "external"),
    ("human", "var-human"),
    ("mouse", "var-mouse"),
    ("sars-2", "var-sars-cov-2"),
    ("synthetic construct", "var-ercc"),
):
    dataset.feature_sets.add(
        ln.FeatureSet.filter(name__contains=name_fragment).one(),
        through_defaults={"slot": slot},
    )

# ULabel-based labels: children of each parent label, excluding those for
# which `files` is None.
is_donor = ln.ULabel.filter(name="is_donor").one()
donors = is_donor.children.all().filter().exclude(files=None).all()
is_collection = ln.ULabel.filter(name="is_collection").one()
collections = is_collection.children.all().filter().exclude(files=None).all()
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = is_suspension_type.children.all().filter().exclude(files=None).all()
for ulabels, ulabel_feature in (
    (donors, features.donor_id),
    (collections, features.collection),
    (stypes, features.suspension_type),
):
    dataset.labels.add(ulabels, ulabel_feature)

# Ontology-based labels: each registry's records, excluding those for which
# `files` is None, added under the matching feature.
for registry, registry_feature in (
    (lb.ExperimentalFactor, features.assay),
    (lb.CellType, features.cell_type),
    (lb.DevelopmentalStage, features.development_stage),
    (lb.Disease, features.disease),
    (lb.Ethnicity, features.self_reported_ethnicity),
    (lb.Phenotype, features.sex),
    (lb.Tissue, features.tissue),
):
    dataset.labels.add(registry.filter().exclude(files=None).all(), registry_feature)
dataset.describe()
Dataset(uid='OirHTWDrudY2TYltvIX1', name='cellxgene-census', version='2023-07-25', hash='pEJ9uvIeTLvHkZW2TBT5', visibility=1, updated_at=2023-11-28 21:46:40 UTC)
Provenance:
📔 transform: Transform(uid='pNa7RdI26sp4z8', name='Register files from Census release 2023-07-25', short_name='census-release-2023-07-25', version='0', type='notebook', updated_at=2023-11-28 21:30:25 UTC, created_by_id=1)
👣 run: Run(uid='ZYgsnqK5v2hPmFlS0kfG', run_at=2023-11-29 10:04:46 UTC, transform_id=11, created_by_id=1)
👤 created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-11-28 21:14:48 UTC)
⬇️ input_of (core.Run): ['2023-11-29 12:51:05 UTC']
Features:
obs: FeatureSet(uid='kwKICViF5O3QjHdg0nov', name='obs features', n=9, type='category', registry='core.Feature', hash='Bx10EzvDxdlAVjqVKdKC', updated_at=2023-11-29 09:28:28 UTC, created_by_id=1)
🔗 assay (32, bionty.ExperimentalFactor): 'Seq-Well S3', 'GEXSCOPE technology', 'sci-Plex', 'DroNc-seq', 'MERFISH', 'snmC-Seq2', 'CEL-seq2', '10x 5' transcription profiling', 'Drop-seq', 'microwell-seq', ...
🔗 cell_type (699, bionty.CellType): 'cell of skeletal muscle', 'T-helper 1 cell', 'mesothelial fibroblast', 'kidney collecting duct epithelial cell', 'microglial cell', 'type G enteroendocrine cell', 'pericyte', 'supporting cell', 'CD14-positive, CD16-positive monocyte', 'retinal ganglion cell', ...
🔗 development_stage (215, bionty.DevelopmentalStage): 'Theiler stage 19', '16 weeks', '17 weeks', 'Theiler stage 21', '26 weeks', '7 weeks', '8 month-old stage', '15 weeks', '5 month-old stage', '5 weeks', ...
🔗 disease (76, bionty.Disease): 'epilepsy', 'long COVID-19', 'brain neoplasm', 'Alzheimer disease', 'influenza', 'Crohn disease', 'systemic lupus erythematosus', 'acute promyelocytic leukemia', 'squamous cell lung carcinoma', 'B-cell non-Hodgkin lymphoma', ...
🔗 donor_id (6871, core.ULabel): 'D367', 'H20.33.032', '372317', 'SG_HEL_H136', 'SF11644', 'H18.03.318', '252599', 'KR_SGI_H049', 'homosapiens_None_2023_None_sikkemalisa_002_d10_1101_2022_03_10_483747210I', '426003', ...
🔗 self_reported_ethnicity (28, bionty.Ethnicity): 'Singaporean Chinese', 'African American', 'Irish', 'South Asian', 'Pacific Islander', 'Bangladeshi', 'Hispanic or Latin American', 'admixed ancestry', 'Oceanian', 'European', ...
🔗 sex (3, bionty.Phenotype): 'unknown', 'male', 'female'
🔗 suspension_type (3, core.ULabel): 'cell', 'nucleus', 'na'
🔗 tissue (298, bionty.Tissue): 'nose', 'cervical lymph node', 'body of stomach', 'bronchus', 'tongue', 'fimbria of uterine tube', 'renal glomerulus', 'olfactory region', 'mesenteric artery', 'subcutaneous abdominal adipose tissue', ...
external: FeatureSet(uid='zIgncie4AywRKgLmKHUW', name='external features', n=2, type='category', registry='core.Feature', hash='5E4xD6tOhDB5EOnLx3tv', updated_at=2023-11-29 09:28:20 UTC, created_by_id=1)
🔗 organism (5, bionty.Organism): 'domestic pig', 'mouse', 'white-tufted-ear marmoset', 'human', 'rhesus macaque'
🔗 collection (146, core.ULabel): 'Abdominal White Adipose Tissue', 'A molecular single-cell lung atlas of lethal COVID-19', 'Spatial multiomics map of trophoblast development in early pregnancy', 'Blood and immune development in human fetal bone marrow and Down syndrome', 'Mapping the developing human immune system across organs', 'Evolution of cellular diversity in primary motor cortex of human, marmoset monkey, and mouse', 'Construction of a human cell landscape at single-cell level', 'Impaired local intrinsic immunity to SARS-CoV-2 infection in severe COVID-19', 'Single-cell transcriptomes of the human skin reveal age-related loss of fibroblast priming', 'A single-cell transcriptome atlas of the adult human retina', ...
var-ercc: FeatureSet(uid='VDiO6vtqPe58U4HJPHeD', name='all synthetic construct genes', n=92, type='number', registry='bionty.Gene', hash='rMxzn166gRykjOZFnWRy', updated_at=2023-11-29 09:26:16 UTC, created_by_id=1)
'ERCC-00002 (spike-in control)', 'ERCC-00003 (spike-in control)', 'ERCC-00004 (spike-in control)', 'ERCC-00009 (spike-in control)', 'ERCC-00012 (spike-in control)', 'ERCC-00013 (spike-in control)', 'ERCC-00014 (spike-in control)', 'ERCC-00016 (spike-in control)', 'ERCC-00017 (spike-in control)', 'ERCC-00019 (spike-in control)', 'ERCC-00022 (spike-in control)', 'ERCC-00024 (spike-in control)', 'ERCC-00025 (spike-in control)', 'ERCC-00028 (spike-in control)', 'ERCC-00031 (spike-in control)', 'ERCC-00033 (spike-in control)', 'ERCC-00034 (spike-in control)', 'ERCC-00035 (spike-in control)', 'ERCC-00039 (spike-in control)', 'ERCC-00040 (spike-in control)', ...
var-mouse: FeatureSet(uid='h10gJKScXD72BjnxbIHD', name='all mouse genes', n=55416, type='number', registry='bionty.Gene', hash='umPHI2jmFQXA78M69WBD', updated_at=2023-11-29 09:26:09 UTC, created_by_id=1)
'4933401J01Rik', 'Gm26206', 'Xkr4', 'Gm18956', 'Gm37180', 'Gm37363', 'Gm37686', 'Gm1992', 'Gm37329', 'Gm7341', 'Gm38148', 'Gm19938', 'Gm10568', 'Gm38385', 'Gm27396', 'Gm37381', 'Rp1', 'Gm6101', 'Gm37483', 'Sox17', ...
var-human: FeatureSet(uid='CXzMBf4cCDtBq8N5Sg4a', name='all human genes', n=60664, type='number', registry='bionty.Gene', hash='DOnOv7runwo4TOR5P_do', updated_at=2023-11-29 10:29:23 UTC, created_by_id=1)
'DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A', 'OR4G4P', 'OR4G11P', 'OR4F5', 'None', 'None', 'CICP27', 'None', 'None', 'None', 'None', 'RNU6-1100P', 'None', 'DDX11L17', 'WASH9P', ...
var-sars-cov-2: FeatureSet(uid='Q1oqEHSXHAogP5Ralgw2', name='all sars-2 genes', n=12, type='number', registry='bionty.Gene', hash='CLCjr_EazVM8KxnA7jhc', updated_at=2023-11-29 10:09:19 UTC, created_by_id=1)
'ORF1ab_ENSSASG00005000002', 'ORF1ab_ENSSASG00005000003', 'S', 'ORF3a', 'E', 'M', 'ORF6', 'ORF7a', 'ORF7b', 'ORF8', 'N', 'ORF10'
Labels:
🏷️ organism (5, bionty.Organism): 'domestic pig', 'mouse', 'white-tufted-ear marmoset', 'human', 'rhesus macaque'
🏷️ tissues (298, bionty.Tissue): 'nose', 'cervical lymph node', 'body of stomach', 'bronchus', 'tongue', 'fimbria of uterine tube', 'renal glomerulus', 'olfactory region', 'mesenteric artery', 'subcutaneous abdominal adipose tissue', ...
🏷️ cell_types (699, bionty.CellType): 'cell of skeletal muscle', 'T-helper 1 cell', 'mesothelial fibroblast', 'kidney collecting duct epithelial cell', 'microglial cell', 'type G enteroendocrine cell', 'pericyte', 'supporting cell', 'CD14-positive, CD16-positive monocyte', 'retinal ganglion cell', ...
🏷️ diseases (76, bionty.Disease): 'epilepsy', 'long COVID-19', 'brain neoplasm', 'Alzheimer disease', 'influenza', 'Crohn disease', 'systemic lupus erythematosus', 'acute promyelocytic leukemia', 'squamous cell lung carcinoma', 'B-cell non-Hodgkin lymphoma', ...
🏷️ phenotypes (3, bionty.Phenotype): 'unknown', 'male', 'female'
🏷️ experimental_factors (32, bionty.ExperimentalFactor): 'Seq-Well S3', 'GEXSCOPE technology', 'sci-Plex', 'DroNc-seq', 'MERFISH', 'snmC-Seq2', 'CEL-seq2', '10x 5' transcription profiling', 'Drop-seq', 'microwell-seq', ...
🏷️ developmental_stages (215, bionty.DevelopmentalStage): 'Theiler stage 19', '16 weeks', '17 weeks', 'Theiler stage 21', '26 weeks', '7 weeks', '8 month-old stage', '15 weeks', '5 month-old stage', '5 weeks', ...
🏷️ ethnicities (28, bionty.Ethnicity): 'Singaporean Chinese', 'African American', 'Irish', 'South Asian', 'Pacific Islander', 'Bangladeshi', 'Hispanic or Latin American', 'admixed ancestry', 'Oceanian', 'European', ...
🏷️ ulabels (7020, core.ULabel): 'Abdominal White Adipose Tissue', 'A molecular single-cell lung atlas of lethal COVID-19', 'Spatial multiomics map of trophoblast development in early pregnancy', 'Blood and immune development in human fetal bone marrow and Down syndrome', 'Mapping the developing human immune system across organs', 'Evolution of cellular diversity in primary motor cortex of human, marmoset monkey, and mouse', 'Construction of a human cell landscape at single-cell level', 'Impaired local intrinsic immunity to SARS-CoV-2 infection in severe COVID-19', 'Single-cell transcriptomes of the human skin reveal age-related loss of fibroblast priming', 'A single-cell transcriptome atlas of the adult human retina', ...