Register files from Census release 2023-12-06¶
import lamindb as ln
import lnschema_bionty as lb
import pandas as pd
2023-12-13 12:03:04,521:INFO - NumExpr defaulting to 2 threads.
💡 lamindb instance: laminlabs/cellxgene-latest
# Census release tag; it is embedded in the S3 prefix below and reused later
# as the dataset version and as the key filter for the registered files.
census_version = "2023-12-06"
# Public bucket prefix that holds this release's .h5ad files.
s3path = f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
# Print the directory listing of the prefix as a sanity check.
ln.UPath(s3path).view_tree()
h5ads (0 sub-directories & 1139 files with suffixes '.h5ad'):
├── 00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad
├── 0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad
├── 00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad
├── 00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad
├── 00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad
├── 01209dce-3575-4bed-b1df-129f57fbc031.h5ad
...
# Start run tracking for this notebook; the log output below shows the
# Transform and Run records that were loaded.
ln.track()
π‘ notebook imports: lamindb==0.64.0 lnschema_bionty==0.36.1 pandas==1.4.4 requests==2.31.0
π‘ loaded: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-12-06', short_name='census-release-2023-12-06', version='0', type='notebook', updated_at=2023-12-11 15:39:44 UTC, created_by_id=2)
π‘ loaded: Run(uid='yq2FEOYiiNwTV6HJRReE', run_at=2023-12-13 12:03:11 UTC, transform_id=1, created_by_id=2)
Register artifacts (files)¶
# Build artifact records from the contents of the Census S3 prefix and
# persist them in one bulk save.
artifacts = ln.Artifact.from_dir(s3path)
ln.save(artifacts)
# Group all artifacts of this release into one dataset, versioned by the
# Census release tag.
dataset = ln.Dataset(artifacts, name="cellxgene-census", version=census_version)
dataset.save()
Register metadata¶
Get all datasets and associated metadata using cellxgene REST API:
import requests
def get_datasets_df_from_cxg(timeout: float = 60.0):
    """Fetch the metadata of all datasets from the CELLxGENE curation API.

    Despite the ``df`` in the name, no DataFrame is built: the decoded JSON
    body is returned as-is. The notebook uses the result as a sequence of
    per-dataset dicts (``len(...)``, ``[0].keys()``).

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON response of ``GET /curation/v1/datasets``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    api_url_base = "https://api.cellxgene.cziscience.com"
    datasets_path = "/curation/v1/datasets"
    datasets_url = f"{api_url_base}{datasets_path}"
    headers = {"Content-Type": "application/json"}
    res = requests.get(url=datasets_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    res.raise_for_status()
    return res.json()
# Download the metadata of every dataset once; all later cells iterate over
# this in-memory result rather than calling the API again.
cellxgene_meta = get_datasets_df_from_cxg()
# Number of dataset entries returned by the API.
len(cellxgene_meta)
1152
# Inspect which metadata fields a dataset entry carries (first entry only).
cellxgene_meta[0].keys()
dict_keys(['assay', 'assets', 'batch_condition', 'cell_count', 'cell_type', 'citation', 'collection_doi', 'collection_id', 'collection_name', 'collection_version_id', 'dataset_id', 'dataset_version_id', 'development_stage', 'disease', 'donor_id', 'explorer_url', 'is_primary_data', 'mean_genes_per_cell', 'organism', 'primary_cell_count', 'processing_status', 'published_at', 'revised_at', 'schema_version', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'title', 'tombstone', 'x_approximate_distribution'])
Register new features and parent labels¶
# Observation-level metadata fields, each mapped to the registry that stores
# its labels.
obs_features = {
    "assay": "bionty.ExperimentalFactor",
    "cell_type": "bionty.CellType",
    "development_stage": "bionty.DevelopmentalStage",
    "disease": "bionty.Disease",
    "donor_id": "core.ULabel",
    "self_reported_ethnicity": "bionty.Ethnicity",
    "sex": "bionty.Phenotype",
    "suspension_type": "core.ULabel",
    "tissue": "bionty.Tissue",
    "tissue_type": "core.ULabel",
}
# One categorical Feature per field, saved in bulk.
obs_features_records = [
    ln.Feature(name=feature_name, type="category", registries=feature_registry)
    for feature_name, feature_registry in obs_features.items()
]
ln.save(obs_features_records)

# Bundle the features into a feature set and attach it to every artifact
# under the "obs" slot.
obs_feature_set = ln.FeatureSet(features=obs_features_records, name="obs features")
obs_feature_set.save()
obs_feature_set.artifacts.set(artifacts, through_defaults={"slot": "obs"})
# Dataset-level ("external") metadata fields and the registries behind them.
ext_features = {"organism": "bionty.Organism", "collection": "core.ULabel"}
# One categorical Feature per field, saved in bulk.
ext_features_records = [
    ln.Feature(name=feature_name, type="category", registries=feature_registry)
    for feature_name, feature_registry in ext_features.items()
]
ln.save(ext_features_records)

# Bundle them into a feature set attached to every artifact under the
# "external" slot.
ext_feature_set = ln.FeatureSet(features=ext_features_records, name="external features")
ext_feature_set.save()
ext_feature_set.artifacts.set(artifacts, through_defaults={"slot": "external"})
# Parent labels under which collections, donors, suspension types and tissue
# types are grouped.
for parent_name, parent_description in (
    ("is_collection", "parents of collections"),
    ("is_donor", "parents of donors"),
    ("is_suspension_type", "parents of suspension types"),
    ("is_tissue_type", "parents of tissue types"),
):
    ln.ULabel(name=parent_name, description=parent_description).save()

# Lookup object over the registered features.
features = ln.Feature.lookup()
# Query the registered files of this Census release by their storage key.
artifacts = ln.File.filter(key__contains=census_version).all()
collections, organisms¶
# register all collections
is_collection = ln.ULabel.filter(name="is_collection").one()
# Deduplicate (name, doi, id) triples: many datasets share one collection.
collections_meta = {
    (
        dataset_meta["collection_name"],
        dataset_meta["collection_doi"],
        dataset_meta["collection_id"],
    )
    for dataset_meta in cellxgene_meta
}
collections_records = [
    ln.ULabel(
        name=collection_name,
        description=collection_doi,
        reference=collection_id,
        reference_type="collection_id",
    )
    for collection_name, collection_doi, collection_id in collections_meta
]
ln.save(collections_records)
# File every collection label under the "is_collection" parent.
is_collection.children.add(*collections_records)

# register all organisms
ncbitaxon_source = lb.BiontySource.filter(source="ncbitaxon").one()
# Unique organism ontology ids across all datasets.
organisms_meta = {
    organism["ontology_term_id"]
    for dataset_meta in cellxgene_meta
    for organism in dataset_meta["organism"]
}
organisms_records = lb.Organism.from_values(
    organisms_meta, field=lb.Organism.ontology_id, bionty_source=ncbitaxon_source
)
# rename house mouse to mouse
for r in organisms_records:
    if r.name == "house mouse":
        r.name = "mouse"
ln.save(organisms_records, parents=False)
Link collections and organisms to artifacts:
# Reload the external feature set so its member features can be passed as
# the `feature` argument when labelling files.
ext_feature_set = ln.FeatureSet.filter(name="external features").one()
ext_features = ext_feature_set.members.lookup()
collections = is_collection.children.all()
organisms = lb.Organism.filter().all()

for dataset_meta in cellxgene_meta:
    # get registered file record based on dataset_id
    file = artifacts.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is not None:
        # register collection
        collection = ln.ULabel.filter(reference=dataset_meta["collection_id"]).one()
        file.labels.add(collection, feature=ext_features.collection)

        # register organism
        organism_ontology_ids = [
            organism["ontology_term_id"] for organism in dataset_meta["organism"]
        ]
        organism_records = lb.Organism.filter(
            ontology_id__in=organism_ontology_ids
        ).list()
        file.labels.add(organism_records, feature=ext_features.organism)
ontologies¶
Register all ontology ids:
from typing import Optional
from lnschema_bionty.models import Registry
from lamindb.dev._feature_manager import get_accessor_by_orm
obs_feature_set = ln.FeatureSet.filter(name="obs features").one()
obs_features_records = obs_feature_set.members.lookup()

# Maps a feature's registry string to the name of the related-records
# accessor on ln.File.
ACCESSORS = get_accessor_by_orm(ln.File)

# feature name -> (accessor name on ln.File, registry class of the labels)
FEATURE_TO_ACCESSOR = {}
for name in obs_features:
    feature = getattr(obs_features_records, name)
    accessor = ACCESSORS.get(feature.registries)
    relation_field = getattr(ln.File, accessor).field
    orm = relation_field.model
    # TODO: ulabels are defined in the File model, improve this in LaminDB
    if orm == ln.File:
        orm = relation_field.related_model
    FEATURE_TO_ACCESSOR[name] = (accessor, orm)
def create_ontology_record_from_source(
    ontology_id: str,
    from_orm: Registry,
    target_orm: Registry,
    bionty_source: Optional[lb.BiontySource] = None,
):
    """Build a ``target_orm`` record from a term looked up via ``from_orm``.

    The term is fetched with ``from_orm.from_bionty`` and its ``name``,
    ``description``, ``ontology_id`` and ``bionty_source_id`` are copied into
    a new, unsaved ``target_orm`` instance.

    Args:
        ontology_id: Ontology id of the term to look up.
        from_orm: Registry whose ``from_bionty`` is used for the lookup.
        target_orm: Registry class to instantiate with the copied fields.
        bionty_source: Optional source forwarded to ``from_bionty``.

    Returns:
        The new ``target_orm`` record, or ``None`` if it could not be built.
        Failures stay best-effort, but a warning is emitted so they are no
        longer silent. Callers must drop ``None`` before saving.
    """
    import warnings

    from_record = from_orm.from_bionty(
        ontology_id=ontology_id, bionty_source=bionty_source
    )
    try:
        return target_orm(
            name=from_record.name,
            description=from_record.description,
            ontology_id=from_record.ontology_id,
            bionty_source_id=from_record.bionty_source_id,
        )
    except Exception as error:
        # Keep the original best-effort contract (no raise), but report it.
        target_name = getattr(target_orm, "__name__", target_orm)
        warnings.warn(
            f"could not create {target_name} record for {ontology_id!r}: {error!r}",
            stacklevel=2,
        )
        return None
# Show the obs feature names that the following cells iterate over.
obs_features.keys()
dict_keys(['assay', 'cell_type', 'development_stage', 'disease', 'donor_id', 'self_reported_ethnicity', 'sex', 'suspension_type', 'tissue', 'tissue_type'])
ln.settings.upon_create_search_names = False

# feature name -> set of (label, ontology_term_id) pairs seen in any dataset
ontology_ids = {}
for name in obs_features:
    # These three features are plain ULabels without ontology ids.
    if name in ["donor_id", "suspension_type", "tissue_type"]:
        continue
    allids = set()
    for dataset_entry in cellxgene_meta:
        if name not in dataset_entry:
            continue
        allids.update(
            (term["label"], term["ontology_term_id"]) for term in dataset_entry[name]
        )
    ontology_ids[name] = allids

# Extra sources used below for terms that fail the first validation pass.
bionty_source_ds_mouse = lb.BiontySource.filter(
    entity="DevelopmentalStage", organism="mouse"
).one()
bionty_source_pato = lb.BiontySource.filter(source="pato").one()
# register all ontology ids
for name, terms in ontology_ids.items():
    print(f"registering {name}")
    # Index directly: a missing feature should fail with a clear KeyError
    # rather than a TypeError from unpacking None.
    accessor, orm = FEATURE_TO_ACCESSOR[name]
    terms_ids = [i[1] for i in terms]

    # First pass: create and save whatever from_values can build from the ids.
    records = orm.from_values(terms_ids, field="ontology_id")
    if len(records) > 0:
        ln.save(records)

    # Second pass: handle the ids that are still not validated.
    inspect_result = orm.inspect(terms_ids, field="ontology_id", mute=True)
    if len(inspect_result.non_validated) > 0:
        if name == "development_stage":
            # Retry against the mouse developmental stage source.
            records = orm.from_values(
                inspect_result.non_validated,
                field="ontology_id",
                bionty_source=bionty_source_ds_mouse,
            )
            # UBERON terms are looked up via Tissue and copied into this registry.
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id, from_orm=lb.Tissue, target_orm=orm
                )
                for term_id in inspect_result.non_validated
                if term_id.startswith("UBERON:")
            ]
            # The literal "unknown" becomes a record of its own.
            records += [
                orm(name=term_id, ontology_id=term_id)
                for term_id in inspect_result.non_validated
                if term_id == "unknown"
            ]
        else:
            # Non-PATO leftovers are registered with the label from the API.
            records = [
                orm(name=term[0], ontology_id=term[1])
                for term in terms
                if (not term[1].startswith("PATO:"))
                and (term[1] in inspect_result.non_validated)
            ]
            # PATO terms are looked up via Phenotype and copied into this registry.
            records += [
                create_ontology_record_from_source(
                    ontology_id=term_id,
                    from_orm=lb.Phenotype,
                    target_orm=orm,
                    bionty_source=bionty_source_pato,
                )
                for term_id in inspect_result.non_validated
                if term_id.startswith("PATO:")
            ]
        # create_ontology_record_from_source returns None when it cannot build
        # a record; drop those so they are neither reported nor saved.
        records = [record for record in records if record is not None]
        if len(records) > 0:
            print(f"registered {len(records)} records: {records}")
            ln.save(records)
Show code cell output
registering assay
β now recursing through parents: this only happens once, but is much slower than bulk saving
registering cell_type
β now recursing through parents: this only happens once, but is much slower than bulk saving
registering development_stage
β did not create DevelopmentalStage records for 57 non-validated ontology_ids: 'MmusDv:0000021', 'MmusDv:0000024', 'MmusDv:0000025', 'MmusDv:0000026', 'MmusDv:0000027', 'MmusDv:0000028', 'MmusDv:0000029', 'MmusDv:0000032', 'MmusDv:0000033', 'MmusDv:0000034', 'MmusDv:0000035', 'MmusDv:0000036', 'MmusDv:0000037', 'MmusDv:0000041', 'MmusDv:0000046', 'MmusDv:0000048', 'MmusDv:0000049', 'MmusDv:0000050', 'MmusDv:0000051', 'MmusDv:0000052', ...
β now recursing through parents: this only happens once, but is much slower than bulk saving
β did not create DevelopmentalStage records for 6 non-validated ontology_ids: 'UBERON:0000113', 'UBERON:0007220', 'UBERON:0007222', 'UBERON:0018241', 'UBERON:0034919', 'unknown'
registered 57 records: [DevelopmentalStage(uid='oorzGtyN', name='24 weeks', ontology_id='MmusDv:0000074', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 168 Days And Under 176 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='fiRoa3XX', name='22 weeks', ontology_id='MmusDv:0000072', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 154 Days And Under 162 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='W2bOwIcG', name='18 month-old stage', ontology_id='MmusDv:0000089', description='Aged Adult Stage That Refers To A Mouse Which Is Over 18 And Under 19 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='jOuS6eot', name='Theiler stage 24', ontology_id='MmusDv:0000033', synonyms='E16|TS24', description='Fetal Stage During Which The Umbilical Hernia Disappears And There Is A Corresponding Increase In The Size Of The Peritoneal Sac.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='cGDi3WbC', name='8 weeks', ontology_id='MmusDv:0000052', description='2 Month-Old Stage That Refers To A Mouse Which Is Over 56 Days And Under 64 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='0QLIsnxe', name='4 month-old stage', ontology_id='MmusDv:0000064', description='Early Adult Stage That Refers To A Mouse Which Is Over 4 And Under 5 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='DgGRffIB', name='3 month-old stage', ontology_id='MmusDv:0000063', description='Early Adult Stage That Refers To A Mouse Which Is Over 3 And Under 4 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='BHHRJHHR', name='12 weeks', ontology_id='MmusDv:0000056', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 84 Days And Under 92 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='b8YHi0Nc', name='11 weeks', ontology_id='MmusDv:0000055', description='2 Month-Old 
Stage That Refers To A Mouse Which Is Over 77 Days And Under 85 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='o9MDgZ6M', name='Theiler stage 19', ontology_id='MmusDv:0000026', synonyms='TS19|E11-12.25', description='Organogenesis Stage During Which The Lens Vesicle Becomes Completely Closed And Detached From The Ectoderm, And Peripheral Margins Of The Eye Become Well Defined.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='IZx9mYT1', name='Theiler stage 21', ontology_id='MmusDv:0000028', synonyms='E12.5-14|TS21', description='Organogenesis Stage During Which The Digit Widths And Locations Can Be Discerned, And The Pinna Rapidly Develops And Forms A Crest At Right Angles To The Head.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='jl2j0dxn', name='6 weeks', ontology_id='MmusDv:0000050', description='Early Stage That Refers To A Mouse Which Is Over 42 Days And Under 50 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Z0CP8fi0', name='4-7 days', ontology_id='MmusDv:0000113', description='Premature Stage That Refers To The Newborn Mouse Which Is Over 4 Days And Under 8 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='H0YqN4ng', name='9 weeks', ontology_id='MmusDv:0000053', description='2 Month-Old Stage That Refers To A Mouse Which Is Over 63 Days And Under 71 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='tJ45g0Sb', name='unknown', ontology_id='MmusDv:0000041', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='giDZt5Yz', name='19 weeks', ontology_id='MmusDv:0000068', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 133 Days And Under 141 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='uDB0Ob7B', name='Theiler stage 28', ontology_id='MmusDv:0000037', description='Postnatal Development That Covers The Period After The Ts27 Stage (P0-P3 First Days Of Life) And Continuing 
To Adulthood. P4 To Adulthood.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='YF8WBL2Q', name='15 weeks', ontology_id='MmusDv:0000059', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 105 Days And Under 113 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='8MuTFQO8', name='13 weeks', ontology_id='MmusDv:0000057', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 91 Days And Under 99 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='BzdTRbz2', name='14 weeks', ontology_id='MmusDv:0000058', description='3 Month-Old Stage That Refers To A Mouse Which Is Over 98 Days And Under 106 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Y4EsPXsh', name='10 weeks', ontology_id='MmusDv:0000054', description='2 Month-Old Stage That Refers To A Mouse Which Is Over 70 Days And Under 78 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='msz7ilUG', name='26 weeks', ontology_id='MmusDv:0000099', description='6 Month-Old Stage That Refers To A Mouse Which Is Over 182 Days And Under 190 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Ni1mjjFg', name='mature stage', ontology_id='MmusDv:0000110', synonyms='mature', description='Mouse Developmental Stage That Refers To A Sexually Mature Adult Mouse Which Is Over 6 Weeks Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='8M0BbvJc', name='23 weeks', ontology_id='MmusDv:0000073', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 161 Days And Under 169 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='df9sEkEI', name='early adult stage', ontology_id='MmusDv:0000061', description='Mature Stage That Refers To A Adult Mouse Which Is Over 6 Weeks And Under 7 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='qqSg01zz', name='20 weeks', ontology_id='MmusDv:0000070', description='4 
Month-Old Stage That Refers To A Mouse Which Is Over 140 Days And Under 148 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='ct1TGudU', name='Theiler stage 26', ontology_id='MmusDv:0000035', synonyms='TS26|E18', description='Fetal Stage Defined By Long Whiskers And During Which The Eyes Are Barely Visible Through The Closed Eyelids.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='vjWmEEv6', name='2 weeks', ontology_id='MmusDv:0000046', description='Premature Stage That Refers To A Mouse Which Is Over 14 Days And Under 22 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='U7yEA0lB', name='Theiler stage 27', ontology_id='MmusDv:0000036', description='Stage That Refers To The Newborn Mouse, Aged E19-20, P0. Description Of Anatomical Structures Related To This Stage Corresponds To The First Days Of The Mouse Life. Used For Postnatal Days 0 Through 3.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='TkE1VPQ6', name='7 weeks', ontology_id='MmusDv:0000051', description='Early Adult Stage That Refers To A Mouse Which Is Over 49 Days And Under 57 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='O7areJVw', name='5 month-old stage', ontology_id='MmusDv:0000069', description='Early Adult Stage That Refers To A Mouse Which Is Over 5 And Under 6 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='jym1WIjI', name='2 month-old stage', ontology_id='MmusDv:0000062', description='Early Adult Stage That Refers To A Mouse Which Is Over 2 And Under 3 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='PYPUpHe0', name='29 weeks', ontology_id='MmusDv:0000102', description='6 Month-Old Stage That Refers To A Mouse Which Is Over 203 Days And Under 211 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='loNJxGq0', name='Theiler stage 14', ontology_id='MmusDv:0000021', synonyms='TS14|E8.5-9.75', 
description='Organogenesis Stage During Which The Rostral Extremity Of The Neural Tube Closes In Embryos With Usually About 15-18 Somite Pairs. Late In The Stage The 3Rd Branchial Arch Becomes Visible.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='6N4RtxSR', name='Theiler stage 25', ontology_id='MmusDv:0000034', synonyms='TS25|E17', description='Fetal Stage During Which The Thickened Skin Forms Wrinkles And The Subcutaneous Veins Are Less Visible.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Zlsx8Dmt', name='18 weeks', ontology_id='MmusDv:0000067', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 126 Days And Under 134 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='asteE00k', name='20 month-old stage and over', ontology_id='MmusDv:0000091', description='Aged Adult Stage That Refers To A Mouse Which Is Over 20 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='IFN01quy', name='21 weeks', ontology_id='MmusDv:0000071', description='5 Month-Old Stage That Refers To A Mouse Which Is Over 147 Days And Under 155 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='K7pFaQGM', name='25 weeks', ontology_id='MmusDv:0000098', description='6 Month-Old Stage That Refers To A Mouse Which Is Over 175 Days And Under 183 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='yOlwr0TC', name='4 weeks', ontology_id='MmusDv:0000048', description='Premature Stage That Refers To A Mouse Which Is Over 28 Days And Under 36 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='37VdjVb3', name='5 weeks', ontology_id='MmusDv:0000049', description='Premature Stage That Refers To A Mouse Which Is Over 35 Days And Under 43 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='070cWUIr', name='Theiler stage 22', ontology_id='MmusDv:0000029', synonyms='E13.5-15|TS22', description='Organogenesis Stage 
During Which The Fingers Are Clearly Visible And The Long Bones Of The Limbs Are Present.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='yWOXIv07', name='Theiler stage 17', ontology_id='MmusDv:0000024', synonyms='TS17|E10-11.25', description='Organogenesis Stage Defined By The Deepening Of The Lens Pit And The First Appearance Of The Physiological Umbilical Hernia.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Xsadx8yg', name='8 month-old stage', ontology_id='MmusDv:0000079', description='Middle Aged Stage That Refers To A Mouse Which Is Over 8 And Under 9 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='wlyIb2NL', name='16 weeks', ontology_id='MmusDv:0000065', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 112 Days And Under 120 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='VEhMtYDB', name='Theiler stage 18', ontology_id='MmusDv:0000025', synonyms='E10.5-11.25|TS18', description='Organogenesis Stage During Which The Lens Vesicle Gradually Closes And The Nasal Pits Start To Form. 
The Rapid Growth Of The Brain Is Striking.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='QWeKuumK', name='Theiler stage 23', ontology_id='MmusDv:0000032', synonyms='E15|TS23', description='Fetal Stage During Which The Toes Separate And Hair Follicles Are Present In The Cephalic Region But Not At The Periphery Of The Vibrissae.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='p538b004', name='Theiler stage 20', ontology_id='MmusDv:0000027', synonyms='E11.5-13|TS20', description='Organogenesis Stage During Which The Handplate Develops Angles Corresponding To The Future Digits, And Tongue And Brain Vesicles Are Clearly Visible.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='HUrILEBs', name='16 month-old stage', ontology_id='MmusDv:0000087', description='Aged Adult Stage That Refers To A Mouse Which Is Over 16 And Under 17 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='PzAvRdXi', name='6 month-old stage', ontology_id='MmusDv:0000077', description='Early Adult Stage That Refers To A Mouse Which Is Over 6 And Under 7 Months Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='Dx800l8t', name='17 weeks', ontology_id='MmusDv:0000066', description='4 Month-Old Stage That Refers To A Mouse Which Is Over 119 Days And Under 127 Days Old.', bionty_source_id=49, created_by_id=2), DevelopmentalStage(uid='lNh8U4YZ', name='late embryonic stage', ontology_id='UBERON:0007220', description='An Embryo Stage That Covers Late Steps Of The Embryogenesis With A Fully Formed Embryo Still Developing Before Birth Or Egg Hatching.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='wksJWjer', name='prime adult stage', ontology_id='UBERON:0018241', description='A Life Cycle Stage That Starts At Completion Of Development And Growth Of The Sexually Mature Adult Animal, And Ends Before Senescence.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='hqyIKjfF', name='late 
adult stage', ontology_id='UBERON:0007222', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='l00DTC4g', name='juvenile stage', ontology_id='UBERON:0034919', description='The Stage Of Being No More Dependent Of The Nest And/Or From Caregivers For Subsistence While Having Not Reach Sexual Maturity.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='GDaE3j6Z', name='post-juvenile adult stage', ontology_id='UBERON:0000113', description='The Stage Of Being A Sexually Mature Adult Animal.', bionty_source_id=25, created_by_id=2), DevelopmentalStage(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=2)]
β now recursing through parents: this only happens once, but is much slower than bulk saving
registering disease
β did not create Disease record for 1 non-validated ontology_id: 'PATO:0000461'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 1 records: [Disease(uid='4r2nqggf', name='normal', ontology_id='PATO:0000461', description='A Quality Inhering In A Bearer By Virtue Of The Bearer'S Exhibiting No Deviation From Normal Or Average.', bionty_source_id=43, created_by_id=2)]
registering self_reported_ethnicity
β did not create Ethnicity records for 3 non-validated ontology_ids: 'multiethnic', 'na', 'unknown'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 3 records: [Ethnicity(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=2), Ethnicity(uid='UY1fNAFT', name='na', ontology_id='na', created_by_id=2), Ethnicity(uid='8lAgy5Ej', name='multiethnic', ontology_id='multiethnic', created_by_id=2)]
registering sex
β did not create Phenotype records for 3 non-validated ontology_ids: 'PATO:0000383', 'unknown', 'PATO:0000384'
registered 3 records: [Phenotype(uid='xL8yuEN7', name='unknown', ontology_id='unknown', created_by_id=2), Phenotype(uid='hSl0sSF0', name='female', ontology_id='PATO:0000383', description='A Biological Sex Quality Inhering In An Individual Or A Population That Only Produces Gametes That Can Be Fertilised By Male Gametes.', bionty_source_id=43, created_by_id=2), Phenotype(uid='Pl1UiuS0', name='male', ontology_id='PATO:0000384', description='A Biological Sex Quality Inhering In An Individual Or A Population Whose Sex Organs Contain Only Male Gametes.', bionty_source_id=43, created_by_id=2)]
registering tissue
β did not create Tissue records for 17 non-validated ontology_ids: 'CL:0000010 (cell culture)', 'CL:0000082 (cell culture)', 'CL:0000084 (cell culture)', 'CL:0000115 (cell culture)', 'CL:0000351 (cell culture)', 'CL:0002322 (cell culture)', 'CL:0002327 (cell culture)', 'CL:0002328 (cell culture)', 'CL:0002334 (cell culture)', 'CL:0002335 (cell culture)', 'CL:0002633 (cell culture)', 'CL:0010003 (cell culture)', 'UBERON:0000088 (organoid)', 'UBERON:0000966 (organoid)', 'UBERON:0001295 (organoid)', 'UBERON:0002048 (organoid)', 'UBERON:0002370 (organoid)'
β now recursing through parents: this only happens once, but is much slower than bulk saving
registered 17 records: [Tissue(uid='vg9s890t', name='respiratory basal cell (cell culture)', ontology_id='CL:0002633 (cell culture)', created_by_id=2), Tissue(uid='lfIFQFR5', name='epithelial cell of lung (cell culture)', ontology_id='CL:0000082 (cell culture)', created_by_id=2), Tissue(uid='rIPA0OEl', name='T cell (cell culture)', ontology_id='CL:0000084 (cell culture)', created_by_id=2), Tissue(uid='x3tRcugV', name='trophoblast (organoid)', ontology_id='UBERON:0000088 (organoid)', created_by_id=2), Tissue(uid='uS0Cw8zN', name='retina (organoid)', ontology_id='UBERON:0000966 (organoid)', created_by_id=2), Tissue(uid='kWD0kb5x', name='brown preadipocyte (cell culture)', ontology_id='CL:0002335 (cell culture)', created_by_id=2), Tissue(uid='UoElNxsj', name='endothelial cell (cell culture)', ontology_id='CL:0000115 (cell culture)', created_by_id=2), Tissue(uid='7MzqN14b', name='bronchial epithelial cell (cell culture)', ontology_id='CL:0002328 (cell culture)', created_by_id=2), Tissue(uid='RkE6D8y1', name='endometrium (organoid)', ontology_id='UBERON:0001295 (organoid)', created_by_id=2), Tissue(uid='yPk6E1V8', name='epithelial cell of alveolus of lung (cell culture)', ontology_id='CL:0010003 (cell culture)', created_by_id=2), Tissue(uid='K4RSNRBc', name='thymus (organoid)', ontology_id='UBERON:0002370 (organoid)', created_by_id=2), Tissue(uid='9ICArUMH', name='embryonic stem cell (cell culture)', ontology_id='CL:0002322 (cell culture)', created_by_id=2), Tissue(uid='WSs6UA9e', name='lung (organoid)', ontology_id='UBERON:0002048 (organoid)', created_by_id=2), Tissue(uid='w6gzNa8D', name='mammary gland epithelial cell (cell culture)', ontology_id='CL:0002327 (cell culture)', created_by_id=2), Tissue(uid='Ash8pGf8', name='trophoblast cell (cell culture)', ontology_id='CL:0000351 (cell culture)', created_by_id=2), Tissue(uid='CevFMDqD', name='preadipocyte (cell culture)', ontology_id='CL:0002334 (cell culture)', created_by_id=2), Tissue(uid='9YB5clqY', name='cultured 
cell (cell culture)', ontology_id='CL:0000010 (cell culture)', created_by_id=2)]
donors and suspension_types¶
# Collect the distinct donor ids and suspension types across all datasets.
donor_ids = set()
suspension_types = set()
for dataset_entry in cellxgene_meta:
    for key, collected in (
        ("donor_id", donor_ids),
        ("suspension_type", suspension_types),
    ):
        if key in dataset_entry:
            collected.update(dataset_entry[key])

# Register donors not yet present under the "is_donor" parent.
is_donor = ln.ULabel.filter(name="is_donor").one()
donors = is_donor.children.all()
result = donors.inspect(donor_ids, mute=True)
new_donors = [ln.ULabel(name=donor_name) for donor_name in result.non_validated]
ln.save(new_donors)
is_donor.children.add(*new_donors)

# Same for suspension types under the "is_suspension_type" parent.
is_suspension_type = ln.ULabel.filter(name="is_suspension_type").one()
stypes = is_suspension_type.children.all()
result = stypes.inspect(suspension_types, mute=True)
new_stypes = [ln.ULabel(name=stype_name) for stype_name in result.non_validated]
ln.save(new_stypes)
is_suspension_type.children.add(*new_stypes)
Annotate artifacts with obs metadata¶
# Display the feature name -> (accessor, registry) mapping used below.
FEATURE_TO_ACCESSOR
{'assay': ('experimental_factors', lnschema_bionty.models.ExperimentalFactor),
'cell_type': ('cell_types', lnschema_bionty.models.CellType),
'development_stage': ('developmental_stages',
lnschema_bionty.models.DevelopmentalStage),
'disease': ('diseases', lnschema_bionty.models.Disease),
'donor_id': ('ulabels', lnschema_core.models.ULabel),
'self_reported_ethnicity': ('ethnicities', lnschema_bionty.models.Ethnicity),
'sex': ('phenotypes', lnschema_bionty.models.Phenotype),
'suspension_type': ('ulabels', lnschema_core.models.ULabel),
'tissue': ('tissues', lnschema_bionty.models.Tissue),
'tissue_type': ('ulabels', lnschema_core.models.ULabel)}
features = ln.Feature.lookup()

for idx, dataset_meta in enumerate(cellxgene_meta):
    # Progress message every 100 datasets.
    if idx % 100 == 0:
        print(f"annotating dataset {idx} of {len(cellxgene_meta)}")

    # Entries without a registered file are skipped.
    file = artifacts.filter(key__contains=dataset_meta["dataset_id"]).one_or_none()
    if file is None:
        continue

    for field, terms in dataset_meta.items():
        # Only metadata fields that correspond to an obs feature are linked.
        if field not in FEATURE_TO_ACCESSOR:
            continue
        accessor, orm = FEATURE_TO_ACCESSOR[field]

        if field not in ["donor_id", "suspension_type", "tissue_type"]:
            # Ontology-backed fields are matched by ontology id and linked
            # through the file's accessor.
            records = orm.from_values(
                [term["ontology_term_id"] for term in terms], field="ontology_id"
            )
            if len(records) > 0:
                getattr(file, accessor).add(*records)
        else:
            # ULabel-backed fields are matched by name.
            records = orm.from_values(terms, field="name")
            if len(records) > 0:
                # stratify by feature so that link tables records are written
                file.labels.add(records, feature=getattr(features, field))
Show code cell output
annotating dataset 0 of 1152
annotating dataset 100 of 1152
annotating dataset 200 of 1152
annotating dataset 300 of 1152
annotating dataset 400 of 1152
annotating dataset 500 of 1152
annotating dataset 600 of 1152
annotating dataset 700 of 1152
annotating dataset 800 of 1152
annotating dataset 900 of 1152
annotating dataset 1000 of 1152
annotating dataset 1100 of 1152
Clean up the 2 “unknowns” in DevelopmentalStage:
# Delete DevelopmentalStage records named "unknown" except the one whose
# ontology_id is literally "unknown".
lb.DevelopmentalStage.filter(name="unknown").exclude(ontology_id="unknown").delete()
Validate and register genes¶
# register synthetic constructs and sars_cov_2 as new organisms
new_organisms = lb.Organism.from_values(
    ["NCBITaxon:32630", "NCBITaxon:2697049"],
    field=lb.Organism.ontology_id,
    bionty_source=ncbitaxon_source,
)
ln.save(new_organisms, parents=False)
# genes files
# Lookup keyed by scientific name; the keys of `genes_files` below are used
# as attribute names on this lookup.
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)
# Gene reference tables from the cellxgene schema repository, one per organism.
genes_files = {
    "homo_sapiens": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_homo_sapiens.csv.gz",
    "mus_musculus": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_mus_musculus.csv.gz",
    "synthetic_construct": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_ercc.csv.gz",
    "severe_acute_respiratory_syndrome_coronavirus_2": "https://github.com/chanzuckerberg/single-cell-curation/raw/main/cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_sars_cov_2.csv.gz",
}
Register all genes for each organism:
# For every organism: load its gene table, register the genes that
# lb.Gene.from_values returns, create records by hand for the IDs that do not
# validate, and store everything as one "all <organism> genes" feature set.
for organism_name, genes_file in genes_files.items():
    print(f"registering {organism_name} genes")
    # The CSV has no header row; column 0 (the gene ID) becomes the index.
    df = pd.read_csv(genes_file, header=None, index_col=0)
    organism_record = getattr(organisms, organism_name)
    gene_query = dict(field=lb.Gene.ensembl_gene_id, organism=organism_record)

    gene_records = lb.Gene.from_values(df.index, **gene_query)
    ln.save(gene_records)

    # register legacy genes manually: every ID that fails validation gets a
    # Gene record whose symbol is taken from column 1 of the table
    validated = lb.Gene.validate(df.index, **gene_query)
    new_records = [
        lb.Gene(
            ensembl_gene_id=legacy_id,
            symbol=df.loc[legacy_id][1],
            organism=organism_record,
        )
        for legacy_id in df.index[~validated]
    ]
    ln.save(new_records)

    feature_set_name = f"all {organism_record.name} genes"
    genes_feature_set = ln.FeatureSet(
        features=gene_records + new_records, name=feature_set_name
    )
    genes_feature_set.save()
Show code cell output
registering homo_sapiens genes
β did not create Gene records for 147 non-validated ensembl_gene_ids: 'ENSG00000112096', 'ENSG00000137808', 'ENSG00000161149', 'ENSG00000182230', 'ENSG00000203812', 'ENSG00000204092', 'ENSG00000205485', 'ENSG00000212951', 'ENSG00000215271', 'ENSG00000221995', 'ENSG00000224739', 'ENSG00000224745', 'ENSG00000225178', 'ENSG00000225932', 'ENSG00000226377', 'ENSG00000226380', 'ENSG00000226403', 'ENSG00000227021', 'ENSG00000227220', 'ENSG00000227902', ...
β 147 terms (0.20%) are not validated for ensembl_gene_id: ENSG00000269933, ENSG00000261737, ENSG00000259834, ENSG00000256374, ENSG00000263464, ENSG00000203812, ENSG00000272196, ENSG00000272880, ENSG00000284299, ENSG00000270188, ENSG00000287116, ENSG00000237133, ENSG00000224739, ENSG00000227902, ENSG00000239467, ENSG00000272551, ENSG00000280374, ENSG00000284741, ENSG00000236886, ENSG00000229352, ...
registering mus_musculus genes
β did not create Gene records for 135 non-validated ensembl_gene_ids: 'ENSMUSG00000022591', 'ENSMUSG00000045506', 'ENSMUSG00000053706', 'ENSMUSG00000053861', 'ENSMUSG00000066378', 'ENSMUSG00000066810', 'ENSMUSG00000066936', 'ENSMUSG00000067085', 'ENSMUSG00000067122', 'ENSMUSG00000067292', 'ENSMUSG00000067627', 'ENSMUSG00000067929', 'ENSMUSG00000068181', 'ENSMUSG00000069518', 'ENSMUSG00000072693', 'ENSMUSG00000073290', 'ENSMUSG00000073291', 'ENSMUSG00000073682', 'ENSMUSG00000074210', 'ENSMUSG00000074302', ...
β 135 terms (0.20%) are not validated for ensembl_gene_id: ENSMUSG00000022591, ENSMUSG00000094127, ENSMUSG00000066936, ENSMUSG00000116275, ENSMUSG00000091312, ENSMUSG00000098794, ENSMUSG00000079353, ENSMUSG00000096240, ENSMUSG00000079286, ENSMUSG00000085431, ENSMUSG00000075015, ENSMUSG00000075014, ENSMUSG00000078091, ENSMUSG00000075006, ENSMUSG00000079175, ENSMUSG00000079171, ENSMUSG00000079170, ENSMUSG00000079169, ENSMUSG00000090353, ENSMUSG00000100963, ...
registering synthetic_construct genes
β loading non-default source inside a LaminDB instance
β no Bionty source found, skipping Bionty validation
β loading non-default source inside a LaminDB instance
β did not create Gene records for 92 non-validated ensembl_gene_ids: 'ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009', 'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016', 'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024', 'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033', 'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040', ...
β 92 terms (100.00%) are not validated for ensembl_gene_id: ERCC-00002, ERCC-00003, ERCC-00004, ERCC-00009, ERCC-00012, ERCC-00013, ERCC-00014, ERCC-00016, ERCC-00017, ERCC-00019, ERCC-00022, ERCC-00024, ERCC-00025, ERCC-00028, ERCC-00031, ERCC-00033, ERCC-00034, ERCC-00035, ERCC-00039, ERCC-00040, ...
registering severe_acute_respiratory_syndrome_coronavirus_2 genes
β loading non-default source inside a LaminDB instance
β no Bionty source found, skipping Bionty validation
β loading non-default source inside a LaminDB instance
β did not create Gene records for 12 non-validated ensembl_gene_ids: 'ENSSASG00005000002', 'ENSSASG00005000003', 'ENSSASG00005000004', 'ENSSASG00005000006', 'ENSSASG00005000010', 'ENSSASG00005000007', 'ENSSASG00005000011', 'ENSSASG00005000009', 'ENSSASG00005000012', 'ENSSASG00005000008', 'ENSSASG00005000005', 'ENSSASG00005000013'
β 12 terms (100.00%) are not validated for ensembl_gene_id: ENSSASG00005000002, ENSSASG00005000003, ENSSASG00005000004, ENSSASG00005000006, ENSSASG00005000010, ENSSASG00005000007, ENSSASG00005000011, ENSSASG00005000009, ENSSASG00005000012, ENSSASG00005000008, ENSSASG00005000005, ENSSASG00005000013
Link metadata to individual artifacts
Annotate each file with the genes measured in it:
organisms = lb.Organism.lookup(field=lb.Organism.scientific_name)

# Additional gene sources that can be mixed into any file:
# (var-name prefix, attribute of `organisms` holding the matching organism).
extra_gene_sources = (
    ("ERCC", "synthetic_construct"),
    ("ENSSASG", "severe_acute_respiratory_syndrome_coronavirus_2"),
)

# Build a "var" feature set per artifact from the gene IDs in its var_names.
for idx, file in enumerate(artifacts):
    if not idx % 100:
        print(f"annotating dataset {idx} of {len(artifacts)}")

    adata_backed = file.backed()
    var_names = adata_backed.var_names

    organism_record = file.organism.first()
    if organism_record is None:
        # Without an organism the gene IDs cannot be resolved; skip the file.
        print(f"No organism found for file: {file}")
        continue

    genes = lb.Gene.from_values(
        var_names, field=lb.Gene.ensembl_gene_id, organism=organism_record
    )
    # Fallback: nothing matched for the annotated organism although the first
    # ID starts with "ENSG" -> retry with organism="human".
    if not genes and var_names[0].startswith("ENSG"):
        genes += lb.Gene.from_values(
            var_names, field=lb.Gene.ensembl_gene_id, organism="human"
        )

    for prefix, organism_attr in extra_gene_sources:
        if len(var_names[var_names.str.startswith(prefix)]) > 0:
            genes += lb.Gene.from_values(
                var_names,
                field=lb.Gene.ensembl_gene_id,
                organism=getattr(organisms, organism_attr),
            )

    var_feature_set_file = ln.FeatureSet(genes, type="number")
    var_feature_set_file.save()
    var_slot = {"slot": "var"}
    file.feature_sets.add(var_feature_set_file, through_defaults=var_slot)
Show code cell output
annotating dataset 0 of 12
# Inspect the annotations of `file`, i.e. whichever artifact the annotation
# loop above left bound to that name.
file.describe()
File(uid='8aIkAQpSXAWvebiuOT53', key='cell-census/2023-12-06/h5ads/ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded.h5ad', suffix='.h5ad', accessor='AnnData', size=339098252, hash='wk4aVyHI7iZWNq2n99_s4w-41', hash_type='md5-n', visibility=1, key_is_virtual=False, updated_at=2023-12-11 15:46:45 UTC)
Provenance:
ποΈ storage: Storage(uid='vm6fiuHv', root='s3://cellxgene-data-public', type='s3', region='us-west-2', updated_at=2023-12-11 15:39:59 UTC, created_by_id=2)
π transform: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-12-06', short_name='census-release-2023-12-06', version='0', type='notebook', updated_at=2023-12-11 15:39:44 UTC, created_by_id=2)
π£ run: Run(uid='yq2FEOYiiNwTV6HJRReE', run_at=2023-12-13 10:17:40 UTC, transform_id=1, created_by_id=2)
π€ created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-12-12 10:05:51 UTC)
Features:
obs: FeatureSet(uid='VOhmBdxtNgUpMiUUMR56', name='obs features', n=10, registry='core.Feature', hash='vRtez9Dl4oTSutrbWK13', updated_at=2023-12-11 18:45:36 UTC, created_by_id=2)
π assay (1, bionty.ExperimentalFactor): '10x 3' v3'
π cell_type (10, bionty.CellType): 'vascular associated smooth muscle cell', 'astrocyte', 'central nervous system macrophage', 'oligodendrocyte', 'pericyte', 'neuron', 'oligodendrocyte precursor cell', 'endothelial cell', 'leukocyte', 'fibroblast'
π development_stage (3, bionty.DevelopmentalStage): '50-year-old human stage', '42-year-old human stage', '29-year-old human stage'
π disease (1, bionty.Disease): 'normal'
π donor_id (3, core.ULabel): 'H19.30.001', 'H19.30.002', 'H18.30.002'
π self_reported_ethnicity (1, bionty.Ethnicity): 'European'
π sex (1, bionty.Phenotype): 'male'
π suspension_type (1, core.ULabel): 'nucleus'
π tissue (1, bionty.Tissue): 'cerebral cortex'
π tissue_type (0, core.ULabel):
external: FeatureSet(uid='R9sY9Bx6I7Rv4aIY9BPM', name='external features', n=2, registry='core.Feature', hash='yOyigQuaDFnr-gu0w0qy', updated_at=2023-12-11 18:45:51 UTC, created_by_id=2)
π organism (1, bionty.Organism): 'human'
π collection (1, core.ULabel): 'Human Brain Cell Atlas v1.0'
var: FeatureSet(uid='GY9YeH2BaFhUz1egijMR', n=59357, type='number', registry='bionty.Gene', hash='pUpMpu0zd84pS0uKAhOq', updated_at=2023-12-13 10:25:21 UTC, created_by_id=2)
'DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2', 'OR4G4P', 'OR4G11P', 'CICP27', 'None', 'None', 'None', 'RNU6-1100P', 'None', 'DDX11L17', 'WASH9P', 'MIR6859-2', 'None', 'None', 'RPL23AP24', 'None', 'WBP1LP7', ...
Labels:
π·οΈ organism (1, bionty.Organism): 'human'
π·οΈ tissues (1, bionty.Tissue): 'cerebral cortex'
π·οΈ cell_types (10, bionty.CellType): 'vascular associated smooth muscle cell', 'astrocyte', 'central nervous system macrophage', 'oligodendrocyte', 'pericyte', 'neuron', 'oligodendrocyte precursor cell', 'endothelial cell', 'leukocyte', 'fibroblast'
π·οΈ diseases (1, bionty.Disease): 'normal'
π·οΈ phenotypes (1, bionty.Phenotype): 'male'
π·οΈ experimental_factors (1, bionty.ExperimentalFactor): '10x 3' v3'
π·οΈ developmental_stages (3, bionty.DevelopmentalStage): '50-year-old human stage', '42-year-old human stage', '29-year-old human stage'
π·οΈ ethnicities (1, bionty.Ethnicity): 'European'
π·οΈ ulabels (5, core.ULabel): 'Human Brain Cell Atlas v1.0', 'H19.30.001', 'H19.30.002', 'H18.30.002', 'nucleus'
Annotate tissue_type
Before CxG schema 4.0, the tissue_type column was not annotated; instead, "(cell culture)" or "(organoid)" was appended to the ontology_id of the tissue record.
# Create one ULabel per tissue type and attach them as children of the
# "is_tissue_type" parent label.
tissue_types = []
for tissue_type_name in ("tissue", "organoid", "cell culture"):
    tissue_types.append(ln.ULabel(name=tissue_type_name))
ln.save(tissue_types)
is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
is_tissue_type.children.add(*tissue_types)

# Re-fetch the parent and rebind `tissue_types` to a lookup over its children,
# so individual labels can be reached as attributes (e.g. tissue_types.organoid).
is_tissue_type = ln.ULabel.filter(name="is_tissue_type").one()
tissue_types = is_tissue_type.children.lookup()
features = ln.Feature.lookup()
# Select the Tissue records whose ontology_id contains "organoid"
# (e.g. "UBERON:0000088 (organoid)") and display them as a table.
organoids = lb.Tissue.filter(ontology_id__contains="organoid").all()
organoids.df()
uid | name | ontology_id | abbr | synonyms | description | bionty_source_id | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|
id | |||||||||
692 | x3tRcugV | trophoblast (organoid) | UBERON:0000088 (organoid) | None | None | None | None | 2023-12-11 19:11:55.832890+00:00 | 2 |
693 | uS0Cw8zN | retina (organoid) | UBERON:0000966 (organoid) | None | None | None | None | 2023-12-11 19:11:55.832921+00:00 | 2 |
697 | RkE6D8y1 | endometrium (organoid) | UBERON:0001295 (organoid) | None | None | None | None | 2023-12-11 19:11:55.833155+00:00 | 2 |
699 | K4RSNRBc | thymus (organoid) | UBERON:0002370 (organoid) | None | None | None | None | 2023-12-11 19:11:55.833223+00:00 | 2 |
701 | WSs6UA9e | lung (organoid) | UBERON:0002048 (organoid) | None | None | None | None | 2023-12-11 19:11:55.833293+00:00 | 2 |
# Migrate legacy "<tissue> (organoid)" Tissue records: move every artifact that
# is linked to such a record onto the clean Tissue record and label it with
# tissue_type "organoid".
for record in organoids:
    print(record.name)
    # "UBERON:0000088 (organoid)" -> "UBERON:0000088"
    ontology_id = record.ontology_id.split(" ")[0]
    tissue_record = lb.Tissue.from_bionty(ontology_id=ontology_id)
    if tissue_record._state.adding:
        # The clean tissue is not registered yet.
        tissue_record.save()
    # Iterate the artifacts of the *legacy* record: those are the organoid
    # datasets. Iterating tissue_record.artifacts would instead label the
    # artifacts of the plain tissue and miss the organoid ones entirely.
    for f in record.artifacts.all():
        # Re-link to the clean tissue so the annotation survives deletion of
        # the legacy "(organoid)" record.
        f.labels.add(tissue_record, features.tissue)
        f.labels.add(tissue_types.organoid, features.tissue_type)
trophoblast (organoid)
retina (organoid)
endometrium (organoid)
thymus (organoid)
lung (organoid)
# Delete the legacy Tissue records selected by the `organoids` query
# (ontology_id containing "organoid"), together with their link rows.
organoids.delete()
(10, {'lnschema_bionty.Tissue_files': 5, 'lnschema_bionty.Tissue': 5})
# Select the Tissue records whose ontology_id contains "cell culture"
# (e.g. "CL:0000084 (cell culture)") and display them as a table.
cell_cultures = lb.Tissue.filter(ontology_id__contains="cell culture").all()
cell_cultures.df()
uid | name | ontology_id | abbr | synonyms | description | bionty_source_id | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|
id | |||||||||
691 | rIPA0OEl | T cell (cell culture) | CL:0000084 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.832859+00:00 | 2 |
689 | vg9s890t | respiratory basal cell (cell culture) | CL:0002633 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.832782+00:00 | 2 |
690 | lfIFQFR5 | epithelial cell of lung (cell culture) | CL:0000082 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.832827+00:00 | 2 |
694 | kWD0kb5x | brown preadipocyte (cell culture) | CL:0002335 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833031+00:00 | 2 |
695 | UoElNxsj | endothelial cell (cell culture) | CL:0000115 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833064+00:00 | 2 |
696 | 7MzqN14b | bronchial epithelial cell (cell culture) | CL:0002328 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833122+00:00 | 2 |
698 | yPk6E1V8 | epithelial cell of alveolus of lung (cell cult... | CL:0010003 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833189+00:00 | 2 |
700 | 9ICArUMH | embryonic stem cell (cell culture) | CL:0002322 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833256+00:00 | 2 |
702 | w6gzNa8D | mammary gland epithelial cell (cell culture) | CL:0002327 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833349+00:00 | 2 |
703 | Ash8pGf8 | trophoblast cell (cell culture) | CL:0000351 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833382+00:00 | 2 |
704 | CevFMDqD | preadipocyte (cell culture) | CL:0002334 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833417+00:00 | 2 |
705 | 9YB5clqY | cultured cell (cell culture) | CL:0000010 (cell culture) | None | None | None | None | 2023-12-11 19:11:55.833450+00:00 | 2 |
# Migrate legacy "<cell type> (cell culture)" Tissue records: move every
# artifact that is linked to such a record onto the matching CellType record
# and label it with tissue_type "cell culture".
for record in cell_cultures:
    print(record.name)
    # "CL:0000084 (cell culture)" -> "CL:0000084"
    ontology_id = record.ontology_id.split(" ")[0]
    # The ontology IDs of these records are CL terms, hence CellType.
    tissue_record = lb.CellType.from_bionty(ontology_id=ontology_id)
    if tissue_record._state.adding:
        tissue_record.save()
    # Iterate the artifacts of the *legacy* Tissue record: those are the
    # cell-culture datasets. Iterating tissue_record.artifacts would instead
    # label every artifact that merely contains this cell type.
    for f in record.artifacts.all():
        # NOTE(review): linking the CellType under the `tissue` feature mirrors
        # CxG schema 4.0, where `tissue` holds a CL term for cell cultures --
        # confirm this is the intended slot.
        f.labels.add(tissue_record, features.tissue)
        f.labels.add(tissue_types.cell_culture, features.tissue_type)
T cell (cell culture)
respiratory basal cell (cell culture)
epithelial cell of lung (cell culture)
brown preadipocyte (cell culture)
endothelial cell (cell culture)
bronchial epithelial cell (cell culture)
epithelial cell of alveolus of lung (cell culture)
embryonic stem cell (cell culture)
mammary gland epithelial cell (cell culture)
trophoblast cell (cell culture)
preadipocyte (cell culture)
cultured cell (cell culture)
# Delete the legacy Tissue records selected by the `cell_cultures` query
# (ontology_id containing "cell culture").
cell_cultures.delete()
(0, {})
Link metadata to collection
# Fetch the collection named "cellxgene-census" for this Census version.
collection = ln.Collection.filter(name="cellxgene-census", version=census_version).one()
Link the feature sets:
# Attach the shared feature sets to the collection. Each entry maps a
# substring of the FeatureSet name to the slot it is linked under; the entries
# are processed in this order.
slot_by_name_fragment = {
    "obs": "obs",
    "ext": "external",
    "human": "var-human",
    "mouse": "var-mouse",
    "sars-2": "var-sars-cov-2",
    "synthetic construct": "var-ercc",
}
for name_fragment, slot in slot_by_name_fragment.items():
    matching_feature_set = ln.FeatureSet.filter(name__contains=name_fragment).one()
    collection.feature_sets.add(
        matching_feature_set, through_defaults={"slot": slot}
    )
def _parent_and_used_children(parent_name):
    """Return the ULabel named *parent_name* and its children, excluding those with artifacts=None."""
    parent = ln.ULabel.filter(name=parent_name).one()
    used_children = parent.children.all().filter().exclude(artifacts=None).all()
    return parent, used_children


is_donor, donors = _parent_and_used_children("is_donor")
is_collection, collections = _parent_and_used_children("is_collection")
is_suspension_type, stypes = _parent_and_used_children("is_suspension_type")
is_tissue_type, tissue_types = _parent_and_used_children("is_tissue_type")

# ULabel-based features: link the labels in use to the collection, each under
# the feature of the given name.
for ulabels, feature_name in (
    (donors, "donor_id"),
    (collections, "collection"),
    (stypes, "suspension_type"),
    (tissue_types, "tissue_type"),
):
    collection.labels.add(ulabels, getattr(features, feature_name))

# Ontology-based features: for each registry, link all of its records except
# those with artifacts=None.
for registry, feature_name in (
    (lb.ExperimentalFactor, "assay"),
    (lb.CellType, "cell_type"),
    (lb.DevelopmentalStage, "development_stage"),
    (lb.Disease, "disease"),
    (lb.Ethnicity, "self_reported_ethnicity"),
    (lb.Phenotype, "sex"),
    (lb.Tissue, "tissue"),
):
    records_in_use = registry.filter().exclude(artifacts=None).all()
    collection.labels.add(records_in_use, getattr(features, feature_name))
# Print provenance, feature sets and labels now linked to the collection.
collection.describe()
Dataset(uid='vAGS2R54eJGhRV6VWCYb', name='cellxgene-census', version='2023-12-06', hash='ak5599uHQCLwQNFgRusr', visibility=1, updated_at=2023-12-11 18:32:57 UTC)
Provenance:
π transform: Transform(uid='4hbVFAINneXCz8', name='Register files from Census release 2023-12-06', short_name='census-release-2023-12-06', version='0', type='notebook', updated_at=2023-12-11 15:39:44 UTC, created_by_id=2)
π£ run: Run(uid='yq2FEOYiiNwTV6HJRReE', run_at=2023-12-13 12:03:11 UTC, transform_id=1, created_by_id=2)
π€ created_by: User(uid='kmvZDIX9', handle='sunnyosun', name='Sunny Sun', updated_at=2023-12-12 10:05:51 UTC)
Features:
obs: FeatureSet(uid='VOhmBdxtNgUpMiUUMR56', name='obs features', n=10, registry='core.Feature', hash='vRtez9Dl4oTSutrbWK13', updated_at=2023-12-11 18:45:36 UTC, created_by_id=2)
π assay (36, bionty.ExperimentalFactor): 'BD Rhapsody Targeted mRNA', 'sci-RNA-seq3', '10x multiome', 'mCT-seq', 'DroNc-seq', 'MARS-seq', '10x 3' v3', 'Patch-seq', '10x 3' v2', 'Smart-seq2', ...
π cell_type (757, bionty.CellType): 'amacrine cell', 'caudal ganglionic eminence derived GABAergic cortical interneuron', 'mesothelial cell', 'aortic endothelial cell', 'CD8-positive, alpha-beta cytokine secreting effector T cell', 'OFF-bipolar cell', 'T-helper 22 cell', 'myelocyte', 'bipolar neuron', 'glandular epithelial cell', ...
π development_stage (229, bionty.DevelopmentalStage): '20-year-old human stage', '21st week post-fertilization human stage', 'fifth decade human stage', '38-year-old human stage', '69-year-old human stage', '47-year-old human stage', 'fourth LMP month human stage', 'Carnegie stage 19', 'sixth LMP month human stage', '9-month-old human stage', ...
π disease (89, bionty.Disease): 'Wilms tumor', 'disease', 'hypersensitivity pneumonitis', 'respiratory failure', 'non-specific interstitial pneumonia', 'dilated cardiomyopathy', 'age related macular degeneration 7', 'adenocarcinoma', 'heart failure', 'breast carcinoma', ...
π donor_id (7344, core.ULabel): '209_210', '403316', 'Hrv43_fetus', 'S00039', 'mouse_SQUNI', 'HGR0000115', 'homosapiens_None_2023_None_sikkemalisa_002_d10_1101_2022_03_10_483747VUILD57', '726_727', 'mouse_SOYWW', 'Wu_Zhou_2021_P2', ...
π self_reported_ethnicity (32, bionty.Ethnicity): 'Irish', 'Greater Middle Eastern (Middle Eastern or North African or Persian)', 'Hispanic or Latin American', 'Jewish Israeli', 'Iraqi', 'American', 'Malaysian', 'European American', 'African', 'Asian', ...
π sex (3, bionty.Phenotype): 'unknown', 'female', 'male'
π suspension_type (3, core.ULabel): 'nucleus', 'cell', 'na'
π tissue (349, bionty.Tissue): 'rectus abdominis muscle', 'neocortex', 'uterus', 'vein', 'lymph node', 'cerebral cortex', 'lateral septal complex', 'lamina propria', 'renal pelvis', 'anterior cingulate gyrus', ...
π tissue_type (2, core.ULabel): 'organoid', 'cell culture'
external: FeatureSet(uid='R9sY9Bx6I7Rv4aIY9BPM', name='external features', n=2, registry='core.Feature', hash='yOyigQuaDFnr-gu0w0qy', updated_at=2023-12-11 18:45:51 UTC, created_by_id=2)
π organism (0, bionty.Organism):
π collection (172, core.ULabel): 'COVID-19 mRNA vaccine elicits a potent adaptive immune response in the absence of persistent inflammation observed in SARS-CoV-2 infection', 'Cellular development and evolution of the mammalian cerebellum', 'Cells of the adult human heart', 'Single-cell RNA-seq reveals the cell-type-specific molecular and genetic associations to lupus', 'Resolving cellular and molecular diversity along the hippocampal anterior-to-posterior axis in humans', 'Single-cell RNA sequencing unifies developmental programs of Esophageal and Gastric Intestinal Metaplasia', 'Azimuth meta-analysis of human scRNA-seq datasets', 'Distinct microbial and immune niches of the human colon', 'Neuron type-specific effects of human aging and sex on DNA methylation and transcription', 'Integrated analysis of multimodal single-cell data', ...
var-human: FeatureSet(uid='kBjFSvlpphT5JuOaj1P0', name='all human genes', n=60664, type='number', registry='bionty.Gene', hash='DOnOv7runwo4TOR5P_do', updated_at=2023-12-11 20:57:34 UTC, created_by_id=2)
'DDX11L1', 'WASH7P', 'MIR6859-1', 'MIR1302-2HG', 'MIR1302-2', 'FAM138A', 'OR4G4P', 'OR4G11P', 'OR4F5', 'None', 'None', 'CICP27', 'None', 'None', 'None', 'None', 'RNU6-1100P', 'None', 'DDX11L17', 'WASH9P', ...
var-mouse: FeatureSet(uid='WR8zVlQ5Y4BwoisivGKr', name='all mouse genes', n=55416, type='number', registry='bionty.Gene', hash='umPHI2jmFQXA78M69WBD', updated_at=2023-12-11 19:39:46 UTC, created_by_id=2)
'4933401J01Rik', 'Gm26206', 'Xkr4', 'Gm18956', 'Gm37180', 'Gm37363', 'Gm37686', 'Gm1992', 'Gm37329', 'Gm7341', 'Gm38148', 'Gm19938', 'Gm10568', 'Gm38385', 'Gm27396', 'Gm37381', 'Rp1', 'Gm6101', 'Gm37483', 'Sox17', ...
var-sars-cov-2: FeatureSet(uid='ptmoMIBtI0M9UQ8vtbm9', name='all sars-2 genes', n=12, type='number', registry='bionty.Gene', hash='CLCjr_EazVM8KxnA7jhc', updated_at=2023-12-11 19:39:52 UTC, created_by_id=2)
'ORF1ab_ENSSASG00005000002', 'ORF1ab_ENSSASG00005000003', 'S', 'ORF3a', 'E', 'M', 'ORF6', 'ORF7a', 'ORF7b', 'ORF8', 'N', 'ORF10'
var-ercc: FeatureSet(uid='jo8TPYXhhk9FSaDnD8GU', name='all synthetic construct genes', n=92, type='number', registry='bionty.Gene', hash='rMxzn166gRykjOZFnWRy', updated_at=2023-12-11 19:39:52 UTC, created_by_id=2)
'ERCC-00002 (spike-in control)', 'ERCC-00003 (spike-in control)', 'ERCC-00004 (spike-in control)', 'ERCC-00009 (spike-in control)', 'ERCC-00012 (spike-in control)', 'ERCC-00013 (spike-in control)', 'ERCC-00014 (spike-in control)', 'ERCC-00016 (spike-in control)', 'ERCC-00017 (spike-in control)', 'ERCC-00019 (spike-in control)', 'ERCC-00022 (spike-in control)', 'ERCC-00024 (spike-in control)', 'ERCC-00025 (spike-in control)', 'ERCC-00028 (spike-in control)', 'ERCC-00031 (spike-in control)', 'ERCC-00033 (spike-in control)', 'ERCC-00034 (spike-in control)', 'ERCC-00035 (spike-in control)', 'ERCC-00039 (spike-in control)', 'ERCC-00040 (spike-in control)', ...
Labels:
π·οΈ tissues (349, bionty.Tissue): 'rectus abdominis muscle', 'neocortex', 'uterus', 'vein', 'lymph node', 'cerebral cortex', 'lateral septal complex', 'lamina propria', 'renal pelvis', 'anterior cingulate gyrus', ...
π·οΈ cell_types (757, bionty.CellType): 'amacrine cell', 'caudal ganglionic eminence derived GABAergic cortical interneuron', 'mesothelial cell', 'aortic endothelial cell', 'CD8-positive, alpha-beta cytokine secreting effector T cell', 'OFF-bipolar cell', 'T-helper 22 cell', 'myelocyte', 'bipolar neuron', 'glandular epithelial cell', ...
π·οΈ diseases (89, bionty.Disease): 'Wilms tumor', 'disease', 'hypersensitivity pneumonitis', 'respiratory failure', 'non-specific interstitial pneumonia', 'dilated cardiomyopathy', 'age related macular degeneration 7', 'adenocarcinoma', 'heart failure', 'breast carcinoma', ...
π·οΈ phenotypes (3, bionty.Phenotype): 'unknown', 'female', 'male'
π·οΈ experimental_factors (36, bionty.ExperimentalFactor): 'BD Rhapsody Targeted mRNA', 'sci-RNA-seq3', '10x multiome', 'mCT-seq', 'DroNc-seq', 'MARS-seq', '10x 3' v3', 'Patch-seq', '10x 3' v2', 'Smart-seq2', ...
π·οΈ developmental_stages (229, bionty.DevelopmentalStage): '20-year-old human stage', '21st week post-fertilization human stage', 'fifth decade human stage', '38-year-old human stage', '69-year-old human stage', '47-year-old human stage', 'fourth LMP month human stage', 'Carnegie stage 19', 'sixth LMP month human stage', '9-month-old human stage', ...
π·οΈ ethnicities (32, bionty.Ethnicity): 'Irish', 'Greater Middle Eastern (Middle Eastern or North African or Persian)', 'Hispanic or Latin American', 'Jewish Israeli', 'Iraqi', 'American', 'Malaysian', 'European American', 'African', 'Asian', ...
π·οΈ ulabels (7521, core.ULabel): 'COVID-19 mRNA vaccine elicits a potent adaptive immune response in the absence of persistent inflammation observed in SARS-CoV-2 infection', 'Cellular development and evolution of the mammalian cerebellum', 'Cells of the adult human heart', 'Single-cell RNA-seq reveals the cell-type-specific molecular and genetic associations to lupus', 'Resolving cellular and molecular diversity along the hippocampal anterior-to-posterior axis in humans', 'Single-cell RNA sequencing unifies developmental programs of Esophageal and Gastric Intestinal Metaplasia', 'Azimuth meta-analysis of human scRNA-seq datasets', 'Distinct microbial and immune niches of the human colon', 'Neuron type-specific effects of human aging and sex on DNA methylation and transcription', 'Integrated analysis of multimodal single-cell data', ...
π·οΈ artifacts (1139, core.Artifact): 'cell-census/2023-12-06/h5ads/00099d5e-154f-4a7a-aa8d-fa30c8c0c43c.h5ad', 'cell-census/2023-12-06/h5ads/0041b9c3-6a49-4bf7-8514-9bc7190067a7.h5ad', 'cell-census/2023-12-06/h5ads/00476f9f-ebc1-4b72-b541-32f912ce36ea.h5ad', 'cell-census/2023-12-06/h5ads/00e5dedd-b9b7-43be-8c28-b0e5c6414a62.h5ad', 'cell-census/2023-12-06/h5ads/00ff600e-6e2e-4d76-846f-0eec4f0ae417.h5ad', 'cell-census/2023-12-06/h5ads/01209dce-3575-4bed-b1df-129f57fbc031.h5ad', 'cell-census/2023-12-06/h5ads/0129dbd9-a7d3-4f6b-96b9-1da155a93748.h5ad', 'cell-census/2023-12-06/h5ads/015c230d-650c-4527-870d-8a805849a382.h5ad', 'cell-census/2023-12-06/h5ads/019c7af2-c827-4454-9970-44d5e39ce068.h5ad', 'cell-census/2023-12-06/h5ads/01ad3cd7-3929-4654-84c0-6db05bd5fd59.h5ad', ...
Register collections
# Create one ln.Collection per CELLxGENE collection label, built from the
# artifacts linked to that label. Labels without artifacts are skipped.
for position, ulabel in enumerate(is_collection.children.all()):
    if not position % 20:
        # progress indicator
        print(position)

    artifacts = ulabel.artifacts.all()
    if artifacts.count() == 0:
        continue
    if artifacts.count() == 1:
        # A single artifact is passed as the record itself, not as a queryset.
        artifacts = artifacts[0]

    collection_fields = dict(
        name=ulabel.name,
        description=ulabel.description,
        reference=ulabel.reference,
        reference_type="CELLxGENE Collection ID",
    )
    collection = ln.Collection(artifacts, **collection_fields)
    collection.save()