Comparing Modlyn & Scanpy feature selection methods¶
!pip install 'modlyn[dev]'
!lamin init --storage test-modlyn
import lamindb as ln
import modlyn as mn
import scanpy as sc
import pandas as pd
import seaborn as sns
sns.set_theme()
%config InlineBackend.figure_formats = ['svg']
ln.track()
# Configuration: switch between in-memory and Dask loader
USE_DASK = True # set False to use in-memory path
ZARR_UID = "1xSHIdfBjfUdxKHm0000" # example UID; change as needed
LABEL_COL = "cell_line"
# Dask runtime
DASK_DATASET_TYPE = "arrayloaders-dask" # dataset type for the Dask loader (normalized internally)
BATCH_SIZE = 256
N_CHUNKS = 8
DASK_SCHEDULER = "threads"
Using a custom Dask data loader¶
Set USE_DASK = True and provide the ZARR_UID of a zarr artifact from laminlabs/arrayloader-benchmarks.
The loader auto-detects whether the cached path is a single zarr store or a directory of shard stores (*.zarr) and selects the right reader. For quick runs, we cap steps with max_steps in the training call.
from pathlib import Path

if USE_DASK:
    artifact = ln.Artifact.using("laminlabs/arrayloader-benchmarks").get(ZARR_UID)
    store_path = Path(artifact.cache())
    if not store_path.is_dir():
        raise ValueError(f"ZARR_UID must cache to a directory, got: {store_path}")
    # Decide between a directory of shards (*.zarr) and a single zarr store
    has_shards = any(child.name.endswith(".zarr") for child in store_path.iterdir())
    try:
        from arrayloaders.io.dask_loader import read_lazy_store
    except ImportError:
        read_lazy_store = None
    from arrayloaders.io import read_lazy as read_single_store

    if has_shards and read_lazy_store is not None:
        adata = read_lazy_store(store_path, obs_columns=[LABEL_COL])
    else:
        # Single zarr store
        adata = read_single_store(store_path, obs_columns=[LABEL_COL])
else:
    # Example H5AD path (keep your current artifact if you prefer)
    artifact = ln.Artifact.using("laminlabs/arrayloader-benchmarks").get(
        "JNaxQe8zbljesdbK0000"
    )
    adata = artifact.load()
    sc.pp.log1p(adata)  # normalize the in-memory example data

print("adata:", adata.shape)
Prepare dataset¶
keep = adata.obs[LABEL_COL].value_counts().loc[lambda x: x > 3].index
adata = adata[adata.obs[LABEL_COL].isin(keep)].copy()
adata
adata.obs[LABEL_COL].value_counts().tail()
Train LogReg with Modlyn¶
logreg = mn.models.SimpleLogReg(
    adata=adata,
    label_column=LABEL_COL,
    learning_rate=1e-1,
    weight_decay=1e-3,
)
fit_kwargs = {
    "adata_train": adata,
    "adata_val": adata,  # reuse the same dataset so validation has batches
    "train_dataloader_kwargs": {
        "batch_size": BATCH_SIZE,
        "drop_last": False,
        "num_workers": 0,
    },
    "max_epochs": 1,
    "num_sanity_val_steps": 0,
    "log_every_n_steps": 1,
    "max_steps": 50,  # cap steps for a quick run
}
if USE_DASK:
    # Only pass the Dask-specific arguments when the Dask loader is in use
    fit_kwargs.update(
        {
            "dataset_type": DASK_DATASET_TYPE,
            "n_chunks": N_CHUNKS,
            "dask_scheduler": DASK_SCHEDULER,
        }
    )
logreg.fit(**fit_kwargs)
print("dataset_type:", getattr(logreg.datamodule, "dataset_type", "in-memory"))
print("train_dataset:", type(logreg.datamodule.train_dataloader().dataset).__name__)
logreg.plot_losses()
# Evaluate on a small subset; materialize lazy/backed data first
adata_eval = adata[:10000]
adata_eval = adata_eval.to_memory() if hasattr(adata_eval, "to_memory") else adata_eval
if hasattr(adata_eval.X, "compute"):
    adata_eval.X = adata_eval.X.compute()  # pull the dask array into memory
logreg.plot_classification_report(adata_eval)
Get feature scores from different methods¶
df_modlyn_logreg = logreg.get_weights()
df_modlyn_logreg.head()
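To see which genes drive each class, rank each row of the weight matrix; a minimal sketch assuming get_weights() returns a classes × genes DataFrame, matching the scanpy pivots below:
# Top-5 genes per cell line by logistic-regression weight (illustrative only)
top_genes = df_modlyn_logreg.apply(lambda row: list(row.nlargest(5).index), axis=1)
top_genes.head()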
sc.tl.rank_genes_groups(adata, LABEL_COL, method="logreg", key_added="sc_logreg")
df_scanpy_logreg = sc.get.rank_genes_groups_df(
    adata, group=None, key="sc_logreg"
).pivot(index="group", columns="names", values="scores")
df_scanpy_logreg.attrs["method_name"] = "scanpy_logreg"
df_scanpy_logreg.head()
sc.tl.rank_genes_groups(adata, LABEL_COL, method="wilcoxon", key_added="sc_wilcoxon")
df_scanpy_wilcoxon = sc.get.rank_genes_groups_df(
    adata, group=None, key="sc_wilcoxon"
).pivot(index="group", columns="names", values="scores")
df_scanpy_wilcoxon.attrs["method_name"] = "scanpy_wilcoxon"
df_scanpy_wilcoxon.head()
Compare feature selection results¶
compare = mn.eval.CompareScoresJaccard(
    [df_modlyn_logreg, df_scanpy_logreg, df_scanpy_wilcoxon], n_top_values=[5, 10, 25]
)
compare.plot_heatmaps()
compare.compute_jaccard_comparison()
compare.plot_jaccard_comparison()
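For intuition, the Jaccard similarity between two top-k gene sets A and B is |A ∩ B| / |A ∪ B|. Below is a minimal manual version of what the comparison computes, assuming classes × genes score DataFrames as above (hypothetical helper, not modlyn API):
def jaccard_top_k(scores_a: pd.Series, scores_b: pd.Series, k: int = 25) -> float:
    """Jaccard overlap between the top-k features of two score vectors."""
    top_a = set(scores_a.nlargest(k).index)
    top_b = set(scores_b.nlargest(k).index)
    return len(top_a & top_b) / len(top_a | top_b)

# e.g. Modlyn vs. Scanpy logreg scores for one shared cell line
group = df_modlyn_logreg.index[0]
jaccard_top_k(df_modlyn_logreg.loc[group], df_scanpy_logreg.loc[group])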
ln.finish()