# /// script # requires-python = ">=3.13" # dependencies = [ # "loguru==0.7.3", # "marimo", # "matplotlib==3.10.3", # "numpy==2.3.1", # "pandas==2.3.1", # "scikit-learn==1.7.0", # "scipy==1.16.0", # "seaborn==0.13.2", # "xarray==2025.7.1", # "zarr==3.0.10", # ] # /// import marimo __generated_with = "0.14.10" app = marimo.App(width="columns") @app.cell(hide_code=True) def _(): import marimo as mo mo.md(""" # Unified laboratory data storage for biological entities What if I told you that your entire experimental lifecycle could live in a single, queryable data structure? From the moment you collect measurements at the bench to the final machine learning predictions, everything coordinate-aligned and ready for analysis. I've been thinking about this problem for years. We generate laboratory data, then features, then model outputs, then train/test splits. Each piece typically lives in its own file, its own format, with its own indexing scheme. The cognitive overhead of keeping track of which peptide corresponds to which row in which CSV is exhausting. Here's what I've found works: store everything in a unified xarray Dataset where biological entities are the shared coordinate system. Your raw measurements, computed features, statistical estimates, and data splits all aligned by the same peptide sequences. No more integer indices. No more file juggling. Just clean, coordinated data that scales to the cloud. """) return (mo,) @app.cell(hide_code=True) def _(): import numpy as np import pandas as pd import xarray as xr import matplotlib.pyplot as plt import seaborn as sns from datetime import datetime, timedelta from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import train_test_split import warnings warnings.filterwarnings("ignore") np.random.seed(42) return datetime, np, pd, plt, sns, timedelta, train_test_split, xr @app.cell(hide_code=True) def _(mo): mo.md( """ I'm going to walk you through building this system step by step. We'll start simple and add complexity progressively, just like a real experiment unfolds. First, let's generate some peptides to work with. """ ) return @app.cell def _(): # Generate synthetic biological entities (peptides) n_peptides = 150 peptide_sequences = [ f"PEPTIDE_{seq_i:03d}" for seq_i in range(1, n_peptides + 1) ] return n_peptides, peptide_sequences @app.cell(hide_code=True) def _(mo): mo.md("""Now we need to model our experimental design factors. In any good biological experiment, you're thinking about treatments, replicates, time points, and which animals you're working with.""") return @app.cell def _(): # Experimental design factors treatments = ["control", "treatment_A", "treatment_B"] replicates = [f"rep_{rep_i}" for rep_i in range(1, 4)] mouse_ids = [f"mouse_{mouse_i:02d}" for mouse_i in range(1, 21)] time_points = ["0h", "2h", "6h", "24h"] return mouse_ids, replicates, time_points, treatments @app.cell(hide_code=True) def _(mo): mo.md("""And of course, experiments happen over time. Let's simulate a multi-week study.""") return @app.cell def _(datetime, timedelta): # Generate experimental dates start_date = datetime(2024, 1, 1) experiment_dates = [ start_date + timedelta(days=date_i * 7) for date_i in range(8) ] return (experiment_dates,) @app.cell(hide_code=True) def _(mo): mo.md("""Now comes the fun part - generating some fake experimental data and organizing it into our coordinate system. This is where xarray really shines. Instead of managing separate arrays and keeping track of which index corresponds to which condition, we create one unified structure where every measurement knows exactly which peptide, treatment, and time point it belongs to.""") return @app.cell def _( experiment_dates, mouse_ids, peptide_sequences, replicates, time_points, treatments, ): # Create coordinate system for our experimental data coords = { "peptide": peptide_sequences, "treatment": treatments, "replicate": replicates, "time_point": time_points, "mouse_id": mouse_ids[:12], # Use subset for demonstration "experiment_date": experiment_dates[:4], # Use subset } return (coords,) @app.cell(hide_code=True) def _(mo): mo.md("""Time to generate some synthetic activity measurements. I'm adding treatment effects so there's actually signal in our data.""") return @app.cell def _(coords, np): # Generate synthetic activity data with realistic structure shape = tuple(len(coords[dim]) for dim in coords.keys()) base_activity = np.random.normal(100, 20, shape) # Add treatment effects - this is what we'll try to recover later treatment_effects = {"control": 0, "treatment_A": 25, "treatment_B": -15} treatments_list = coords["treatment"] for treatment_idx, treatment in enumerate(treatments_list): base_activity[:, treatment_idx, :, :, :, :] += treatment_effects[treatment] # Add peptide-specific effects so some peptides are inherently more active peptide_effects = np.random.normal(0, 15, len(coords["peptide"])) for peptide_idx in range(len(coords["peptide"])): base_activity[peptide_idx, :, :, :, :, :] += peptide_effects[peptide_idx] return (base_activity,) @app.cell(hide_code=True) def _(mo): mo.md("""Now we wrap this into our unified xarray Dataset. This becomes the foundation that everything else builds on.""") return @app.cell def _(base_activity, coords, experiment_dates, peptide_sequences, xr): # Create the main laboratory data array lab_data = xr.DataArray( base_activity, coords=coords, dims=list(coords.keys()), name="activity_measurement", attrs={ "units": "relative_fluorescence_units", "description": "Peptide activity measurements across experimental conditions", "measurement_protocol": "fluorescence_assay_v2.1", "created_date": str(experiment_dates[0]), }, ) # Start the unified experiment dataset unified_stage1 = xr.Dataset( data_vars={"activity_measurement": lab_data}, coords=coords, attrs={ "title": "Unified Biological Experiment Dataset", "description": "Progressive experimental data accumulation", "experiment_type": "peptide_activity_screen", "workflow_stage": "1_laboratory_data", "created_date": str(experiment_dates[0]), "n_peptides": len(peptide_sequences), "storage_format": "xarray_zarr", }, ) return lab_data, unified_stage1 @app.cell(hide_code=True) def _(unified_stage1): unified_stage1 return @app.cell(hide_code=True) def _(mo): mo.md("""Now let's add Bayesian parameter estimates. This is where we model the treatment and replicate effects we built into our synthetic data.""") return @app.cell(hide_code=True) def _(mo): mo.md("""Let's prepare our data for modeling using xarray's native flattening capabilities. No more nested for loops - we'll use the built-in `.to_dataframe()` method to convert our multidimensional data into the tabular format that modeling libraries expect.""") return @app.cell def _(lab_data): # Use xarray's native flattening - much cleaner than nested loops # This creates a DataFrame with MultiIndex from all coordinates model_data = lab_data.to_dataframe() # Reset index to make coordinates into columns modeling_df = model_data.reset_index() modeling_df return (modeling_df,) @app.cell(hide_code=True) def _(mo): mo.md("""Perfect! Now we have our data in the right format for modeling. Each row represents one observation with all the experimental factors as columns. Time to fit our Bayesian model to recover the treatment effects we built into the data.""") return @app.cell def _(modeling_df, np, pd, unified_stage1, xr): # Simple Bayesian GLM estimation from sklearn.linear_model import BayesianRidge # Encode categorical variables for GLM treatment_dummies = pd.get_dummies(modeling_df["treatment"], prefix="treatment") replicate_dummies = pd.get_dummies(modeling_df["replicate"], prefix="replicate") # Simple GLM: activity ~ treatment + replicate X = pd.concat([treatment_dummies, replicate_dummies], axis=1) y = modeling_df["activity_measurement"] # Fit Bayesian Ridge model model = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6) model.fit(X, y) # Add Bayesian estimates to unified dataset unified_stage2 = unified_stage1.assign_coords(model_feature=X.columns) unified_stage2 = unified_stage2.assign( { "model_coefficients": (["model_feature"], model.coef_), "model_coefficient_std": ( ["model_feature"], np.sqrt(np.diag(model.sigma_)), ), "model_alpha": model.alpha_, "model_lambda": model.lambda_, } ) # Update metadata unified_stage2.attrs.update( { "workflow_stage": "2_statistical_modeling", "model_type": "BayesianRidge", "model_target": "peptide_activity", "n_model_observations": len(y), } ) # Create separate bayesian_estimates for backward compatibility bayesian_estimates = xr.Dataset( { "coefficients": (["feature"], model.coef_), "coefficient_std": (["feature"], np.sqrt(np.diag(model.sigma_))), "alpha": model.alpha_, "lambda": model.lambda_, }, coords={"feature": X.columns}, ) return (unified_stage2,) @app.cell(hide_code=True) def _(mo): mo.md(r"""Now let's take a look at what the XArray dataset looks like:""") return @app.cell(hide_code=True) def _(unified_stage2): unified_stage2 return @app.cell(hide_code=True) def _(mo): mo.md( """ We see the addition of the Bayesian estimates and parameters, in particular the `model_feature` coordinate which indexes our treatment and replicate coefficients. I'm not doing the best job of showing Bayesian estimation here because that's not the main point. The main point is really to demonstrate how we store and index data in a unified structure. The modeling is just there to show that we can take our coordinate-aligned experimental data, flatten it properly for analysis, fit a model, and then store those results back into the same unified dataset with proper indexing. That's the magic - everything stays connected by our coordinate system. """ ) return @app.cell(hide_code=True) def _(mo): mo.md( """ ## Machine learning features Next up: machine learning features. I want to show you how features can live alongside your experimental data with perfect coordinate alignment. No more wondering which row corresponds to which peptide. """ ) return @app.cell(hide_code=True) def _(mo): mo.md("""Let's generate some synthetic features to demonstrate the concept. I'm not actually calculating real molecular features here - that would require proper cheminformatics libraries and peptide sequence analysis. Instead, I'm just generating what typical ML features might look like: amino acid composition, physicochemical properties, and structural predictions.""") return @app.cell def _(np, peptide_sequences): # Generate synthetic amino acid composition features amino_acids = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"] feature_data = {} # Fake amino acid composition (normalized to sum to 1) aa_composition = np.random.rand(len(peptide_sequences), len(amino_acids)) aa_composition = aa_composition / aa_composition.sum(axis=1, keepdims=True) for aa_idx, aa in enumerate(amino_acids): feature_data[f"aa_{aa}"] = aa_composition[:, aa_idx] return (feature_data,) @app.cell(hide_code=True) def _(mo): mo.md("""Now let's add some fake physicochemical properties. In reality, these would be calculated from the actual peptide sequences using libraries like RDKit or BioPython.""") return @app.cell def _(feature_data, np, peptide_sequences): # Generate fake physicochemical properties feature_data["length"] = np.random.randint(8, 25, len(peptide_sequences)) feature_data["hydrophobicity"] = np.random.normal(0, 1, len(peptide_sequences)) feature_data["charge"] = np.random.normal(0, 2, len(peptide_sequences)) feature_data["molecular_weight"] = feature_data["length"] * 110 + np.random.normal(0, 200, len(peptide_sequences)) return @app.cell(hide_code=True) def _(mo): mo.md("""And finally, some synthetic structural predictions. These would typically come from tools like AlphaFold or secondary structure prediction algorithms.""") return @app.cell def _(feature_data, np, peptide_sequences): # Generate fake structural features secondary_structures = ["helix", "sheet", "coil"] structure_probs = np.random.dirichlet([1, 1, 1], len(peptide_sequences)) for struct_idx, struct in enumerate(secondary_structures): feature_data[f"structure_{struct}"] = structure_probs[:, struct_idx] return @app.cell(hide_code=True) def _(mo): mo.md("""Now comes the key part - adding these features to our unified dataset using the categorical coordinate approach. This keeps everything aligned with our peptide sequences.""") return @app.cell def _(feature_data, np, peptide_sequences, unified_stage2, xr): # Add ML features to unified dataset using categorical coordinate feature_names = list(feature_data.keys()) feature_matrix = np.array([feature_data[name] for name in feature_names]).T unified_stage3 = unified_stage2.assign( {"ml_features": (["peptide", "feature"], feature_matrix)} ).assign_coords(feature=feature_names) unified_stage3.attrs.update({ "workflow_stage": "3_feature_engineering", "n_ml_features": len(feature_data), "feature_types": "amino_acid_composition, physicochemical_properties, structural_features", "feature_encoding": "one_hot_and_continuous", }) # Create separate ml_features for backward compatibility ml_features = xr.Dataset( data_vars={name: (["peptide"], values) for name, values in feature_data.items()}, coords={"peptide": peptide_sequences}, ) return ml_features, unified_stage3 @app.cell(hide_code=True) def _(unified_stage3): unified_stage3 return @app.cell(hide_code=True) def _(mo): mo.md( """ Take a moment to click around and explore this xarray Dataset. Notice how we've got everything joined together: our `activity_measurement` data lives right next to `model_coefficients` which live right next to `ml_features`. Everything is coordinate-aligned by peptide sequence. This is the magic - you can slice across any dimension and everything stays connected. Want features for specific peptides? The model results for those same peptides come along automatically. This unified structure is going to be extremely helpful when we start building complex analysis pipelines. """ ) return @app.cell(hide_code=True) def _(mo): mo.md( """ Here's how simple it becomes to extract data across our unified structure. Notice the clean xarray syntax - we can slice features, select specific amino acids, or grab structural predictions with just coordinate indexing: ```python # Get amino acid features for first 10 peptides aa_features = unified_stage3.ml_features.sel( feature=[f for f in unified_stage3.feature.values if f.startswith("aa_")] ).isel(peptide=slice(0, 10)) # Get structural predictions structure_features = unified_stage3.ml_features.sel( feature=[f for f in unified_stage3.feature.values if f.startswith("structure_")] ) ``` Everything stays coordinate-aligned automatically. No manual bookkeeping required. Below you'll see some plots that demonstrate this convenience. They were made using exactly this selection syntax - clean coordinate-based indexing rather than having to do weird joins across multiple DataFrames. The unified structure makes data extraction for visualization incredibly straightforward. """ ) return @app.cell(hide_code=True) def _(ml_features, plt, sns, unified_stage3): # Visualize ML features using the new coordinate-based structure features_fig, features_axes = plt.subplots(2, 2, figsize=(12, 8)) # Amino acid composition heatmap (using unified dataset) aa_feature_mask = [f.startswith("aa_") for f in unified_stage3.feature.values] aa_features = unified_stage3.feature.values[aa_feature_mask] aa_data = unified_stage3.ml_features.sel(feature=aa_features).values[ :30, : ] # First 30 peptides sns.heatmap( aa_data, xticklabels=[f.split("_")[1] for f in aa_features], yticklabels=unified_stage3.peptide.values[:30], ax=features_axes[0, 0], cmap="viridis", ) features_axes[0, 0].set_title("Amino Acid Composition (First 30 Peptides)") # Peptide properties (using backward compatibility dataset) ml_features.length.plot(ax=features_axes[0, 1]) features_axes[0, 1].set_title("Peptide Length Distribution") # Hydrophobicity vs charge features_axes[1, 0].scatter( ml_features.hydrophobicity, ml_features.charge, alpha=0.6 ) features_axes[1, 0].set_xlabel("Hydrophobicity") features_axes[1, 0].set_ylabel("Charge") features_axes[1, 0].set_title("Physicochemical Properties") # Secondary structure (using unified dataset) struct_feature_mask = [ f.startswith("structure_") for f in unified_stage3.feature.values ] struct_features = unified_stage3.feature.values[struct_feature_mask] struct_data = unified_stage3.ml_features.sel(feature=struct_features).values struct_means = struct_data.mean(axis=0) features_axes[1, 1].bar(range(len(struct_features)), struct_means) features_axes[1, 1].set_xticks(range(len(struct_features))) features_axes[1, 1].set_xticklabels([f.split("_")[1] for f in struct_features]) features_axes[1, 1].set_title("Average Secondary Structure Content") plt.tight_layout() plt.show() return @app.cell(hide_code=True) def _(mo): mo.md("""Finally, let's add train/test splits. The beauty here is that splits are stored as boolean masks aligned with our peptide coordinate. No more integer indices that break when you reorder your data.""") return @app.cell def _(): # Define our train/test split strategies # Note: temporal_split is just a nod to cheminformaticians - we're not actually using temporal data here split_configs = { "random_80_20": {"test_size": 0.2, "random_state": 42}, "random_70_30": {"test_size": 0.3, "random_state": 123}, "temporal_split": {"test_size": 0.25, "random_state": 456}, } return (split_configs,) @app.cell(hide_code=True) def _(mo): mo.md("""Now let's generate the actual splits and create the boolean masks that will become part of our unified dataset.""") return @app.cell def _(n_peptides, np, peptide_sequences, split_configs, train_test_split): # Generate the splits and create boolean masks splits_data = {} split_types = list(split_configs.keys()) # Create matrices to hold all train/test masks n_splits = len(split_types) train_masks = np.zeros((n_peptides, n_splits), dtype=bool) test_masks = np.zeros((n_peptides, n_splits), dtype=bool) for split_idx, (config_split_name, config) in enumerate(split_configs.items()): train_peptides, test_peptides = train_test_split( peptide_sequences, test_size=config["test_size"], random_state=config["random_state"], ) # Create boolean masks for easy indexing train_mask = np.isin(peptide_sequences, train_peptides) test_mask = np.isin(peptide_sequences, test_peptides) # Store in matrices train_masks[:, split_idx] = train_mask test_masks[:, split_idx] = test_mask splits_data[config_split_name] = { "train_peptides": train_peptides, "test_peptides": test_peptides, "train_mask": train_mask, "test_mask": test_mask, } return split_types, splits_data, test_masks, train_masks @app.cell(hide_code=True) def _(mo): mo.md("""Finally, let's add these splits to our unified dataset to complete the experimental workflow.""") return @app.cell def _(split_configs, split_types, test_masks, train_masks, unified_stage3): # Add train/test splits using the new split_type dimension unified_final = unified_stage3.assign({ 'train_mask': (['peptide', 'split_type'], train_masks), 'test_mask': (['peptide', 'split_type'], test_masks) }).assign_coords(split_type=split_types) unified_final.attrs.update({ "workflow_stage": "4_data_splits_complete", "n_split_strategies": len(split_configs), "split_configs": str(split_configs), "experiment_complete": True, }) return (unified_final,) @app.cell(hide_code=True) def _(unified_final): unified_final return @app.cell(hide_code=True) def _(mo): mo.md( """ Take a moment to explore this final unified dataset. Click around and notice how `train_mask` and `test_mask` are now accessible by both `peptide` and `split_type` coordinates. This is the beauty of xarray's coordinate system - instead of having six separate variables cluttering our namespace, we have a clean dimensional structure. The coordinate system for `split_type` makes it so much easier to handle different splitting strategies. Want to compare training sets across all split types? Easy. Need just the temporal split masks? Simple coordinate selection. """ ) return @app.cell(hide_code=True) def _(mo): mo.md( """ Here's how elegant the selection becomes with proper coordinates: ```python # Get training mask for random 80/20 split train_mask = unified_final.train_mask.sel(split_type='random_80_20') # Get test mask for temporal split test_mask = unified_final.test_mask.sel(split_type='temporal_split') # Compare training set sizes across all split types train_counts = unified_final.train_mask.sum(dim='peptide') ``` No more remembering variable names like `split_random_80_20_train_mask`. Just clean, intuitive coordinate-based selection. The plots below demonstrate this elegant syntax in action. """ ) return @app.cell(hide_code=True) def _(lab_data, plt, splits_data, unified_final): # Demonstrate using splits with actual data splits_fig, splits_axes = plt.subplots(1, 3, figsize=(15, 4)) for viz_split_idx, (viz_split_name, viz_split_data) in enumerate( splits_data.items() ): # Get training data for this split using our new split_type dimension viz_train_mask = unified_final.train_mask.sel(split_type=viz_split_name).values viz_test_mask = unified_final.test_mask.sel(split_type=viz_split_name).values # Calculate mean activity for train/test sets viz_train_activity = lab_data.isel(peptide=viz_train_mask).mean( dim=[ "treatment", "replicate", "time_point", "mouse_id", "experiment_date", ] ) viz_test_activity = lab_data.isel(peptide=viz_test_mask).mean( dim=[ "treatment", "replicate", "time_point", "mouse_id", "experiment_date", ] ) # Plot distributions splits_axes[viz_split_idx].hist( viz_train_activity.values, alpha=0.7, label=f"Train (n={viz_train_mask.sum()})", bins=20, ) splits_axes[viz_split_idx].hist( viz_test_activity.values, alpha=0.7, label=f"Test (n={viz_test_mask.sum()})", bins=20, ) splits_axes[viz_split_idx].set_title( f"{viz_split_name.replace('_', ' ').title()}" ) splits_axes[viz_split_idx].set_xlabel("Mean Activity") splits_axes[viz_split_idx].set_ylabel("Count") splits_axes[viz_split_idx].legend() plt.tight_layout() plt.show() # Example of accessing specific peptides in a split example_split = "random_80_20" example_train_peptides = splits_data[example_split]["train_peptides"][:5] example_test_peptides = splits_data[example_split]["test_peptides"][:5] return @app.cell(hide_code=True) def _(mo): mo.md( """ ## Storing as zarr The final step is saving our unified dataset. Zarr format is perfect for this - it's cloud-native, supports chunking, and preserves all our coordinate information and metadata. """ ) return @app.cell def _(unified_final): # Save the unified dataset to zarr format import tempfile from pathlib import Path from loguru import logger # Save the progressively-built unified dataset temp_dir = Path(tempfile.mkdtemp()) zarr_path = temp_dir / "complete_experiment_lifecycle.zarr" unified_final.to_zarr(zarr_path, mode="w") logger.info("Complete experimental lifecycle saved to zarr") logger.info(f"Zarr file: {zarr_path}") logger.info(f"Final size: {unified_final.nbytes / 1e6:.2f} MB") logger.info(f"Workflow stage: {unified_final.attrs['workflow_stage']}") return @app.cell(hide_code=True) def _(mo): mo.md( """ ## What have we built? I cooked up this example while at SciPy 2025, while attending Ian Hunt-Isaak's talk on XArray for Biology, as an example of how we can unify data storage for biological experimental data with machine learning data. When everything lives in a single xarray Dataset, you stop losing track of which peptide corresponds to which row in which CSV file. Your lab measurements, computed features, model results, and data splits all stay connected through the same coordinate system. Here's what I love about this setup. You can slice across the entire experimental timeline with one line of code. Need ML features for high-activity peptides in your training set? It's just coordinate selection. Want to see which treatment effects your model captured? Same coordinate system makes it trivial. ```python # Save entire experimental workflow to cloud unified_final.to_zarr('s3://biodata/peptide_screen_2024.zarr') # Load and reproduce analysis anywhere experiment = xr.open_zarr('s3://biodata/peptide_screen_2024.zarr') # Query across the entire experimental lifecycle high_activity_peptides = experiment.where( experiment.activity_measurement.mean(['treatment', 'replicate', 'time_point']) > 120 ).peptide # Get ML features for training set of high-activity peptides train_mask = experiment.train_mask.sel(split_type='random_80_20') ml_data = experiment.ml_features.sel(peptide=high_activity_peptides & train_mask) ``` Time will distill the best practices in your context, but I've found this unified approach eliminates so much cognitive overhead. No more file juggling. No more wondering if your indices are still aligned. Just clean, coordinated data that scales to the cloud. """ ) return if __name__ == "__main__": app.run()