"""Denormalize ``nmdc.biosample_set`` into the ``denormalized`` collection.

Running this script assembles one aggregation pipeline and executes it through
a ``MongoClient`` created with default settings.  Every biosample is joined
with its study, its omics processing records, the analysis activities informed
by those records, the data objects the activities output, and the gene
function annotations.  The final ``$out`` stage writes the documents to the
``denormalized`` collection.
"""

from pymongo import MongoClient

client = MongoClient()

# Key: activity name (its collection is "<name>_set" and the joined array is
# stored under "<name>").  Value: the label written to "activity_type" on each
# data object pulled in through that activity.
activity_types = {
    "mags_activity": "nmdc:MAGsAnalysisActivity",
    "metabolomics_analysis_activity": "nmdc:MetabolomicsAnalysisActivity",
    "metagenome_annotation_activity": "nmdc:MetagenomeAnnotation",
    "metagenome_assembly": "nmdc:MetagenomeAssembly",
    "metaproteomics_analysis_activity": "nmdc:MetaProteomicAnalysis",
    "metatranscriptome_activity": "nmdc:metaT",
    "nom_analysis_activity": "nmdc:NomAnalysisActivity",
}


def _activity_lookups(name, type_label):
    """Return the two ``$lookup`` stages for a single activity type.

    The first stage joins ``<name>_set`` on the sample's omics processing ids
    and stores the matches under ``name``.  The second joins
    ``data_object_set`` on those activities' ``has_output`` ids, stores the
    matches under ``<name>_data_object`` and tags each one with ``type_label``.
    """
    join_activities = {
        "$lookup": {
            "from": f"{name}_set",
            "localField": "omics_processing.id",
            "foreignField": "was_informed_by",
            "as": name,
        },
    }
    join_data_objects = {
        "$lookup": {
            "from": "data_object_set",
            "localField": f"{name}.has_output",
            "foreignField": "id",
            "as": f"{name}_data_object",
            "pipeline": [
                {"$set": {"activity_type": type_label}},
            ],
        },
    }
    return [join_activities, join_data_objects]


def _annotation_lookup(source, local_field, foreign_key, target):
    """Return a ``$lookup`` stage that joins one annotation collection.

    Documents from ``source`` whose ``foreign_key`` matches ``local_field``
    are stored under ``target``.  The sub-pipeline copies ``gene_function_id``
    to ``id`` and ``foreign_key`` to ``activity_id``, then removes ``_id`` and
    the two source fields, so both annotation sources end up with the same
    field names.
    """
    return {
        "$lookup": {
            "from": source,
            "localField": local_field,
            "foreignField": foreign_key,
            "as": target,
            "pipeline": [
                {
                    "$set": {
                        "id": "$gene_function_id",
                        "activity_id": f"${foreign_key}",
                    },
                },
                {"$unset": ["_id", foreign_key, "gene_function_id"]},
            ],
        },
    }


# Field names derived from the activity table; a leading "$" turns a name
# into an aggregation field reference.
activity_names = list(activity_types)
data_object_names = [f"{name}_data_object" for name in activity_names]
annotation_names = ["metagenome_annotation", "metaproteomics_annotation"]

# Convert the raw collection date string into a date value.
parse_collection_date = {
    "$set": {
        "collection_date.has_date_value": {
            "$dateFromString": {
                "dateString": "$collection_date.has_raw_value",
            },
        },
    },
}

join_study = {
    "$lookup": {
        "from": "study_set",
        "localField": "part_of.0",
        "foreignField": "id",
        "as": "study",
    },
}

join_omics_processing = {
    "$lookup": {
        "from": "omics_processing_set",
        "localField": "id",
        "foreignField": "has_input.0",
        "as": "omics_processing",
    },
}

# Sorted list of the sample's omics types.  The set difference both
# de-duplicates the types and drops "Lipidomics".
collect_multiomics = {
    "$set": {
        "multiomics": {
            "$sortArray": {
                "input": {
                    "$setDifference": [
                        "$omics_processing.omics_type.has_raw_value",
                        ["Lipidomics"],
                    ],
                },
                "sortBy": 1,
            },
        },
    },
}

aggregation = [
    parse_collection_date,
    join_study,
    join_omics_processing,
    collect_multiomics,
]

# Pull in the activities and data objects tied to each omics_processing.
for activity_type, type_label in activity_types.items():
    aggregation += _activity_lookups(activity_type, type_label)

aggregation += [
    # Metagenome annotations
    _annotation_lookup(
        "functional_annotation_agg",
        "metagenome_annotation_activity.id",
        "metagenome_annotation_id",
        "metagenome_annotation",
    ),
    # Metaproteomics annotations
    _annotation_lookup(
        "metap_gene_function_aggregation",
        "metaproteomics_analysis_activity.id",
        "metaproteomic_analysis_id",
        "metaproteomics_annotation",
    ),
    # Merge both annotation arrays into "gene_function", then drop the parts.
    {
        "$set": {
            "gene_function": {
                "$concatArrays": [f"${name}" for name in annotation_names],
            },
        },
    },
    {"$unset": annotation_names},
    # Merge every per-type activity array into a single "activity" array.
    {
        "$set": {
            "activity": {
                "$concatArrays": [f"${name}" for name in activity_names],
            },
        },
    },
    # Strip the monstrous has_peptide_quantifications array from each activity.
    {
        "$set": {
            "activity": {
                "$map": {
                    "input": "$activity",
                    "as": "d",
                    "in": {
                        "$setField": {
                            "field": "has_peptide_quantifications",
                            "value": "$$REMOVE",
                            "input": "$$d",
                        },
                    },
                },
            },
        },
    },
    # The per-type arrays are redundant now that "activity" holds them all.
    {"$unset": activity_names},
    # Count of omics processing records, so samples can be sorted by how many
    # analyses they have.
    {"$set": {"omics_processing_count": {"$size": "$omics_processing"}}},
    # Same merge-then-drop treatment for the per-type data object arrays.
    {
        "$set": {
            "data_object": {
                "$concatArrays": [f"${name}" for name in data_object_names],
            },
        },
    },
    {"$unset": data_object_names},
    # Write the result set to its own collection.
    {"$out": "denormalized"},
]

q = client.nmdc.biosample_set.aggregate(aggregation)