Last active
          August 13, 2023 03:58 
        
      - 
      
- 
        Save andycasey/1106d51aaec54633eedb451437fce2b7 to your computer and use it in GitHub Desktop. 
Revisions
- 
        andycasey revised this gist Apr 12, 2023 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -14,7 +14,7 @@ ------------------ A single bitfield is used to store all the primary keys of the cartons that a source is assigned. The minimum length of the bitfield flag used here is as small as 34 bytes (way less than some of the file paths that are stored in the database). What's bad about it? 
- 
        andycasey created this gist Apr 12, 2023 .There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,457 @@ """ This minimum reproducible example demonstrates a way to use bitfield flags to store the cartons that a source is assigned to. What does it do? ---------------- 1. Creates a `Source` table and inserts 10,000 random sources. 2. Creates some `SimplifiedCarton` entries based on unique names currently in targetdb. 3. Assigns sources to random cartons (many more than what would exist in reality). 4. Checks that when querying for sources based on a carton, it retrieves only the sources expected. How does it do it? ------------------ A single bitfield is used to store all the primary keys of the cartons that a source is assigned. The minimum length of the bitfield flag used here is as small as 34 bits (way less than some of the file paths that are stored in the database). What's bad about it? -------------------- The length of the bytearray varies per source, and is limited by the maximum primary key that a carton is assigned. In this example I have created a `SimplifiedCarton` just to make things more efficient, but this could in principle be used to map directly to the primary keys in `targetdb.carton.pk`. There are many gaps in primary keys in `targetdb.carton`, which makes the bitfield storage less efficient. Storing the carton assignments in a bitfield flag makes it a little less easy to query (than compared to a normal `peewee.BitField`), but I have included some hybrid methods to make it easy to query by carton primary key. Caveats ------- This example works in PostgreSQL. You should be able to store the bitfield flags in SQLite, but I think you won't be able to query with them in SQLite, because SQLite does not have as many bitwise string operators available (I *think*; it would be great if we can do it in SQLite too). """ import numpy as np from peewee import ( fn, AutoField, FloatField, TextField, Model, BigBitField, PostgresqlDatabase, ) from playhouse.hybrid import hybrid_method database = PostgresqlDatabase(...) class BaseModel(Model): class Meta: database = database schema = None legacy_table_names = False class SimplifiedCarton(BaseModel): """ A simplified carton for use with targeting flags. Simplified cartons have sanitised names, and do not track versions of cartons. A simplified carton is considered unique by its 'sanitised' name. The 'sanitised' name is where the dashes '-' are replaced with underscores '_'. For example, the cartons with names 'bhm_aqmes_bonus_bright' and 'bhm_aqmes_bonus-bright' in `targetdb.carton` are considered the same simplified carton with the name 'bhm_aqmes_bonus_bright'. If a source was assigned to a carton in `targetdb` in some late version, then that will not be tracked by this `SimplifiedCarton`. This `SimplifiedCarton` should only track whether or not a source was assigned to a carton, not when. """ id = AutoField() carton = TextField(unique=True, index=True) class Source(BaseModel): """ An astronomical source. """ # Identifiers id = AutoField() # Astrometry ra = FloatField() dec = FloatField() # Targeting carton_0 = TextField(null=True) carton_flags = BigBitField(null=True) @hybrid_method def is_in_simplified_carton(self, carton_id): """ Return whether this source is assigned to the given (simplified) carton. """ return ( (fn.length(self.carton_flags) > int(carton_id / 8)) & (fn.get_bit(self.carton_flags, carton_id) > 0) ) @hybrid_method def is_in_any_simplified_carton(self, *carton_ids): return fn.OR(*[self.is_in_simplified_carton(carton_id) for carton_id in carton_ids]) @property def simplified_carton_ids(self): """ Return the primary keys of the cartons that this source is assigned. """ i, cartons, cur_size = (0, [], len(self.carton_flags._buffer)) while True: byte_num, byte_offset = divmod(i, 8) if byte_num >= cur_size: break if bool(self.carton_flags._buffer[byte_num] & (1 << byte_offset)): cartons.append(i) i += 1 return cartons if __name__ == "__main__": import numpy as np from tqdm import tqdm database.drop_tables([ Source, SimplifiedCarton ]) database.create_tables([ Source, SimplifiedCarton, ]) carton_names = ( "bhm_aqmes_bonus_bright", "bhm_aqmes_bonus-bright", "bhm_aqmes_bonus_core", "bhm_aqmes_bonus-dark", "bhm_aqmes_bonus_faint", "bhm_aqmes_med", "bhm_aqmes_med_faint", "bhm_aqmes_med-faint", "bhm_aqmes_wide2", "bhm_aqmes_wide2_faint", "bhm_aqmes_wide2-faint", "bhm_aqmes_wide3", "bhm_aqmes_wide3-faint", "bhm_colr_galaxies_lsdr10", "bhm_colr_galaxies_lsdr8", "bhm_csc_apogee", "bhm_csc_boss", "bhm_csc_boss_bright", "bhm_csc_boss-bright", "bhm_csc_boss_dark", "bhm_csc_boss-dark", "bhm_gua_bright", "bhm_gua_dark", "bhm_rm_ancillary", "bhm_rm_core", "bhm_rm_known_spec", "bhm_rm_known-spec", "bhm_rm_var", "bhm_rm_xrayqso", "bhm_spiders_agn-efeds", "bhm_spiders_agn_efeds_stragglers", "bhm_spiders_agn_gaiadr2", "bhm_spiders_agn_gaiadr3", "bhm_spiders_agn_hard", "bhm_spiders_agn_lsdr10", "bhm_spiders_agn_lsdr8", "bhm_spiders_agn_ps1dr2", "bhm_spiders_agn_sep", "bhm_spiders_agn_skymapperdr2", "bhm_spiders_agn_supercosmos", "bhm_spiders_agn_tda", "bhm_spiders_clusters-efeds-erosita", "bhm_spiders_clusters-efeds-hsc-redmapper", "bhm_spiders_clusters-efeds-ls-redmapper", "bhm_spiders_clusters-efeds-sdss-redmapper", "bhm_spiders_clusters_efeds_stragglers", "bhm_spiders_clusters_lsdr10", "bhm_spiders_clusters_lsdr8", "bhm_spiders_clusters_ps1dr2", "comm_pleiades", "comm_spectrophoto", "manual_bhm_spiders_comm", "manual_bhm_spiders_comm_lco", "manual_bright_target", "manual_bright_target_offsets_1", "manual_bright_target_offsets_1_g13", "manual_bright_target_offsets_2", "manual_bright_target_offsets_2_g13", "manual_bright_target_offsets_3", "manual_bright_targets", "manual_bright_targets_g13", "manual_bright_targets_g13_offset_fixed_1", "manual_bright_targets_g13_offset_fixed_2", "manual_bright_targets_g13_offset_fixed_3", "manual_bright_targets_g13_offset_fixed_4", "manual_bright_targets_g13_offset_fixed_5", "manual_bright_targets_g13_offset_fixed_6", "manual_bright_targets_g13_offset_fixed_7", "manual_fps_position_stars", "manual_fps_position_stars_10", "manual_fps_position_stars_apogee_10", "manual_fps_position_stars_lco_apogee_10", "manual_mwm_crosscalib_apogee", "manual_mwm_crosscalib_yso_apogee", "manual_mwm_crosscalib_yso_boss", "manual_mwm_halo_distant_bhb", "manual_mwm_halo_distant_kgiant", "manual_mwm_halo_mp_bbb", "manual_mwm_magcloud_massive_apogee", "manual_mwm_magcloud_massive_boss", "manual_mwm_magcloud_symbiotic_apogee", "manual_mwm_planet_ca_legacy_v1", "manual_mwm_planet_gaia_astrometry_v1", "manual_mwm_planet_gpi_v1", "manual_mwm_planet_harps_v1", "manual_mwm_planet_known_v1", "manual_mwm_planet_sophie_v1", "manual_mwm_planet_sphere_v1", "manual_mwm_planet_tess_eb_v1", "manual_mwm_planet_tess_pc_v1", "manual_mwm_planet_transiting_bd_v1", "manual_mwm_tess_ob", "manual_mwm_validation_cool_apogee", "manual_mwm_validation_cool_boss", "manual_mwm_validation_hot_apogee", "manual_mwm_validation_hot_boss", "manual_mwm_validation_rv", "manual_nsbh_apogee", "manual_nsbh_boss", "manual_offset_mwmhalo_off00", "manual_offset_mwmhalo_off05", "manual_offset_mwmhalo_off10", "manual_offset_mwmhalo_off20", "manual_offset_mwmhalo_off30", "manual_offset_mwmhalo_offa", "manual_offset_mwmhalo_offb", "manual_planet_ca_legacy_v0", "manual_planet_gaia_astrometry_v0", "manual_planet_gpi_v0", "manual_planet_harps_v0", "manual_planet_known_v0", "manual_planet_sophie_v0", "manual_planet_sphere_v0", "manual_planet_tess_eb_v0", "manual_planet_tess_pc_v0", "manual_planet_transiting_bd_v0", "manual_validation_apogee", "manual_validation_boss", "manual_validation_cool_apogee", "manual_validation_cool_boss", "manual_validation_rv", "mwm_bin_rv_long", "mwm_bin_rv_short", "mwm_cb_300pc", "mwm_cb_300pc_apogee", "mwm_cb_300pc_boss", "mwm_cb_cvcandidates", "mwm_cb_cvcandidates_apogee", "mwm_cb_cvcandidates_boss", "mwm_cb_gaiagalex", "mwm_cb_gaiagalex_apogee", "mwm_cb_gaiagalex_boss", "mwm_cb_uvex1", "mwm_cb_uvex2", "mwm_cb_uvex3", "mwm_cb_uvex4", "mwm_cb_uvex5", "mwm_dust_core", "mwm_erosita_compact", "mwm_erosita_compact_deep", "mwm_erosita_compact_gen", "mwm_erosita_compact_var", "mwm_erosita_stars", "mwm_galactic_core", "mwm_galactic_core_dist", "mwm_gg_core", "mwm_halo_bb", "mwm_halo_bb_apogee", "mwm_halo_bb_boss", "mwm_halo_sm", "mwm_halo_sm_apogee", "mwm_halo_sm_boss", "mwm_legacy_ir2opt", "mwm_ob_cepheids", "mwm_ob_core", "mwm_planet_tess", "mwm_rv_long_bplates", "mwm_rv_long-bplates", "mwm_rv_long_fps", "mwm_rv_long-fps", "mwm_rv_long_rm", "mwm_rv_long-rm", "mwm_rv_short_bplates", "mwm_rv_short-bplates", "mwm_rv_short_fps", "mwm_rv_short-fps", "mwm_rv_short_rm", "mwm_rv_short-rm", "mwm_snc_100pc", "mwm_snc_100pc_apogee", "mwm_snc_100pc_boss", "mwm_snc_250pc", "mwm_snc_250pc_apogee", "mwm_snc_250pc_boss", "mwm_tess_2min", "mwm_tess_ob", "mwm_tess_planet", "mwm_tess_rgb", "mwm_tessrgb_core", "mwm_wd_core", "mwm_wd_pwd", "mwm_yso_cluster", "mwm_yso_cluster_apogee", "mwm_yso_cluster_boss", "mwm_yso_cmz", "mwm_yso_cmz_apogee", "mwm_yso_disk_apogee", "mwm_yso_disk_boss", "mwm_yso_embedded_apogee", "mwm_yso_nebula_apogee", "mwm_yso_ob", "mwm_yso_ob_apogee", "mwm_yso_ob_boss", "mwm_yso_pms_apogee", "mwm_yso_pms_apogee_sagitta_edr3", "mwm_yso_pms_apogee_zari18pms", "mwm_yso_pms_boss", "mwm_yso_pms_boss_sagitta_edr3", "mwm_yso_pms_boss_zari18pms", "mwm_yso_s1", "mwm_yso_s2", "mwm_yso_s2-5", "mwm_yso_s3", "mwm_yso_variable_apogee", "mwm_yso_variable_boss", "openfiberstargets_test", "openfibertargets_nov2020_10", "openfibertargets_nov2020_1000", "openfibertargets_nov2020_1001a", "openfibertargets_nov2020_1001b", "openfibertargets_nov2020_11", "openfibertargets_nov2020_12", "openfibertargets_nov2020_14", "openfibertargets_nov2020_15", "openfibertargets_nov2020_17", "openfibertargets_nov2020_18", "openfibertargets_nov2020_19a", "openfibertargets_nov2020_19b", "openfibertargets_nov2020_19c", "openfibertargets_nov2020_22", "openfibertargets_nov2020_24", "openfibertargets_nov2020_25", "openfibertargets_nov2020_26", "openfibertargets_nov2020_27", "openfibertargets_nov2020_28a", "openfibertargets_nov2020_28b", "openfibertargets_nov2020_28c", "openfibertargets_nov2020_29", "openfibertargets_nov2020_3", "openfibertargets_nov2020_30", "openfibertargets_nov2020_31", "openfibertargets_nov2020_32", "openfibertargets_nov2020_33", "openfibertargets_nov2020_34a", "openfibertargets_nov2020_34b", "openfibertargets_nov2020_35a", "openfibertargets_nov2020_35b", "openfibertargets_nov2020_35c", "openfibertargets_nov2020_46", "openfibertargets_nov2020_47a", "openfibertargets_nov2020_47b", "openfibertargets_nov2020_47c", "openfibertargets_nov2020_47d", "openfibertargets_nov2020_47e", "openfibertargets_nov2020_5", "openfibertargets_nov2020_6a", "openfibertargets_nov2020_6b", "openfibertargets_nov2020_6c", "openfibertargets_nov2020_8", "openfibertargets_nov2020_9", "ops_2mass_psc_brightneighbors", "ops_apogee_stds", "ops_gaia_brightneighbors", "ops_sky_apogee", "ops_sky_apogee_best", "ops_sky_apogee_good", "ops_sky_boss", "ops_sky_boss_best", "ops_sky_boss_fallback", "ops_sky_boss_good", "ops_std_apogee", "ops_std_boss", "ops_std_boss_gdr2", "ops_std_boss_lsdr10", "ops_std_boss_lsdr8", "ops_std_boss_ps1dr2", "ops_std_boss_red", "ops_std_boss-red", "ops_std_boss_tic", "ops_std_eboss", "ops_tycho2_brightneighbors" ) cartons = [SimplifiedCarton(carton=carton_name) for carton_name in carton_names] with database.atomic(): ( SimplifiedCarton .bulk_create(cartons) ) np.random.seed(0) C = len(cartons) N = 10_000 N_cartons_assigned_to = np.random.randint(0, C, size=N) print("Preparing sources") assignments = {} sources = [] for i in tqdm(range(N)): ra, dec = np.random.uniform(size=2) source = Source.create(ra=ra, dec=dec) carton_assignments = np.random.choice(cartons, size=N_cartons_assigned_to[i], replace=False) for carton in carton_assignments: source.carton_flags.set_bit(carton.id) assignments.setdefault(carton.id, []) assignments[carton.id].append(source.id) source.save() sources.append(source) for carton_id, expected_source_ids in tqdm(assignments.items(), desc="Checking.."): q = ( Source .select(Source.id) .where(Source.is_in_simplified_carton(carton_id)) .tuples() ) actual_source_ids = [source_id for source_id, in q] diff = set(expected_source_ids).symmetric_difference(actual_source_ids) assert len(diff) == 0 print("Done")