Skip to content

Instantly share code, notes, and snippets.

@andycasey
Last active August 13, 2023 03:58
Show Gist options
  • Save andycasey/1106d51aaec54633eedb451437fce2b7 to your computer and use it in GitHub Desktop.
Save andycasey/1106d51aaec54633eedb451437fce2b7 to your computer and use it in GitHub Desktop.

Revisions

  1. andycasey revised this gist Apr 12, 2023. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion target_bitfield_flags.py
    Original file line number Diff line number Diff line change
    @@ -14,7 +14,7 @@
    ------------------
    A single bitfield is used to store all the primary keys of the cartons that a source is assigned.
    The minimum length of the bitfield flag used here is as small as 34 bits (way less than some of
    The minimum length of the bitfield flag used here is as small as 34 bytes (way less than some of
    the file paths that are stored in the database).
    What's bad about it?
  2. andycasey created this gist Apr 12, 2023.
    457 changes: 457 additions & 0 deletions target_bitfield_flags.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,457 @@
    """
    This minimum reproducible example demonstrates a way to use bitfield flags to store the cartons
    that a source is assigned to.
    What does it do?
    ----------------
    1. Creates a `Source` table and inserts 10,000 random sources.
    2. Creates some `SimplifiedCarton` entries based on unique names currently in targetdb.
    3. Assigns sources to random cartons (many more than what would exist in reality).
    4. Checks that when querying for sources based on a carton, it retrieves only the sources expected.
    How does it do it?
    ------------------
    A single bitfield is used to store all the primary keys of the cartons that a source is assigned.
    The minimum length of the bitfield flag used here is as small as 34 bits (way less than some of
    the file paths that are stored in the database).
    What's bad about it?
    --------------------
    The length of the bytearray varies per source, and is limited by the maximum primary key that a
    carton is assigned. In this example I have created a `SimplifiedCarton` just to make things more
    efficient, but this could in principle be used to map directly to the primary keys in
    `targetdb.carton.pk`. There are many gaps in primary keys in `targetdb.carton`, which makes the
    bitfield storage less efficient.
    Storing the carton assignments in a bitfield flag makes it a little less easy to query (than
    compared to a normal `peewee.BitField`), but I have included some hybrid methods to make it easy
    to query by carton primary key.
    Caveats
    -------
    This example works in PostgreSQL. You should be able to store the bitfield flags in SQLite, but
    I think you won't be able to query with them in SQLite, because SQLite does not have as many
    bitwise string operators available (I *think*; it would be great if we can do it in SQLite too).
    """

    import numpy as np

    from peewee import (
    fn,
    AutoField,
    FloatField,
    TextField,
    Model,
    BigBitField,
    PostgresqlDatabase,
    )
    from playhouse.hybrid import hybrid_method

    database = PostgresqlDatabase(...)

    class BaseModel(Model):
    class Meta:
    database = database
    schema = None
    legacy_table_names = False


    class SimplifiedCarton(BaseModel):

    """
    A simplified carton for use with targeting flags.
    Simplified cartons have sanitised names, and do not track versions of cartons.
    A simplified carton is considered unique by its 'sanitised' name. The 'sanitised'
    name is where the dashes '-' are replaced with underscores '_'. For example, the
    cartons with names 'bhm_aqmes_bonus_bright' and 'bhm_aqmes_bonus-bright' in
    `targetdb.carton` are considered the same simplified carton with the name
    'bhm_aqmes_bonus_bright'.
    If a source was assigned to a carton in `targetdb` in some late version, then
    that will not be tracked by this `SimplifiedCarton`. This `SimplifiedCarton`
    should only track whether or not a source was assigned to a carton, not when.
    """

    id = AutoField()
    carton = TextField(unique=True, index=True)



    class Source(BaseModel):

    """ An astronomical source. """

    # Identifiers
    id = AutoField()

    # Astrometry
    ra = FloatField()
    dec = FloatField()

    # Targeting
    carton_0 = TextField(null=True)
    carton_flags = BigBitField(null=True)

    @hybrid_method
    def is_in_simplified_carton(self, carton_id):
    """ Return whether this source is assigned to the given (simplified) carton. """
    return (
    (fn.length(self.carton_flags) > int(carton_id / 8))
    & (fn.get_bit(self.carton_flags, carton_id) > 0)
    )

    @hybrid_method
    def is_in_any_simplified_carton(self, *carton_ids):
    return fn.OR(*[self.is_in_simplified_carton(carton_id) for carton_id in carton_ids])

    @property
    def simplified_carton_ids(self):
    """ Return the primary keys of the cartons that this source is assigned. """
    i, cartons, cur_size = (0, [], len(self.carton_flags._buffer))
    while True:
    byte_num, byte_offset = divmod(i, 8)
    if byte_num >= cur_size:
    break
    if bool(self.carton_flags._buffer[byte_num] & (1 << byte_offset)):
    cartons.append(i)
    i += 1
    return cartons


    if __name__ == "__main__":
    import numpy as np
    from tqdm import tqdm

    database.drop_tables([
    Source,
    SimplifiedCarton
    ])

    database.create_tables([
    Source,
    SimplifiedCarton,
    ])

    carton_names = (
    "bhm_aqmes_bonus_bright",
    "bhm_aqmes_bonus-bright",
    "bhm_aqmes_bonus_core",
    "bhm_aqmes_bonus-dark",
    "bhm_aqmes_bonus_faint",
    "bhm_aqmes_med",
    "bhm_aqmes_med_faint",
    "bhm_aqmes_med-faint",
    "bhm_aqmes_wide2",
    "bhm_aqmes_wide2_faint",
    "bhm_aqmes_wide2-faint",
    "bhm_aqmes_wide3",
    "bhm_aqmes_wide3-faint",
    "bhm_colr_galaxies_lsdr10",
    "bhm_colr_galaxies_lsdr8",
    "bhm_csc_apogee",
    "bhm_csc_boss",
    "bhm_csc_boss_bright",
    "bhm_csc_boss-bright",
    "bhm_csc_boss_dark",
    "bhm_csc_boss-dark",
    "bhm_gua_bright",
    "bhm_gua_dark",
    "bhm_rm_ancillary",
    "bhm_rm_core",
    "bhm_rm_known_spec",
    "bhm_rm_known-spec",
    "bhm_rm_var",
    "bhm_rm_xrayqso",
    "bhm_spiders_agn-efeds",
    "bhm_spiders_agn_efeds_stragglers",
    "bhm_spiders_agn_gaiadr2",
    "bhm_spiders_agn_gaiadr3",
    "bhm_spiders_agn_hard",
    "bhm_spiders_agn_lsdr10",
    "bhm_spiders_agn_lsdr8",
    "bhm_spiders_agn_ps1dr2",
    "bhm_spiders_agn_sep",
    "bhm_spiders_agn_skymapperdr2",
    "bhm_spiders_agn_supercosmos",
    "bhm_spiders_agn_tda",
    "bhm_spiders_clusters-efeds-erosita",
    "bhm_spiders_clusters-efeds-hsc-redmapper",
    "bhm_spiders_clusters-efeds-ls-redmapper",
    "bhm_spiders_clusters-efeds-sdss-redmapper",
    "bhm_spiders_clusters_efeds_stragglers",
    "bhm_spiders_clusters_lsdr10",
    "bhm_spiders_clusters_lsdr8",
    "bhm_spiders_clusters_ps1dr2",
    "comm_pleiades",
    "comm_spectrophoto",
    "manual_bhm_spiders_comm",
    "manual_bhm_spiders_comm_lco",
    "manual_bright_target",
    "manual_bright_target_offsets_1",
    "manual_bright_target_offsets_1_g13",
    "manual_bright_target_offsets_2",
    "manual_bright_target_offsets_2_g13",
    "manual_bright_target_offsets_3",
    "manual_bright_targets",
    "manual_bright_targets_g13",
    "manual_bright_targets_g13_offset_fixed_1",
    "manual_bright_targets_g13_offset_fixed_2",
    "manual_bright_targets_g13_offset_fixed_3",
    "manual_bright_targets_g13_offset_fixed_4",
    "manual_bright_targets_g13_offset_fixed_5",
    "manual_bright_targets_g13_offset_fixed_6",
    "manual_bright_targets_g13_offset_fixed_7",
    "manual_fps_position_stars",
    "manual_fps_position_stars_10",
    "manual_fps_position_stars_apogee_10",
    "manual_fps_position_stars_lco_apogee_10",
    "manual_mwm_crosscalib_apogee",
    "manual_mwm_crosscalib_yso_apogee",
    "manual_mwm_crosscalib_yso_boss",
    "manual_mwm_halo_distant_bhb",
    "manual_mwm_halo_distant_kgiant",
    "manual_mwm_halo_mp_bbb",
    "manual_mwm_magcloud_massive_apogee",
    "manual_mwm_magcloud_massive_boss",
    "manual_mwm_magcloud_symbiotic_apogee",
    "manual_mwm_planet_ca_legacy_v1",
    "manual_mwm_planet_gaia_astrometry_v1",
    "manual_mwm_planet_gpi_v1",
    "manual_mwm_planet_harps_v1",
    "manual_mwm_planet_known_v1",
    "manual_mwm_planet_sophie_v1",
    "manual_mwm_planet_sphere_v1",
    "manual_mwm_planet_tess_eb_v1",
    "manual_mwm_planet_tess_pc_v1",
    "manual_mwm_planet_transiting_bd_v1",
    "manual_mwm_tess_ob",
    "manual_mwm_validation_cool_apogee",
    "manual_mwm_validation_cool_boss",
    "manual_mwm_validation_hot_apogee",
    "manual_mwm_validation_hot_boss",
    "manual_mwm_validation_rv",
    "manual_nsbh_apogee",
    "manual_nsbh_boss",
    "manual_offset_mwmhalo_off00",
    "manual_offset_mwmhalo_off05",
    "manual_offset_mwmhalo_off10",
    "manual_offset_mwmhalo_off20",
    "manual_offset_mwmhalo_off30",
    "manual_offset_mwmhalo_offa",
    "manual_offset_mwmhalo_offb",
    "manual_planet_ca_legacy_v0",
    "manual_planet_gaia_astrometry_v0",
    "manual_planet_gpi_v0",
    "manual_planet_harps_v0",
    "manual_planet_known_v0",
    "manual_planet_sophie_v0",
    "manual_planet_sphere_v0",
    "manual_planet_tess_eb_v0",
    "manual_planet_tess_pc_v0",
    "manual_planet_transiting_bd_v0",
    "manual_validation_apogee",
    "manual_validation_boss",
    "manual_validation_cool_apogee",
    "manual_validation_cool_boss",
    "manual_validation_rv",
    "mwm_bin_rv_long",
    "mwm_bin_rv_short",
    "mwm_cb_300pc",
    "mwm_cb_300pc_apogee",
    "mwm_cb_300pc_boss",
    "mwm_cb_cvcandidates",
    "mwm_cb_cvcandidates_apogee",
    "mwm_cb_cvcandidates_boss",
    "mwm_cb_gaiagalex",
    "mwm_cb_gaiagalex_apogee",
    "mwm_cb_gaiagalex_boss",
    "mwm_cb_uvex1",
    "mwm_cb_uvex2",
    "mwm_cb_uvex3",
    "mwm_cb_uvex4",
    "mwm_cb_uvex5",
    "mwm_dust_core",
    "mwm_erosita_compact",
    "mwm_erosita_compact_deep",
    "mwm_erosita_compact_gen",
    "mwm_erosita_compact_var",
    "mwm_erosita_stars",
    "mwm_galactic_core",
    "mwm_galactic_core_dist",
    "mwm_gg_core",
    "mwm_halo_bb",
    "mwm_halo_bb_apogee",
    "mwm_halo_bb_boss",
    "mwm_halo_sm",
    "mwm_halo_sm_apogee",
    "mwm_halo_sm_boss",
    "mwm_legacy_ir2opt",
    "mwm_ob_cepheids",
    "mwm_ob_core",
    "mwm_planet_tess",
    "mwm_rv_long_bplates",
    "mwm_rv_long-bplates",
    "mwm_rv_long_fps",
    "mwm_rv_long-fps",
    "mwm_rv_long_rm",
    "mwm_rv_long-rm",
    "mwm_rv_short_bplates",
    "mwm_rv_short-bplates",
    "mwm_rv_short_fps",
    "mwm_rv_short-fps",
    "mwm_rv_short_rm",
    "mwm_rv_short-rm",
    "mwm_snc_100pc",
    "mwm_snc_100pc_apogee",
    "mwm_snc_100pc_boss",
    "mwm_snc_250pc",
    "mwm_snc_250pc_apogee",
    "mwm_snc_250pc_boss",
    "mwm_tess_2min",
    "mwm_tess_ob",
    "mwm_tess_planet",
    "mwm_tess_rgb",
    "mwm_tessrgb_core",
    "mwm_wd_core",
    "mwm_wd_pwd",
    "mwm_yso_cluster",
    "mwm_yso_cluster_apogee",
    "mwm_yso_cluster_boss",
    "mwm_yso_cmz",
    "mwm_yso_cmz_apogee",
    "mwm_yso_disk_apogee",
    "mwm_yso_disk_boss",
    "mwm_yso_embedded_apogee",
    "mwm_yso_nebula_apogee",
    "mwm_yso_ob",
    "mwm_yso_ob_apogee",
    "mwm_yso_ob_boss",
    "mwm_yso_pms_apogee",
    "mwm_yso_pms_apogee_sagitta_edr3",
    "mwm_yso_pms_apogee_zari18pms",
    "mwm_yso_pms_boss",
    "mwm_yso_pms_boss_sagitta_edr3",
    "mwm_yso_pms_boss_zari18pms",
    "mwm_yso_s1",
    "mwm_yso_s2",
    "mwm_yso_s2-5",
    "mwm_yso_s3",
    "mwm_yso_variable_apogee",
    "mwm_yso_variable_boss",
    "openfiberstargets_test",
    "openfibertargets_nov2020_10",
    "openfibertargets_nov2020_1000",
    "openfibertargets_nov2020_1001a",
    "openfibertargets_nov2020_1001b",
    "openfibertargets_nov2020_11",
    "openfibertargets_nov2020_12",
    "openfibertargets_nov2020_14",
    "openfibertargets_nov2020_15",
    "openfibertargets_nov2020_17",
    "openfibertargets_nov2020_18",
    "openfibertargets_nov2020_19a",
    "openfibertargets_nov2020_19b",
    "openfibertargets_nov2020_19c",
    "openfibertargets_nov2020_22",
    "openfibertargets_nov2020_24",
    "openfibertargets_nov2020_25",
    "openfibertargets_nov2020_26",
    "openfibertargets_nov2020_27",
    "openfibertargets_nov2020_28a",
    "openfibertargets_nov2020_28b",
    "openfibertargets_nov2020_28c",
    "openfibertargets_nov2020_29",
    "openfibertargets_nov2020_3",
    "openfibertargets_nov2020_30",
    "openfibertargets_nov2020_31",
    "openfibertargets_nov2020_32",
    "openfibertargets_nov2020_33",
    "openfibertargets_nov2020_34a",
    "openfibertargets_nov2020_34b",
    "openfibertargets_nov2020_35a",
    "openfibertargets_nov2020_35b",
    "openfibertargets_nov2020_35c",
    "openfibertargets_nov2020_46",
    "openfibertargets_nov2020_47a",
    "openfibertargets_nov2020_47b",
    "openfibertargets_nov2020_47c",
    "openfibertargets_nov2020_47d",
    "openfibertargets_nov2020_47e",
    "openfibertargets_nov2020_5",
    "openfibertargets_nov2020_6a",
    "openfibertargets_nov2020_6b",
    "openfibertargets_nov2020_6c",
    "openfibertargets_nov2020_8",
    "openfibertargets_nov2020_9",
    "ops_2mass_psc_brightneighbors",
    "ops_apogee_stds",
    "ops_gaia_brightneighbors",
    "ops_sky_apogee",
    "ops_sky_apogee_best",
    "ops_sky_apogee_good",
    "ops_sky_boss",
    "ops_sky_boss_best",
    "ops_sky_boss_fallback",
    "ops_sky_boss_good",
    "ops_std_apogee",
    "ops_std_boss",
    "ops_std_boss_gdr2",
    "ops_std_boss_lsdr10",
    "ops_std_boss_lsdr8",
    "ops_std_boss_ps1dr2",
    "ops_std_boss_red",
    "ops_std_boss-red",
    "ops_std_boss_tic",
    "ops_std_eboss",
    "ops_tycho2_brightneighbors"
    )

    cartons = [SimplifiedCarton(carton=carton_name) for carton_name in carton_names]
    with database.atomic():
    (
    SimplifiedCarton
    .bulk_create(cartons)
    )

    np.random.seed(0)

    C = len(cartons)
    N = 10_000
    N_cartons_assigned_to = np.random.randint(0, C, size=N)

    print("Preparing sources")
    assignments = {}

    sources = []
    for i in tqdm(range(N)):
    ra, dec = np.random.uniform(size=2)
    source = Source.create(ra=ra, dec=dec)


    carton_assignments = np.random.choice(cartons, size=N_cartons_assigned_to[i], replace=False)
    for carton in carton_assignments:
    source.carton_flags.set_bit(carton.id)
    assignments.setdefault(carton.id, [])
    assignments[carton.id].append(source.id)

    source.save()
    sources.append(source)


    for carton_id, expected_source_ids in tqdm(assignments.items(), desc="Checking.."):
    q = (
    Source
    .select(Source.id)
    .where(Source.is_in_simplified_carton(carton_id))
    .tuples()
    )
    actual_source_ids = [source_id for source_id, in q]
    diff = set(expected_source_ids).symmetric_difference(actual_source_ids)
    assert len(diff) == 0

    print("Done")