Skip to content

Instantly share code, notes, and snippets.

@andycasey
Last active August 13, 2023 03:58
Show Gist options
  • Save andycasey/1106d51aaec54633eedb451437fce2b7 to your computer and use it in GitHub Desktop.
Save andycasey/1106d51aaec54633eedb451437fce2b7 to your computer and use it in GitHub Desktop.
This demonstrates a way to use bitfield flags to store all targeting information for SDSS-V
"""
This minimum reproducible example demonstrates a way to use bitfield flags to store the cartons
that a source is assigned to.
What does it do?
----------------
1. Creates a `Source` table and inserts 10,000 random sources.
2. Creates some `SimplifiedCarton` entries based on unique names currently in targetdb.
3. Assigns sources to random cartons (many more than what would exist in reality).
4. Checks that when querying for sources based on a carton, it retrieves only the sources expected.
How does it do it?
------------------
A single bitfield is used to store all the primary keys of the cartons that a source is assigned.
The minimum length of the bitfield flag used here is as small as 34 bytes (way less than some of
the file paths that are stored in the database).
What's bad about it?
--------------------
The length of the bytearray varies per source, and is limited by the maximum primary key that a
carton is assigned. In this example I have created a `SimplifiedCarton` just to make things more
efficient, but this could in principle be used to map directly to the primary keys in
`targetdb.carton.pk`. There are many gaps in primary keys in `targetdb.carton`, which makes the
bitfield storage less efficient.
Storing the carton assignments in a bitfield flag makes it a little less easy to query (than
compared to a normal `peewee.BitField`), but I have included some hybrid methods to make it easy
to query by carton primary key.
Caveats
-------
This example works in PostgreSQL. You should be able to store the bitfield flags in SQLite, but
I think you won't be able to query with them in SQLite, because SQLite does not have as many
bitwise string operators available (I *think*; it would be great if we can do it in SQLite too).
"""
import numpy as np
from peewee import (
fn,
AutoField,
FloatField,
TextField,
Model,
BigBitField,
PostgresqlDatabase,
)
from playhouse.hybrid import hybrid_method
database = PostgresqlDatabase(...)
class BaseModel(Model):
class Meta:
database = database
schema = None
legacy_table_names = False
class SimplifiedCarton(BaseModel):
"""
A simplified carton for use with targeting flags.
Simplified cartons have sanitised names, and do not track versions of cartons.
A simplified carton is considered unique by its 'sanitised' name. The 'sanitised'
name is where the dashes '-' are replaced with underscores '_'. For example, the
cartons with names 'bhm_aqmes_bonus_bright' and 'bhm_aqmes_bonus-bright' in
`targetdb.carton` are considered the same simplified carton with the name
'bhm_aqmes_bonus_bright'.
If a source was assigned to a carton in `targetdb` in some late version, then
that will not be tracked by this `SimplifiedCarton`. This `SimplifiedCarton`
should only track whether or not a source was assigned to a carton, not when.
"""
id = AutoField()
carton = TextField(unique=True, index=True)
class Source(BaseModel):
""" An astronomical source. """
# Identifiers
id = AutoField()
# Astrometry
ra = FloatField()
dec = FloatField()
# Targeting
carton_0 = TextField(null=True)
carton_flags = BigBitField(null=True)
@hybrid_method
def is_in_simplified_carton(self, carton_id):
""" Return whether this source is assigned to the given (simplified) carton. """
return (
(fn.length(self.carton_flags) > int(carton_id / 8))
& (fn.get_bit(self.carton_flags, carton_id) > 0)
)
@hybrid_method
def is_in_any_simplified_carton(self, *carton_ids):
return fn.OR(*[self.is_in_simplified_carton(carton_id) for carton_id in carton_ids])
@property
def simplified_carton_ids(self):
""" Return the primary keys of the cartons that this source is assigned. """
i, cartons, cur_size = (0, [], len(self.carton_flags._buffer))
while True:
byte_num, byte_offset = divmod(i, 8)
if byte_num >= cur_size:
break
if bool(self.carton_flags._buffer[byte_num] & (1 << byte_offset)):
cartons.append(i)
i += 1
return cartons
if __name__ == "__main__":
import numpy as np
from tqdm import tqdm
database.drop_tables([
Source,
SimplifiedCarton
])
database.create_tables([
Source,
SimplifiedCarton,
])
carton_names = (
"bhm_aqmes_bonus_bright",
"bhm_aqmes_bonus-bright",
"bhm_aqmes_bonus_core",
"bhm_aqmes_bonus-dark",
"bhm_aqmes_bonus_faint",
"bhm_aqmes_med",
"bhm_aqmes_med_faint",
"bhm_aqmes_med-faint",
"bhm_aqmes_wide2",
"bhm_aqmes_wide2_faint",
"bhm_aqmes_wide2-faint",
"bhm_aqmes_wide3",
"bhm_aqmes_wide3-faint",
"bhm_colr_galaxies_lsdr10",
"bhm_colr_galaxies_lsdr8",
"bhm_csc_apogee",
"bhm_csc_boss",
"bhm_csc_boss_bright",
"bhm_csc_boss-bright",
"bhm_csc_boss_dark",
"bhm_csc_boss-dark",
"bhm_gua_bright",
"bhm_gua_dark",
"bhm_rm_ancillary",
"bhm_rm_core",
"bhm_rm_known_spec",
"bhm_rm_known-spec",
"bhm_rm_var",
"bhm_rm_xrayqso",
"bhm_spiders_agn-efeds",
"bhm_spiders_agn_efeds_stragglers",
"bhm_spiders_agn_gaiadr2",
"bhm_spiders_agn_gaiadr3",
"bhm_spiders_agn_hard",
"bhm_spiders_agn_lsdr10",
"bhm_spiders_agn_lsdr8",
"bhm_spiders_agn_ps1dr2",
"bhm_spiders_agn_sep",
"bhm_spiders_agn_skymapperdr2",
"bhm_spiders_agn_supercosmos",
"bhm_spiders_agn_tda",
"bhm_spiders_clusters-efeds-erosita",
"bhm_spiders_clusters-efeds-hsc-redmapper",
"bhm_spiders_clusters-efeds-ls-redmapper",
"bhm_spiders_clusters-efeds-sdss-redmapper",
"bhm_spiders_clusters_efeds_stragglers",
"bhm_spiders_clusters_lsdr10",
"bhm_spiders_clusters_lsdr8",
"bhm_spiders_clusters_ps1dr2",
"comm_pleiades",
"comm_spectrophoto",
"manual_bhm_spiders_comm",
"manual_bhm_spiders_comm_lco",
"manual_bright_target",
"manual_bright_target_offsets_1",
"manual_bright_target_offsets_1_g13",
"manual_bright_target_offsets_2",
"manual_bright_target_offsets_2_g13",
"manual_bright_target_offsets_3",
"manual_bright_targets",
"manual_bright_targets_g13",
"manual_bright_targets_g13_offset_fixed_1",
"manual_bright_targets_g13_offset_fixed_2",
"manual_bright_targets_g13_offset_fixed_3",
"manual_bright_targets_g13_offset_fixed_4",
"manual_bright_targets_g13_offset_fixed_5",
"manual_bright_targets_g13_offset_fixed_6",
"manual_bright_targets_g13_offset_fixed_7",
"manual_fps_position_stars",
"manual_fps_position_stars_10",
"manual_fps_position_stars_apogee_10",
"manual_fps_position_stars_lco_apogee_10",
"manual_mwm_crosscalib_apogee",
"manual_mwm_crosscalib_yso_apogee",
"manual_mwm_crosscalib_yso_boss",
"manual_mwm_halo_distant_bhb",
"manual_mwm_halo_distant_kgiant",
"manual_mwm_halo_mp_bbb",
"manual_mwm_magcloud_massive_apogee",
"manual_mwm_magcloud_massive_boss",
"manual_mwm_magcloud_symbiotic_apogee",
"manual_mwm_planet_ca_legacy_v1",
"manual_mwm_planet_gaia_astrometry_v1",
"manual_mwm_planet_gpi_v1",
"manual_mwm_planet_harps_v1",
"manual_mwm_planet_known_v1",
"manual_mwm_planet_sophie_v1",
"manual_mwm_planet_sphere_v1",
"manual_mwm_planet_tess_eb_v1",
"manual_mwm_planet_tess_pc_v1",
"manual_mwm_planet_transiting_bd_v1",
"manual_mwm_tess_ob",
"manual_mwm_validation_cool_apogee",
"manual_mwm_validation_cool_boss",
"manual_mwm_validation_hot_apogee",
"manual_mwm_validation_hot_boss",
"manual_mwm_validation_rv",
"manual_nsbh_apogee",
"manual_nsbh_boss",
"manual_offset_mwmhalo_off00",
"manual_offset_mwmhalo_off05",
"manual_offset_mwmhalo_off10",
"manual_offset_mwmhalo_off20",
"manual_offset_mwmhalo_off30",
"manual_offset_mwmhalo_offa",
"manual_offset_mwmhalo_offb",
"manual_planet_ca_legacy_v0",
"manual_planet_gaia_astrometry_v0",
"manual_planet_gpi_v0",
"manual_planet_harps_v0",
"manual_planet_known_v0",
"manual_planet_sophie_v0",
"manual_planet_sphere_v0",
"manual_planet_tess_eb_v0",
"manual_planet_tess_pc_v0",
"manual_planet_transiting_bd_v0",
"manual_validation_apogee",
"manual_validation_boss",
"manual_validation_cool_apogee",
"manual_validation_cool_boss",
"manual_validation_rv",
"mwm_bin_rv_long",
"mwm_bin_rv_short",
"mwm_cb_300pc",
"mwm_cb_300pc_apogee",
"mwm_cb_300pc_boss",
"mwm_cb_cvcandidates",
"mwm_cb_cvcandidates_apogee",
"mwm_cb_cvcandidates_boss",
"mwm_cb_gaiagalex",
"mwm_cb_gaiagalex_apogee",
"mwm_cb_gaiagalex_boss",
"mwm_cb_uvex1",
"mwm_cb_uvex2",
"mwm_cb_uvex3",
"mwm_cb_uvex4",
"mwm_cb_uvex5",
"mwm_dust_core",
"mwm_erosita_compact",
"mwm_erosita_compact_deep",
"mwm_erosita_compact_gen",
"mwm_erosita_compact_var",
"mwm_erosita_stars",
"mwm_galactic_core",
"mwm_galactic_core_dist",
"mwm_gg_core",
"mwm_halo_bb",
"mwm_halo_bb_apogee",
"mwm_halo_bb_boss",
"mwm_halo_sm",
"mwm_halo_sm_apogee",
"mwm_halo_sm_boss",
"mwm_legacy_ir2opt",
"mwm_ob_cepheids",
"mwm_ob_core",
"mwm_planet_tess",
"mwm_rv_long_bplates",
"mwm_rv_long-bplates",
"mwm_rv_long_fps",
"mwm_rv_long-fps",
"mwm_rv_long_rm",
"mwm_rv_long-rm",
"mwm_rv_short_bplates",
"mwm_rv_short-bplates",
"mwm_rv_short_fps",
"mwm_rv_short-fps",
"mwm_rv_short_rm",
"mwm_rv_short-rm",
"mwm_snc_100pc",
"mwm_snc_100pc_apogee",
"mwm_snc_100pc_boss",
"mwm_snc_250pc",
"mwm_snc_250pc_apogee",
"mwm_snc_250pc_boss",
"mwm_tess_2min",
"mwm_tess_ob",
"mwm_tess_planet",
"mwm_tess_rgb",
"mwm_tessrgb_core",
"mwm_wd_core",
"mwm_wd_pwd",
"mwm_yso_cluster",
"mwm_yso_cluster_apogee",
"mwm_yso_cluster_boss",
"mwm_yso_cmz",
"mwm_yso_cmz_apogee",
"mwm_yso_disk_apogee",
"mwm_yso_disk_boss",
"mwm_yso_embedded_apogee",
"mwm_yso_nebula_apogee",
"mwm_yso_ob",
"mwm_yso_ob_apogee",
"mwm_yso_ob_boss",
"mwm_yso_pms_apogee",
"mwm_yso_pms_apogee_sagitta_edr3",
"mwm_yso_pms_apogee_zari18pms",
"mwm_yso_pms_boss",
"mwm_yso_pms_boss_sagitta_edr3",
"mwm_yso_pms_boss_zari18pms",
"mwm_yso_s1",
"mwm_yso_s2",
"mwm_yso_s2-5",
"mwm_yso_s3",
"mwm_yso_variable_apogee",
"mwm_yso_variable_boss",
"openfiberstargets_test",
"openfibertargets_nov2020_10",
"openfibertargets_nov2020_1000",
"openfibertargets_nov2020_1001a",
"openfibertargets_nov2020_1001b",
"openfibertargets_nov2020_11",
"openfibertargets_nov2020_12",
"openfibertargets_nov2020_14",
"openfibertargets_nov2020_15",
"openfibertargets_nov2020_17",
"openfibertargets_nov2020_18",
"openfibertargets_nov2020_19a",
"openfibertargets_nov2020_19b",
"openfibertargets_nov2020_19c",
"openfibertargets_nov2020_22",
"openfibertargets_nov2020_24",
"openfibertargets_nov2020_25",
"openfibertargets_nov2020_26",
"openfibertargets_nov2020_27",
"openfibertargets_nov2020_28a",
"openfibertargets_nov2020_28b",
"openfibertargets_nov2020_28c",
"openfibertargets_nov2020_29",
"openfibertargets_nov2020_3",
"openfibertargets_nov2020_30",
"openfibertargets_nov2020_31",
"openfibertargets_nov2020_32",
"openfibertargets_nov2020_33",
"openfibertargets_nov2020_34a",
"openfibertargets_nov2020_34b",
"openfibertargets_nov2020_35a",
"openfibertargets_nov2020_35b",
"openfibertargets_nov2020_35c",
"openfibertargets_nov2020_46",
"openfibertargets_nov2020_47a",
"openfibertargets_nov2020_47b",
"openfibertargets_nov2020_47c",
"openfibertargets_nov2020_47d",
"openfibertargets_nov2020_47e",
"openfibertargets_nov2020_5",
"openfibertargets_nov2020_6a",
"openfibertargets_nov2020_6b",
"openfibertargets_nov2020_6c",
"openfibertargets_nov2020_8",
"openfibertargets_nov2020_9",
"ops_2mass_psc_brightneighbors",
"ops_apogee_stds",
"ops_gaia_brightneighbors",
"ops_sky_apogee",
"ops_sky_apogee_best",
"ops_sky_apogee_good",
"ops_sky_boss",
"ops_sky_boss_best",
"ops_sky_boss_fallback",
"ops_sky_boss_good",
"ops_std_apogee",
"ops_std_boss",
"ops_std_boss_gdr2",
"ops_std_boss_lsdr10",
"ops_std_boss_lsdr8",
"ops_std_boss_ps1dr2",
"ops_std_boss_red",
"ops_std_boss-red",
"ops_std_boss_tic",
"ops_std_eboss",
"ops_tycho2_brightneighbors"
)
cartons = [SimplifiedCarton(carton=carton_name) for carton_name in carton_names]
with database.atomic():
(
SimplifiedCarton
.bulk_create(cartons)
)
np.random.seed(0)
C = len(cartons)
N = 10_000
N_cartons_assigned_to = np.random.randint(0, C, size=N)
print("Preparing sources")
assignments = {}
sources = []
for i in tqdm(range(N)):
ra, dec = np.random.uniform(size=2)
source = Source.create(ra=ra, dec=dec)
carton_assignments = np.random.choice(cartons, size=N_cartons_assigned_to[i], replace=False)
for carton in carton_assignments:
source.carton_flags.set_bit(carton.id)
assignments.setdefault(carton.id, [])
assignments[carton.id].append(source.id)
source.save()
sources.append(source)
for carton_id, expected_source_ids in tqdm(assignments.items(), desc="Checking.."):
q = (
Source
.select(Source.id)
.where(Source.is_in_simplified_carton(carton_id))
.tuples()
)
actual_source_ids = [source_id for source_id, in q]
diff = set(expected_source_ids).symmetric_difference(actual_source_ids)
assert len(diff) == 0
print("Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment