From 0449f5a90ac9f0a568762426e290ff194996d1b9 Mon Sep 17 00:00:00 2001 From: Lionel Sambuc Date: Tue, 15 Oct 2019 19:30:01 +0200 Subject: [PATCH] Implement ViewPort & resolution selection * Implement multi-scale indices generation. * Deduplicate values before sorting them while generating an index, to reduce the number of points to sort. * Use a hash table to deduplicate values, instead of a sort + dedup call. * Minor code cleanups --- src/database/db_core.rs | 11 +- src/database/mod.rs | 20 +-- src/database/space/position.rs | 10 ++ src/database/space/shape.rs | 45 ++++++ src/database/space_db.rs | 266 ++++++++++++++++++++++++++++----- src/database/space_index.rs | 12 +- src/json/model.rs | 12 +- src/json/storage.rs | 9 +- src/lib.rs | 3 + src/main.rs | 6 +- 10 files changed, 334 insertions(+), 60 deletions(-) diff --git a/src/database/db_core.rs b/src/database/db_core.rs index cc73b95..3b5f4ba 100644 --- a/src/database/db_core.rs +++ b/src/database/db_core.rs @@ -11,7 +11,8 @@ pub struct CoreQueryParameters<'a> { pub db: &'a DataBase, pub output_space: Option<&'a str>, pub threshold_volume: Option, - pub resolution: Option>, + pub view_port: &'a Option<(Vec, Vec)>, + pub resolution: Option>, } #[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] @@ -75,6 +76,8 @@ impl Core { spaces: &[Space], properties: Vec, space_objects: Vec, + scales: Option>>, + max_elements: Option, ) -> Self //Result where @@ -101,7 +104,7 @@ impl Core { }) .collect(); - space_dbs.push(SpaceDB::new(space.name(), filtered)) + space_dbs.push(SpaceDB::new(&space, filtered, scales.clone(), max_elements)) } Core { @@ -195,6 +198,7 @@ impl Core { output_space, threshold_volume, resolution, + .. } = parameters; let mut results = vec![]; @@ -238,6 +242,7 @@ impl Core { output_space, threshold_volume, resolution, + .. } = parameters; let mut results = vec![]; @@ -272,6 +277,7 @@ impl Core { output_space, threshold_volume, resolution, + .. 
} = parameters; let id: String = id.into(); @@ -310,6 +316,7 @@ impl Core { output_space, threshold_volume, resolution, + .. } = parameters; let id: String = id.into(); diff --git a/src/database/mod.rs b/src/database/mod.rs index 4671db3..587faa5 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -21,7 +21,7 @@ pub type ResultSet = Result, String>; pub type ReferenceSpaceIndex = ironsea_index_hashmap::Index, Space, String>; type CoreIndex = ironsea_index_hashmap::Index, Core, String>; -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Hash, PartialEq, Serialize)] pub struct SpaceId(String); impl SpaceId { @@ -49,12 +49,6 @@ where } } -impl PartialEq for SpaceId { - fn eq(&self, other: &Self) -> bool { - self.0 == other.0 - } -} - #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize)] pub struct SpaceObject { pub space_id: String, @@ -62,18 +56,6 @@ pub struct SpaceObject { pub value: Properties, } -// FIXME: Which is faster, the code below or the automatically generated -// implementation? 
-/* -impl PartialEq for SpaceObject { - fn eq(&self, other: &Self) -> bool { - self.space_id == other.space_id - && self.value == other.value - && self.position == other.position - } -} -*/ - #[derive(Clone, Debug, Deserialize, Serialize)] pub struct DataBase { reference_spaces: ReferenceSpaceIndex, diff --git a/src/database/space/position.rs b/src/database/space/position.rs index 42d03b0..cde57bd 100644 --- a/src/database/space/position.rs +++ b/src/database/space/position.rs @@ -82,6 +82,16 @@ impl Position { product } + + pub fn reduce_precision(&self, scale: u32) -> Self { + let mut position = Vec::with_capacity(self.dimensions()); + + for i in 0..self.dimensions() { + position.push((self[i].u64() >> scale).into()) + } + + Position::new(position) + } } impl Display for Position { diff --git a/src/database/space/shape.rs b/src/database/space/shape.rs index ab01373..fe10d43 100644 --- a/src/database/space/shape.rs +++ b/src/database/space/shape.rs @@ -206,4 +206,49 @@ impl Shape { }) .collect()) } + + pub fn volume(&self) -> f64 { + match self { + Shape::Point(_) => std::f64::EPSILON, // Smallest non-zero volume possible + Shape::BoundingBox(low, high) => { + let mut volume = 1.0; + + // For each dimension, multiply by the length in that dimension + for i in 0..low.dimensions() { + let l = low[i].f64(); + let h = high[i].f64(); + let length = if h > l { h - l } else { l - h }; + + volume *= length; + } + + volume + } + Shape::HyperSphere(position, radius) => { + // Formula from https://en.wikipedia.org/wiki/N-sphere#/media/File:N_SpheresVolumeAndSurfaceArea.png + let k = position.dimensions(); // Number of dimensions. 
+ let radius = radius.f64(); + + let pi = std::f64::consts::PI; + let factor = 2.0 * pi; + + // Set starting values for the coefficient + let mut a = 2.0; + let mut i = if (k % 2) == 0 { + a = pi; + 2 + } else { + 1 + }; + + while i < k { + i += 2; + a *= factor; + a /= i as f64; + } + + a * radius.powi(i as i32) + } + } + } } diff --git a/src/database/space_db.rs b/src/database/space_db.rs index bba70c3..3e21669 100644 --- a/src/database/space_db.rs +++ b/src/database/space_db.rs @@ -1,13 +1,21 @@ +use std::cmp::Ordering; +use std::collections::hash_map::DefaultHasher; +use std::collections::HashMap; +use std::collections::HashSet; +use std::hash::Hash; +use std::hash::Hasher; + +use ironsea_table_vector::VectorTable; + use super::space::Coordinate; use super::space::Position; use super::space::Shape; +use super::space::Space; use super::space_index::SpaceFields; use super::space_index::SpaceIndex; use super::space_index::SpaceSetIndex; use super::space_index::SpaceSetObject; -use ironsea_table_vector::VectorTable; - #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SpaceDB { reference_space: String, @@ -16,17 +24,24 @@ pub struct SpaceDB { } impl SpaceDB { - pub fn new(reference_space: S, mut space_objects: Vec) -> Self - where - S: Into, - { + pub fn new( + reference_space: &Space, + mut space_objects: Vec, + scales: Option>>, + max_elements: Option, + ) -> Self { + //FIXME: Remove hard-coded constants for dimensions & bit length of morton codes. + const DIMENSIONS: usize = 3; + const CELL_BITS: usize = 10; + let mut values = space_objects .iter() .map(|object| *object.value()) + .collect::>() + .drain() .collect::>(); values.sort_unstable_by_key(|&c| c.u64()); - values.dedup_by_key(|c| c.u64()); space_objects.iter_mut().for_each(|object| { // Update the values to point into the local (shorter) mapping array. @@ -35,27 +50,174 @@ impl SpaceDB { }); // Build the set of SpaceIndices. - // FIXME: Build multiple-scale indices. 
What is the stopping condition, and what are the parameters? - let max_elem = 2_000; - // We cannot return less that the total number of individual Ids stored - // in the index. - let max = max_elem.max(values.len()); - // Generate indices as long as max is smaller than the number of point located in the whole space. - // For each new index, reduce precision by two, and push to resolutions vectors. + let mut resolutions = vec![]; + let mut indices = vec![]; + + if let Some(scales) = scales { + // We optimize scaling, by iteratively building coarser and coarser + // indexes. Powers holds a list of bit shift to apply based on the + // previous value. + let mut powers = Vec::with_capacity(scales.len()); + + // Limit temporary values lifetimes + { + // Sort by values, smaller to bigger. + let mut exps = scales.clone(); + exps.sort_unstable_by_key(|v| v[0]); + + let mut previous = 0u32; + for scale in exps { + // FIXME: Remove these assertions ASAP, and support multi-factor scaling + assert_eq!(scale.len(), DIMENSIONS); + assert!(scale[0] == scale[1] && scale[0] == scale[2]); + + powers.push((scale[0], scale[0] - previous)); + previous = scale[0]; + } + } + + // Apply fixed scales + let mut count = 0; + for power in &powers { + space_objects = space_objects + .into_iter() + .map(|mut o| { + let p = o.position().reduce_precision(power.1); + let mut hasher = DefaultHasher::new(); + o.set_position(p); + + // Hash, AFTER updating the position. 
+ o.hash(&mut hasher); + + (hasher.finish(), o) + }) + .collect::>() + .drain() + .map(|(_k, v)| v) + .collect(); + + // Make sure we do not shift more position than available + let shift = if count >= 31 { 31 } else { count }; + count += 1; + indices.push(( + SpaceSetIndex::new( + &VectorTable::new(space_objects.to_vec()), + DIMENSIONS, + CELL_BITS, + ), + vec![power.0, power.0, power.0], + shift, + )); + } + } else { + // Generate scales, following max_elements + if let Some(max_elements) = max_elements { + // We cannot return less that the total number of individual Ids stored + // in the index for a full-volume query. + let max_elements = max_elements.max(values.len()); + let mut count = 0; + + // The next index should contain at most half the number of + // elements of the current index. + let mut element_count_target = space_objects.len() / 2; + + // Insert Full resolution index. + indices.push(( + SpaceSetIndex::new( + &VectorTable::new(space_objects.clone()), + DIMENSIONS, + CELL_BITS, + ), + vec![count, count, count], + 0, // Smallest value => highest resolution + )); + + // Generate coarser indices, until we reach the expect max_element + // values or we can't define bigger bit shift. + loop { + // Make sure we do not shift more position than available + let shift = if count >= 31 { 31 } else { count }; + count += 1; + space_objects = space_objects + .into_iter() + .map(|mut o| { + let p = o.position().reduce_precision(1); + let mut hasher = DefaultHasher::new(); + o.set_position(p); + + // Hash, AFTER updating the position. + o.hash(&mut hasher); + + (hasher.finish(), o) + }) + .collect::>() + .drain() + .map(|(_k, v)| v) + .collect(); + + // Skip a resolution if it does not bring down enough the + // number of points. It would be a waste of space to store it. + if element_count_target < space_objects.len() { + continue; + } else { + // The next index should contain at most half the number of + // elements of the current index. 
+ element_count_target = space_objects.len() / 2; + } + + indices.push(( + SpaceSetIndex::new( + &VectorTable::new(space_objects.to_vec()), + DIMENSIONS, + CELL_BITS, + ), + vec![count, count, count], + shift, + )); + + if space_objects.len() <= max_elements || count == std::u32::MAX { + break; + } + } + + // Generate indices as long as max is smaller than the number of point located in the whole space. + // For each new index, reduce precision by two, and push to resolutions vectors. + } else { + // Generate only full-scale. + indices.push(( + SpaceSetIndex::new(&VectorTable::new(space_objects), DIMENSIONS, CELL_BITS), + vec![0, 0, 0], + 0, + )); + } + } // When done, go over the array, and set the threshold_volumes with Volume total / 8 * i in reverse order - // - let index = SpaceSetIndex::new(&VectorTable::new(space_objects), 3, 10); - let mut resolutions = vec![SpaceIndex::new(std::f64::MAX, vec![0, 0, 0], index)]; + let space_volume = reference_space.volume(); + let max_shift = match indices.last() { + None => 31, + Some((_, _, x)) => *x, + }; + + for (index, scale, shift) in indices { + // Compute threshold volume as Vt = V / 2^(max_shift) * 2^shift + // => the smaller shift is, the smaller the threshold is and the higher + // the resolution is. + let volume = space_volume / f64::from(1 << (max_shift - shift)); + + resolutions.push(SpaceIndex::new(volume, scale, index)); + } // Make sure the vector is sorted by threshold volumes, smallest to largest. // this means indices are sorted form highest resolution to lowest resolution. - // default_resolution() relies on it to find the correct index. - //FIXME: Domain check between f64 <-> u64 XOR implement Ord on f64 - resolutions.sort_unstable_by_key(|a| a.threshold() as u64); + // default_resolution() relies on this to find the correct index. + resolutions.sort_unstable_by(|a, b| match a.threshold().partial_cmp(&b.threshold()) { + Some(o) => o, + None => Ordering::Less, // FIXME: This is most likely incorrect... 
+ }); SpaceDB { - reference_space: reference_space.into(), + reference_space: reference_space.name().clone(), values, resolutions, } @@ -88,29 +250,65 @@ impl SpaceDB { fn default_resolution(&self, volume: f64) -> usize { for i in 0..self.resolutions.len() { if volume <= self.resolutions[i].threshold() { + debug!( + "Selected {:?} -> {:?} vs {:?}", + i, + self.resolutions[i].threshold(), + volume, + ); + return i; } } - self.resolutions.len() + + debug!( + "Selected lowest resolution -> {:?} vs {:?}", + self.resolutions[self.lowest_resolution()].threshold(), + volume + ); + + self.lowest_resolution() } - fn find_resolution(&self, _scales: &[u64]) -> usize { - // FIXME: Implement stuff here! + fn find_resolution(&self, scale: &[u32]) -> usize { + for i in 0..self.resolutions.len() { + if scale <= self.resolutions[i].scale() { + debug!( + "Selected {:?} -> {:?} vs {:?}", + i, + self.resolutions[i].scale(), + scale + ); + + return i; + } + } + warn!( + "Scale factors {:?} not found, using lowest resolution: {:?}", + scale, + self.resolutions[self.lowest_resolution()].scale() + ); + self.lowest_resolution() } pub fn get_resolution( &self, threshold_volume: &Option, - resolution: &Option>, + resolution: &Option>, ) -> usize { - if let Some(threshold_volume) = threshold_volume { - self.default_resolution(*threshold_volume) - } else { - match resolution { - None => self.lowest_resolution(), - Some(v) => self.find_resolution(v), + // If a specific scale has been set, try to find it, otherwise use the + // threshold volume to figure a default value, and fall back to the most + // coarse resolution whenever nothing is specified. 
+ match resolution { + None => { + if let Some(threshold_volume) = threshold_volume { + self.default_resolution(*threshold_volume) + } else { + self.lowest_resolution() + } } + Some(v) => self.find_resolution(v), } } @@ -128,7 +326,7 @@ impl SpaceDB { &self, id: usize, threshold_volume: &Option, - resolution: &Option>, + resolution: &Option>, ) -> Result, String> { // Is that ID referenced in the current space? if let Ok(offset) = self.values.binary_search(&id.into()) { @@ -154,7 +352,7 @@ impl SpaceDB { &self, positions: &[Position], threshold_volume: &Option, - resolution: &Option>, + resolution: &Option>, ) -> Result, String> { let index = self.get_resolution(threshold_volume, resolution); @@ -174,7 +372,7 @@ impl SpaceDB { &self, shape: &Shape, threshold_volume: &Option, - resolution: &Option>, + resolution: &Option>, ) -> Result, String> { let index = self.get_resolution(threshold_volume, resolution); diff --git a/src/database/space_index.rs b/src/database/space_index.rs index a0ef72c..3e09823 100644 --- a/src/database/space_index.rs +++ b/src/database/space_index.rs @@ -6,7 +6,7 @@ use super::space::Position; use super::space::Shape; use super::SpaceId; -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug, Deserialize, Hash, Serialize)] pub struct SpaceSetObject { space_id: SpaceId, position: Position, @@ -34,6 +34,10 @@ impl SpaceSetObject { &self.position } + pub fn set_position(&mut self, pos: Position) { + self.position = pos; + } + pub fn value(&self) -> &Coordinate { &self.value } @@ -97,7 +101,7 @@ pub type SpaceSetIndex = ironsea_index_sfc_dbc::IndexOwned< #[derive(Clone, Debug, Deserialize, Serialize)] pub struct SpaceIndex { threshold_volume: f64, - // lookup_ rounds up, so reverse sort of the list on threasholds and check for last index. + // lookup_ rounds up, so reverse sort of the list on thresholds and check for last index. 
scale: Vec, index: SpaceSetIndex, } @@ -115,6 +119,10 @@ impl SpaceIndex { self.threshold_volume } + pub fn scale(&self) -> &Vec { + &self.scale + } + pub fn find(&self, key: &Position) -> Vec { self.index.find(key) } diff --git a/src/json/model.rs b/src/json/model.rs index 2a4ad9e..7484a1b 100644 --- a/src/json/model.rs +++ b/src/json/model.rs @@ -162,6 +162,8 @@ pub fn build_index( version: &str, spaces: &[space::Space], objects: &[SpatialObject], + scales: Option>>, + max_elements: Option, ) -> Core { let mut properties = vec![]; let mut space_set_objects = vec![]; @@ -210,5 +212,13 @@ pub fn build_index( object.set_value(value.into()); }); - Core::new(name, version, spaces, properties, space_set_objects) + Core::new( + name, + version, + spaces, + properties, + space_set_objects, + scales, + max_elements, + ) } diff --git a/src/json/storage.rs b/src/json/storage.rs index 2612678..9a75f1f 100644 --- a/src/json/storage.rs +++ b/src/json/storage.rs @@ -74,7 +74,12 @@ pub fn convert(name: &str) { from_json::>(&fn_in, &fn_out); } -pub fn build(name: &str, version: &str) { +pub fn build( + name: &str, + version: &str, + scales: Option>>, + max_elements: Option, +) { let fn_spaces = format!("{}.spaces.bin", name); let fn_objects = format!("{}.objects.bin", name); let fn_index = format!("{}.index", name); @@ -89,6 +94,8 @@ pub fn build(name: &str, version: &str) { version, &spaces, &load::>(&fn_objects), + scales, + max_elements, ); store((spaces, core), &fn_index); diff --git a/src/lib.rs b/src/lib.rs index a39dfd1..fab7dbe 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,9 @@ #[macro_use] extern crate lazy_static; +#[macro_use] +extern crate log; + #[macro_use] extern crate arrayref; diff --git a/src/main.rs b/src/main.rs index 99edd38..9570715 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,7 +22,7 @@ fn main() { // Build a Database Index: if true { info_time!("Building database index"); - storage::build("10k", "v0.1"); + storage::build("10k", "v0.1", None, 
None); } // Load a Database: @@ -40,6 +40,7 @@ fn main() { db: &db, output_space: None, threshold_volume: Some(std::f64::MAX), + view_port: &None, resolution: None, }; let r = core.get_by_id(&c, id).unwrap(); @@ -50,6 +51,7 @@ fn main() { db: &db, output_space: None, threshold_volume: Some(0.0), + view_port: &None, resolution: None, }; let r = core.get_by_id(&c, id).unwrap(); @@ -60,6 +62,7 @@ fn main() { db: &db, output_space: None, threshold_volume: Some(std::f64::MAX), + view_port: &None, resolution: None, }; let r = core.get_by_label(&c, id).unwrap(); @@ -77,6 +80,7 @@ fn main() { db: &db, output_space: None, threshold_volume: Some(0.0), + view_port: &None, resolution: None, }; let r = core.get_by_shape(&c, &shape, "std").unwrap();