From 5a555b65186dba6bc2d19bb40460009b6bb4f4a6 Mon Sep 17 00:00:00 2001 From: Lionel Sambuc Date: Tue, 27 Aug 2019 11:39:04 +0200 Subject: [PATCH] Initial commit --- Cargo.toml | 31 ++++ README.md | 37 ++++ src/cell_space.rs | 348 ++++++++++++++++++++++++++++++ src/lib.rs | 17 ++ src/morton.rs | 446 ++++++++++++++++++++++++++++++++++++++++++++++ src/sfc.rs | 388 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1267 insertions(+) create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 src/cell_space.rs create mode 100644 src/lib.rs create mode 100644 src/morton.rs create mode 100644 src/sfc.rs diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..86eb390 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "ironsea_index_sfc_dbc" +version = "0.1.0" +authors = ["EPFL-DIAS", "Lionel Sambuc "] + +edition = "2018" + +description = "Space-filling Curve over dictionary-based compression, index implementation for the Iron Sea database toolkit." 
+homepage = "https://crates.io/crates/ironsea_index_sfc_dbc" +repository = "https://github.com/epfl-dias/ironsea_index_sfc_dbc" +readme = "README.md" + +keywords = [] +categories = ["database-implementations", "data-structures"] + +license = "MIT" +#license-file = "LICENSE" + +include = ["Cargo.toml", "README.md", "LICENSE", "ACKNOWLEDGEMENTS", "src/**/*.rs"] + +[dependencies] +ironsea_index = "^0.1" +ironsea_table = "^0.1" +ironsea_store = "^0.1" + +arrayref = "^0.3" +log = { version = "^0.4", features = ["max_level_trace", "release_max_level_info"] } + +serde = "^1.0" +serde_derive = "^1.0" +bincode = "^1.1" diff --git a/README.md b/README.md new file mode 100644 index 0000000..8bd726b --- /dev/null +++ b/README.md @@ -0,0 +1,37 @@ +# Iron Sea - Index SFC DBC + +Index for the Iron Sea toolkit, based on a Space Filling Curve (SFC), over dictionary-based compression (DBC), which offers great +performance for both range queries over point cloud data and at the same time uses a storage-efficient index. + +More details in the paper: https://infoscience.epfl.ch/record/232536?ln=en + +## Iron Sea: Database Toolkit + +**Iron Sea** provides a set of database engine bricks, which can be combined and applied on arbitrary data structures. + +Unlike a traditional database, it does not assume a specific physical structure for the tables nor the records, but relies on the developer to provide a set of extractor functions which are used by the specific indices provided. + +This enables the index implementations to be agnostic to the underlying data structure, and reusable. + +## Requirements + +### Software + + * Rust: https://www.rust-lang.org + +## Documentation + +For more information, please refer to the [documentation](https://epfl-dias.github.io/ironsea_index_sfc_dbc/). 
+ +If you want to build the documentation and access it locally, you can use: + +```sh +cargo doc --open +``` + +## Acknowledgements + +This open source software code was developed in part or in whole in the +Human Brain Project, funded from the European Union’s Horizon 2020 +Framework Programme for Research and Innovation under the Specific Grant +Agreement No. 785907 (Human Brain Project SGA2). diff --git a/src/cell_space.rs b/src/cell_space.rs new file mode 100644 index 0000000..483849d --- /dev/null +++ b/src/cell_space.rs @@ -0,0 +1,348 @@ +use std::fmt::Debug; +use std::marker; +use std::ops::Index; + +use ironsea_index::Record; +use ironsea_table::Table; + +type Cell = Vec; + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct CellDictionary { + table: Vec>, + max_offset: usize, + _marker: marker::PhantomData<(K)>, +} + +impl CellDictionary +where + V: Clone + Ord + Debug, + K: Debug + Index, +{ + pub fn new(table: &T, dimension: usize, cell_bits: usize) -> Self + where + T: Table, + R: Record + Debug, + { + // Do not forget to initialise cells[0] + let mut cells: Vec> = vec![vec![]]; + + // 1. Retrieve a list of distinct values for the coordinate `dimension` + let mut distinct = vec![]; + let records = table.get_table(); + + for record in records { + distinct.push(record.key()[dimension].clone()); + } + + // 2. Build sorted, distinct lists + distinct.sort_unstable(); + distinct.dedup(); + + info!( + "Number of distinct coordinates on dim[{}]: {}", + dimension, + distinct.len() + ); + + trace!("min {:?}, max {:?}", distinct[0], distinct.last()); + + // 3. Build the dictionary space + // 3.1. Build dictionnary per dimension, Add cell and offset + // informations + let mut count = 0; + let mut cell = 0; + + // Beware integer division is rounded towards zero, so add 1 to the + // result as this is the max number of elements per bucket. 
+ let max_offset = (distinct.len() / (1 << cell_bits)) + 1; + + for coordinate in distinct { + //trace!("{:?} {:?} {:?} {:?}", dimension, coordinate, cell, count); + + if count == max_offset { + count = 0; + cell += 1; + cells.push(vec![]); + } + + cells[cell].push(coordinate); + count += 1; + } + + info!( + "dim[{}]: {} cells, {} max per cell", + dimension, + cells.len(), + max_offset, + ); + + CellDictionary { + table: cells, + max_offset, + _marker: marker::PhantomData, + } + } + + fn max_offset(&self) -> usize { + self.max_offset + } + + fn cells(&self) -> &Vec> { + &self.table + } + + fn cell_id(&self, position: &V) -> Option + where + V: Clone + Ord + Debug, + { + let mut id = 0; + // If the last value of the current cell is >= than the value, then + // the value is stored in the cell. + // If this is the first cell, we will look into it as `id` is + // still 0. + for cell in self.cells() { + // last cell is likely to be only partially full + match cell.last() { + Some(x) => { + if x >= position { + break; + } + } + None => break, + }; + id += 1; + } + + if id >= self.cells().len() { + None + } else { + Some(id) + } + } + + fn key(&self, position: &V) -> Option<(usize, usize)> { + let mut result = None; + if let Some(id) = self.cell_id(position) { + if let Ok(offset) = self.table[id].binary_search(position) { + result = Some((id, offset)); + } + } + + result + } + + fn key_down(&self, position: &V) -> (usize, usize) { + match self.cell_id(position) { + Some(id) => match self.table[id].binary_search(position) { + Ok(offset) => (id, offset), + Err(offset) => { + if offset > 0 { + (id, offset - 1) + } else if id == 0 { + (0, 0) + } else { + let id = id - 1; + (id, self.table[id].len() - 1) + } + } + }, + None => self.last(), + } + } + + fn last(&self) -> (usize, usize) { + let last_id = self.table.len() - 1; + let last_offset = self.table[last_id].len() - 1; + + (last_id, last_offset) + } + + fn key_up(&self, position: &V) -> (usize, usize) { + match 
self.cell_id(position) { + Some(id) => match self.table[id].binary_search(position) { + Ok(offset) => (id, offset), + Err(offset) => { + if offset < self.max_offset { + (id, offset) + } else if id < self.table.len() { + (id + 1, 0) + } else { + self.last() + } + } + }, + None => self.last(), + } + } + + fn value(&self, cell_id: usize, offset: usize) -> V { + self.table[cell_id][offset].clone() + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct CellSpace { + dimensions: usize, + coordinates: Vec>, + coordinates_max_offsets: Vec, +} + +impl CellSpace +where + V: Clone + Ord + Debug, + K: Debug + Index, +{ + pub fn new(table: &T, dimensions: usize, cell_bits: usize) -> Self + where + T: Table, + R: Record + Debug, + V: Clone + Ord + Debug, + { + let mut space = CellSpace { + dimensions, + coordinates: vec![], + coordinates_max_offsets: vec![], + }; + + // FIXME: Add check to ensure all positions have the required number of dimensions. + for k in 0..dimensions { + let dic = CellDictionary::new(table, k, cell_bits); + let max = dic.max_offset(); + space.coordinates.push(dic); + space.coordinates_max_offsets.push(max); + } + + space + } + + /* + pub fn cells_id(&self, position: &Vec) -> Result>, String> { + trace!("cells_id: position {:?}", position); + //TODO: Should we check inside each objects, or just assume it is correct and/or rely on the bound checks? + if self.dimensions != position.len() { + return Err(format!( + "Incorrect number of dimensions, expected {}, got {} for {:?}", + self.dimensions, + position.len(), + position + )); + } + + let mut cells = vec![]; + for k in 0..self.dimensions { + cells.push(self.coordinates[k].cell_id(&position[k])); + } + trace!("cells_id: cells {:?}", cells); + Ok(cells) + } + */ + pub fn key(&self, position: &K) -> Result<(Vec, Vec), String> { + //TODO: Should we check inside each objects, or just assume it is correct and/or rely on the bound checks? 
+ /* This impose to require ExactSizeIterator, which is not implemented on Vec, and can't be in any easy way. + if self.dimensions != position.len() { + return Err(format!( + "Incorrect number of dimensions, expected {}, got {} for {:?}", + self.dimensions, + position.len(), + position + )); + }*/ + + let mut cells = vec![]; + let mut offsets = vec![]; + for k in 0..self.dimensions { + match self.coordinates[k].key(&position[k]) { + None => { + return Err(format!( + "Incorrect value for position[{:?}]: {:?}", + k, &position[k] + )) + } + Some((id, offset)) => { + cells.push(id); + offsets.push(offset) + } + }; + } + + Ok((cells, offsets)) + } + + // Round down to the preceding element or self if in the space + pub fn key_down(&self, position: &K) -> Result<(Vec, Vec), String> { + //TODO: Should we check inside each objects, or just assume it is correct and/or rely on the bound checks? + /* This impose to require ExactSizeIterator, which is not implemented on Vec, and can't be in any easy way. + if self.dimensions != position.len() { + return Err(format!( + "Incorrect number of dimensions, expected {}, got {} for {:?}", + self.dimensions, + position.len(), + position + )); + }*/ + + let mut cells = vec![]; + let mut offsets = vec![]; + for k in 0..self.dimensions { + let (id, offset) = self.coordinates[k].key_down(&position[k]); + cells.push(id); + offsets.push(offset); + } + + Ok((cells, offsets)) + } + + // Round up to the next element or self if in the space + pub fn key_up(&self, position: &K) -> Result<(Vec, Vec), String> { + //TODO: Should we check inside each objects, or just assume it is correct and/or rely on the bound checks? + /* This impose to require ExactSizeIterator, which is not implemented on Vec, and can't be in any easy way. 
+ if self.dimensions != position.len() { + return Err(format!( + "Incorrect number of dimensions, expected {}, got {} for {:?}", + self.dimensions, + position.len(), + position + )); + }*/ + + let mut cells = vec![]; + let mut offsets = vec![]; + for k in 0..self.dimensions { + let (id, offset) = self.coordinates[k].key_up(&position[k]); + cells.push(id); + offsets.push(offset); + } + + Ok((cells, offsets)) + } + + pub fn value(&self, cells_id: Vec, offsets: Vec) -> Result, String> { + //TODO: Should we check inside each objects, or just assume it is correct and/or rely on the bound checks? + if self.dimensions != cells_id.len() { + return Err(format!( + "Incorrect number of dimensions, expected {}, got {} for {:?}", + self.dimensions, + cells_id.len(), + cells_id + )); + } + + //TODO: Should we check inside each objects, or just assume it is correct and/or rely on the bound checks? + if self.dimensions != offsets.len() { + return Err(format!( + "Incorrect number of dimensions, expected {}, got {} for {:?}", + self.dimensions, + offsets.len(), + offsets + )); + } + + let mut values = vec![]; + for k in 0..self.dimensions { + values.push(self.coordinates[k].value(cells_id[k], offsets[k])); + } + + Ok(values) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..fb833b3 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,17 @@ +#[macro_use] +extern crate log; + +#[macro_use] +extern crate arrayref; + +#[macro_use] +extern crate serde_derive; + +mod cell_space; +mod morton; +mod sfc; + +pub use sfc::Record; +pub use sfc::RecordBuild; +pub use sfc::RecordFields; +pub use sfc::SpaceFillingCurve as IndexOwned; diff --git a/src/morton.rs b/src/morton.rs new file mode 100644 index 0000000..e72ba7b --- /dev/null +++ b/src/morton.rs @@ -0,0 +1,446 @@ +use std::fmt; +use std::fmt::Debug; + +use serde::de; +use serde::de::Deserialize; +use serde::de::Deserializer; +use serde::de::MapAccess; +use serde::de::SeqAccess; +use serde::de::Visitor; +use 
serde::ser::Serialize; +use serde::ser::SerializeStruct; +use serde::ser::Serializer; + +pub type MortonCode = u32; +pub type MortonValue = u16; + +const MORTON_CODE_BITS: usize = 32; +const MORTON_VALUE_BITS: usize = 10; +const MORTON_MAX_VALUES: usize = 1024; + +#[derive(Clone)] +pub struct MortonEncoder { + cell_bits: usize, + cell_mask: usize, + dimensions: usize, + table: Vec<[MortonCode; MORTON_MAX_VALUES]>, +} + +impl Debug for MortonEncoder { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "MortonEncoder {{ cell_bits: {}, cell_mask: {}, dimensions: {}, table: ", + self.cell_bits, self.cell_mask, self.dimensions + )?; + write!(f, "[ ")?; + for k in &self.table { + write!(f, "[ ")?; + for v in k.iter() { + write!(f, "{}, ", v)?; + } + write!(f, "], ")?; + } + write!(f, "] }}") + } +} + +impl MortonEncoder { + pub fn new(dimensions: usize, cell_bits: usize) -> Self { + // Make sure we can store the encoding in a single T. + // Don't know how to make that test generically + assert!(MORTON_VALUE_BITS >= cell_bits); + assert!(MORTON_CODE_BITS >= cell_bits * dimensions); + + //let mut masks = vec![]; + let mut table = vec![]; + let cell_max = 1 << cell_bits; + let cell_mask = cell_max - 1; + + // Build lookup table & masks + for k in 0..dimensions { + table.push([0; MORTON_MAX_VALUES]); + for i in 0..cell_max { + let mut v = 0; + for p in 0..cell_bits { + // Note: bit is at position p, so shift it only K-1 p position again below, instead + // of K times + let bit = i & (1 << p); + let new_bit = bit << (p * (dimensions - 1) + k); + v |= new_bit; + } + table[k][i] = v as MortonCode; + } + /* + let mut v = 0usize; + for p in 0..cell_bits { + let new_bit = 1 << p * (dimensions - 1) + k; + v = v | new_bit; + } + masks.push(v as MortonCode); + */ + } + + MortonEncoder { + cell_bits, + cell_mask, + dimensions, + table, + //masks, + } + } + + fn encode_1(&self, k: usize, v: MortonValue) -> MortonCode { + // Already done by the array bound checker 
anyway + //assert!((v as usize) < MORTON_MAX_VALUES); + //assert!(k < self.table.len()); + + // Ensure we only have valid values in inputs, even when less bits than + // the maximum is used to define those values. + let v = v as usize & self.cell_mask; + self.table[k][v] + } + + fn decode_1(&self, k: usize, code: MortonCode) -> MortonValue { + // Already done by the array bound checker anyway + //assert!(k < self.table.len()); + + let mut v = 0; + + for i in 0..self.cell_bits { + let bit_pos = i * self.table.len() + k; + let bit = code as usize & (1 << bit_pos); + let bit_pos = bit_pos - i; + v |= (bit >> bit_pos) as MortonValue; + } + + v as MortonValue + } + + pub fn encode(&self, v: &[MortonValue]) -> Result { + //TODO: Should we check inside each objects, or just assume it is correct and/or rely on the bound checks? + if self.dimensions != v.len() { + return Err(format!( + "Incorrect number of dimensions, expected {}, got {} for {:?}", + self.dimensions, + v.len(), + v + )); + } + + let mut code = 0; + + for (k, i) in v.iter().enumerate().take(self.dimensions) { + code |= self.encode_1(k, *i); + } + + Ok(code) + } + + pub fn decode(&self, code: MortonCode) -> Vec { + let mut values = vec![]; + + for k in 0..self.dimensions { + values.push(self.decode_1(k, code)); + } + + values + } +} + +impl Serialize for MortonEncoder { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + // We serialize the minimum amount of information necessary to + // deserialize the table. 
+ // This is the parameters to init(dimensions, cell_bits) + let mut state = serializer.serialize_struct("MortonEncoder", 2)?; + state.serialize_field("cell_bits", &self.cell_bits)?; + state.serialize_field("dimensions", &self.dimensions)?; + state.end() + } +} + +impl<'de> Deserialize<'de> for MortonEncoder { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + enum Field { + CellBits, + Dimensions, + }; + + impl<'de> Deserialize<'de> for Field { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct FieldVisitor; + + impl<'de> Visitor<'de> for FieldVisitor { + type Value = Field; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("`cell_bits` or `dimensions`") + } + + fn visit_str(self, value: &str) -> Result + where + E: de::Error, + { + match value { + "cell_bits" => Ok(Field::CellBits), + "dimensions" => Ok(Field::Dimensions), + _ => Err(de::Error::unknown_field(value, FIELDS)), + } + } + } + + deserializer.deserialize_identifier(FieldVisitor) + } + } + + struct MortonEncoderVisitor; + + impl<'de> Visitor<'de> for MortonEncoderVisitor { + type Value = MortonEncoder; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct MortonEncoder") + } + + fn visit_seq(self, mut seq: V) -> Result + where + V: SeqAccess<'de>, + { + let cell_bits = seq + .next_element()? + .ok_or_else(|| de::Error::invalid_length(0, &self))?; + let dimensions = seq + .next_element()? + .ok_or_else(|| de::Error::invalid_length(1, &self))?; + Ok(MortonEncoder::new(dimensions, cell_bits)) + } + + fn visit_map(self, mut map: V) -> Result + where + V: MapAccess<'de>, + { + let mut cell_bits = None; + let mut dimensions = None; + while let Some(key) = map.next_key()? 
{ + match key { + Field::CellBits => { + if cell_bits.is_some() { + return Err(de::Error::duplicate_field("cell_bits")); + } + cell_bits = Some(map.next_value()?); + } + Field::Dimensions => { + if dimensions.is_some() { + return Err(de::Error::duplicate_field("dimensions")); + } + dimensions = Some(map.next_value()?); + } + } + } + let cell_bits = cell_bits.ok_or_else(|| de::Error::missing_field("cell_bits"))?; + let dimensions = + dimensions.ok_or_else(|| de::Error::missing_field("dimensions"))?; + Ok(MortonEncoder::new(dimensions, cell_bits)) + } + } + + const FIELDS: &[&str] = &["cell_bits", "dimensions"]; + deserializer.deserialize_struct("MortonEncoder", FIELDS, MortonEncoderVisitor) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + mod init { + use super::*; + + /* Check the assertions */ + #[test] + #[should_panic] + fn dim1_bit32() { + let _m = MortonEncoder::new(1, 31); + } + + #[test] + #[should_panic] + fn dim2_bit16() { + // Max 10 bit for the codes, even if 16 would fit + let _m = MortonEncoder::new(2, 16); + } + + #[test] + #[should_panic] + fn dim33_bit1() { + let _m = MortonEncoder::new(33, 1); + } + + #[test] + #[should_panic] + fn dim17_bit2() { + let _m = MortonEncoder::new(17, 2); + } + + #[test] + fn dim1_bit10() { + let _m = MortonEncoder::new(1, 10); + } + + #[test] + fn dim2_bit10() { + let _m = MortonEncoder::new(2, 10); + } + + #[test] + fn dim3_bit10() { + let _m = MortonEncoder::new(3, 10); + } + + #[test] + fn dim4_bit8() { + let _m = MortonEncoder::new(4, 8); + } + + #[test] + fn dim32_bit1() { + let _m = MortonEncoder::new(32, 1); + } + + /* + morton_init(); + // Morton table looks OK + // for n in 0..10 { + // println!("{:4}", n); + // for k in 0..K { + // println!("{:032b}", unsafe {MORTON[k][n]}); + // } + // } + + for n in 0..CELL_MAX { + println!("## {:04}", n); + let mut c = 0 as Code; + for k in 0..K { + // check diagonal + c = c | morton_encode(k, n as u16); + } + let f = n as u16; + for k in 1..2 { + // check diagonal 
+ let p = morton_decode(k, c); + println!("\n{:04} \n f {:04}\n p {:04}\n 𝚫 {:06}\n", c, f, p, f-p); + + } + } + + + let mut f = 0.0f64; + // while f < 1.0 { + // let v = convert_to_fixed(&f); + // let p = convert_to_f64(&v); + // println!("\n{:010} \n f {:+0.16e}\n p {:+03.16e}\n 𝚫 {:+03.16e}\n", v, f, p, f - p); + // + // f += 0.1e-1; + // } + + let f =0.000724939184752; + let v = convert_to_fixed(&f); + let p = convert_to_f64(&v); + println!("\n{:010} \n f {:+0.16e}\n p {:+03.16e}\n 𝚫 {:+03.16e}\n", v, f, p, f - p); + + */ + } + + mod encode { + use super::*; + + /* Check the lookup table produced */ + #[test] + fn dim1_bit10() { + let m = MortonEncoder::new(1, 10); + for n in 0..MORTON_MAX_VALUES { + assert_eq!(n as MortonCode, m.encode_1(0, n as MortonValue)); + } + } + + #[test] + fn table_dim2_bit10() { + let m = MortonEncoder::new(2, 10); + let mut lookup = Vec::>::new(); + + for k in 0..2 { + lookup.push(Vec::new()); + + for n in 0..MORTON_MAX_VALUES { + // Morton numbers are number where the bit are exploded so that we can + // interleave them. This means that for each position of a value, we need to + // insert dimensions - 1 columns between each bits, and shift that result by the + // dimension number so that we can OR all the dimensions together without having + // bits colliding. + let mut v = 0; + for p in 0..MORTON_VALUE_BITS { + let b = (n & (1 << p)) >> p; + v = v | b << (p * 2 + k); + } + lookup[k].push(v as MortonCode); + } + } + + for k in 0..2 { + for n in 0..MORTON_MAX_VALUES { + assert_eq!(lookup[k][n], m.encode_1(k, n as MortonValue)); + } + } + } + + fn check(dimensions: usize, value_max: usize, value_bits: usize, m: MortonEncoder) -> () { + let mut lookup = Vec::>::new(); + + for k in 0..dimensions { + lookup.push(Vec::new()); + + for n in 0..value_max { + // Morton numbers are number where the bit are exploded so that we can + // interleave them. 
This means that for each position of a value, we need to + // insert dimensions -1 columns between each bits, and shift that result by the + // dimension number so that we can OR all the dimensions together without having + // bits colliding. + let mut v = 0; + for p in 0..value_bits { + let b = (n & (1 << p)) >> p; + v = v | b << (p * dimensions + k); + } + lookup[k].push(v as MortonCode); + } + } + + for k in 0..dimensions { + for n in 0..value_max { + assert_eq!(lookup[k][n], m.encode_1(k, n as MortonValue)); + } + } + } + + #[test] + fn table_dim3_bit10() { + let m = MortonEncoder::new(3, 10); + check(3, 1024, 10, m); + } + + #[test] + fn table_dim4_bit8() { + let m = MortonEncoder::new(4, 8); + check(4, 256, 8, m); + } + } +} diff --git a/src/sfc.rs b/src/sfc.rs new file mode 100644 index 0000000..9d26eab --- /dev/null +++ b/src/sfc.rs @@ -0,0 +1,388 @@ +use std::fmt::Debug; +use std::io; +use std::iter::FromIterator; +use std::marker; +use std::ops::Index; + +use serde::de::DeserializeOwned; +use serde::Serialize; + +pub use ironsea_index::IndexedOwned; +pub use ironsea_index::Record; +pub use ironsea_index::RecordBuild; +pub use ironsea_index::RecordFields; +use ironsea_store::Load; +use ironsea_store::Store; +use ironsea_table::Table; + +use super::cell_space::CellSpace; +use super::morton::MortonCode; +use super::morton::MortonEncoder; +use super::morton::MortonValue; + +type SFCCode = u32; +type SFCOffset = u32; + +//FIXME: Remove the need for a constant, how can we make it type-checked instead? +// type-num crate? 
+const MAX_K: usize = 3; + +#[derive(Debug)] +struct Limit { + idx: usize, + position: Vec, +} + +#[derive(Debug)] +struct Limits { + start: Limit, + end: Limit, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct SFCRecord { + //FIXME: Find a way around hardcoding MAX_K + offsets: [SFCOffset; MAX_K], + fields: F, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct SFCCell { + code: MortonCode, + records: Vec>, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct SpaceFillingCurve +where + T: Table, + R: Record + RecordFields + RecordBuild + Debug, + // K: Debug + ExactSizeIterator + Index + FromIterator, + V: Clone + Ord + Debug + From, +{ + dimensions: usize, + morton: MortonEncoder, + space: CellSpace, + index: Vec>, + _marker: marker::PhantomData<(T, R)>, +} + +impl SpaceFillingCurve +where + T: Table, + R: Record + RecordFields + RecordBuild + Debug, + V: Clone + Ord + Debug + From, + K: Debug + Index + FromIterator, +{ + //FIXME: Should accept indexing 0 elements, at least not crash! + pub fn new(table: &T, dimensions: usize, cell_bits: usize) -> Self { + // 1. build the dictionnary space, called here CellSpace, as well as + // initialize the morton encoder used to project the multi-dimensional + // coordinates into a single dimension. + let mut index = SpaceFillingCurve { + dimensions, + morton: MortonEncoder::new(dimensions, cell_bits), + space: CellSpace::new(table, dimensions, cell_bits), + index: vec![], + _marker: marker::PhantomData, + }; + + // 2. 
Build a flat table of (code, offset, entries) + let mut flat_table = vec![]; + + for record in table.get_table() { + let position = record.key(); + match index.space.key(&position) { + Ok((cell_ids, offsets)) => match index.encode(&cell_ids) { + Ok(code) => { + let offsets = offsets.iter().map(|i| *i as SFCOffset).collect::>(); + flat_table.push(( + code, + SFCRecord { + offsets: *array_ref!(offsets, 0, MAX_K), + fields: record.fields(), + }, + )) + } + Err(e) => error!("Unable to encode position {:#?}: {}", cell_ids, e), + }, + Err(e) => error!("Invalid position {:#?}: {}", position, e), + } + } + + debug!( + "Processed {:#?} records into the index", + table.get_table().len() + ); + + // 5. Sort by SFCcode + flat_table.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + let nb_records = flat_table.len(); + + let mut current_cell_code = flat_table[0].0; + let mut count = 0; + index.index.push(SFCCell { + code: current_cell_code, + records: vec![], + }); + for (code, record) in flat_table { + if code == current_cell_code { + index.index[count].records.push(record); + } else { + index.index.push(SFCCell { + code, + records: vec![record], + }); + current_cell_code = code; + count += 1; + } + } + debug!("Inserted {:#?} records into the index", nb_records); + + index + } + + pub fn find_by_value(&self, value: &F) -> Vec + where + F: std::cmp::PartialEq, + { + let mut results = vec![]; + for cell in &self.index { + for record in &cell.records { + if &record.fields == value { + if let Ok(r) = self.get_record(cell.code, &record) { + results.push(r); + } + } + } + } + + results + } + + // Map the cell_ids of a point to its SFCcode + #[inline] + fn encode(&self, cell_ids: &[usize]) -> Result { + let mut t = vec![]; + for v in cell_ids.iter() { + t.push(*v as MortonValue); + } + + self.morton.encode(&t) + } + + // Build coordinate values from encoded value + fn position(&self, code: SFCCode, offsets: &[SFCOffset]) -> Result { + let position = self.space.value( + self.morton + 
.decode(code) + .iter() + .map(|e| *e as usize) + .collect(), + offsets.iter().map(|e| *e as usize).collect(), + )?; + + Ok(position.iter().map(|i| (*i).clone()).collect()) + } + + // Rebuild a specific record + fn get_record(&self, code: SFCCode, entry: &SFCRecord) -> Result { + let position = self.position(code, &entry.offsets)?; + + Ok(R::build(&position, &entry.fields)) + } + + fn limits(&self, start: &K, end: &K) -> Result, String> { + trace!("limits: {:?} - {:?}", start, end); + + // Round down if not found, for start of range: + let (cells, offsets) = self.space.key_down(start)?; + let code = self.encode(&cells)?; + let idx = match self.index.binary_search_by(|e| e.code.cmp(&code)) { + Err(e) => { + if e > 0 { + e - 1 + } else { + 0 + } + } + Ok(c) => c, + }; + let position = self.space.value(cells, offsets)?; + let start = Limit { idx, position }; + + // Round up if not found, for end of range: + let (cells, offsets) = self.space.key_up(end)?; + let code = self.encode(&cells)?; + let idx = match self.index.binary_search_by(|e| e.code.cmp(&code)) { + Err(e) => { + if e >= self.index.len() { + self.index.len() + } else { + e + } + } + Ok(c) => c + 1, + }; + + let position = self.space.value(cells, offsets)?; + let end = Limit { idx, position }; + + trace!("limits: {:?} - {:?}", start, end); + + Ok(Limits { start, end }) + } +} + +impl IndexedOwned for SpaceFillingCurve +where + T: Table, + R: Record + RecordFields + RecordBuild + Debug, + K: Debug + Index + FromIterator, + V: Clone + Debug + Ord + From + Debug, +{ + fn find(&self, key: &K) -> Vec { + let mut values = vec![]; + + if let Ok((cell_ids, offsets)) = self.space.key(key) { + match self.encode(&cell_ids) { + Err(e) => error!("{}", e), + Ok(code) => { + if let Ok(cell) = self.index.binary_search_by(|a| a.code.cmp(&code)) { + for record in &self.index[cell].records { + let mut select = true; + for (k, o) in offsets.iter().enumerate().take(self.dimensions) { + select &= record.offsets[k] == (*o as 
SFCOffset); + } + + if select { + match self.get_record(code, record) { + Err(e) => error!("{}", e), + Ok(r) => values.push(r), + } + } + } + } + } + } + } + + values + } + + fn find_range(&self, start: &K, end: &K) -> Vec { + let mut values = vec![]; + + match self.limits(start, end) { + Ok(limits) => { + for idx in limits.start.idx..limits.end.idx { + let code = self.index[idx].code; + for record in &self.index[idx].records { + let mut select = true; + let pos = match self.position(code, &record.offsets) { + Err(e) => { + error!("{}", e); + continue; + } + Ok(p) => p, + }; + + // FIXME: Reduce number of comparison by using the cells boundaries. + for k in 0..self.dimensions { + select = select + && limits.start.position[k] <= pos[k] + && limits.end.position[k] >= pos[k]; + } + if select { + match self.get_record(code, &record) { + Err(e) => error!("{}", e), + Ok(r) => values.push(r), + } + } + } + } + } + Err(e) => error!("find_range: limits failed: {}", e), + }; + + values + } +} +// Rough check, based on per-dimension cell Ids. +/* + // If the cell_ids are between ]pos_start and pos_end[, then the value is within the range, + // If the cell_ids are outside [pos_start, pos_end], then the value is out, stop checking + // Else, check the offsets of each entry to be within [off_start, off_end], then the value is within the range. + let mut rough_in = true; + for k in 0..self.dimensions { + if !(cells[k] > start_limits.cells[k] && cells[k] < end_limits.cells[k]) { + rough_in = false; + } + } + + if rough_in { + // This is a cell well within the volume, so all points are a match, add all points, + // go to next cell. + for entry in entries { + values.push(self.get_element(code, entry)) + } + + continue; + } + + let mut rough_out = false; + for k in 0..self.dimensions { + if cells[k] < start_limits.cells[k] || cells[k] > end_limits.cells[k] { + rough_out = false; + } + } + + // If rough is not true, then we have nothing to double check. 
+ if rough_out { + continue; + } +*/ + +impl Store for SpaceFillingCurve +where + T: Table, + R: Record + RecordFields + RecordBuild + Debug, + // K: Debug + ExactSizeIterator + Index + FromIterator, + K: Serialize, + V: Clone + Ord + Debug + From + Serialize, + F: Serialize, +{ + fn store(&mut self, writer: W) -> io::Result<()> + where + W: std::io::Write, + { + match bincode::serialize_into(writer, &self) { + Ok(_) => Ok(()), + Err(e) => Err(io::Error::new(io::ErrorKind::WriteZero, e)), + } + } +} + +impl Load for SpaceFillingCurve +where + T: Table, + R: Record + RecordFields + RecordBuild + Debug, + K: DeserializeOwned, + V: Clone + Ord + Debug + From + DeserializeOwned, + F: DeserializeOwned, +{ + fn load(reader: Re) -> io::Result { + match bincode::deserialize_from(reader) { + Ok(data) => Ok(data), + Err(e) => Err(io::Error::new(io::ErrorKind::InvalidData, e)), + } + } + + // only required for store_mapped_file + fn load_slice(from: &[u8]) -> io::Result { + match bincode::deserialize(from) { + Ok(data) => Ok(data), + Err(e) => Err(io::Error::new(io::ErrorKind::InvalidData, e)), + } + } +}