From ef76bcfe9cbb9c7d1fb1c0c788579b33c89517e3 Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Sun, 3 Aug 2025 14:09:55 -0500 Subject: [PATCH 1/9] use new CI scripts --- .github/workflows/ci.yml | 69 ++++++++++++++++++++++++++++++++++++ .github/workflows/lints.yml | 55 ---------------------------- .github/workflows/no-std.yml | 26 -------------- .github/workflows/tests.yml | 28 --------------- 4 files changed, 69 insertions(+), 109 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .github/workflows/lints.yml delete mode 100644 .github/workflows/no-std.yml delete mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..2e01e1a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,69 @@ +name: ci +on: + push: + branches: + - main + pull_request: + +jobs: + lints: + name: lints + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Install beta toolchain + uses: dtolnay/rust-toolchain@beta + with: + components: rustfmt, clippy + + - name: Set up cache + uses: Swatinem/rust-cache@v2 + with: + cache-on-failure: true + + - name: Run cargo fmt + run: cargo fmt --all -- --check + + - name: Run cargo clippy + run: cargo clippy --all-targets --tests -- -D warnings + + no_std: + name: no_std + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Install beta toolchain for ARM + uses: dtolnay/rust-toolchain@beta + with: + targets: armv7a-none-eabi + + - name: Set up cache + uses: Swatinem/rust-cache@v2 + with: + cache-on-failure: true + + - name: Build binary for armv7a-none-eabi + run: cargo rustc --target=armv7a-none-eabi --manifest-path=ensure_no_std/Cargo.toml + + tests: + name: tests + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Install beta toolchain + uses: dtolnay/rust-toolchain@beta + + - name: Set up cache + 
uses: Swatinem/rust-cache@v2 + with: + cache-on-failure: true + + - name: Run cargo test + #run: cargo test --all-features --no-fail-fast --locked --workspace -- --nocapture + run: cargo test --all-features --no-fail-fast --workspace -- --nocapture diff --git a/.github/workflows/lints.yml b/.github/workflows/lints.yml deleted file mode 100644 index 01845b3..0000000 --- a/.github/workflows/lints.yml +++ /dev/null @@ -1,55 +0,0 @@ -on: - push: - branches: - - main - pull_request: - -name: lints - -jobs: - rustfmt: - name: rustfmt - runs-on: ubuntu-latest - steps: - - name: Checkout sources - uses: actions/checkout@v2 - - - name: Install nightly toolchain - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: nightly - override: true - components: rustfmt - - - name: Run cargo fmt - uses: actions-rs/cargo@v1 - with: - command: fmt - args: --all -- --check - clippy: - name: clippy - runs-on: ubuntu-latest - strategy: - matrix: - features: - - --all-features - - --no-default-features --features alloc - - --no-default-features - steps: - - name: Checkout sources - uses: actions/checkout@v2 - - - name: Install nightly toolchain - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: nightly - override: true - components: clippy - - - name: Run cargo clippy - uses: actions-rs/cargo@v1 - with: - command: clippy - args: ${{ matrix.features }} -- -D warnings \ No newline at end of file diff --git a/.github/workflows/no-std.yml b/.github/workflows/no-std.yml deleted file mode 100644 index 27537fe..0000000 --- a/.github/workflows/no-std.yml +++ /dev/null @@ -1,26 +0,0 @@ -# This builds for armv7a-none-eabi to ensure we can build with no-std. -# It will fail if there is a dependency on std, as armv7a-none-eabi has no std. 
- -on: - push: - branches: - - main - pull_request: - -name: no-std - -jobs: - build: - name: no-std - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - with: - toolchain: nightly - target: armv7a-none-eabi - override: true - - uses: actions-rs/cargo@v1 - with: - command: rustc - args: --target=armv7a-none-eabi --manifest-path=ensure_no_std/Cargo.toml \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml deleted file mode 100644 index d003abe..0000000 --- a/.github/workflows/tests.yml +++ /dev/null @@ -1,28 +0,0 @@ -on: - push: - branches: - - main - pull_request: - -name: tests - -jobs: - tests: - name: tests - runs-on: ubuntu-latest - steps: - - name: Checkout sources - uses: actions/checkout@v2 - - - name: Install nightly toolchain - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: nightly - override: true - - - name: Run cargo test - uses: actions-rs/cargo@v1 - with: - command: test - args: --all-features \ No newline at end of file From 1727ccc3f997af6dd3eff99bb73685fd22b029d0 Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Sun, 3 Aug 2025 14:10:47 -0500 Subject: [PATCH 2/9] remove config; update edition; fix lints --- .cargo/config | 2 -- Cargo.toml | 7 +++++-- src/lib.rs | 2 +- tests/linear_knn.rs | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) delete mode 100644 .cargo/config diff --git a/.cargo/config b/.cargo/config deleted file mode 100644 index d5135e9..0000000 --- a/.cargo/config +++ /dev/null @@ -1,2 +0,0 @@ -[build] -rustflags = ["-C", "target-cpu=native"] \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 1f6649a..877cf57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,11 @@ [package] name = "space" version = "0.18.1-alpha.0" -authors = ["Geordon Worley ", "Yuhan Liin "] -edition = "2018" +authors = [ + "Geordon Worley ", + "Yuhan Liin ", +] +edition = "2024" description = "A library providing 
abstractions for spatial datastructures and search" documentation = "https://docs.rs/space/" repository = "https://github.com/rust-cv/space" diff --git a/src/lib.rs b/src/lib.rs index 6581810..cd2db50 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -257,7 +257,7 @@ where } #[cfg(feature = "alloc")] -impl<'a, M, I> KnnFromMetricAndBatch for LinearKnn +impl KnnFromMetricAndBatch for LinearKnn where M: Default, { diff --git a/tests/linear_knn.rs b/tests/linear_knn.rs index c8a352f..c94626b 100644 --- a/tests/linear_knn.rs +++ b/tests/linear_knn.rs @@ -13,7 +13,7 @@ impl Metric for Hamming { #[test] fn test_linear_knn() { - let data = vec![ + let data = [ (0b1010_1010, 12), (0b1111_1111, 13), (0b0000_0000, 14), From cf27bdfef2e8a28ca9719d3542be3e8b95f02d9a Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Sun, 3 Aug 2025 14:11:01 -0500 Subject: [PATCH 3/9] bump version to v0.19.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 877cf57..e501ee6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "space" -version = "0.18.1-alpha.0" +version = "0.19.0" authors = [ "Geordon Worley ", "Yuhan Liin ", From 8dd34cfbe3ee5e856eee588f0749b6a1d91f0eca Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Sun, 3 Aug 2025 14:18:46 -0500 Subject: [PATCH 4/9] fix bench for new edition --- benches/knn.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benches/knn.rs b/benches/knn.rs index a39bb31..eaeca02 100644 --- a/benches/knn.rs +++ b/benches/knn.rs @@ -1,17 +1,17 @@ -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use rand_core::{RngCore, SeedableRng}; use rand_pcg::Pcg64; use space::{Bits512, Knn, MetricPoint}; fn criterion_benchmark(c: &mut Criterion) { let mut rng = Pcg64::from_seed([1; 32]); - let mut gen = || { + let mut generator = || { let mut feature 
= Bits512([0; 64]); rng.fill_bytes(&mut *feature); feature }; - let search = gen(); - let data = (0..16384).map(|_| gen()).collect::>(); + let search = generator(); + let data = (0..16384).map(|_| generator()).collect::>(); c.bench_function("space: 4-nn in 16384", |b| { b.iter(|| space::LinearKnn(data.iter()).knn(&search, 4).len()) }) From 4e19a55c4ac508f40128740020a8384f7db8c923 Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Sun, 3 Aug 2025 14:19:47 -0500 Subject: [PATCH 5/9] update dep versions --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e501ee6..685ebdd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,13 +20,13 @@ default = ["alloc"] alloc = [] [dependencies] -num-traits = { version = "0.2.14", default-features = false } +num-traits = { version = "0.2.19", default-features = false } doc-comment = "0.3.3" [dev-dependencies] -criterion = "0.3.4" -rand_core = "0.6.2" -rand_pcg = "0.3.0" +criterion = "0.7.0" +rand_core = "0.9.3" +rand_pcg = "0.9.0" [[bench]] name = "knn" From 3e9147ea7a45a890c983c21aecb0c0139b386394 Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Mon, 4 Aug 2025 15:58:24 -0500 Subject: [PATCH 6/9] total rework; uses pgat crate to enable abstract container concept --- Cargo.toml | 1 + README.md | 45 ++++--- src/lib.rs | 313 ++++++++++++++++++++------------------------ src/linear.rs | 220 +++++++++++++++++++++++++++++++ tests/linear_knn.rs | 42 +++++- 5 files changed, 424 insertions(+), 197 deletions(-) create mode 100644 src/linear.rs diff --git a/Cargo.toml b/Cargo.toml index 685ebdd..1367892 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ alloc = [] [dependencies] num-traits = { version = "0.2.19", default-features = false } doc-comment = "0.3.3" +pgat = "0.3.0" [dev-dependencies] criterion = "0.7.0" diff --git a/README.md b/README.md index 0256f49..ab4a0e0 100644 --- a/README.md +++ b/README.md @@ -30,37 +30,27 @@ See the 
[bitarray](https://crates.io/crates/bitarray) crate for an implementatio of `MetricPoint` using hamming distance (with optional, though unstable, 512-bit SIMD support, and always-on 64-bit popcnt instruction support). -## Usage +## Usage Examples ```rust -use space::Metric; +use pgat::ReferenceProxy; +use space::{Knn, LinearContainer, LinearSearch, Metric, SpatialContainer}; +#[derive(Copy, Clone, Default)] struct Hamming; -impl Metric for Hamming { +impl Metric> for Hamming { type Unit = u8; fn distance(&self, &a: &u8, &b: &u8) -> Self::Unit { (a ^ b).count_ones() as u8 } } -``` - -```rust -use space::{Knn, KnnFromBatch, LinearKnn, Metric}; -#[derive(Default)] -struct Hamming; - -impl Metric for Hamming { - type Unit = u8; - - fn distance(&self, &a: &u8, &b: &u8) -> Self::Unit { - (a ^ b).count_ones() as u8 - } -} +type Container = LinearContainer; +type Search<'a> = LinearSearch<'a, Hamming, u8, u8>; -let data = vec![ +let data = [ (0b1010_1010, 12), (0b1111_1111, 13), (0b0000_0000, 14), @@ -68,16 +58,31 @@ let data = vec![ (0b0000_1111, 10), ]; -let search: LinearKnn = KnnFromBatch::from_batch(data.iter()); +let search = Search::new(Hamming, &data); assert_eq!( - &search.knn(&0b0101_0000, 3), + search.knn(&0b0101_0000, 3).as_slice(), &[ (2, &data[2].0, &data[2].1), (2, &data[3].0, &data[3].1), (6, &data[0].0, &data[0].1) ] ); + +let mut search = Container::from_metric_and_iterator(Hamming, data); + +assert_eq!( + search.knn(&0b0101_0000, 3).as_slice(), + &[ + (2, &data[2].0, &data[2].1), + (2, &data[3].0, &data[3].1), + (6, &data[0].0, &data[0].1) + ] +); + +search.insert(0b0101_0001, 8); + +assert_eq!(search.nn(&0b0101_0000), Some((1, &0b0101_0001, &8))); ``` ## Benchmarks diff --git a/src/lib.rs b/src/lib.rs index cd2db50..dbd92a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,9 +7,13 @@ doc_comment::doctest!("../README.md"); extern crate alloc; #[cfg(feature = "alloc")] -use alloc::vec::Vec; +mod linear; + +#[cfg(feature = "alloc")] +pub 
use linear::*; use num_traits::Zero; +use pgat::{Owned, ProxyView, View}; /// This trait is implemented for metrics that form a metric space. /// It is primarily used for keys in nearest neighbor searches. @@ -44,9 +48,12 @@ use num_traits::Zero; /// ## Example /// /// ``` +/// use pgat::ReferenceProxy; +/// +/// #[derive(Copy, Clone, Default)] /// struct AbsDiff; /// -/// impl space::Metric for AbsDiff { +/// impl space::Metric> for AbsDiff { /// type Unit = u64; /// /// fn distance(&self, &a: &f64, &b: &f64) -> Self::Unit { @@ -56,212 +63,176 @@ use num_traits::Zero; /// } /// } /// ``` -pub trait Metric

{ +pub trait Metric: Copy { type Unit: Ord + Zero + Copy; - fn distance(&self, a: &P, b: &P) -> Self::Unit; + fn distance<'a, 'b>(&self, a: View<'a, P>, b: View<'b, P>) -> Self::Unit; +} + +pub type MetricUnit = >::Unit; + +/// Implement this trait on data structures (or wrappers) which perform spatial searches. +/// +/// Note that [`ApproximateSpace`] encompasses both exact and approximate searches. +/// Approximate searches may not always return the actual nearest neighbors or the entire set of neighbors in a region. +/// Returning the exact set of neighbors that belong in the query results is also known as 100% recall. +/// The amount of recall you get depends on the exact data structure and algorithm used to perform the search. +/// If you need exact nearest neighbor search (guaranteed 100% recall), instead depend on the [`ExactSpace`] trait. +pub trait ApproximateSpace { + type PointProxy: ProxyView; + type ValueProxy: ProxyView; + type Metric: Metric; } +/// This marker trait indicates that the methods provided by search algorithms are exact. +/// It has no further functionality at this time. Implement this on search data structures +/// that guarantee exact nearest neighbor search. +/// +/// In this context, exact doesn't mean equidistant neighbors will always be returned, nor does it mean +/// that the same query will always return the same neighbors. However, it does mean that closer neighbors +/// will always be returned before farther neighbors under the ordering of the metric used. +pub trait ExactSpace: ApproximateSpace {} + /// Implement this trait on data structures (or wrappers) which perform KNN searches. /// The data structure should maintain a key-value mapping between neighbour points and data -/// values. +/// values. It must be able to output the distance between the query point and the neighbours, +/// which is included in the results. /// -/// The lifetime on the trait will be removed once GATs are stabilized. 
-pub trait Knn<'a> { - type Point: 'a; - type Value: 'a; - type Metric: Metric; - type KnnIter: IntoIterator< +/// Note that [`Knn`] encompasses both exact and approximate nearest neighbor searches. +/// Depend on the [`ExactSpace`] trait to ensure all searches are exact. See [`ExactSpace`] for more details. +pub trait Knn: ApproximateSpace { + type KnnIter<'a>: Iterator< Item = ( - >::Unit, - &'a Self::Point, - &'a Self::Value, + MetricUnit, + View<'a, Self::PointProxy>, + View<'a, Self::ValueProxy>, ), - >; + > + where + Self: 'a; - /// Get `num` nearest neighbor keys and values of `target`. + /// Get `num` nearest neighbors' distance, key, and value relative to the `target` position. /// - /// For many KNN search algorithms, the returned neighbors are approximate, and may not - /// be the actual nearest neighbors. - fn knn(&'a self, query: &Self::Point, num: usize) -> Self::KnnIter; + /// The neighbors must be sorted by distance, with the closest neighbor first. + fn knn<'a, 'b>(&'a self, query: View<'b, Self::PointProxy>, num: usize) -> Self::KnnIter<'a>; - /// Get the nearest neighbor key and values of `target`. - /// - /// For many KNN search algorithms, the returned neighbors are approximate, and may not - /// be the actual nearest neighbors. + /// Get the nearest neighbor's distance, key, and value relative to the `target` position. #[allow(clippy::type_complexity)] - fn nn( + fn nn<'a, 'b>( &'a self, - query: &Self::Point, + query: View<'b, Self::PointProxy>, ) -> Option<( - >::Unit, - &'a Self::Point, - &'a Self::Value, - )>; + MetricUnit, + View<'a, Self::PointProxy>, + View<'a, Self::ValueProxy>, + )> { + self.knn(query, 1).next() + } } -/// Implement this trait on data structures (or wrappers) which perform range queries. +/// Implement this trait on data structures (or wrappers) which perform n-sphere range queries. /// The data structure should maintain a key-value mapping between neighbour points and data -/// values. +/// values. 
It must be able to output the distance between the query point and the neighbours, +/// which is included in the results. /// -/// The lifetime on the trait will be removed once GATs are stabilized. -pub trait RangeQuery<'a>: Knn<'a> { - type RangeIter: IntoIterator< +/// Note that [`NSphereRangeQuery`] encompasses both exact and approximate n-sphere searches. +/// Depend on the [`ExactSpace`] trait to ensure all searches are exact. See [`ExactSpace`] for more details. +pub trait NSphereRangeQuery: ApproximateSpace { + type NSphereIter<'a>: Iterator< Item = ( - >::Unit, - &'a Self::Point, - &'a Self::Value, + MetricUnit, + View<'a, Self::PointProxy>, + View<'a, Self::ValueProxy>, ), - >; - - /// Get all the points in the data structure that lie within a specified range of the query - /// point. The points may or may not be sorted by distance. - #[allow(clippy::type_complexity)] - fn range_query( - &self, - query: &Self::Point, - range: >::Unit, - ) -> Self::RangeIter; -} + > + where + Self: 'a; -/// Implement this trait on KNN search data structures that map keys to values and which you can -/// insert new (key, value) pairs. -pub trait KnnInsert<'a>: Knn<'a> { - /// Insert a (key, value) pair to the [`KnnMap`]. + /// Get all the neighbors in the data structure that lie within a specified range of the query n-sphere. /// - /// Returns the index type - fn insert(&mut self, key: Self::Point, value: Self::Value); -} - -/// Create a data structure from a metric and a batch of data points, such as a vector. -/// For many algorithms, using batch initialization yields better results than inserting the points -/// one at a time. -pub trait KnnFromMetricAndBatch { - fn from_metric_and_batch(metric: M, batch: B) -> Self; -} - -/// Create a data structure from a batch of data points, such as a vector. -/// For many algorithms, using batch initialization yields better results than inserting the points -/// one at a time. 
-pub trait KnnFromBatch: KnnFromMetricAndBatch { - fn from_batch(batch: B) -> Self; -} - -impl KnnFromBatch for T -where - T: KnnFromMetricAndBatch, - M: Default, -{ - fn from_batch(batch: B) -> Self { - Self::from_metric_and_batch(M::default(), batch) + /// The neighbors must be sorted by distance, with the closest neighbor first. + fn nsphere_query<'a, 'b>( + &'a self, + query: View<'b, Self::PointProxy>, + radius: MetricUnit, + ) -> Self::NSphereIter<'a> { + self.nsphere_query_limited(query, radius, usize::MAX).0 } -} -/// Performs a linear knn search by iterating over everything in the space -/// and performing a binary search on running set of neighbors. -/// -/// ## Example -/// -/// ``` -/// use space::{Knn, LinearKnn, Metric, KnnFromBatch}; -/// -/// #[derive(Default)] -/// struct Hamming; -/// -/// impl Metric for Hamming { -/// type Unit = u8; -/// -/// fn distance(&self, &a: &u8, &b: &u8) -> Self::Unit { -/// (a ^ b).count_ones() as u8 -/// } -/// } -/// -/// let data = vec![ -/// (0b1010_1010, 12), -/// (0b1111_1111, 13), -/// (0b0000_0000, 14), -/// (0b1111_0000, 16), -/// (0b0000_1111, 10), -/// ]; -/// -/// let search: LinearKnn = KnnFromBatch::from_batch(data.iter()); -/// -/// assert_eq!( -/// &search.knn(&0b0101_0000, 2), -/// &[ -/// (2, &data[2].0, &data[2].1), -/// (2, &data[3].0, &data[3].1), -/// ] -/// ); -/// ``` -#[cfg(feature = "alloc")] -pub struct LinearKnn { - pub metric: M, - pub points: I, + /// Get all the neighbors in the data structure that lie within a specified range of the query n-sphere. + /// You may also provide a `max_neighbors` to limit the number of neighbors returned. This is useful if you + /// only need neighbors with a certain region, but you need to bail out to prevent excessive searching. + /// + /// The neighbors must be sorted by distance, with the closest neighbor first. 
+ /// + /// This returns a tuple containing the neighbors in the region and a boolean indicating if we + /// completely searched the region or if we stopped early due to the `max_neighbors` limit. This boolean + /// doesn't indicate the neighbors are complete if the algorithm is approximate, only if exact, but + /// it does indicate that the algorithm terminated its search. If it is `true`, + /// then the limit was not hit. If the result is `false`, one should not assume that neighbors of a lower + /// radius than the furthest found neighbor have been searched, but only that the search algorithm itself + /// was terminated early, so that the results are incomplete. They are still sorted by distance, however, + /// and search algorithms should still attempt to search closer neighbors first where possible, but it is + /// up to the user to use the results based on the algorithm's guarantees. + fn nsphere_query_limited<'a, 'b>( + &'a self, + query: View<'b, Self::PointProxy>, + radius: MetricUnit, + max_neighbors: usize, + ) -> (Self::NSphereIter<'a>, bool); } -#[cfg(feature = "alloc")] -impl<'a, M: Metric

, I, P: 'a, V: 'a> Knn<'a> for LinearKnn -where - I: Iterator + Clone, -{ - type Metric = M; - type Point = P; - type Value = V; - type KnnIter = Vec<(M::Unit, &'a P, &'a V)>; +/// Implement this trait on spatial containers that map points to values. +pub trait SpatialContainer: ApproximateSpace + Sized { + type SpatialIter<'a>: Iterator, View<'a, Self::ValueProxy>)> + where + Self: 'a; - fn knn(&'a self, query: &Self::Point, num: usize) -> Self::KnnIter { - let mut dataset = self - .points - .clone() - .map(|(pt, val)| (self.metric.distance(pt, query), pt, val)); + /// Create a new instance of the data structure with the given metric. + fn with_metric(metric: Self::Metric) -> Self; - // Create a vector with the correct capacity in advance. - let mut neighbors = Vec::with_capacity(num); + /// Insert a (point, value) pair into a spatial data structure. + fn insert(&mut self, point: Owned, value: Owned); - // Extend the vector with the first `num` neighbors. - neighbors.extend((&mut dataset).take(num)); - // Sort the vector by the neighbor distance. - neighbors.sort_unstable_by_key(|n| n.0); + /// Iterate over all the point, value pairs in the data structure. + fn iter(&self) -> Self::SpatialIter<'_>; - // Iterate over each additional neighbor. - for point in dataset { - // Find the position at which it would be inserted. - let position = neighbors.partition_point(|n| n.0 <= point.0); - // If the point is closer than at least one of the points already in `neighbors`, add it - // into its sorted position. - if position != num { - neighbors.pop(); - neighbors.insert(position, point); - } + /// Extend the data structure with additional data from an iterator of (point, value) pairs. 
+ fn extend( + &mut self, + iter: impl IntoIterator, Owned)>, + ) { + for (point, value) in iter { + self.insert(point, value); } - - neighbors } - #[allow(clippy::type_complexity)] - fn nn( - &self, - query: &Self::Point, - ) -> Option<( - >::Unit, - &'a Self::Point, - &'a Self::Value, - )> { - // Map the input iterator into neighbors and then find the smallest one by distance. - self.points - .clone() - .map(|(pt, val)| (self.metric.distance(pt, query), pt, val)) - .min_by_key(|n| n.0) + /// Create a new instance of the data structure with the given metric and an iterator of (point, value) pairs. + fn from_metric_and_iterator( + metric: Self::Metric, + batch: impl IntoIterator, Owned)>, + ) -> Self { + let mut instance = Self::with_metric(metric); + instance.extend(batch); + instance } } -#[cfg(feature = "alloc")] -impl KnnFromMetricAndBatch for LinearKnn +/// This function performs exact linear nearest neighbor search. +/// +/// This may be useful specifically when implementing spatial containers +/// where you need to abstract over ProxyView types. +pub fn linear_nn<'a, 'b, M, P, V>( + metric: M, + dataset: impl Iterator, View<'a, V>)>, + query: View<'b, P>, +) -> Option<(M::Unit, View<'a, P>, View<'a, V>)> where - M: Default, + M: Metric

, + P: ProxyView, + V: ProxyView, { - fn from_metric_and_batch(metric: M, points: I) -> Self { - Self { metric, points } - } + dataset + .map(|(pt, val)| (metric.distance(pt, query), pt, val)) + .min_by_key(|n| n.0) } diff --git a/src/linear.rs b/src/linear.rs new file mode 100644 index 0000000..1fab1ed --- /dev/null +++ b/src/linear.rs @@ -0,0 +1,220 @@ +use core::{iter::Map, slice}; + +use alloc::vec::{self, Vec}; +use pgat::{ProxyView, ReferenceProxy, View}; + +use crate::{ApproximateSpace, ExactSpace, Knn, Metric, SpatialContainer, linear_nn}; + +/// This function performs exact linear nearest neighbor search. +/// +/// This may be useful specifically when implementing spatial containers +/// where you need to abstract over ProxyView types. +pub fn linear_knn<'a, 'b, M, P, V>( + metric: M, + dataset: impl Iterator, View<'a, V>)>, + query: View<'b, P>, + num: usize, +) -> Vec<(M::Unit, View<'a, P>, View<'a, V>)> +where + M: Metric

, + P: ProxyView, + V: ProxyView, +{ + let mut dataset = dataset.map(|(pt, val)| (metric.distance(pt, query), pt, val)); + + // Create a vector with the correct capacity in advance. + let mut neighbors = Vec::with_capacity(num); + + // Extend the vector with the first `num` neighbors. + neighbors.extend((&mut dataset).take(num)); + // Sort the vector by the neighbor distance. + neighbors.sort_unstable_by_key(|n| n.0); + + // Iterate over each additional neighbor. + for point in dataset { + // Find the position at which it would be inserted. + let position = neighbors.partition_point(|n| n.0 <= point.0); + // If the point is closer than at least one of the points already in `neighbors`, add it + // into its sorted position. + if position != num { + neighbors.pop(); + neighbors.insert(position, point); + } + } + + neighbors +} + +/// Performs a linear knn search by iterating one-by-one over the dataset +/// and keeping a running set of neighbors which it searches through with binary search. +/// +/// You may use the optional type parameters `PP` and `VP` to specify the proxy types for the point and value. +/// By default, it uses [`ReferenceProxy`] for both point and value, which uses &P and &V as the proxies. +pub struct LinearSearch<'a, M, P, V, PP = ReferenceProxy

, VP = ReferenceProxy> { + pub metric: M, + pub data: &'a [(P, V)], + pub _phantom: core::marker::PhantomData<(PP, VP)>, +} + +impl<'a, M, P, V, PP, VP> LinearSearch<'a, M, P, V, PP, VP> +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ + pub fn new(metric: M, data: &'a [(P, V)]) -> Self { + Self { + metric, + data, + _phantom: core::marker::PhantomData, + } + } +} + +impl<'a, M, P, V, PP, VP> ApproximateSpace for LinearSearch<'a, M, P, V, PP, VP> +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ + type PointProxy = PP; + type ValueProxy = VP; + type Metric = M; +} + +/// This trait is implemented for linear search, which is an exact search algorithm. +impl<'a, M, P, V, PP, VP> ExactSpace for LinearSearch<'a, M, P, V, PP, VP> +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ +} + +impl<'c, M, P, V, PP, VP> Knn for LinearSearch<'c, M, P, V, PP, VP> +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ + type KnnIter<'a> + = vec::IntoIter<(M::Unit, View<'a, PP>, View<'a, VP>)> + where + Self: 'a; + + fn knn<'a, 'b>(&'a self, query: View<'b, Self::PointProxy>, num: usize) -> Self::KnnIter<'a> { + linear_knn::( + self.metric, + self.data + .iter() + .map(|(pt, val)| (PP::view(pt), VP::view(val))), + query, + num, + ) + .into_iter() + } + + fn nn<'a, 'b>( + &'a self, + query: View<'b, Self::PointProxy>, + ) -> Option<(M::Unit, View<'a, PP>, View<'a, VP>)> { + linear_nn::( + self.metric, + self.data + .iter() + .map(|(pt, val)| (PP::view(pt), VP::view(val))), + query, + ) + } +} + +pub struct LinearContainer, VP = ReferenceProxy> { + pub metric: M, + pub data: Vec<(P, V)>, + pub _phantom: core::marker::PhantomData<(PP, VP)>, +} + +impl ApproximateSpace for LinearContainer +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ + type PointProxy = PP; + type ValueProxy = VP; + type Metric = M; +} + +/// This trait is implemented for linear search, which is an exact search algorithm. 
+impl ExactSpace for LinearContainer +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ +} + +impl SpatialContainer for LinearContainer +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ + type SpatialIter<'a> + = Map, fn(&'a (P, V)) -> (View<'a, PP>, View<'a, VP>)> + where + Self: 'a; + + fn with_metric(metric: Self::Metric) -> Self { + Self { + metric, + data: Vec::new(), + _phantom: core::marker::PhantomData, + } + } + + fn insert(&mut self, point: P, value: V) { + self.data.push((point, value)); + } + + fn iter(&self) -> Self::SpatialIter<'_> { + self.data.iter().map(|(p, v)| (PP::view(p), VP::view(v))) + } +} + +impl Knn for LinearContainer +where + M: Metric, + PP: ProxyView, + VP: ProxyView, +{ + type KnnIter<'a> + = vec::IntoIter<(M::Unit, View<'a, PP>, View<'a, VP>)> + where + Self: 'a; + + fn knn<'a, 'b>(&'a self, query: View<'b, Self::PointProxy>, num: usize) -> Self::KnnIter<'a> { + linear_knn::( + self.metric, + self.data + .iter() + .map(|(pt, val)| (PP::view(pt), VP::view(val))), + query, + num, + ) + .into_iter() + } + + fn nn<'a, 'b>( + &'a self, + query: View<'b, Self::PointProxy>, + ) -> Option<(M::Unit, View<'a, PP>, View<'a, VP>)> { + linear_nn::( + self.metric, + self.data + .iter() + .map(|(pt, val)| (PP::view(pt), VP::view(val))), + query, + ) + } +} diff --git a/tests/linear_knn.rs b/tests/linear_knn.rs index c94626b..48b8a1b 100644 --- a/tests/linear_knn.rs +++ b/tests/linear_knn.rs @@ -1,9 +1,10 @@ -use space::{Knn, KnnFromBatch, LinearKnn, Metric}; +use pgat::ReferenceProxy; +use space::{Knn, LinearContainer, LinearSearch, Metric, SpatialContainer}; -#[derive(Default)] +#[derive(Copy, Clone, Default)] struct Hamming; -impl Metric for Hamming { +impl Metric> for Hamming { type Unit = u8; fn distance(&self, &a: &u8, &b: &u8) -> Self::Unit { @@ -11,8 +12,33 @@ impl Metric for Hamming { } } +type Container = LinearContainer; +type Search<'a> = LinearSearch<'a, Hamming, u8, u8>; + +#[test] +fn test_linear_search() { + let data = [ 
+ (0b1010_1010, 12), + (0b1111_1111, 13), + (0b0000_0000, 14), + (0b1111_0000, 16), + (0b0000_1111, 10), + ]; + + let search = Search::new(Hamming, &data); + + assert_eq!( + search.knn(&0b0101_0000, 3).as_slice(), + &[ + (2, &data[2].0, &data[2].1), + (2, &data[3].0, &data[3].1), + (6, &data[0].0, &data[0].1) + ] + ); +} + #[test] -fn test_linear_knn() { +fn test_linear_container() { let data = [ (0b1010_1010, 12), (0b1111_1111, 13), @@ -21,14 +47,18 @@ fn test_linear_knn() { (0b0000_1111, 10), ]; - let search: LinearKnn = KnnFromBatch::from_batch(data.iter()); + let mut search = Container::from_metric_and_iterator(Hamming, data); assert_eq!( - &search.knn(&0b0101_0000, 3), + search.knn(&0b0101_0000, 3).as_slice(), &[ (2, &data[2].0, &data[2].1), (2, &data[3].0, &data[3].1), (6, &data[0].0, &data[0].1) ] ); + + search.insert(0b0101_0001, 8); + + assert_eq!(search.nn(&0b0101_0000), Some((1, &0b0101_0001, &8))); } From 550e8230e200983d57c2f0dc73b8c094de9ecca8 Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Mon, 4 Aug 2025 16:04:15 -0500 Subject: [PATCH 7/9] only include readme docs when alloc is enabled --- README.md | 2 ++ src/lib.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index ab4a0e0..902131c 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ SIMD support, and always-on 64-bit popcnt instruction support). ## Usage Examples +This example shows how to use the LinearSearch and LinearContainer that come built-in by default. You would use third party containers similarly and can abstract over them using the traits like SpatialContainer and Knn. + ```rust use pgat::ReferenceProxy; use space::{Knn, LinearContainer, LinearSearch, Metric, SpatialContainer}; diff --git a/src/lib.rs b/src/lib.rs index dbd92a1..05c8095 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,8 @@ //! See the [Crates.io page](https://crates.io/crates/space) for the README. 
#![no_std] + +#[cfg(feature = "alloc")] doc_comment::doctest!("../README.md"); #[cfg(feature = "alloc")] From a962dbc7a119a8e0b800f72d829625197a3763d1 Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Mon, 4 Aug 2025 16:59:33 -0500 Subject: [PATCH 8/9] add test with ndarray container --- Cargo.toml | 2 + README.md | 2 + tests/ndarray.rs | 195 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 tests/ndarray.rs diff --git a/Cargo.toml b/Cargo.toml index 1367892..ab4de10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,8 @@ pgat = "0.3.0" criterion = "0.7.0" rand_core = "0.9.3" rand_pcg = "0.9.0" +ndarray = "0.16.1" +decorum = "0.4.0" [[bench]] name = "knn" diff --git a/README.md b/README.md index 902131c..77fa362 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,8 @@ search.insert(0b0101_0001, 8); assert_eq!(search.nn(&0b0101_0000), Some((1, &0b0101_0001, &8))); ``` +For an example on how to create a container, a great reference may be found in tests/ndarray.rs in the repository. For brevity it is omitted here, but it shows how to create a specialized structure that uses an Array2 as the storage mechanism for Array1 points. 
+ ## Benchmarks To run the benchmarks, use the following command: diff --git a/tests/ndarray.rs b/tests/ndarray.rs new file mode 100644 index 0000000..6116019 --- /dev/null +++ b/tests/ndarray.rs @@ -0,0 +1,195 @@ +use std::vec; + +use decorum::Total; +use ndarray::{Array1, Array2, ArrayView1, arr1, arr2}; +use pgat::{ProxyView, ReferenceProxy, View, ViewInverse}; +use space::{ApproximateSpace, ExactSpace, Knn, Metric, SpatialContainer, linear_knn, linear_nn}; + +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct ArrayViewWrapper<'a>(pub ArrayView1<'a, f32>); + +pub struct ArrayViewProxy; + +impl ProxyView for ArrayViewProxy { + type Owned = Array1; + type View<'a> = ArrayViewWrapper<'a>; + + fn view<'a>(owned: &'a Self::Owned) -> Self::View<'a> { + ArrayViewWrapper(owned.view()) + } +} + +impl<'a> ViewInverse<'a> for ArrayViewWrapper<'a> { + type Owned = Array1; + + type Proxy = ArrayViewProxy; +} + +#[derive(Copy, Clone, Default)] +struct L2; + +impl Metric for L2 { + type Unit = Total; + + fn distance<'a, 'b>(&self, a: ArrayViewWrapper<'a>, b: ArrayViewWrapper<'b>) -> Self::Unit { + a.0.iter() + .zip(b.0.iter()) + .map(|(x, y)| (x - y).powi(2)) + .sum::() + .sqrt() + .into() + } +} + +pub struct Array2Container> { + pub metric: M, + pub points: Array2, + pub values: Vec, + pub _phantom: core::marker::PhantomData, +} + +impl Array2Container +where + M: Metric, + VP: ProxyView, +{ + fn with_metric_and_data(metric: M, points: Array2, values: Vec) -> Self { + assert_eq!( + points.nrows(), + values.len(), + "Number of points must match number of values" + ); + Self { + metric, + points, + values, + _phantom: core::marker::PhantomData, + } + } +} + +impl ApproximateSpace for Array2Container +where + M: Metric, + VP: ProxyView, +{ + type PointProxy = ArrayViewProxy; + type ValueProxy = VP; + type Metric = M; +} + +/// This trait is implemented for linear search, which is an exact search algorithm. 
+impl ExactSpace for Array2Container +where + M: Metric, + VP: ProxyView, +{ +} + +impl SpatialContainer for Array2Container +where + M: Metric, + VP: ProxyView, +{ + type SpatialIter<'a> + = std::iter::Map< + std::iter::Zip< + ndarray::iter::LanesIter<'a, f32, ndarray::Dim<[usize; 1]>>, + std::slice::Iter<'a, V>, + >, + fn((ArrayView1<'a, f32>, &'a V)) -> (ArrayViewWrapper<'a>, View<'a, VP>), + > + where + Self: 'a; + + fn with_metric(metric: Self::Metric) -> Self { + Self { + metric, + points: Array2::zeros((0, 2)), + values: Vec::new(), + _phantom: core::marker::PhantomData, + } + } + + fn insert(&mut self, point: Array1, value: V) { + self.points.push_row(point.view()).unwrap(); + self.values.push(value); + } + + fn iter(&self) -> Self::SpatialIter<'_> { + self.points + .rows() + .into_iter() + .zip(self.values.iter()) + .map(|(p, v)| (ArrayViewWrapper(p), VP::view(v))) + } +} + +impl Knn for Array2Container +where + M: Metric, + VP: ProxyView, +{ + type KnnIter<'a> + = vec::IntoIter<(M::Unit, ArrayViewWrapper<'a>, View<'a, VP>)> + where + Self: 'a; + + fn knn<'a, 'b>(&'a self, query: View<'b, Self::PointProxy>, num: usize) -> Self::KnnIter<'a> { + linear_knn::( + self.metric, + self.points + .rows() + .into_iter() + .zip(self.values.iter()) + .map(|(pt, val)| (ArrayViewWrapper(pt), VP::view(val))), + query, + num, + ) + .into_iter() + } + + fn nn<'a, 'b>( + &'a self, + query: View<'b, Self::PointProxy>, + ) -> Option<(M::Unit, View<'a, ArrayViewProxy>, View<'a, VP>)> { + linear_nn::( + self.metric, + self.points + .rows() + .into_iter() + .zip(self.values.iter()) + .map(|(pt, val)| (ArrayViewWrapper(pt), VP::view(val))), + query, + ) + } +} + +type Container = Array2Container; + +#[test] +fn test_ndarray_container() { + let points = arr2(&[[1.0, 1.2], [4.4, 4.5], [5.0, -1.2], [2.0, 2.8], [-5.0, 1.3]]); + let values = vec![1, 2, 3, 4, 5]; + + let mut search = Container::with_metric_and_data(L2, points.clone(), values.clone()); + + let result = 
search.knn(ArrayViewWrapper(points.row(0)), 2); + let result = result.as_slice(); + assert_eq!(result[0].1, ArrayViewWrapper(points.row(0))); + assert_eq!(result[0].2, &values[0]); + assert_eq!(result[1].1, ArrayViewWrapper(points.row(3))); + assert_eq!(result[1].2, &values[3]); + + let new_point = arr1(&[2.0, 2.0]); + search.insert(new_point.clone(), 6); + + let result = search.knn(ArrayViewWrapper(points.row(0)), 3); + let result = result.as_slice(); + assert_eq!(result[0].1, ArrayViewWrapper(points.row(0))); + assert_eq!(result[0].2, &values[0]); + assert_eq!(result[1].1, ArrayViewWrapper(new_point.view())); + assert_eq!(result[1].2, &6); + assert_eq!(result[2].1, ArrayViewWrapper(points.row(3))); + assert_eq!(result[2].2, &values[3]); +} From 1a1fc008f43eb28f3b51d3e5812041de72a7d091 Mon Sep 17 00:00:00 2001 From: Geordon Worley Date: Mon, 4 Aug 2025 17:02:10 -0500 Subject: [PATCH 9/9] add helpful comment to readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 77fa362..1f8c77f 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ impl Metric> for Hamming { } } +// Use type aliases like below to get default proxy types (ReferenceProxy) on the container. type Container = LinearContainer; type Search<'a> = LinearSearch<'a, Hamming, u8, u8>;