Show HN: Kentro – a fast Rust library for K-Means clustering
5 hours ago
2
A high-performance Rust implementation of K-Means clustering algorithms. Kentro provides both standard and advanced K-Means variants with parallel processing support.
Standard K-Means: Classic Lloyd's algorithm implementation
Spherical K-Means: Uses cosine similarity instead of Euclidean distance
Balanced K-Means: Ensures clusters have similar sizes using efficient balancing algorithms
K-Medoids: PAM (Partitioning Around Medoids) algorithm for robust clustering with actual data points as centers
Parallel Processing: Multi-threaded execution using Rayon
Flexible API: Builder pattern for easy configuration
Memory Efficient: Optimized for large datasets
Comprehensive Error Handling: Detailed error types and messages
Add this to your Cargo.toml:
[dependencies]
kentro = "0.1.0"
ndarray = "0.15"
use kentro::KMeans;
use ndarray::Array2;

// Create sample data (100 points, 2 dimensions)
let data = Array2::from_shape_vec((100, 2), (0..200).map(|x| x as f32).collect()).unwrap();

// Create and configure K-Means
let mut kmeans = KMeans::new(3).with_iterations(50).with_verbose(true);

// Train the model
let clusters = kmeans.train(data.view(), None).unwrap();

println!("Found {} clusters", clusters.len());
for (i, cluster) in clusters.iter().enumerate() {
    println!("Cluster {}: {} points", i, cluster.len());
}
// Train on your data
let clusters = kmeans.train(data.view(), Some(4))?; // Use 4 threads

// Returns Vec<Vec<usize>> where each inner vector contains
// the indices of points assigned to that cluster
// Assign new points to existing clusters
let assignments = kmeans.assign(new_data.view(), 1)?; // k=1 (nearest cluster)

// Multi-assignment (assign to k nearest clusters)
let multi_assignments = kmeans.assign(new_data.view(), 2)?; // k=2
// Get centroids
if let Some(centroids) = kmeans.centroids() {
    println!("Centroids shape: {:?}", centroids.dim());
}

// Get medoid indices (when using K-Medoids)
if let Some(medoids) = kmeans.medoid_indices() {
    println!("Medoid indices: {:?}", medoids);
}

// Check model state
println!("Trained: {}", kmeans.is_trained());
println!("Clusters: {}", kmeans.n_clusters());
println!("Euclidean: {}", kmeans.is_euclidean());
println!("Balanced: {}", kmeans.is_balanced());
println!("Using medoids: {}", kmeans.is_use_medoids());
Uses cosine similarity (inner product) as the distance metric. Suitable for high-dimensional data and text clustering.
let mut kmeans = KMeans::new(5);
Uses Euclidean distance as the distance metric. Better for geometric data.
fn medoids_example() -> Result<(), Box<dyn std::error::Error>> {
    // Generate sample data with some outliers
    let mut data_vec = vec![];

    // Cluster 1: around (1, 1)
    data_vec.extend_from_slice(&[1.0, 1.0, 1.1, 1.1, 1.2, 1.0, 0.9, 1.1]);
    // Cluster 2: around (5, 5)
    data_vec.extend_from_slice(&[5.0, 5.0, 5.1, 5.1, 4.9, 5.0, 5.0, 4.9]);
    // Outlier
    data_vec.extend_from_slice(&[10.0, 1.0]);

    let data = Array2::from_shape_vec((9, 2), data_vec)?;

    // Use K-Medoids for robustness to outliers
    let mut kmeans = KMeans::new(3)
        .with_use_medoids(true)
        .with_euclidean(true)
        .with_verbose(true);

    let clusters = kmeans.train(data.view(), None)?;

    // Get the actual data points used as cluster centers
    if let Some(medoids) = kmeans.medoid_indices() {
        println!("Medoid points:");
        for (i, &medoid_idx) in medoids.iter().enumerate() {
            let medoid_point = data.row(medoid_idx);
            println!("  Cluster {}: Point {} [{:.1}, {:.1}]",
                     i, medoid_idx, medoid_point[0], medoid_point[1]);
        }
    }

    Ok(())
}
### Text Clustering (Spherical K-Means)
```rust
fn text_clustering_example() -> Result<(), Box<dyn std::error::Error>> {
    // Assume we have TF-IDF vectors
    let tfidf_vectors = load_tfidf_data()?; // Your TF-IDF data

    // Use spherical K-Means (cosine similarity)
    let mut kmeans = KMeans::new(10)
        .with_euclidean(false) // Use cosine similarity
        .with_iterations(50);

    let clusters = kmeans.train(tfidf_vectors.view(), None)?;

    println!("Clustered {} documents into {} topics",
             tfidf_vectors.nrows(), clusters.len());

    Ok(())
}
Kentro provides comprehensive error handling:
use kentro::{KMeans, KMeansError};

match kmeans.train(data.view(), None) {
    Ok(clusters) => println!("Success: {} clusters", clusters.len()),
    Err(KMeansError::InsufficientPoints(n, k)) => {
        println!("Error: {} points < {} clusters", n, k);
    },
    Err(KMeansError::AlreadyTrained) => {
        println!("Error: Model already trained");
    },
    Err(e) => println!("Error: {}", e),
}
Run the test suite:
cargo test
Run with verbose output:
cargo test -- --nocapture
# Run the main example
cargo run --example simple
# Run the K-Medoids demo
cargo run --example medoids_demo
# Run with release optimizations
cargo run --example simple --release
Kentro is designed for high performance:
Parallel Processing: Scales with CPU cores
Memory Efficient: Minimal allocations during clustering
Optimized Algorithms: Based on proven efficient implementations
This project is licensed under the Apache 2.0 License - see the LICENSE file for details.