28 KMeans(
int n_clusters = 8,
int max_iter = 300,
double tol = 1e-4,
unsigned int random_state = 0);
39 void fit(
const std::vector<std::vector<double>>& X);
46 std::vector<int>
predict(
const std::vector<std::vector<double>>& X)
const;
58 std::vector<std::vector<double>> cluster_centers;
59 std::vector<int> labels;
61 mutable std::mt19937 rng;
69 double euclidean_distance(
const std::vector<double>& a,
const std::vector<double>& b)
const;
76 std::vector<int> assign_labels(
const std::vector<std::vector<double>>& X)
const;
84 std::vector<std::vector<double>> compute_cluster_centers(
const std::vector<std::vector<double>>& X,
const std::vector<int>& labels)
const;
90 void initialize_centers(
const std::vector<std::vector<double>>& X);
93 KMeans::KMeans(
int n_clusters,
int max_iter,
double tol,
unsigned int random_state)
94 : n_clusters(n_clusters), max_iter(max_iter), tol(tol), rng(random_state) {
95 if (random_state == 0) {
96 std::random_device rd;
104 size_t n_samples = X.size();
107 initialize_centers(X);
109 labels.resize(n_samples);
110 std::vector<std::vector<double>> old_cluster_centers;
112 for (
int iter = 0; iter < max_iter; ++iter) {
114 labels = assign_labels(X);
117 old_cluster_centers = cluster_centers;
120 cluster_centers = compute_cluster_centers(X, labels);
123 double max_center_shift = 0.0;
124 for (
int i = 0; i < n_clusters; ++i) {
125 double shift = euclidean_distance(cluster_centers[i], old_cluster_centers[i]);
126 if (shift > max_center_shift) {
127 max_center_shift = shift;
130 if (max_center_shift <= tol) {
137 return assign_labels(X);
141 return cluster_centers;
144 double KMeans::euclidean_distance(
const std::vector<double>& a,
const std::vector<double>& b)
const {
146 for (
size_t i = 0; i < a.size(); ++i) {
147 double diff = a[i] - b[i];
150 return std::sqrt(sum);
153 std::vector<int> KMeans::assign_labels(
const std::vector<std::vector<double>>& X)
const {
154 std::vector<int> labels(X.size());
155 for (
size_t i = 0; i < X.size(); ++i) {
156 double min_dist = std::numeric_limits<double>::max();
158 for (
int k = 0; k < n_clusters; ++k) {
159 double dist = euclidean_distance(X[i], cluster_centers[k]);
160 if (dist < min_dist) {
170 std::vector<std::vector<double>> KMeans::compute_cluster_centers(
const std::vector<std::vector<double>>& X,
const std::vector<int>& labels)
const {
171 size_t n_features = X[0].size();
172 std::vector<std::vector<double>> new_centers(n_clusters, std::vector<double>(n_features, 0.0));
173 std::vector<int> counts(n_clusters, 0);
175 for (
size_t i = 0; i < X.size(); ++i) {
176 int label = labels[i];
178 for (
size_t j = 0; j < n_features; ++j) {
179 new_centers[label][j] += X[i][j];
183 for (
int k = 0; k < n_clusters; ++k) {
184 if (counts[k] == 0) {
186 std::uniform_int_distribution<size_t> dist(0, X.size() - 1);
187 new_centers[k] = X[dist(rng)];
189 for (
size_t j = 0; j < n_features; ++j) {
190 new_centers[k][j] /= counts[k];
198 void KMeans::initialize_centers(
const std::vector<std::vector<double>>& X) {
199 size_t n_samples = X.size();
200 size_t n_features = X[0].size();
201 cluster_centers.clear();
202 cluster_centers.reserve(n_clusters);
205 std::uniform_int_distribution<size_t> dist(0, n_samples - 1);
206 size_t first_center_idx = dist(rng);
207 cluster_centers.push_back(X[first_center_idx]);
210 std::vector<double> distances(n_samples, std::numeric_limits<double>::max());
212 for (
int k = 1; k < n_clusters; ++k) {
213 double total_distance = 0.0;
214 for (
size_t i = 0; i < n_samples; ++i) {
215 double dist_to_center = euclidean_distance(X[i], cluster_centers.back());
216 if (dist_to_center < distances[i]) {
217 distances[i] = dist_to_center;
219 total_distance += distances[i];
223 std::uniform_real_distribution<double> uniform_dist(0.0, total_distance);
224 double random_value = uniform_dist(rng);
226 double cumulative_distance = 0.0;
227 size_t next_center_idx = 0;
228 for (
size_t i = 0; i < n_samples; ++i) {
229 cumulative_distance += distances[i];
230 if (cumulative_distance >= random_value) {
235 cluster_centers.push_back(X[next_center_idx]);
Implements the K-Means clustering algorithm with K-Means++ initialization.
Definition: KMeans.hpp:19
void fit(const std::vector< std::vector< double >> &X)
Fits the KMeans model to the data.
Definition: KMeans.hpp:103
std::vector< int > predict(const std::vector< std::vector< double >> &X) const
Predicts the closest cluster each sample in X belongs to.
Definition: KMeans.hpp:136
~KMeans()
Destructor for KMeans.
Definition: KMeans.hpp:101
KMeans(int n_clusters=8, int max_iter=300, double tol=1e-4, unsigned int random_state=0)
Constructs a KMeans object.
Definition: KMeans.hpp:93
const std::vector< std::vector< double > > & get_cluster_centers() const
Returns the cluster centers.
Definition: KMeans.hpp:140