Cpp ML Library  1.0.0
A library of Machine Learning Algorithmns seen from the Udemy course Machine Learning A to Z.
KMeans.hpp
Go to the documentation of this file.
1 #ifndef KMEANS_HPP
2 #define KMEANS_HPP
3 
4 #include <vector>
5 #include <cmath>
6 #include <limits>
7 #include <random>
8 #include <algorithm>
9 
19 class KMeans {
20 public:
28  KMeans(int n_clusters = 8, int max_iter = 300, double tol = 1e-4, unsigned int random_state = 0);
29 
33  ~KMeans();
34 
39  void fit(const std::vector<std::vector<double>>& X);
40 
46  std::vector<int> predict(const std::vector<std::vector<double>>& X) const;
47 
52  const std::vector<std::vector<double>>& get_cluster_centers() const;
53 
54 private:
55  int n_clusters;
56  int max_iter;
57  double tol;
58  std::vector<std::vector<double>> cluster_centers;
59  std::vector<int> labels;
60 
61  mutable std::mt19937 rng;
62 
69  double euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const;
70 
76  std::vector<int> assign_labels(const std::vector<std::vector<double>>& X) const;
77 
84  std::vector<std::vector<double>> compute_cluster_centers(const std::vector<std::vector<double>>& X, const std::vector<int>& labels) const;
85 
90  void initialize_centers(const std::vector<std::vector<double>>& X);
91 };
92 
93 KMeans::KMeans(int n_clusters, int max_iter, double tol, unsigned int random_state)
94  : n_clusters(n_clusters), max_iter(max_iter), tol(tol), rng(random_state) {
95  if (random_state == 0) {
96  std::random_device rd;
97  rng.seed(rd());
98  }
99 }
100 
102 
103 void KMeans::fit(const std::vector<std::vector<double>>& X) {
104  size_t n_samples = X.size();
105 
106  // Initialize cluster centers using K-Means++ initialization
107  initialize_centers(X);
108 
109  labels.resize(n_samples);
110  std::vector<std::vector<double>> old_cluster_centers;
111 
112  for (int iter = 0; iter < max_iter; ++iter) {
113  // Assign labels to each point
114  labels = assign_labels(X);
115 
116  // Save old centers
117  old_cluster_centers = cluster_centers;
118 
119  // Compute new centers
120  cluster_centers = compute_cluster_centers(X, labels);
121 
122  // Check for convergence
123  double max_center_shift = 0.0;
124  for (int i = 0; i < n_clusters; ++i) {
125  double shift = euclidean_distance(cluster_centers[i], old_cluster_centers[i]);
126  if (shift > max_center_shift) {
127  max_center_shift = shift;
128  }
129  }
130  if (max_center_shift <= tol) {
131  break;
132  }
133  }
134 }
135 
136 std::vector<int> KMeans::predict(const std::vector<std::vector<double>>& X) const {
137  return assign_labels(X);
138 }
139 
140 const std::vector<std::vector<double>>& KMeans::get_cluster_centers() const {
141  return cluster_centers;
142 }
143 
144 double KMeans::euclidean_distance(const std::vector<double>& a, const std::vector<double>& b) const {
145  double sum = 0.0;
146  for (size_t i = 0; i < a.size(); ++i) {
147  double diff = a[i] - b[i];
148  sum += diff * diff;
149  }
150  return std::sqrt(sum);
151 }
152 
153 std::vector<int> KMeans::assign_labels(const std::vector<std::vector<double>>& X) const {
154  std::vector<int> labels(X.size());
155  for (size_t i = 0; i < X.size(); ++i) {
156  double min_dist = std::numeric_limits<double>::max();
157  int label = -1;
158  for (int k = 0; k < n_clusters; ++k) {
159  double dist = euclidean_distance(X[i], cluster_centers[k]);
160  if (dist < min_dist) {
161  min_dist = dist;
162  label = k;
163  }
164  }
165  labels[i] = label;
166  }
167  return labels;
168 }
169 
170 std::vector<std::vector<double>> KMeans::compute_cluster_centers(const std::vector<std::vector<double>>& X, const std::vector<int>& labels) const {
171  size_t n_features = X[0].size();
172  std::vector<std::vector<double>> new_centers(n_clusters, std::vector<double>(n_features, 0.0));
173  std::vector<int> counts(n_clusters, 0);
174 
175  for (size_t i = 0; i < X.size(); ++i) {
176  int label = labels[i];
177  counts[label]++;
178  for (size_t j = 0; j < n_features; ++j) {
179  new_centers[label][j] += X[i][j];
180  }
181  }
182 
183  for (int k = 0; k < n_clusters; ++k) {
184  if (counts[k] == 0) {
185  // If a cluster lost all its members, reinitialize its center using K-Means++ logic
186  std::uniform_int_distribution<size_t> dist(0, X.size() - 1);
187  new_centers[k] = X[dist(rng)];
188  } else {
189  for (size_t j = 0; j < n_features; ++j) {
190  new_centers[k][j] /= counts[k];
191  }
192  }
193  }
194 
195  return new_centers;
196 }
197 
198 void KMeans::initialize_centers(const std::vector<std::vector<double>>& X) {
199  size_t n_samples = X.size();
200  size_t n_features = X[0].size();
201  cluster_centers.clear();
202  cluster_centers.reserve(n_clusters);
203 
204  // Step 1: Choose one center uniformly at random from the data points
205  std::uniform_int_distribution<size_t> dist(0, n_samples - 1);
206  size_t first_center_idx = dist(rng);
207  cluster_centers.push_back(X[first_center_idx]);
208 
209  // Step 2: For each data point, compute its distance to the nearest center
210  std::vector<double> distances(n_samples, std::numeric_limits<double>::max());
211 
212  for (int k = 1; k < n_clusters; ++k) {
213  double total_distance = 0.0;
214  for (size_t i = 0; i < n_samples; ++i) {
215  double dist_to_center = euclidean_distance(X[i], cluster_centers.back());
216  if (dist_to_center < distances[i]) {
217  distances[i] = dist_to_center;
218  }
219  total_distance += distances[i];
220  }
221 
222  // Step 3: Choose the next center with probability proportional to the square of the distance
223  std::uniform_real_distribution<double> uniform_dist(0.0, total_distance);
224  double random_value = uniform_dist(rng);
225 
226  double cumulative_distance = 0.0;
227  size_t next_center_idx = 0;
228  for (size_t i = 0; i < n_samples; ++i) {
229  cumulative_distance += distances[i];
230  if (cumulative_distance >= random_value) {
231  next_center_idx = i;
232  break;
233  }
234  }
235  cluster_centers.push_back(X[next_center_idx]);
236  }
237 }
238 
239 #endif // KMEANS_HPP
Implements the K-Means clustering algorithm with K-Means++ initialization.
Definition: KMeans.hpp:19
void fit(const std::vector< std::vector< double >> &X)
Fits the KMeans model to the data.
Definition: KMeans.hpp:103
std::vector< int > predict(const std::vector< std::vector< double >> &X) const
Predicts the closest cluster each sample in X belongs to.
Definition: KMeans.hpp:136
~KMeans()
Destructor for KMeans.
Definition: KMeans.hpp:101
KMeans(int n_clusters=8, int max_iter=300, double tol=1e-4, unsigned int random_state=0)
Constructs a KMeans object.
Definition: KMeans.hpp:93
const std::vector< std::vector< double > > & get_cluster_centers() const
Returns the cluster centers.
Definition: KMeans.hpp:140