Cpp ML Library  1.0.0
A library of Machine Learning Algorithmns seen from the Udemy course Machine Learning A to Z.
HierarchicalClustering.hpp
Go to the documentation of this file.
1 #ifndef HIERARCHICAL_CLUSTERING_HPP
2 #define HIERARCHICAL_CLUSTERING_HPP
3 
4 #include <vector>
5 #include <cmath>
6 #include <algorithm>
7 #include <memory>
8 #include <limits>
9 
20 public:
24  enum class Linkage {
25  SINGLE,
26  COMPLETE,
27  AVERAGE
28  };
29 
35  HierarchicalClustering(int n_clusters = 2, Linkage linkage = Linkage::AVERAGE);
36 
41 
46  void fit(const std::vector<std::vector<double>>& X);
47 
52  std::vector<int> predict() const;
53 
58  std::vector<std::vector<double>> get_cluster_centers() const;
59 
60 private:
61  int n_clusters;
62  Linkage linkage;
63  std::vector<std::vector<double>> data;
64 
65  struct Cluster {
66  int id;
67  std::vector<int> points;
68  };
69 
70  std::vector<std::shared_ptr<Cluster>> clusters;
71 
78  double euclidean_distance(int a, int b) const;
79 
86  double cluster_distance(const Cluster& cluster_a, const Cluster& cluster_b) const;
87 
91  void merge_clusters();
92 
97  std::pair<int, int> find_closest_clusters() const;
98 };
99 
101  : n_clusters(n_clusters), linkage(linkage) {}
102 
104 
105 void HierarchicalClustering::fit(const std::vector<std::vector<double>>& X) {
106  data = X;
107 
108  // Initialize each data point as a separate cluster
109  clusters.clear();
110  for (size_t i = 0; i < data.size(); ++i) {
111  auto cluster = std::make_shared<Cluster>();
112  cluster->id = static_cast<int>(i);
113  cluster->points.push_back(static_cast<int>(i));
114  clusters.push_back(cluster);
115  }
116 
117  // Agglomerative clustering
118  while (static_cast<int>(clusters.size()) > n_clusters) {
119  merge_clusters();
120  }
121 }
122 
123 std::vector<int> HierarchicalClustering::predict() const {
124  std::vector<int> labels(data.size(), -1);
125  for (size_t i = 0; i < clusters.size(); ++i) {
126  for (int point_idx : clusters[i]->points) {
127  labels[point_idx] = static_cast<int>(i);
128  }
129  }
130  return labels;
131 }
132 
133 std::vector<std::vector<double>> HierarchicalClustering::get_cluster_centers() const {
134  std::vector<std::vector<double>> centers;
135  centers.reserve(clusters.size());
136 
137  for (const auto& cluster : clusters) {
138  std::vector<double> centroid(data[0].size(), 0.0);
139  for (int idx : cluster->points) {
140  const auto& point = data[idx];
141  for (size_t i = 0; i < point.size(); ++i) {
142  centroid[i] += point[i];
143  }
144  }
145  // Divide by the number of points to get the mean
146  for (double& val : centroid) {
147  val /= cluster->points.size();
148  }
149  centers.push_back(centroid);
150  }
151 
152  return centers;
153 }
154 
155 double HierarchicalClustering::euclidean_distance(int a, int b) const {
156  const auto& point_a = data[a];
157  const auto& point_b = data[b];
158  double distance = 0.0;
159  for (size_t i = 0; i < point_a.size(); ++i) {
160  double diff = point_a[i] - point_b[i];
161  distance += diff * diff;
162  }
163  return std::sqrt(distance);
164 }
165 
166 double HierarchicalClustering::cluster_distance(const Cluster& cluster_a, const Cluster& cluster_b) const {
167  double distance = 0.0;
168 
169  if (linkage == Linkage::SINGLE) {
170  // Minimum distance between any two points in the clusters
171  distance = std::numeric_limits<double>::max();
172  for (int idx_a : cluster_a.points) {
173  for (int idx_b : cluster_b.points) {
174  double dist = euclidean_distance(idx_a, idx_b);
175  if (dist < distance) {
176  distance = dist;
177  }
178  }
179  }
180  } else if (linkage == Linkage::COMPLETE) {
181  // Maximum distance between any two points in the clusters
182  distance = 0.0;
183  for (int idx_a : cluster_a.points) {
184  for (int idx_b : cluster_b.points) {
185  double dist = euclidean_distance(idx_a, idx_b);
186  if (dist > distance) {
187  distance = dist;
188  }
189  }
190  }
191  } else if (linkage == Linkage::AVERAGE) {
192  // Average distance between all pairs of points in the clusters
193  distance = 0.0;
194  int count = 0;
195  for (int idx_a : cluster_a.points) {
196  for (int idx_b : cluster_b.points) {
197  distance += euclidean_distance(idx_a, idx_b);
198  count++;
199  }
200  }
201  distance /= count;
202  }
203 
204  return distance;
205 }
206 
207 void HierarchicalClustering::merge_clusters() {
208  auto [idx_a, idx_b] = find_closest_clusters();
209 
210  // Merge cluster b into cluster a
211  clusters[idx_a]->points.insert(clusters[idx_a]->points.end(),
212  clusters[idx_b]->points.begin(),
213  clusters[idx_b]->points.end());
214 
215  // Remove cluster b
216  clusters.erase(clusters.begin() + idx_b);
217 }
218 
219 std::pair<int, int> HierarchicalClustering::find_closest_clusters() const {
220  double min_distance = std::numeric_limits<double>::max();
221  int idx_a = -1;
222  int idx_b = -1;
223 
224  for (size_t i = 0; i < clusters.size(); ++i) {
225  for (size_t j = i + 1; j < clusters.size(); ++j) {
226  double dist = cluster_distance(*clusters[i], *clusters[j]);
227  if (dist < min_distance) {
228  min_distance = dist;
229  idx_a = static_cast<int>(i);
230  idx_b = static_cast<int>(j);
231  }
232  }
233  }
234 
235  return {idx_a, idx_b};
236 }
237 
238 #endif // HIERARCHICAL_CLUSTERING_HPP
Agglomerative Hierarchical Clustering for clustering tasks.
Definition: HierarchicalClustering.hpp:19
~HierarchicalClustering()
Destructor for HierarchicalClustering.
Definition: HierarchicalClustering.hpp:103
std::vector< std::vector< double > > get_cluster_centers() const
Retrieves the cluster centers (centroids) after fitting.
Definition: HierarchicalClustering.hpp:133
Linkage
Linkage criteria for clustering.
Definition: HierarchicalClustering.hpp:24
void fit(const std::vector< std::vector< double >> &X)
Fits the clustering algorithm to the data.
Definition: HierarchicalClustering.hpp:105
HierarchicalClustering(int n_clusters=2, Linkage linkage=Linkage::AVERAGE)
Constructs a HierarchicalClustering instance.
Definition: HierarchicalClustering.hpp:100
std::vector< int > predict() const
Predicts the cluster labels for the data.
Definition: HierarchicalClustering.hpp:123