Cpp ML Library  1.0.0
A library of Machine Learning Algorithms from the Udemy course Machine Learning A to Z.
RandomForestClassifier.hpp
Go to the documentation of this file.
1 #ifndef RANDOM_FOREST_CLASSIFIER_HPP
2 #define RANDOM_FOREST_CLASSIFIER_HPP
3 
4 #include <vector>
5 #include <algorithm>
6 #include <numeric>
7 #include <limits>
8 #include <unordered_map>
9 #include <cmath>
10 #include <random>
11 #include <memory>
12 
public:
 /// @brief Constructs a RandomForestClassifier ensemble.
 /// @param n_estimators Number of decision trees grown during fit().
 /// @param max_depth Maximum depth of each tree.
 /// @param min_samples_split Minimum number of samples a node needs to be split further.
 /// @param max_features Number of features considered per split; -1 means sqrt(total features) (resolved in fit()).
 RandomForestClassifier(int n_estimators = 10, int max_depth = 5, int min_samples_split = 2, int max_features = -1);

 /// @brief Trains the forest: each tree is fit on a bootstrap sample of (X, y).
 /// @param X Feature matrix, one inner vector per sample.
 /// @param y Integer class label for each sample (same length as X).
 void fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y);

 /// @brief Predicts a class label for every row of X by majority vote across all trees.
 /// @return One predicted label per input row.
 std::vector<int> predict(const std::vector<std::vector<double>>& X) const;

private:
 /// A single binary decision node. Internal nodes route samples by
 /// comparing x[feature_index] <= threshold (left) vs > threshold (right);
 /// leaf nodes carry the predicted class in `value`.
 struct Node {
 bool is_leaf;
 int value; // Class label for leaf nodes
 int feature_index; // Feature tested at internal nodes (-1 until set)
 double threshold; // Split point for the tested feature
 std::unique_ptr<Node> left; // Subtree for values <= threshold
 std::unique_ptr<Node> right; // Subtree for values > threshold

 Node() : is_leaf(false), value(0), feature_index(-1), threshold(0.0) {}
 };

 /// One CART-style classification tree grown with Gini impurity,
 /// using a random feature subset (of size max_features) at every split.
 struct DecisionTree {
 std::unique_ptr<Node> root;
 int max_depth;
 int min_samples_split;
 int max_features; // Features sampled per split (already resolved by the forest)
 std::mt19937 random_engine; // Per-tree RNG; seeded by the forest so trees differ

 DecisionTree(int max_depth, int min_samples_split, int max_features, std::mt19937::result_type seed);
 ~DecisionTree() = default;
 void fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y);
 int predict_sample(const std::vector<double>& x) const;

 private:
 std::unique_ptr<Node> build_tree(const std::vector<std::vector<double>>& X, const std::vector<int>& y, int depth);
 double calculate_gini(const std::vector<int>& y) const;
 void split_dataset(const std::vector<std::vector<double>>& X, const std::vector<int>& y, int feature_index, double threshold,
 std::vector<std::vector<double>>& X_left, std::vector<int>& y_left,
 std::vector<std::vector<double>>& X_right, std::vector<int>& y_right) const;
 int majority_class(const std::vector<int>& y) const;
 };

 int n_estimators;
 int max_depth;
 int min_samples_split;
 int max_features; // -1 sentinel preserved here; resolved per-fit
 std::vector<std::unique_ptr<DecisionTree>> trees; // The trained ensemble
 std::mt19937 random_engine; // Forest-level RNG: bootstrap rows + tree seeds

 // Fills X_sample/y_sample with X.size() rows drawn uniformly with replacement.
 void bootstrap_sample(const std::vector<std::vector<double>>& X, const std::vector<int>& y,
 std::vector<std::vector<double>>& X_sample, std::vector<int>& y_sample);
};
95 
96 RandomForestClassifier::RandomForestClassifier(int n_estimators, int max_depth, int min_samples_split, int max_features)
97  : n_estimators(n_estimators), max_depth(max_depth), min_samples_split(min_samples_split), max_features(max_features) {
98  std::random_device rd;
99  random_engine.seed(rd());
100 }
101 
102 void RandomForestClassifier::fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y) {
103  // Set max_features if not set
104  int actual_max_features = max_features;
105  if (actual_max_features == -1) {
106  actual_max_features = static_cast<int>(std::sqrt(X[0].size()));
107  }
108 
109  for (int i = 0; i < n_estimators; ++i) {
110  std::vector<std::vector<double>> X_sample;
111  std::vector<int> y_sample;
112  bootstrap_sample(X, y, X_sample, y_sample);
113 
114  auto tree = std::make_unique<DecisionTree>(max_depth, min_samples_split, actual_max_features, random_engine());
115  tree->fit(X_sample, y_sample);
116  trees.push_back(std::move(tree));
117  }
118 }
119 
120 std::vector<int> RandomForestClassifier::predict(const std::vector<std::vector<double>>& X) const {
121  std::vector<int> predictions(X.size());
122  for (size_t i = 0; i < X.size(); ++i) {
123  std::unordered_map<int, int> votes;
124  for (const auto& tree : trees) {
125  int vote = tree->predict_sample(X[i]);
126  votes[vote]++;
127  }
128  // Majority vote
129  predictions[i] = std::max_element(votes.begin(), votes.end(),
130  [](const auto& a, const auto& b) {
131  return a.second < b.second;
132  })->first;
133  }
134  return predictions;
135 }
136 
137 void RandomForestClassifier::bootstrap_sample(const std::vector<std::vector<double>>& X, const std::vector<int>& y,
138  std::vector<std::vector<double>>& X_sample, std::vector<int>& y_sample) {
139  size_t n_samples = X.size();
140  std::uniform_int_distribution<size_t> dist(0, n_samples - 1);
141 
142  for (size_t i = 0; i < n_samples; ++i) {
143  size_t index = dist(random_engine);
144  X_sample.push_back(X[index]);
145  y_sample.push_back(y[index]);
146  }
147 }
148 
149 RandomForestClassifier::DecisionTree::DecisionTree(int max_depth, int min_samples_split, int max_features, std::mt19937::result_type seed)
150  : root(nullptr), max_depth(max_depth), min_samples_split(min_samples_split), max_features(max_features), random_engine(seed) {}
151 
152 void RandomForestClassifier::DecisionTree::fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y) {
153  root = build_tree(X, y, 0);
154 }
155 
156 int RandomForestClassifier::DecisionTree::predict_sample(const std::vector<double>& x) const {
157  const Node* node = root.get();
158  while (!node->is_leaf) {
159  if (x[node->feature_index] <= node->threshold) {
160  node = node->left.get();
161  } else {
162  node = node->right.get();
163  }
164  }
165  return node->value;
166 }
167 
/// @brief Recursively grows the tree with a greedy, Gini-minimizing split
/// search over a random subset of features (random-forest style).
/// @param X Rows reaching this node. @param y Labels for those rows.
/// @param depth Current depth (root = 0).
/// @return A leaf (majority label) or an internal node with two subtrees.
std::unique_ptr<RandomForestClassifier::Node> RandomForestClassifier::DecisionTree::build_tree(
    const std::vector<std::vector<double>>& X, const std::vector<int>& y, int depth) {
    auto node = std::make_unique<Node>();

    // Check stopping criteria: depth limit reached, too few samples to
    // split, or the node is already pure (Gini == 0 exactly for one class).
    if (depth >= max_depth || y.size() < static_cast<size_t>(min_samples_split) || calculate_gini(y) == 0.0) {
        node->is_leaf = true;
        node->value = majority_class(y);
        return node;
    }

    // Best split found so far; the winning partitions are kept (moved in)
    // so they can be reused directly for the recursive calls below.
    double best_gini = std::numeric_limits<double>::max();
    int best_feature_index = -1;
    double best_threshold = 0.0;
    std::vector<std::vector<double>> best_X_left, best_X_right;
    std::vector<int> best_y_left, best_y_right;

    int num_features = X[0].size();
    std::vector<int> features_indices(num_features);
    std::iota(features_indices.begin(), features_indices.end(), 0);

    // Randomly select features without replacement: shuffle, then keep the
    // first max_features (all features are used if max_features >= total).
    std::shuffle(features_indices.begin(), features_indices.end(), random_engine);
    if (max_features < num_features) {
        features_indices.resize(max_features);
    }

    for (int feature_index : features_indices) {
        // Get all possible thresholds: sorted, de-duplicated feature values.
        std::vector<double> feature_values;
        feature_values.reserve(X.size());
        for (const auto& x : X) {
            feature_values.push_back(x[feature_index]);
        }
        std::sort(feature_values.begin(), feature_values.end());
        feature_values.erase(std::unique(feature_values.begin(), feature_values.end()), feature_values.end());

        // A constant feature cannot separate the samples — skip it.
        if (feature_values.size() <= 1) continue;

        // Candidate thresholds: midpoints between consecutive unique values.
        std::vector<double> thresholds;
        thresholds.reserve(feature_values.size() - 1);
        for (size_t i = 1; i < feature_values.size(); ++i) {
            thresholds.push_back((feature_values[i - 1] + feature_values[i]) / 2.0);
        }

        // Evaluate each threshold by the size-weighted Gini of its children.
        for (double threshold : thresholds) {
            std::vector<std::vector<double>> X_left, X_right;
            std::vector<int> y_left, y_right;
            split_dataset(X, y, feature_index, threshold, X_left, y_left, X_right, y_right);

            // Degenerate split: everything went to one side — not a real split.
            if (y_left.empty() || y_right.empty())
                continue;

            double gini_left = calculate_gini(y_left);
            double gini_right = calculate_gini(y_right);
            double gini = (gini_left * y_left.size() + gini_right * y_right.size()) / y.size();

            if (gini < best_gini) {
                best_gini = gini;
                best_feature_index = feature_index;
                best_threshold = threshold;
                // Moving is safe: these locals are rebuilt on the next iteration.
                best_X_left = std::move(X_left);
                best_X_right = std::move(X_right);
                best_y_left = std::move(y_left);
                best_y_right = std::move(y_right);
            }
        }
    }

    // If no split improves the Gini impurity, make this a leaf node
    if (best_feature_index == -1) {
        node->is_leaf = true;
        node->value = majority_class(y);
        return node;
    }

    // Recursively build the left and right subtrees
    node->feature_index = best_feature_index;
    node->threshold = best_threshold;
    node->left = build_tree(best_X_left, best_y_left, depth + 1);
    node->right = build_tree(best_X_right, best_y_right, depth + 1);
    return node;
}
252 
253 double RandomForestClassifier::DecisionTree::calculate_gini(const std::vector<int>& y) const {
254  std::unordered_map<int, int> class_counts;
255  for (int label : y) {
256  class_counts[label]++;
257  }
258  double impurity = 1.0;
259  size_t total = y.size();
260  for (const auto& [label, count] : class_counts) {
261  double prob = static_cast<double>(count) / total;
262  impurity -= prob * prob;
263  }
264  return impurity;
265 }
266 
267 int RandomForestClassifier::DecisionTree::majority_class(const std::vector<int>& y) const {
268  std::unordered_map<int, int> class_counts;
269  for (int label : y) {
270  class_counts[label]++;
271  }
272  return std::max_element(class_counts.begin(), class_counts.end(),
273  [](const auto& a, const auto& b) {
274  return a.second < b.second;
275  })->first;
276 }
277 
278 void RandomForestClassifier::DecisionTree::split_dataset(const std::vector<std::vector<double>>& X, const std::vector<int>& y,
279  int feature_index, double threshold,
280  std::vector<std::vector<double>>& X_left, std::vector<int>& y_left,
281  std::vector<std::vector<double>>& X_right, std::vector<int>& y_right) const {
282  for (size_t i = 0; i < X.size(); ++i) {
283  if (X[i][feature_index] <= threshold) {
284  X_left.push_back(X[i]);
285  y_left.push_back(y[i]);
286  } else {
287  X_right.push_back(X[i]);
288  y_right.push_back(y[i]);
289  }
290  }
291 }
292 
293 #endif // RANDOM_FOREST_CLASSIFIER_HPP
Implements a Random Forest Classifier.
Definition: RandomForestClassifier.hpp:22
std::vector< int > predict(const std::vector< std::vector< double >> &X) const
Predicts class labels for given input data.
Definition: RandomForestClassifier.hpp:120
~RandomForestClassifier()=default
Destructor for RandomForestClassifier.
RandomForestClassifier(int n_estimators=10, int max_depth=5, int min_samples_split=2, int max_features=-1)
Constructs a RandomForestClassifier.
Definition: RandomForestClassifier.hpp:96
void fit(const std::vector< std::vector< double >> &X, const std::vector< int > &y)
Fits the model to the training data.
Definition: RandomForestClassifier.hpp:102