Cpp ML Library 1.0.0
A library of Machine Learning algorithms from the Udemy course Machine Learning A to Z.
RandomForestRegressor.hpp
#ifndef RANDOM_FOREST_REGRESSOR_HPP
#define RANDOM_FOREST_REGRESSOR_HPP

#include <vector>
#include <algorithm>
#include <numeric>
#include <limits>
#include <cmath>
#include <random>
#include <memory>

/**
 * @brief Implements a Random Forest Regressor.
 */
class RandomForestRegressor {
public:
    /**
     * @brief Constructs a RandomForestRegressor.
     */
    RandomForestRegressor(int n_estimators = 10, int max_depth = 5, int min_samples_split = 2, int max_features = -1);

    /**
     * @brief Destructor for RandomForestRegressor.
     */
    ~RandomForestRegressor() = default;

    /**
     * @brief Fits the model to the training data.
     */
    void fit(const std::vector<std::vector<double>>& X, const std::vector<double>& y);

    /**
     * @brief Predicts target values for given input data.
     */
    std::vector<double> predict(const std::vector<std::vector<double>>& X) const;

private:
    // A single node of a regression tree: either an internal split or a leaf.
    struct Node {
        bool is_leaf;
        double value;          // Mean target value (used when is_leaf is true)
        int feature_index;     // Feature used for the split (internal nodes)
        double threshold;      // Split threshold (internal nodes)
        std::unique_ptr<Node> left;
        std::unique_ptr<Node> right;

        Node()
            : is_leaf(false), value(0.0), feature_index(-1), threshold(0.0), left(nullptr), right(nullptr) {}
    };

    // A CART-style regression tree grown on one bootstrap sample.
    struct DecisionTree {
        std::unique_ptr<Node> root;
        int max_depth;
        int min_samples_split;
        int max_features;
        std::mt19937 random_engine;

        DecisionTree(int max_depth, int min_samples_split, int max_features);
        ~DecisionTree() = default;
        void fit(const std::vector<std::vector<double>>& X, const std::vector<double>& y);
        double predict_sample(const std::vector<double>& x) const;

    private:
        std::unique_ptr<Node> build_tree(const std::vector<std::vector<double>>& X, const std::vector<double>& y, int depth);
        double calculate_mse(const std::vector<double>& y) const;
        void split_dataset(const std::vector<std::vector<double>>& X, const std::vector<double>& y, int feature_index, double threshold,
                           std::vector<std::vector<double>>& X_left, std::vector<double>& y_left,
                           std::vector<std::vector<double>>& X_right, std::vector<double>& y_right) const;
    };

    int n_estimators;
    int max_depth;
    int min_samples_split;
    int max_features;
    std::vector<std::unique_ptr<DecisionTree>> trees;
    std::mt19937 random_engine;

    // Draws X.size() samples with replacement from (X, y) into (X_sample, y_sample).
    void bootstrap_sample(const std::vector<std::vector<double>>& X, const std::vector<double>& y,
                          std::vector<std::vector<double>>& X_sample, std::vector<double>& y_sample);
};

RandomForestRegressor::RandomForestRegressor(int n_estimators, int max_depth, int min_samples_split, int max_features)
    : n_estimators(n_estimators), max_depth(max_depth), min_samples_split(min_samples_split), max_features(max_features) {
    std::random_device rd;
    random_engine.seed(rd());
}

void RandomForestRegressor::fit(const std::vector<std::vector<double>>& X, const std::vector<double>& y) {
    // Default max_features to sqrt(n_features) when left at -1
    int actual_max_features = max_features;
    if (actual_max_features == -1) {
        actual_max_features = static_cast<int>(std::sqrt(X[0].size()));
    }

    // Discard trees from any previous fit, so predict() averages over exactly n_estimators trees
    trees.clear();

    // Grow each tree on its own bootstrap sample of the training data (bagging)
    for (int i = 0; i < n_estimators; ++i) {
        std::vector<std::vector<double>> X_sample;
        std::vector<double> y_sample;
        bootstrap_sample(X, y, X_sample, y_sample);

        auto tree = std::make_unique<DecisionTree>(max_depth, min_samples_split, actual_max_features);
        tree->fit(X_sample, y_sample);
        trees.push_back(std::move(tree));
    }
}

std::vector<double> RandomForestRegressor::predict(const std::vector<std::vector<double>>& X) const {
    // Ensemble prediction: average the per-tree predictions for every sample
    std::vector<double> predictions(X.size(), 0.0);
    for (const auto& tree : trees) {
        for (size_t i = 0; i < X.size(); ++i) {
            predictions[i] += tree->predict_sample(X[i]);
        }
    }
    for (auto& pred : predictions) {
        pred /= n_estimators;
    }
    return predictions;
}

void RandomForestRegressor::bootstrap_sample(const std::vector<std::vector<double>>& X, const std::vector<double>& y,
                                             std::vector<std::vector<double>>& X_sample, std::vector<double>& y_sample) {
    // Draw n_samples indices uniformly with replacement (standard bootstrap)
    size_t n_samples = X.size();
    std::uniform_int_distribution<size_t> dist(0, n_samples - 1);

    X_sample.reserve(n_samples);
    y_sample.reserve(n_samples);
    for (size_t i = 0; i < n_samples; ++i) {
        size_t index = dist(random_engine);
        X_sample.push_back(X[index]);
        y_sample.push_back(y[index]);
    }
}

RandomForestRegressor::DecisionTree::DecisionTree(int max_depth, int min_samples_split, int max_features)
    : root(nullptr), max_depth(max_depth), min_samples_split(min_samples_split), max_features(max_features) {
    std::random_device rd;
    random_engine.seed(rd());
}

void RandomForestRegressor::DecisionTree::fit(const std::vector<std::vector<double>>& X, const std::vector<double>& y) {
    root = build_tree(X, y, 0);
}

double RandomForestRegressor::DecisionTree::predict_sample(const std::vector<double>& x) const {
    // Walk from the root to a leaf, branching on the stored feature/threshold pairs
    const Node* node = root.get();
    while (!node->is_leaf) {
        if (x[node->feature_index] <= node->threshold) {
            node = node->left.get();
        } else {
            node = node->right.get();
        }
    }
    return node->value;
}

std::unique_ptr<RandomForestRegressor::Node> RandomForestRegressor::DecisionTree::build_tree(
    const std::vector<std::vector<double>>& X, const std::vector<double>& y, int depth) {
    auto node = std::make_unique<Node>();

    // Stopping criteria: maximum depth reached or too few samples to split
    if (depth >= max_depth || y.size() < static_cast<size_t>(min_samples_split)) {
        node->is_leaf = true;
        node->value = std::accumulate(y.begin(), y.end(), 0.0) / y.size();
        return node;
    }

    double best_mse = std::numeric_limits<double>::max();
    int best_feature_index = -1;
    double best_threshold = 0.0;
    std::vector<std::vector<double>> best_X_left, best_X_right;
    std::vector<double> best_y_left, best_y_right;

    int num_features = static_cast<int>(X[0].size());
    std::vector<int> features_indices(num_features);
    std::iota(features_indices.begin(), features_indices.end(), 0);

    // Randomly select a subset of features without replacement (feature bagging)
    std::shuffle(features_indices.begin(), features_indices.end(), random_engine);
    if (max_features < num_features) {
        features_indices.resize(max_features);
    }

    for (int feature_index : features_indices) {
        // Candidate thresholds are the midpoints between consecutive distinct feature values
        std::vector<double> feature_values;
        feature_values.reserve(X.size());
        for (const auto& x : X) {
            feature_values.push_back(x[feature_index]);
        }
        std::sort(feature_values.begin(), feature_values.end());
        feature_values.erase(std::unique(feature_values.begin(), feature_values.end()), feature_values.end());

        // With fewer than two distinct values there is nothing to split on;
        // skipping also avoids the size() - 1 underflow in the reserve below
        if (feature_values.size() < 2) {
            continue;
        }

        std::vector<double> thresholds;
        thresholds.reserve(feature_values.size() - 1);
        for (size_t i = 1; i < feature_values.size(); ++i) {
            thresholds.push_back((feature_values[i - 1] + feature_values[i]) / 2.0);
        }

        // Evaluate each threshold by the size-weighted MSE of the resulting partition
        for (double threshold : thresholds) {
            std::vector<std::vector<double>> X_left, X_right;
            std::vector<double> y_left, y_right;
            split_dataset(X, y, feature_index, threshold, X_left, y_left, X_right, y_right);

            if (y_left.empty() || y_right.empty())
                continue;

            double mse_left = calculate_mse(y_left);
            double mse_right = calculate_mse(y_right);
            double mse = (mse_left * y_left.size() + mse_right * y_right.size()) / y.size();

            if (mse < best_mse) {
                best_mse = mse;
                best_feature_index = feature_index;
                best_threshold = threshold;
                best_X_left = std::move(X_left);
                best_X_right = std::move(X_right);
                best_y_left = std::move(y_left);
                best_y_right = std::move(y_right);
            }
        }
    }

    // If no valid split was found, make this a leaf node
    if (best_feature_index == -1) {
        node->is_leaf = true;
        node->value = std::accumulate(y.begin(), y.end(), 0.0) / y.size();
        return node;
    }

    // Recursively build the left and right subtrees
    node->feature_index = best_feature_index;
    node->threshold = best_threshold;
    node->left = build_tree(best_X_left, best_y_left, depth + 1);
    node->right = build_tree(best_X_right, best_y_right, depth + 1);
    return node;
}

// Mean squared error of y around its own mean, i.e. the variance of y:
//   MSE(y) = (1/n) * sum_i (y_i - mean(y))^2
// Note: std::transform_reduce requires C++17.
double RandomForestRegressor::DecisionTree::calculate_mse(const std::vector<double>& y) const {
    double mean = std::accumulate(y.begin(), y.end(), 0.0) / y.size();
    double mse = std::transform_reduce(y.begin(), y.end(), 0.0, std::plus<>(), [mean](double val) {
        double diff = val - mean;
        return diff * diff;
    });
    return mse / y.size();
}

void RandomForestRegressor::DecisionTree::split_dataset(const std::vector<std::vector<double>>& X, const std::vector<double>& y,
                                                        int feature_index, double threshold,
                                                        std::vector<std::vector<double>>& X_left, std::vector<double>& y_left,
                                                        std::vector<std::vector<double>>& X_right, std::vector<double>& y_right) const {
    // Partition samples by comparing the chosen feature against the threshold
    for (size_t i = 0; i < X.size(); ++i) {
        if (X[i][feature_index] <= threshold) {
            X_left.push_back(X[i]);
            y_left.push_back(y[i]);
        } else {
            X_right.push_back(X[i]);
            y_right.push_back(y[i]);
        }
    }
}

#endif // RANDOM_FOREST_REGRESSOR_HPP
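
Example usage

A minimal sketch of how this header might be used; it is not part of the original file. It assumes the listing above is saved as RandomForestRegressor.hpp next to the example and compiled with C++17 or later (which std::transform_reduce requires), e.g. g++ -std=c++17 example.cpp.

#include <iostream>
#include <vector>
#include "RandomForestRegressor.hpp"

int main() {
    // Toy 1-D training set: y = 2 * x
    std::vector<std::vector<double>> X = {{1.0}, {2.0}, {3.0}, {4.0}, {5.0}, {6.0}};
    std::vector<double> y = {2.0, 4.0, 6.0, 8.0, 10.0, 12.0};

    // 50 trees of depth at most 4; max_features = -1 lets fit() default to sqrt(n_features)
    RandomForestRegressor model(50, 4, 2, -1);
    model.fit(X, y);

    std::vector<double> predictions = model.predict({{2.5}, {5.5}});
    for (double p : predictions) {
        // Each prediction should land between the neighboring training targets
        // (roughly 4-6 and 10-12), since each tree fits a piecewise-constant function
        std::cout << p << '\n';
    }
    return 0;
}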