1 #ifndef RANDOM_FOREST_REGRESSOR_HPP
2 #define RANDOM_FOREST_REGRESSOR_HPP
30 RandomForestRegressor(
int n_estimators = 10,
int max_depth = 5,
int min_samples_split = 2,
int max_features = -1);
42 void fit(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y);
49 std::vector<double>
predict(
const std::vector<std::vector<double>>& X)
const;
/// A single node of a regression decision tree.
/// NOTE(review): the struct header and scalar members were lost in extraction;
/// they are reconstructed here from the visible constructor initializer list.
struct Node {
    bool is_leaf;                 // true when this node stores a prediction instead of a split
    double value;                 // prediction held at a leaf (mean of its targets)
    int feature_index;            // feature tested at an internal node (-1 when unset)
    double threshold;             // go left when x[feature_index] <= threshold
    std::unique_ptr<Node> left;   // subtree for samples <= threshold
    std::unique_ptr<Node> right;  // subtree for samples > threshold

    Node()
        : is_leaf(false), value(0.0), feature_index(-1), threshold(0.0), left(nullptr), right(nullptr) {}
};
65 std::unique_ptr<Node> root;
67 int min_samples_split;
69 std::mt19937 random_engine;
71 DecisionTree(
int max_depth,
int min_samples_split,
int max_features);
72 ~DecisionTree() =
default;
73 void fit(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y);
74 double predict_sample(
const std::vector<double>& x)
const;
77 std::unique_ptr<Node> build_tree(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y,
int depth);
78 double calculate_mse(
const std::vector<double>& y)
const;
79 void split_dataset(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y,
int feature_index,
double threshold,
80 std::vector<std::vector<double>>& X_left, std::vector<double>& y_left,
81 std::vector<std::vector<double>>& X_right, std::vector<double>& y_right)
const;
86 int min_samples_split;
88 std::vector<std::unique_ptr<DecisionTree>> trees;
89 std::mt19937 random_engine;
91 void bootstrap_sample(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y,
92 std::vector<std::vector<double>>& X_sample, std::vector<double>& y_sample);
96 : n_estimators(n_estimators), max_depth(max_depth), min_samples_split(min_samples_split), max_features(max_features) {
97 std::random_device rd;
98 random_engine.seed(rd());
103 int actual_max_features = max_features;
104 if (actual_max_features == -1) {
105 actual_max_features =
static_cast<int>(std::sqrt(X[0].size()));
108 for (
int i = 0; i < n_estimators; ++i) {
109 std::vector<std::vector<double>> X_sample;
110 std::vector<double> y_sample;
111 bootstrap_sample(X, y, X_sample, y_sample);
113 auto tree = std::make_unique<DecisionTree>(max_depth, min_samples_split, actual_max_features);
114 tree->fit(X_sample, y_sample);
115 trees.push_back(std::move(tree));
120 std::vector<double> predictions(X.size(), 0.0);
121 for (
const auto& tree : trees) {
122 for (
size_t i = 0; i < X.size(); ++i) {
123 predictions[i] += tree->predict_sample(X[i]);
126 for (
auto& pred : predictions) {
127 pred /= n_estimators;
132 void RandomForestRegressor::bootstrap_sample(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y,
133 std::vector<std::vector<double>>& X_sample, std::vector<double>& y_sample) {
134 size_t n_samples = X.size();
135 std::uniform_int_distribution<size_t> dist(0, n_samples - 1);
137 for (
size_t i = 0; i < n_samples; ++i) {
138 size_t index = dist(random_engine);
139 X_sample.push_back(X[index]);
140 y_sample.push_back(y[index]);
144 RandomForestRegressor::DecisionTree::DecisionTree(
int max_depth,
int min_samples_split,
int max_features)
145 : root(nullptr), max_depth(max_depth), min_samples_split(min_samples_split), max_features(max_features) {
146 std::random_device rd;
147 random_engine.seed(rd());
150 void RandomForestRegressor::DecisionTree::fit(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y) {
151 root = build_tree(X, y, 0);
154 double RandomForestRegressor::DecisionTree::predict_sample(
const std::vector<double>& x)
const {
155 const Node* node = root.get();
156 while (!node->is_leaf) {
157 if (x[node->feature_index] <= node->threshold) {
158 node = node->left.get();
160 node = node->right.get();
166 std::unique_ptr<RandomForestRegressor::Node> RandomForestRegressor::DecisionTree::build_tree(
167 const std::vector<std::vector<double>>& X,
const std::vector<double>& y,
int depth) {
168 auto node = std::make_unique<Node>();
171 if (depth >= max_depth || y.size() <
static_cast<size_t>(min_samples_split)) {
172 node->is_leaf =
true;
173 node->value = std::accumulate(y.begin(), y.end(), 0.0) / y.size();
177 double best_mse = std::numeric_limits<double>::max();
178 int best_feature_index = -1;
179 double best_threshold = 0.0;
180 std::vector<std::vector<double>> best_X_left, best_X_right;
181 std::vector<double> best_y_left, best_y_right;
183 int num_features = X[0].size();
184 std::vector<int> features_indices(num_features);
185 std::iota(features_indices.begin(), features_indices.end(), 0);
188 std::shuffle(features_indices.begin(), features_indices.end(), random_engine);
189 if (max_features < num_features) {
190 features_indices.resize(max_features);
193 for (
int feature_index : features_indices) {
195 std::vector<double> feature_values;
196 feature_values.reserve(X.size());
197 for (
const auto& x : X) {
198 feature_values.push_back(x[feature_index]);
200 std::sort(feature_values.begin(), feature_values.end());
201 feature_values.erase(std::unique(feature_values.begin(), feature_values.end()), feature_values.end());
203 std::vector<double> thresholds;
204 thresholds.reserve(feature_values.size() - 1);
205 for (
size_t i = 1; i < feature_values.size(); ++i) {
206 thresholds.push_back((feature_values[i - 1] + feature_values[i]) / 2.0);
210 for (
double threshold : thresholds) {
211 std::vector<std::vector<double>> X_left, X_right;
212 std::vector<double> y_left, y_right;
213 split_dataset(X, y, feature_index, threshold, X_left, y_left, X_right, y_right);
215 if (y_left.empty() || y_right.empty())
218 double mse_left = calculate_mse(y_left);
219 double mse_right = calculate_mse(y_right);
220 double mse = (mse_left * y_left.size() + mse_right * y_right.size()) / y.size();
222 if (mse < best_mse) {
224 best_feature_index = feature_index;
225 best_threshold = threshold;
226 best_X_left = std::move(X_left);
227 best_X_right = std::move(X_right);
228 best_y_left = std::move(y_left);
229 best_y_right = std::move(y_right);
235 if (best_feature_index == -1) {
236 node->is_leaf =
true;
237 node->value = std::accumulate(y.begin(), y.end(), 0.0) / y.size();
242 node->feature_index = best_feature_index;
243 node->threshold = best_threshold;
244 node->left = build_tree(best_X_left, best_y_left, depth + 1);
245 node->right = build_tree(best_X_right, best_y_right, depth + 1);
249 double RandomForestRegressor::DecisionTree::calculate_mse(
const std::vector<double>& y)
const {
250 double mean = std::accumulate(y.begin(), y.end(), 0.0) / y.size();
251 double mse = std::transform_reduce(y.begin(), y.end(), 0.0, std::plus<>(), [mean](
double val) {
252 double diff = val - mean;
255 return mse / y.size();
258 void RandomForestRegressor::DecisionTree::split_dataset(
const std::vector<std::vector<double>>& X,
const std::vector<double>& y,
259 int feature_index,
double threshold,
260 std::vector<std::vector<double>>& X_left, std::vector<double>& y_left,
261 std::vector<std::vector<double>>& X_right, std::vector<double>& y_right)
const {
262 for (
size_t i = 0; i < X.size(); ++i) {
263 if (X[i][feature_index] <= threshold) {
264 X_left.push_back(X[i]);
265 y_left.push_back(y[i]);
267 X_right.push_back(X[i]);
268 y_right.push_back(y[i]);
Implements a Random Forest Regressor.
Definition: RandomForestRegressor.hpp:21
~RandomForestRegressor()=default
Destructor for RandomForestRegressor.
std::vector< double > predict(const std::vector< std::vector< double >> &X) const
Predicts target values for given input data.
Definition: RandomForestRegressor.hpp:119
void fit(const std::vector< std::vector< double >> &X, const std::vector< double > &y)
Fits the model to the training data.
Definition: RandomForestRegressor.hpp:101
RandomForestRegressor(int n_estimators=10, int max_depth=5, int min_samples_split=2, int max_features=-1)
Constructs a RandomForestRegressor.
Definition: RandomForestRegressor.hpp:95