#ifndef RANDOM_FOREST_CLASSIFIER_HPP
#define RANDOM_FOREST_CLASSIFIER_HPP

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <unordered_map>
#include <vector>
class RandomForestClassifier {
public:
    // n_estimators: number of trees; max_depth: depth limit per tree;
    // min_samples_split: minimum samples required to split a node;
    // max_features: features considered per split (-1 means sqrt of the feature count).
    RandomForestClassifier(int n_estimators = 10,
                           int max_depth = 5,
                           int min_samples_split = 2,
                           int max_features = -1);

    ~RandomForestClassifier() = default;

    // Trains n_estimators decision trees on bootstrap samples of (X, y).
    void fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y);

    // Returns the majority-vote class label for each row of X.
    std::vector<int> predict(const std::vector<std::vector<double>>& X) const;
private:
    // A single tree node: either an internal split on (feature_index, threshold)
    // or a leaf holding a class label in `value`.
    struct Node {
        bool is_leaf;
        int value;
        int feature_index;
        double threshold;
        std::unique_ptr<Node> left;
        std::unique_ptr<Node> right;

        Node() : is_leaf(false), value(0), feature_index(-1), threshold(0.0) {}
    };
    // A single decision tree grown greedily on Gini impurity.
    class DecisionTree {
    public:
        std::unique_ptr<Node> root;
        int max_depth;
        int min_samples_split;
        int max_features;
        std::mt19937 random_engine;

        DecisionTree(int max_depth, int min_samples_split, int max_features,
                     std::mt19937::result_type seed);
        ~DecisionTree() = default;

        void fit(const std::vector<std::vector<double>>& X, const std::vector<int>& y);
        int predict_sample(const std::vector<double>& x) const;

    private:
        std::unique_ptr<Node> build_tree(const std::vector<std::vector<double>>& X,
                                         const std::vector<int>& y, int depth);
        double calculate_gini(const std::vector<int>& y) const;
        void split_dataset(const std::vector<std::vector<double>>& X, const std::vector<int>& y,
                           int feature_index, double threshold,
                           std::vector<std::vector<double>>& X_left, std::vector<int>& y_left,
                           std::vector<std::vector<double>>& X_right, std::vector<int>& y_right) const;
        int majority_class(const std::vector<int>& y) const;
    };
    int n_estimators;
    int max_depth;
    int min_samples_split;
    int max_features;
    std::vector<std::unique_ptr<DecisionTree>> trees;
    std::mt19937 random_engine;

    // Fills X_sample/y_sample with X.size() rows drawn from (X, y) with replacement.
    void bootstrap_sample(const std::vector<std::vector<double>>& X, const std::vector<int>& y,
                          std::vector<std::vector<double>>& X_sample, std::vector<int>& y_sample);
};
RandomForestClassifier::RandomForestClassifier(int n_estimators, int max_depth,
                                               int min_samples_split, int max_features)
    : n_estimators(n_estimators), max_depth(max_depth),
      min_samples_split(min_samples_split), max_features(max_features) {
    // Seed the forest's engine non-deterministically; per-tree seeds are drawn from it.
    std::random_device rd;
    random_engine.seed(rd());
}
void RandomForestClassifier::fit(const std::vector<std::vector<double>>& X,
                                 const std::vector<int>& y) {
    // max_features == -1 means "use sqrt(number of features)" at each split.
    int actual_max_features = max_features;
    if (actual_max_features == -1) {
        actual_max_features = static_cast<int>(std::sqrt(X[0].size()));
    }

    for (int i = 0; i < n_estimators; ++i) {
        // Bagging: each tree sees a bootstrap sample of the training data.
        std::vector<std::vector<double>> X_sample;
        std::vector<int> y_sample;
        bootstrap_sample(X, y, X_sample, y_sample);

        auto tree = std::make_unique<DecisionTree>(max_depth, min_samples_split,
                                                   actual_max_features, random_engine());
        tree->fit(X_sample, y_sample);
        trees.push_back(std::move(tree));
    }
}
std::vector<int> RandomForestClassifier::predict(
    const std::vector<std::vector<double>>& X) const {
    std::vector<int> predictions(X.size());
    for (size_t i = 0; i < X.size(); ++i) {
        // Collect one vote per tree and keep the most frequent class label.
        std::unordered_map<int, int> votes;
        for (const auto& tree : trees) {
            int vote = tree->predict_sample(X[i]);
            votes[vote]++;
        }
        predictions[i] = std::max_element(votes.begin(), votes.end(),
                                          [](const auto& a, const auto& b) {
                                              return a.second < b.second;
                                          })->first;
    }
    return predictions;
}
void RandomForestClassifier::bootstrap_sample(
    const std::vector<std::vector<double>>& X, const std::vector<int>& y,
    std::vector<std::vector<double>>& X_sample, std::vector<int>& y_sample) {
    size_t n_samples = X.size();
    std::uniform_int_distribution<size_t> dist(0, n_samples - 1);

    // Draw n_samples indices uniformly with replacement.
    for (size_t i = 0; i < n_samples; ++i) {
        size_t index = dist(random_engine);
        X_sample.push_back(X[index]);
        y_sample.push_back(y[index]);
    }
}
RandomForestClassifier::DecisionTree::DecisionTree(int max_depth, int min_samples_split,
                                                   int max_features,
                                                   std::mt19937::result_type seed)
    : root(nullptr), max_depth(max_depth), min_samples_split(min_samples_split),
      max_features(max_features), random_engine(seed) {}
void RandomForestClassifier::DecisionTree::fit(const std::vector<std::vector<double>>& X,
                                               const std::vector<int>& y) {
    root = build_tree(X, y, 0);
}
int RandomForestClassifier::DecisionTree::predict_sample(const std::vector<double>& x) const {
    // Walk from the root, going left when the feature value is <= the threshold.
    const Node* node = root.get();
    while (!node->is_leaf) {
        if (x[node->feature_index] <= node->threshold) {
            node = node->left.get();
        } else {
            node = node->right.get();
        }
    }
    return node->value;
}
std::unique_ptr<RandomForestClassifier::Node> RandomForestClassifier::DecisionTree::build_tree(
    const std::vector<std::vector<double>>& X, const std::vector<int>& y, int depth) {
    auto node = std::make_unique<Node>();

    // Stop when the depth limit is reached, too few samples remain, or the node is pure.
    if (depth >= max_depth || y.size() < static_cast<size_t>(min_samples_split) ||
        calculate_gini(y) == 0.0) {
        node->is_leaf = true;
        node->value = majority_class(y);
        return node;
    }

    double best_gini = std::numeric_limits<double>::max();
    int best_feature_index = -1;
    double best_threshold = 0.0;
    std::vector<std::vector<double>> best_X_left, best_X_right;
    std::vector<int> best_y_left, best_y_right;

    int num_features = static_cast<int>(X[0].size());
    std::vector<int> features_indices(num_features);
    std::iota(features_indices.begin(), features_indices.end(), 0);

    // Random-subspace step: shuffle the feature indices and keep at most max_features of them.
    std::shuffle(features_indices.begin(), features_indices.end(), random_engine);
    if (max_features < num_features) {
        features_indices.resize(max_features);
    }

    for (int feature_index : features_indices) {
        // Candidate thresholds are midpoints between consecutive unique feature values.
        std::vector<double> feature_values;
        feature_values.reserve(X.size());
        for (const auto& x : X) {
            feature_values.push_back(x[feature_index]);
        }
        std::sort(feature_values.begin(), feature_values.end());
        feature_values.erase(std::unique(feature_values.begin(), feature_values.end()),
                             feature_values.end());

        if (feature_values.size() <= 1) continue;

        std::vector<double> thresholds;
        thresholds.reserve(feature_values.size() - 1);
        for (size_t i = 1; i < feature_values.size(); ++i) {
            thresholds.push_back((feature_values[i - 1] + feature_values[i]) / 2.0);
        }

        for (double threshold : thresholds) {
            std::vector<std::vector<double>> X_left, X_right;
            std::vector<int> y_left, y_right;
            split_dataset(X, y, feature_index, threshold, X_left, y_left, X_right, y_right);

            if (y_left.empty() || y_right.empty()) continue;

            // Score the split by the size-weighted Gini impurity of the two children.
            double gini_left = calculate_gini(y_left);
            double gini_right = calculate_gini(y_right);
            double gini = (gini_left * y_left.size() + gini_right * y_right.size()) / y.size();

            if (gini < best_gini) {
                best_gini = gini;
                best_feature_index = feature_index;
                best_threshold = threshold;
                best_X_left = std::move(X_left);
                best_X_right = std::move(X_right);
                best_y_left = std::move(y_left);
                best_y_right = std::move(y_right);
            }
        }
    }

    // No usable split was found: turn this node into a leaf.
    if (best_feature_index == -1) {
        node->is_leaf = true;
        node->value = majority_class(y);
        return node;
    }

    node->feature_index = best_feature_index;
    node->threshold = best_threshold;
    node->left = build_tree(best_X_left, best_y_left, depth + 1);
    node->right = build_tree(best_X_right, best_y_right, depth + 1);
    return node;
}
double RandomForestClassifier::DecisionTree::calculate_gini(const std::vector<int>& y) const {
    std::unordered_map<int, int> class_counts;
    for (int label : y) {
        class_counts[label]++;
    }

    // Gini impurity: 1 minus the sum over classes of the squared class probability.
    double impurity = 1.0;
    size_t total = y.size();
    for (const auto& [label, count] : class_counts) {
        double prob = static_cast<double>(count) / total;
        impurity -= prob * prob;
    }
    return impurity;
}
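// Worked example (illustrative, not from the original source): for labels {0, 0, 1, 1}
// both class probabilities are 0.5, so the impurity is 1 - (0.25 + 0.25) = 0.5; for a
// pure node such as {1, 1, 1} it is 1 - 1 = 0, which is the stopping condition that
// build_tree checks for.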
int RandomForestClassifier::DecisionTree::majority_class(const std::vector<int>& y) const {
    std::unordered_map<int, int> class_counts;
    for (int label : y) {
        class_counts[label]++;
    }
    return std::max_element(class_counts.begin(), class_counts.end(),
                            [](const auto& a, const auto& b) {
                                return a.second < b.second;
                            })->first;
}
void RandomForestClassifier::DecisionTree::split_dataset(
    const std::vector<std::vector<double>>& X, const std::vector<int>& y,
    int feature_index, double threshold,
    std::vector<std::vector<double>>& X_left, std::vector<int>& y_left,
    std::vector<std::vector<double>>& X_right, std::vector<int>& y_right) const {
    // Partition rows by comparing the chosen feature against the threshold.
    for (size_t i = 0; i < X.size(); ++i) {
        if (X[i][feature_index] <= threshold) {
            X_left.push_back(X[i]);
            y_left.push_back(y[i]);
        } else {
            X_right.push_back(X[i]);
            y_right.push_back(y[i]);
        }
    }
}

#endif // RANDOM_FOREST_CLASSIFIER_HPP
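A minimal usage sketch follows. It assumes the listing above is saved as "RandomForestClassifier.hpp"; the file name, the toy dataset, and the chosen hyperparameters are illustrative and not part of the original source.

#include "RandomForestClassifier.hpp"

#include <iostream>
#include <vector>

int main() {
    // Two features per sample; the labels form two clearly separable classes.
    std::vector<std::vector<double>> X = {
        {1.0, 2.0}, {1.5, 1.8}, {1.2, 2.2}, {8.0, 8.5}, {8.3, 9.1}, {7.9, 8.0}};
    std::vector<int> y = {0, 0, 0, 1, 1, 1};

    RandomForestClassifier clf(/*n_estimators=*/25, /*max_depth=*/4);
    clf.fit(X, y);

    std::vector<int> predictions = clf.predict({{1.1, 2.1}, {8.1, 8.7}});
    for (int label : predictions) {
        std::cout << label << '\n';
    }
    return 0;
}

Because each forest is seeded from std::random_device, repeated runs can differ in detail, but on a toy dataset this separable the majority vote should print 0 and then 1.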