Cpp ML Library  1.0.0
A library of Machine Learning Algorithmns seen from the Udemy course Machine Learning A to Z.
Apriori.hpp
Go to the documentation of this file.
1 #ifndef APRIORI_HPP
2 #define APRIORI_HPP
3 
4 #include <unordered_map>
5 #include <unordered_set>
6 #include <vector>
7 #include <set>
8 #include <algorithm>
9 #include <functional>
10 #include <iostream>
11 #include <string>
12 #include <cmath>
13 
23 class Apriori {
24 public:
29  Apriori(double min_support);
30 
36  std::vector<std::set<int>> run(const std::vector<std::vector<int>>& transactions);
37 
42  std::unordered_map<std::string, int> get_support_counts() const;
43 
49  std::string itemset_to_string(const std::set<int>& itemset) const;
50 
51 private:
58  std::set<std::set<int>> generate_candidates(const std::set<std::set<int>>& frequent_itemsets, int k);
59 
66  std::set<std::set<int>> prune_candidates(const std::set<std::set<int>>& candidates,
67  const std::set<std::set<int>>& frequent_itemsets_k_minus_1);
68 
75  std::unordered_map<std::string, int> count_support(const std::set<std::set<int>>& candidates,
76  const std::vector<std::vector<int>>& transactions);
77 
78 
85  bool has_infrequent_subset(const std::set<int>& candidate,
86  const std::set<std::set<int>>& frequent_itemsets_k_minus_1);
87 
88  double min_support;
89  int min_support_count;
90  int total_transactions;
91  std::unordered_map<std::string, int> support_counts;
92 };
93 
94 Apriori::Apriori(double min_support)
95  : min_support(min_support), min_support_count(0), total_transactions(0) {
96  if (min_support <= 0.0 || min_support > 1.0) {
97  throw std::invalid_argument("min_support must be between 0 and 1.");
98  }
99 }
100 
101 std::vector<std::set<int>> Apriori::run(const std::vector<std::vector<int>>& transactions) {
102  total_transactions = static_cast<int>(transactions.size());
103  min_support_count = static_cast<int>(std::ceil(min_support * total_transactions));
104 
105  // Generate frequent 1-itemsets
106  std::unordered_map<int, int> item_counts;
107  for (const auto& transaction : transactions) {
108  for (int item : transaction) {
109  item_counts[item]++;
110  }
111  }
112 
113  std::set<std::set<int>> frequent_itemsets;
114  std::set<std::set<int>> frequent_itemsets_k;
115  for (const auto& [item, count] : item_counts) {
116  if (count >= min_support_count) {
117  std::set<int> itemset = {item};
118  frequent_itemsets.insert(itemset);
119  frequent_itemsets_k.insert(itemset);
120  support_counts[itemset_to_string(itemset)] = count;
121  }
122  }
123 
124  int k = 2;
125  while (!frequent_itemsets_k.empty()) {
126  // Generate candidate itemsets of size k
127  auto candidates_k = generate_candidates(frequent_itemsets_k, k);
128 
129  // Count support for candidates
130  auto candidate_supports = count_support(candidates_k, transactions);
131 
132  // Select candidates that meet min_support
133  frequent_itemsets_k.clear();
134  for (const auto& [itemset_str, count] : candidate_supports) {
135  if (count >= min_support_count) {
136  // Convert string back to itemset
137  std::set<int> itemset;
138  size_t pos = 0;
139  std::string token;
140  std::string s = itemset_str;
141  while ((pos = s.find(',')) != std::string::npos) {
142  token = s.substr(0, pos);
143  itemset.insert(std::stoi(token));
144  s.erase(0, pos + 1);
145  }
146  itemset.insert(std::stoi(s));
147 
148  frequent_itemsets.insert(itemset);
149  frequent_itemsets_k.insert(itemset);
150  support_counts[itemset_str] = count;
151  }
152  }
153 
154  k++;
155  }
156 
157  // Convert frequent itemsets to vector
158  std::vector<std::set<int>> result(frequent_itemsets.begin(), frequent_itemsets.end());
159  return result;
160 }
161 
162 std::set<std::set<int>> Apriori::generate_candidates(const std::set<std::set<int>>& frequent_itemsets, int k) {
163  std::set<std::set<int>> candidates;
164  for (auto it1 = frequent_itemsets.begin(); it1 != frequent_itemsets.end(); ++it1) {
165  for (auto it2 = std::next(it1); it2 != frequent_itemsets.end(); ++it2) {
166  // Join step: combine two itemsets if they share k-2 items
167  std::vector<int> v1(it1->begin(), it1->end());
168  std::vector<int> v2(it2->begin(), it2->end());
169  if (std::equal(v1.begin(), v1.end() - 1, v2.begin())) {
170  std::set<int> candidate = *it1;
171  candidate.insert(*v2.rbegin());
172  // Prune step: only include candidate if all subsets are frequent
173  if (!has_infrequent_subset(candidate, frequent_itemsets)) {
174  candidates.insert(candidate);
175  }
176  }
177  }
178  }
179  return candidates;
180 }
181 
182 bool Apriori::has_infrequent_subset(const std::set<int>& candidate,
183  const std::set<std::set<int>>& frequent_itemsets_k_minus_1) {
184  for (auto it = candidate.begin(); it != candidate.end(); ++it) {
185  std::set<int> subset = candidate;
186  subset.erase(*it);
187  if (frequent_itemsets_k_minus_1.find(subset) == frequent_itemsets_k_minus_1.end()) {
188  return true;
189  }
190  }
191  return false;
192 }
193 
194 std::unordered_map<std::string, int> Apriori::count_support(const std::set<std::set<int>>& candidates,
195  const std::vector<std::vector<int>>& transactions) {
196  std::unordered_map<std::string, int> counts;
197  for (const auto& transaction : transactions) {
198  std::set<int> transaction_set(transaction.begin(), transaction.end());
199  for (const auto& candidate : candidates) {
200  if (std::includes(transaction_set.begin(), transaction_set.end(),
201  candidate.begin(), candidate.end())) {
202  std::string candidate_str = itemset_to_string(candidate);
203  counts[candidate_str]++;
204  }
205  }
206  }
207  return counts;
208 }
209 
210 std::unordered_map<std::string, int> Apriori::get_support_counts() const {
211  return support_counts;
212 }
213 
214 std::string Apriori::itemset_to_string(const std::set<int>& itemset) const {
215  std::string s;
216  for (auto it = itemset.begin(); it != itemset.end(); ++it) {
217  s += std::to_string(*it);
218  if (std::next(it) != itemset.end()) {
219  s += ",";
220  }
221  }
222  return s;
223 }
224 
225 #endif // APRIORI_HPP
Class to perform frequent itemset mining using the Apriori algorithm.
Definition: Apriori.hpp:23
std::vector< std::set< int > > run(const std::vector< std::vector< int >> &transactions)
Runs the Apriori algorithm on the provided dataset.
Definition: Apriori.hpp:101
std::string itemset_to_string(const std::set< int > &itemset) const
Converts an itemset to a string representation for use as a key.
Definition: Apriori.hpp:214
Apriori(double min_support)
Constructor for the Apriori class.
Definition: Apriori.hpp:94
std::unordered_map< std::string, int > get_support_counts() const
Gets the support counts for all frequent itemsets found.
Definition: Apriori.hpp:210