mlpack  2.0.1
hoeffding_tree.hpp
Go to the documentation of this file.
1 
15 #ifndef __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_HPP
16 #define __MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_HPP
17 
18 #include <mlpack/core.hpp>
19 #include "gini_impurity.hpp"
22 
23 namespace mlpack {
24 namespace tree {
25 
56 template<typename FitnessFunction = GiniImpurity,
57  template<typename> class NumericSplitType =
59  template<typename> class CategoricalSplitType =
60  HoeffdingCategoricalSplit
61 >
63 {
64  public:
66  typedef NumericSplitType<FitnessFunction> NumericSplit;
68  typedef CategoricalSplitType<FitnessFunction> CategoricalSplit;
69 
92  template<typename MatType>
93  HoeffdingTree(const MatType& data,
95  const arma::Row<size_t>& labels,
96  const size_t numClasses,
97  const bool batchTraining = true,
98  const double successProbability = 0.95,
99  const size_t maxSamples = 0,
100  const size_t checkInterval = 100,
101  const size_t minSamples = 100,
102  const CategoricalSplitType<FitnessFunction>& categoricalSplitIn
103  = CategoricalSplitType<FitnessFunction>(0, 0),
104  const NumericSplitType<FitnessFunction>& numericSplitIn =
105  NumericSplitType<FitnessFunction>(0));
106 
126  HoeffdingTree(const data::DatasetInfo& datasetInfo,
127  const size_t numClasses,
128  const double successProbability = 0.95,
129  const size_t maxSamples = 0,
130  const size_t checkInterval = 100,
131  const size_t minSamples = 100,
132  const CategoricalSplitType<FitnessFunction>& categoricalSplitIn
133  = CategoricalSplitType<FitnessFunction>(0, 0),
134  const NumericSplitType<FitnessFunction>& numericSplitIn =
135  NumericSplitType<FitnessFunction>(0),
136  std::unordered_map<size_t, std::pair<size_t, size_t>>*
137  dimensionMappings = NULL);
138 
145  HoeffdingTree(const HoeffdingTree& other);
146 
150  ~HoeffdingTree();
151 
160  template<typename MatType>
161  void Train(const MatType& data,
162  const arma::Row<size_t>& labels,
163  const bool batchTraining = true);
164 
171  template<typename VecType>
172  void Train(const VecType& point, const size_t label);
173 
179  size_t SplitCheck();
180 
182  size_t SplitDimension() const { return splitDimension; }
183 
185  size_t MajorityClass() const { return majorityClass; }
187  size_t& MajorityClass() { return majorityClass; }
188 
190  double MajorityProbability() const { return majorityProbability; }
193 
195  size_t NumChildren() const { return children.size(); }
196 
198  const HoeffdingTree& Child(const size_t i) const { return *children[i]; }
200  HoeffdingTree& Child(const size_t i) { return *children[i]; }
201 
203  double SuccessProbability() const { return successProbability; }
205  void SuccessProbability(const double successProbability);
206 
208  size_t MinSamples() const { return minSamples; }
210  void MinSamples(const size_t minSamples);
211 
213  size_t MaxSamples() const { return maxSamples; }
215  void MaxSamples(const size_t maxSamples);
216 
218  size_t CheckInterval() const { return checkInterval; }
220  void CheckInterval(const size_t checkInterval);
221 
229  template<typename VecType>
230  size_t CalculateDirection(const VecType& point) const;
231 
239  template<typename VecType>
240  size_t Classify(const VecType& point) const;
241 
253  template<typename VecType>
254  void Classify(const VecType& point, size_t& prediction, double& probability)
255  const;
256 
264  template<typename MatType>
265  void Classify(const MatType& data, arma::Row<size_t>& predictions) const;
266 
278  template<typename MatType>
279  void Classify(const MatType& data,
280  arma::Row<size_t>& predictions,
281  arma::rowvec& probabilities) const;
282 
286  void CreateChildren();
287 
289  template<typename Archive>
290  void Serialize(Archive& ar, const unsigned int /* version */);
291 
292  private:
293  // We need to keep some information for before we have split.
294 
296  std::vector<NumericSplitType<FitnessFunction>> numericSplits;
298  std::vector<CategoricalSplitType<FitnessFunction>> categoricalSplits;
299 
301  std::unordered_map<size_t, std::pair<size_t, size_t>>* dimensionMappings;
304 
306  size_t numSamples;
308  size_t numClasses;
310  size_t maxSamples;
314  size_t minSamples;
318  bool ownsInfo;
321 
322  // And we need to keep some information for after we have split.
323 
332  typename CategoricalSplitType<FitnessFunction>::SplitInfo categoricalSplit;
334  typename NumericSplitType<FitnessFunction>::SplitInfo numericSplit;
336  std::vector<HoeffdingTree*> children;
337 };
338 
339 } // namespace tree
340 } // namespace mlpack
341 
342 #include "hoeffding_tree_impl.hpp"
343 
344 #endif
size_t MajorityClass() const
Get the majority class.
NumericSplitType< FitnessFunction > NumericSplit
Allow access to the numeric split type.
HoeffdingTree(const MatType &data, const data::DatasetInfo &datasetInfo, const arma::Row< size_t > &labels, const size_t numClasses, const bool batchTraining=true, const double successProbability=0.95, const size_t maxSamples=0, const size_t checkInterval=100, const size_t minSamples=100, const CategoricalSplitType< FitnessFunction > &categoricalSplitIn=CategoricalSplitType< FitnessFunction >(0, 0), const NumericSplitType< FitnessFunction > &numericSplitIn=NumericSplitType< FitnessFunction >(0))
Construct the Hoeffding tree with the given parameters and given training data.
double SuccessProbability() const
Get the confidence required for a split.
double MajorityProbability() const
Get the probability of the majority class (based on training samples).
bool ownsInfo
Whether or not we own the dataset information.
The HoeffdingTree object represents all of the necessary information for a Hoeffding-bound-based deci...
Linear algebra utility functions, generally performed on matrices or vectors.
size_t checkInterval
The number of samples that should be seen before checking for a split.
~HoeffdingTree()
Clean up memory.
double & MajorityProbability()
Modify the probability of the majority class.
size_t minSamples
The minimum number of samples for splitting.
size_t numSamples
The number of samples seen so far by this node.
HoeffdingNumericSplit< FitnessFunction, double > HoeffdingDoubleNumericSplit
Convenience typedef.
size_t CalculateDirection(const VecType &point) const
Given a point and that this node is not a leaf, calculate the index of the child node this point woul...
size_t MinSamples() const
Get the minimum number of samples for a split.
bool ownsMappings
Indicates whether or not we own the mappings.
CategoricalSplitType< FitnessFunction >::SplitInfo categoricalSplit
If the split is categorical, this holds the splitting information.
std::vector< NumericSplitType< FitnessFunction > > numericSplits
Information for splitting of numeric features (used before split).
void Train(const MatType &data, const arma::Row< size_t > &labels, const bool batchTraining=true)
Train on a set of points, either in streaming mode or in batch mode, with the given labels...
void Serialize(Archive &ar, const unsigned int)
Serialize the split.
NumericSplitType< FitnessFunction >::SplitInfo numericSplit
If the split is numeric, this holds the splitting information.
size_t SplitDimension() const
Get the splitting dimension (size_t(-1) if no split).
size_t maxSamples
The maximum number of samples we can see before splitting.
size_t majorityClass
The majority class of this node.
Auxiliary information for a dataset, including mappings to/from strings and the datatype of each dime...
size_t Classify(const VecType &point) const
Classify the given point, using this node and the entire (sub)tree beneath it.
std::unordered_map< size_t, std::pair< size_t, size_t > > * dimensionMappings
This structure is owned by this node only if it is the root of the tree.
Include all of the base components required to write MLPACK methods, and the main MLPACK Doxygen docu...
double successProbability
The required probability of success for a split to be performed.
double majorityProbability
The empirical probability of a point this node saw having the majority class.
size_t numClasses
The number of classes this node is trained on.
size_t CheckInterval() const
Get the number of samples before a split check is performed.
std::vector< CategoricalSplitType< FitnessFunction > > categoricalSplits
Information for splitting of categorical features (used before split).
size_t SplitCheck()
Check if a split would satisfy the conditions of the Hoeffding bound with the node&#39;s specified succes...
std::vector< HoeffdingTree * > children
If the split has occurred, these are the children.
CategoricalSplitType< FitnessFunction > CategoricalSplit
Allow access to the categorical split type.
size_t & MajorityClass()
Modify the majority class.
size_t NumChildren() const
Get the number of children.
size_t MaxSamples() const
Get the maximum number of samples before a split is forced.
const HoeffdingTree & Child(const size_t i) const
Get a child.
size_t splitDimension
The dimension that this node has split on.
const data::DatasetInfo * datasetInfo
The dataset information.
HoeffdingTree & Child(const size_t i)
Modify a child.
void CreateChildren()
Given that this node should split, create the children.