Canopy  1.0
The header-only random forests library
randomForestBase.hpp
Go to the documentation of this file.
1 #ifndef RANDOMFORESTBASE_HPP
2 #define RANDOMFORESTBASE_HPP
3 
10 #include <string>
11 #include <iostream>
12 #include <fstream>
13 #include <vector>
14 #include <array>
15 #include <random>
16 
18 namespace canopy
19 {
20 
43 template <class TDerived, class TLabel, class TNodeDist, class TOutputDist, unsigned TNumParams>
44 class randomForestBase
45 {
46  public:
47  // Methods
48  // --------
50 
51  randomForestBase(const int num_trees, const int num_levels); // constructor
52 
53  bool readFromFile(const std::string filename, const int trees_used = -1, const int max_depth_used = -1);
54 
55  bool writeToFile(const std::string filename) const;
56 
57  bool isValid() const;
58 
59  void setFeatureDefinitionString(const std::string& header_str, const std::string& feat_str);
60 
61  void getFeatureDefinitionString(std::string &feat_str) const;
62 
63  template <class TIdIterator, class TLabelIterator, class TFeatureFunctor, class TParameterFunctor>
64  void train(const TIdIterator first_id, const TIdIterator last_id, const TLabelIterator first_label, TFeatureFunctor&& feature_functor, TParameterFunctor&& parameter_functor, const unsigned num_param_combos_to_test, const bool bagging = true, const float bag_proportion = C_DEFAULT_BAGGING_PROPORTION, const bool fit_split_nodes = true, const unsigned min_training_data = C_DEFAULT_MIN_TRAINING_DATA);
65 
66  template<class TIdIterator, class TOutputIterator, class TFeatureFunctor>
67  void predictDistGroupwise(TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor&& feature_functor) const;
68 
69  template<class TIdIterator, class TOutputIterator, class TFeatureFunctor>
70  void predictDistSingle(TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor&& feature_functor) const;
71 
72  template <class TIdIterator, class TLabelIterator, class TOutputIterator, class TFeatureFunctor>
73  void probabilityGroupwise(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor&& feature_functor) const;
74 
75  template <class TIdIterator, class TLabelIterator, class TOutputIterator, class TFeatureFunctor>
76  void probabilitySingle(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor&& feature_functor) const;
77 
78  template <class TIdIterator, class TLabelIterator, class TOutputIterator, class TBinaryFunction, class TFeatureFunctor, class TPDFFunctor>
79  void probabilityGroupwiseBase(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction&& binary_function, TFeatureFunctor&& feature_functor, TPDFFunctor&& pdf_functor) const;
80 
81  template <class TIdIterator, class TLabelIterator, class TOutputIterator, class TBinaryFunction, class TFeatureFunctor, class TPDFFunctor>
82  void probabilitySingleBase(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction&& binary_function, TFeatureFunctor&& feature_functor, TPDFFunctor&& pdf_functor) const;
83 
84  private:
85  // Methods
86  // -------
87  void allocateForestMemory();
88 
89  void initialise();
90 
91  template <class TIdIterator, class TLabelIterator>
92  void fitLeaf(const int t, const int n, const std::vector<int>& nodebag, const TIdIterator first_id, const TLabelIterator first_label);
93 
94  protected:
95 
96  // Types
97  // -----
98 
101  struct node
102  {
103  std::array<int,TNumParams> params;
104  bool is_leaf;
105  float thresh;
106  std::vector<TNodeDist> post;
107  node(): is_leaf(false), thresh(0.0) {}
108  };
109 
118  struct tree
119  {
120  std::vector<node> nodes;
121  };
122 
128  struct scoreInternalIndexStruct
129 {
130  float score;
131  int id;
132 
133  scoreInternalIndexStruct(const float score, const int id): score(score), id(id) {}
134  };
135 
143  class derivedProxy: public TDerived
144  {
145  friend randomForestBase;
146  };
147 
148  // Methods
149  // -------
150 
151  template<class TIdIterator, class TFeatureFunctor>
152  void findLeavesGroupwise(TIdIterator first_id, const TIdIterator last_id, const int treenum, std::vector<const TNodeDist*>& leaves, TFeatureFunctor&& feature_functor) const;
153 
154  template<class TId, class TFeatureFunctor>
155  const TNodeDist* findLeafSingle(const TId id, const int treenum, TFeatureFunctor&& feature_functor) const;
156 
157  template <class TLabelIterator>
158  static double fastDiscreteEntropy(const std::vector<int>& internal_index, const int n_labels, const TLabelIterator first_label, const std::vector<double>& xlogx_precalc);
159 
160  template <class TLabelIterator>
161  static int fastDiscreteEntropySplit(const std::vector<scoreInternalIndexStruct>& data_structs, const int n_labels, const TLabelIterator first_label, const std::vector<double>& xlogx_precalc, double& best_split_impurity, float& thresh);
162 
163  static std::vector<double> preCalculateXlogX(const int N);
164 
165  // Data
166  // ----
167  int n_trees;
168  int n_levels;
169  int n_nodes;
170  bool valid;
171  bool fit_split_nodes;
172  std::vector<tree> forest;
173  std::string feature_header;
174  std::string feature_string;
175  std::default_random_engine rand_engine;
176  std::uniform_int_distribution<int> uni_dist;
177 
178  // Constants
179  // ---------
180  static constexpr int C_DEFAULT_MIN_TRAINING_DATA = 50;
181  static constexpr float C_DEFAULT_BAGGING_PROPORTION = 0.5;
182 
183 };
184 
185 } // end of namespace
186 
187 // Include template class definition
189 
190 // End include guard
191 #endif
static constexpr int C_DEFAULT_MIN_TRAINING_DATA
Default value for the minimum number of training data points in a node before a leaf is declared...
Definition: randomForestBase.hpp:180
void getFeatureDefinitionString(std::string &feat_str) const
Retrieve a stored feature string.
Definition: randomForestBase.tpp:1243
std::array< int, TNumParams > params
Parameters for the split function.
Definition: randomForestBase.hpp:103
bool readFromFile(const std::string filename, const int trees_used=-1, const int max_depth_used=-1)
Read a pre-trained model in from a file.
Definition: randomForestBase.tpp:112
static std::vector< double > preCalculateXlogX(const int N)
Calculate an array of x*log(x) for integer x.
Definition: randomForestBase.tpp:1262
void predictDistSingle(TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor &&feature_functor) const
Predict the output distribution for a number of IDs.
Definition: randomForestBase.tpp:453
std::vector< tree > forest
Vector of tree models.
Definition: randomForestBase.hpp:172
bool valid
Whether the forest model is currently valid and usable for predictions (true = valid) ...
Definition: randomForestBase.hpp:170
bool fit_split_nodes
Whether a node distribution is fitted to all nodes (true) or just the leaf nodes (false) ...
Definition: randomForestBase.hpp:171
Node structure - represents one node in a tree.
Definition: randomForestBase.hpp:101
Tree structure - represents a single tree.
Definition: randomForestBase.hpp:118
node()
Basic constructor.
Definition: randomForestBase.hpp:107
std::vector< TNodeDist > post
The posterior distribution over labels for a leaf node, should only ever have 1 or 0 elements...
Definition: randomForestBase.hpp:106
void probabilitySingleBase(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction &&binary_function, TFeatureFunctor &&feature_functor, TPDFFunctor &&pdf_functor) const
A generalised version of the probabilitySingle() function that enables the creation of more general f...
Definition: randomForestBase.tpp:765
Structure for holding information about a data sample and its feature score.
Definition: randomForestBase.hpp:128
scoreInternalIndexStruct(const float score, const int id)
Definition: randomForestBase.hpp:133
void probabilityGroupwise(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor &&feature_functor) const
Evaluate the probability of a certain value of the label for a set of data points.
Definition: randomForestBase.tpp:536
void findLeavesGroupwise(TIdIterator first_id, const TIdIterator last_id, const int treenum, std::vector< const TNodeDist * > &leaves, TFeatureFunctor &&feature_functor) const
Function to query a single tree model with a set of data points and store a pointer to the leaf distr...
Definition: randomForestBase.tpp:822
int n_nodes
The number of nodes in each tree.
Definition: randomForestBase.hpp:169
bool isValid() const
Check whether a forest model is valid.
Definition: randomForestBase.tpp:1208
bool is_leaf
Indicates whether the node is a leaf (1 -> leaf)
Definition: randomForestBase.hpp:104
Proxy for the derived class.
Definition: randomForestBase.hpp:143
bool writeToFile(const std::string filename) const
Write a trained model to a .tr file to be stored and re-used.
Definition: randomForestBase.tpp:272
int n_trees
The number of trees in the forest.
Definition: randomForestBase.hpp:167
int n_levels
The maximum number of levels in each tree.
Definition: randomForestBase.hpp:168
Namespace containing the canopy library for random forest models.
Definition: circularRegressor.hpp:13
randomForestBase()
Default constructor.
Definition: randomForestBase.tpp:46
float thresh
The decision threshold for an internal node.
Definition: randomForestBase.hpp:105
static constexpr float C_DEFAULT_BAGGING_PROPORTION
Default value for the proportion of the training set used to train each tree.
Definition: randomForestBase.hpp:181
Contains implementations of the methods of the canopy::randomForestBase class.
static int fastDiscreteEntropySplit(const std::vector< scoreInternalIndexStruct > &data_structs, const int n_labels, const TLabelIterator first_label, const std::vector< double > &xlogx_precalc, double &best_split_impurity, float &thresh)
Find the split in a set of training data that results in the best information gain for discrete label...
Definition: randomForestBase.tpp:1359
int id
The internal training index of this data point.
Definition: randomForestBase.hpp:131
const TNodeDist * findLeafSingle(const TId id, const int treenum, TFeatureFunctor &&feature_functor) const
Function to query a single tree model with a single data point and return a pointer to the leaf distr...
Definition: randomForestBase.tpp:904
static double fastDiscreteEntropy(const std::vector< int > &internal_index, const int n_labels, const TLabelIterator first_label, const std::vector< double > &xlogx_precalc)
Calculates the entropy of the discrete labels of a set of data points using an efficient method...
Definition: randomForestBase.tpp:1299
void probabilityGroupwiseBase(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction &&binary_function, TFeatureFunctor &&feature_functor, TPDFFunctor &&pdf_functor) const
A generalised version of the probabilityGroupwise() function that enables the creation of more genera...
Definition: randomForestBase.tpp:610
void predictDistGroupwise(TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor &&feature_functor) const
Predict the output distribution for a number of IDs.
Definition: randomForestBase.tpp:385
void setFeatureDefinitionString(const std::string &header_str, const std::string &feat_str)
Store arbitrary strings that define parameters of the feature extraction process. ...
Definition: randomForestBase.tpp:1226
std::uniform_int_distribution< int > uni_dist
For generating random integers during training, may also be used by derived classes.
Definition: randomForestBase.hpp:176
std::string feature_string
Arbitrary string describing the feature extraction process.
Definition: randomForestBase.hpp:174
Base class for random forests models from which all specific models are derived using CRTP...
Definition: randomForestBase.hpp:44
std::vector< node > nodes
Vector of the nodes.
Definition: randomForestBase.hpp:120
std::string feature_header
String describing the content of the feature string.
Definition: randomForestBase.hpp:173
float score
The score of this data point according to the feature extraction.
Definition: randomForestBase.hpp:130
void probabilitySingle(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor &&feature_functor) const
Evaluate the probability of a certain value of the label for a set of data points.
Definition: randomForestBase.tpp:691
std::default_random_engine rand_engine
Random engine for generating random numbers during training, may also be used by derived classes...
Definition: randomForestBase.hpp:175
void train(const TIdIterator first_id, const TIdIterator last_id, const TLabelIterator first_label, TFeatureFunctor &&feature_functor, TParameterFunctor &&parameter_functor, const unsigned num_param_combos_to_test, const bool bagging=true, const float bag_proportion=C_DEFAULT_BAGGING_PROPORTION, const bool fit_split_nodes=true, const unsigned min_training_data=C_DEFAULT_MIN_TRAINING_DATA)
Train the random forest model on training data.
Definition: randomForestBase.tpp:967