1 #ifndef RANDOMFORESTBASE_HPP 2 #define RANDOMFORESTBASE_HPP 43 template <
class TDerived,
class TLabel,
class TNodeDist,
class TOutputDist,
unsigned TNumParams>
/**
 * Read a pre-trained model in from a file.
 *
 * @param filename        The file to read the model from.
 * @param trees_used      Defaults to -1. NOTE(review): presumably limits how many
 *                        of the stored trees are used, with -1 meaning "all" --
 *                        confirm against randomForestBase.tpp.
 * @param max_depth_used  Defaults to -1. NOTE(review): presumably caps the tree
 *                        depth that is used, with -1 meaning "no cap" -- confirm.
 * @return NOTE(review): presumably true when the model was read successfully --
 *         confirm against the implementation.
 */
53 bool readFromFile(
const std::string filename,
const int trees_used = -1,
const int max_depth_used = -1);
/**
 * Train the random forest model on training data.
 *
 * @param first_id, last_id   Iterator pair over the IDs of the training data points.
 * @param first_label         Iterator to the labels; NOTE(review): presumably
 *                            indexed in step with the IDs -- confirm.
 * @param feature_functor     Callable taken by forwarding reference;
 *                            NOTE(review): presumably computes feature scores
 *                            for IDs -- confirm the expected signature.
 * @param parameter_functor   Callable taken by forwarding reference;
 *                            NOTE(review): presumably generates candidate
 *                            split-function parameters -- confirm.
 * @param num_param_combos_to_test  NOTE(review): presumably the number of
 *                            parameter combinations tried per node -- confirm.
 * @param bagging             Defaults to true. NOTE(review): presumably enables
 *                            training each tree on a subset of the data -- confirm.
 * @param bag_proportion      Defaults to C_DEFAULT_BAGGING_PROPORTION (the proportion
 *                            of the training set used to train each tree).
 * @param fit_split_nodes     Defaults to true. NOTE(review): presumably mirrors the
 *                            member of the same name (node distribution fitted to
 *                            all nodes when true, only leaf nodes when false).
 * @param min_training_data   Defaults to C_DEFAULT_MIN_TRAINING_DATA (minimum number
 *                            of training data points in a node before a leaf is
 *                            declared).
 */
63 template <
class TIdIterator,
class TLabelIterator,
class TFeatureFunctor,
class TParameterFunctor>
64 void train(
const TIdIterator first_id,
const TIdIterator last_id,
const TLabelIterator first_label, TFeatureFunctor&& feature_functor, TParameterFunctor&& parameter_functor,
const unsigned num_param_combos_to_test,
const bool bagging =
true,
const float bag_proportion =
C_DEFAULT_BAGGING_PROPORTION,
const bool fit_split_nodes =
true,
const unsigned min_training_data =
C_DEFAULT_MIN_TRAINING_DATA);
/**
 * Predict the output distribution for a number of IDs.
 *
 * @param first_id, last_id  Iterator pair over the IDs to predict for.
 * @param out_it             Output iterator; NOTE(review): presumably one output
 *                           distribution is written per ID -- confirm.
 * @param feature_functor    Callable taken by forwarding reference;
 *                           NOTE(review): the "Groupwise" name suggests it is
 *                           invoked on groups of IDs rather than one at a time --
 *                           confirm against randomForestBase.tpp.
 */
66 template<
class TIdIterator,
class TOutputIterator,
class TFeatureFunctor>
67 void predictDistGroupwise(TIdIterator first_id,
const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor&& feature_functor)
const;
/**
 * Predict the output distribution for a number of IDs.
 *
 * @param first_id, last_id  Iterator pair over the IDs to predict for.
 * @param out_it             Output iterator; NOTE(review): presumably one output
 *                           distribution is written per ID -- confirm.
 * @param feature_functor    Callable taken by forwarding reference;
 *                           NOTE(review): the "Single" name suggests it is invoked
 *                           for one ID at a time -- confirm against
 *                           randomForestBase.tpp.
 */
69 template<
class TIdIterator,
class TOutputIterator,
class TFeatureFunctor>
70 void predictDistSingle(TIdIterator first_id,
const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor&& feature_functor)
const;
/**
 * Evaluate the probability of a certain value of the label for a set of data points.
 *
 * @param first_id, last_id  Iterator pair over the IDs of the data points.
 * @param label_it           Iterator to the label value(s) to evaluate.
 * @param out_it             Output iterator for the results.
 * @param single_label       NOTE(review): presumably true when label_it refers to
 *                           one label shared by all IDs rather than one per ID --
 *                           confirm.
 * @param feature_functor    Callable taken by forwarding reference;
 *                           NOTE(review): presumably the groupwise feature
 *                           evaluator -- confirm.
 */
72 template <
class TIdIterator,
class TLabelIterator,
class TOutputIterator,
class TFeatureFunctor>
73 void probabilityGroupwise(TIdIterator first_id,
const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it,
const bool single_label, TFeatureFunctor&& feature_functor)
const;
/**
 * Evaluate the probability of a certain value of the label for a set of data points.
 *
 * @param first_id, last_id  Iterator pair over the IDs of the data points.
 * @param label_it           Iterator to the label value(s) to evaluate.
 * @param out_it             Output iterator for the results.
 * @param single_label       NOTE(review): presumably true when label_it refers to
 *                           one label shared by all IDs rather than one per ID --
 *                           confirm.
 * @param feature_functor    Callable taken by forwarding reference;
 *                           NOTE(review): presumably evaluated for one ID at a
 *                           time -- confirm.
 */
75 template <
class TIdIterator,
class TLabelIterator,
class TOutputIterator,
class TFeatureFunctor>
76 void probabilitySingle(TIdIterator first_id,
const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it,
const bool single_label, TFeatureFunctor&& feature_functor)
const;
/**
 * A generalised version of the probabilityGroupwise() function (the generated
 * summary for this member is truncated in this listing).
 *
 * Takes the same arguments as probabilityGroupwise() plus two extra callables.
 *
 * @param binary_function  Callable taken by forwarding reference;
 *                         NOTE(review): presumably combines per-tree results
 *                         into the output -- confirm.
 * @param pdf_functor      Callable taken by forwarding reference;
 *                         NOTE(review): presumably evaluates a node distribution
 *                         at a label -- confirm against randomForestBase.tpp.
 */
78 template <
class TIdIterator,
class TLabelIterator,
class TOutputIterator,
class TBinaryFunction,
class TFeatureFunctor,
class TPDFFunctor>
79 void probabilityGroupwiseBase(TIdIterator first_id,
const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it,
const bool single_label, TBinaryFunction&& binary_function, TFeatureFunctor&& feature_functor, TPDFFunctor&& pdf_functor)
const;
/**
 * A generalised version of the probabilitySingle() function (the generated
 * summary for this member is truncated in this listing).
 *
 * Takes the same arguments as probabilitySingle() plus two extra callables.
 *
 * @param binary_function  Callable taken by forwarding reference;
 *                         NOTE(review): presumably combines per-tree results
 *                         into the output -- confirm.
 * @param pdf_functor      Callable taken by forwarding reference;
 *                         NOTE(review): presumably evaluates a node distribution
 *                         at a label -- confirm against randomForestBase.tpp.
 */
81 template <
class TIdIterator,
class TLabelIterator,
class TOutputIterator,
class TBinaryFunction,
class TFeatureFunctor,
class TPDFFunctor>
82 void probabilitySingleBase(TIdIterator first_id,
const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it,
const bool single_label, TBinaryFunction&& binary_function, TFeatureFunctor&& feature_functor, TPDFFunctor&& pdf_functor)
const;
/// NOTE(review): no summary for this member appears in this listing. Presumably
/// sizes the `forest` vector of trees and their nodes from n_trees / n_nodes --
/// confirm against randomForestBase.tpp.
87 void allocateForestMemory();
/**
 * NOTE(review): no summary for this member appears in this listing. Presumably
 * fits the node distribution of one node from the training data that reached it --
 * confirm against randomForestBase.tpp.
 *
 * @param t            NOTE(review): presumably the tree index -- confirm.
 * @param n            NOTE(review): presumably the node index within tree t -- confirm.
 * @param nodebag      NOTE(review): presumably internal indices of the training
 *                     points in this node -- confirm.
 * @param first_id     Iterator to the first training ID.
 * @param first_label  Iterator to the first training label.
 */
91 template <
class TIdIterator,
class TLabelIterator>
92 void fitLeaf(
const int t,
const int n,
const std::vector<int>& nodebag,
const TIdIterator first_id,
const TLabelIterator first_label);
/// Basic constructor: the node starts as a non-leaf (is_leaf = false) with a
/// decision threshold of 0.0.
107 node(): is_leaf(false), thresh(0.0) {}
/**
 * Query a single tree model with a set of data points.
 *
 * @param first_id, last_id  Iterator pair over the IDs of the data points.
 * @param treenum            Index of the tree to query.
 * @param leaves             Vector of pointers to node distributions;
 *                           NOTE(review): presumably filled with one leaf
 *                           distribution pointer per ID (the generated summary is
 *                           truncated in this listing) -- confirm.
 * @param feature_functor    Callable taken by forwarding reference;
 *                           NOTE(review): presumably the groupwise feature
 *                           evaluator -- confirm.
 */
151 template<
class TIdIterator,
class TFeatureFunctor>
152 void findLeavesGroupwise(TIdIterator first_id,
const TIdIterator last_id,
const int treenum, std::vector<const TNodeDist*>& leaves, TFeatureFunctor&& feature_functor)
const;
/**
 * Query a single tree model with a single data point.
 *
 * @param id               ID of the data point.
 * @param treenum          Index of the tree to query.
 * @param feature_functor  Callable taken by forwarding reference;
 *                         NOTE(review): presumably evaluates the feature for one
 *                         ID -- confirm.
 * @return Pointer to a node distribution; NOTE(review): presumably that of the
 *         leaf the data point reaches (the generated summary is truncated in
 *         this listing) -- confirm.
 */
154 template<
class TId,
class TFeatureFunctor>
155 const TNodeDist*
findLeafSingle(
const TId
id,
const int treenum, TFeatureFunctor&& feature_functor)
const;
/**
 * Calculates the entropy of the discrete labels of a set of data points using an
 * efficient method.
 *
 * @param internal_index  NOTE(review): presumably internal training indices of
 *                        the data points considered -- confirm.
 * @param n_labels        NOTE(review): presumably the number of distinct discrete
 *                        labels -- confirm.
 * @param first_label     Iterator to the first label.
 * @param xlogx_precalc   NOTE(review): presumably the table of x*log(x) values
 *                        produced by preCalculateXlogX() -- confirm.
 * @return The entropy as a double.
 */
157 template <
class TLabelIterator>
158 static double fastDiscreteEntropy(
const std::vector<int>& internal_index,
const int n_labels,
const TLabelIterator first_label,
const std::vector<double>& xlogx_precalc);
/**
 * Find the split in a set of training data that results in the best information
 * gain for discrete labels.
 *
 * @param data_structs         Per-sample records (feature score and internal
 *                             training index); NOTE(review): presumably expected
 *                             sorted by score -- confirm.
 * @param n_labels             NOTE(review): presumably the number of distinct
 *                             discrete labels -- confirm.
 * @param first_label          Iterator to the first label.
 * @param xlogx_precalc        NOTE(review): presumably the table of x*log(x)
 *                             values produced by preCalculateXlogX() -- confirm.
 * @param best_split_impurity  Non-const reference; NOTE(review): presumably an
 *                             output receiving the impurity of the best split.
 * @param thresh               Non-const reference; NOTE(review): presumably an
 *                             output receiving the chosen decision threshold.
 * @return int; NOTE(review): meaning not shown in this listing (possibly the
 *         split position) -- confirm against randomForestBase.tpp.
 */
160 template <
class TLabelIterator>
161 static int fastDiscreteEntropySplit(
const std::vector<scoreInternalIndexStruct>& data_structs,
const int n_labels,
const TLabelIterator first_label,
const std::vector<double>& xlogx_precalc,
double& best_split_impurity,
float&
thresh);
static constexpr int C_DEFAULT_MIN_TRAINING_DATA
Default value for the minimum number of training data points in a node before a leaf is declared...
Definition: randomForestBase.hpp:180
void getFeatureDefinitionString(std::string &feat_str) const
Retrieve a stored feature string.
Definition: randomForestBase.tpp:1243
std::array< int, TNumParams > params
Parameters for the split function.
Definition: randomForestBase.hpp:103
bool readFromFile(const std::string filename, const int trees_used=-1, const int max_depth_used=-1)
Read a pre-trained model in from a file.
Definition: randomForestBase.tpp:112
static std::vector< double > preCalculateXlogX(const int N)
Calculate an array of x*log(x) for integer x.
Definition: randomForestBase.tpp:1262
void predictDistSingle(TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor &&feature_functor) const
Predict the output distribution for a number of IDs.
Definition: randomForestBase.tpp:453
std::vector< tree > forest
Vector of tree models.
Definition: randomForestBase.hpp:172
bool valid
Whether the forest model is currently valid and usable for predictions (true = valid) ...
Definition: randomForestBase.hpp:170
bool fit_split_nodes
Whether a node distribution is fitted to all nodes (true) or just the leaf nodes (false) ...
Definition: randomForestBase.hpp:171
Node structure - represents one node in a tree.
Definition: randomForestBase.hpp:101
Tree structure - represents a single tree.
Definition: randomForestBase.hpp:118
node()
Basic constructor.
Definition: randomForestBase.hpp:107
std::vector< TNodeDist > post
The posterior distribution over labels for a leaf node, should only ever have 1 or 0 elements...
Definition: randomForestBase.hpp:106
void probabilitySingleBase(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction &&binary_function, TFeatureFunctor &&feature_functor, TPDFFunctor &&pdf_functor) const
A generalised version of the probabilitySingle() function that enables the creation of more general f...
Definition: randomForestBase.tpp:765
Structure for holding information about a data sample and its feature score.
Definition: randomForestBase.hpp:128
scoreInternalIndexStruct(const float score, const int id)
Definition: randomForestBase.hpp:133
void probabilityGroupwise(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor &&feature_functor) const
Evaluate the probability of a certain value of the label for a set of data points.
Definition: randomForestBase.tpp:536
void findLeavesGroupwise(TIdIterator first_id, const TIdIterator last_id, const int treenum, std::vector< const TNodeDist * > &leaves, TFeatureFunctor &&feature_functor) const
Function to query a single tree model with a set of data points and store a pointer to the leaf distr...
Definition: randomForestBase.tpp:822
int n_nodes
The number of nodes in each tree.
Definition: randomForestBase.hpp:169
bool isValid() const
Check whether a forest model is valid.
Definition: randomForestBase.tpp:1208
bool is_leaf
Indicates whether the node is a leaf (1 -> leaf)
Definition: randomForestBase.hpp:104
Proxy for the derived class.
Definition: randomForestBase.hpp:143
bool writeToFile(const std::string filename) const
Write a trained model to a .tr file to be stored and re-used.
Definition: randomForestBase.tpp:272
int n_trees
The number of trees in the forest.
Definition: randomForestBase.hpp:167
int n_levels
The maximum number of levels in each tree.
Definition: randomForestBase.hpp:168
Namespace containing the canopy library for random forest models.
Definition: circularRegressor.hpp:13
randomForestBase()
Default constructor.
Definition: randomForestBase.tpp:46
float thresh
The decision threshold for an internal node.
Definition: randomForestBase.hpp:105
static constexpr float C_DEFAULT_BAGGING_PROPORTION
Default value for the proportion of the training set used to train each tree.
Definition: randomForestBase.hpp:181
Contains implementations of the methods of the canopy::randomForestBase class.
static int fastDiscreteEntropySplit(const std::vector< scoreInternalIndexStruct > &data_structs, const int n_labels, const TLabelIterator first_label, const std::vector< double > &xlogx_precalc, double &best_split_impurity, float &thresh)
Find the split in a set of training data that results in the best information gain for discrete label...
Definition: randomForestBase.tpp:1359
int id
The internal training index of this data point.
Definition: randomForestBase.hpp:131
const TNodeDist * findLeafSingle(const TId id, const int treenum, TFeatureFunctor &&feature_functor) const
Function to query a single tree model with a single data point and return a pointer to the leaf distr...
Definition: randomForestBase.tpp:904
static double fastDiscreteEntropy(const std::vector< int > &internal_index, const int n_labels, const TLabelIterator first_label, const std::vector< double > &xlogx_precalc)
Calculates the entropy of the discrete labels of a set of data points using an efficient method...
Definition: randomForestBase.tpp:1299
void probabilityGroupwiseBase(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction &&binary_function, TFeatureFunctor &&feature_functor, TPDFFunctor &&pdf_functor) const
A generalised version of the probabilityGroupwise() function that enables the creation of more genera...
Definition: randomForestBase.tpp:610
void predictDistGroupwise(TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor &&feature_functor) const
Predict the output distribution for a number of IDs.
Definition: randomForestBase.tpp:385
void setFeatureDefinitionString(const std::string &header_str, const std::string &feat_str)
Store arbitrary strings that define parameters of the feature extraction process. ...
Definition: randomForestBase.tpp:1226
std::uniform_int_distribution< int > uni_dist
For generating random integers during training; may also be used by derived classes.
Definition: randomForestBase.hpp:176
std::string feature_string
Arbitrary string describing the feature extraction process.
Definition: randomForestBase.hpp:174
Base class for random forest models from which all specific models are derived using CRTP...
Definition: randomForestBase.hpp:44
std::vector< node > nodes
Vector of the nodes.
Definition: randomForestBase.hpp:120
std::string feature_header
String describing the content of the feature string.
Definition: randomForestBase.hpp:173
float score
The score of this data point according to the feature extraction.
Definition: randomForestBase.hpp:130
void probabilitySingle(TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor &&feature_functor) const
Evaluate the probability of a certain value of the label for a set of data points.
Definition: randomForestBase.tpp:691
std::default_random_engine rand_engine
Random engine for generating random numbers during training, may also be used by derived classes...
Definition: randomForestBase.hpp:175
void train(const TIdIterator first_id, const TIdIterator last_id, const TLabelIterator first_label, TFeatureFunctor &&feature_functor, TParameterFunctor &&parameter_functor, const unsigned num_param_combos_to_test, const bool bagging=true, const float bag_proportion=C_DEFAULT_BAGGING_PROPORTION, const bool fit_split_nodes=true, const unsigned min_training_data=C_DEFAULT_MIN_TRAINING_DATA)
Train the random forest model on training data.
Definition: randomForestBase.tpp:967