Canopy  1.0
The header-only random forests library
classifier.hpp
Go to the documentation of this file.
1 #ifndef CLASSIFER_HPP
2 #define CLASSIFER_HPP
3 
12 
13 namespace canopy
14 {
15 
25 template <unsigned TNumParams> // TNumParams is passed straight through as the last template argument of randomForestBase
26 class classifier : public randomForestBase<classifier<TNumParams>,int,discreteDistribution,discreteDistribution,TNumParams> // Random forest classifier model that predicts a discrete output label; passes itself to randomForestBase as the first template argument (CRTP)
27 {
28  public:
29  // Methods
30  classifier(const int num_classes, const int num_trees, const int num_levels, const double info_gain_tresh = C_DEFAULT_MIN_INFO_GAIN); // Constructor; info_gain_tresh is optional and defaults to C_DEFAULT_MIN_INFO_GAIN
31  classifier(); // Default constructor
32  int getNumberClasses() const; // Get the number of classes in the discrete label space of the model
33  void setClassNames(const std::vector<std::string>& new_class_names); // Set the class name strings
34  void getClassNames(std::vector<std::string>& end_class_names) const; // Get the class name strings (presumably written into end_class_names - confirm in classifier.tpp)
35  void raiseNodeTemperature(const double T); // Smooth the distributions in all of the leaf nodes using the softmax function
36 
37  protected:
40 
41  // Methods
42  void initialiseNodeDist(const int t, const int n); // Initialise a discreteDistribution as a node distribution for training
43  template <class TLabelIterator>
44  void bestSplit(const std::vector<scoreInternalIndexStruct> &data_structs, const TLabelIterator first_label, const int /*tree*/, const int /*node*/, const float initial_impurity,float& info_gain, float& thresh) const; // Find the best way to split training data using the scores of a certain feature
45  void printHeaderDescription(std::ofstream &stream) const; // Print a string that allows a human to interpret the header information to a stream
46  void printHeaderData(std::ofstream &stream) const; // Print the header information specific to the classifier model to a stream
47  void readHeader(std::ifstream &stream); // Read the header information specific to the classifier model from a stream
48  float minInfoGain(const int /*tree*/, const int /*node*/) const; // Get the information gain threshold for a given node
49  template <class TLabelIterator>
50  float singleNodeImpurity(const TLabelIterator first_label, const std::vector<int>& nodebag, const int /*tree*/, const int /*node*/) const; // Calculate the impurity of the label set in a single node
51  template <class TLabelIterator, class TIdIterator>
52  void trainingPrecalculations(const TLabelIterator first_label, const TLabelIterator last_label, const TIdIterator/*unused*/); // Preliminary calculations to perform before training begins
54 
55  // Data
56  int n_classes; // The number of classes in the discrete label space
57  std::vector<std::string> class_names; // The names of the classes
58  std::vector<double> xlogx_precalc; // Temporary precalculations of x*log(x) values, used during training
59  double min_info_gain; // If, during training, the best information gain at a node goes below this threshold, a leaf node is declared
60 
61  // Constants
62  static constexpr double C_DEFAULT_MIN_INFO_GAIN = 0.05; // Default value for the information gain threshold
63 };
64 
65 } // end of namespace
66 
68 #endif
69 // CLASSIFER_HPP
float minInfoGain(const int, const int) const
Get the information gain threshold for a given node.
Definition: classifier.tpp:282
static constexpr double C_DEFAULT_MIN_INFO_GAIN
Default value for the information gain threshold.
Definition: classifier.hpp:62
void readHeader(std::ifstream &stream)
Read the header information specific to the classifier model from a stream.
Definition: classifier.tpp:245
Contains implementations of the methods of the canopy::classifier class.
std::vector< std::string > class_names
The names of the classes.
Definition: classifier.hpp:57
Contains the canopy::discreteDistribution class, which is the node and output distribution for the cl...
void cleanupPrecalculations()
Clean-up of data to perform after training ends.
Definition: classifier.tpp:116
void initialiseNodeDist(const int t, const int n)
Initialise a discreteDistribution as a node distribution for training.
Definition: classifier.tpp:78
int getNumberClasses() const
Get the number of classes in the discrete label space of the model.
Definition: classifier.tpp:265
void setClassNames(const std::vector< std::string > &new_class_names)
Set the class name strings.
Definition: classifier.tpp:52
void bestSplit(const std::vector< scoreInternalIndexStruct > &data_structs, const TLabelIterator first_label, const int, const int, const float initial_impurity, float &info_gain, float &thresh) const
Find the best way to split training data using the scores of a certain feature.
Definition: classifier.tpp:160
void raiseNodeTemperature(const double T)
Smooth the distributions in all of the leaf nodes using the softmax function.
Definition: classifier.tpp:303
void getClassNames(std::vector< std::string > &end_class_names) const
Get the class name strings.
Definition: classifier.tpp:65
std::vector< double > xlogx_precalc
Used for storing temporary precalculations of x*log(x) values during training.
Definition: classifier.hpp:58
Namespace containing the canopy library for random forest models.
Definition: circularRegressor.hpp:13
void printHeaderData(std::ofstream &stream) const
Print the header information specific to the classifier model to a stream.
Definition: classifier.tpp:228
Contains the declaration of the canopy::randomForestBase class.
Implements a random forest classifier model to predict a discrete output label.
Definition: classifier.hpp:26
randomForestBase< classifier< TNumParams >, int, discreteDistribution, discreteDistribution, TNumParams >::scoreInternalIndexStruct scoreInternalIndexStruct
Forward the definition of the type declared in the randomForestBase class.
Definition: classifier.hpp:39
A distribution that defines the probabilities over a number of discrete (integer-valued) class labels...
Definition: discreteDistribution.hpp:26
void trainingPrecalculations(const TLabelIterator first_label, const TLabelIterator last_label, const TIdIterator)
Preliminary calculations to perform before training begins.
Definition: classifier.tpp:102
double min_info_gain
If, during training, the best information gain at a node goes below this threshold, a leaf node is declared.
Definition: classifier.hpp:59
Base class for random forest models from which all specific models are derived using CRTP...
Definition: randomForestBase.hpp:44
int n_classes
The number of classes in the discrete label space.
Definition: classifier.hpp:56
float singleNodeImpurity(const TLabelIterator first_label, const std::vector< int > &nodebag, const int, const int) const
Calculate the impurity of the label set in a single node.
Definition: classifier.tpp:200
void printHeaderDescription(std::ofstream &stream) const
Prints a string that allows a human to interpret the header information to a stream.
Definition: classifier.tpp:213
classifier()
Default constructor.
Definition: classifier.tpp:38