Base class for random forests models from which all specific models are derived using CRTP. More...

#include <randomForestBase.hpp>

Collaboration diagram for canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >:

[legend]

Classes
class	derivedProxy
	Proxy for the derived class. More...

struct	node
	Node structure - represents one node in a tree. More...

struct	scoreInternalIndexStruct
	Structure for holding information about a data sample and its feature score. More...

struct	tree
	Tree structure - represents a single tree. More...

Public Member Functions
	randomForestBase ()
	Default constructor. More...

	randomForestBase (const int num_trees, const int num_levels)
	Full constructor. More...

bool	readFromFile (const std::string filename, const int trees_used=-1, const int max_depth_used=-1)
	Read a pre-trained model in from a file. More...

bool	writeToFile (const std::string filename) const
	Write a trained model to a .tr file to be stored and re-used. More...

bool	isValid () const
	Check whether a forest model is valid. More...

void	setFeatureDefinitionString (const std::string &header_str, const std::string &feat_str)
	Store arbitrary strings that define parameters of the feature extraction process. More...

void	getFeatureDefinitionString (std::string &feat_str) const
	Retrieve a stored feature string. More...

template<class TIdIterator , class TLabelIterator , class TFeatureFunctor , class TParameterFunctor >
void	train (const TIdIterator first_id, const TIdIterator last_id, const TLabelIterator first_label, TFeatureFunctor &&feature_functor, TParameterFunctor &&parameter_functor, const unsigned num_param_combos_to_test, const bool bagging=true, const float bag_proportion=C_DEFAULT_BAGGING_PROPORTION, const bool fit_split_nodes=true, const unsigned min_training_data=C_DEFAULT_MIN_TRAINING_DATA)
	Train the random forest model on training data. More...

template<class TIdIterator , class TOutputIterator , class TFeatureFunctor >
void	predictDistGroupwise (TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor &&feature_functor) const
	Predict the output distribution for a number of IDs. More...

template<class TIdIterator , class TOutputIterator , class TFeatureFunctor >
void	predictDistSingle (TIdIterator first_id, const TIdIterator last_id, TOutputIterator out_it, TFeatureFunctor &&feature_functor) const
	Predict the output distribution for a number of IDs. More...

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TFeatureFunctor >
void	probabilityGroupwise (TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor &&feature_functor) const
	Evaluate the probability of a certain value of the label for a set of data points. More...

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TFeatureFunctor >
void	probabilitySingle (TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TFeatureFunctor &&feature_functor) const
	Evaluate the probability of a certain value of the label for a set of data points. More...

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TBinaryFunction , class TFeatureFunctor , class TPDFFunctor >
void	probabilityGroupwiseBase (TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction &&binary_function, TFeatureFunctor &&feature_functor, TPDFFunctor &&pdf_functor) const
	A generalised version of the `probabilityGroupwise()` function that enables the creation of more general functions. More...

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TBinaryFunction , class TFeatureFunctor , class TPDFFunctor >
void	probabilitySingleBase (TIdIterator first_id, const TIdIterator last_id, TLabelIterator label_it, TOutputIterator out_it, const bool single_label, TBinaryFunction &&binary_function, TFeatureFunctor &&feature_functor, TPDFFunctor &&pdf_functor) const
	A generalised version of the `probabilitySingle()` function that enables the creation of more general functions. More...

Protected Member Functions
template<class TIdIterator , class TFeatureFunctor >
void	findLeavesGroupwise (TIdIterator first_id, const TIdIterator last_id, const int treenum, std::vector< const TNodeDist * > &leaves, TFeatureFunctor &&feature_functor) const
	Function to query a single tree model with a set of data points and store a pointer to the leaf distribution that each reaches. More...

template<class TId , class TFeatureFunctor >
const TNodeDist *	findLeafSingle (const TId id, const int treenum, TFeatureFunctor &&feature_functor) const
	Function to query a single tree model with a single data point and return a pointer to the leaf distribution that it reaches. More...

Static Protected Member Functions
template<class TLabelIterator >
static double	fastDiscreteEntropy (const std::vector< int > &internal_index, const int n_labels, const TLabelIterator first_label, const std::vector< double > &xlogx_precalc)
	Calculates the entropy of the discrete labels of a set of data points using an efficient method. More...

template<class TLabelIterator >
static int	fastDiscreteEntropySplit (const std::vector< scoreInternalIndexStruct > &data_structs, const int n_labels, const TLabelIterator first_label, const std::vector< double > &xlogx_precalc, double &best_split_impurity, float &thresh)
	Find the split in a set of training data that results in the best information gain for discrete labels. More...

static std::vector< double >	preCalculateXlogX (const int N)
	Calculate an array of x*log(x) for integer x. More...

Protected Attributes
int	n_trees
	The number of trees in the forest.

int	n_levels
	The maximum number of levels in each tree.

int	n_nodes
	The number of nodes in each tree.

bool	valid
	Whether the forest model is currently valid and usable for predictions (true = valid)

bool	fit_split_nodes
	Whether a node distribution is fitted to all nodes (true) or just the leaf nodes (false)

std::vector< tree >	forest
	Vector of tree models.

std::string	feature_header
	String describing the content of the feature string.

std::string	feature_string
	Arbitrary string describing the feature extraction process.

std::default_random_engine	rand_engine
	Random engine for generating random numbers during training, may also be used by derived classes.

std::uniform_int_distribution< int >	uni_dist
	For generating random integers during training, may also be used by derived classes.

Static Protected Attributes
static constexpr int	C_DEFAULT_MIN_TRAINING_DATA = 50
	Default value for the minimum number of training data points in a node before a leaf is declared.

static constexpr float	C_DEFAULT_BAGGING_PROPORTION = 0.5
	Default value for the proportion of the training set used to train each tree.

Detailed Description

template<class TDerived, class TLabel, class TNodeDist, class TOutputDist, unsigned TNumParams>
class canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >

Base class for random forests models from which all specific models are derived using CRTP.

This class implements the basic training and testing routines, and some utility functions that may be used by derived classes. This class cannot be used directly.

Template Parameters

TDerived	The type of the derived random forests model (e.g. classifier, regressor). Having the derived class as a template parameter implements the curiously recurring template (CRTP) idiom, which allows for static polymorphism.
TLabel	The type of the label that the model is used to predict. This is the output type of the forest model, for example an integer for a classifier or a float for a 1D regressor.
TNodeDist	The type of the node distribution, which is the distribution stored at each leaf node. The node distribution must have certain characteristics.
TOutputDist	The type of the output distribution, which is the type of the distribution predicted by the forest model. This may be the same as or different from TNodeDist. The output distribution must have certain characteristics.
TNumParams	The number of parameters used by the features callback.

Constructor & Destructor Documentation

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::randomForestBase ( )

Default constructor.

Note that an object initialised in this way should not be trained, but may be used to read in a pre-trained model using readFromFile()

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::randomForestBase	(	const int	num_trees,
		const int	num_levels
	)

Full constructor.

Creates a full forest with a specified number of trees and levels, ready to be trained.

Parameters

num_trees	The number of decision trees in the forest
num_levels	The maximum depth of any node in the trees

Member Function Documentation

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TLabelIterator >

double canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::fastDiscreteEntropy	(	const std::vector< int > &	internal_index,
		const int	n_labels,
		const TLabelIterator	first_label,
		const std::vector< double > &	xlogx_precalc
	)

static protected

Calculates the entropy of the discrete labels of a set of data points using an efficient method.

This is a utility method that is provided for use in subclasses if convenient.

Template Parameters

TLabelIterator Type of the iterator used to access the discrete labels. Must be a random access iterator that dereferences to an integral data type.

Parameters

internal_index	Vector containing the internal training indices of the data points. These are the indices through which the labels may be accessed in first_label
n_labels	The number of discrete labels. The possible values of the label are assumed to be the integers in the range 0 to n_labels-1
first_label	Iterator to the labels for which the entropy is to be calculated. The labels should be located at the offsets from this iterator given by the elements of the internal_index vector. I.e. first_label[internal_index[0]], first_label[internal_index[1]] etc.
xlogx_precalc	A pre-calculated array of the value of x*log(x), as calculated by the `preCalculateXlogX()` routine. This must be long enough to include the value for x = internal_index.size() or greater.

Returns: The entropy of the set of labels.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TLabelIterator >

int canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::fastDiscreteEntropySplit	(	const std::vector< scoreInternalIndexStruct > &	data_structs,
		const int	n_labels,
		const TLabelIterator	first_label,
		const std::vector< double > &	xlogx_precalc,
		double &	best_split_impurity,
		float &	thresh
	)

static protected

Find the split in a set of training data that results in the best information gain for discrete labels.

This is a utility method that is provided for use in subclasses if convenient.

Template Parameters

TLabelIterator Type of the iterator used to access the discrete labels. Must be a random access iterator that dereferences to an integral data type.

Parameters

data_structs	A vector in which each element is a structure containing the internal id (.id) and score (.score) for the current feature of the training data points. The vector is assumed to be sorted according to the score field in ascending order.
n_labels	The number of discrete labels. The possible values of the label are assumed to be the integers in the range 0 to n_labels-1
first_label	Iterator to the labels for which the entropy is to be calculated. The labels should be located at the offsets from this iterator given by the IDs of elements of the data_structs vector. I.e. first_label[data_structs[0].id], first_label[data_structs[1].id] etc.
xlogx_precalc	A pre-calculated array of the value of x*log(x), as calculated by the `preCalculateXlogX()` routine. This must be long enough to include the value for x = data_structs.size() or greater.
best_split_impurity	Returns by reference the impurity of the best split found.
thresh	Returns by reference the threshold of the feature score that gives the best split.

Returns: The position 'd' of the best split in the training data. The partition of the data resulting in the best split has the first d+1 elements in one partition and the remainder in the other partition.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TId , class TFeatureFunctor >

const TNodeDist * canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::findLeafSingle	(	const TId	id,
		const int	treenum,
		TFeatureFunctor &&	feature_functor
	)		const

protected

Function to query a single tree model with a single data point and return a pointer to the leaf distribution that it reaches.

This is a basic operation that is used by higher-level processes. Using this method, the features needed by each node are requested from the feature functor for a single ID at a time.

Template Parameters

TId	Type of the ID used to identify the data point.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a single feature functor object, meaning it must define operator() with a certain form.

Parameters

id	ID of the data point for which the leaf distribution is to be found.
treenum	Index of the tree to use.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.

Returns: A pointer to the leaf distribution reached by the data point.

template<class TDerived , class TLabel , class TNodeDist, class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TFeatureFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::findLeavesGroupwise	(	TIdIterator	first_id,
		const TIdIterator	last_id,
		const int	treenum,
		std::vector< const TNodeDist * > &	leaves,
		TFeatureFunctor &&	feature_functor
	)		const

protected

Function to query a single tree model with a set of data points and store a pointer to the leaf distribution that each reaches.

This is a basic operation that is used by higher-level processes. Using this method, the features needed by a single node are requested from the feature functor for all the IDs with a single function call. This involves some overhead, but may permit efficiencies resulting from calculating multiple features at once.

Template Parameters

TIdIterator	Type of the iterator to the IDs. Must be a random access iterator and dereference to the TId type expected by the feature functor.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a groupwise feature functor object, meaning it must define operator() with a certain form.

Parameters

first_id	Iterator to the ID of the first data point for which the leaf distribution is to be found.
last_id	Iterator to the ID of the last data point for which the leaf distribution is to be found.
treenum	Index of the tree to use.
leaves	After the function, this array contains pointers to the leaf distribution reached by the corresponding elements in the ID list. Expects to be pre-allocated to the correct size.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::getFeatureDefinitionString ( std::string & feat_str ) const

Retrieve a stored feature string.

This method is used to retrieve a feature string previously stored in the model. This string may be used to store parameters of the feature extraction process used to train the model. The parsing of this string is left entirely up to the user.

Parameters

feat_str The stored string is returned by reference in this variable. If no string has been stored, an empty string is returned.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

bool canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::isValid ( ) const

Check whether a forest model is valid.

Returns: True if the forest has either been successfully trained or successfully read from a file and is therefore ready to use. False otherwise. If false, the model should not be used.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

std::vector< double > canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::preCalculateXlogX ( const int N )

static protected

Calculate an array of x*log(x) for integer x.

This is a utility routine provided for subclasses to make use of if convenient. The quantity x*log(x) arises in many entropy-based calculations, including the fastDiscreteEntropy() and fastDiscreteEntropySplit() calculations, and needs to be calculated a very large number of times in such routines. This method pre-calculates an array of x*log(x) for integers x in the range 0 to N inclusive, such that it may be used by other routines to speed up calculations.

Parameters

N	Upper limit of the range of values for x

Returns: A vector 'result' of length N+1 where result[i] has the value i*log(i), and result[0] = 0.0

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TOutputIterator , class TFeatureFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::predictDistGroupwise	(	TIdIterator	first_id,
		const TIdIterator	last_id,
		TOutputIterator	out_it,
		TFeatureFunctor &&	feature_functor
	)		const

Predict the output distribution for a number of IDs.

This function uses the forest model to predict the full output distribution for each of a number of data points, where each data point is identified by an ID variable.

These ID variables are passed in as a pair of iterators pointing to the first and last IDs to be processed. The output distribution for each of these IDs is placed in a second container accessed by iterators.

In this version of the function, the features needed by a single node are requested from the feature functor for all the IDs with a single function call. This involves some overhead, but may permit efficiencies resulting from calculating multiple features at once.

Uses OpenMP to query the multiple tree models in parallel.

Template Parameters

TIdIterator	Type of the iterator to the IDs. Must be a random access iterator and dereference to the TId type expected by the feature functor.
TOutputIterator	Type of the iterator to the output distributions. Must be a forward output iterator that dereferences to TOutputDist.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a groupwise feature functor object, meaning it must define operator() with a certain form.

Parameters

first_id	Iterator to the first ID whose output is to be predicted.
last_id	Iterator to the last ID whose output is to be predicted.
out_it	Iterator to the output distribution corresponding to the first ID. The container of output distributions must already exist, and contain enough elements for all of the IDs between first_id and last_id. At the end of this function, the output distributions in this container relate to the corresponding elements of the id container.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TOutputIterator , class TFeatureFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::predictDistSingle	(	TIdIterator	first_id,
		const TIdIterator	last_id,
		TOutputIterator	out_it,
		TFeatureFunctor &&	feature_functor
	)		const

Predict the output distribution for a number of IDs.

This function uses the forest model to predict the full output distribution for each of a number of data points, where each data point is identified by an ID variable.

These ID variables are passed in as a pair of iterators pointing to the first and last IDs to be processed. The output distribution for each of these IDs is placed in a second container accessed by iterators.

In this version of the function, the features needed by a single node are requested from the feature functor one-by-one.

Uses OpenMP to query the multiple tree models in parallel.

Template Parameters

TIdIterator	Type of the iterator to the IDs. Must be a random access iterator and dereference to the TId type expected by the feature functor.
TOutputIterator	Type of the iterator to the output distributions. Must be a forward output iterator that dereferences to TOutputDist.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a single feature functor object, meaning it must define operator() with a certain form.

Parameters

first_id	Iterator to the first ID whose output is to be predicted.
last_id	Iterator to the last ID whose output is to be predicted.
out_it	Iterator to the output distribution corresponding to the first ID. The container of output distributions must already exist, and contain enough elements for all of the IDs between first_id and last_id. At the end of this function, the output distributions in this container relate to the corresponding elements of the id container.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TFeatureFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::probabilityGroupwise	(	TIdIterator	first_id,
		const TIdIterator	last_id,
		TLabelIterator	label_it,
		TOutputIterator	out_it,
		const bool	single_label,
		TFeatureFunctor &&	feature_functor
	)		const

Evaluate the probability of a certain value of the label for a set of data points.

This function uses the forest model to evaluate the probability of a given value of the label (output) variable for a number of data points, where each data point is identified by an ID variable.

These ID variables are passed in as a pair of iterators pointing to the first and last IDs to be processed. The value of the label for which the probability should be evaluated is passed in as a second iterator. The probability of the label for each of these IDs is placed in a third container accessed by iterators.

In this version of the function, the features needed by a single node are requested from the feature functor for all the IDs with a single function call. This involves some overhead, but may permit efficiencies resulting from calculating multiple features at once.

Uses OpenMP to query the multiple tree models in parallel.

Template Parameters

TIdIterator	Type of the iterator to the IDs. Must be a random access iterator and dereference to the TId type expected by the feature functor.
TLabelIterator	Type of the iterator to the labels. Must be a random access iterator and dereference to the TLabel type of the forest (or to something trivially convertible to that type).
TOutputIterator	Type of the iterator to the output. Must be a forward output iterator that dereferences to a type that supports assignment to float.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a groupwise feature functor object, meaning it must define operator() with a certain form.

Parameters

first_id	Iterator to the ID of the first data point for which the probability of the label is to be evaluated.
last_id	Iterator to the ID of the last data point for which the probability of the label is to be evaluated.
label_it	Iterator to the label variable whose probability is to be evaluated.
out_it	Iterator to the output probability value for the first ID. The container of output values must already exist, and contain enough elements for all of the IDs between first_id and last_id. At the end of this function, the output values in this container relate to the corresponding elements of the id container.
single_label	If true, the value of the label whose probability is evaluated is the same for all the data points. This means that the label_it iterator is never advanced. If false, the value of the label is not necessarily the same for all data points, and the label_it iterator is advanced for each data point to give the value of the label to use.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TBinaryFunction , class TFeatureFunctor , class TPDFFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::probabilityGroupwiseBase	(	TIdIterator	first_id,
		const TIdIterator	last_id,
		TLabelIterator	label_it,
		TOutputIterator	out_it,
		const bool	single_label,
		TBinaryFunction &&	binary_function,
		TFeatureFunctor &&	feature_functor,
		TPDFFunctor &&	pdf_functor
	)		const

A generalised version of the probabilityGroupwise() function that enables the creation of more general functions.

A generalised version of the probabilityGroupwise() function. There are two generalisations:

The pdf value may be calculated from the node distribution in some way other than calling the pdf() method. This enables, for example, accessing one distribution from a node distribution that contains multiple distributions over different variables. This behaviour is controlled by the pdf_functor object.
The output probability value may be used for something other than simple assignment to a variable. This may be used, for example, to use the output value to update some other variable (via multiplication or addition etc) in a single step without having to store results in a temporary array. This behaviour is controlled by the binary_function functor object.

Unless otherwise specified, the behaviour is the same as the probabilityGroupwise() function.

Template Parameters

TIdIterator	Type of the iterator to the IDs. Must be a random access iterator and dereference to the TId type expected by the feature functor.
TLabelIterator	Type of the iterator to the labels. Must be a random access iterator and dereference to the TLabel type of the forest (or to something trivially convertible to that type).
TOutputIterator	Type of the iterator to the output. Must be a forward output iterator that dereferences to a type that supports assignment to float.
TBinaryFunction	The type of the binary_function argument. Must be a function object that has an operator() of the form float operator()(TOutput, float) where TOutput is the type that TOutputIterator dereferences to.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a groupwise feature functor object, meaning it must define operator() with a certain form.
TPDFFunctor	The type of the pdf_functor argument. Must be a function object that has an operator() of the form float operator()(TNodeDist*, TLabel, TId).

Parameters

first_id	Iterator to the ID of the first data point for which the probability of the label is to be evaluated.
last_id	Iterator to the ID of the last data point for which the probability of the label is to be evaluated.
label_it	Iterator to the label variable whose probability is to be evaluated.
out_it	Iterator to the output probability value for the first ID. The container of output values must already exist, and contain enough elements for all of the IDs between first_id and last_id. At the end of this function, the output values in this container relate to the corresponding elements of the id container.
single_label	If true, the value of the label whose probability is evaluated is the same for all the data points. This means that the label_it iterator is never advanced. If false, the value of the label is not necessarily the same for all data points, and the label_it iterator is advanced for each data point to give the value of the label to use.
binary_function	A function object that takes the current value of the output variable (first argument) and the forest's predicted probability value (second argument) and returns the value that is then assigned to the output variable.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.
pdf_functor	A function object that takes a pointer to the leaf distribution reached by the forest (first argument), a label value (second argument), and an ID (third argument) and returns the value used as the pdf for that leaf distribution.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TFeatureFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::probabilitySingle	(	TIdIterator	first_id,
		const TIdIterator	last_id,
		TLabelIterator	label_it,
		TOutputIterator	out_it,
		const bool	single_label,
		TFeatureFunctor &&	feature_functor
	)		const

Evaluate the probability of a certain value of the label for a set of data points.

This function uses the forest model to evaluate the probability of a given value of the label (output) variable for a number of data points, where each data point is identified by an ID variable.

These ID variables are passed in as a pair of iterators pointing to the first and last IDs to be processed. The value of the label for which the probability should be evaluated is passed in as a second iterator. The probability of the label for each of these IDs is placed in a third container accessed by iterators.

In this version of the function, the features needed by a single node are requested from the feature functor one-by-one.

Uses OpenMP to query the multiple tree models in parallel.

Template Parameters

TIdIterator	Type of the iterator to the IDs. Must be a random access iterator and dereference to the TId type expected by the feature functor.
TLabelIterator	Type of the iterator to the labels. Must be a random access iterator and dereference to the TLabel type of the forest (or to something trivially convertible to that type).
TOutputIterator	Type of the iterator to the output. Must be a forward output iterator that dereferences to a type that supports assignment to float.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a single feature functor object, meaning it must define operator() with a certain form.

Parameters

first_id	Iterator to the ID of the first data point for which the probability of the label is to be evaluated.
last_id	Iterator to the ID of the last data point for which the probability of the label is to be evaluated.
label_it	Iterator to the label variable whose probability is to be evaluated.
out_it	Iterator to the output probability value for the first ID. The container of output values must already exist, and contain enough elements for all of the IDs between first_id and last_id. At the end of this function, the output values in this container relate to the corresponding elements of the id container.
single_label	If true, the value of the label whose probability is evaluated is the same for all the data points. This means that the label_it iterator is never advanced. If false, the value of the label is not necessarily the same for all data points, and the label_it iterator is advanced for each data point to give the value of the label to use.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TLabelIterator , class TOutputIterator , class TBinaryFunction , class TFeatureFunctor , class TPDFFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::probabilitySingleBase	(	TIdIterator	first_id,
		const TIdIterator	last_id,
		TLabelIterator	label_it,
		TOutputIterator	out_it,
		const bool	single_label,
		TBinaryFunction &&	binary_function,
		TFeatureFunctor &&	feature_functor,
		TPDFFunctor &&	pdf_functor
	)		const

A generalised version of the probabilitySingle() function that enables the creation of more general functions.

A generalised version of the probabilitySingle() function. There are two generalisations:

The pdf value may be calculated from the node distribution in some way other than calling the pdf() method. This enables, for example, accessing one distribution from a node distribution that contains multiple distributions over different variables. This behaviour is controlled by the pdf_functor object.
The output probability value may be used for something other than simple assignment to a variable. This may be used, for example, to use the output value to update some other variable (via multiplication or addition etc.) in a single step without having to store results in a temporary array. This behaviour is controlled by the binary_function functor object.

Unless otherwise specified, the behaviour is the same as the probabilitySingle() function.

Template Parameters

TIdIterator	Type of the iterator to the IDs. Must be a random access iterator and dereference to the TId type expected by the feature functor.
TLabelIterator	Type of the iterator to the labels. Must be a random access iterator and dereference to the TLabel type of the forest (or to something trivially convertible to that type).
TOutputIterator	Type of the iterator to the output. Must be a forward output iterator that dereferences to a type that supports assignment to float.
TBinaryFunction	The type of the binary_function argument. Must be a function object that has an operator() of the form float operator()(TOutput, float) where TOutput is the type that TOutputIterator dereferences to.
TFeatureFunctor	The type of the feature functor object. Must meet the specifications for a single feature functor, meaning it must define operator() with a certain form.
TPDFFunctor	The type of the pdf_functor argument. Must be a function object that has an operator() of the form float operator()(TNodeDist*, TLabel, TId).

Parameters

first_id	Iterator to the ID of the first data point for which the probability of the label is to be evaluated.
last_id	Iterator to the ID of the last data point for which the probability of the label is to be evaluated.
label_it	Iterator to the label variable whose probability is to be evaluated.
out_it	Iterator to the output probability value for the first ID. The container of output values must already exist, and contain enough elements for all of the IDs between first_id and last_id. At the end of this function, the output values in this container relate to the corresponding elements of the id container.
single_label	If true, the value of the label whose probability is evaluated is the same for all the data points. This means that the label_it iterator is never advanced. If false, the value of the label is not necessarily the same for all data points, and the label_it iterator is advanced for each data point to give the value of the label to use.
binary_function	A function object that takes the current value of the output variable (first argument) and the forest's predicted probability value (second argument) and returns the value that is then assigned to the output variable.
feature_functor	The feature functor object to be used as a callback to calculate the features. Must be safe to call from multiple threads simultaneously.
pdf_functor	A function object that takes a pointer to the leaf distribution reached by the forest (first argument), a label value (second argument), and an ID (third argument) and returns the value used as the pdf for that leaf distribution.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

bool canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::readFromFile	(	const std::string	filename,
		const int	trees_used = `-1`,
		const int	max_depth_used = `-1`
	)

Read a pre-trained model in from a file.

Read in the parameters for a forest from a pre-trained model stored in a .tr file. After this function, the object will be ready to use for testing with the pre-trained model.

Parameters

filename	The full name and path of the .tr file to read.
trees_used	The number of trees to read in from the file. If this is unspecified or set to a negative value, all the trees in the .tr file will be used. If the number specified is greater than the number trained in the .tr file, the function will fail and return false.
max_depth_used	The maximum tree depth to read from the file. If this is unspecified or set to a negative value, all the levels in the .tr file will be used. If the number specified is greater than the number trained in the .tr file, the function will fail and return false. The model in the .tr file must have been trained with the fit_split_nodes option set to true for this option to be successful.

Returns: True if the model was successfully read from the file, false otherwise. If false, the model should not be used.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::setFeatureDefinitionString	(	const std::string &	header_str,
		const std::string &	feat_str
	)

Store arbitrary strings that define parameters of the feature extraction process.

This string is stored alongside the model, enabling the storage of information necessary to recreate the same feature extraction process at test time. The construction (and later parsing) of this string is left entirely up to the user.

Parameters

header_str	This string will be printed above the feature string and is intended to help human readability of the string by explaining the meaning of the terms in the feature string.
feat_str	Arbitrary string containing data that can later be used to recreate the feature extraction process.

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

template<class TIdIterator , class TLabelIterator , class TFeatureFunctor , class TParameterFunctor >

void canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::train	(	const TIdIterator	first_id,
		const TIdIterator	last_id,
		const TLabelIterator	first_label,
		TFeatureFunctor &&	feature_functor,
		TParameterFunctor &&	parameter_functor,
		const unsigned	num_param_combos_to_test,
		const bool	bagging = `true`,
		const float	bag_proportion = `C_DEFAULT_BAGGING_PROPORTION`,
		const bool	train_split_nodes = `true`,
		const unsigned	min_training_data = `C_DEFAULT_MIN_TRAINING_DATA`
	)

Train the random forest model on training data.

This function trains the random forest model to produce a valid model that may be used for predictions or stored for future use. It takes iterators pointing to the IDs of the training data and the corresponding label variables, and functors to generate parameters of the feature functor and evaluate the features.

This function uses OpenMP to train the trees in parallel threads.

Template Parameters

TIdIterator	Type of the iterator used to access the training IDs. Must be a random access iterator that dereferences to the ID type expected by feature_functor.
TLabelIterator	Type of the iterator used to access the label variables. Must be a random access iterator that dereferences to type TLabel.
TFeatureFunctor	Type of the feature_functor parameter. Must be a groupwise feature functor object with an operator() of a specified form.
TParameterFunctor	Type of the parameter_functor parameter. Must be a parameter generator functor object with an operator() of the form void operator()(std::array<int,TNumParams>&)

Parameters

first_id	Iterator to the ID of the first element in the training list.
last_id	Iterator to the ID of the last element in the training list.
first_label	Iterator to the label of the first element in the training list. This iterator will be advanced to find the labels of the subsequent IDs.
feature_functor	The function object that should be used to evaluate the features when training the split nodes. Must be safe to call from multiple threads simultaneously.
parameter_functor	The function object that should be called to generate a random set of split node parameters for use in the feature_functor. Should take a std::array<int,TNumParams> by reference and populate the elements with a valid combination of randomly chosen parameters. Must be safe to call from multiple threads simultaneously.
num_param_combos_to_test	The number of parameter combinations to test when training each split node.
bagging	If true, a random subset of the training data are used to train each tree. If false, the full set of training data are used to train each tree. Default: true.
bag_proportion	Proportion of the training data in the bag used to train each tree if bagging is true. If bagging is false, this parameter is ignored. If the value is not in the range 0 to 1, the training procedure will fail immediately. Default: C_DEFAULT_BAGGING_PROPORTION .
train_split_nodes	If true, a node distribution is fitted at every node in the forest, not only at the leaf nodes. This is typically slightly more time consuming and results in a larger .tr file, but allows the trained model to be tested using a smaller depth than it was trained at. If false, the node distributions are only fitted to the leaf nodes. Default: true.
min_training_data	The threshold number of training data points in a node below which a leaf node is declared during training. Default: C_DEFAULT_MIN_TRAINING_DATA .

template<class TDerived , class TLabel , class TNodeDist , class TOutputDist , unsigned TNumParams>

bool canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >::writeToFile ( const std::string filename ) const

Write a trained model to a .tr file to be stored and re-used.

Ensure that setFeatureDefinitionString() is called before this function, otherwise a blank feature definition string will be stored.

Parameters

filename The full name and path of the file into which the model should be written.

Returns: True if the model was successfully written to the specified file, false otherwise.

The documentation for this class was generated from the following files:

Classes

Public Member Functions

Protected Member Functions

Static Protected Member Functions

Protected Attributes

Static Protected Attributes

Detailed Description

template<class TDerived, class TLabel, class TNodeDist, class TOutputDist, unsigned TNumParams> class canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >

Constructor & Destructor Documentation

Member Function Documentation

template<class TDerived, class TLabel, class TNodeDist, class TOutputDist, unsigned TNumParams>
class canopy::randomForestBase< TDerived, TLabel, TNodeDist, TOutputDist, TNumParams >