lundi 6 mai 2019

Categorical features in mlpack

I have been trying to implement a random forest in C++ using mlpack. Some of the features in my data are categorical. I have been trying to use mlpack's DatasetInfo to describe them, but with no success.

Code is below:

#include "pch.h"
#include <iostream>
#include <string>
using namespace arma;
using namespace mlpack;
using namespace mlpack::tree;
using namespace mlpack::cv;

int main()
{
    cout << "[SAMPLE:BEGIN]";

    // (1) Load the dataset
    cout << "\nLoading dataset...";

    mat dataset;
    data::DatasetInfo di;
    di = data::DatasetInfo(0);
    bool loaded = data::Load("data/final.csv", dataset, di);
    if (!loaded)
        return -1;

    di.Type(0) = data::Datatype::numeric;
    di.Type(1) = data::Datatype::categorical;
    di.Type(2) = data::Datatype::categorical;

    Row<size_t> labels;

    // Extract the labels from the last dimension of the training set
    //labels = conv_to<Row<size_t>>::from(dataset.row(dataset.n_rows - 1));
    loaded = data::Load("data/labels.csv", labels);

    // Remove the labels from the training set
    //dataset.shed_row(dataset.n_rows - 1);

    // (2) Training
    cout << "\nTraining...";
    const size_t numClasses = 2;
    const size_t minimumLeafSize = 5;
    const size_t numTrees = 10;

    RandomForest<GiniGain, RandomDimensionSelect> rf; 

    rf = RandomForest<GiniGain, RandomDimensionSelect>(dataset, di, labels,
        numClasses, numTrees, minimumLeafSize);

    Row<size_t> predictions;
    rf.Classify(dataset, predictions);

    const size_t correct = arma::accu(predictions == labels);

    cout << "\nTraining Accuracy: " << (double(correct) / double(labels.n_elem));


    //Save the model
    cout << "\nSaving model...";
    mlpack::data::Save("mymodel.xml", "model", rf, false);

    //Load the model
    cout << "\nLoading model...";
    mlpack::data::Load("mymodel.xml", "model", rf);

    // (6) Classify a new sample
    cout << "\nClassifying a new sample...";
    mat sample("67.00,5812,901");
    mat probabilities;
    rf.Classify(sample, predictions, probabilities);
    u64 result = predictions.at(0);
    cout << "\nClassification result: " << result << " , Probabilities: " <<
        probabilities.at(0) << "/" << probabilities.at(1);
    cout << "\n[SAMPLE:END]\n";
    return 0;
}

I have split the data into two files: one has the three features and the other has the labels. The features are in final.csv; a sample row looks like this:

1548.0,5964,812

where the first column is numeric and the other two columns need to be treated as categories.

The labels.csv file has the labels, each of which is either 0 or 1.

This sample crashes with a read access violation exception when I try to train the model.

I think I am doing something wrong in the way I am trying to specify the DatasetInfo.

Can anyone point out what is going wrong here, or point me to some samples where I can see how DatasetInfo can be used?

Thanks.

Aucun commentaire:

Enregistrer un commentaire