daal4py 随机森林模型训练mnist并保存模型给C++ daal predict使用

# daal4py Decision Forest Classification Training example Serialization

import daal4py as d4p
import numpy as np
import pickle
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

def get_mnist(random_state=777):
    """Fetch MNIST and return the training partition as daal4py-ready arrays.

    Args:
        random_state: Seed forwarded to ``train_test_split``. Without a seed
            the split is reshuffled on every call, so a separate process that
            repeats the split to obtain "test" rows would draw a sample that
            overlaps the rows used for training. With a fixed seed, repeating
            the split with the same seed yields the complementary rows.

    Returns:
        Tuple ``(data, labels)`` of C-contiguous float32 arrays holding the
        60000 training rows; ``labels`` is reshaped to a single column.
    """
    # NOTE(review): fetch_mldata was removed from newer scikit-learn releases
    # (fetch_openml is its replacement) - confirm the installed version.
    mnist = fetch_mldata('MNIST original')
    X_train, _, y_train, _ = train_test_split(
        mnist.data, mnist.target,
        train_size=60000, test_size=10000, random_state=random_state)
    data = np.ascontiguousarray(X_train, dtype=np.float32)
    labels = np.ascontiguousarray(y_train, dtype=np.float32).reshape(y_train.shape[0], 1)

    return data, labels

# A pickled result can only be read back through daal4py + pickle.
def pickle_serialization(result, file='df_result.pkl'):
    """Pickle ``result`` into the file at path ``file`` (overwriting it).

    Args:
        result: Object to pickle (the training result in this script).
        file: Destination path, opened in binary write mode.
    """
    with open(file, 'wb') as sink:
        pickle.Pickler(sink).dump(result)

# Universal native DAAL model serialization. Can be used in all DAAL
# interfaces: C++/Java/pydaal/daal4py.
def native_serialization(result, file='native_result.txt'):
    """Write the native DAAL byte buffer of ``result`` to ``file``.

    Args:
        result: Object whose ``__getstate__()`` returns the serialized bytes.
        file: Destination path, opened in binary write mode (overwritten).
    """
    daal_buff = result.__getstate__()
    # The context manager guarantees the buffer is flushed and the handle is
    # closed before returning, so another process can read the file right away.
    with open(file, "wb") as out:
        out.write(daal_buff)


if __name__ == "__main__":
    features, targets = get_mnist()

    # 'fptype' should be the same type as the input numpy arrays to achieve
    # the best performance (no data conversion is needed in that case).
    # The leading 10 is presumably the number of classes - confirm against
    # the daal4py API.
    forest_trainer = d4p.decision_forest_classification_training(
        10,
        fptype='float',
        nTrees=100,
        minObservationsInLeafNode=1,
        engine=d4p.engines_mt19937(seed=777),
        bootstrap=True,
    )
    training_result = forest_trainer.compute(features, targets)

    # Persist the trained model in both formats.
    pickle_serialization(training_result)
    native_serialization(training_result)

python预测

import daal4py as d4p

import numpy as np
import pickle
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split

def get_mnist_test(random_state=777):
    """Fetch MNIST and return the held-out partition as daal4py-ready arrays.

    Args:
        random_state: Seed forwarded to ``train_test_split``. Without a seed
            the split is reshuffled on every call, so the rows returned here
            would be a fresh random sample that overlaps whatever rows a model
            was trained on, inflating the measured accuracy. It must equal the
            seed used when the training rows were selected.

    Returns:
        Tuple ``(pdata, plabels)`` of C-contiguous float32 arrays holding the
        10000 test rows; ``plabels`` is reshaped to a single column.
    """
    # NOTE(review): fetch_mldata was removed from newer scikit-learn releases
    # (fetch_openml is its replacement) - confirm the installed version.
    mnist = fetch_mldata('MNIST original')
    _, X_test, _, y_test = train_test_split(
        mnist.data, mnist.target,
        train_size=60000, test_size=10000, random_state=random_state)
    pdata = np.ascontiguousarray(X_test, dtype=np.float32)
    plabels = np.ascontiguousarray(y_test, dtype=np.float32).reshape(y_test.shape[0], 1)

    return pdata, plabels

def checkAccuracy(plabels, prediction):
    """Return the fraction of entries where ``prediction`` equals ``plabels``.

    Both inputs are flattened, so ``(n,)`` and ``(n, 1)`` layouts are
    interchangeable. The comparison is done in one vectorised step instead of
    a Python-level loop with a hand-maintained index.

    Args:
        plabels: Ground-truth labels (array-like).
        prediction: Predicted labels (array-like) with the same number of entries.

    Returns:
        ``1 - mismatches / total`` as a Python float.

    Raises:
        ValueError: If the inputs are empty (accuracy is undefined) or hold a
            different number of entries (previously extra predictions were
            silently ignored and empty input ended in a ZeroDivisionError).
    """
    expected = np.asarray(plabels).ravel()
    actual = np.asarray(prediction).ravel()

    if expected.size != actual.size:
        raise ValueError(
            "plabels and prediction differ in length: "
            "%d vs %d" % (expected.size, actual.size))
    if expected.size == 0:
        raise ValueError("cannot compute accuracy of empty input")

    mismatches = int(np.count_nonzero(expected != actual))
    return 1 - mismatches / expected.size

def pickle_deserialization(file='df_result.pkl'):
    """Load and return the object pickled in ``file``.

    Unpickling can execute arbitrary code, so only point this at files
    from a trusted source.

    Args:
        file: Path of the pickle file, opened in binary read mode.

    Returns:
        The unpickled object.
    """
    with open(file, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

def native_deserialization(file='native_result.txt'):
    """Rebuild a decision-forest training result from a native DAAL buffer.

    Args:
        file: Path of the file holding the serialized bytes, opened in
            binary read mode.

    Returns:
        A ``d4p.decision_forest_classification_training_result`` whose state
        was restored from the file contents via ``__setstate__``.
    """
    daal_result = d4p.decision_forest_classification_training_result()
    # Read through a context manager so the handle is closed even if the
    # read fails; the original left the file open.
    with open(file, "rb") as inp:
        daal_buff = inp.read()
    daal_result.__setstate__(daal_buff)
    return daal_result

if __name__ == "__main__":
    nClasses = 10

    pdata, plabels = get_mnist_test()

    # Deserialize the model from both on-disk formats.
    deserialized_result_pickle = pickle_deserialization()

    deserialized_result_naitive = native_deserialization()

    # Now predict using the deserialized model from the training step;
    # fptype is float, matching the input data.
    predict_algo = d4p.decision_forest_classification_prediction(nClasses, fptype='float')

    # Feed the pickle-obtained model into compute.
    predict_result = predict_algo.compute(pdata, deserialized_result_pickle.model)

    # The "\n" escape had been turned into a raw line break, leaving an
    # unterminated string literal; it is restored here.
    print("\nAccuracy:", checkAccuracy(plabels, predict_result.prediction))

    # Expected to give the same result as above: feed the native-obtained
    # model into compute.
    predict_result = predict_algo.compute(pdata, deserialized_result_naitive.model)

    print("\nAccuracy:", checkAccuracy(plabels, predict_result.prediction))

c++使用该daal4py的模型：　　

/**
 * <a name="DAAL-EXAMPLE-CPP-DF_CLS_DENSE_BATCH"></a>
 * example df_cls_dense_batch.cpp
 */

#include "daal.h"
#include "service.h"
#include "stdio.h"
using namespace std;
using namespace daal;
using namespace daal::algorithms;
using namespace daal::algorithms::decision_forest::classification;

/* Problem dimensions */
const size_t nClasses  = 10;   /* Number of classes */
const size_t nFeatures = 784;  /* Number of features in training and testing data sets */

/* Default input files; main() hands the addresses of both to checkArguments() */
const std::string testDatasetFileName = "../data/batch/mnist_test_data.csv";
const std::string labels              = "../data/batch/mnist_test_labels.csv";

/* Forward declarations (testModel is declared here but not defined in this listing) */
void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth);
void loadData(const std::string& dataFileName, const std::string& labelsFileName, NumericTablePtr& pData, NumericTablePtr& pDependentVar);
void testModel();

int main(int argc, char *argv[])
{
    checkArguments(argc, argv, 2, &labels, &testDatasetFileName);

    /* Deserialization */
    size_t size = 0;
    byte * buffer = NULL;
    FILE * pFile;
    size_t result;
    
    pFile = fopen ( "../data/batch/native_result.txt" , "rb" );
    if (pFile==NULL)
    {
        fputs ("File error",stderr);
        exit (1);
    }
    
    // obtain file size:
    fseek (pFile , 0 , SEEK_END);
    size = ftell (pFile);
    std::cout << "size: " << size << "
";
    rewind(pFile);
    
    // allocate memory to contain the whole file:
    buffer = (byte*) malloc (sizeof(byte)*size);
    if (buffer == NULL)
    {
        fputs ("Memory error",stderr); 
        exit (2);
    }
    
    // copy the file into the buffer:
    result = fread (buffer,1,size,pFile);
    if (result != size)
    {
        fputs ("Reading error",stderr);
        exit (3);
    }
    /* the result buffer is now loaded in the buffer. */

    /* Create a data archive to deserialize the numeric table */
    OutputDataArchive out_dataArch(buffer, size);
    free (buffer);
    fclose (pFile);

    /* needed for result allocation */
    training::Batch<> train(nClasses);
    train.getResult()->deserialize(out_dataArch);

    /* Create Numeric Tables for testing data and ground truth values */
    NumericTablePtr testData;
    NumericTablePtr testGroundTruth;

    loadData(testDatasetFileName, labels, testData, testGroundTruth);
    /* Create an algorithm object to predict values of decision forest classification */
    prediction::Batch<> algorithm(nClasses);

    /* Pass a testing data set and the trained model to the algorithm */
    algorithm.input.set(classifier::prediction::data, testData);
    /* set deserialized model */
    algorithm.input.set(classifier::prediction::model, train.getResult()->get(classifier::training::model));

    /* Predict values of decision forest classification */
    algorithm.compute();

    /* Retrieve the algorithm results */
    NumericTablePtr prediction = algorithm.getResult()->get(classifier::prediction::prediction); 
    printNumericTable(prediction, "Prediction results (first 10 rows):", 10);
    printNumericTable(testGroundTruth, "Ground truth (first 10 rows):", 10);

    check_accuracy(prediction, testGroundTruth);
    
    return 0;
}

/**
 * Prints the fraction of rows whose predicted value equals the ground truth.
 *
 * Both tables are read as double blocks and compared index by index, so each is
 * expected to hold one value per row. If the tables are empty or their row
 * counts differ, a message is written to stderr and nothing is computed (the
 * original would have divided by zero or read past the prediction block).
 *
 * @param prediction       Table of predicted labels.
 * @param testGroundTruth  Table of expected labels.
 */
void check_accuracy(NumericTablePtr prediction, NumericTablePtr testGroundTruth)
{
    const size_t nRows = testGroundTruth->getNumberOfRows();
    if (nRows == 0 || prediction->getNumberOfRows() != nRows)
    {
        std::cerr << "check_accuracy: tables are empty or differ in row count" << "\n";
        return;
    }

    BlockDescriptor<double> blockPr;
    prediction->getBlockOfRows(0, nRows, readOnly, blockPr);
    const double * valueP = blockPr.getBlockPtr();

    BlockDescriptor<double> blockGT;
    testGroundTruth->getBlockOfRows(0, nRows, readOnly, blockGT);
    const double * valueG = blockGT.getBlockPtr();

    /* count the rows where prediction and ground truth disagree */
    size_t count = 0;
    for (size_t i = 0; i < nRows; i++)
    {
        if (valueG[i] != valueP[i])
            count++;
    }

    testGroundTruth->releaseBlockOfRows(blockGT);
    prediction->releaseBlockOfRows(blockPr);

    cout << "accuracy: " << 1 - double(count) / double(nRows) << "\n";
}

void loadData(const std::string& dataFileName,const std::string& labelsFileName, NumericTablePtr& pData, NumericTablePtr& pDependentVar)
{
    /* Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file */
    FileDataSource<CSVFeatureManager> trainDataSource(dataFileName,
        DataSource::notAllocateNumericTable,
        DataSource::doDictionaryFromContext);

    FileDataSource<CSVFeatureManager> trainLabels(labelsFileName,
        DataSource::notAllocateNumericTable,
        DataSource::doDictionaryFromContext);

    /* Create Numeric Tables for training data and dependent variables */
    pData.reset(new HomogenNumericTable<>(nFeatures, 0, NumericTable::notAllocate));
    pDependentVar.reset(new HomogenNumericTable<>(1, 0, NumericTable::notAllocate));

    /* Retrieve the data from input file */
    trainDataSource.loadDataBlock(pData.get());
    trainLabels.loadDataBlock(pDependentVar.get());
    NumericTableDictionaryPtr pDictionary = pData->getDictionarySharedPtr();
}