Source code for giotto.ml.classifier.random_forest

'''Random Forest Classifier Module'''

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn import preprocessing
from sklearn import feature_selection

import pickle
import numpy as np

from giotto.ml.database.classifier import MLClassifier
from giotto.ml.database.sensor import MLSensor  


class MLRandomForest(MLClassifier):
    '''Random Forest classifier class

    This class trains a Random Forest classifier using a dataset passed to the
    "train" function. Then, it makes a prediction using timeseries data given
    to the "predict" function.
    If you want to implement a classifier class using other models, replicate
    this class. The class has to implement at least two functions, train and
    predict.
    '''
    def __init__(self, dictionary=None, serialized=False):
        '''Initializes the classifier, creating a fresh model if none exists

        Both arguments are forwarded unchanged to MLClassifier.__init__.
        NOTE(review): this relies on the base initializer setting self.model
        (None when there is no stored model) -- confirm against MLClassifier.
        '''
        MLClassifier.__init__(self, dictionary, serialized)
        if self.model is None:
            self.model = RandomForestClassifier()
            self.model_name = 'random forest'

    def extract_features(self, dataset):
        '''Extracts features and label indices from a given dataset

        Every sample's raw timeseries is turned into a feature vector with
        self.preprocess (not defined in this class; it is expected to come
        from MLClassifier), and every sample's label is replaced by its
        position in dataset['labels'].

        Args:
            dataset: dict with the keys
                'data': iterable of samples, each a dict holding
                    'timeseries' and 'label',
                'labels': list of all label values,
                'sampling_period': passed through unchanged.

        Returns:
            dict with the keys
                'features': 2-D array with one row per sample, or an empty
                    list when the dataset contains no samples,
                'labels': list of integer label indices, one per sample,
                'sampling_period': the value taken from the dataset.

        Raises:
            ValueError: if a sample's label is not in dataset['labels'].
        '''
        label_names = dataset['labels']

        feature_rows = []
        label_indices = []
        for sample in dataset['data']:
            feature_rows.append(self.preprocess(sample['timeseries']))
            label_indices.append(label_names.index(sample['label']))

        # Stack once after the loop. Comparing the accumulated array with []
        # on every iteration is ambiguous for NumPy arrays, and stacking inside
        # the loop copies the whole matrix each time. Stacking here also makes
        # a single-sample dataset produce a proper (1, n_features) matrix.
        all_features = np.vstack(feature_rows) if feature_rows else []

        return {
            'features': all_features,
            'labels': label_indices,
            'sampling_period': dataset['sampling_period']
        }

    def train(self, dataset):
        '''Trains a random forest classifier

        Fits a StandardScaler on the extracted features, trains self.model on
        the scaled features and stores the scaler, the fitted classifier, the
        sampling period and the label list on the instance for later use by
        "predict".

        Args:
            dataset: dict in the format accepted by extract_features.
        '''
        # Generate a training set
        data = self.extract_features(dataset)

        # Prescale
        self.scaler = preprocessing.StandardScaler().fit(data['features'])
        scaled_features = self.scaler.transform(data['features'])

        # Random Forest does not require feature selection. For other
        # classifiers, fit a selector (e.g. feature_selection.SelectKBest) on
        # scaled_features and data['labels'] here, and apply the same selector
        # in predict().

        # Train a classifier
        self.classifier = self.model.fit(scaled_features, data['labels'])
        self.sampling_period = data['sampling_period']
        self.labels = dataset['labels']

    def predict(self, timeseries):
        '''Makes a prediction using a pre-trained random forest classifier

        Requires that "train" has been called (or the equivalent state has
        been restored), because it reads self.scaler, self.classifier and
        self.labels.

        Args:
            timeseries: raw timeseries of one sample, in the format accepted
                by self.preprocess.

        Returns:
            The entry of self.labels selected by the predicted label index.
        '''
        # The scaler and classifier expect a 2-D (n_samples, n_features) input
        features = np.asarray(self.preprocess(timeseries)).reshape(1, -1)

        # Prescaling with the scaler fitted during training
        scaled_features = self.scaler.transform(features)

        # Prediction
        predictions = self.classifier.predict(scaled_features)

        # The classifier was trained on label indices; map back to the label
        return self.labels[int(predictions[0])]

# Ad-hoc manual entry point, executed only when this file is run as a script.
# NOTE(review): the two positional arguments land in the `dictionary` and
# `serialized` parameters of MLRandomForest.__init__; whether a string id and
# 'default' are valid values there depends on MLClassifier -- TODO confirm.
# NOTE(review): train() as defined in this file requires a `dataset`
# argument, so the call below would raise TypeError if reached -- confirm
# where the dataset is supposed to come from.
if __name__=="__main__":
    clf = MLRandomForest('56b3c0f023cf8c29e049e89e','default')

    clf.train()