Source code for giotto.ml.database.classifier

import json
import pickle
import numpy as np

class MLClassifier:
    '''Classifier base class'''

    def __init__(self, dictionary=None, serialized=False):
        '''Initializes an instance

        Initializes an instance with empty strings, None, and 0 when no
        dictionary is passed. When a dictionary is passed, sets properties
        based on the dictionary. If the dictionary was fetched from a
        database, the classifier, scaler, model, and selector are pickled,
        so they are deserialized before being set.

        Args:
            dictionary: A dictionary that contains properties.
                {
                    '_id': An object ID of a classifier
                    'sensor_id': An object ID of the virtual sensor this
                        classifier is associated with
                    'user_id': A user ID of the owner of this classifier
                    'model_name': e.g. 'random forest'
                    'labels': A list of potential outputs from this classifier
                    'sampling_period': The length of timeseries data necessary
                        to make a prediction. This value is calculated when
                        the classifier is trained, by averaging the sampling
                        periods of all samples in the training set
                    'classifier': A trained random forest classifier
                    'scaler': A scaler that scales inputs during
                        pre-processing
                    'selector': A feature selector
                }
            serialized: A flag indicating whether the instances in the
                dictionary are pickled and must be deserialized

        Returns:
            An MLClassifier instance
        '''
        if dictionary is None:
            self.object_id = ''
            self.sensor_id = ''
            self.model_name = ''
            self.user_id = ''
            self.model = None
            self.classifier = None
            self.scaler = None
            self.selector = None
            self.labels = []
            self.sampling_period = 0
        else:
            self.object_id = str(dictionary['_id'])
            self.sensor_id = dictionary['sensor_id']
            self.user_id = dictionary['user_id']
            self.model_name = dictionary['model_name']
            self.labels = dictionary['labels']
            self.sampling_period = dictionary['sampling_period']
            if serialized:
                # Instances stored in the database are pickled; restore them.
                self.classifier = pickle.loads(dictionary['classifier'])
                self.scaler = pickle.loads(dictionary['scaler'])
                self.model = pickle.loads(dictionary['model'])
                self.selector = pickle.loads(dictionary['selector'])
            else:
                self.classifier = dictionary['classifier']
                self.scaler = dictionary['scaler']
                self.model = dictionary['model']
                self.selector = dictionary['selector']
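    # Example (hypothetical usage): constructing instances. The collection
    # name and query below are illustrative, assuming a pymongo-style store
    # that returns documents with pickled instances.
    #
    #   clf = MLClassifier()                      # empty instance
    #   doc = db.classifiers.find_one({'sensor_id': sensor_id})
    #   clf = MLClassifier(doc, serialized=True)  # deserializes instances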
    def to_dictionary(self, serialized=False):
        '''Creates a dictionary that contains all properties

        Creates a dictionary that contains all properties, typically used to
        store a classifier in a database. When serialized=True, the
        classifier, scaler, selector, and model instances are pickled.

        Args:
            serialized: A flag that indicates whether the instances should be
                serialized

        Returns:
            dic: A dictionary that contains all properties
        '''
        dic = {
            'object_id': self.object_id,
            'sensor_id': self.sensor_id,
            'user_id': self.user_id,
            'model_name': self.model_name,
            'sampling_period': self.sampling_period,
            'labels': self.labels
        }
        if serialized:
            dic['classifier'] = pickle.dumps(self.classifier)
            dic['scaler'] = pickle.dumps(self.scaler)
            dic['selector'] = pickle.dumps(self.selector)
            dic['model'] = pickle.dumps(self.model)
        else:
            dic['classifier'] = self.classifier
            dic['scaler'] = self.scaler
            dic['selector'] = self.selector
            dic['model'] = self.model
        return dic
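    # Example (illustrative round trip): note that to_dictionary() emits the
    # ID under 'object_id' while __init__ reads '_id', so a storage layer
    # would map between the two keys.
    #
    #   stored = clf.to_dictionary(serialized=True)   # pickled instances
    #   stored['_id'] = stored.pop('object_id')
    #   restored = MLClassifier(stored, serialized=True)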
    def train(self, dataset):
        '''Trains a classifier using the dataset as a training set

        A derived class should implement a training algorithm here that
        creates and trains the classifier, scaler, selector, and model; see
        the sketch after predict() below.

        Args:
            dataset: A dictionary that holds a training set
                {
                    'data': An array of training samples
                        [
                            {
                                'timeseries': An array of timeseries data
                                    arrays
                                'label': A string label for the timeseries
                                    data
                            },
                            { more samples }
                        ]
                    'labels': An array of string labels
                    'sampling_period': An average duration of samples
                }

        Returns:
            Nothing
        '''
        pass
    def predict(self, timeseries):
        '''Makes a prediction using a pre-trained classifier

        A derived class should implement a prediction algorithm here.

        Args:
            timeseries: Timeseries data for prediction

        Returns:
            label: A string label as a prediction
        '''
        pass
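    # Sketch (illustrative, not giotto's actual implementation) of how a
    # derived class could implement train() and predict() with scikit-learn,
    # assuming the dataset layout documented above:
    #
    #   from sklearn.preprocessing import StandardScaler
    #   from sklearn.feature_selection import SelectKBest
    #   from sklearn.ensemble import RandomForestClassifier
    #
    #   class RandomForestMLClassifier(MLClassifier):
    #       def train(self, dataset):
    #           X = np.array([self.preprocess(s['timeseries'])
    #                         for s in dataset['data']])
    #           y = [s['label'] for s in dataset['data']]
    #           self.scaler = StandardScaler().fit(X)
    #           X = self.scaler.transform(X)
    #           self.selector = SelectKBest(k=min(8, X.shape[1])).fit(X, y)
    #           self.classifier = RandomForestClassifier().fit(
    #               self.selector.transform(X), y)
    #           self.model_name = 'random forest'
    #           self.labels = dataset['labels']
    #           self.sampling_period = dataset['sampling_period']
    #
    #       def predict(self, timeseries):
    #           x = self.preprocess(timeseries).reshape(1, -1)
    #           x = self.selector.transform(self.scaler.transform(x))
    #           return self.classifier.predict(x)[0]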
    def to_json(self):
        '''Returns a JSON representation of an MLClassifier instance'''
        return json.dumps(self, default=lambda o: o.__dict__,
                          separators=(',', ':'))
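    # Note (illustrative): to_json serializes via __dict__, which works for a
    # default instance but can fail for trained classifier/scaler/selector
    # objects whose attributes (e.g. ndarrays) have no __dict__.
    #
    #   MLClassifier().to_json()
    #   # => '{"object_id":"","sensor_id":"","model_name":"",...}'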
    def preprocess(self, sensor_readings):
        '''Extracts features from timeseries data

        Extracts the following features from an array of timeseries data:
        averages, standard deviations, numbers of peaks, medians, minimum
        values, maximum values, numbers of zero-crossings, and differences
        between max and min. If you want to implement your own feature
        extraction, override this function in a derived class.

        Args:
            sensor_readings: An array of timeseries data

        Returns:
            f: An ndarray that contains features for each timeseries data
                array
        '''
        colNum = len(sensor_readings)
        features = np.zeros((colNum, 8))
        for col in range(0, colNum):
            vals = np.array(sensor_readings[col]).astype('float')
            # average
            features[col, 0] = np.average(vals)
            # std
            features[col, 1] = np.std(vals)
            # peak count
            features[col, 2] = self.peak_count(vals)
            # median
            features[col, 3] = np.median(vals)
            # min
            features[col, 4] = np.min(vals)
            # max
            features[col, 5] = np.max(vals)
            # zero crossing
            features[col, 6] = self.zero_crossing(vals)
            # max - min
            features[col, 7] = features[col, 5] - features[col, 4]
        # Flatten the (colNum, 8) matrix into a single feature vector
        f = features.reshape(1, colNum * 8)[0]
        return f
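    # Example (illustrative): two sensor channels of five readings each yield
    # a flat vector of 2 * 8 = 16 features, ordered per channel as
    # [avg, std, peak count, median, min, max, zero-crossings, max - min].
    #
    #   readings = [[0.0, 1.0, 0.0, -1.0, 0.0],
    #               [2.0, 2.0, 2.0, 2.0, 2.0]]
    #   MLClassifier().preprocess(readings).shape  # => (16,)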
    def zero_crossing(self, data):
        '''Counts the number of zero-crossings in the given timeseries data'''
        count = 0
        for idx in range(len(data) - 1):
            # A sign change between consecutive samples is a zero-crossing
            if data[idx] * data[idx + 1] < 0:
                count = count + 1
        return count
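    # Example: [1, -2, 3] changes sign between each pair of samples, so
    #
    #   MLClassifier().zero_crossing([1, -2, 3])  # => 2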
    def peak_count(self, data):
        '''Counts the number of peaks in the given timeseries data'''
        count = 0
        std = np.std(data)
        for idx in range(len(data) - 2):
            # A peak is a sample above twice the standard deviation where
            # the slope changes sign
            if data[idx + 1] > std * 2 and \
                    (data[idx + 1] - data[idx]) * (data[idx + 2] - data[idx + 1]) < 0:
                count = count + 1
        return count
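    # Example: in [0, 0, 5, 0, 0] the middle sample (5) exceeds twice the
    # standard deviation (2 * 2.0 = 4.0) and the slope changes sign there, so
    #
    #   MLClassifier().peak_count([0, 0, 5, 0, 0])  # => 1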