Source code for zoo.chronos.autots.autotsestimator

#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import types

from zoo.orca.automl.auto_estimator import AutoEstimator
from zoo.chronos.data import TSDataset
import zoo.orca.automl.hp as hp
from zoo.chronos.autots.model import AutoModelFactory
from zoo.chronos.autots.tspipeline import TSPipeline


class AutoTSEstimator:
    """
    Automated TimeSeries Estimator for time series forecasting task, which supports
    TSDataset and customized data creator as data input on built-in model (only
    "lstm", "tcn", "seq2seq" for now) and 3rd party model.

    Only backend="torch" is supported for now. Customized data creator has not been
    fully supported by TSPipeline.

    >>> # Here is a use case example:
    >>> # prepare train/valid/test tsdataset
    >>> autoest = AutoTSEstimator(model="lstm",
    >>>                           search_space=search_space,
    >>>                           past_seq_len=6,
    >>>                           future_seq_len=1)
    >>> tsppl = autoest.fit(data=tsdata_train,
    >>>                     validation_data=tsdata_valid)
    >>> tsppl.predict(tsdata_test)
    >>> tsppl.save("my_tsppl")
    """

    def __init__(self,
                 model="lstm",
                 search_space=None,
                 metric="mse",
                 loss=None,
                 optimizer="Adam",
                 past_seq_len=2,
                 future_seq_len=1,
                 input_feature_num=None,
                 output_target_num=None,
                 selected_features="auto",
                 backend="torch",
                 logs_dir="/tmp/autots_estimator",
                 cpus_per_trial=1,
                 name="autots_estimator",
                 remote_dir=None,
                 ):
        """
        AutoTSEstimator trains a model for time series forecasting.
        Users can choose one of the built-in models, or pass in a customized pytorch
        or keras model for tuning using AutoML.

        :param model: a string or a model creation function.
               A string indicates a built-in model, currently "lstm", "tcn", "seq2seq"
               are supported.
               A model creation function indicates a 3rd party model, the function should
               take a config param and return a torch.nn.Module (backend="torch") / tf
               model (backend="keras").
               If you use chronos.data.TSDataset as data input, the 3rd party should have
               3 dim input (num_sample, past_seq_len, input_feature_num) and 3 dim output
               (num_sample, future_seq_len, output_feature_num) and use the same key in
               the model creation function. If you use a customized data creator, the
               output of data creator should fit the input of model creation function.
        :param search_space: str or dict. hyper parameter configurations.
               For str, you can choose from "minimal", "normal", or "large", each
               represents a default search_space for our built-in model with different
               computing requirement.
               For dict, read the API docs for each auto model. Some common hyper
               parameters can be explicitly set in named parameters. search_space should
               contain those parameters other than the keyword arguments in this
               constructor in its key. If a 3rd party model is used, then you must set
               search_space to a dict.
               Defaults to None, which is treated as an empty dict. A dict passed in is
               shallow-copied, so the caller's object is never modified.
        :param metric: String. The evaluation metric name to optimize. e.g. "mse"
        :param loss: String or pytorch/tf.keras loss instance or pytorch loss creator
               function. The default loss function for pytorch backend is nn.MSELoss().
        :param optimizer: String or pyTorch optimizer creator function or
               tf.keras optimizer instance.
        :param past_seq_len: Int or hp sampling function. The number of historical
               steps (i.e. lookback) used for forecasting. For hp sampling, see
               zoo.orca.automl.hp for more details. The value defaults to 2.
        :param future_seq_len: Int. The number of future steps to forecast. The value
               defaults to 1.
        :param input_feature_num: Int. The number of features in the input. The value is
               ignored if you use chronos.data.TSDataset as input data type.
        :param output_target_num: Int. The number of targets in the output. The value is
               ignored if you use chronos.data.TSDataset as input data type.
        :param selected_features: String. "all" and "auto" are supported for now. For
               "all", all features that are generated are used for each trial. For
               "auto", a subset is sampled randomly from all features for each trial.
               The parameter is ignored if not using chronos.data.TSDataset as input
               data type. The value defaults to "auto".
        :param backend: The backend of the auto model. We only support backend as "torch"
               for now.
        :param logs_dir: Local directory to save logs and results. It defaults to
               "/tmp/autots_estimator"
        :param cpus_per_trial: Int. Number of cpus for each trial. It defaults to 1.
        :param name: name of the autots estimator. It defaults to "autots_estimator".
        :param remote_dir: String. Remote directory to sync training results and
               checkpoints. It defaults to None and doesn't take effect while running in
               local. While running in cluster, it defaults to "hdfs:///tmp/{name}".
               NOTE(review): this constructor does not forward the value to the
               underlying auto model; confirm whether that is intended.

        :raises ValueError: if backend is not "torch", or if model is neither a string
                nor a model creation function.
        """
        # Validate cheap arguments first so that bad input fails before torch is loaded.
        if backend != "torch":
            raise ValueError(f"We only support backend as torch. Got {backend}")
        if not isinstance(model, (str, types.FunctionType)):
            # Without this check neither branch below would run and the instance
            # would be left without a `model` attribute.
            raise ValueError("model should be a string (built-in model) or a model "
                             f"creation function, but found {type(model)}")

        import torch
        if loss is None:
            loss = torch.nn.MSELoss()

        if search_space is None:
            search_space = {}
        elif isinstance(search_space, str):
            search_space = AutoModelFactory.get_default_search_space(model, search_space)
        # Work on a private shallow copy: the updates below must not leak into the
        # caller's dict or into any other estimator instance.
        search_space = dict(search_space)

        if isinstance(model, types.FunctionType):
            # pytorch 3rd party model
            self.model = AutoEstimator.from_torch(model_creator=model,
                                                  optimizer=optimizer,
                                                  loss=loss,
                                                  logs_dir=logs_dir,
                                                  resources_per_trial={"cpu": cpus_per_trial},
                                                  name=name)
            self.metric = metric
            search_space.update({"past_seq_len": past_seq_len,
                                 "future_seq_len": future_seq_len,
                                 "input_feature_num": input_feature_num,
                                 "output_feature_num": output_target_num})
            self.search_space = search_space
        else:
            # built-in model: push the common settings into the auto model search space
            search_space.update({"past_seq_len": past_seq_len,
                                 "future_seq_len": future_seq_len,
                                 "input_feature_num": input_feature_num,
                                 "output_target_num": output_target_num,
                                 "loss": loss,
                                 "metric": metric,
                                 "optimizer": optimizer,
                                 "backend": backend,
                                 "logs_dir": logs_dir,
                                 "cpus_per_trial": cpus_per_trial,
                                 "name": name})
            # create auto model from name
            self.model = AutoModelFactory.create_auto_model(name=model,
                                                            search_space=search_space)

        # save selected features setting for data creator generation
        self.selected_features = selected_features
        # filled in by fit() when the training data is a TSDataset
        self._scaler = None
        self._scaler_index = None

    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            validation_data=None,
            metric_threshold=None,
            n_sampling=1,
            search_alg=None,
            search_alg_params=None,
            scheduler=None,
            scheduler_params=None
            ):
        """
        fit using AutoEstimator

        :param data: train data.
               For backend of "torch", data can be a TSDataset or a function that takes a
               config dictionary as parameter and returns a PyTorch DataLoader.
               For backend of "keras", data can be a TSDataset.
        :param epochs: Max number of epochs to train in each trial. Defaults to 1.
               If you have also set metric_threshold, a trial will stop if either it has
               been optimized to the metric_threshold or it has been trained for
               {epochs} epochs.
        :param batch_size: Int or hp sampling function from an integer space. Training
               batch size. It defaults to 32.
        :param validation_data: Validation data. Validation data type should be the same
               as data.
        :param metric_threshold: a trial will be terminated when metric threshold is met.
        :param n_sampling: Number of times to sample from the search_space. Defaults to 1.
               If hp.grid_search is in search_space, the grid will be repeated n_sampling
               of times. If this is -1, (virtually) infinite samples are generated
               until a stopping condition is met.
        :param search_alg: str, all supported searcher provided by ray tune
               (i.e."variant_generator", "random", "ax", "dragonfly", "skopt",
               "hyperopt", "bayesopt", "bohb", "nevergrad", "optuna", "zoopt" and
               "sigopt")
        :param search_alg_params: extra parameters for searcher algorithm besides
               search_space, metric and searcher mode
        :param scheduler: str, all supported scheduler provided by ray tune
        :param scheduler_params: parameters for scheduler

        :return: a TSPipeline with the best model.
        """
        is_third_party_model = isinstance(self.model, AutoEstimator)

        # Data creators are generated only when BOTH inputs are TSDataset (the pytorch
        # base requires validation data). Anything else is handed to the model as-is.
        if isinstance(data, TSDataset) and isinstance(validation_data, TSDataset):
            active_space = self.search_space if is_third_party_model \
                else self.model.search_space
            train_d, val_d = self._prepare_data_creator(search_space=active_space,
                                                        train_data=data,
                                                        val_data=validation_data)
            # remember the scaler so the resulting pipeline can reuse it
            self._scaler = data.scaler
            self._scaler_index = data.scaler_index
        else:
            train_d, val_d = data, validation_data

        if is_third_party_model:
            # AutoEstimator takes the batch size through the search space
            self.search_space.update({"batch_size": batch_size})
            self.model.fit(
                data=train_d,
                epochs=epochs,
                validation_data=val_d,
                metric=self.metric,
                metric_threshold=metric_threshold,
                n_sampling=n_sampling,
                search_space=self.search_space,
                search_alg=search_alg,
                search_alg_params=search_alg_params,
                scheduler=scheduler,
                scheduler_params=scheduler_params,
            )
        else:
            self.model.fit(
                data=train_d,
                epochs=epochs,
                batch_size=batch_size,
                validation_data=val_d,
                metric_threshold=metric_threshold,
                n_sampling=n_sampling,
                search_alg=search_alg,
                search_alg_params=search_alg_params,
                scheduler=scheduler,
                scheduler_params=scheduler_params
            )

        return TSPipeline(best_model=self._get_best_automl_model(),
                          best_config=self.get_best_config(),
                          scaler=self._scaler,
                          scaler_index=self._scaler_index)

    def _prepare_data_creator(self, search_space, train_data, val_data=None):
        """
        prepare the data creators and add selected features to search_space

        Note that search_space is updated in place with the keys
        'output_feature_num' and 'selected_features'.

        :param search_space: the search space
        :param train_data: train data
        :param val_data: validation data

        :return: data creators from train and validation data
        :raises ValueError: if self.selected_features is neither "all" nor "auto".
        """
        import torch
        from torch.utils.data import TensorDataset, DataLoader
        import ray

        # automatically inference output_feature_num
        # input_feature_num will be set by base pytorch model according to selected features.
        search_space['output_feature_num'] = len(train_data.target_col)

        # append feature selection into search space
        # TODO: more flexible setting
        all_features = train_data.feature_col
        if self.selected_features not in ("all", "auto"):
            raise ValueError("Only \"all\" and \"auto\" are supported for selected_features, "
                             f"but found {self.selected_features}")
        if self.selected_features == "auto" and len(all_features) != 0:
            # sample a (possibly empty) subset of the features for each trial
            search_space['selected_features'] = hp.choice_n(all_features,
                                                            min_items=0,
                                                            max_items=len(all_features))
        else:
            # "all", or "auto" with nothing to choose from
            search_space['selected_features'] = all_features

        def _loader_creator(data_id):
            # Build a creator that fetches the dataset stored under data_id and rolls
            # it with the trial's lookback / horizon / selected features.
            def data_creator(config):
                ts_data = ray.get(data_id)
                x, y = ts_data.roll(lookback=config.get('past_seq_len'),
                                    horizon=config.get('future_seq_len'),
                                    feature_col=config['selected_features']) \
                    .to_numpy()
                return DataLoader(TensorDataset(torch.from_numpy(x).float(),
                                                torch.from_numpy(y).float()),
                                  batch_size=config["batch_size"],
                                  shuffle=True)
            return data_creator

        # put train/val data in ray
        train_data_creator = _loader_creator(ray.put(train_data))
        val_data_creator = _loader_creator(ray.put(val_data))

        return train_data_creator, val_data_creator

    def _get_best_automl_model(self):
        """
        For internal use only.

        :return: the best automl model instance
        """
        return self.model._get_best_automl_model()

    def get_best_config(self):
        """
        Get the best configuration

        :return: A dictionary of best hyper parameters
        """
        return self.model.get_best_config()