Source code for zoo.chronos.autots.autotsestimator

#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import types

from zoo.orca.automl.auto_estimator import AutoEstimator
from zoo.chronos.data import TSDataset
import zoo.orca.automl.hp as hp
from zoo.chronos.autots.model import AutoModelFactory
from zoo.chronos.autots.tspipeline import TSPipeline


class AutoTSEstimator:
    """
    Automated TimeSeries Estimator for time series forecasting task, which supports
    TSDataset and customized data creator as data input on built-in model (only
    "lstm", "tcn", "seq2seq" for now) and 3rd party model.

    Only backend="torch" is supported for now. Customized data creator has not been
    fully supported by TSPipeline.

    >>> # Here is a use case example:
    >>> # prepare train/valid/test tsdataset
    >>> autoest = AutoTSEstimator(model="lstm",
    >>>                           search_space=search_space,
    >>>                           past_seq_len=6,
    >>>                           future_seq_len=1)
    >>> tsppl = autoest.fit(data=tsdata_train,
    >>>                     validation_data=tsdata_valid)
    >>> tsppl.predict(tsdata_test)
    >>> tsppl.save("my_tsppl")
    """

    def __init__(self,
                 model="lstm",
                 search_space=None,
                 metric="mse",
                 loss=None,
                 optimizer="Adam",
                 past_seq_len=2,
                 future_seq_len=1,
                 input_feature_num=None,
                 output_target_num=None,
                 selected_features="auto",
                 backend="torch",
                 logs_dir="/tmp/autots_estimator",
                 cpus_per_trial=1,
                 name="autots_estimator",
                 remote_dir=None,
                 ):
        """
        AutoTSEstimator trains a model for time series forecasting.
        Users can choose one of the built-in models, or pass in a customized pytorch or keras model
        for tuning using AutoML.

        :param model: a string or a model creation function.
               A string indicates a built-in model, currently "lstm", "tcn", "seq2seq" are
               supported.
               A model creation function indicates a 3rd party model, the function should take a
               config param and return a torch.nn.Module (backend="torch") / tf model
               (backend="keras").
               If you use chronos.data.TSDataset as data input, the 3rd party
               should have 3 dim input (num_sample, past_seq_len, input_feature_num) and 3 dim
               output (num_sample, future_seq_len, output_feature_num) and use the same key
               in the model creation function. If you use a customized data creator, the output of
               data creator should fit the input of model creation function.
        :param search_space: str or dict. hyper parameter configurations. For str, you can choose
               from "minimal", "normal", or "large", each represents a default search_space for
               our built-in model with different computing requirement. For dict, Read the API docs
               for each auto model. Some common hyper parameter can be explicitly set in named
               parameter. search_space should contain those parameters other than the keyword
               arguments in this constructor in its key. If a 3rd party model is used, then you
               must set search_space to a dict. None (the default) is treated as an empty dict.
               The dict passed in is copied and never modified.
        :param metric: String. The evaluation metric name to optimize. e.g. "mse"
        :param loss: String or pytorch/tf.keras loss instance or pytorch loss creator function. The
               default loss function for pytorch backend is nn.MSELoss().
        :param optimizer: String or pyTorch optimizer creator function or
               tf.keras optimizer instance.
        :param past_seq_len: Int or hp sampling function. The number of historical steps (i.e.
               lookback) used for forecasting. For hp sampling, see zoo.orca.automl.hp for more
               details. The value defaults to 2.
        :param future_seq_len: Int. The number of future steps to forecast. The value defaults
               to 1.
        :param input_feature_num: Int. The number of features in the input. The value is ignored if
               you use chronos.data.TSDataset as input data type.
        :param output_target_num: Int. The number of targets in the output. The value is ignored if
               you use chronos.data.TSDataset as input data type.
        :param selected_features: String. "all" and "auto" are supported for now. For "all",
               all features that are generated are used for each trial. For "auto", a subset
               is sampled randomly from all features for each trial. The parameter is ignored
               if not using chronos.data.TSDataset as input data type. The value defaults
               to "auto".
        :param backend: The backend of the auto model. We only support backend as "torch" for now.
        :param logs_dir: Local directory to save logs and results.
               It defaults to "/tmp/autots_estimator"
        :param cpus_per_trial: Int. Number of cpus for each trial. It defaults to 1.
        :param name: name of the autots estimator. It defaults to "autots_estimator".
        :param remote_dir: String. Remote directory to sync training results and checkpoints. It
               defaults to None and doesn't take effect while running in local. While running in
               cluster, it defaults to "hdfs:///tmp/{name}".

        :raises ValueError: if backend is not "torch", or if model is neither a string nor a
               model creation function.
        """
        # NOTE(review): remote_dir is accepted and documented but is not forwarded to anything
        # in this constructor -- confirm whether the underlying estimators should receive it.

        # validate the cheap arguments first so that misuse fails before any heavy import
        if backend != "torch":
            raise ValueError(f"We only support backend as torch. Got {backend}")

        is_third_party_model = isinstance(model, types.FunctionType)
        if not is_third_party_model and not isinstance(model, str):
            # without this check self.model would never be set and fit() would fail much
            # later with an unrelated AttributeError
            raise ValueError("model should be a built-in model name (str) or a model creation "
                             f"function. Got {type(model).__name__}")

        # torch is imported lazily so that importing this module does not require it
        import torch
        if loss is None:
            loss = torch.nn.MSELoss()

        if search_space is None:
            search_space = {}
        elif isinstance(search_space, str):
            search_space = AutoModelFactory.get_default_search_space(model, search_space)
        # work on a shallow copy: the updates below must neither leak into the caller's dict
        # nor be shared between estimator instances
        search_space = dict(search_space)

        if is_third_party_model:
            # pytorch 3rd party model
            self.model = AutoEstimator.from_torch(model_creator=model,
                                                  optimizer=optimizer,
                                                  loss=loss,
                                                  logs_dir=logs_dir,
                                                  resources_per_trial={"cpu": cpus_per_trial},
                                                  name=name)
            # a 3rd party model gets metric and search space at fit time, so keep them here
            self.metric = metric
            search_space.update({"past_seq_len": past_seq_len,
                                 "future_seq_len": future_seq_len,
                                 "input_feature_num": input_feature_num,
                                 "output_feature_num": output_target_num})
            self.search_space = search_space
        else:
            # built-in model
            # update auto model common search space
            search_space.update({"past_seq_len": past_seq_len,
                                 "future_seq_len": future_seq_len,
                                 "input_feature_num": input_feature_num,
                                 "output_target_num": output_target_num,
                                 "loss": loss,
                                 "metric": metric,
                                 "optimizer": optimizer,
                                 "backend": backend,
                                 "logs_dir": logs_dir,
                                 "cpus_per_trial": cpus_per_trial,
                                 "name": name})

            # create auto model from name
            self.model = AutoModelFactory.create_auto_model(name=model,
                                                            search_space=search_space)

        # save selected features setting for data creator generation
        self.selected_features = selected_features
        # filled in by fit() when the training data is a TSDataset
        self._scaler = None
        self._scaler_index = None

    def fit(self,
            data,
            epochs=1,
            batch_size=32,
            validation_data=None,
            metric_threshold=None,
            n_sampling=1,
            search_alg=None,
            search_alg_params=None,
            scheduler=None,
            scheduler_params=None
            ):
        """
        fit using AutoEstimator

        :param data: train data.
               For backend of "torch", data can be a TSDataset or a function that takes a
               config dictionary as parameter and returns a PyTorch DataLoader.
               For backend of "keras", data can be a TSDataset.
        :param epochs: Max number of epochs to train in each trial. Defaults to 1.
               If you have also set metric_threshold, a trial will stop if either it has been
               optimized to the metric_threshold or it has been trained for {epochs} epochs.
        :param batch_size: Int or hp sampling function from an integer space. Training batch size.
               It defaults to 32.
        :param validation_data: Validation data. Validation data type should be the same as data.
        :param metric_threshold: a trial will be terminated when metric threshold is met.
        :param n_sampling: Number of times to sample from the search_space. Defaults to 1.
               If hp.grid_search is in search_space, the grid will be repeated n_sampling of times.
               If this is -1, (virtually) infinite samples are generated
               until a stopping condition is met.
        :param search_alg: str, all supported searcher provided by ray tune
               (i.e."variant_generator", "random", "ax", "dragonfly", "skopt",
               "hyperopt", "bayesopt", "bohb", "nevergrad", "optuna", "zoopt" and
               "sigopt")
        :param search_alg_params: extra parameters for searcher algorithm besides search_space,
               metric and searcher mode
        :param scheduler: str, all supported scheduler provided by ray tune
        :param scheduler_params: parameters for scheduler

        :raises ValueError: if exactly one of data and validation_data is a TSDataset.

        :return: a TSPipeline with the best model.
        """
        is_third_party_model = isinstance(self.model, AutoEstimator)

        data_is_tsdataset = isinstance(data, TSDataset)
        validation_is_tsdataset = isinstance(validation_data, TSDataset)
        if data_is_tsdataset != validation_is_tsdataset:
            # a lone TSDataset would otherwise be handed to the model as-is and fail obscurely
            raise ValueError("data and validation_data should be of the same type; a TSDataset "
                             "is required for both when a TSDataset is used. Got "
                             f"{type(data).__name__} and {type(validation_data).__name__}")

        if data_is_tsdataset:
            # generate data creator from TSDataset (pytorch base require validation data)
            train_d, val_d = self._prepare_data_creator(
                search_space=self.search_space if is_third_party_model else self.model.search_space,
                train_data=data,
                val_data=validation_data,
            )
            self._scaler = data.scaler
            self._scaler_index = data.scaler_index
        else:
            train_d, val_d = data, validation_data
            # drop any scaler remembered from an earlier TSDataset based fit
            self._scaler = None
            self._scaler_index = None

        if is_third_party_model:
            # AutoEstimator takes the batch size through the search space
            self.search_space.update({"batch_size": batch_size})
            self.model.fit(
                data=train_d,
                epochs=epochs,
                validation_data=val_d,
                metric=self.metric,
                metric_threshold=metric_threshold,
                n_sampling=n_sampling,
                search_space=self.search_space,
                search_alg=search_alg,
                search_alg_params=search_alg_params,
                scheduler=scheduler,
                scheduler_params=scheduler_params,
            )
        else:
            self.model.fit(
                data=train_d,
                epochs=epochs,
                batch_size=batch_size,
                validation_data=val_d,
                metric_threshold=metric_threshold,
                n_sampling=n_sampling,
                search_alg=search_alg,
                search_alg_params=search_alg_params,
                scheduler=scheduler,
                scheduler_params=scheduler_params
            )

        return TSPipeline(best_model=self._get_best_automl_model(),
                          best_config=self.get_best_config(),
                          scaler=self._scaler,
                          scaler_index=self._scaler_index)

    def _prepare_data_creator(self, search_space, train_data, val_data=None):
        """
        prepare the data creators and add selected features to search_space

        search_space is modified in place: 'output_feature_num' and 'selected_features'
        are set on it.

        :param search_space: the search space
        :param train_data: train data
        :param val_data: validation data
        :raises ValueError: if self.selected_features is neither "all" nor "auto".
        :return: data creators from train and validation data. The validation data creator
                 is None if val_data is None.
        """
        # validate before touching search_space or importing anything heavy
        if self.selected_features not in ("all", "auto"):
            raise ValueError("Only \"all\" and \"auto\" are supported for selected_features, "
                             f"but found {self.selected_features}")

        import torch
        from torch.utils.data import TensorDataset, DataLoader
        import ray

        # automatically inference output_feature_num
        # input_feature_num will be set by base pytorch model according to selected features.
        search_space['output_feature_num'] = len(train_data.target_col)

        # append feature selection into search space
        # TODO: more flexible setting
        all_features = train_data.feature_col
        if self.selected_features == "auto" and len(all_features) > 0:
            search_space['selected_features'] = hp.choice_n(all_features,
                                                            min_items=0,
                                                            max_items=len(all_features))
        else:
            # "all", or "auto" with nothing to choose from
            search_space['selected_features'] = all_features

        def make_data_creator(data_id):
            """Build a data creator that rolls the dataset stored under data_id per trial."""
            def data_creator(config):
                ts_data = ray.get(data_id)

                x, y = ts_data.roll(lookback=config.get('past_seq_len'),
                                    horizon=config.get('future_seq_len'),
                                    feature_col=config['selected_features']) \
                              .to_numpy()

                return DataLoader(TensorDataset(torch.from_numpy(x).float(),
                                                torch.from_numpy(y).float()),
                                  batch_size=config["batch_size"],
                                  shuffle=True)
            return data_creator

        # put train/val data in ray
        train_data_creator = make_data_creator(ray.put(train_data))
        val_data_creator = None
        if val_data is not None:
            val_data_creator = make_data_creator(ray.put(val_data))

        return train_data_creator, val_data_creator

    def _get_best_automl_model(self):
        """
        For internal use only.

        :return: the best automl model instance
        """
        return self.model._get_best_automl_model()

    def get_best_config(self):
        """
        Get the best configuration

        :return: A dictionary of best hyper parameters
        """
        return self.model.get_best_config()