# Module: zoo.chronos.data.experimental.xshards_tsdataset

#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


from zoo.orca.data.shard import SparkXShards
from zoo.chronos.data.utils.utils import _to_list, _check_type
from zoo.chronos.data.utils.roll import roll_timeseries_dataframe
from zoo.chronos.data.utils.split import split_timeseries_dataframe
from zoo.chronos.data.experimental.utils import add_row, transform_to_dict

_DEFAULT_ID_COL_NAME = "id"
_DEFAULT_ID_PLACEHOLDER = "0"


class XShardsTSDataset:

    def __init__(self, shards, **schema):
        '''
        XShardTSDataset is an abstract of time series dataset with distributed fashion.
        Cascade call is supported for most of the transform methods.
        XShardTSDataset will partition the dataset by id_col, which is experimental.

        :param shards: an xshards (or any mapping-style frame supporting
               ``shards[col].unique()``) holding the raw time series data.
        :param schema: required keys are "id_col", "dt_col", "feature_col"
               and "target_col"; the two column lists are copied defensively
               so later mutation by the caller cannot corrupt this dataset.
        '''
        self.shards = shards
        self.id_col = schema["id_col"]
        self.dt_col = schema["dt_col"]
        # copy() so that callers mutating their lists do not affect us
        self.feature_col = schema["feature_col"].copy()
        self.target_col = schema["target_col"].copy()
        # populated by roll(); None means roll() has not been called yet
        self.numpy_shards = None
        self._id_list = list(shards[self.id_col].unique())

    @staticmethod
    def from_xshards(shards,
                     dt_col,
                     target_col,
                     id_col=None,
                     extra_feature_col=None,
                     with_split=False,
                     val_ratio=0,
                     test_ratio=0.1,
                     largest_look_back=0,
                     largest_horizon=1):
        '''
        Initialize xshardtsdataset(s) from xshard pandas dataframe.

        :param shards: an xshards pandas dataframe for your raw time series data.
        :param dt_col: a str indicates the col name of datetime column in the
               input data frame.
        :param target_col: a str or list indicates the col name of target column
               in the input data frame.
        :param id_col: (optional) a str indicates the col name of dataframe id.
               If it is not explicitly stated, then the data is interpreted as
               only containing a single id.
        :param extra_feature_col: (optional) a str or list indicates the col name
               of extra feature columns that needs to predict the target column.
        :param with_split: (optional) bool, states if we need to split the dataframe
               to train, validation and test set. The value defaults to False.
        :param val_ratio: (optional) float, validation ratio. Only effective when
               with_split is set to True. The value defaults to 0.
        :param test_ratio: (optional) float, test ratio. Only effective when
               with_split is set to True. The value defaults to 0.1.
        :param largest_look_back: (optional) int, the largest length to look back.
               Only effective when with_split is set to True. The value defaults to 0.
        :param largest_horizon: (optional) int, the largest num of steps to look
               forward. Only effective when with_split is set to True. The value
               defaults to 1.

        :return: a XShardTSDataset instance when with_split is set to False,
                 three XShardTSDataset instances when with_split is set to True.

        Create a xshardtsdataset instance by:

        >>> # Here is a df example:
        >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
        >>> # 00        2019-01-01    1.9     1                   2
        >>> # 01        2019-01-01    2.3     0                   9
        >>> # 00        2019-01-02    2.4     3                   4
        >>> # 01        2019-01-02    2.6     0                   2
        >>> from zoo.orca.data.pandas import read_csv
        >>> shards = read_csv(csv_path)
        >>> tsdataset = XShardsTSDataset.from_xshards(shards, dt_col="datetime",
        >>>                                           target_col="value", id_col="id",
        >>>                                           extra_feature_col=["extra feature 1",
        >>>                                                              "extra feature 2"])
        '''
        _check_type(shards, "shards", SparkXShards)

        target_col = _to_list(target_col, name="target_col")
        feature_col = _to_list(extra_feature_col, name="extra_feature_col")

        if id_col is None:
            # no id supplied: tag every row with a single placeholder id
            shards = shards.transform_shard(add_row,
                                            _DEFAULT_ID_COL_NAME,
                                            _DEFAULT_ID_PLACEHOLDER)
            id_col = _DEFAULT_ID_COL_NAME

        # repartition so that each partition holds exactly one id
        shards = shards.partition_by(cols=id_col,
                                     num_partitions=len(shards[id_col].unique()))

        if with_split:
            # split each id's sub-dataframe into (train, val, test) shards
            tsdataset_shards\
                = shards.transform_shard(split_timeseries_dataframe,
                                         id_col, val_ratio, test_ratio,
                                         largest_look_back, largest_horizon).split()
            return [XShardsTSDataset(shards=tsdataset_shards[i],
                                     id_col=id_col,
                                     dt_col=dt_col,
                                     target_col=target_col,
                                     feature_col=feature_col) for i in range(3)]

        return XShardsTSDataset(shards=shards,
                                id_col=id_col,
                                dt_col=dt_col,
                                target_col=target_col,
                                feature_col=feature_col)

    def roll(self,
             lookback,
             horizon,
             feature_col=None,
             target_col=None,
             id_sensitive=False):
        '''
        Sampling by rolling for machine learning/deep learning models.

        :param lookback: int, lookback value.
        :param horizon: int or list,
               if `horizon` is an int, we will sample `horizon` step
               continuously after the forecasting point.
               if `horizon` is a list, we will sample discretely according
               to the input list.
               specially, when `horizon` is set to 0, ground truth will be
               generated as None.
        :param feature_col: str or list, indicates the feature col name. Default
               to None, where we will take all available feature in rolling.
        :param target_col: str or list, indicates the target col name. Default
               to None, where we will take all target in rolling. it should be
               a subset of target_col you used to initialize the xshardtsdataset.
        :param id_sensitive: bool,
               |if `id_sensitive` is False, we will rolling on each id's sub dataframe
               |and fuse the sampings.
               |The shape of rolling will be
               |x: (num_sample, lookback, num_feature_col + num_target_col)
               |y: (num_sample, horizon, num_target_col)
               |where num_sample is the summation of sample number of each dataframe
               |if `id_sensitive` is True, we have not implement this currently.

        :return: the xshardtsdataset instance.
        '''
        if id_sensitive:
            raise NotImplementedError("id_sensitive option has not been implemented.")
        # None means "use everything this dataset was initialized with"
        feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
            else self.feature_col
        target_col = _to_list(target_col, "target_col") if target_col is not None \
            else self.target_col
        self.numpy_shards = self.shards.transform_shard(roll_timeseries_dataframe,
                                                        None, lookback, horizon,
                                                        feature_col, target_col)
        return self

    def to_xshards(self):
        '''
        Export rolling result in form of a dict of numpy ndarray {'x': ..., 'y': ...}

        :return: a 2-element dict xshard. each value is a 3d numpy ndarray. The ndarray
                 is casted to float32.

        :raise RuntimeError: if roll() has not been called on this dataset yet.
        '''
        if self.numpy_shards is None:
            # NOTE: the original message used a backslash continuation inside the
            # string literal, which leaked indentation whitespace into the message.
            raise RuntimeError("Please call \"roll\" method "
                               "before transform a XshardsTSDataset to numpy ndarray!")
        return self.numpy_shards.transform_shard(transform_to_dict)