# Source code for zoo.chronos.simulator.doppelganger_simulator

#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pickle
import os
import numpy as np

from zoo.chronos.simulator.doppelganger.util import gen_attribute_input_noise,\
    gen_feature_input_noise, gen_feature_input_data_free, renormalize_per_sample
from zoo.chronos.simulator.doppelganger.output import OutputType

import torch
import torch.nn.functional as F

MODEL_PATH = "doppelganger.ckpt"
FEATURE_OUTPUT = "feature.output.ckpt"
ATTRIBUTE_OUTPUT = "attribute.output.ckpt"


class DPGANSimulator:
    '''
    Doppelganger Simulator for time series generation.
    The codes and algorithm are adapted from https://github.com/fjxmlzn/DoppelGANger.
    '''
    def __init__(self,
                 L_max,
                 sample_len,
                 feature_dim,
                 num_real_attribute,
                 discriminator_num_layers=5,
                 discriminator_num_units=200,
                 attr_discriminator_num_layers=5,
                 attr_discriminator_num_units=200,
                 attribute_num_units=100,
                 attribute_num_layers=3,
                 feature_num_units=100,
                 feature_num_layers=1,
                 attribute_input_noise_dim=5,
                 addi_attribute_input_noise_dim=5,
                 d_gp_coe=10,
                 attr_d_gp_coe=10,
                 g_attr_d_coe=1,
                 d_lr=0.001,
                 attr_d_lr=0.001,
                 g_lr=0.001,
                 g_rounds=1,
                 d_rounds=1,
                 seed=0,
                 num_threads=None,
                 ckpt_dir=".",
                 checkpoint_every_n_epoch=0):
        '''
        Initialize a doppelganger simulator.

        Only the configuration is recorded here; the model itself is created later by
        ``fit`` or ``load``.

        :param L_max: the maximum length of your feature.
        :param sample_len: the sample length to control LSTM length, should be a divisor
               of L_max.
        :param feature_dim: dimension of the feature.
        :param num_real_attribute: the length of your attribute, which should be equal to
               the len(data_attribute).
        :param discriminator_num_layers: MLP layer num for discriminator.
        :param discriminator_num_units: MLP hidden unit for discriminator.
        :param attr_discriminator_num_layers: MLP layer num for attr discriminator.
        :param attr_discriminator_num_units: MLP hidden unit for attr discriminator.
        :param attribute_num_units: MLP hidden unit for attr generator/addi attr generator.
        :param attribute_num_layers: MLP layer num for attr generator/addi attr generator.
        :param feature_num_units: LSTM hidden unit for feature generator.
        :param feature_num_layers: LSTM layer num for feature generator.
        :param attribute_input_noise_dim: noise data dim for attr generator.
        :param addi_attribute_input_noise_dim: noise data dim for addi attr generator.
        :param d_gp_coe: gradient penalty ratio for d loss.
        :param attr_d_gp_coe: gradient penalty ratio for attr d loss.
        :param g_attr_d_coe: ratio between feature loss and attr loss for g loss.
        :param d_lr: learning rate for discriminator.
        :param attr_d_lr: learning rate for attr discriminator.
        :param g_lr: learning rate for generators.
        :param g_rounds: g rounds.
        :param d_rounds: d rounds.
        :param seed: random seed, handed to pytorch_lightning's ``seed_everything``.
        :param num_threads: num of threads to be used for training; ``None`` leaves the
               torch thread setting untouched.
        :param ckpt_dir: The checkpoint location, defaults to the working dir.
        :param checkpoint_every_n_epoch: checkpoint every n epoch, defaults to 0
               for no checkpoints.
        '''
        # Process-wide settings: seed first, then the optional thread budget.
        from pytorch_lightning import seed_everything
        seed_everything(seed=seed)
        if num_threads is not None:
            torch.set_num_threads(num_threads)

        # Checkpoint configuration; model checkpoints live in a "model" sub-directory.
        self.ckpt_dir = ckpt_dir
        self.ckpt_dir_model = os.path.join(self.ckpt_dir, "model")
        self.checkpoint_every_n_epoch = checkpoint_every_n_epoch

        # Description of the data layout.
        self.sample_len = sample_len
        self.L_max = L_max
        self.feature_dim = feature_dim
        self.num_real_attribute = num_real_attribute

        # Hyper-parameters kept together so ``fit`` can forward them to the model as
        # keyword arguments.
        self.params = dict(
            discriminator_num_layers=discriminator_num_layers,
            discriminator_num_units=discriminator_num_units,
            attr_discriminator_num_layers=attr_discriminator_num_layers,
            attr_discriminator_num_units=attr_discriminator_num_units,
            attribute_num_units=attribute_num_units,
            attribute_num_layers=attribute_num_layers,
            feature_num_units=feature_num_units,
            feature_num_layers=feature_num_layers,
            attribute_input_noise_dim=attribute_input_noise_dim,
            addi_attribute_input_noise_dim=addi_attribute_input_noise_dim,
            d_gp_coe=d_gp_coe,
            attr_d_gp_coe=attr_d_gp_coe,
            g_attr_d_coe=g_attr_d_coe,
            d_lr=d_lr,
            attr_d_lr=attr_d_lr,
            g_lr=g_lr,
            g_rounds=g_rounds,
            d_rounds=d_rounds,
        )

        # The model is built lazily because its dimensions depend on the training data.
        self.model = None

[docs]    def fit(self,
            data_feature,
            data_attribute,
            data_gen_flag,
            feature_outputs,
            attribute_outputs,
            epoch=1,
            batch_size=32):
        '''
        Fit on the training data(typically the private data).

        :param data_feature: Training features, in numpy float32 array format.
               The size is [(number of training samples) x (maximum length)
               x (total dimension of features)]. Categorical features are stored
               by one-hot encoding; for example, if a categorical feature has 3
               possibilities, then it can take values between [1., 0., 0.],
               [0., 1., 0.], and [0., 0., 1.]. Each continuous feature should be
               normalized to [0, 1] or [-1, 1]. The array is padded by zeros after
               the time series ends.
        :param data_attribute: Training attributes, in numpy float32 array format. The size is
               [(number of training samples) x (total dimension of attributes)]. Categorical
               attributes are stored by one-hot encoding; for example, if a categorical
               attribute has 3 possibilities, then it can take values between [1., 0., 0.],
               [0., 1., 0.], and [0., 0., 1.]. Each continuous attribute should be normalized
               to [0, 1] or [-1, 1].
        :param data_gen_flag: Flags indicating the activation of features, in numpy float32
               array format. The size is [(number of training samples) x (maximum length)].
               1 means the time series is activated at this time step, 0 means the time series
               is inactivated at this timestep.
        :param feature_outputs: A list of Output indicates the meta data of data_feature.
        :param attribute_outputs: A list of Output indicates the meta data of data_attribute.
        :param epoch: training epoch.
        :param batch_size: training batchsize.
        '''
        # data preparation
        real_data = {}
        real_data["data_feature"] = data_feature
        real_data["data_attribute"] = data_attribute
        real_data["data_gen_flag"] = data_gen_flag
        from zoo.chronos.simulator.doppelganger.data_module import DoppelGANgerDataModule
        self.data_module = DoppelGANgerDataModule(real_data=real_data,
                                                  feature_outputs=feature_outputs,
                                                  attribute_outputs=attribute_outputs,
                                                  sample_len=self.sample_len,
                                                  batch_size=batch_size)

        from pytorch_lightning.callbacks import ModelCheckpoint
        checkpoint_callback = ModelCheckpoint(dirpath=self.ckpt_dir_model,
                                              save_top_k=-1,
                                              every_n_epochs=self.checkpoint_every_n_epoch)
        if self.checkpoint_every_n_epoch != 0:
            with open(os.path.join(self.ckpt_dir, FEATURE_OUTPUT), "wb") as f:
                pickle.dump(self.data_module.data_feature_outputs, f)
            with open(os.path.join(self.ckpt_dir, ATTRIBUTE_OUTPUT), "wb") as f:
                pickle.dump(self.data_module.data_attribute_outputs, f)

        # build the model
        from zoo.chronos.simulator.doppelganger.doppelganger_pl import DoppelGANger_pl
        self.model = DoppelGANger_pl(data_feature_outputs=self.data_module.data_feature_outputs,
                                     data_attribute_outputs=self.data_module.data_attribute_outputs,
                                     L_max=self.L_max,
                                     sample_len=self.sample_len,
                                     num_real_attribute=self.num_real_attribute,
                                     **self.params)
        from pytorch_lightning import Trainer
        self.trainer = Trainer(logger=False,
                               callbacks=checkpoint_callback,
                               max_epochs=epoch,
                               default_root_dir=self.ckpt_dir)

        # fit!
        self.trainer.fit(self.model, self.data_module)

[docs]    def generate(self, sample_num=1, batch_size=32):
        '''
        Generate synthetic data with similar distribution as training data.

        :param sample_num: How many samples to be generated.
        :param batch_size: batch size to generate.
        '''
        # set to inference mode
        self.model.eval()
        total_generate_num_sample = sample_num

        # generate noise and inputs
        real_attribute_input_noise = gen_attribute_input_noise(total_generate_num_sample)
        addi_attribute_input_noise = gen_attribute_input_noise(total_generate_num_sample)
        feature_input_noise = gen_feature_input_noise(total_generate_num_sample, self.model.length)
        feature_input_data = gen_feature_input_data_free(total_generate_num_sample,
                                                         self.model.sample_len,
                                                         self.feature_dim)
        real_attribute_input_noise = torch.from_numpy(real_attribute_input_noise).float()
        addi_attribute_input_noise = torch.from_numpy(addi_attribute_input_noise).float()
        feature_input_noise = torch.from_numpy(feature_input_noise).float()
        feature_input_data = torch.from_numpy(feature_input_data).float()

        # generate
        features, attributes, gen_flags, lengths\
            = self.model.sample_from(real_attribute_input_noise,
                                     addi_attribute_input_noise,
                                     feature_input_noise,
                                     feature_input_data,
                                     batch_size=batch_size)

        # renormalize (max, min)
        features, attributes = renormalize_per_sample(
            features, attributes, self.model.data_feature_outputs,
            self.model.data_attribute_outputs, gen_flags,
            num_real_attribute=self.num_real_attribute)  # -2 for addi attr

        # post-process the attributes
        output_list = []
        current_idx = 0
        for i in range(len(self.model.data_attribute_outputs)):
            output_it = self.model.data_attribute_outputs[i]
            if output_it.type_ == OutputType.DISCRETE:
                sub_output = F.softmax(torch.from_numpy(attributes[:, current_idx:
                                                                   current_idx+output_it.dim]))
                sub_output_discrete = F.one_hot(torch.argmax(sub_output, dim=1),
                                                num_classes=output_it.dim)
                output_list.append(sub_output_discrete)
            current_idx += output_it.dim
        attributes = torch.cat(output_list, dim=1).numpy()

        return features, attributes, gen_flags, lengths

[docs]    def save(self, path_dir):
        '''
        Save the simulator.

        :param path_dir: saving path
        '''
        path_dir_model = os.path.join(path_dir, "model")
        self.trainer.save_checkpoint(os.path.join(path_dir_model, MODEL_PATH))
        with open(os.path.join(path_dir, FEATURE_OUTPUT), "wb") as f:
            pickle.dump(self.data_module.data_feature_outputs, f)
        with open(os.path.join(path_dir, ATTRIBUTE_OUTPUT), "wb") as f:
            pickle.dump(self.data_module.data_attribute_outputs, f)

[docs]    def load(self,
             path_dir,
             model_version=MODEL_PATH):
        '''
        Load the simulator.

        :param path_dir: saving path
        :param model_version: model version(filename) you would like to load.
        '''
        with open(os.path.join(path_dir, FEATURE_OUTPUT), "rb") as f:
            data_feature_outputs = pickle.load(f)
        with open(os.path.join(path_dir, ATTRIBUTE_OUTPUT), "rb") as f:
            data_attribute_outputs = pickle.load(f)
        path_dir_model = os.path.join(path_dir, "model")
        from zoo.chronos.simulator.doppelganger.doppelganger_pl import DoppelGANger_pl
        self.model =\
            DoppelGANger_pl.load_from_checkpoint(os.path.join(path_dir_model, model_version),
                                                 data_feature_outputs=data_feature_outputs,
                                                 data_attribute_outputs=data_attribute_outputs)