# Source code for neurobench.datasets.mackey_glass

from neurobench.datasets.dataset import Dataset
import numpy as np
import torch
import os
from jitcdde import y, t, jitcdde_lyap
from .utils import download_url
from urllib.error import URLError
import tarfile

"""
The output of the jitcdde package, which is used to generate the Mackey-Glass
time-series, can vary across platforms because of differences in the
lower-level integration solvers. To ensure that you are using the same data as
the authors, please use the pre-generated version, which is downloaded
automatically.

https://huggingface.co/datasets/NeuroBench/mackey_glass
"""



[docs]
class MackeyGlass(Dataset):
    """Dataset for the Mackey-Glass task."""


[docs]
    def __init__(
        self,
        file_path=None,
        tau=17,
        lyaptime=197,
        constant_past=0.7206597,
        nmg=10,
        beta=0.2,
        gamma=0.1,
        pts_per_lyaptime=75,
        traintime=10.0,
        testtime=10.0,
        start_offset=0.0,
        seed_id=0,
        bin_window=1,
        download=True,
    ):
        """
        Initializes the Mackey-Glass dataset.

        The time-series is loaded from ``file_path`` when that file exists
        (after an optional download); otherwise it is generated with the
        equation parameters given here.

        Args:
            file_path (str): path to .npy file containing Mackey-Glass time-series. If the file is
                                 loaded, then tau, constant_past, nmg, beta, gamma and seed_id are not
                                 used to produce the data. If None, the time-series is generated.
            tau (float): parameter of the Mackey-Glass equation
            lyaptime (float): Lyapunov time of the time-series
            constant_past (float): initial condition for the solver
            nmg (float): parameter of the Mackey-Glass equation
            beta (float): parameter of the Mackey-Glass equation
            gamma (float): parameter of the Mackey-Glass equation
            pts_per_lyaptime (int): number of points to sample per one Lyapunov time
            traintime (float): number of Lyapunov times to be used for training a model
            testtime (float): number of Lyapunov times to be used for testing a model
            start_offset (int): added offset in number of points to shift the timeseries forward
            seed_id (int): seed for generating function solution
            bin_window (int): number of points forming lookback window for each prediction
            download (bool): If True and file_path is given but does not exist, downloads the
                                 dataset archive and unpacks it into the directory of file_path.
                                 If the file already exists, it is not downloaded again.

        """

        super().__init__()

        # Parameters
        self.tau = tau
        self.lyaptime = lyaptime
        self.constant_past = constant_past
        self.nmg = nmg
        self.beta = beta
        self.gamma = gamma
        self.pts_per_lyaptime = pts_per_lyaptime

        # Time units for train (user should split out the warmup or validation)
        self.traintime = traintime * self.lyaptime
        # Time units to forecast
        self.testtime = testtime * self.lyaptime

        self.start_offset = start_offset
        self.seed_id = seed_id

        self.bin_window = bin_window

        # Total time to simulate the system
        self.maxtime = (
            self.traintime + self.testtime + (self.lyaptime / self.pts_per_lyaptime)
        )

        # Discrete-time versions of the continuous times specified above
        self.traintime_pts = round(traintime * self.pts_per_lyaptime)
        self.testtime_pts = round(testtime * self.pts_per_lyaptime)
        self.maxtime_pts = (
            self.traintime_pts + self.testtime_pts + 1
        )  # eval one past the end

        # Specify the system using the provided parameters
        self.mackeyglass_specification = [
            self.beta * y(0, t - self.tau) / (1 + y(0, t - self.tau) ** self.nmg)
            - self.gamma * y(0)
        ]

        self.file_path = file_path
        self.url = "https://huggingface.co/datasets/NeuroBench/mackey_glass/resolve/main/data.tar.gz"

        # Load or generate time-series
        if self.file_path is None:
            # No file requested: os.path.exists(None) would raise TypeError,
            # and there is nothing to download or load, so simulate instead.
            self.generate_data()
        else:
            if download and not os.path.exists(self.file_path):
                print("downloading ....")
                self.download()

            # Test the boolean itself; `exists(...) is not None` is always
            # true and made the generation branch unreachable.
            if os.path.exists(self.file_path):
                self.load_data(self.file_path)
            else:
                print(f"{self.file_path} not found; generating the time-series instead.")
                self.generate_data()

        # Generate train/test indices
        self.split_data()



[docs]
    def download(self):
        """Download the Mackey Glass data if it doesn't exist already."""

        if os.path.exists(self.file_path):
            print("The dataset already exists!")
            return

        os.makedirs(os.path.dirname(self.file_path), exist_ok=True)

        # download file
        file_path = f"{os.path.dirname(self.file_path)}/data.tar.gz"
        try:
            print(f"Downloading {self.url}")
            download_url(self.url, file_path)
        except URLError as error:
            print(f"Failed to download (trying next):\n{error}")
        finally:
            print("Unzipping file...")
            with tarfile.open(file_path, "r:gz") as tar:
                for member in tar.getmembers():
                    if member.isfile():  # Check if it's a file
                        # Remove the directory path from the member's name
                        member.name = os.path.basename(member.name)
                        tar.extract(member, path=os.path.dirname(file_path))
            print()



[docs]
    def load_data(self, file):
        all_data = np.load(file)

        self.mackeyglass_soln = all_data[
            int(self.start_offset) : int(self.start_offset + self.maxtime_pts)
        ]

        self.mackeyglass_soln = torch.tensor(self.mackeyglass_soln, dtype=torch.float64)
        self.mackeyglass_soln = self.mackeyglass_soln.unsqueeze(dim=-1)

        # pad the soln with preceding zeroes for lookback window
        self.mackeyglass_soln = torch.cat(
            (
                torch.zeros((self.bin_window - 1, 1), dtype=torch.float64),
                self.mackeyglass_soln,
            ),
            0,
        )



[docs]
    def generate_data(self):
        """
        Generate time-series using the provided parameters of the equation.

        Seeds NumPy with ``seed_id``, integrates the specified system at
        ``maxtime_pts`` evenly spaced times and stores the result in
        ``self.mackeyglass_soln`` (with ``bin_window - 1`` leading rows of
        zeros). Also sets ``self.total_var`` and ``self.lyap_exp``.

        """
        np.random.seed(self.seed_id)

        # Build the solver object from the stored specification
        self.DDE = jitcdde_lyap(self.mackeyglass_specification)
        # self.DDE.set_integration_parameters(atol=1e-17, rtol=1e-17, min_step=1e-17) # TODO: comment this out later after testing
        self.DDE.constant_past([self.constant_past])
        self.DDE.step_on_discontinuities()

        # Buffers for the solution and the per-step Lyapunov estimates
        n_pts = self.maxtime_pts
        self.mackeyglass_soln = torch.zeros((n_pts, 1), dtype=torch.float64)
        lyaps = torch.zeros((n_pts, 1), dtype=torch.float64)
        lyaps_weights = torch.zeros((n_pts, 1), dtype=torch.float64)

        # start_offset is given in points; convert it to solver time units
        offset = self.start_offset * self.lyaptime / self.pts_per_lyaptime
        t_begin = self.DDE.t + offset
        sample_times = torch.linspace(
            t_begin,
            t_begin + self.maxtime,
            steps=n_pts,
            dtype=torch.float64,
        )

        for row, sample_time in enumerate(sample_times):
            value, lyap, weight = self.DDE.integrate(sample_time.item())
            self.mackeyglass_soln[row, 0] = value[0]
            lyaps[row, 0] = lyap[0]
            lyaps_weights[row, 0] = weight

        # Total variance of the generated Mackey-Glass time-series
        self.total_var = torch.var(self.mackeyglass_soln[:, 0], True)

        # Estimate Lyapunov exponent as the weight-averaged local estimates
        weighted_sum = lyaps.T @ lyaps_weights
        self.lyap_exp = (weighted_sum / lyaps_weights.sum()).item()

        # pad the soln with preceding zeroes for lookback window
        padding = torch.zeros((self.bin_window - 1, 1), dtype=torch.float64)
        self.mackeyglass_soln = torch.cat((padding, self.mackeyglass_soln), 0)



[docs]
    def split_data(self):
        """Generate training and testing indices."""
        self.ind_train = torch.arange(0, self.traintime_pts)
        self.ind_test = torch.arange(self.traintime_pts, self.maxtime_pts - 1)


    def __len__(self):
        """
        Returns number of samples in dataset.

        Returns:
            int: number of samples in dataset

        """
        return len(self.mackeyglass_soln) - 1


[docs]
    def __getitem__(self, idx):
        """
        Getter method for dataset.

        Args:
            idx (int or tensor): index(s) of sample(s) to return

        Returns:
            sample (tensor): individual data sample, shape=(timestamps, features)=(1,1)
            target (tensor): corresponding next state of the system, shape=(label,)=(1,)

        """
        # using Subset with list of indices
        if isinstance(idx, list) or (isinstance(idx, torch.Tensor) and idx.ndim > 0):
            # return in format (batch, bin_window, feature)
            data = [self.mackeyglass_soln[i : i + self.bin_window, :] for i in idx]
            sample = torch.stack(data)

        # idx is an integer
        else:
            sample = self.mackeyglass_soln[idx : idx + self.bin_window, :]

        target = self.mackeyglass_soln[
            idx + self.bin_window, :
        ]  # add to account for pre-padding

        return sample, target