Multivariate Regression

Prediction of the Number of Sunspots Using a Multivariate Time Series

time-series
tensorflow
machine-learning
Author

Miguel Alexander Chitiva Diaz

Published

June 19, 2021

Open In Colab

In this post, we will perform multivariate regression with TensorFlow to forecast monthly sunspot counts, combining feature engineering, classical time-series analysis, and a CNN + LSTM network.

Configuration

window_size = 60
batch_size = 256
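
With monthly observations, window_size = 60 means every training example looks back over five years of history; batch_size = 256 is the number of windows processed per gradient step.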

Imports

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import warnings
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    module=r'^keras(\.|$)'
)


import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error
from tensorflow.keras import layers, Model, callbacks, losses
from datetime import datetime

# Let GPU memory allocation grow as needed instead of pre-allocating it all
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

Download the Dataset

# Load the sunspots dataset from Kaggle
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "robervalt/sunspots", 
    "Sunspots.csv",
)
df.head()
   Unnamed: 0        Date  Monthly Mean Total Sunspot Number
0           0  1749-01-31                               96.7
1           1  1749-02-28                              104.3
2           2  1749-03-31                              116.7
3           3  1749-04-30                               92.8
4           4  1749-05-31                              141.7

Data Preprocessing & Scaling

# Drop the unnamed index column, sort chronologically, and shorten column names
df = df.drop(df.columns[0], axis=1)
df = df.sort_values('Date')
df = df.rename(columns={df.columns[0]: 'date', df.columns[1]: 'sunspots'})

# Scale the sunspot counts to [0, 1]
scaler = MinMaxScaler()
df['normalized_sunspots'] = scaler.fit_transform(np.expand_dims(df['sunspots'], axis=-1))
df
            date  sunspots  normalized_sunspots
0     1749-01-31      96.7             0.242843
1     1749-02-28     104.3             0.261929
2     1749-03-31     116.7             0.293069
3     1749-04-30      92.8             0.233049
4     1749-05-31     141.7             0.355851
...          ...       ...                  ...
3260  2020-09-30       0.6             0.001507
3261  2020-10-31      14.4             0.036163
3262  2020-11-30      34.0             0.085384
3263  2020-12-31      21.8             0.054746
3264  2021-01-31      10.4             0.026118

3265 rows × 3 columns
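
As a sanity check on the scaling: MinMaxScaler maps x to (x − min) / (max − min). The table implies a series minimum of 0 and a maximum of about 398.2, so the first row works out to 96.7 / 398.2 ≈ 0.2428, matching the normalized value shown.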

Dataset Exploration

graphs = df.plot(grid=True, figsize=(14, 7), fontsize=10, subplots=True)
for ax in graphs:
    ax.legend(prop={'size': 15})

f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)
plot_acf(df['normalized_sunspots'], ax=ax1, title='Autocorrelation Function')
plot_pacf(df['normalized_sunspots'], ax=ax2, title='Partial Autocorrelation Function')
ax1.grid(True)
ax2.grid(True)
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
plt.tight_layout()
plt.show()

f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)
plot_acf(df['normalized_sunspots'], ax=ax1, lags=160, title='ACF with Extended Lags')
plot_pacf(df['normalized_sunspots'], ax=ax2, lags=160, title='PACF with Extended Lags')
ax1.grid(True)
ax2.grid(True)
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
plt.tight_layout()
plt.show()
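
The extended-lag plots make the slow oscillation of the series visible; the next cell searches those neighborhoods for the exact lags with the strongest positive and negative autocorrelation.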

# Lag with the strongest positive autocorrelation (a full cycle apart)
lag_pos_corr = np.argmax([df['normalized_sunspots'].autocorr(lag=x) for x in range(110, 130)])
lag_pos_corr = list(range(110, 130))[lag_pos_corr]

# Lag with the strongest negative autocorrelation (roughly half a cycle apart)
lag_neg_corr = np.argmin([df['normalized_sunspots'].autocorr(lag=x) for x in range(55, 65)])
lag_neg_corr = list(range(55, 65))[lag_neg_corr]

print(f"Seasonal max correlation: {lag_pos_corr}")
print(f"Seasonal min correlation: {lag_neg_corr}")
Seasonal max correlation: 128
Seasonal min correlation: 63
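
A peak at lag 128 corresponds to roughly 10.7 years, in line with the well-known ~11-year solar cycle, while the trough at lag 63 sits near the half-cycle, where activity is maximally out of phase.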

Feature Engineering

Create Moving Averages

# Moving averages over several look-back windows; rolling(window=n+1) averages
# the current month and the previous n, so the first n rows are NaN
df['MA(t-4)_sunspots'] = df['normalized_sunspots'].rolling(window=5).mean()
df['MA(t-10)_sunspots'] = df['normalized_sunspots'].rolling(window=11).mean()
df['MA(t-20)_sunspots'] = df['normalized_sunspots'].rolling(window=21).mean()
df['MA(t-60)_sunspots'] = df['normalized_sunspots'].rolling(window=61).mean()
df.tail()
            date  sunspots  normalized_sunspots  MA(t-4)_sunspots  MA(t-10)_sunspots  MA(t-20)_sunspots  MA(t-60)_sunspots
3260  2020-09-30       0.6             0.001507          0.010146           0.008059           0.009124           0.047859
3261  2020-10-31      14.4             0.036163          0.017278           0.011232           0.009926           0.045216
3262  2020-11-30      34.0             0.085384          0.031441           0.018652           0.013896           0.043997
3263  2020-12-31      21.8             0.054746          0.039327           0.022214           0.015379           0.042334
3264  2021-01-31      10.4             0.026118          0.040784           0.024542           0.015534           0.040374

Remove Seasonality and Create New Features

# Seasonal difference features: the value lag months ago minus the current
# value, padded with NaN where the lag reaches before the start of the series
pos_season = df[0:len(df)-lag_pos_corr]['normalized_sunspots'].values - df[lag_pos_corr:]['normalized_sunspots'].values
df['pos_season'] = np.concatenate([np.array([np.nan]*(len(df)-len(pos_season))), pos_season])

neg_season = df[0:len(df)-lag_neg_corr]['normalized_sunspots'].values - df[lag_neg_corr:]['normalized_sunspots'].values
df['neg_season'] = np.concatenate([np.array([np.nan]*(len(df)-len(neg_season))), neg_season])

Split Train/Validation Dataset

# Hold out the last ~265 months for validation
train_df = df[0:3000]
val_df = df[3000:]

# Drop rows with NaNs introduced by the rolling and seasonal features
train_df = train_df.dropna()
val_df = val_df.dropna()

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
train_df.head()
Training set size: 2872
Validation set size: 265
            date  sunspots  normalized_sunspots  MA(t-4)_sunspots  MA(t-10)_sunspots  MA(t-20)_sunspots  MA(t-60)_sunspots  pos_season  neg_season
128   1759-09-30     128.7             0.323204          0.249874           0.217113           0.210435           0.118817   -0.080362   -0.211452
129   1759-10-31      99.5             0.249874          0.258815           0.223460           0.214836           0.122350    0.012054   -0.171271
130   1759-11-30      77.2             0.193872          0.255751           0.224716           0.213700           0.123873    0.099196   -0.142391
131   1759-12-31      95.0             0.238574          0.260773           0.228026           0.215290           0.126878   -0.005525   -0.204169
132   1760-01-31     112.2             0.281768          0.257459           0.236907           0.214298           0.131209    0.074083   -0.180814

Check Multivariate Features

val_df[val_df.columns[-7:]].plot(figsize=(14,7), fontsize=10, grid=True).legend(prop={'size': 15});

Naïve Predictions Using Only the Engineered Features

# Score each engineered feature as a naïve one-feature predictor of the target
for col in val_df.columns[-6:]:
    print(f"MAE using only '{col}' = {mean_absolute_error(val_df['normalized_sunspots'], val_df[col])}")
MAE using only 'MA(t-4)_sunspots' = 0.028975484017702304
MAE using only 'MA(t-10)_sunspots' = 0.03701519617007236
MAE using only 'MA(t-20)_sunspots' = 0.053933446240336684
MAE using only 'MA(t-60)_sunspots' = 0.12551217565341594
MAE using only 'pos_season' = 0.17150194744273764
MAE using only 'neg_season' = 0.344110762582565
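
These scores set the baselines: the 5-month moving average alone reaches an MAE of about 0.029 on the scaled series, so the model below has to beat roughly 0.029 to justify its complexity.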

Create TensorFlow Model and Dataset

Create Windowed Dataset

# Keep only the 7 model inputs: the normalized target plus the engineered features
train_ds = tf.data.Dataset.from_tensor_slices(train_df[train_df.columns[-7:]])
val_ds = tf.data.Dataset.from_tensor_slices(val_df[val_df.columns[-7:]])


def windowed_ds(ds, window_size, pred_window=1, shift=1):
    # Slide a window of window_size input steps plus pred_window target steps
    ds = ds.window(window_size + pred_window, shift=shift, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + pred_window))
    # Inputs: all features of the first window_size steps;
    # target: column 0 (normalized_sunspots) of the final pred_window step(s)
    ds = ds.map(lambda w: (w[:-pred_window], w[-pred_window:][:, 0]))
    return ds

train_ds = windowed_ds(train_ds, window_size).repeat(10).shuffle(500, reshuffle_each_iteration=True).batch(batch_size)
val_ds = windowed_ds(val_ds, window_size).batch(batch_size).cache()
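
A quick shape check (an illustrative snippet, not part of the original post) confirms the pipeline: each batch should hold 60 steps of all 7 features and a one-step target.

for x_batch, y_batch in train_ds.take(1):
    print(x_batch.shape, y_batch.shape)  # expected: (256, 60, 7) (256, 1)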

Create the Model (CNN + RNN)

inputs = layers.Input(shape=(window_size, 7))
# A causal convolution extracts local patterns without leaking future values
x = layers.Conv1D(filters=32, kernel_size=10, padding='causal')(inputs)
x = layers.Conv1D(filters=64, kernel_size=2)(x)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.LSTM(128)(x)
x = layers.Dense(64, activation="relu")(x)
# training=True keeps dropout active at inference time (Monte Carlo dropout),
# which the Error Analysis section exploits to estimate uncertainty
x = layers.Dropout(0.7)(x, training=True)
x = layers.Dense(1)(x)
model = Model(inputs, x)
model.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 60, 7)          │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d (Conv1D)                 │ (None, 60, 32)         │         2,272 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d_1 (Conv1D)               │ (None, 59, 64)         │         4,160 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm (LSTM)                     │ (None, 59, 128)        │        98,816 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm_1 (LSTM)                   │ (None, 128)            │       131,584 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 64)             │         8,256 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 64)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 1)              │            65 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 245,153 (957.63 KB)
 Trainable params: 245,153 (957.63 KB)
 Non-trainable params: 0 (0.00 B)
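
The 0.7 dropout rate is unusually high for a point forecaster, but combined with training=True it makes every forward pass stochastic: the model trades a little accuracy for the ability to sample a predictive distribution in the Error Analysis section.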
model.compile(loss=losses.Huber(), optimizer='adam', metrics=['mae'])
early_stop = callbacks.EarlyStopping(monitor='val_mae', min_delta=1e-3, patience=30, mode='min', restore_best_weights=True)
checkpoint = callbacks.ModelCheckpoint('models/sunspots.model.keras', monitor='val_mae', save_best_only=True, mode='min')
history = model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[early_stop, checkpoint], verbose=0)
# Collect ground truth and point predictions over the validation windows
y_true = []
y_pred = []
for elem in val_ds:
    y_true.append(elem[1].numpy())
    y_pred.append(model.predict(elem[0], verbose=0))
y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=0)

print('scaled val_MAE:', mean_absolute_error(y_true, y_pred))
# Undo the MinMax scaling to report the error in sunspot counts
y_true = scaler.inverse_transform(y_true)
y_pred = scaler.inverse_transform(y_pred)
print('val_MAE:', mean_absolute_error(y_true, y_pred))
err = y_true - y_pred
scaled val_MAE: 0.026955439574745693

val_MAE: 10.733656077966456
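
The two figures are consistent: multiplying the scaled MAE by the scaler's range (≈ 398.2) gives 0.02696 × 398.2 ≈ 10.73 sunspots. Rescaling the best naïve baseline the same way (0.02898 × 398.2 ≈ 11.5) shows the model does beat it.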
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(211)
ax2 = f.add_subplot(212)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_true)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_pred)
ax1.grid(True)

sns.boxplot(x=np.squeeze(err), ax=ax2, orient='h')
ax1.legend(['y_true', 'y_pred'], prop={'size': 15})
ax2.set_title('total error distribution', fontsize=10)
ax1.xaxis.set_tick_params(labelsize=10)
ax1.yaxis.set_tick_params(labelsize=10)
ax2.xaxis.set_tick_params(labelsize=10)

Error Analysis

n_experiments = 100
y_true = []
y_pred = []
aux_ds = val_ds.unbatch()
for elem in aux_ds:
    y_true.append(elem[1].numpy())
    # Vectorized MC dropout: tile each input n_experiments times and run a
    # single forward pass; dropout stays active because the Dropout layer was
    # built with training=True
    tiled_inputs = tf.tile(tf.expand_dims(elem[0], axis=0), [n_experiments, 1, 1])
    preds = model(tiled_inputs)
    y_pred.append(preds)

y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=1)
# Median, interquartile (25-75%) and 5-95% bands across the MC samples
quantiles = np.quantile(y_pred, [0.5, 0.25, 0.75, 0.05, 0.95], axis=0)
quantiles = scaler.inverse_transform(quantiles)
quantiles.shape
(5, 205)
y_true_tr = scaler.inverse_transform(np.expand_dims(y_true, axis=-1))
y_pred_tr = np.expand_dims(quantiles[0], axis=-1)
err = (y_true_tr - y_pred_tr).squeeze()
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(211)
ax2 = f.add_subplot(212)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_true_tr)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), quantiles[0])
ax1.fill_between(np.arange(3000, 3000+len(y_true)), quantiles[1], quantiles[2], color='b', alpha=.2)
ax1.fill_between(np.arange(3000, 3000+len(y_true)), quantiles[3], quantiles[4], color='b', alpha=.1)
ax1.grid(True)
sns.boxplot(x=err, ax=ax2, orient='h')
ax1.legend(['y_true', 'y_pred'], prop={'size': 15})
ax2.set_title('total error distribution', fontsize=10)
ax1.xaxis.set_tick_params(labelsize=10)
ax1.yaxis.set_tick_params(labelsize=10)
ax2.xaxis.set_tick_params(labelsize=10)
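
A natural follow-up, sketched below as an illustration rather than part of the original post, is to measure how often the true values fall inside the 5-95% band; well-calibrated uncertainty would put this near 90%.

# Empirical coverage of the 5-95% MC-dropout band over the validation set
inside = np.mean((y_true_tr.squeeze() >= quantiles[3]) & (y_true_tr.squeeze() <= quantiles[4]))
print(f"empirical coverage of the 5-95% band: {inside:.1%}")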
