In this post, we will explore how to perform multivariate regression with TensorFlow to predict sunspot activity. We'll use a comprehensive approach that includes feature engineering, time series analysis, and deep learning to forecast sunspot counts.
Configuration
window_size = 60
batch_size = 256
Imports
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import warnings
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    module=r'^keras(\.|$)'
)
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error
from tensorflow.keras import layers, Model, callbacks, losses
from datetime import datetime
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
Download the Dataset
# Load the sunspots dataset from Kaggle
df = kagglehub.dataset_load(
KaggleDatasetAdapter.PANDAS,
"robervalt/sunspots",
"Sunspots.csv",
)
df.head()
| | Unnamed: 0 | Date | Monthly Mean Total Sunspot Number |
|---|---|---|---|
| 0 | 0 | 1749-01-31 | 96.7 |
| 1 | 1 | 1749-02-28 | 104.3 |
| 2 | 2 | 1749-03-31 | 116.7 |
| 3 | 3 | 1749-04-30 | 92.8 |
| 4 | 4 | 1749-05-31 | 141.7 |
Data Preprocessing & Scaling
df = df.drop(df.columns[0], axis=1)
df = df.sort_values('Date')
df = df.rename(columns={df.columns[0]: 'date', df.columns[1]: 'sunspots'})
scaler = MinMaxScaler()
df['normalized_sunspots'] = scaler.fit_transform(np.expand_dims(df['sunspots'], axis=-1))
df
| | date | sunspots | normalized_sunspots |
|---|---|---|---|
| 0 | 1749-01-31 | 96.7 | 0.242843 |
| 1 | 1749-02-28 | 104.3 | 0.261929 |
| 2 | 1749-03-31 | 116.7 | 0.293069 |
| 3 | 1749-04-30 | 92.8 | 0.233049 |
| 4 | 1749-05-31 | 141.7 | 0.355851 |
| ... | ... | ... | ... |
| 3260 | 2020-09-30 | 0.6 | 0.001507 |
| 3261 | 2020-10-31 | 14.4 | 0.036163 |
| 3262 | 2020-11-30 | 34.0 | 0.085384 |
| 3263 | 2020-12-31 | 21.8 | 0.054746 |
| 3264 | 2021-01-31 | 10.4 | 0.026118 |
3265 rows × 3 columns
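As a quick sanity check (not part of the original post), MinMaxScaler should invert cleanly back to the raw counts:
# Illustrative round-trip check: inverse_transform recovers the raw series
assert np.allclose(
    scaler.inverse_transform(df[['normalized_sunspots']]).ravel(),
    df['sunspots'].values)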
Dataset Exploration
graphs = df.plot(grid=True, figsize=(14,7), fontsize=10, subplots=True)
[x.legend(prop={'size': 15}) for x in graphs];
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)
plot_acf(df['normalized_sunspots'], ax=ax1, title='Autocorrelation Function')
plot_pacf(df['normalized_sunspots'], ax=ax2, title='Partial Autocorrelation Function')
ax1.grid(True)
ax2.grid(True)
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
plt.tight_layout()
plt.show()
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)
plot_acf(df['normalized_sunspots'], ax=ax1, lags=160, title='ACF with Extended Lags')
plot_pacf(df['normalized_sunspots'], ax=ax2, lags=160, title='PACF with Extended Lags')
ax1.grid(True)
ax2.grid(True)
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
plt.tight_layout()
plt.show()
# Search for the lag with the strongest positive autocorrelation (around a
# full ~11-year solar cycle) and the strongest negative one (around half a cycle)
lag_pos_corr = np.argmax([df['normalized_sunspots'].autocorr(lag=x) for x in range(110, 130, 1)])
lag_pos_corr = list(range(110, 130, 1))[lag_pos_corr]
lag_neg_corr = np.argmin([df['normalized_sunspots'].autocorr(lag=x) for x in range(55, 65, 1)])
lag_neg_corr = list(range(55, 65, 1))[lag_neg_corr]
print(f"Seasonal max correlation: {lag_pos_corr}")
print(f"Seasonal min correlation: {lag_neg_corr}")Seasonal max correlation: 128
Seasonal min correlation: 63
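These lags line up with the well-known ~11-year solar cycle; converting months to years makes that explicit (an illustrative check, not in the original):
# The positive peak sits near a full cycle, the negative trough near half a cycle
print(f"max-corr lag ≈ {lag_pos_corr/12:.1f} years, min-corr lag ≈ {lag_neg_corr/12:.1f} years")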
Feature Engineering
Create Moving Averages
# Create moving averages over several trailing windows (the window argument
# counts months, i.e. the current month plus the preceding ones)
df['MA(t-4)_sunspots'] = df['normalized_sunspots'].rolling(window=5).mean()
df['MA(t-10)_sunspots'] = df['normalized_sunspots'].rolling(window=11).mean()
df['MA(t-20)_sunspots'] = df['normalized_sunspots'].rolling(window=21).mean()
df['MA(t-30)_sunspots'] = df['normalized_sunspots'].rolling(window=61).mean()  # note: a 61-month window despite the name
df.tail()
| | date | sunspots | normalized_sunspots | MA(t-4)_sunspots | MA(t-10)_sunspots | MA(t-20)_sunspots | MA(t-30)_sunspots |
|---|---|---|---|---|---|---|---|
| 3260 | 2020-09-30 | 0.6 | 0.001507 | 0.010146 | 0.008059 | 0.009124 | 0.047859 |
| 3261 | 2020-10-31 | 14.4 | 0.036163 | 0.017278 | 0.011232 | 0.009926 | 0.045216 |
| 3262 | 2020-11-30 | 34.0 | 0.085384 | 0.031441 | 0.018652 | 0.013896 | 0.043997 |
| 3263 | 2020-12-31 | 21.8 | 0.054746 | 0.039327 | 0.022214 | 0.015379 | 0.042334 |
| 3264 | 2021-01-31 | 10.4 | 0.026118 | 0.040784 | 0.024542 | 0.015534 | 0.040374 |
Remove Seasonality and Create New Features
# Create seasonal features by differencing the series at the lags found above:
# feature[t] = x[t - lag] - x[t], NaN-padded at the start
pos_season = df[0:len(df)-lag_pos_corr]['normalized_sunspots'].values - df[lag_pos_corr:]['normalized_sunspots'].values
df['pos_season'] = np.concatenate([np.array([np.nan]*(len(df)-len(pos_season))), pos_season])
neg_season = df[0:len(df)-lag_neg_corr]['normalized_sunspots'].values - df[lag_neg_corr:]['normalized_sunspots'].values
df['neg_season'] = np.concatenate([np.array([np.nan]*(len(df)-len(neg_season))), neg_season])
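If the index arithmetic looks dense, the same features can be written with pandas shift; this equivalent formulation (an illustrative sketch, not in the original) may be easier to read:
# Equivalent construction via shift: feature[t] = x[t - lag] - x[t]
assert np.allclose(
    df['pos_season'].dropna(),
    (df['normalized_sunspots'].shift(lag_pos_corr) - df['normalized_sunspots']).dropna())
Split Train/Validation Dataset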
train_df = df[0:3000]
val_df = df[3000:]
# Drop NaN values
train_df = train_df.dropna()
val_df = val_df.dropna()
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
train_df.head()
Training set size: 2872
Validation set size: 265
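The training split shrinks from 3000 to 2872 rows because the widest lag feature, pos_season (lag 128), is NaN for the first 128 rows, so dropna trims exactly that many from the head. A quick illustrative check (not in the original):
# NaN counts per engineered feature before dropna; the maximum (128) is
# what dropna removes from the start of the training split
print(df[df.columns[-6:]].isna().sum())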
| | date | sunspots | normalized_sunspots | MA(t-4)_sunspots | MA(t-10)_sunspots | MA(t-20)_sunspots | MA(t-30)_sunspots | pos_season | neg_season |
|---|---|---|---|---|---|---|---|---|---|
| 128 | 1759-09-30 | 128.7 | 0.323204 | 0.249874 | 0.217113 | 0.210435 | 0.118817 | -0.080362 | -0.211452 |
| 129 | 1759-10-31 | 99.5 | 0.249874 | 0.258815 | 0.223460 | 0.214836 | 0.122350 | 0.012054 | -0.171271 |
| 130 | 1759-11-30 | 77.2 | 0.193872 | 0.255751 | 0.224716 | 0.213700 | 0.123873 | 0.099196 | -0.142391 |
| 131 | 1759-12-31 | 95.0 | 0.238574 | 0.260773 | 0.228026 | 0.215290 | 0.126878 | -0.005525 | -0.204169 |
| 132 | 1760-01-31 | 112.2 | 0.281768 | 0.257459 | 0.236907 | 0.214298 | 0.131209 | 0.074083 | -0.180814 |
Check Multivariate Features
val_df[val_df.columns[-7:]].plot(figsize=(14,7), fontsize=10, grid=True).legend(prop={'size': 15});
Naïve Predictions Using Only the Created Features
for col in val_df.columns[-6:]:
    print(f"MAE using only '{col}' = {mean_absolute_error(val_df['normalized_sunspots'], val_df[col])}")
MAE using only 'MA(t-4)_sunspots' = 0.028975484017702304
MAE using only 'MA(t-10)_sunspots' = 0.03701519617007236
MAE using only 'MA(t-20)_sunspots' = 0.053933446240336684
MAE using only 'MA(t-30)_sunspots' = 0.12551217565341594
MAE using only 'pos_season' = 0.17150194744273764
MAE using only 'neg_season' = 0.344110762582565
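The short moving average is a strong baseline: any model worth training should beat its 0.029 MAE. Another classic naïve forecast worth comparing against (an illustrative addition, not in the original) is persistence, which simply repeats the previous value:
# Persistence baseline: predict y[t] with y[t-1]
persistence_mae = mean_absolute_error(
    val_df['normalized_sunspots'].values[1:],
    val_df['normalized_sunspots'].values[:-1])
print(f"MAE using persistence = {persistence_mae}")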
Create TensorFlow Model and Dataset
Create Windowed Dataset
train_ds = tf.data.Dataset.from_tensor_slices(train_df[train_df.columns[-7:]])
val_ds = tf.data.Dataset.from_tensor_slices(val_df[val_df.columns[-7:]])
def windowed_ds(ds, window_size, pred_window=1, shift=1):
    # Slice the series into overlapping windows of window_size input steps
    # followed by pred_window target steps
    ds = ds.window(window_size + pred_window, shift=shift, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + pred_window))
    # Inputs: all features; target: the first column (normalized_sunspots)
    ds = ds.map(lambda w: (w[:-pred_window], w[-pred_window:][:, 0]))
    return ds
train_ds = windowed_ds(train_ds, window_size).repeat(10).shuffle(500, reshuffle_each_iteration=True).batch(batch_size)
val_ds = windowed_ds(val_ds, window_size).batch(batch_size).cache()
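To make the window/target split concrete, here is a toy check (not from the original post) on a tiny two-feature series:
# Illustrative: 8 steps of 2 features with window_size=3 yields inputs of
# shape (3, 2) and a 1-step target taken from the first feature
demo = tf.data.Dataset.from_tensor_slices(np.arange(16, dtype=np.float32).reshape(8, 2))
for x, y in windowed_ds(demo, window_size=3):
    print(x.shape, y.numpy())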
Create the Model (CNN + RNN)
inputs = layers.Input(shape=(window_size, 7))
# Causal convolution so each time step only sees past values
x = layers.Conv1D(filters=32, kernel_size=10, padding='causal')(inputs)
x = layers.Conv1D(filters=64, kernel_size=2)(x)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.LSTM(128)(x)
x = layers.Dense(64, activation="relu")(x)
# training=True keeps dropout active at inference time, enabling the
# Monte Carlo dropout uncertainty estimates in the error analysis below
x = layers.Dropout(0.7)(x, training=True)
x = layers.Dense(1)(x)
model = Model(inputs, x)
model.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 60, 7)          │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d (Conv1D)                 │ (None, 60, 32)         │         2,272 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d_1 (Conv1D)               │ (None, 59, 64)         │         4,160 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm (LSTM)                     │ (None, 59, 128)        │        98,816 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm_1 (LSTM)                   │ (None, 128)            │       131,584 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 64)             │         8,256 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 64)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 1)              │            65 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 245,153 (957.63 KB)
Trainable params: 245,153 (957.63 KB)
Non-trainable params: 0 (0.00 B)
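As a quick sanity check (not in the original), a forward pass on random data should produce one prediction per window:
# Illustrative shape check: a batch of 2 random windows -> 2 scalar predictions
dummy = tf.random.normal((2, window_size, 7))
print(model(dummy).shape)  # (2, 1)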
model.compile(loss=losses.Huber(), optimizer='adam', metrics=['mae'])
early_stop = callbacks.EarlyStopping(monitor='val_mae', min_delta=1e-3, patience=30, mode='min', restore_best_weights=True)
checkpoint = callbacks.ModelCheckpoint('models/sunspots.model.keras', monitor='mae', save_best_only=True, mode='min')
history = model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[early_stop, checkpoint], verbose=0)
y_true = []
y_pred = []
for elem in val_ds:
    y_true.append(elem[1].numpy())
    y_pred.append(model.predict(elem[0]))
y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=0)
print('scaled val_MAE:', mean_absolute_error(y_true, y_pred))
y_true = scaler.inverse_transform(y_true)
y_pred = scaler.inverse_transform(y_pred)
print('val_MAE:', mean_absolute_error(y_true, y_pred))
err = y_true - y_pred
scaled val_MAE: 0.026955439574745693
val_MAE: 10.733656077966456
The two numbers differ only by the MinMaxScaler's data range: scaled_MAE × scaler.data_range_ gives the MAE in raw sunspot counts.
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(211)
ax2 = f.add_subplot(212)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_true)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_pred)
ax1.grid(True)
sns.boxplot(x=np.squeeze(err), ax=ax2, orient='h')
ax1.legend(['y_true', 'y_pred'], prop={'size': 15})
ax2.set_title('total error distribution', fontsize=10)
ax1.xaxis.set_tick_params(labelsize=10)
ax1.yaxis.set_tick_params(labelsize=10)
ax2.xaxis.set_tick_params(labelsize=10)
Error Analysis
n_experiments = 100
y_true = []
y_pred = []
aux_ds = val_ds.unbatch()
for elem in aux_ds:
    y_true.append(elem[1].numpy())
    # Vectorized MC dropout: tile the input n_experiments times and run one
    # forward pass; dropout stays active because the Dropout layer was wired
    # with training=True when the model was built
    tiled_inputs = tf.tile(tf.expand_dims(elem[0], axis=0), [n_experiments, 1, 1])
    preds = model(tiled_inputs)
    y_pred.append(preds)
y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=1)
quantiles = np.quantile(y_pred, [0.5, 0.25, 0.75, 0.05, 0.95], axis=0)
quantiles = scaler.inverse_transform(quantiles)
quantiles.shape
(5, 205)
Five quantile levels for each of the 205 validation windows (265 rows minus the 60-step input window).
y_true_tr = scaler.inverse_transform(np.expand_dims(y_true, axis=-1))
y_pred_tr = np.expand_dims(quantiles[0], axis=-1)
err = (y_true_tr - y_pred_tr).squeeze()
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(211)
ax2 = f.add_subplot(212)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_true_tr)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), quantiles[0])
ax1.fill_between(np.arange(3000, 3000+len(y_true)), quantiles[1], quantiles[2], color='b', alpha=.2)
ax1.fill_between(np.arange(3000, 3000+len(y_true)), quantiles[3], quantiles[4], color='b', alpha=.1)
ax1.grid(True)
sns.boxplot(x=err, ax=ax2, orient='h')
ax1.legend(['y_true', 'y_pred'], prop={'size': 15})
ax2.set_title('total error distribution', fontsize=10)
ax1.xaxis.set_tick_params(labelsize=10)
ax1.yaxis.set_tick_params(labelsize=10)
ax2.xaxis.set_tick_params(labelsize=10)
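A natural follow-up (an illustrative addition, not in the original) is to check how often the truth actually falls inside the 5–95% Monte Carlo band:
# Empirical coverage of the 5-95% band; well-calibrated uncertainty
# should land near 90%
inside = (y_true_tr.squeeze() >= quantiles[3]) & (y_true_tr.squeeze() <= quantiles[4])
print(f"empirical coverage of the 5-95% band: {inside.mean():.2%}")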