In this post, we will explore how to perform multivariate regression with TensorFlow to predict sunspot activity. We'll use a comprehensive approach that includes feature engineering, time series analysis, and deep learning to forecast sunspot counts.
Configuration
window_size = 60
batch_size = 256
Imports
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import warnings
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    module=r'^keras(\.|$)'
)
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error
from tensorflow.keras import layers, Model, callbacks, losses
from datetime import datetime
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
Download the Dataset
# Load the sunspots dataset from Kaggle
df = kagglehub.dataset_load(
KaggleDatasetAdapter.PANDAS,
"robervalt/sunspots",
"Sunspots.csv",
)
df.head()
| | Unnamed: 0 | Date | Monthly Mean Total Sunspot Number |
|---|---|---|---|
| 0 | 0 | 1749-01-31 | 96.7 |
| 1 | 1 | 1749-02-28 | 104.3 |
| 2 | 2 | 1749-03-31 | 116.7 |
| 3 | 3 | 1749-04-30 | 92.8 |
| 4 | 4 | 1749-05-31 | 141.7 |
Data Preprocessing & Scaling
df = df.drop(df.columns[0], axis=1)
df = df.sort_values('Date')
df = df.rename(columns={df.columns[0]: 'date', df.columns[1]: 'sunspots'})
scaler = MinMaxScaler()
df['normalized_sunspots'] = scaler.fit_transform(np.expand_dims(df['sunspots'], axis=-1))
df
| | date | sunspots | normalized_sunspots |
|---|---|---|---|
| 0 | 1749-01-31 | 96.7 | 0.242843 |
| 1 | 1749-02-28 | 104.3 | 0.261929 |
| 2 | 1749-03-31 | 116.7 | 0.293069 |
| 3 | 1749-04-30 | 92.8 | 0.233049 |
| 4 | 1749-05-31 | 141.7 | 0.355851 |
| ... | ... | ... | ... |
| 3260 | 2020-09-30 | 0.6 | 0.001507 |
| 3261 | 2020-10-31 | 14.4 | 0.036163 |
| 3262 | 2020-11-30 | 34.0 | 0.085384 |
| 3263 | 2020-12-31 | 21.8 | 0.054746 |
| 3264 | 2021-01-31 | 10.4 | 0.026118 |
3265 rows × 3 columns
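As a quick sanity check (not part of the original post), MinMaxScaler should invert cleanly back to the raw counts:
# Illustrative round-trip check: inverse_transform recovers the raw series
assert np.allclose(
    scaler.inverse_transform(df[['normalized_sunspots']]).ravel(),
    df['sunspots'].values)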
Dataset Exploration
graphs = df.plot(grid=True, figsize=(14,7), fontsize=10, subplots=True)
[x.legend(prop={'size': 15}) for x in graphs];
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)
plot_acf(df['normalized_sunspots'], ax=ax1, title='Autocorrelation Function')
plot_pacf(df['normalized_sunspots'], ax=ax2, title='Partial Autocorrelation Function')
ax1.grid(True)
ax2.grid(True)
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
plt.tight_layout()
plt.show()
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(121)
ax2 = f.add_subplot(122)
plot_acf(df['normalized_sunspots'], ax=ax1, lags=160, title='ACF with Extended Lags')
plot_pacf(df['normalized_sunspots'], ax=ax2, lags=160, title='PACF with Extended Lags')
ax1.grid(True)
ax2.grid(True)
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
plt.tight_layout()
plt.show()
# Search for the lag with the strongest positive autocorrelation (around a
# full ~11-year solar cycle) and the strongest negative one (around half a cycle)
lag_pos_corr = np.argmax([df['normalized_sunspots'].autocorr(lag=x) for x in range(110, 130, 1)])
lag_pos_corr = list(range(110, 130, 1))[lag_pos_corr]
lag_neg_corr = np.argmin([df['normalized_sunspots'].autocorr(lag=x) for x in range(55, 65, 1)])
lag_neg_corr = list(range(55, 65, 1))[lag_neg_corr]
print(f"Seasonal max correlation: {lag_pos_corr}")
print(f"Seasonal min correlation: {lag_neg_corr}")Seasonal max correlation: 128
Seasonal min correlation: 63
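These lags line up with the well-known ~11-year solar cycle; converting months to years makes that explicit (an illustrative check, not in the original):
# The positive peak sits near a full cycle, the negative trough near half a cycle
print(f"max-corr lag ≈ {lag_pos_corr/12:.1f} years, min-corr lag ≈ {lag_neg_corr/12:.1f} years")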
Feature Engineering
Create Moving Averages
# Create moving averages over several trailing windows (the window argument
# counts months, i.e. the current month plus the preceding ones)
df['MA(t-4)_sunspots'] = df['normalized_sunspots'].rolling(window=5).mean()
df['MA(t-10)_sunspots'] = df['normalized_sunspots'].rolling(window=11).mean()
df['MA(t-20)_sunspots'] = df['normalized_sunspots'].rolling(window=21).mean()
df['MA(t-30)_sunspots'] = df['normalized_sunspots'].rolling(window=61).mean()  # note: a 61-month window despite the name
df.tail()
| | date | sunspots | normalized_sunspots | MA(t-4)_sunspots | MA(t-10)_sunspots | MA(t-20)_sunspots | MA(t-30)_sunspots |
|---|---|---|---|---|---|---|---|
| 3260 | 2020-09-30 | 0.6 | 0.001507 | 0.010146 | 0.008059 | 0.009124 | 0.047859 |
| 3261 | 2020-10-31 | 14.4 | 0.036163 | 0.017278 | 0.011232 | 0.009926 | 0.045216 |
| 3262 | 2020-11-30 | 34.0 | 0.085384 | 0.031441 | 0.018652 | 0.013896 | 0.043997 |
| 3263 | 2020-12-31 | 21.8 | 0.054746 | 0.039327 | 0.022214 | 0.015379 | 0.042334 |
| 3264 | 2021-01-31 | 10.4 | 0.026118 | 0.040784 | 0.024542 | 0.015534 | 0.040374 |
Remove Seasonality and Create New Features
# Create seasonal features by differencing the series at the lags found above:
# feature[t] = x[t - lag] - x[t], NaN-padded at the start
pos_season = df[0:len(df)-lag_pos_corr]['normalized_sunspots'].values - df[lag_pos_corr:]['normalized_sunspots'].values
df['pos_season'] = np.concatenate([np.array([np.nan]*(len(df)-len(pos_season))), pos_season])
neg_season = df[0:len(df)-lag_neg_corr]['normalized_sunspots'].values - df[lag_neg_corr:]['normalized_sunspots'].values
df['neg_season'] = np.concatenate([np.array([np.nan]*(len(df)-len(neg_season))), neg_season])
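If the index arithmetic looks dense, the same features can be written with pandas shift; this equivalent formulation (an illustrative sketch, not in the original) may be easier to read:
# Equivalent construction via shift: feature[t] = x[t - lag] - x[t]
assert np.allclose(
    df['pos_season'].dropna(),
    (df['normalized_sunspots'].shift(lag_pos_corr) - df['normalized_sunspots']).dropna())
Split Train/Validation Dataset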
train_df = df[0:3000]
val_df = df[3000:]
# Drop NaN values
train_df = train_df.dropna()
val_df = val_df.dropna()
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
train_df.head()
Training set size: 2872
Validation set size: 265
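The training split shrinks from 3000 to 2872 rows because the widest lag feature, pos_season (lag 128), is NaN for the first 128 rows, so dropna trims exactly that many from the head. A quick illustrative check (not in the original):
# NaN counts per engineered feature before dropna; the maximum (128) is
# what dropna removes from the start of the training split
print(df[df.columns[-6:]].isna().sum())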
| | date | sunspots | normalized_sunspots | MA(t-4)_sunspots | MA(t-10)_sunspots | MA(t-20)_sunspots | MA(t-30)_sunspots | pos_season | neg_season |
|---|---|---|---|---|---|---|---|---|---|
| 128 | 1759-09-30 | 128.7 | 0.323204 | 0.249874 | 0.217113 | 0.210435 | 0.118817 | -0.080362 | -0.211452 |
| 129 | 1759-10-31 | 99.5 | 0.249874 | 0.258815 | 0.223460 | 0.214836 | 0.122350 | 0.012054 | -0.171271 |
| 130 | 1759-11-30 | 77.2 | 0.193872 | 0.255751 | 0.224716 | 0.213700 | 0.123873 | 0.099196 | -0.142391 |
| 131 | 1759-12-31 | 95.0 | 0.238574 | 0.260773 | 0.228026 | 0.215290 | 0.126878 | -0.005525 | -0.204169 |
| 132 | 1760-01-31 | 112.2 | 0.281768 | 0.257459 | 0.236907 | 0.214298 | 0.131209 | 0.074083 | -0.180814 |
Check Multivariate Features
val_df[val_df.columns[-7:]].plot(figsize=(14,7), fontsize=10, grid=True).legend(prop={'size': 15});
Naïve Predictions Using Only the Created Features
for col in val_df.columns[-6:]:
    print(f"MAE using only '{col}' = {mean_absolute_error(val_df['normalized_sunspots'], val_df[col])}")
MAE using only 'MA(t-4)_sunspots' = 0.028975484017702304
MAE using only 'MA(t-10)_sunspots' = 0.03701519617007236
MAE using only 'MA(t-20)_sunspots' = 0.053933446240336684
MAE using only 'MA(t-30)_sunspots' = 0.12551217565341594
MAE using only 'pos_season' = 0.17150194744273764
MAE using only 'neg_season' = 0.344110762582565
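The short moving average is a strong baseline: any model worth training should beat its 0.029 MAE. Another classic naïve forecast worth comparing against (an illustrative addition, not in the original) is persistence, which simply repeats the previous value:
# Persistence baseline: predict y[t] with y[t-1]
persistence_mae = mean_absolute_error(
    val_df['normalized_sunspots'].values[1:],
    val_df['normalized_sunspots'].values[:-1])
print(f"MAE using persistence = {persistence_mae}")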
Create TensorFlow Model and Dataset
Create Windowed Dataset
train_ds = tf.data.Dataset.from_tensor_slices(train_df[train_df.columns[-7:]])
val_ds = tf.data.Dataset.from_tensor_slices(val_df[val_df.columns[-7:]])
def windowed_ds(ds, window_size, pred_window=1, shift=1):
    # Slice the series into overlapping windows of window_size input steps
    # followed by pred_window target steps
    ds = ds.window(window_size + pred_window, shift=shift, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + pred_window))
    # Inputs: all features; target: the first column (normalized_sunspots)
    ds = ds.map(lambda w: (w[:-pred_window], w[-pred_window:][:, 0]))
    return ds
train_ds = windowed_ds(train_ds, window_size).repeat(10).shuffle(500, reshuffle_each_iteration=True).batch(batch_size)
val_ds = windowed_ds(val_ds, window_size).batch(batch_size).cache()
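To make the window/target split concrete, here is a toy check (not from the original post) on a tiny two-feature series:
# Illustrative: 8 steps of 2 features with window_size=3 yields inputs of
# shape (3, 2) and a 1-step target taken from the first feature
demo = tf.data.Dataset.from_tensor_slices(np.arange(16, dtype=np.float32).reshape(8, 2))
for x, y in windowed_ds(demo, window_size=3):
    print(x.shape, y.numpy())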
Create the Model (CNN + RNN)
inputs = layers.Input(shape=(window_size, 7))
# Causal convolution so each time step only sees past values
x = layers.Conv1D(filters=32, kernel_size=10, padding='causal')(inputs)
x = layers.Conv1D(filters=64, kernel_size=2)(x)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.LSTM(128)(x)
x = layers.Dense(64, activation="relu")(x)
# training=True keeps dropout active at inference time, enabling the
# Monte Carlo dropout uncertainty estimates in the error analysis below
x = layers.Dropout(0.7)(x, training=True)
x = layers.Dense(1)(x)
model = Model(inputs, x)
model.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 60, 7)          │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d (Conv1D)                 │ (None, 60, 32)         │         2,272 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d_1 (Conv1D)               │ (None, 59, 64)         │         4,160 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm (LSTM)                     │ (None, 59, 128)        │        98,816 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ lstm_1 (LSTM)                   │ (None, 128)            │       131,584 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 64)             │         8,256 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 64)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 1)              │            65 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 245,153 (957.63 KB)
Trainable params: 245,153 (957.63 KB)
Non-trainable params: 0 (0.00 B)
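As a quick sanity check (not in the original), a forward pass on random data should produce one prediction per window:
# Illustrative shape check: a batch of 2 random windows -> 2 scalar predictions
dummy = tf.random.normal((2, window_size, 7))
print(model(dummy).shape)  # (2, 1)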
model.compile(loss=losses.Huber(), optimizer='adam', metrics=['mae'])
early_stop = callbacks.EarlyStopping(monitor='val_mae', min_delta=1e-3, patience=30, mode='min', restore_best_weights=True)
checkpoint = callbacks.ModelCheckpoint('models/sunspots.model.keras', monitor='mae', save_best_only=True, mode='min')
history = model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[early_stop, checkpoint], verbose=0)
y_true = []
y_pred = []
for elem in val_ds:
    y_true.append(elem[1].numpy())
    y_pred.append(model.predict(elem[0]))
y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=0)
print('scaled val_MAE:', mean_absolute_error(y_true, y_pred))
y_true = scaler.inverse_transform(y_true)
y_pred = scaler.inverse_transform(y_pred)
print('val_MAE:', mean_absolute_error(y_true, y_pred))
err = y_true - y_pred
scaled val_MAE: 0.026955439574745693
val_MAE: 10.733656077966456
The two numbers differ only by the MinMaxScaler's data range: scaled_MAE × scaler.data_range_ gives the MAE in raw sunspot counts.
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(211)
ax2 = f.add_subplot(212)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_true)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_pred)
ax1.grid(True)
sns.boxplot(x=np.squeeze(err), ax=ax2, orient='h')
ax1.legend(['y_true', 'y_pred'], prop={'size': 15})
ax2.set_title('total error distribution', fontsize=10)
ax1.xaxis.set_tick_params(labelsize=10)
ax1.yaxis.set_tick_params(labelsize=10)
ax2.xaxis.set_tick_params(labelsize=10)
Error Analysis
n_experiments = 100
y_true = []
y_pred = []
aux_ds = val_ds.unbatch()
for elem in aux_ds:
    y_true.append(elem[1].numpy())
    # Vectorized MC dropout: tile the input n_experiments times and run one
    # forward pass; dropout stays active because the Dropout layer was wired
    # with training=True when the model was built
    tiled_inputs = tf.tile(tf.expand_dims(elem[0], axis=0), [n_experiments, 1, 1])
    preds = model(tiled_inputs)
    y_pred.append(preds)
y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=1)
quantiles = np.quantile(y_pred, [0.5, 0.25, 0.75, 0.05, 0.95], axis=0)
quantiles = scaler.inverse_transform(quantiles)
quantiles.shape
(5, 205)
Five quantile levels for each of the 205 validation windows (265 rows minus the 60-step input window).
y_true_tr = scaler.inverse_transform(np.expand_dims(y_true, axis=-1))
y_pred_tr = np.expand_dims(quantiles[0], axis=-1)
err = (y_true_tr - y_pred_tr).squeeze()
f = plt.figure(figsize=(14,7))
ax1 = f.add_subplot(211)
ax2 = f.add_subplot(212)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), y_true_tr)
ax1.plot(np.arange(3000, 3000+len(y_true), 1), quantiles[0])
ax1.fill_between(np.arange(3000, 3000+len(y_true)), quantiles[1], quantiles[2], color='b', alpha=.2)
ax1.fill_between(np.arange(3000, 3000+len(y_true)), quantiles[3], quantiles[4], color='b', alpha=.1)
ax1.grid(True)
sns.boxplot(x=err, ax=ax2, orient='h')
ax1.legend(['y_true', 'y_pred'], prop={'size': 15})
ax2.set_title('total error distribution', fontsize=10)
ax1.xaxis.set_tick_params(labelsize=10)
ax1.yaxis.set_tick_params(labelsize=10)
ax2.xaxis.set_tick_params(labelsize=10)
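A natural follow-up (an illustrative addition, not in the original) is to check how often the truth actually falls inside the 5–95% Monte Carlo band:
# Empirical coverage of the 5-95% band; well-calibrated uncertainty
# should land near 90%
inside = (y_true_tr.squeeze() >= quantiles[3]) & (y_true_tr.squeeze() <= quantiles[4])
print(f"empirical coverage of the 5-95% band: {inside.mean():.2%}")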