# Importing packages for handling data, calculating statistics, and building the neural net
import numpy as np
import pandas as pd
import pickle
import statsmodels.stats.api as sms
from sklearn.metrics import r2_score
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, LeakyReLU, PReLU
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
# Importing packages and defining helpers for styling output
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import IPython
%matplotlib inline
class Colors:
PRIMARY = '#ff1050'
SECONDARY = '#7B7C8D'
CMAP = LinearSegmentedColormap.from_list('secondary-primary', [SECONDARY, PRIMARY], N=256)
# Defining constants
class Columns:
RANK = "Overall rank"
COUNTRY = "Country or region"
SCORE = "Score"
GDP = "GDP per capita"
SUPPORT = "Social support"
LIFE = "Healthy life expectancy"
FREEDOM = "Freedom to make life choices"
GENEROSITY = "Generosity"
CORRUPTION = "Perceptions of corruption"
PREDICTION = "Predicted score"
IS_VALIDATION = "Is validation?"
ERROR = "Error"
SQUARED_ERROR = "Squared error"
ABS_ERROR = "Abs. error"
class Constants:
FILENAME = "world_happiness_report_2019.csv"
INDEX_COLUMN = Columns.COUNTRY
COLUMNS_TO_BE_SCALED = [Columns.SCORE, Columns.GDP, Columns.SUPPORT, Columns.LIFE, Columns.FREEDOM, Columns.GENEROSITY, Columns.CORRUPTION]
COLUMNS_AS_INPUTS = [Columns.GDP, Columns.SUPPORT, Columns.LIFE, Columns.FREEDOM, Columns.GENEROSITY, Columns.CORRUPTION]
FILENAME_WEIGHTS = "best_weights.hdf5"
FILENAME_TRAINING_HISTORY = "training_history.pickle"
PERCENT_N_OF_VALIDATION_DATA = 0.2
SEED = 30000
# Loading the dataset
df = pd.read_csv(Constants.FILENAME)
df = df.set_index(Constants.INDEX_COLUMN)
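# Optional quick look at the raw data (not part of the original pipeline):
# row/column counts and missing values per column.
print("Shape of dataset", df.shape)
print("Missing values per column:\n", df.isna().sum())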
# Scaling
df_scaled = df.copy()
class Scaling:
def __init__(self, minimum, maximum):
self.min, self.max = minimum, maximum
self.span = self.max - self.min
self.scale_array = np.vectorize(self.scale)
self.rescale_array = np.vectorize(self.rescale)
def scale(self, x):
return (x - self.min) / self.span
def rescale(self, y):
return (y * self.span) + self.min
def scale_series(self, series):
return series.apply(self.scale)
def rescale_series(self, series):
return series.apply(self.rescale)
@staticmethod
def create_from_series(series):
return Scaling(np.min(series), np.max(series))
df_scaling = {}
for column in Constants.COLUMNS_TO_BE_SCALED:
series = df[column]
df_scaling[column] = Scaling.create_from_series(series)
df_scaled[column] = df_scaling[column].scale_series(series)
df_scaled.describe()
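# Optional sanity check (not part of the original pipeline): rescaling the
# scaled columns should recover the original values.
for column in Constants.COLUMNS_TO_BE_SCALED:
    assert np.allclose(df_scaling[column].rescale_series(df_scaled[column]), df[column])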
# Converting the pandas DataFrame into NumPy matrices
all_inputs = np.array(df_scaled[Constants.COLUMNS_AS_INPUTS])
all_outputs = np.array(df_scaled[Columns.SCORE]).reshape(-1, 1)
print("Shape of input matrix", all_inputs.shape)
print("Shape of output matrix", all_outputs.shape)
# Splitting the data into training and validation sets
n_of_data = all_inputs.shape[0]
n_of_validation_data = int(Constants.PERCENT_N_OF_VALIDATION_DATA * n_of_data)
n_of_training_data = n_of_data - n_of_validation_data
np.random.seed(Constants.SEED)
random_indices = np.arange(n_of_data)
np.random.shuffle(random_indices)
training_indices = random_indices[:n_of_training_data]
validation_indices = random_indices[n_of_training_data:]
inputs, outputs = all_inputs[training_indices, :], all_outputs[training_indices, :]
validation_inputs, validation_outputs = all_inputs[validation_indices, :], all_outputs[validation_indices, :]
print("Shape of training matrix", inputs.shape, outputs.shape)
print("Shape of validation matirx", validation_inputs.shape, validation_outputs.shape)
# Building the model of the neural net
model = Sequential()
model.add(Dense(30, input_dim=inputs.shape[1], activation=None))
model.add(LeakyReLU(alpha=0.4))
model.add(Dense(60, activation=None))
model.add(LeakyReLU(alpha=0.2))
model.add(Dense(30, activation=None))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(outputs.shape[1], activation='sigmoid'))
rms = RMSprop(learning_rate=0.0003)
model.compile(loss='mse', optimizer=rms, metrics=['mse'])
model.summary()
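# For reference (a hand calculation, assuming the six input columns used above):
# each Dense layer has (inputs + 1) * units trainable parameters and the
# LeakyReLU layers add none, so model.summary() should report
#   6 -> 30:  (6 + 1) * 30  = 210
#   30 -> 60: (30 + 1) * 60 = 1860
#   60 -> 30: (60 + 1) * 30 = 1830
#   30 -> 1:  (30 + 1) * 1  = 31
#   total: 210 + 1860 + 1830 + 31 = 3931 parameters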
# Training
HAS_TO_TRAIN = True
DEFAULT_METRIC = 'mse'
VALIDATION_METRIC = 'val_mse'
EPOCHS = 300
BATCH_SIZE = 32
METRIC_FOR_SAVING_CHECKPOINT = VALIDATION_METRIC
class LossHistory(Callback):
N_OF_MOST_RECENT_DATA_TO_SHOW = 20
SHOW_PLOT_EVERY_N_EPOCH = 10
def __init__(self, metrics, metric_for_most_recent):
super().__init__()
self.metrics, self.metric_for_most_recent = metrics, metric_for_most_recent
self.n_metrics = len(metrics)
self.on_train_begin()
def save_to_file(self, filename):
with open(filename, "wb") as file:
pickle.dump(self.data, file)
def on_train_begin(self, logs={}):
self.data = { metric: [] for metric in self.metrics }
self.index = []
self.current_index = 0
def on_epoch_end(self, epoch, logs={}):
self.__update_data(logs)
self.__update_index()
if self.__should_show_plot():
self.__update_plot()
def __should_show_plot(self):
return self.current_index % LossHistory.SHOW_PLOT_EVERY_N_EPOCH == 0
def __update_data(self, logs):
for key in self.data.keys():
self.data[key].append(logs.get(key))
def __update_index(self):
self.index.append(self.current_index)
self.current_index += 1
def __update_plot(self):
fig, ax = plt.subplots(1, self.n_metrics + 1, figsize=(20, 8))
for index, metric in enumerate(self.data):
ax[index].scatter(self.index, self.data[metric], c=Colors.PRIMARY)
ax[index].set_yscale('log')
last_index, last_metric = self.__last_metric()
ax[-1].scatter(last_index, last_metric, c=Colors.PRIMARY)
self.__display_plot()
def __display_plot(self):
IPython.display.clear_output(wait=True)
plt.show()
def __last_metric(self):
n = LossHistory.N_OF_MOST_RECENT_DATA_TO_SHOW
return self.index[-n:], self.data[self.metric_for_most_recent][-n:]
if HAS_TO_TRAIN:
checkpoint = ModelCheckpoint(Constants.FILENAME_WEIGHTS, monitor=METRIC_FOR_SAVING_CHECKPOINT, verbose=0, save_best_only=True, mode='min')
lossHistory = LossHistory([DEFAULT_METRIC, VALIDATION_METRIC], VALIDATION_METRIC)
callbacks_list = [checkpoint, lossHistory]
history = model.fit(inputs, outputs, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0, callbacks=callbacks_list, validation_data=(validation_inputs, validation_outputs))
lossHistory.save_to_file(Constants.FILENAME_TRAINING_HISTORY)
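# When HAS_TO_TRAIN is False, the pickled history from an earlier run can be
# restored and plotted instead (a small sketch, assuming the file already exists):
if not HAS_TO_TRAIN:
    with open(Constants.FILENAME_TRAINING_HISTORY, "rb") as file:
        training_history = pickle.load(file)
    plt.plot(training_history[DEFAULT_METRIC], c=Colors.SECONDARY, label='training')
    plt.plot(training_history[VALIDATION_METRIC], c=Colors.PRIMARY, label='validation')
    plt.yscale('log')
    plt.xlabel('epoch')
    plt.ylabel('mse')
    plt.legend()
    plt.show()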
# Loading weights, making the prediction, and rescaling
model.load_weights(Constants.FILENAME_WEIGHTS)
scaled_predictions = model.predict(all_inputs)[:,0]
predictions = df_scaling[Columns.SCORE].rescale_array(scaled_predictions)
# Computing error metrics: error, squared error, and absolute error
scores = df[Columns.SCORE]
errors = scores - predictions
squared_errors = np.power(errors, 2)
abs_errors = abs(errors)
# Creating association with training or validation set
is_validation = np.zeros(len(scores), dtype='int')
is_validation[validation_indices] = 1
# Expanding the dataset
df_extended = df.copy()
df_extended[Columns.PREDICTION] = predictions
df_extended[Columns.ERROR] = errors
df_extended[Columns.SQUARED_ERROR] = squared_errors
df_extended[Columns.ABS_ERROR] = abs_errors
df_extended[Columns.IS_VALIDATION] = is_validation
df_extended.head()
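# Optional (not part of the original write-up): a look at the countries with the
# largest absolute prediction error across the whole dataset.
display(df_extended.sort_values(Columns.ABS_ERROR, ascending=False).head())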
# Grouping into training and validation sets and displaying the largest squared errors in the validation set
df_by_group = df_extended.groupby(by=Columns.IS_VALIDATION)
training_set, validation_set = df_by_group.get_group(0), df_by_group.get_group(1)
display( validation_set[[Columns.SCORE, Columns.PREDICTION, Columns.SQUARED_ERROR]].sort_values(Columns.SQUARED_ERROR, ascending=False).head() )
# Computing mean squared error, 95% CI, R squared
mse = validation_set[Columns.SQUARED_ERROR].mean()
ci_lower, ci_upper = sms.DescrStatsW(validation_set[Columns.SQUARED_ERROR]).tconfint_mean()
r_squared = r2_score(validation_set[Columns.SCORE], validation_set[Columns.PREDICTION])
print("## Validation Set ##\n")
print("MSE = {:.2f}".format(mse))
print("95% CI of MSE [{:.2f}, {:.2f}]".format(ci_lower, ci_upper))
print("R^2 = {:.2f}".format(r_squared))
# Plotting actual vs. predicted scores, per-country squared errors, and box plots of the squared errors for the training and validation sets
fig, sp = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
left_ax, middle_ax, right_ax = sp
left_ax.scatter(scores, predictions, c=is_validation, cmap=Colors.CMAP)
left_ax.axis('square')
left_ax.plot([0, 10], [0, 10], color=Colors.PRIMARY)
left_ax.set_ylim(left_ax.get_xlim())
left_ax.set_xlabel('actual score')
left_ax.set_ylabel('predicted score')
middle_ax.scatter(np.arange(len(squared_errors)), squared_errors, c=is_validation, cmap=Colors.CMAP)
middle_ax.set_xlabel('country')
middle_ax.set_ylabel('squared error')
right_ax.boxplot([training_set[Columns.SQUARED_ERROR], validation_set[Columns.SQUARED_ERROR]], labels=['training', 'validation'])
right_ax.set_xlabel('group')
right_ax.set_ylabel('squared error')
plt.show()