In [1]:
# Importing packages for handling data, calculating statistics and modelling the neural net

import numpy as np
import pandas as pd
import pickle

import statsmodels.stats.api as sms
from sklearn.metrics import r2_score

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, LeakyReLU, PReLU
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
In [2]:
# Importing packages and defining helpers for styling output

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import IPython

%matplotlib inline

class Colors:
    PRIMARY = '#ff1050'
    SECONDARY = '#7B7C8D'
    CMAP = LinearSegmentedColormap.from_list('secondary-primary', [SECONDARY, PRIMARY], N=256)
In [3]:
# Defining constants

class Columns:
    RANK = "Overall rank"
    COUNTRY = "Country or region"
    SCORE = "Score"
    GDP = "GDP per capita"
    SUPPORT = "Social support"
    LIFE = "Healthy life expectancy"
    FREEDOM = "Freedom to make life choices"
    GENEROSITY = "Generosity"
    CORRUPTION = "Perceptions of corruption"
    PREDICTION = "Predicted score"
    IS_VALIDATION = "Is validation?"
    ERROR = "Error"
    SQUARED_ERROR = "Squared error"
    ABS_ERROR = "Abs. error"

class Constants:
    FILENAME = "world_happiness_report_2019.csv"
    INDEX_COLUMN = Columns.COUNTRY
    COLUMNS_TO_BE_SCALED = [Columns.SCORE, Columns.GDP, Columns.SUPPORT, Columns.LIFE, Columns.FREEDOM, Columns.GENEROSITY, Columns.CORRUPTION]
    COLUMNS_AS_INPUTS =  [Columns.GDP, Columns.SUPPORT, Columns.LIFE, Columns.FREEDOM, Columns.GENEROSITY, Columns.CORRUPTION]
    FILENAME_WEIGHTS = "best_weights.hdf5"
    FILENAME_TRAINING_HISTORY = "training_history.pickle"
    PERCENT_N_OF_VALIDATION_DATA = 0.2
    SEED = 30000
In [4]:
# Loading the dataset

df = pd.read_csv(Constants.FILENAME)
df = df.set_index(Constants.INDEX_COLUMN)
In [5]:
# Scaling

df_scaled = df.copy()

class Scaling:
    def __init__(self, minimum=None, maximum=None):
        self.min, self.max = minimum, maximum
        self.span = self.max - self.min
        self.scale_array = np.vectorize(self.scale)
        self.rescale_array = np.vectorize(self.rescale)
    
    def scale(self, x):
        return (x - self.min) / self.span
    
    def rescale(self, y):
        return (y * self.span) + self.min
    
    def scale_series(self, series):
        return series.apply(self.scale)
    
    def rescale_series(self, series):
        return series.apply(self.rescale)
        
    @staticmethod
    def create_from_series(Series):
        return Scaling(np.min(df[column]), np.max(df[column]))

df_scaling = {}

for column in Constants.COLUMNS_TO_BE_SCALED:
    series = df[column]
    df_scaling[column] = Scaling.create_from_series(series)
    df_scaled[column] = df_scaling[column].scale_series(series)

df_scaled.describe()
Out[5]:
Overall rank Score GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption
count 156.000000 156.000000 156.000000 156.000000 156.000000 156.000000 156.000000 156.000000
mean 78.500000 0.519548 0.537498 0.744344 0.635621 0.622140 0.326583 0.244156
std 45.177428 0.226428 0.236573 0.184231 0.212203 0.227083 0.168294 0.208693
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 39.750000 0.344081 0.357928 0.650092 0.480061 0.488114 0.192138 0.103753
50% 78.500000 0.513934 0.570071 0.782943 0.691499 0.660856 0.313604 0.188742
75% 117.250000 0.677685 0.731888 0.894397 0.772787 0.803883 0.438604 0.311810
max 156.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [6]:
# Transforming pandas' dataframes into matrices

all_inputs = np.array(df_scaled[Constants.COLUMNS_AS_INPUTS])
all_outputs = np.array(df_scaled[Columns.SCORE]).reshape(-1, 1)

print("Shape of input matrix", all_inputs.shape)
print("Shape of output matrix", all_outputs.shape)
Shape of input matrix (156, 6)
Shape of output matrix (156, 1)
In [7]:
# Splitting data in a set of training and validation data

n_of_data = all_inputs.shape[0]
n_of_validation_data = int(Constants.PERCENT_N_OF_VALIDATION_DATA * n_of_data)
n_of_training_data = n_of_data - n_of_validation_data

np.random.seed(Constants.SEED)
random_indices = np.arange(n_of_data)
np.random.shuffle(random_indices)
training_indices = random_indices[:n_of_training_data]
validation_indices = random_indices[n_of_training_data:]

inputs, outputs = all_inputs[training_indices, :], all_outputs[training_indices, :]
validation_inputs, validation_outputs = all_inputs[validation_indices, :], all_outputs[validation_indices, :]

print("Shape of training matrix", inputs.shape, outputs.shape)
print("Shape of validation matirx", validation_inputs.shape, validation_outputs.shape)
Shape of training matrix (125, 6) (125, 1)
Shape of validation matirx (31, 6) (31, 1)
In [8]:
# Building the model of the neural net

model = Sequential()
model.add(Dense(30, input_dim = inputs.shape[1], activation=None))
model.add(LeakyReLU(alpha=0.4))
model.add(Dense(60, activation=None ))
model.add(LeakyReLU(alpha=0.2))
model.add(Dense(30, activation=None))
model.add(LeakyReLU(alpha=0.1))
model.add(Dense(outputs.shape[1], activation='sigmoid'))

rms = RMSprop(lr=0.0003)

model.compile(loss='mse', optimizer=rms, metrics=['mse'])

model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 30)                210       
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 30)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 60)                1860      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 60)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1830      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 30)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
=================================================================
Total params: 3,931
Trainable params: 3,931
Non-trainable params: 0
_________________________________________________________________
In [9]:
# Training

HAS_TO_TRAIN = True

DEFAULT_METRIC = 'mse'
VALIDATION_METRIC = 'val_mse'
EPOCHS = 300
BATCH_SIZE = 32
METRIC_FOR_SAVING_CHECKPOINT = VALIDATION_METRIC


class LossHistory(Callback):
    N_OF_MOST_RECENT_DATA_TO_SHOW = 20
    SHOW_PLOT_EVERY_N_EPOCH = 10
    
    def __init__(self, metrics, metric_for_most_recent):
        self.metrics, self.metric_for_most_recent = metrics, metric_for_most_recent
        self.n_metrics = len(metrics)
        self.on_train_begin()
        
    def save_to_file(self, filename):
        with open(filename, "wb") as file:
            pickle.dump(self.data, file)
            
    def on_train_begin(self, logs={}):
        self.data = { metric: [] for metric in self.metrics }
        self.index = []
        self.current_index = 0
    
    def on_epoch_end(self, batch, logs={}):
        self.__update_data(logs)
    
        self.__update_index()
            
        if self.__should_show_plot():
            self.__update_plot()
            
    def __should_show_plot(self):
        return self.current_index % LossHistory.SHOW_PLOT_EVERY_N_EPOCH == 0
    
    def __update_data(self, logs):
        for key in self.data.keys():
            self.data[key].append(logs.get(key))
            
    def __update_index(self):
        self.index.append(self.current_index)
        self.current_index += 1
        
    def __update_plot(self):
        fig, ax = plt.subplots(1, self.n_metrics + 1, figsize=(20, 8))
        
        for index, metric in enumerate(self.data):
            ax[index].scatter(self.index, self.data[metric], c=Colors.PRIMARY)
            ax[index].set_yscale('log')

        last_index, last_metric = self.__last_metric() 
        ax[-1].scatter(last_index, last_metric, c=Colors.PRIMARY)
        
        self.__display_plot()
        
    def __display_plot(self):
        IPython.display.clear_output(wait=True)
        IPython.display.display(plt.show())
        
    def __last_metric(self):
        n = LossHistory.N_OF_MOST_RECENT_DATA_TO_SHOW
        return self.index[-n:-1], self.data[self.metric_for_most_recent][-n:-1]


if HAS_TO_TRAIN:    
    checkpoint = ModelCheckpoint(Constants.FILENAME_WEIGHTS, monitor=METRIC_FOR_SAVING_CHECKPOINT, verbose=0, save_best_only=True, mode='min')
    lossHistory = LossHistory([DEFAULT_METRIC, VALIDATION_METRIC], VALIDATION_METRIC)
    callbacks_list = [checkpoint, lossHistory]
    history = model.fit(inputs, outputs, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0, callbacks=callbacks_list, validation_data=(validation_inputs, validation_outputs))
    lossHistory.save_to_file(Constants.FILENAME_TRAINING_HISTORY)
None
In [10]:
# Loading weights, making the prediction, and rescaling

model.load_weights(Constants.FILENAME_WEIGHTS)

scaled_predictions = model.predict(all_inputs)[:,0]

predictions = df_scaling[Columns.SCORE].rescale_array(scaled_predictions)
In [11]:
# Determining all kind of error metrics

scores = df[Columns.SCORE]

errors = scores - predictions
squared_errors = np.power(errors, 2)
abs_errors = abs(errors)
In [12]:
# Creating association with training or validation set

is_validation = np.zeros(len(scores), dtype='int')
is_validation[validation_indices] = 1
In [13]:
# Expanding the dataset

df_extended = df.copy()

df_extended[Columns.PREDICTION] = predictions
df_extended[Columns.ERROR] = errors
df_extended[Columns.SQUARED_ERROR] = squared_errors
df_extended[Columns.ABS_ERROR] = abs_errors
df_extended[Columns.IS_VALIDATION] = is_validation

df_extended.head()
Out[13]:
Overall rank Score GDP per capita Social support Healthy life expectancy Freedom to make life choices Generosity Perceptions of corruption Predicted score Error Squared error Abs. error Is validation?
Country or region
Finland 1 7.769 1.340 1.587 0.986 0.596 0.153 0.393 6.781116 0.987884 0.975914 0.987884 0
Denmark 2 7.600 1.383 1.573 0.996 0.592 0.252 0.410 6.967333 0.632667 0.400267 0.632667 0
Norway 3 7.554 1.488 1.582 1.028 0.603 0.271 0.341 7.073109 0.480891 0.231256 0.480891 0
Iceland 4 7.494 1.380 1.624 1.026 0.591 0.354 0.118 6.867893 0.626107 0.392010 0.626107 0
Netherlands 5 7.488 1.396 1.522 0.999 0.557 0.322 0.298 6.902797 0.585203 0.342463 0.585203 0
In [14]:
# Grouping by validation and training set and display greatest squared errors in validation set

df_by_group = df_extended.groupby(by=Columns.IS_VALIDATION)

validation_set, training_set = df_by_group.get_group(0), df_by_group.get_group(1)

display( validation_set[[Columns.SCORE, Columns.PREDICTION, Columns.SQUARED_ERROR]].sort_values(Columns.SQUARED_ERROR, ascending=False).head() )
Score Predicted score Squared error
Country or region
Botswana 3.488 4.955444 2.153391
Benin 4.883 3.715804 1.362346
Pakistan 5.653 4.536079 1.247513
Tanzania 3.231 4.330420 1.208724
Hong Kong 5.430 6.474437 1.090849
In [15]:
# Computing mean squared error, 95% CI, R squared

mse = validation_set[Columns.SQUARED_ERROR].mean()

lower, upper = sms.DescrStatsW(validation_set[Columns.SQUARED_ERROR]).tconfint_mean()
ci_lower, ci_upper = mse + np.array([-lower, upper])

r_squared = r2_score(validation_set[Columns.SCORE], validation_set[Columns.PREDICTION], sample_weight=None, multioutput='uniform_average')

print("## Validation Set ##\n")
print("MSE = {:.2f}".format(mse))
print("95% CI of MSE [{:.2f}, {:.2f}]".format(ci_lower, ci_upper))
print("R^2 = {:.2f}".format(r_squared))
## Validation Set ##

MSE = 0.22
95% CI of MSE [0.06, 0.50]
R^2 = 0.83
In [16]:
# Drawing scores vs. predictions, mean squared errors, and box plot of mean squared errors for validation & training set

fig, sp = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
left_ax, middle_ax, right_ax = sp

left_ax.scatter(scores, predictions, c=is_validation, cmap=Colors.CMAP)
left_ax.axis('square')
left_ax.plot([0, 10], [0, 10], color=Colors.PRIMARY)
left_ax.set_ylim(left_ax.get_xlim())
left_ax.set_xlabel('actual score')
left_ax.set_ylabel('predicted score')

middle_ax.scatter(np.arange(len(squared_errors)), squared_errors, c=is_validation, cmap=Colors.CMAP)
middle_ax.set_xlabel('country')
middle_ax.set_ylabel('mse')

right_ax.boxplot([training_set[Columns.SQUARED_ERROR], validation_set[Columns.SQUARED_ERROR]], labels=['training', 'validation'])
right_ax.set_xlabel('group')
right_ax.set_ylabel('mse')

plt.show()