import kaggle, zipfile
from pathlib import Path
import torch, numpy as np, pandas as pd
from fastai.data.transforms import RandomSplitter
import torch.nn.functional as F
path = Path("titanic")
if not path.exists():
    print(f"{path} folder doesn't exist, downloading...")
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f"{path}.zip").extractall(path)
else:
    print(f"{path} already exists, using this folder...")
!ls {path}
df = pd.read_csv(path/"train.csv")
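# Optional sanity check (not part of the original pipeline): count the missing
# values per column to see why the fillna step below is needed; Age, Cabin and
# Embarked are the columns with NAs in this dataset.
df.isna().sum()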
def df_1_fillna_inplace(df):
    modes = df.mode(axis=0).iloc[0]  # most frequent value per column
    df.fillna(modes, inplace=True)   # replace NAs with each column's mode
def df_2_log_numeric_data_addlogfare(df): df['LogFare'] = np.log1p(df['Fare'])  # log1p stays defined for zero fares
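# Why log1p: fares span several orders of magnitude, and log1p(x) = log(1 + x)
# compresses that long right tail while remaining defined for the zero fares
# in this dataset (values below are approximate, shown for illustration).
np.log1p([0.0, 7.25, 512.33])  # ~ [0.0, 2.11, 6.24]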
def df_3_create_dummy_variables_add(df):
    return pd.get_dummies(df, columns=["Sex", "Pclass", "Embarked"], dtype=int)
def df_clean(df):
    df_1_fillna_inplace(df)
    df_2_log_numeric_data_addlogfare(df)
    return df_3_create_dummy_variables_add(df)
def get_idep_and_dep_from_df(df):
    def normalise_idep_by_max(idep):
        maxes, _ = idep.max(axis=0)  # max of each column (values, indices)
        return idep / maxes
    added_cols = ['Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    indep_cols = ['Age', 'SibSp', 'Parch', 'LogFare'] + added_cols
    idep = torch.tensor(df[indep_cols].values, dtype=torch.float)
    idep = normalise_idep_by_max(idep)
    dep = torch.tensor(df["Survived"].values)
    return idep, dep
def get_trn_val_idep_dep(idep, dep):
    trn_idx, val_idx = RandomSplitter(seed=42)(idep)
    trn_dep_mx0, val_dep_mx0 = dep[trn_idx], dep[val_idx]  # 1-dimensional, i.e. can't matrix-multiply
    trn_idep_mxn, val_idep_mxn = idep[trn_idx], idep[val_idx]
    trn_dep_mx1 = trn_dep_mx0[:, None]  # add a trailing dimension for matrix multiplication
    val_dep_mx1 = val_dep_mx0[:, None]
    return trn_idep_mxn, val_idep_mxn, trn_dep_mx1, val_dep_mx1
df = df_clean(df)
idep, dep = get_idep_and_dep_from_df(df)
trn_idep_mxn, val_idep_mxn, trn_dep_mx1, val_dep_mx1 = get_trn_val_idep_dep(idep, dep)
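# Quick shape check (illustrative only): RandomSplitter holds out 20% of the
# 891 training rows by default, so expect roughly 713 training and 178
# validation rows, each with 12 independent columns.
trn_idep_mxn.shape, val_idep_mxn.shape, trn_dep_mx1.shape, val_dep_mx1.shape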
def init_coeffs():
    n_coeffs = trn_idep_mxn.shape[1]  # 12 independent variables
    hidden_layers = [10, 10]
    sizes = [n_coeffs] + hidden_layers + [1]  # [12, 10, 10, 1]
    layers = [(torch.rand(sizes[i], sizes[i+1]) - 0.3) / sizes[i+1] * 4 for i in range(len(sizes) - 1)]  # 3 weight matrices
    consts = [(torch.rand(1)[0] - 0.5) * 0.1 for i in range(len(sizes) - 1)]  # 3 scalar biases
    for layer in layers + consts:
        layer.requires_grad_()
    return layers, consts
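# Illustrative inspection (the tmp_* names are new, not used elsewhere): with
# sizes [12, 10, 10, 1] the weight matrices should come out as 12x10, 10x10
# and 10x1, with one scalar bias per layer.
tmp_layers, tmp_consts = init_coeffs()
[layer.shape for layer in tmp_layers], len(tmp_consts)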
def calc_preds_deeplearning(idep_mxn, coeffs):
    layers, consts = coeffs
    n = len(layers)
    res = idep_mxn
    for i in range(n):
        res = res @ layers[i] + consts[i]  # [m x n] @ [n x q], e.g. [713 x 12] @ [12 x 10]
        if i != n - 1:
            res = F.relu(res)  # ReLU between layers, but not after the final one
    sgm_preds_mx1 = torch.sigmoid(res)  # squash the final output to (0, 1)
    return sgm_preds_mx1
def calc_loss(idep_mxn, dep_mx1, coeffs):
    preds_mx1 = calc_preds_deeplearning(idep_mxn, coeffs)
    return torch.abs(dep_mx1 - preds_mx1).mean()  # mean absolute error
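# Illustrative: the loss for freshly initialised, untrained coefficients; the
# exact value depends on the random seed.
torch.manual_seed(442)
calc_loss(trn_idep_mxn, trn_dep_mx1, init_coeffs())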
def update_coeffs(coeffs, lr):
    layers, consts = coeffs
    for layer in layers + consts:
        layer.sub_(layer.grad * lr)  # gradient-descent step
        layer.grad.zero_()           # reset gradients for the next epoch
def one_epoch(coeffs, lr):
    loss = calc_loss(trn_idep_mxn, trn_dep_mx1, coeffs)
    loss.backward()
    with torch.no_grad():
        update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end=';')
def train_model(n_epochs=30, lr=0.1):
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for _ in range(n_epochs):
        one_epoch(coeffs, lr)
    return coeffs
coeffs = train_model(lr=4)
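# A sketch of a validation-accuracy check; calc_val_accuracy is a new helper,
# not part of the code above. Threshold the sigmoid outputs at 0.5 and compare
# against the held-out labels.
def calc_val_accuracy(coeffs):
    with torch.no_grad():
        preds_mx1 = calc_preds_deeplearning(val_idep_mxn, coeffs)
        return ((preds_mx1 > 0.5) == val_dep_mx1.bool()).float().mean()
calc_val_accuracy(coeffs)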