import torch, numpy as np, pandas as pd
1. Download Competition Data
import kaggle, zipfile
from pathlib import Path
# Download and unpack the Kaggle "titanic" competition data, but only when the
# local data folder is missing, so re-running the notebook does not re-download.
path = Path("titanic")
if not path.exists():
    print(f"{path} folder doesn't exist, downloading...")
    # The archive is expected at f"{path}.zip" once the download finishes.
    kaggle.api.competition_download_cli(str(path))
    # Context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile(f"{path}.zip") as archive:
        archive.extractall(path)
else:
    print(f"{path} exists!")
!ls {path}
titanic folder doesn't exist, downloading...
Downloading titanic.zip to /home/tonydevs/github/blog/posts/2024-04-21-deep_learning
100%|██████████| 34.1k/34.1k [00:00<00:00, 92.8kB/s]
gender_submission.csv test.csv train.csv
2. Clean Data
2.1 Read Training Data
# Load the training split from the downloaded competition folder.
df = pd.read_csv(path/"train.csv")
2.2 Deal with NA’s
df.isna().sum()  # find nas: count of missing values per column
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
modes = df.mode(axis=0).iloc[0]  # get modes: first row of the per-column mode table
df.fillna(modes, inplace=True)  # replace nas with mode per col
df.isna().sum()  # no more nas
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0
dtype: int64
2.3 Deal with Numeric Data
df['Fare'].hist()  # Not evenly spread
df['LogFare'] = np.log1p(df['Fare'])  # log(1 + Fare), so a fare of 0 stays finite
df['LogFare'].hist()  # more evenly spread
2.4 Deal with Categorical Data
df.nunique() # [Pclass], [Sex] and [Embarked] variables only have 2-3 categories.
# A good choice to create dummy variables
PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 147
Embarked 3
LogFare 248
dtype: int64
# One-hot encode the three low-cardinality categorical columns as 0/1 ints.
df = pd.get_dummies(df, columns=["Sex", "Pclass", "Embarked"], dtype=int)
added_cols = ['Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
# Full list of model inputs: the numeric columns followed by the dummy columns.
indep_cols = ['Age', 'SibSp', 'Parch', 'LogFare'] + added_cols
df.nunique()  # [Sex], [Pclass] and [Embarked] dummy variables created
PassengerId 891
Survived 2
Name 891
Age 88
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 147
LogFare 248
Sex_female 2
Sex_male 2
Pclass_1 2
Pclass_2 2
Pclass_3 2
Embarked_C 2
Embarked_Q 2
Embarked_S 2
dtype: int64
2.5 Normalise Numerical Data
# Independent variables as a float tensor, one row per passenger.
idep_values_2d_tsr = torch.tensor(df[indep_cols].values, dtype=torch.float)
idep_values_2d_tsr[0:5]  # Column 1 (20s) and Column 4 (2-4) are much larger than others (0-1).
tensor([[22.0000, 1.0000, 0.0000, 2.1102, 1.0000, 0.0000, 0.0000, 0.0000,
1.0000, 0.0000, 0.0000, 1.0000],
[38.0000, 1.0000, 0.0000, 4.2806, 0.0000, 1.0000, 1.0000, 0.0000,
0.0000, 1.0000, 0.0000, 0.0000],
[26.0000, 0.0000, 0.0000, 2.1889, 0.0000, 1.0000, 0.0000, 0.0000,
1.0000, 0.0000, 0.0000, 1.0000],
[35.0000, 1.0000, 0.0000, 3.9908, 0.0000, 1.0000, 1.0000, 0.0000,
0.0000, 0.0000, 0.0000, 1.0000],
[35.0000, 0.0000, 0.0000, 2.2028, 1.0000, 0.0000, 0.0000, 0.0000,
1.0000, 0.0000, 0.0000, 1.0000]])
maxes, _ = idep_values_2d_tsr.max(axis=0)  # get max of each column (indices discarded)
# Dividing by the column max scales every feature to roughly the same range.
idep_norms_2d_tsr_mxn = idep_values_2d_tsr / maxes
idep_norms_2d_tsr_mxn[0:5]  # values are normalised about 0-1
tensor([[0.2750, 0.1250, 0.0000, 0.3381, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000,
0.0000, 0.0000, 1.0000],
[0.4750, 0.1250, 0.0000, 0.6859, 0.0000, 1.0000, 1.0000, 0.0000, 0.0000,
1.0000, 0.0000, 0.0000],
[0.3250, 0.0000, 0.0000, 0.3507, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000,
0.0000, 0.0000, 1.0000],
[0.4375, 0.1250, 0.0000, 0.6395, 0.0000, 1.0000, 1.0000, 0.0000, 0.0000,
0.0000, 0.0000, 1.0000],
[0.4375, 0.0000, 0.0000, 0.3530, 1.0000, 0.0000, 0.0000, 0.0000, 1.0000,
0.0000, 0.0000, 1.0000]])
3. Training and Validation Sets
from fastai.data.transforms import RandomSplitter
# Dependent variable (the "Survived" column) as a 1-D tensor.
dep_mx0 = torch.tensor(df["Survived"])

# Seeded random split so the train/validation indices are reproducible.
trn_idx, val_idx = RandomSplitter(seed=42)(idep_norms_2d_tsr_mxn)
trn_idep_mxn, val_idep_mxn = idep_norms_2d_tsr_mxn[trn_idx], idep_norms_2d_tsr_mxn[val_idx]
trn_dep_mx0, val_dep_mx0 = dep_mx0[trn_idx], dep_mx0[val_idx]

# Add a trailing dimension so the targets are [m x 1], matching the
# [m x 1] predictions they are compared against.
trn_dep_mx1 = trn_dep_mx0[:,None]
val_dep_mx1 = val_dep_mx0[:,None]
4. Deep Learning Neural Network
4.1 Initialise Coefficients
import torch.nn.functional as F
def init_coeffs(n_coeffs=None, hidden_layers=(10, 10)):
    """Create randomly initialised weights and biases for a small dense network.

    Args:
        n_coeffs: Number of input features. When None (the default) it is read
            from the module-level training tensor ``trn_idep_mxn`` (12 columns
            in this notebook).
        hidden_layers: Width of each hidden layer; defaults to two layers of 10.

    Returns:
        A ``(layers, consts)`` tuple: ``layers`` holds one weight matrix per
        consecutive pair of sizes, ``consts`` one scalar bias per layer. All
        tensors have ``requires_grad`` enabled.
    """
    if n_coeffs is None:
        n_coeffs = trn_idep_mxn.shape[1]  # 12
    sizes = [n_coeffs, *hidden_layers, 1]  # [12,10,10,1] with the defaults
    # All weight matrices are drawn before any bias, keeping the random-number
    # stream (and therefore seeded results) in the original order.
    layers = [
        (torch.rand(n_in, n_out) - 0.3) / n_out * 4
        for n_in, n_out in zip(sizes, sizes[1:])
    ]
    consts = [(torch.rand(1)[0] - 0.5) * 0.1 for _ in range(len(sizes) - 1)]
    for tensor in layers + consts:
        tensor.requires_grad_()
    return layers, consts
4.2 Calculate Predictions
# i=1: [12,10] [nxq1] res1 = [713x12]@[12x10] = [713x10]
# i=2: [10,10] [q1xq2] res2 = [713x10]@[10x10] = [713x10]
# ...
# i=n: [10,1] [qnx1] resn = [713x10]@[10x1] = [713x1]
def calc_preds_deeplearning(trn_idep_mxn, coeffs):
    """Run a forward pass through the dense network and return probabilities.

    Args:
        trn_idep_mxn: Input feature tensor, one row per sample ([m x n]).
        coeffs: ``(layers, consts)`` tuple of weight matrices and per-layer
            biases, as returned by ``init_coeffs``.

    Returns:
        Sigmoid of the final layer's output. With a last weight matrix of
        shape [q x 1] this is an [m x 1] tensor of values in (0, 1).
    """
    layers, consts = coeffs
    n = len(layers)
    res = trn_idep_mxn
    for i in range(n):
        res = res @ layers[i] + consts[i]  # [mxn]@[nxq] e.g. [713x12]@[12x10]
        if i != n - 1:
            # ReLU between layers only; the last layer feeds the sigmoid directly.
            res = F.relu(res)
    sgm_preds_mx1 = torch.sigmoid(res)
    return sgm_preds_mx1
4.3 Calculate Loss
def calc_loss(idep_mxn, dep_mx1, coeffs):
    """Return the mean absolute error between targets and network predictions.

    Args:
        idep_mxn: Input feature tensor passed to ``calc_preds_deeplearning``.
        dep_mx1: Target tensor, subtracted element-wise from the predictions.
        coeffs: ``(layers, consts)`` tuple of network parameters.

    Returns:
        A scalar tensor: the mean of ``|dep_mx1 - predictions|``.
    """
    preds_mx1 = calc_preds_deeplearning(idep_mxn, coeffs)
    return torch.abs(dep_mx1 - preds_mx1).mean()
4.4 Update Coefficients and Constants
def update_coeffs(coeffs, lr):
    """Apply one gradient-descent step in place and reset the gradients.

    Every tensor must already have a populated ``.grad`` (i.e. ``backward()``
    has been called). ``one_epoch`` calls this inside ``torch.no_grad()`` so
    the in-place update is not itself tracked by autograd.

    Args:
        coeffs: ``(layers, consts)`` tuple of parameter tensors to update.
        lr: Learning rate that scales each gradient before subtraction.
    """
    layers, consts = coeffs
    for layer in layers + consts:
        layer.sub_(layer.grad * lr)
        layer.grad.zero_()  # clear so the next backward() does not accumulate
4.5 One Epoch
def one_epoch(coeffs, lr):
    """Run one full-batch training step on the training set and print the loss.

    Args:
        coeffs: ``(layers, consts)`` tuple of parameters, updated in place.
        lr: Learning rate forwarded to ``update_coeffs``.
    """
    loss = calc_loss(trn_idep_mxn, trn_dep_mx1, coeffs)
    loss.backward()
    # The parameter update must not be recorded by autograd.
    with torch.no_grad():
        update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end=';')
4.6 Train Model with 30 Epochs
def train_model(n_epochs=30, lr=0.1):
    """Initialise the network and train it for a fixed number of epochs.

    Args:
        n_epochs: Number of calls to ``one_epoch``.
        lr: Learning rate used for every epoch.

    Returns:
        The trained ``(layers, consts)`` tuple.
    """
    torch.manual_seed(442)  # fixed seed so the random initialisation is reproducible
    coeffs = init_coeffs()
    for _ in range(n_epochs):
        one_epoch(coeffs, lr)
    return coeffs
# Train with the default 30 epochs; the per-epoch losses are printed inline.
coeffs = train_model(lr=4)
0.521;0.483;0.427;0.379;0.379;0.379;0.379;0.378;0.378;0.378;0.378;0.378;0.378;0.378;0.378;0.378;0.377;0.376;0.371;0.333;0.239;0.224;0.208;0.204;0.203;0.203;0.207;0.197;0.196;0.195;
5. Submit to Kaggle
5.1 Prepare Test-Set
# Apply the same preprocessing to the test set as was applied to the training set.
tst_df = pd.read_csv(path/'test.csv')
tst_df['Fare'] = tst_df.Fare.fillna(0)  # fill missing fares with 0 before the mode fill
tst_df.fillna(modes, inplace=True)  # reuse the training-set modes for the remaining nas
tst_df['LogFare'] = np.log1p(tst_df['Fare'])  # same transform as the training LogFare
tst_df = pd.get_dummies(tst_df, columns=["Sex","Pclass","Embarked"], dtype=int)
tst_indep = torch.tensor(tst_df[indep_cols].values, dtype=torch.float)
tst_indep = tst_indep / maxes  # normalise with the training-set column maxes
5.2 Predictions on Test-Set
# Threshold the predicted probabilities at 0.5 to get 0/1 survival labels.
tst_df['Survived'] = (calc_preds_deeplearning(tst_indep, coeffs)>0.5).int()
5.3 Create Submission CSV
# Write the two submission columns to CSV, then upload the file to the competition.
titanic_submission_df = tst_df[['PassengerId','Survived']]
titanic_submission_df.to_csv('titanic_submission.csv', index=False)
kaggle.api.competition_submit(file_name='titanic_submission.csv',
                              message='20240420_tit_submission',
                              competition='titanic')
Warning: Looks like you're using an outdated API Version, please consider updating (server 1.6.12 / client 1.6.6)
100%|██████████| 2.77k/2.77k [00:00<00:00, 3.42kB/s]100%|██████████| 2.77k/2.77k [00:01<00:00, 1.82kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster
6. Success!
I’ve finally finished building my first deep-learning neural-network model from scratch and successfully submitted it to Kaggle, scoring 77.75% accuracy.