import kaggle, zipfile
from pathlib import Path
import torch, numpy as np, pandas as pd
from fastai.data.transforms import RandomSplitter
import torch.nn.functional as F
path = Path("titanic")
if not path.exists():
    print(f"{path} folder doesn't exist, downloading...")
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f"{path}.zip").extractall(path)
else:
    print(f"{path} already exists, using this folder...")
!ls {path}
df = pd.read_csv(path/"train.csv")
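# Optional sanity check (not part of the original pipeline): count the missing
# values per column to see why the fillna step below is needed; Age, Cabin and
# Embarked are the columns with NAs in this dataset.
df.isna().sum()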
def df_1_fillna_inplace(df):
    modes = df.mode(axis=0).iloc[0]  # most frequent value per column
    df.fillna(modes, inplace=True)   # replace NAs with each column's mode
def df_2_log_numeric_data_addlogfare(df): df['LogFare'] = np.log1p(df['Fare'])  # log1p stays defined for zero fares
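# Why log1p: fares span several orders of magnitude, and log1p(x) = log(1 + x)
# compresses that long right tail while remaining defined for the zero fares
# in this dataset (values below are approximate, shown for illustration).
np.log1p([0.0, 7.25, 512.33])  # ~ [0.0, 2.11, 6.24]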
def df_3_create_dummy_variables_add(df):
    return pd.get_dummies(df, columns=["Sex", "Pclass", "Embarked"], dtype=int)
def df_clean(df):
    df_1_fillna_inplace(df)
    df_2_log_numeric_data_addlogfare(df)
    return df_3_create_dummy_variables_add(df)
def get_idep_and_dep_from_df(df):
    def normalise_idep_by_max(idep):
        maxes, _ = idep.max(axis=0)  # max of each column (values, indices)
        return idep / maxes
    added_cols = ['Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    indep_cols = ['Age', 'SibSp', 'Parch', 'LogFare'] + added_cols
    idep = torch.tensor(df[indep_cols].values, dtype=torch.float)
    idep = normalise_idep_by_max(idep)
    dep = torch.tensor(df["Survived"].values)
    return idep, dep
def get_trn_val_idep_dep(idep, dep):
    trn_idx, val_idx = RandomSplitter(seed=42)(idep)
    trn_dep_mx0, val_dep_mx0 = dep[trn_idx], dep[val_idx]  # 1-dimensional, i.e. can't matrix-multiply
    trn_idep_mxn, val_idep_mxn = idep[trn_idx], idep[val_idx]
    trn_dep_mx1 = trn_dep_mx0[:, None]  # add a trailing dimension for matrix multiplication
    val_dep_mx1 = val_dep_mx0[:, None]
    return trn_idep_mxn, val_idep_mxn, trn_dep_mx1, val_dep_mx1
df = df_clean(df)
idep, dep = get_idep_and_dep_from_df(df)
trn_idep_mxn, val_idep_mxn, trn_dep_mx1, val_dep_mx1 = get_trn_val_idep_dep(idep, dep)
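# Quick shape check (illustrative only): RandomSplitter holds out 20% of the
# 891 training rows by default, so expect roughly 713 training and 178
# validation rows, each with 12 independent columns.
trn_idep_mxn.shape, val_idep_mxn.shape, trn_dep_mx1.shape, val_dep_mx1.shape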
def init_coeffs():
    n_coeffs = trn_idep_mxn.shape[1]  # 12 independent variables
    hidden_layers = [10, 10]
    sizes = [n_coeffs] + hidden_layers + [1]  # [12, 10, 10, 1]
    layers = [(torch.rand(sizes[i], sizes[i+1]) - 0.3) / sizes[i+1] * 4 for i in range(len(sizes) - 1)]  # 3 weight matrices
    consts = [(torch.rand(1)[0] - 0.5) * 0.1 for i in range(len(sizes) - 1)]  # 3 scalar biases
    for layer in layers + consts:
        layer.requires_grad_()
    return layers, consts
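# Illustrative inspection (the tmp_* names are new, not used elsewhere): with
# sizes [12, 10, 10, 1] the weight matrices should come out as 12x10, 10x10
# and 10x1, with one scalar bias per layer.
tmp_layers, tmp_consts = init_coeffs()
[layer.shape for layer in tmp_layers], len(tmp_consts)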
def calc_preds_deeplearning(idep_mxn, coeffs):
    layers, consts = coeffs
    n = len(layers)
    res = idep_mxn
    for i in range(n):
        res = res @ layers[i] + consts[i]  # [m x n] @ [n x q], e.g. [713 x 12] @ [12 x 10]
        if i != n - 1:
            res = F.relu(res)  # ReLU between layers, but not after the final one
    sgm_preds_mx1 = torch.sigmoid(res)  # squash the final output to (0, 1)
    return sgm_preds_mx1
def calc_loss(idep_mxn, dep_mx1, coeffs):
    preds_mx1 = calc_preds_deeplearning(idep_mxn, coeffs)
    return torch.abs(dep_mx1 - preds_mx1).mean()  # mean absolute error
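# Illustrative: the loss for freshly initialised, untrained coefficients; the
# exact value depends on the random seed.
torch.manual_seed(442)
calc_loss(trn_idep_mxn, trn_dep_mx1, init_coeffs())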
def update_coeffs(coeffs, lr):
    layers, consts = coeffs
    for layer in layers + consts:
        layer.sub_(layer.grad * lr)  # gradient-descent step
        layer.grad.zero_()           # reset gradients for the next epoch
def one_epoch(coeffs, lr):
    loss = calc_loss(trn_idep_mxn, trn_dep_mx1, coeffs)
    loss.backward()
    with torch.no_grad():
        update_coeffs(coeffs, lr)
    print(f"{loss:.3f}", end=';')
def train_model(n_epochs=30, lr=0.1):
    torch.manual_seed(442)
    coeffs = init_coeffs()
    for _ in range(n_epochs):
        one_epoch(coeffs, lr)
    return coeffs
coeffs = train_model(lr=4)
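# A sketch of a validation-accuracy check; calc_val_accuracy is a new helper,
# not part of the code above. Threshold the sigmoid outputs at 0.5 and compare
# against the held-out labels.
def calc_val_accuracy(coeffs):
    with torch.no_grad():
        preds_mx1 = calc_preds_deeplearning(val_idep_mxn, coeffs)
        return ((preds_mx1 > 0.5) == val_dep_mx1.bool()).float().mean()
calc_val_accuracy(coeffs)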