from numpy import random
from sklearn.model_selection import train_test_split

# NOTE(review): `pd`, `path`, `proc_data_1` and `convert_cats_to_codes_2` are
# used below but not defined in this snippet; they are expected to be defined
# earlier in the file.

# Seed NumPy's global RNG so that the random train/validation split below
# (which is not given its own `random_state`) is repeatable.
random.seed(42)

# 0. get raw data
df = pd.read_csv(path/'train.csv')
tst_df = pd.read_csv(path/'test.csv')

# 1. clean data ([replace nas with mode], [logfare], [sex/embarked to cat])
# The return values are discarded, so the helper presumably modifies the
# frames in place -- TODO confirm against its definition.
proc_data_1(df)
proc_data_1(tst_df)

# 2. split training data: training and validation set
trn_df, val_df = train_test_split(df, test_size=0.25)

# 3. convert cats to codes
cat_list = ["Sex", "Embarked"]
trn_df, val_df = convert_cats_to_codes_2(trn_df, val_df, cat_list)

# 4. get idep and deps
dep_col = "Survived"
cont_list = ['Age', 'SibSp', 'Parch', 'LogFare', "Pclass"]


def get_trn_and_val_idep_dep(df):
    """Split a frame into independent columns and the dependent column.

    Args:
        df: DataFrame containing every column named in ``cat_list`` and
            ``cont_list`` as well as the ``dep_col`` column.

    Returns:
        A ``(idep, dep)`` tuple: ``idep`` is a copy of the
        ``cat_list + cont_list`` columns, ``dep`` is the ``dep_col`` column.
    """
    idep = df[cat_list + cont_list].copy()
    dep = df[dep_col]
    return idep, dep


trn_idep, trn_dep = get_trn_and_val_idep_dep(trn_df)
val_idep, val_dep = get_trn_and_val_idep_dep(val_df)
3. Binary Splits
A binary split is where all rows are placed into one of two groups, based on whether they’re above or below some threshold of some column.
4. 1R Classifier model
In layman's terms:
1. Get all unique values of each independent variable (column).
2. Split on each value, i.e., perform a binary split.
3. Make predictions on survivability using the above split.
4. Calculate the standard deviation of the dependent variable on each side of the split, weight each by the number of rows on that side, and add them.
5. If the std. dev. is high, then it's a bad split, since each side still contains a mix of passengers who survived and who perished. A good split results in low variability.
6. Find the split point for each column with the lowest std. dev.
7. This is the 1R model.
5. Code
def _side_score(side, y):
    """Return the std. dev. of ``y`` on one side of a split, times its row count.

    Args:
        side: Boolean mask selecting the rows that fall on this side.
        y: Dependent values, indexable by ``side``.

    Returns:
        ``y[side].std() * side.sum()``, or 0 when the side holds at most
        one row.
    """
    tot = side.sum()
    if tot <= 1:
        return 0
    return y[side].std() * tot


def score(idep_col, dep, split_val):
    """Score a binary split of ``idep_col`` at ``split_val`` (lower is better).

    Rows with ``idep_col <= split_val`` form the left side and all remaining
    rows form the right side. The two side scores are summed and divided by
    the total number of rows.

    Args:
        idep_col: Independent column to split on.
        dep: Dependent column aligned with ``idep_col``.
        split_val: Threshold value for the split.

    Returns:
        The row-count-weighted average std. dev. of ``dep`` over both sides.
    """
    lhs_bool_list = idep_col <= split_val
    return (_side_score(lhs_bool_list, dep) + _side_score(~lhs_bool_list, dep)) / len(dep)


def min_col(df, idep_col_name):
    """Find the best split point for one column of ``df``.

    Args:
        df: DataFrame containing ``idep_col_name`` and the ``dep_col`` column.
        idep_col_name: Name of the independent column to evaluate.

    Returns:
        A ``(split_value, score)`` tuple for the unique value of the column
        whose split has the lowest score.
    """
    idep_col = df[idep_col_name]
    dep = df[dep_col]
    # dropna() already removes missing values, so no further NaN filtering is
    # done here; this keeps `scores` index-aligned with `col_uniques`.
    col_uniques = idep_col.dropna().unique()
    # One score per candidate split value.
    scores = np.array([score(idep_col, dep, col_val) for col_val in col_uniques])
    idx = scores.argmin()  # index of the lowest (best) score
    return col_uniques[idx], scores[idx]


all_cols = cat_list + cont_list
# Best (split value, score) per column. This is a bare expression, so its
# value is only shown when run interactively (e.g. in a notebook cell).
{col: min_col(trn_df, col) for col in all_cols}