from sklearn.tree import DecisionTreeClassifier, export_graphviz
from numpy import random

def get_tree(prop=0.75):
    "Return a decision tree fitted to a random `prop` subset of the training data."
    n = len(trn_dep)
    idxs = random.choice(n, int(n*prop))
    return DecisionTreeClassifier(min_samples_leaf=5).fit(trn_idep.iloc[idxs], trn_dep.iloc[idxs])
# create as many trees as we want
trees = [get_tree() for t in range(100)]
# average them
all_probs = [t.predict(val_idep) for t in trees]
avg_probs = np.stack(all_probs).mean(0)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(val_dep, avg_probs)
0.2272645739910314
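If you want to play with this bagging loop outside of the notebook, here's a self-contained sketch of the same idea on synthetic data. Everything in it (the toy dataset and names like X_trn) is made up for illustration and is not the Titanic data used above:

import numpy as np
from numpy import random
from sklearn.datasets import make_classification
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# toy stand-in for the real dataset
X, y = make_classification(n_samples=1000, n_features=8, random_state=42)
X_trn, X_val, y_trn, y_val = train_test_split(X, y, random_state=42)

def toy_tree(prop=0.75):
    # each tree sees a different random 75% of the rows
    idxs = random.choice(len(y_trn), int(len(y_trn)*prop))
    return DecisionTreeClassifier(min_samples_leaf=5).fit(X_trn[idxs], y_trn[idxs])

trees = [toy_tree() for _ in range(100)]
avg = np.stack([t.predict(X_val) for t in trees]).mean(0)
mean_absolute_error(y_val, avg)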
4. Using RandomForestClassifier
This is nearly identical to what sklearn’s RandomForestClassifier does.
The main extra piece in a "real" random forest is that, as well as choosing a random sample of data for each tree, it also picks a random subset of columns for each split.
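We could mimic that in our hand-rolled version too, since DecisionTreeClassifier exposes per-split column sampling through its max_features parameter. A minimal sketch (the name get_tree_subcols and the 'sqrt' setting are illustrative choices, not from the original):

def get_tree_subcols(prop=0.75):
    "Like `get_tree`, but each split also considers only a random subset of columns."
    n = len(trn_dep)
    idxs = random.choice(n, int(n*prop))
    # max_features='sqrt' makes each split consider ~sqrt(n_columns)
    # randomly chosen candidate columns
    return DecisionTreeClassifier(min_samples_leaf=5, max_features='sqrt').fit(
        trn_idep.iloc[idxs], trn_dep.iloc[idxs])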
from sklearn.ensemble import RandomForestClassifier

# 100 trees, with the same leaf-size constraint as before
rf = RandomForestClassifier(100, min_samples_leaf=5)
rf.fit(trn_idep, trn_dep);
mean_absolute_error(val_dep, rf.predict(val_idep))
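One way to check the claim that this matches what we built by hand: the fitted forest keeps its individual trees in the estimators_ attribute, so we can average their predictions ourselves. A sketch, with two caveats: it passes raw values because the sub-trees were fitted on plain arrays, and it relies on the dependent variable being 0/1 here, since the sub-trees predict class indices rather than original labels:

# the fitted sub-trees live in rf.estimators_; averaging their predictions
# by hand should land very close to our manual ensemble above
all_probs = [t.predict(val_idep.values) for t in rf.estimators_]
mean_absolute_error(val_dep, np.stack(all_probs).mean(0))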