RF from scratch
Although there are lots of “ML from scratch” tutorials online, I haven’t seen many good random forest (RF) ones.
In Lecture 7 of the fast.ai ML course, Jeremy gave a good walkthrough of RF from scratch (http://course18.fast.ai/lessonsml1/lesson7.html).
Jeremy introduced a concept that was new to me: “top-down” programming: assume you already have everything you need, and ask how you would proceed.
- Given a tree, how do we get a forest? For this example: suppose we already have a tree API. We sample x and y for each tree, get a prediction from every tree, and take the average of the trees’ predictions at the end.
- Given that we know how to find one feature’s best split, how do we handle all the features? (Try each feature and keep the best split found.)
- Given a feature, how do we find its best split?
So we delay the “real work” as long as possible, and it’s done before we realize! (I don’t think it’s quite that simple, but this approach helps me get started.)
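As a tiny illustration of that style (the names here are made up; the real code follows below), a top-down skeleton might look like:

import numpy as np

class ForestSketch:
    # write the top level first, calling pieces that don't exist yet
    def __init__(self, x, y, n_trees):
        self.trees = [self.create_tree(x, y) for _ in range(n_trees)]
    def create_tree(self, x, y):
        raise NotImplementedError  # the "real work", deferred until we get here
    def predict(self, x):
        # pretend every tree already knows how to predict
        return np.mean([t.predict(x) for t in self.trees], axis=0)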
Think about the process:
- start from all samples (score: inf, since this is the worst model)
- grow the trees: loop through each column, and within each column loop through all split points
- get a prediction from each tree
- average the trees’ predictions for the forest’s result
import math
import numpy as np

class TreeEnsemble():
    # top-down approach: delay the "real work" as long as possible
    # assume we have everything to start with: imagine the tree API already exists when writing the RF
    def __init__(self, x, y, n_trees, sample_sz, min_leaf=5):
        # write this first
        np.random.seed(42)
        self.x,self.y,self.sample_sz,self.min_leaf = x,y,sample_sz,min_leaf
        self.trees = [self.create_tree() for i in range(n_trees)]

    def create_tree(self):
        # how do we build one tree?
        # subsample without replacement: permute the row indices, then take the first sample_sz of them
        idxs = np.random.permutation(len(self.y))[:self.sample_sz]  # this permutation idiom is used everywhere
        return DecisionTree(self.x.iloc[idxs], self.y[idxs],
                            idxs=np.array(range(self.sample_sz)), min_leaf=self.min_leaf)

    def predict(self, x):
        # write this second
        return np.mean([t.predict(x) for t in self.trees], axis=0)
def std_agg(cnt, s1, s2): return math.sqrt((s2/cnt) - (s1/cnt)**2)  # std dev from a running count, sum (s1) and sum of squares (s2)
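std_agg uses the identity Var(x) = E[x²] − (E[x])², so a side’s standard deviation can be computed from just its count, sum (s1), and sum of squares (s2). A quick check against numpy (illustration only, using the np and std_agg defined above):

vals = np.array([3., 5., 8., 10.])
print(std_agg(len(vals), vals.sum(), (vals**2).sum()))  # ~2.693
print(vals.std())                                       # ~2.693 (numpy's default is the population std, ddof=0)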
class DecisionTree():
    # a plain tree: no random sampling here, just work with the data handed to it
    # pass in the random sample of x, the sample of y, and idxs (indexes of the rows this node uses;
    # the recursion starts with the entire random sample)
    def __init__(self, x, y, idxs, min_leaf=5):
        self.x,self.y,self.idxs,self.min_leaf = x,y,idxs,min_leaf
        self.n,self.c = len(idxs), x.shape[1]  # n: observations, c: columns
        self.val = np.mean(y[idxs])            # mean of y for these indexes (at the top: all the data)
        self.score = float('inf')
        self.find_varsplit()

    def find_varsplit(self):  # find the variable (and value) to split on
        for i in range(self.c): self.find_better_split(i)  # go through all columns
        if self.score == float('inf'): return  # no split improved the score: this node is a leaf
        x = self.split_col
        lhs = np.nonzero(x<=self.split)[0]  # convert a boolean mask to the indexes where it is True
        rhs = np.nonzero(x>self.split)[0]
        self.lhs = DecisionTree(self.x, self.y, self.idxs[lhs], min_leaf=self.min_leaf)  # build a tree on each half
        self.rhs = DecisionTree(self.x, self.y, self.idxs[rhs], min_leaf=self.min_leaf)
    def find_better_split(self, var_idx):
        # this is the slower implementation: O(n^2) per column (the faster version below overrides it)
        x,y = self.x.values[self.idxs,var_idx], self.y[self.idxs]
        for i in range(self.n):        # O(n) candidate split points...
            lhs = x<=x[i]              # ...and O(n) work for each one
            rhs = x>x[i]
            if rhs.sum()<self.min_leaf or lhs.sum()<self.min_leaf: continue
            lhs_std = y[lhs].std()
            rhs_std = y[rhs].std()
            curr_score = lhs_std*lhs.sum() + rhs_std*rhs.sum()  # weighted sum of the two sides' std devs
            if curr_score<self.score:
                self.var_idx,self.score,self.split = var_idx,curr_score,x[i]
    def find_better_split(self, var_idx):  # find a better split for this column
        # this version is O(n) after sorting: start with everything on the right-hand side,
        # then move one item at a time to the left, updating running sums as we go
        # a better split means the two groups have a lower weighted sum of std devs
        x,y = self.x.values[self.idxs,var_idx], self.y[self.idxs]
        sort_idx = np.argsort(x)
        sort_y,sort_x = y[sort_idx], x[sort_idx]
        rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()
        lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0.

        for i in range(0,self.n-self.min_leaf):
            xi,yi = sort_x[i],sort_y[i]
            lhs_cnt += 1;      rhs_cnt -= 1
            lhs_sum += yi;     rhs_sum -= yi
            lhs_sum2 += yi**2; rhs_sum2 -= yi**2
            if i<self.min_leaf-1 or xi==sort_x[i+1]:
                continue  # don't split below min_leaf, and don't split between equal values
            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)  # std dev of each side from the running sums
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score<self.score:
                self.var_idx,self.score,self.split = var_idx,curr_score,xi
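    # Worked example of the running-sum update above (illustration only, with min_leaf small enough to allow it, e.g. 1):
    #   sorted x = [1, 2, 3, 10, 11, 12], sorted y = [1, 1, 1, 9, 9, 9]
    #   after moving the first three items to the left side:
    #     lhs_cnt=3, lhs_sum=3,  lhs_sum2=3   -> lhs_std = sqrt(3/3 - (3/3)**2)   = 0
    #     rhs_cnt=3, rhs_sum=27, rhs_sum2=243 -> rhs_std = sqrt(243/3 - (27/3)**2) = 0
    #   score = 0*3 + 0*3 = 0, so x <= 3 is found as a (perfect) split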
    # @property means no () when used: self.is_leaf rather than self.is_leaf()
    @property
    def split_name(self): return self.x.columns[self.var_idx]

    @property
    def split_col(self): return self.x.values[self.idxs,self.var_idx]

    @property
    def is_leaf(self): return self.score == float('inf')
    # for printing
    def __repr__(self):
        s = f'n: {self.n}; val:{self.val}'
        if not self.is_leaf:
            s += f'; score:{self.score}; split:{self.split}; var:{self.split_name}'
        return s

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf: return self.val
        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs
        return t.predict_row(xi)  # walk down the tree until we hit a leaf
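With both classes defined, a quick smoke test on synthetic data is a cheap way to confirm they work together (the names and data below are made up purely for illustration):

import pandas as pd

np.random.seed(0)
x_toy = pd.DataFrame({'a': np.random.randn(500), 'b': np.random.randn(500)})
y_toy = (3 * x_toy['a'] + np.random.randn(500) * 0.3).values

toy_ens = TreeEnsemble(x_toy, y_toy, n_trees=5, sample_sz=300, min_leaf=5)
toy_preds = toy_ens.predict(x_toy.values)
print(np.corrcoef(y_toy, toy_preds)[0, 1])  # should be close to 1 on this easy problem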
Check against a sklearn tree
from sklearn.ensemble import RandomForestRegressor

# x_samp, y_samp: a small sample of the training data (defined earlier in the lesson notebook)
m = RandomForestRegressor(n_estimators=1, max_depth=2, bootstrap=False)
m.fit(x_samp, y_samp)
draw_tree(m.estimators_[0], x_samp, precision=2)  # draw_tree is a plotting helper from the fast.ai ML course
This draws the single tree, so we can see where it splits and compare those splits with our own implementation.
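We can also build a single tree with our own classes on the same sample and print its top splits to compare with the drawing above (a rough sketch, reusing x_samp and y_samp):

our_tree = TreeEnsemble(x_samp, y_samp, n_trees=1, sample_sz=len(y_samp)).trees[0]
print(our_tree)                    # root: n, mean value, score, split variable and split point
print(our_tree.lhs, our_tree.rhs)  # second-level splits, to compare with draw_tree's output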
Run the implemented model
from sklearn import metrics
import matplotlib.pyplot as plt

# the columns and the train/validation split come from the Blue Book for Bulldozers data used throughout the course
cols = ['MachineID', 'YearMade', 'MachineHoursCurrentMeter', 'ProductSize', 'Enclosure',
        'Coupler_System', 'saleYear']

ens = TreeEnsemble(X_train[cols], y_train, 5, 1000)  # 5 trees, 1000 rows sampled per tree
preds = ens.predict(X_valid[cols].values)
plt.scatter(y_valid, preds, alpha=0.1, s=6);         # predictions vs. actuals
metrics.r2_score(y_valid, preds)                     # R^2 on the validation set