Random Forest

Loading Packages and Reading Data

Code

# utility packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# modeling packages
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

Code

# read the dataset
df = pd.read_csv('./star_classification 2.csv')

# encode the class labels as integers
df.replace({'class': {'GALAXY': 0, 'STAR': 1, 'QSO':2}}, inplace=True)

# remove all columns whose name ends in "ID"
cleaned = df.drop(df.filter(regex='ID$').columns, axis=1)
# drop the date column
cleaned = cleaned.drop('MJD', axis=1)

# renumber the rows; drop=True discards the old index instead of inserting it
# as an extra 'index' column, which would otherwise end up in X as a feature
cleaned = cleaned.reset_index(drop=True)

# make the X and y variables
X = cleaned.drop('class', axis=1)
y = cleaned['class']

# split the dataset: 70% train / 30% test, seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

Modeling

Code

# Trial 1
# Trying different numbers of trees: 10, 20, 30, ...

Code

# Sweep the forest size from 10 to 1000 trees in steps of 10 and record the
# test-set accuracy for each size. `n` and `accuracy` are parallel lists:
# accuracy[k] is the score obtained with n[k] trees.
n = []
accuracy = []
for i in range(10, 1001, 10):
    # n_jobs=-1 fits and predicts with all available cores
    clf = RandomForestClassifier(n_estimators=i, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # score once and reuse the value for both the log line and the record
    score = metrics.accuracy_score(y_test, y_pred)
    print(f"#{i} Accuracy:", score)
    n.append(i)
    accuracy.append(score)

#10 Accuracy: 0.9771333333333333
#20 Accuracy: 0.9789
#30 Accuracy: 0.9789333333333333
#40 Accuracy: 0.9793666666666667
#50 Accuracy: 0.9789
#60 Accuracy: 0.9791666666666666
#70 Accuracy: 0.9791666666666666
#80 Accuracy: 0.9791
#90 Accuracy: 0.9792
#100 Accuracy: 0.9793333333333333
#110 Accuracy: 0.9791
#120 Accuracy: 0.9791333333333333
#130 Accuracy: 0.9789333333333333
#140 Accuracy: 0.9793666666666667
#150 Accuracy: 0.9788333333333333
#160 Accuracy: 0.9796666666666667
#170 Accuracy: 0.9790333333333333
#180 Accuracy: 0.9790333333333333
#190 Accuracy: 0.9791
#200 Accuracy: 0.9794666666666667
#210 Accuracy: 0.9794
#220 Accuracy: 0.9793333333333333
#230 Accuracy: 0.9792
#240 Accuracy: 0.9795
#250 Accuracy: 0.9792666666666666
#260 Accuracy: 0.9793
#270 Accuracy: 0.9792333333333333
#280 Accuracy: 0.9794
#290 Accuracy: 0.9795333333333334
#300 Accuracy: 0.9792
#310 Accuracy: 0.9792666666666666
#320 Accuracy: 0.9794666666666667
#330 Accuracy: 0.9791
#340 Accuracy: 0.9792333333333333
#350 Accuracy: 0.9793
#360 Accuracy: 0.9795333333333334
#370 Accuracy: 0.9794333333333334
#380 Accuracy: 0.9792666666666666
#390 Accuracy: 0.9791333333333333
#400 Accuracy: 0.9791666666666666
#410 Accuracy: 0.9789666666666667
#420 Accuracy: 0.9793666666666667
#430 Accuracy: 0.9795666666666667
#440 Accuracy: 0.9794333333333334
#450 Accuracy: 0.9797
#460 Accuracy: 0.9792
#470 Accuracy: 0.9796
#480 Accuracy: 0.9794333333333334
#490 Accuracy: 0.9790333333333333
#500 Accuracy: 0.9793666666666667
#510 Accuracy: 0.9792
#520 Accuracy: 0.9794
#530 Accuracy: 0.9792666666666666
#540 Accuracy: 0.9792666666666666
#550 Accuracy: 0.9794
#560 Accuracy: 0.9795333333333334
#570 Accuracy: 0.9794
#580 Accuracy: 0.9792666666666666
#590 Accuracy: 0.9793333333333333
#600 Accuracy: 0.9790333333333333
#610 Accuracy: 0.9795
#620 Accuracy: 0.9792
#630 Accuracy: 0.9793333333333333
#640 Accuracy: 0.9790666666666666
#650 Accuracy: 0.9793
#660 Accuracy: 0.9792666666666666
#670 Accuracy: 0.9792333333333333
#680 Accuracy: 0.9790666666666666
#690 Accuracy: 0.9792333333333333
#700 Accuracy: 0.9793
#710 Accuracy: 0.9797
#720 Accuracy: 0.9793666666666667
#730 Accuracy: 0.9793
#740 Accuracy: 0.9792666666666666
#750 Accuracy: 0.9794333333333334
#760 Accuracy: 0.9795
#770 Accuracy: 0.9792333333333333
#780 Accuracy: 0.9793666666666667
#790 Accuracy: 0.9793333333333333
#800 Accuracy: 0.9791666666666666
#810 Accuracy: 0.9794
#820 Accuracy: 0.9794333333333334
#830 Accuracy: 0.9794333333333334
#840 Accuracy: 0.9793333333333333
#850 Accuracy: 0.9794666666666667
#860 Accuracy: 0.9792333333333333
#870 Accuracy: 0.9793666666666667
#880 Accuracy: 0.9794
#890 Accuracy: 0.9793
#900 Accuracy: 0.9794333333333334
#910 Accuracy: 0.9795666666666667
#920 Accuracy: 0.9795666666666667
#930 Accuracy: 0.9792333333333333
#940 Accuracy: 0.9793333333333333
#950 Accuracy: 0.9792666666666666
#960 Accuracy: 0.9792333333333333
#970 Accuracy: 0.9793666666666667
#980 Accuracy: 0.9794666666666667
#990 Accuracy: 0.9795
#1000 Accuracy: 0.9793

Code

# best test-set accuracy recorded in the `accuracy` list built by the sweep
max(accuracy)
# accuracy = 0.9797

0.9797

Code

# number of trees that achieved the best accuracy: `n` and `accuracy` are
# parallel lists, so find the position of the maximum in `accuracy` and read
# the matching entry of `n` (the first one if several sizes tie)
n[accuracy.index(max(accuracy))]
# originally noted n = 260; in the run logged above the 0.9797 maximum is
# first reached at n = 450 (results vary between runs)

Code

# Trial 2
# Using grid search to find the optimal parameters

Code

# Random forest with grid search for parameter tuning
rfc = RandomForestClassifier()
# candidate values to try for each hyperparameter
# (max_depth=None places no limit on tree depth)
parameters = dict(
    n_estimators=[5, 10, 50, 100, 250],
    max_depth=[2, 4, 8, 16, 32, None],
)

Code

# cross-validated (cv=5) search over every combination in `parameters`,
# fitted on the training split only
cv = GridSearchCV(estimator=rfc, param_grid=parameters, cv=5)
cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 8, 16, 32, None],
                         'n_estimators': [5, 10, 50, 100, 250]})

Code

 # The best mean cross-validated score found by the grid search
print(cv.best_score_)

0.977997852463188

Code

 # The parameters that yield the best score
print(cv.best_params_)

{'max_depth': None, 'n_estimators': 250}

Questions: - What is the best n? - How does random forest prevent overfitting? Via bootstrap samples and bagging. - What is RF bad at? It can be a black box and is not so good at regression. - What is RF good at?

Observations: * Quick to compute (the average computing time for an RF across the different numbers of trees and depths is less than a minute) * A different value after each run, though the values are close * Highest accuracy (0.9799) with 250-260 trees and no max depth (20 min) * 3 h for the 1-1000 trees sweep, with a max accuracy of 0.9797 * RF doesn't assume any distribution

Code

# Trial 3
# Trying different set of columns

Code

# read the dataset
df = pd.read_csv('./star_classification 2.csv')

# encode the class labels as integers
df.replace({'class': {'GALAXY': 0, 'STAR': 1, 'QSO':2}}, inplace=True)

# remove unneeded columns (a narrower selection than the 'ID$' filter of the
# earlier trials)
cleaned = df.drop(['spec_obj_ID','run_ID', 'field_ID', 'plate', 'MJD'], axis=1)

# renumber the rows; drop=True discards the old index instead of inserting it
# as an extra 'index' column, which would otherwise end up in X as a feature
cleaned = cleaned.reset_index(drop=True)

# make the X and y variables
X = cleaned.drop('class', axis=1)
y = cleaned['class']

# split the dataset: 70% train / 30% test, seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

Code

# Random forest with grid search for parameter tuning
rfc = RandomForestClassifier()
# candidate values to try for each hyperparameter
# (max_depth=None places no limit on tree depth)
parameters = dict(
    n_estimators=[5, 10, 50, 100, 250],
    max_depth=[2, 4, 8, 16, 32, None],
)

Code

# cross-validated (cv=5) search over every combination in `parameters`,
# fitted on the training split only
cv = GridSearchCV(estimator=rfc, param_grid=parameters, cv=5)
cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 8, 16, 32, None],
                         'n_estimators': [5, 10, 50, 100, 250]})

Code

 # The best mean cross-validated score found by the grid search
print(cv.best_score_)

0.9767428571428571

Code

 # The parameters that yield the best score
print(cv.best_params_)

{'max_depth': None, 'n_estimators': 100}