KNN & XGB

Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams["figure.dpi"] = 170
plt.style.use("seaborn")

import xgboost as xgb
from sklearn.model_selection import (
    train_test_split,
    KFold,
    GridSearchCV,
    cross_val_score,
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

# import warnings filter
from warnings import simplefilter

# ignore all future warnings
simplefilter(action="ignore", category=FutureWarning)
Code
# read the dataset
df = pd.read_csv('../data/star_classification.csv')

# encode values for `class` column
df.replace({'class': {'GALAXY': 0, 'STAR': 1, 'QSO': 2}}, inplace=True)

# drop all ID columns (names ending in `ID`)
cleaned = df.drop(df.filter(regex='ID$').columns, axis=1)
# drop the date column
cleaned = cleaned.drop('MJD', axis=1)

# make the X and y variables
X = cleaned.drop('class', axis=1)
y = cleaned['class']

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123,
    stratify=y,
)
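
The split is stratified on `class`, so both splits should keep the same class mix. A quick sanity check (not part of the original run):

Code
# stratify=y should preserve the class proportions in both splits
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))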

Basic KNN with 5-Fold CV

Code
scaler = StandardScaler()
# standardize all columns
X_train_std = scaler.fit_transform(X_train)

# create KNN model
knn = KNeighborsClassifier(n_neighbors=5)

# create repeated stratified 5-fold CV (10 repeats by default)
kfold = RepeatedStratifiedKFold(n_splits=5, random_state=123)

# evaluate the model with cross-validation
results = cross_val_score(knn, X_train_std, y_train, cv=kfold, scoring='accuracy')

# accuracy of the best single fold
print(max(results))
0.9341875
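
Note that `max` reports only the luckiest fold. With 5 splits × 10 repeats there are 50 scores, and the mean with its spread is the steadier summary; a small sketch:

Code
# summarize all 50 fold scores instead of the single best one
print(f"{results.mean():.4f} +/- {results.std():.4f}")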

Tuning the KNN Model

Code
# hyperparameter grid for the search
hyper_params = {
    'n_neighbors': range(1, 10+1)
}

scaler = StandardScaler()
# standardize all columns
X_train_std = scaler.fit_transform(X_train)

# create KNN model
knn = KNeighborsClassifier()

# create repeated stratified 5-fold CV (10 repeats by default)
kfold = RepeatedStratifiedKFold(n_splits=5, random_state=123)

# Tune `knn` using grid search
grid_search = GridSearchCV(knn, hyper_params, cv=kfold, scoring='accuracy')
grid_results = grid_search.fit(X_train_std, y_train)
Code
# get the best accuracy achieved
print("Best accuracy", grid_results.best_score_)
print("Best K value", grid_results.best_estimator_.get_params()['n_neighbors'])
Best accuracy 0.9297842857142856
Best K value 3
Code
plt.plot(hyper_params['n_neighbors'], grid_search.cv_results_['mean_test_score'])
plt.title('Cross Validated Grid Search Results')
plt.show()
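
Since GridSearchCV refits the best estimator on the full training set by default, we can score it on the held-out split as a quick sanity check (not in the original run); the test features go through the scaler fitted on the training data:

Code
# scale the test split with the training-set scaler, then score the refit model
X_test_std = scaler.transform(X_test)
print(grid_results.best_estimator_.score(X_test_std, y_test))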

Feature Interpretation

Code
%%capture

# fit the KNN with the best K value
knn_best = KNeighborsClassifier(n_neighbors=3)
knn_best_fit = knn_best.fit(X_train_std, y_train)

r = permutation_importance(
    knn_best_fit,
    X_train_std,
    y_train,
    n_repeats=5,
    random_state=123,
    n_jobs=-1,
)
Code
feat = pd.DataFrame({
    'feature': X_train.columns,
    'importance': r.importances_mean,
}).sort_values('importance')

plt.scatter(data=feat, x='importance', y='feature')
plt.xlabel("Importance")
plt.title('Feature Interpretation')
plt.show()
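
permutation_importance also reports the spread across the shuffling repeats via `importances_std`; if you want error bars on the plot above, an optional variant:

Code
# same plot, with the std across the 5 repeats as horizontal error bars
feat_std = pd.DataFrame({
    'feature': X_train.columns,
    'importance': r.importances_mean,
    'std': r.importances_std,
}).sort_values('importance')

plt.errorbar(feat_std['importance'], feat_std['feature'], xerr=feat_std['std'], fmt='o')
plt.xlabel("Importance")
plt.title('Feature Interpretation')
plt.show()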

Basic Gradient Boosting Model

Code
# create XGBClassifier model
xgb_model = xgb.XGBClassifier()

# create shuffled 5-fold CV
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# evaluate the model with cross-validation
results = cross_val_score(xgb_model, X_train, y_train, cv=kfold, scoring='accuracy')

# accuracy of the best single fold
print(max(results))
0.9791428571428571

With Randomized Searching

Code
param_distributions = {
    'n_estimators': [5000, 5500, 6000],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_depth': [9, 10, 11, 12],
    'min_child_weight': [1, 2, 3]
}

random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions,
    n_iter=15,
    cv=kfold,
    scoring='accuracy',
    random_state=123,
    n_jobs=-1,
)

# trees are insensitive to feature scaling, so search on the raw training set
search_results = random_search.fit(X_train, y_train)
Code
search_results.best_score_, search_results.best_params_
(0.9795,
 {'n_estimators': 6000,
  'min_child_weight': 1,
  'max_depth': 9,
  'learning_rate': 0.01})
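
The winning settings can also be unpacked straight into a fresh estimator; a one-line sketch (the next section spells the same parameters out by hand and adds some column subsampling):

Code
# same hyperparameters via dict unpacking; `tuned` is just an illustrative name
tuned = xgb.XGBClassifier(**search_results.best_params_)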

Best Model & Feature Interpretation

Code
# rebuild the classifier with the best parameters from the search
best_model = xgb.XGBClassifier(
    n_estimators=6000,
    learning_rate=0.01,
    max_depth=9,
    min_child_weight=1,
    subsample=1,
    colsample_bytree=0.75,
    colsample_bylevel=0.75,
    colsample_bynode=0.75
)

best_model_fit = best_model.fit(X_train, y_train)
Code
feat = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model_fit.feature_importances_,
}).sort_values('importance')


plt.scatter(data=feat, x='importance', y='feature')
plt.xlabel("Importance")
plt.title('Feature Interpretation')
plt.show()
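
Finally, as a sanity check not shown in the original run, the tuned model can be scored on the held-out test split; being tree-based, it takes the unscaled features directly:

Code
from sklearn.metrics import classification_report

# hold-out predictions from the refit best model
y_pred = best_model_fit.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['GALAXY', 'STAR', 'QSO']))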