Feature Engineering

Code
# Helper packages
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# import plotnine
from plotnine import ggplot, aes, geom_density, geom_line, geom_point, ggtitle

import seaborn as sns
# Modeling process
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
import sklearn.metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE  # oversampling to balance the classes
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from mlxtend.feature_selection import SequentialFeatureSelector as SFS  # the class, not the module

# deep learning functionality
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers
from tensorflow.keras import layers
Code
# read the dataset
df = pd.read_csv("..\data\star_classification.csv")

# encode values for class column
df.replace({'class': {'GALAXY': 0, 'STAR': 1, 'QSO':2}}, inplace=True)
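
SMOTE is applied further below to balance the classes, so it is worth confirming the imbalance first; a quick check:

Code
# inspect the class distribution that motivates the SMOTE oversampling later on
df['class'].value_counts()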
Code
f,ax = plt.subplots(figsize=(12,8))
sns.heatmap(df.corr(), cmap="PuBu", annot=True, linewidths=0.5, fmt= '.2f',ax=ax)
plt.show()

Code
df.corr()["class"].sort_values()
field_ID      -0.038044
u             -0.017701
g             -0.005915
run_ID        -0.000049
obj_ID        -0.000047
alpha          0.004552
cam_col        0.014476
z              0.017352
fiber_ID       0.032053
delta          0.056643
r              0.150691
MJD            0.207262
spec_obj_ID    0.215722
plate          0.215722
i              0.284396
redshift       0.536822
class          1.000000
rerun_ID            NaN
Name: class, dtype: float64
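
The NaN correlation for rerun_ID indicates a zero-variance (constant) column, which is why it is dropped in the next cell; a quick sanity check:

Code
# a constant column has zero variance, so every correlation with it is NaN
df['rerun_ID'].nunique()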
Code

cleaned = df.drop(['obj_ID','run_ID','rerun_ID',"alpha"], axis = 1)

# drop the one anomalous row (extreme sentinel values in the photometric bands)
cleaned = cleaned.drop(79543)
# make the X and y variables
X = cleaned.drop('class', axis=1)
y = cleaned['class']
display(cleaned)


sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
delta u g r i z cam_col field_ID spec_obj_ID class redshift plate MJD fiber_ID
0 32.494632 23.87882 22.27530 20.39501 19.16573 18.79371 2 79 6.543777e+18 0 0.634794 5812 56354 171
1 31.274185 24.77759 22.83188 22.58444 21.16812 21.61427 5 119 1.176014e+19 0 0.779136 10445 58158 427
2 35.582444 25.26307 22.66389 20.60976 19.34857 18.94827 2 120 5.152200e+18 0 0.644195 4576 55592 299
3 -0.402828 22.13682 23.77656 21.61162 20.50454 19.25010 3 214 1.030107e+19 0 0.932346 9149 58039 775
4 21.183866 19.43718 17.58028 16.49747 15.97711 15.54461 3 137 6.891865e+18 0 0.116123 6121 56187 842
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
99995 -2.594074 22.16759 22.97586 21.90404 21.30548 20.73569 2 581 1.055431e+19 0 0.000000 9374 57749 438
99996 19.798874 22.69118 22.38628 20.45003 19.75759 19.41526 1 289 8.586351e+18 0 0.404895 7626 56934 866
99997 15.700707 21.16916 19.26997 18.20428 17.69034 17.35221 4 308 3.112008e+18 0 0.143366 2764 54535 74
99998 46.660365 25.35039 21.63757 19.91386 19.07254 18.62482 4 131 7.601080e+18 0 0.455040 6751 56368 470
99999 49.464643 22.62171 21.79745 20.60115 20.00959 19.28075 4 60 8.343152e+18 0 0.542944 7410 57104 851

99999 rows × 14 columns
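
Note that SMOTE is applied to the full dataset before the train/test split, so synthetic points derived from test-set neighbours can leak into training. A leakage-safe ordering, sketched under the same variable names, is to split first and resample only the training fold:

Code
# safer alternative (sketch): split first, then oversample the training data only
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)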

Pre-processing

Code
# Normalizing approach
yj = PowerTransformer(method="yeo-johnson")
scaler = StandardScaler()
# nzv_encoder = VarianceThreshold(threshold=0.1)
# pca = PCA(n_components=7)

# Normalize all numeric features.
# Note: ColumnTransformer applies its transformers in *parallel* and
# concatenates their outputs, so this emits two transformed copies of every
# numeric column (doubling the feature count) rather than chaining
# yeo-johnson into standardization.
preprocessor = ColumnTransformer([("norm", yj, selector(dtype_include="number")),
                ("std_encode", scaler, selector(dtype_include="number")),
                # ("nzv_encoder", nzv_encoder, selector(dtype_include="number")),
                # ("pca_encode", pca, selector(dtype_include="number"))
                ])
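
If the intent is to power-transform and then standardize the same columns, the two steps should be chained in a Pipeline instead; a minimal sketch (this is effectively what the model pipelines below do):

Code
# sequential alternative (sketch): yeo-johnson followed by standardization
sequential_preprocessor = Pipeline(steps=[
    ("norm", PowerTransformer(method="yeo-johnson")),
    ("std_encode", StandardScaler()),
])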

Random Forest Classifier

Code

#best training data for Random Forest Classifier
tree_X_train= X_train[['delta', 'u', 'g', 'i',"spec_obj_ID", 'redshift', 'plate']]
tree_X_test= X_test[['delta', 'u', 'g', 'i',"spec_obj_ID", 'redshift', 'plate']]


#creating the Random Forest Classifier model with the pre-processing steps
# note: the yj and scaler instances are shared across every pipeline in this
# notebook, so each pipeline refits them; evaluate one pipeline before fitting the next
r_forest = RandomForestClassifier()
r_forest_pipeline = Pipeline(steps=[
  ("norm", yj),
  ("std_encode", scaler),
  ("r_forest", r_forest),
])


# training the model
r_forest_pipeline.fit(tree_X_train, y_train)
predicted = r_forest_pipeline.predict(tree_X_test)
# score() already returns mean accuracy, so no extra averaging is needed
r_forest_pipeline_score = r_forest_pipeline.score(tree_X_test, y_test)
r_forest_pipeline_score
0.9825049999065438
Code
print(classification_report(y_test,r_forest_pipeline.predict(tree_X_test)))
              precision    recall  f1-score   support

           0       0.97      0.98      0.97     17862
           1       1.00      1.00      1.00     17804
           2       0.98      0.97      0.98     17835

    accuracy                           0.98     53501
   macro avg       0.98      0.98      0.98     53501
weighted avg       0.98      0.98      0.98     53501
Code
# Pitfall: calling the bare estimator bypasses the pipeline's preprocessing,
# hence the feature-name warning and the unreliable (all-QSO) predictions below
r_forest.predict(tree_X_test)
UserWarning: X has feature names, but RandomForestClassifier was fitted without feature names
array([2, 2, 2, ..., 2, 2, 2], dtype=int64)
Code
# define scoring metric
scoring = 'accuracy'

# create 5-fold CV object
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# fit model with 5-fold CV (run here on the held-out test split)
results = cross_val_score(r_forest_pipeline, X_test, y_test, cv=kfold, scoring=scoring)
results.mean()
Code
#feature selection via sequential forward selection
# k_features is capped at the number of available feature columns
tsfs = SFS(r_forest_pipeline, k_features=X.shape[1], scoring=scoring, cv=kfold)
tsfs.fit(X, y)
tsfs.subsets_


#best output
#  7: {'feature_idx': (2, 3, 4, 6, 12, 13, 14),
#   'cv_scores': array([0.98320576, 0.98357025, 0.9836824 , 0.98457958, 0.98340202]),
#   'avg_score': 0.9836880029158606,
#   'feature_names': ('delta','u','g','i','spec_obj_ID','redshift','plate')},
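
A cheaper, model-specific look at feature relevance (a sketch, not the selection method used above) is the forest's own impurity-based importances, read from the fitted pipeline:

Code
# impurity-based importances from the fitted random forest (sketch)
importances = r_forest_pipeline.named_steps["r_forest"].feature_importances_
pd.Series(importances, index=tree_X_train.columns).sort_values(ascending=False)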
Code
# Create grid of hyperparameter values

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

hyper_grid = {'r_forest__n_estimators': n_estimators,
               'r_forest__max_depth': max_depth,
               'r_forest__min_samples_split': min_samples_split,
               'r_forest__min_samples_leaf': min_samples_leaf,
               'r_forest__bootstrap': bootstrap}


# exhaustive search over 2,160 parameter combinations; restricted to three
# strong features to keep the search tractable
grid_search = GridSearchCV(r_forest_pipeline, hyper_grid, cv=kfold, scoring=scoring)
results = grid_search.fit(X_train[["g","i","redshift"]], y_train)
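
Given the size of the grid, a randomized search is a common cheaper alternative; a minimal sketch over the same hyper_grid (n_iter=50 is an assumption, not a tuned value):

Code
from sklearn.model_selection import RandomizedSearchCV

# sample 50 random combinations instead of exhausting all 2,160 (sketch)
random_search = RandomizedSearchCV(r_forest_pipeline, hyper_grid, n_iter=50,
                                   cv=kfold, scoring=scoring, random_state=123)
# random_search.fit(X_train[["g","i","redshift"]], y_train)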

SVM

Code

#best training data for the SVM
#??


#creating the SVM model
svm_clf = svm.SVC(kernel='rbf', C=2, random_state=0)
model_pipeline = Pipeline(steps=[
  ("norm", yj),
  ("std_encode", scaler),
  ("svm", svm_clf),
])


#training the SVM model (commented out: SVC is expensive to refit on a sample this large)
# model_pipeline.fit(X_train,y_train)
# predicted = model_pipeline.predict(X_test)
# score = model_pipeline.score(X_test,y_test)
# model_pipeline_score = np.mean(score)
# model_pipeline_score
# earlier run: 0.977
0.973589278705071
Code
# define scoring metric
scoring = 'accuracy'

# create 5-fold CV object
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# fit model with 5-fold CV
# results = cross_val_score(model_pipeline, X_test, y_test, cv=kfold, scoring=scoring)
# results.mean()

#feature selection (k_features capped at the number of available feature columns)
svmsfs = SFS(model_pipeline,
    k_features=X.shape[1],
    scoring=scoring,
    cv=kfold)
svmsfs.fit(X, y)
svmsfs.subsets_

Logistic Regression

Code
#best training data for Logistic Regression
log_X_train=X_train[["delta","u","g","r","cam_col","field_ID","spec_obj_ID","redshift","plate","MJD","fiber_ID"]]
log_X_test=X_test[["delta","u","g","r","cam_col","field_ID","spec_obj_ID","redshift","plate","MJD","fiber_ID"]]

#creating Logistic Regression model
log_reg=LogisticRegression(max_iter=1000,C=4714.85,penalty="l2")
log_reg_pipeline = Pipeline(steps=[
  ("norm", yj),
  ("std_encode",scaler),
  ("log_reg", log_reg),
])

#training the Logistic Regression model
log_reg_pipeline.fit(log_X_train,y_train)
predicted = log_reg_pipeline.predict(log_X_test)
# score() already returns mean accuracy, so no extra averaging is needed
log_reg_pipeline_score = log_reg_pipeline.score(log_X_test,y_test)
log_reg_pipeline_score
0.9682062017532382
Code
# define scoring function
scoring = 'accuracy'

# create 5-fold CV object
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

#  fit model with 5-fold CV
results = cross_val_score(log_reg_pipeline, X_test, y_test, cv=kfold, scoring=scoring)
results.mean()
0.9693089963642143
Code
# Create grid of hyperparameter values
C = np.logspace(-4, 4, 50)
penalty = ['l1', 'l2']

hyper_grid = {'log_reg__C': C,
        'log_reg__penalty':penalty,
        }

# Tune the Logistic Regression model using grid search
# note: the default lbfgs solver only supports the 'l2' penalty; searching
# 'l1' requires a compatible solver such as 'liblinear' or 'saga'
grid_search = GridSearchCV(log_reg_pipeline, hyper_grid, cv=kfold, scoring=scoring)
results = grid_search.fit(X_train, y_train)

results.best_params_
#best output
#C=4714.85
#penalty="l2"
Code
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

#feature selection (k_features capped at the number of available feature columns)
sfs=SFS(log_reg_pipeline,
    k_features=X.shape[1],
    scoring=scoring,
    cv=kfold,
    forward=True
    )
    
sfs.fit(X,y)
sfs.subsets_


#best output
#  13: {'feature_idx': ( 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13),
#   'cv_scores': array([0.96859842, 0.96991617, 0.96851431, 0.97128999, 0.9696358 ]),
#   'avg_score': 0.9695909384024448,
#   'feature_names': ('delta','u','g','r','i','cam_col','field_ID','spec_obj_ID','redshift','plate','MJD','fiber_ID')}}
Code
print(classification_report(y_test,log_reg_pipeline.predict(log_X_test)))
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     17862
           1       0.99      1.00      1.00     17804
           2       0.97      0.94      0.96     17835

    accuracy                           0.97     53501
   macro avg       0.97      0.97      0.97     53501
weighted avg       0.97      0.97      0.97     53501

K-Nearest Neighbors

Code
#best training data for K-Nearest Neighbors
knn_X_train=X_train[['g', 'r', 'i', 'z', 'redshift', 'plate']]
knn_X_test=X_test[['g', 'r', 'i', 'z', 'redshift', 'plate']]


#creating the Knn model
knn=KNeighborsClassifier(n_neighbors=3)
knn_pipeline = Pipeline(steps=[
  ("norm", yj),
  ("std_encode",scaler),
  ("knn", knn),
])

#training the KNN model
knn_pipeline.fit(knn_X_train,y_train)
predicted = knn_pipeline.predict(knn_X_test)
# score() already returns mean accuracy, so no extra averaging is needed
knn_pipeline_score = knn_pipeline.score(knn_X_test,y_test)
knn_pipeline_score
0.9730285415225883
Code
# define scoring metric
scoring = 'accuracy'

# create 5-fold CV object
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# fit model with 5-fold CV (using the KNN pipeline, not the SVM one)
results = cross_val_score(knn_pipeline, X_test, y_test, cv=kfold, scoring=scoring)
results.mean()
Code
# hyperparameter grid for CV (keys must be prefixed with the pipeline step name)
hyper_params = {
    'knn__n_neighbors': range(1, 10+1)
}

# Tune `knn` using grid search on the KNN feature subset
grid_search = GridSearchCV(knn_pipeline, hyper_params, cv=kfold, scoring='accuracy')
grid_results = grid_search.fit(knn_X_train, y_train)

#best output
# 3
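
To see how accuracy varies with k rather than just the single best value, the cross-validation results can be plotted; a minimal sketch using the fitted grid search:

Code
# mean CV accuracy for each candidate k (sketch)
ks = list(grid_results.cv_results_['param_knn__n_neighbors'])
plt.plot(ks, grid_results.cv_results_['mean_test_score'], marker='o')
plt.xlabel('n_neighbors')
plt.ylabel('mean CV accuracy')
plt.show()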
Code
#feature selection
sfs=SFS(knn_pipeline,
    k_features=13,
    scoring=scoring,
    cv=kfold)
    
sfs.fit(X,y)
sfs.subsets_

#best output
#  {'feature_idx': (4, 5, 6, 7, 13, 14),
#   'cv_scores': array([0.97300025, 0.97330866, 0.97364511, 0.975075  , 0.97473855]),
#   'avg_score': 0.9739535144531359,
#   'feature_names': ('g', 'r', 'i', 'z', 'redshift', 'plate')}, 
Code
print(classification_report(y_test,knn_pipeline.predict(knn_X_test)))
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     17862
           1       0.98      1.00      0.99     17804
           2       0.98      0.97      0.97     17835

    accuracy                           0.97     53501
   macro avg       0.97      0.97      0.97     53501
weighted avg       0.97      0.97      0.97     53501

Deep Learning

Code

# cleaned = df.drop(['obj_ID','alpha','delta','run_ID','rerun_ID','cam_col','field_ID','fiber_ID'], axis = 1)
cleaned = df.drop(['u','r','i','z','obj_ID','spec_obj_ID','MJD'], axis = 1)

# cleaned = df.drop(df.filter(regex='ID$').columns, axis=1)
# drop the date column
# cleaned = cleaned.drop(["MJD","plate","cam_col"], axis=1)
# drop the one anomalous row by its index
cleaned = cleaned.drop(79543)
# make the X and y variables
X = cleaned.drop('class', axis=1)
y = cleaned['class']
display(cleaned)


sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

from tensorflow.keras import utils
# one-hot encoding is unnecessary here: the sparse_categorical_crossentropy
# loss used below expects integer class labels
# y = utils.to_categorical(y)
# y_train = utils.to_categorical(y_train)
# y_test = utils.to_categorical(y_test)
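
For reference, the one-hot route would pair to_categorical with the non-sparse loss; a minimal sketch of that alternative:

Code
# one-hot alternative (sketch): categorical_crossentropy expects one-hot targets
y_train_oh = utils.to_categorical(y_train, num_classes=3)
y_test_oh = utils.to_categorical(y_test, num_classes=3)
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics='accuracy')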
Code

# define the keras model
# input_dim=20 because the ColumnTransformer above emits two transformed
# copies of each of the 10 remaining features
model = Sequential()
model.add(Dense(units=64, input_dim=20, activation="tanh"))
model.add(Dense(units=64, activation="tanh"))
model.add(Dense(units=32, activation="tanh"))
model.add(Dense(units=3, activation='softmax'))

# compile the keras model
model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer="rmsprop",
    metrics='accuracy'
)
# fit the model through the preprocessing pipeline
# model.fit(X, y, epochs=20, validation_split=0.2)
model_pipeline = Pipeline(steps=[
  ("preprocessor", preprocessor),
  ("model", model),
])
# Pipeline forwards `model__*` keyword arguments to the final step's fit()
m1 = model_pipeline.fit(X_train, y_train, model__epochs=20, model__validation_split=0.2, model__batch_size=32)
Code
# the network outputs class probabilities; take the argmax to get labels
predicted = m1.predict(X_test)
y_classes = predicted.argmax(axis=-1)

# model_pipeline.transform(X_test)
print(classification_report(y_test, y_classes)) 
Code
# define scoring metric
scoring = 'accuracy'

# create 5-fold CV object
kfold = KFold(n_splits=5, random_state=123, shuffle=True)

# fit model with 5-fold CV
# note: cross_val_score must clone its estimator, which a raw Keras model in a
# Pipeline does not support; see the wrapper sketch below
results = cross_val_score(model_pipeline, X_train, y_train, cv=kfold, scoring=scoring)
results
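
cross_val_score clones its estimator, which a raw Keras model does not support; a minimal sketch of the usual fix, assuming the scikeras package is installed (build_model is a hypothetical builder wrapping the architecture above):

Code
from scikeras.wrappers import KerasClassifier

# hypothetical builder so scikit-learn can clone and refit the network (sketch)
def build_model():
    net = Sequential()
    net.add(Dense(units=64, input_dim=20, activation="tanh"))
    net.add(Dense(units=64, activation="tanh"))
    net.add(Dense(units=32, activation="tanh"))
    net.add(Dense(units=3, activation="softmax"))
    net.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics='accuracy')
    return net

wrapped = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", KerasClassifier(model=build_model, epochs=20, batch_size=32, verbose=0)),
])
# results = cross_val_score(wrapped, X_train, y_train, cv=kfold, scoring=scoring)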

K-Means Clustering (discarded approach)

Code
# KMeans was imported directly above; the bare `sklearn.cluster` module was not
model = KMeans(n_clusters=3, random_state=123)
model
Code
# per-class centroids: the column-wise mean of each class's rows
galaxy = cleaned[cleaned["class"]==0].drop("class", axis=1)
galaxy_centers = galaxy.mean().to_numpy()
galaxy_centers


STAR = cleaned[cleaned["class"]==1].drop("class", axis=1)
STAR_centers = STAR.mean().to_numpy()
STAR_centers



QSO = cleaned[cleaned["class"]==2].drop("class", axis=1)
QSO_centers = QSO.mean().to_numpy()
QSO_centers
Code
# fit k-means on just the three class centroids; note that the cluster labels
# k-means assigns are arbitrary and need not match the class codes 0/1/2
m1 = model.fit([galaxy_centers, STAR_centers, QSO_centers])
m1.labels_
Code
# caution: this report is only meaningful if the arbitrary cluster labels
# happen to align with the class encoding; see the alignment sketch below
pred = m1.predict(X_test)
print(classification_report(y_test, pred))
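
Because k-means labels are arbitrary, they should be mapped to the true classes before scoring; a minimal sketch that aligns each cluster with the true class it most often contains (the mapping may collapse clusters, so this is diagnostic only):

Code
# map each cluster to its majority true class via the confusion matrix (sketch)
cm = confusion_matrix(y_test, pred)
mapping = cm.argmax(axis=0)          # cluster j -> most frequent true class
aligned = np.array([mapping[p] for p in pred])
print(classification_report(y_test, aligned))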