Decision Tree

Code
import autosklearn.classification
import sklearn
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings
import graphviz
import pandas as pd
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.sklearn_decision_trees import ShadowSKDTree
from dtreeviz import trees
from sklearn import tree
import  matplotlib.pyplot  as  plt
from matplotlib.pyplot import figure
from PIL import Image
from IPython.display import display, HTML, SVG
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from plotnine import *
from sklearn.inspection import plot_partial_dependence, partial_dependence
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
import seaborn as sns
from sklearn.pipeline import Pipeline
import pickle
%matplotlib inline
Code
!pip install TPOT
Code
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tpot.config import classifier_config_dict
Code
# NOTE(review): `digits` is not defined in any visible cell -- this looks like
# a leftover from a digits-dataset example (presumably
# sklearn.datasets.load_digits(); confirm or delete this cell). The resulting
# split is overwritten by the later train_test_split on X, y.
# The call was missing its closing parenthesis, which made the cell a
# syntax error.
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25, random_state=42)

Don’t include the following identifier and bookkeeping columns as features:

* Plate ID, which identifies each plate in SDSS
* Unique ID used for optical spectroscopic objects (this means that two different observations with the same spec_obj_ID must share the output class)
* Field number, used to identify each field
* Camera column, used to identify the scanline within the run
* Rerun number, used to specify how the image was processed
* Run number, used to identify the specific scan
* Object identifier, the unique value that identifies the object in the image catalog used by the CAS
* MJD (Modified Julian Date), used to indicate when a given piece of SDSS data was taken

Code
# Load the star-classification table.
df = pd.read_csv("star_classification.csv")

# Feature columns are listed by hand: every ID-type column and the target
# column "class" are left out.
features = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift']
X = df[features]

# Encode the string class labels as integers. The fitted encoder is kept in
# `le` so the integer codes can be mapped back to names via `le.classes_`.
le = LabelEncoder()
y = le.fit_transform(df['class'])
Code
# Search space handed to TPOT: only decision-tree classifiers, with the
# hyper-parameter grids listed below.
tpot_config = {}
tpot_config['sklearn.tree.DecisionTreeClassifier'] = dict(
    criterion=["gini", "entropy"],
    max_depth=range(1, 11),
    min_samples_split=range(2, 21),
    min_samples_leaf=range(1, 21),
)
Code
# Add TPOT's FeatureSetSelector to the search space.
# NOTE(review): the subset CSV below lives under TPOT's own tests/ directory;
# confirm that the feature sets it defines actually match this dataset's
# columns, otherwise this operator is presumably of no use here.
tpot_config['tpot.builtins.FeatureSetSelector'] = dict(
    subset_list=['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],
    # Select only one feature set: a list of indices into the subsets of the
    # CSV above. To select two feature sets instead, the alternative was
    # 'sel_subset': list(combinations(range(3), 2)).
    sel_subset=[0, 1],
)
Code
# 75 % / 25 % hold-out split of the encoded data, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)
Code
# TPOT search restricted to the operators in tpot_config, seeded for
# repeatability and given a 5-minute time limit (max_time_mins).
tpot = TPOTClassifier(
    config_dict=tpot_config,
    generations=5,
    population_size=50,
    max_time_mins=5,
    random_state=42,
    verbosity=2,
)
Code
# Run the pipeline search on the training split only; the test split is
# held back for scoring.
tpot.fit(X_train, y_train)

5.01 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.
WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: DecisionTreeClassifier(DecisionTreeClassifier(input_matrix, criterion=entropy, max_depth=10, min_samples_leaf=19, min_samples_split=18), criterion=entropy, max_depth=3, min_samples_leaf=5, min_samples_split=10)
TPOTClassifier(config_dict={'sklearn.tree.DecisionTreeClassifier': {'criterion': ['gini',
                                                                                  'entropy'],
                                                                    'max_depth': range(1, 11),
                                                                    'min_samples_leaf': range(1, 21),
                                                                    'min_samples_split': range(2, 21)},
                            'tpot.builtins.FeatureSetSelector': {'sel_subset': [0,
                                                                                1],
                                                                 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv']}},
               generations=5, max_time_mins=5, population_size=50,
               random_state=42, verbosity=2)
Code
# Score of the best pipeline found by TPOT on the held-out test split.
print(tpot.score(X_test, y_test))
0.97568
Code
# Write the best pipeline out as a Python script.
tpot.export("dt_pipeline.py")
Code
#run pipeline
Code
# Hand-built copy of the final DecisionTreeClassifier stage of TPOT's best
# pipeline. The inner depth-10 tree that the best pipeline stacked underneath
# it is not reproduced here, so scores can differ from tpot.score().
# random_state is pinned, like the split and the TPOT search, so that
# refitting gives the same tree.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, min_samples_leaf=5,
                             min_samples_split=10, random_state=42)
Code
# Fit on the training split, then predict the held-out split in one chain
# (fit() returns the fitted estimator itself).
y_pred = clf.fit(X_train, y_train).predict(X_test)
Code
# Accuracy of the hand-built tree on the held-out test split.
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_pred))
Accuracy score 0.95088
Code
# Per-class precision / recall / F1. The class labels shown are the integer
# codes produced by the LabelEncoder, not the original class names.
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     14895
           1       0.95      0.79      0.86      4769
           2       1.00      1.00      1.00      5336

    accuracy                           0.95     25000
   macro avg       0.96      0.93      0.94     25000
weighted avg       0.95      0.95      0.95     25000
Code
# y_pred is passed first, so rows are predicted labels and columns are true
# labels (matching the axis labels set below).
matrix = confusion_matrix(y_pred, y_test)

# Row-normalise: each row is divided by its own total. keepdims=True keeps the
# totals as a column vector; without it the totals broadcast across columns
# and entry [i, j] is divided by row j's total instead of row i's.
# Integer / integer is already true division in NumPy, so no cast through the
# removed `np.float` alias is needed.
matrix = matrix / matrix.sum(axis=1, keepdims=True)

# Tick labels must follow the LabelEncoder's integer codes; le.classes_ holds
# the original names in code order (sorted, i.e. not GALAXY/STAR/QSO).
class_names = list(le.classes_)
cm = sns.heatmap(matrix, square=True, annot=True, cbar=False,
                 xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Truth')
plt.ylabel('Predicted')
plt.title("")
plt.show()
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  

Code
#initialize shadow tree
# Class names are taken from the fitted LabelEncoder so that they line up
# with the integer codes in y; a hand-written GALAXY/STAR/QSO list does not
# match the encoder's sorted order and mislabels two of the classes.
# NOTE(review): the shadow tree is given the full X, y, while clf was fitted
# on X_train only -- confirm that is intended.
sk_dtree = ShadowSKDTree(clf, X, y, features, "class", list(le.classes_))
Code
# dtreeviz leaf-samples plot for the shadow tree (presumably the number of
# samples that end up in each leaf -- confirm against the dtreeviz docs).
trees.viz_leaf_samples(sk_dtree)
[WARNING] [2022-08-09 23:43:03,900:matplotlib.font_manager] findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.

Code
# Draw the whole tree with dtreeviz's default settings.
trees.dtreeviz(sk_dtree)
[WARNING] [2022-08-09 23:43:50,124:matplotlib.font_manager] findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
[WARNING] [2022-08-09 23:43:50,353:matplotlib.font_manager] findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
[WARNING] [2022-08-09 23:43:50,369:matplotlib.font_manager] findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py:3208: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  return asarray(a).size
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))
[WARNING] [2022-08-09 23:43:51,615:matplotlib.font_manager] findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.

Code
# Draw the whole tree with fancy=False (presumably a simpler node style
# without the per-node plots -- confirm against the dtreeviz docs).
trees.dtreeviz(sk_dtree, fancy=False)

Code
# Draw only the decision path taken by a single observation: the row at
# position 10 of X.
trees.dtreeviz(sk_dtree, show_just_path=True, X = X.iloc[10])
/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py:3208: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  return asarray(a).size
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  X = np.atleast_1d(X.T if isinstance(X, np.ndarray) else np.asarray(X))

Code
# dtreeviz leaf-criterion plot, called on the fitted sklearn tree directly
# rather than on the shadow tree (presumably the split-criterion value of
# each leaf -- confirm against the dtreeviz docs).
trees.viz_leaf_criterion(clf)

Code
# Summary statistics (count, mean, std, quartiles, ...) of the feature values
# of the samples associated with node 10.
trees.describe_node_sample(sk_dtree, node_id=10)
alpha delta u g r i z redshift
count 51935.000000 51935.000000 51935.000000 51935.000000 51935.000000 51935.000000 51935.000000 51935.000000
mean 181.711450 24.074552 22.346448 20.536128 19.210662 18.515720 18.135271 0.355549
std 91.416751 19.351312 2.327826 2.027503 1.723310 1.536114 1.502448 0.203380
min 0.005528 -12.364701 13.897990 12.679020 11.746640 11.299560 10.897380 0.004285
25% 134.238282 6.283251 20.344025 18.611765 17.660465 17.226380 16.926995 0.145027
50% 187.270126 22.799320 22.584700 21.159340 19.592490 18.855040 18.436680 0.387110
75% 231.975632 38.773742 24.060300 22.144885 20.597755 19.655370 19.214630 0.534749
max 359.994125 74.459854 29.325650 29.862580 29.571860 29.889210 29.383740 0.685050
Code
# Single observation (row at position 10 of X) used for the path
# explanations below; the bare name echoes it in the notebook output.
pred_path = X.iloc[10]
pred_path
alpha       328.092076
delta        18.220310
u            25.771630
g            22.520420
r            20.638840
i            19.780710
z            19.057650
redshift      0.459596
Name: 10, dtype: float64
Code
# Text description of the feature ranges along the decision path that
# pred_path follows through clf.
print(trees.explain_prediction_path(clf, pred_path, feature_names=features, explanation_type="plain_english"))
0.0 <= redshift  < 0.69
Code
# Same observation with explanation_type="sklearn_default"; this variant
# returns a matplotlib Axes (a plot) rather than text.
trees.explain_prediction_path(clf, pred_path, feature_names=features, explanation_type="sklearn_default")
<matplotlib.axes._subplots.AxesSubplot at 0x7f5a40719c90>