STELLAR ML PROJECT¶
Dataset Description¶
The dataset used in this study is derived from the Sloan Digital Sky Survey (SDSS) DR17, the final release of the SDSS-IV phase. SDSS is one of the largest astronomical surveys, providing optical and near-infrared observations of stars, galaxies, and quasars.
This dataset contains 100,000 spectroscopic observations collected by the 2.5m telescope at Apache Point Observatory. Each record includes 17 numerical and categorical features and one target label indicating the object type — star, galaxy, or quasar. The goal is to classify celestial objects based on their photometric and spectroscopic characteristics.
References:
[1] Sloan Digital Sky Survey (SDSS) DR17 – Official Website
[2] The Seventeenth Data Release of the Sloan Digital Sky Surveys
Feature Description¶
Each row in the dataset represents one celestial object characterized by the following features:
| Feature | Description |
|---|---|
| obj_ID | Unique identifier assigned in the SDSS catalog. Links photometric and spectroscopic data. |
| alpha (RA) | Angular coordinate (in degrees) along the celestial equator. |
| delta (Dec) | Angular coordinate (in degrees) north or south of the celestial equator. |
| u, g, r, i, z | Apparent magnitudes in five SDSS filters — ultraviolet (u), green (g), red (r), near-infrared (i), and infrared (z). |
| run_ID | Imaging run identifier. Useful for calibration. |
| rerun_ID | Specifies image reprocessing or recalibration details. |
| cam_col | Camera column number (1–6) used for imaging. |
| field_ID | Identifies the sky field where the object was imaged. |
| spec_obj_ID | Unique spectroscopic identifier linking to the object’s spectrum. Crucial for determining redshift and class. |
| class | Target label specifying object type — STAR, GALAXY, or QSO (quasar) — determined via spectral template fitting. |
| redshift | Dimensionless measure of wavelength shift (Δλ/λ). Distinguishes stars (≈0), galaxies (0.01–0.5), and quasars (>1). |
| plate | Identifier for the metal plate used in the SDSS spectrograph. |
| MJD | Modified Julian Date of observation. |
| fiber_ID | Fiber number (1–640) corresponding to the position on the spectrographic plate. |
Feature Relevance for Classification¶
1. Spectroscopic Features¶
The redshift and spectral line patterns (accessible via spec_obj_ID) are the primary discriminators:
- Stars: z ≈ 0
- Galaxies: 0.01 < z < 0.5
- Quasars: z > 1
Redshift reflects the wavelength displacement due to cosmic expansion or Doppler effects, making it the most decisive feature for distinguishing extragalactic objects.
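The ranges above suggest an almost rule-based separation; a minimal sketch of such a baseline (the cut values are the approximate ranges quoted in this section, not official SDSS pipeline constants):

```python
def classify_by_redshift(z: float) -> str:
    """Naive baseline using only the approximate redshift ranges above."""
    if z > 1.0:
        return "QSO"     # quasars: large cosmological redshift
    if z > 0.01:
        return "GALAXY"  # galaxies: moderate redshift
    return "STAR"        # stars: z ~ 0 (only small Doppler shifts)

print(classify_by_redshift(0.6348))  # prints "GALAXY"
```

Real objects near the boundaries (e.g., low-redshift quasars) break this rule, which is why the photometric features below still matter.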
2. Photometric Features¶
The u, g, r, i, z magnitudes and their derived color indices capture the continuum shape of the spectral energy distribution (SED) and allow efficient pre-classification even before spectroscopy.
Color indices such as (u−g), (g−r), (r−i), and (i−z) reveal distinct patterns:
- Stars: Small (u−g) and (g−r) values
- Galaxies: Redder colors from older stellar populations and dust extinction
- Quasars: Very blue (u−g) values due to strong ultraviolet excess
3. Positional Context¶
Features like alpha (RA), delta (Dec), and field_ID provide weak spatial priors:
- Low Galactic latitudes → mostly stars
- High Galactic latitudes → more galaxies and quasars
4. Calibration Metadata¶
Features such as run_ID, rerun_ID, plate, and fiber_ID ensure data traceability and calibration consistency.
These do not influence physical classification directly.
import kagglehub
# loading the dataset
path = kagglehub.dataset_download("fedesoriano/stellar-classification-dataset-sdss17")
print("Path to dataset files:", path)
Path to dataset files: C:\Users\hp\.cache\kagglehub\datasets\fedesoriano\stellar-classification-dataset-sdss17\versions\1
IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report
import seaborn as sns
Dataset visualisation and Preprocessing¶
df = pd.read_csv(path + "/star_classification.csv")
df.head()
| obj_ID | alpha | delta | u | g | r | i | z | run_ID | rerun_ID | cam_col | field_ID | spec_obj_ID | class | redshift | plate | MJD | fiber_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.237661e+18 | 135.689107 | 32.494632 | 23.87882 | 22.27530 | 20.39501 | 19.16573 | 18.79371 | 3606 | 301 | 2 | 79 | 6.543777e+18 | GALAXY | 0.634794 | 5812 | 56354 | 171 |
| 1 | 1.237665e+18 | 144.826101 | 31.274185 | 24.77759 | 22.83188 | 22.58444 | 21.16812 | 21.61427 | 4518 | 301 | 5 | 119 | 1.176014e+19 | GALAXY | 0.779136 | 10445 | 58158 | 427 |
| 2 | 1.237661e+18 | 142.188790 | 35.582444 | 25.26307 | 22.66389 | 20.60976 | 19.34857 | 18.94827 | 3606 | 301 | 2 | 120 | 5.152200e+18 | GALAXY | 0.644195 | 4576 | 55592 | 299 |
| 3 | 1.237663e+18 | 338.741038 | -0.402828 | 22.13682 | 23.77656 | 21.61162 | 20.50454 | 19.25010 | 4192 | 301 | 3 | 214 | 1.030107e+19 | GALAXY | 0.932346 | 9149 | 58039 | 775 |
| 4 | 1.237680e+18 | 345.282593 | 21.183866 | 19.43718 | 17.58028 | 16.49747 | 15.97711 | 15.54461 | 8102 | 301 | 3 | 137 | 6.891865e+18 | GALAXY | 0.116123 | 6121 | 56187 | 842 |
# plotting the correlation heat map
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(numeric_only=True),annot=True,cmap='Greens',fmt=".2f")
plt.title("Correlation heat map")
plt.show()
Feature Reduction (feature selection using domain knowledge)
| Column | Reason for Removal |
|---|---|
| obj_ID | Purely an index or key; contains no physical or photometric information about the object. |
| spec_obj_ID | Identifies a measurement instance, not a measurable feature. Including it would introduce meaningless numeric variance. |
| rerun_ID | Used for data provenance and quality tracking; does not reflect any astrophysical property. |
| run_ID | Encodes when and where the observation occurred — not intrinsic to the object’s spectrum or class. |
| cam_col | Relates to instrument geometry; has no correlation with the physical class of the observed object. |
| field_ID | Represents sky segmentation; objects from the same field can belong to any class, so it adds noise. |
| plate | Instrumental reference only; does not affect the spectrum’s physical interpretation. |
| MJD | Observation time; irrelevant to the intrinsic properties of stars, galaxies, or quasars in a static snapshot dataset. |
| fiber_ID | Hardware mapping reference; not related to object characteristics. |
# dropping the identifier and instrument-metadata columns listed above
df.drop(columns=['obj_ID', 'spec_obj_ID','rerun_ID','run_ID','cam_col','field_ID','plate','MJD','fiber_ID'], inplace=True)
df.head()
| alpha | delta | u | g | r | i | z | class | redshift | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 135.689107 | 32.494632 | 23.87882 | 22.27530 | 20.39501 | 19.16573 | 18.79371 | GALAXY | 0.634794 |
| 1 | 144.826101 | 31.274185 | 24.77759 | 22.83188 | 22.58444 | 21.16812 | 21.61427 | GALAXY | 0.779136 |
| 2 | 142.188790 | 35.582444 | 25.26307 | 22.66389 | 20.60976 | 19.34857 | 18.94827 | GALAXY | 0.644195 |
| 3 | 338.741038 | -0.402828 | 22.13682 | 23.77656 | 21.61162 | 20.50454 | 19.25010 | GALAXY | 0.932346 |
| 4 | 345.282593 | 21.183866 | 19.43718 | 17.58028 | 16.49747 | 15.97711 | 15.54461 | GALAXY | 0.116123 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   alpha     100000 non-null  float64
 1   delta     100000 non-null  float64
 2   u         100000 non-null  float64
 3   g         100000 non-null  float64
 4   r         100000 non-null  float64
 5   i         100000 non-null  float64
 6   z         100000 non-null  float64
 7   class     100000 non-null  object
 8   redshift  100000 non-null  float64
dtypes: float64(8), object(1)
memory usage: 6.9+ MB
These features capture intrinsic properties of stars, galaxies, and quasars and are directly used by the SDSS classification pipeline.
| Feature | Type | Reason for Inclusion |
|---|---|---|
| alpha (Right Ascension) | Positional | Provides the object’s celestial coordinate. While weakly correlated with class, it can offer contextual priors (e.g., objects near the Galactic plane are more likely to be stars). |
| delta (Declination) | Positional | Complements Right Ascension to specify sky position. Useful for spatial context, though not strongly discriminative by itself. |
| u, g, r, i, z | Photometric Magnitudes | Measure an object’s brightness in five wavelength bands — from ultraviolet to infrared. These values represent the spectral energy distribution (SED) and are fundamental for identification. |
| redshift | Spectroscopic | Measures the fractional shift in observed wavelength. This is the most decisive feature: stars have $z \approx 0$, galaxies have moderate $z$, and quasars have large $z$. |
| u_g = u - g | Derived Color Index | Represents the ultraviolet–green color. Sensitive to UV excess — quasars and hot stars show small $(u-g)$ values. |
| g_r = g - r | Derived Color Index | Indicates the blue–red color difference. Useful to separate galaxies (redder) from stars (bluer). |
| r_i = r - i | Derived Color Index | Traces the continuum slope in the red–infrared region. Helps in distinguishing late-type stars and red galaxies. |
| i_z = i - z | Derived Color Index | Captures near-infrared color, valuable for identifying very cool stars or highly redshifted galaxies/quasars. |
# Adding additional features based on suggestions in the paper
df['u_g'] = df['u'] - df['g']
df['g_r'] = df['g'] - df['r']
df['r_i'] = df['r'] - df['i']
df['i_z'] = df['i'] - df['z']
df.head()
| alpha | delta | u | g | r | i | z | class | redshift | u_g | g_r | r_i | i_z | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 135.689107 | 32.494632 | 23.87882 | 22.27530 | 20.39501 | 19.16573 | 18.79371 | GALAXY | 0.634794 | 1.60352 | 1.88029 | 1.22928 | 0.37202 |
| 1 | 144.826101 | 31.274185 | 24.77759 | 22.83188 | 22.58444 | 21.16812 | 21.61427 | GALAXY | 0.779136 | 1.94571 | 0.24744 | 1.41632 | -0.44615 |
| 2 | 142.188790 | 35.582444 | 25.26307 | 22.66389 | 20.60976 | 19.34857 | 18.94827 | GALAXY | 0.644195 | 2.59918 | 2.05413 | 1.26119 | 0.40030 |
| 3 | 338.741038 | -0.402828 | 22.13682 | 23.77656 | 21.61162 | 20.50454 | 19.25010 | GALAXY | 0.932346 | -1.63974 | 2.16494 | 1.10708 | 1.25444 |
| 4 | 345.282593 | 21.183866 | 19.43718 | 17.58028 | 16.49747 | 15.97711 | 15.54461 | GALAXY | 0.116123 | 1.85690 | 1.08281 | 0.52036 | 0.43250 |
# Feature distribution (Histogram)
num_cols = df.select_dtypes(include=[np.number]).columns
n_cols = len(num_cols)
n_rows = int(np.ceil(n_cols / 3))
fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5 * n_rows))
axes = axes.flatten()
for i, column in enumerate(num_cols):
ax = axes[i]
ax.hist(df[column], bins=100, color='steelblue', edgecolor='black')
ax.set_xlim(df[column].quantile(0.1), df[column].quantile(.9))
ax.set_title(column)
ax.grid(True)
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
# Feature distribution (Box plot)
num_cols = df.select_dtypes(include=[np.number]).columns
n_cols = len(num_cols)
n_rows = int(np.ceil(n_cols / 3))
fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5 * n_rows))
axes = axes.flatten()
for i, column in enumerate(num_cols):
ax = axes[i]
data = df[column].dropna()
bp = ax.boxplot(data, vert=True, patch_artist=True,
boxprops=dict(facecolor='steelblue', alpha=0.8),
medianprops=dict(color='crimson', linewidth=2.5),
whiskerprops=dict(color='#2c3e50', linewidth=1.5),
capprops=dict(color='#2c3e50', linewidth=1.5),
flierprops=dict(marker='o', markerfacecolor='coral',
markersize=5, alpha=0.6, markeredgecolor='darkred'))
median = data.median()
min_val = data.min()
max_val = data.max()
legend_text = f'Max: {max_val:.2f}\nMedian: {median:.2f}\nMin: {min_val:.2f}'
ax.text(0.97, 0.97, legend_text, transform=ax.transAxes,
fontsize=9, verticalalignment='top', horizontalalignment='right',
bbox=dict(boxstyle='round,pad=0.6', facecolor='white',
edgecolor='steelblue', linewidth=2, alpha=0.9),
fontfamily='monospace', fontweight='bold')
ax.set_title(column, fontsize=11, fontweight='bold', pad=10)
ax.set_ylabel('Values', fontsize=9)
ax.grid(True, alpha=0.2, axis='y', linestyle='--')
ax.set_facecolor('#f8f9fa')
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
# Separating the data frame into the feature table and the label (target) vector
Y = df['class']
X = df.drop(columns=['class'])
print(len(X),len(Y))
100000 100000
# Checking Class imbalance
sns.set(style="whitegrid")
plt.figure(figsize=(6, 5))
sns.countplot(x=Y, hue=Y, palette="viridis", legend=False)
plt.title("Class Distribution in Dataset", fontsize=14, pad=12)
plt.xlabel("Object Class", fontsize=12)
plt.ylabel("Count", fontsize=12)
for p in plt.gca().patches:
plt.gca().text(
p.get_x() + p.get_width() / 2,
p.get_height() + 200,
int(p.get_height()),
ha='center', va='bottom', fontsize=10
)
plt.tight_layout()
plt.show()
Basic Models¶
- Gaussian Naive-Bayes
- Multiclass Logistic Regression
- Scaled Multiclass Logistic Regression
- K-NN Classifier
Gaussian Naive-Bayes¶
from sklearn.naive_bayes import GaussianNB
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
model = GaussianNB()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))
precision recall f1-score support
GALAXY 0.76 0.92 0.83 11860
QSO 0.63 0.92 0.74 3797
STAR 0.92 0.02 0.04 4343
accuracy 0.72 20000
macro avg 0.77 0.62 0.54 20000
weighted avg 0.77 0.72 0.64 20000
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = model.predict_proba(X_test)
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
RocCurveDisplay.from_predictions(
y_test_bin[:, i],
y_score[:, i],
name=f"Class {class_name}",
ax=ax,
color=color
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
Inference¶
Galaxy: over-predicted (92% recall, 76% precision); effectively the model's default choice.
Star: catastrophic failure (2% recall); the model barely detects stars, despite 92% precision on the few it does flag.
QSO: moderate (74% F1); acceptable but improvable.
Root issue: class imbalance (11,860 galaxies vs. 4,343 stars in the test split), combined with the Gaussian Naive Bayes independence assumption breaking down for correlated astronomical color indices.
Multiclass Logistic Regression¶
from sklearn.linear_model import LogisticRegression
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
model = LogisticRegression(max_iter=1000, class_weight='balanced',solver='lbfgs')
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)
print(classification_report(Y_test,Y_pred))
/home/simeon/.local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
precision recall f1-score support
GALAXY 0.97 0.91 0.94 11860
QSO 0.85 0.92 0.88 3797
STAR 0.90 1.00 0.95 4343
accuracy 0.93 20000
macro avg 0.91 0.94 0.92 20000
weighted avg 0.93 0.93 0.93 20000
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = model.predict_proba(X_test)
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
RocCurveDisplay.from_predictions(
y_test_bin[:, i],
y_score[:, i],
name=f"Class {class_name}",
ax=ax,
color=color
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
Inference¶
Why Standard Logistic Regression Fails
Standard logistic regression is binary and cannot directly handle multiple classes (STAR, GALAXY, QSO).
Multiclass Logistic Regression
We use multinomial (softmax) logistic regression to predict probabilities for all three classes at once.
Using class_weight='balanced'
The dataset is imbalanced (GALAXY >> QSO, STAR). class_weight='balanced' ensures the model pays equal attention to minority classes, improving recall and F1-score.
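Under the hood, the 'balanced' mode assigns each class the weight n_samples / (n_classes * n_class_samples). A quick sketch with toy counts mimicking this dataset's imbalance (the 60/19/21 split is illustrative, not the exact class ratio):

```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Toy labels with a GALAXY-heavy imbalance
y = np.array(["GALAXY"] * 60 + ["QSO"] * 19 + ["STAR"] * 21)
classes = np.unique(y)  # sorted: GALAXY, QSO, STAR

# Each weight is n_samples / (n_classes * n_class_samples)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
for c, w in zip(classes, weights):
    print(f"{c}: weight = {w:.3f}")  # minority classes get weights > 1
```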
Scaled Multiclass Logistic Regression¶
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
sc = StandardScaler()
Xsc_train = sc.fit_transform(X_train)
Xsc_test = sc.transform(X_test)
model = LogisticRegression(max_iter=1000, class_weight='balanced',solver='lbfgs')
model.fit(Xsc_train,Y_train)
Y_pred = model.predict(Xsc_test)
print(classification_report(Y_test,Y_pred))
precision recall f1-score support
GALAXY 0.97 0.92 0.95 11860
QSO 0.87 0.92 0.90 3797
STAR 0.91 1.00 0.95 4343
accuracy 0.94 20000
macro avg 0.92 0.95 0.93 20000
weighted avg 0.94 0.94 0.94 20000
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = model.predict_proba(Xsc_test)
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
RocCurveDisplay.from_predictions(
y_test_bin[:, i],
y_score[:, i],
name=f"Class {class_name}",
ax=ax,
color=color
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
Inference¶
Feature Scaling
The SDSS dataset contains features with very different ranges: RA/Dec (sky coordinates) span hundreds of degrees, while magnitudes (u, g, r, i, z) typically lie between 15 and 25. We applied StandardScaler to transform all numerical features to zero mean and unit variance. This lets all features contribute comparably, improves solver convergence (the lbfgs warning from the unscaled run disappears), and raised accuracy to 94%.
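Concretely, StandardScaler learns the per-column mean and standard deviation on the training split and applies $z = (x - \mu)/\sigma$ everywhere; a minimal sketch (the numbers are synthetic RA- and magnitude-like values, not dataset rows):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Two columns with very different ranges: an RA-like coordinate and a magnitude
X_train_demo = np.array([[10.0, 18.0], [200.0, 20.0], [350.0, 22.0]])
X_test_demo = np.array([[100.0, 19.0]])

sc = StandardScaler()
Xsc_train_demo = sc.fit_transform(X_train_demo)  # statistics come from train only
Xsc_test_demo = sc.transform(X_test_demo)        # the same statistics are reused

print(Xsc_train_demo.mean(axis=0))  # ~[0, 0]
print(Xsc_train_demo.std(axis=0))   # ~[1, 1]
```

Fitting the scaler on the training split only (and merely transforming the test split) avoids leaking test statistics into the model.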
KNN - Classifier¶
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
model = KNeighborsClassifier(n_neighbors=5) # euclidean p=2
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))
precision recall f1-score support
GALAXY 0.85 0.94 0.89 11860
QSO 0.84 0.81 0.83 3797
STAR 0.79 0.57 0.66 4343
accuracy 0.84 20000
macro avg 0.83 0.78 0.79 20000
weighted avg 0.83 0.84 0.83 20000
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = model.predict_proba(X_test)
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
RocCurveDisplay.from_predictions(
y_test_bin[:, i],
y_score[:, i],
name=f"Class {class_name}",
ax=ax,
color=color
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
KNN Classifier on the Scaled Feature Space
model = KNeighborsClassifier(n_neighbors=5)
model.fit(Xsc_train, Y_train)
Y_pred = model.predict(Xsc_test)
print(classification_report(Y_test, Y_pred))
precision recall f1-score support
GALAXY 0.95 0.96 0.95 11860
QSO 0.96 0.91 0.93 3797
STAR 0.92 0.94 0.93 4343
accuracy 0.94 20000
macro avg 0.94 0.94 0.94 20000
weighted avg 0.94 0.94 0.94 20000
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = model.predict_proba(Xsc_test)
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
RocCurveDisplay.from_predictions(
y_test_bin[:, i],
y_score[:, i],
name=f"Class {class_name}",
ax=ax,
color=color
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
Inference¶
Why Scaling is Important for KNN
KNN uses distance metrics (usually Euclidean) to find the nearest neighbors. If features are on very different scales (e.g., RA/Dec in hundreds of degrees vs. magnitudes around 15–25), the large-scale features dominate the distance calculation, degrading classification. By applying StandardScaler (zero mean, unit variance), all features contribute comparably to the distance.
This led to a major performance boost: accuracy improved from ~84% (unscaled) to ~94% (scaled).
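The domination effect is easy to quantify: in the squared Euclidean distance, a degrees-scale coordinate difference swamps a magnitude difference. A small sketch with synthetic values (not dataset rows):

```python
import numpy as np

# Feature vectors: [RA in degrees, apparent magnitude]
a = np.array([10.0, 16.0])   # bright object
b = np.array([180.0, 24.0])  # faint object at a very different RA

d = np.linalg.norm(a - b)                 # Euclidean distance
ra_share = (180.0 - 10.0) ** 2 / d ** 2   # fraction contributed by RA
print(f"distance = {d:.2f}, RA contributes {ra_share:.1%} of the squared distance")
```

The 8-magnitude difference, physically enormous, contributes well under 1% of the distance here, so unscaled KNN effectively ignores it.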
Hyperparameter Tuning of KNN: Finding the Optimal Number of Neighbors¶
neighbors = range(1, 26, 2)  # odd values of k avoid ties in majority voting
accuracies = []
for k in neighbors:
model = KNeighborsClassifier(n_neighbors=k)
model.fit(Xsc_train, Y_train)
Y_pred = model.predict(Xsc_test)
acc = accuracy_score(Y_test, Y_pred)
accuracies.append(acc)
# Plotting
plt.figure(figsize=(8, 5))
plt.plot(neighbors, accuracies, marker='o')
plt.title("KNN Hyperparameter Tuning (n_neighbors)")
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.xticks(neighbors)
plt.show()
best_k = neighbors[accuracies.index(max(accuracies))]
print(f"Best k: {best_k} with accuracy: {max(accuracies):.4f}")
Best k: 5 with accuracy: 0.9445
Better Models¶
- Multiclass SVM
- Decision Tree
- Random Forest
Multi-Class SVM with Kernel¶
Support Vector Machines (SVM) are powerful classifiers that work well for non-linear decision boundaries.
By using kernels (e.g., RBF), we can map the feature space into higher dimensions and separate classes more effectively.
Multi-Class Setup¶
sklearn.svm.SVC supports our three-class problem (STAR, GALAXY, QSO) automatically,
training One-vs-One (OvO) binary classifiers internally.
Hyperparameter Tuning¶
We did Manual Grid Search to find the best combination of:
- C (regularization strength)
- gamma (kernel coefficient for RBF)
- kernel (linear vs. RBF)
This ensures we get the optimal bias-variance tradeoff for our model.
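A cross-validated alternative to the manual loop below is GridSearchCV, which scores each combination on validation folds instead of the test split (a sketch on synthetic data; on the real features one would pass Xsc_train and Y_train, ideally subsampled for speed):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Small synthetic 3-class stand-in for the scaled feature matrix
X_demo, y_demo = make_classification(n_samples=300, n_features=6, n_informative=4,
                                     n_classes=3, random_state=42)

param_grid = {"C": [0.1, 1, 10], "gamma": ["scale", "auto"], "kernel": ["rbf", "linear"]}

# 3-fold CV: hyperparameters are chosen without ever touching the test split
grid = GridSearchCV(SVC(class_weight="balanced"), param_grid, cv=3, scoring="accuracy")
grid.fit(X_demo, y_demo)
print(grid.best_params_, f"CV accuracy = {grid.best_score_:.3f}")
```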
from sklearn.svm import SVC
C_values = [0.1, 1,10]
gamma_values = ['scale', 'auto']
kernels = ['rbf', 'linear']
best_score = 0
best_params = {}
for C in C_values:
for gamma in gamma_values:
for kernel in kernels:
model = SVC(C=C, gamma=gamma, kernel=kernel, class_weight='balanced')
model.fit(Xsc_train, Y_train)
Y_pred = model.predict(Xsc_test)
acc = accuracy_score(Y_test, Y_pred)
print(f"C={C}, gamma={gamma}, kernel={kernel}, accuracy={acc:.4f}")
if acc > best_score:
best_score = acc
best_params = {'C': C, 'gamma': gamma, 'kernel': kernel}
best_model = model
print(f"\nBest Parameters: {best_params}, Best Accuracy: {best_score:.4f}")
C=0.1, gamma=scale, kernel=rbf, accuracy=0.9399
C=0.1, gamma=scale, kernel=linear, accuracy=0.9389
C=0.1, gamma=auto, kernel=rbf, accuracy=0.9399
C=0.1, gamma=auto, kernel=linear, accuracy=0.9389
C=1, gamma=scale, kernel=rbf, accuracy=0.9545
C=1, gamma=scale, kernel=linear, accuracy=0.9528
C=1, gamma=auto, kernel=rbf, accuracy=0.9545
C=1, gamma=auto, kernel=linear, accuracy=0.9528
C=10, gamma=scale, kernel=rbf, accuracy=0.9650
C=10, gamma=scale, kernel=linear, accuracy=0.9560
C=10, gamma=auto, kernel=rbf, accuracy=0.9650
C=10, gamma=auto, kernel=linear, accuracy=0.9560
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}, Best Accuracy: 0.9650
After performing hyperparameter tuning (manual grid search), the best SVM configuration was found to be:
- C: 10
- Gamma: scale
- Kernel: rbf
This combination provided the highest accuracy and a good balance between precision and recall across all three classes (STAR, GALAXY, QSO).
best_model = SVC(C=10, gamma="scale", kernel="rbf", class_weight='balanced', probability=True)
best_model.fit(Xsc_train, Y_train)
SVC(C=10, class_weight='balanced', probability=True)
Y_pred = best_model.predict(Xsc_test)
print(classification_report(Y_test, Y_pred))
precision recall f1-score support
GALAXY 0.98 0.96 0.97 11860
QSO 0.94 0.93 0.93 3797
STAR 0.96 1.00 0.98 4343
accuracy 0.97 20000
macro avg 0.96 0.97 0.96 20000
weighted avg 0.97 0.97 0.97 20000
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = best_model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = best_model.predict_proba(Xsc_test)
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
RocCurveDisplay.from_predictions(
y_test_bin[:, i],
y_score[:, i],
name=f"Class {class_name}",
ax=ax,
color=color
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
Inference¶
- No independence assumption: SVM handles correlated features (color indices) naturally
- Non-linear decision boundaries: Captures complex class separations in feature space
- Handles class imbalance: class_weight='balanced' reweights minority classes during training
Decision Tree¶
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy',max_depth=6,random_state=42)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred))
precision recall f1-score support
GALAXY 0.96 0.99 0.98 11860
QSO 0.96 0.89 0.92 3797
STAR 1.00 1.00 1.00 4343
accuracy 0.97 20000
macro avg 0.97 0.96 0.96 20000
weighted avg 0.97 0.97 0.97 20000
from sklearn.tree import plot_tree
# plotting the tree diagram of the fitted decision tree
plt.figure(figsize=(20, 10))
plot_tree(model,
feature_names=X_train.columns,
class_names=model.classes_,
filled=True,
rounded=True,
fontsize=12)
plt.show()
Hyperparameter Tuning¶
# finding the optimal depth
depths = list(range(1, 21))
train_acc = []
test_acc = []
for d in depths:
model = DecisionTreeClassifier(criterion='gini', max_depth=d, random_state=42)
model.fit(X_train, Y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_acc.append(accuracy_score(Y_train, y_train_pred))
test_acc.append(accuracy_score(Y_test, y_test_pred))
plt.figure(figsize=(10, 6))
plt.plot(depths, train_acc, label='Train Accuracy', marker='o')
plt.plot(depths, test_acc, label='Test Accuracy', marker='o')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.title('Decision Tree Accuracy vs Max Depth')
plt.xticks(depths)
plt.grid(alpha=0.3)
plt.legend()
plt.show()
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = model.predict_proba(X_test)
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
RocCurveDisplay.from_predictions(
y_test_bin[:, i],
y_score[:, i],
name=f"Class {class_name}",
ax=ax,
color=color
)
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
We trained a Decision Tree Classifier with the entropy criterion and max_depth=6 (and swept depths 1–20 above); an unrestricted tree (max_depth=None) would instead grow until all leaves are pure or contain fewer than the minimum samples required to split, at the cost of overfitting.
Inference¶
High Performance: the tree achieves an impressive 97% accuracy, with perfect classification of STAR objects and near-perfect results for GALAXY.
Decision trees choose splits that maximize information gain (entropy) or minimize Gini impurity. For SDSS data, splits often occur on:
- Redshift: very effective at separating QSO from STAR/GALAXY.
- Photometric magnitudes (u, g, r, i, z): help classify stars vs. galaxies based on brightness in different bands.
- Color indices (e.g., u-g, g-r): derived features that trace temperature and spectral shape, often critical for classification.
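These claims can be checked on a fitted tree via its feature_importances_ attribute. A hedged sketch with a synthetic two-column frame (a dominant "redshift"-like column plus noise) standing in for X_train:

```python
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(42)
# One cleanly separating "redshift" column and one pure-noise column
X_demo = pd.DataFrame({
    "redshift": np.concatenate([rng.normal(0.0, 0.01, 200),    # star-like
                                rng.normal(1.5, 0.30, 200)]),  # quasar-like
    "noise": rng.normal(0.0, 1.0, 400),
})
y_demo = np.array(["STAR"] * 200 + ["QSO"] * 200)

tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_demo, y_demo)
for name, imp in zip(X_demo.columns, tree.feature_importances_):
    print(f"{name}: importance = {imp:.3f}")  # redshift should dominate
```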
Random Forest¶
from sklearn.ensemble import RandomForestClassifier
n_estimators_list = [10, 50, 100,150, 200,250, 300,350,400,450, 500]
accuracies = []
for n in n_estimators_list:
model = RandomForestClassifier(n_estimators=n,criterion='gini',max_depth=6,random_state=42,n_jobs=-1)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
acc = accuracy_score(Y_test, Y_pred)
accuracies.append(acc)
# Plot results
plt.figure(figsize=(8,5))
plt.plot(n_estimators_list, accuracies, marker='o', linestyle='--', color='b')
plt.xlabel("Number of Trees (n_estimators)")
plt.ylabel("Accuracy")
plt.title("Random Forest Accuracy vs Number of Trees")
plt.grid(True)
plt.show()
best_n = n_estimators_list[accuracies.index(max(accuracies))]
print(f"Best number of trees: {best_n}, Accuracy: {max(accuracies):.4f}")
best_model = RandomForestClassifier(
n_estimators=best_n,
criterion='gini',
max_depth=6,
random_state=42,
n_jobs=-1
)
best_model.fit(X_train, Y_train)
Y_pred_best = best_model.predict(X_test)
print(classification_report(Y_test, Y_pred_best))
Best number of trees: 50, Accuracy: 0.9685
precision recall f1-score support
GALAXY 0.97 0.98 0.97 11860
QSO 0.95 0.90 0.93 3797
STAR 0.98 1.00 0.99 4343
accuracy 0.97 20000
macro avg 0.97 0.96 0.96 20000
weighted avg 0.97 0.97 0.97 20000
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import label_binarize
classes = best_model.classes_
y_test_bin = label_binarize(Y_test, classes=classes)
y_score = best_model.predict_proba(X_test)  # use the tuned model for the ROC curves
fig, axes = plt.subplots(1, len(classes), figsize=(18, 5))
colors = ['blue', 'red', 'green']
for i, (ax, class_name, color) in enumerate(zip(axes, classes, colors)):
    RocCurveDisplay.from_predictions(
        y_test_bin[:, i],
        y_score[:, i],
        name=f"Class {class_name}",
        ax=ax,
        color=color
    )
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_title(f"ROC - {class_name}", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(Y_test, Y_pred_best, labels=['GALAXY','QSO','STAR'])
disp = ConfusionMatrixDisplay(cm, display_labels=['GALAXY','QSO','STAR'])
disp.plot(cmap='Greens', values_format='d')
plt.show()
Inference¶
We trained a Random Forest Classifier on the SDSS dataset, sweeping the number of trees and keeping the best setting (n_estimators=50, Gini criterion, max_depth=6).
- Accuracy: 0.9685 (~96.85%)
- High precision and recall for the STAR and GALAXY classes.
- Slightly lower recall for QSO (0.90) indicates some misclassifications.
- Random Forest combines multiple trees to reduce overfitting and improve generalization.
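The variance-reduction intuition behind that last point can be sketched with a toy numerical experiment, no trees involved, just averaging independent noisy estimates (a simplification: real forest trees are only partially de-correlated):

```python
import numpy as np

rng = np.random.default_rng(42)

# One noisy "tree": 10,000 draws of a unit-variance estimator
single = rng.normal(0.0, 1.0, size=10_000)

# A 100-"tree" ensemble: average 100 independent estimates per draw
ensemble = rng.normal(0.0, 1.0, size=(10_000, 100)).mean(axis=1)

# Variance of the average shrinks roughly like 1/n for independent estimates
print(round(single.var(), 2), round(ensemble.var(), 3))  # ≈ 1.0 vs ≈ 0.01
```

Bagging over bootstrapped, feature-subsampled trees buys a weaker version of this effect, which is why the forest generalizes better than any single deep tree.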
Future Plans¶
Verification of Claims Presented in the Paper¶
To ensure the reliability and reproducibility of the referenced study, the following validation steps will be conducted:
Density Estimation for Redshift: evaluate how accurately the model captures the distribution of redshift values across the object classes (stars, galaxies, quasars). This involves:
- Comparing predicted vs. true redshift density plots
- Assessing model calibration and bias in redshift estimation
Feature Contribution to Prediction: analyze the influence of individual features on the model's classification output using:
- SHAP (SHapley Additive exPlanations) or feature importance analysis
- Visualization of key spectral or photometric attributes driving predictions
Suggestions¶
- Handle class imbalance
- Fully independent feature selection
EndSem¶
Spherical Sky Distribution (RA/DEC on Celestial Sphere)¶
Right Ascension (RA) and Declination (DEC) are angular coordinates on the celestial sphere. Instead of plotting them on a flat plane, we can convert RA/DEC into 3D Cartesian coordinates and visualize each object on the surface of a unit sphere.
- Right Ascension (RA) → longitude on the sky (0° to 360°)
- Declination (DEC) → latitude on the sky (−90° to +90°)
This gives a more realistic view of how SDSS objects are distributed in the sky and avoids distortions from flat projections. Each point on the sphere represents the true sky position of a Galaxy, QSO, or Star, matching how the sky is observed in astronomy.
The spherical plot is mainly a visual representation of sky coverage, useful for understanding the survey footprint. It does not affect the ML classification, but it provides a clear, intuitive astronomy-style visualization.
import plotly.io as pio
pio.renderers.default = "iframe_connected"
import plotly.graph_objects as go
# Convert RA/DEC to radians
ra = np.radians(df['alpha'].values)
dec = np.radians(df['delta'].values)
x = np.cos(dec) * np.cos(ra)
y = np.cos(dec) * np.sin(ra)
z = np.sin(dec)
class_to_color = {
'GALAXY': "#0080FF",
'QSO': "#E72020",
'STAR': "#F7D410"
}
point_colors = df['class'].map(class_to_color)
theta = np.linspace(0, 2*np.pi, 100)
phi = np.linspace(-np.pi/2, np.pi/2, 100)
theta, phi = np.meshgrid(theta, phi)
xs = np.cos(phi) * np.cos(theta)
ys = np.cos(phi) * np.sin(theta)
zs = np.sin(phi)
fig = go.Figure()
fig.add_trace(go.Surface(
x=xs, y=ys, z=zs,
opacity=0.05,
colorscale=[[0, "white"], [1, "white"]],
showscale=False
))
fig.add_trace(go.Scatter3d(
x=x, y=y, z=z,
mode='markers',
marker=dict(
size=3,
color=point_colors,
opacity=0.8
),
text=df['class'],
hovertemplate=
"Class: %{text}<br>" +
"x: %{x:.3f}<br>" +
"y: %{y:.3f}<br>" +
"z: %{z:.3f}<extra></extra>"
))
fig.update_layout(
title="3D Celestial Sphere Sky Distribution (RA/DEC)",
template="plotly_dark",
scene=dict(
xaxis=dict(visible=False),
yaxis=dict(visible=False),
zaxis=dict(visible=False),
aspectmode='data'
),
width=900,
height=700,
legend=dict(
itemsizing='constant'
)
)
fig.show()
SMOTE¶
SMOTE (Synthetic Minority Over-sampling Technique) is a method used to fix class imbalance by creating new synthetic samples for the minority classes, instead of duplicating existing ones.
How SMOTE Works¶
For each minority-class sample (e.g., QSO, STAR), SMOTE finds its k-nearest neighbors from the same class.
It picks one neighbor at random and creates a synthetic sample on the line segment between them: $$ x_{\text{new}} = x + \lambda\,(x_{\text{neighbor}} - x), \qquad \lambda \in [0, 1]. $$
This process is repeated until the minority class reaches the same count as the majority class.
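A minimal NumPy sketch of a single interpolation step (the feature values are illustrative; real SMOTE also performs the k-nearest-neighbour search, which is omitted here):

```python
import numpy as np

rng = np.random.default_rng(0)

x = np.array([1.0, 2.0])              # a minority-class sample
x_neighbor = np.array([3.0, 6.0])     # one of its k nearest same-class neighbours

lam = rng.uniform(0.0, 1.0)           # lambda drawn uniformly from [0, 1]
x_new = x + lam * (x_neighbor - x)    # synthetic point on the connecting segment

print(x_new)  # lies between x and x_neighbor component-wise
```

Because the new point is an interpolation rather than a copy, the oversampled classes gain variety instead of exact duplicates.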
Effect on Dataset¶
Original counts:
- GALAXY: 59,445
- QSO: 18,961
- STAR: 21,594
SMOTE added synthetic samples to QSO and STAR until all classes reached 59,445, making the dataset perfectly balanced.
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, Y_res = sm.fit_resample(X, Y)
# Checking Class imbalance
sns.set(style="whitegrid")
plt.figure(figsize=(6, 5))
sns.countplot(x=Y_res, hue=Y_res, palette="viridis", legend=False)
plt.title("Class Distribution in Dataset", fontsize=14, pad=12)
plt.xlabel("Object Class", fontsize=12)
plt.ylabel("Count", fontsize=12)
for p in plt.gca().patches:
    plt.gca().text(
        p.get_x() + p.get_width() / 2,
        p.get_height() + 200,
        int(p.get_height()),
        ha='center', va='bottom', fontsize=10
    )
plt.tight_layout()
plt.show()
XGBoost¶
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y_encoded = le.fit_transform(Y_res)
X_train, X_test, Y_train, Y_test = train_test_split(X_res, Y_encoded, test_size=0.2, random_state=42)
n_estimators_list = [50, 100, 150, 200, 300, 400, 500]
accuracies = []
for n in n_estimators_list:
    model = XGBClassifier(
        n_estimators=n,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        eval_metric="logloss"
    )
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    acc = accuracy_score(Y_test, Y_pred)
    accuracies.append(acc)
# Plot results
plt.figure(figsize=(8,5))
plt.plot(n_estimators_list, accuracies, marker='o', linestyle='--', color='g')
plt.xlabel("Number of Trees (n_estimators)")
plt.ylabel("Accuracy")
plt.title("XGBoost Accuracy vs Number of Trees")
plt.grid(True)
plt.show()
best_n = n_estimators_list[accuracies.index(max(accuracies))]
print(f"Best n_estimators: {best_n}, Accuracy: {max(accuracies):.4f}")
best_model = XGBClassifier(
    n_estimators=best_n,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    eval_metric="logloss"
)
best_model.fit(X_train, Y_train)
Y_pred_best = best_model.predict(X_test)
print(classification_report(Y_test, Y_pred_best))
Best n_estimators: 500, Accuracy: 0.9799
precision recall f1-score support
0 0.96 0.98 0.97 11806
1 0.98 0.97 0.97 11865
2 1.00 1.00 1.00 11996
accuracy 0.98 35667
macro avg 0.98 0.98 0.98 35667
weighted avg 0.98 0.98 0.98 35667
We trained an XGBoost Classifier on the SDSS dataset with the best number of boosting rounds found in the sweep (n_estimators=500), using a learning rate of 0.1 and max_depth of 3.
Performance:
- Accuracy: 0.9799 (~97.99%)
Interpretation:
- High accuracy overall (~97.99%) with excellent classification of STAR and GALAXY classes.
- Slightly lower recall for class 1 (QSO) indicates some misclassifications.
- XGBoost uses gradient boosting to combine weak learners into a strong ensemble, improving accuracy and generalization compared to a single tree.
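That last point can be sketched with a toy gradient-boosting loop for squared loss: each round fits a weak learner (a one-split "stump") to the current residuals and adds it with shrinkage. This is the core idea only, not XGBoost's actual regularized tree construction, and the data is illustrative:

```python
import numpy as np

def fit_stump(x, r):
    """Best single-threshold stump on a 1-D feature x, fit to residuals r."""
    best = None
    for t in np.unique(x)[:-1]:
        pred = np.where(x <= t, r[x <= t].mean(), r[x > t].mean())
        sse = ((r - pred) ** 2).sum()
        if best is None or sse < best[0]:
            best = (sse, t, r[x <= t].mean(), r[x > t].mean())
    _, t, left, right = best
    return lambda xs: np.where(xs <= t, left, right)

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 200)
y = np.sin(x)                    # target function to approximate
F = np.zeros_like(y)             # ensemble prediction, starts at zero
lr = 0.1                         # learning rate (shrinkage)
for m in range(200):             # 200 boosting rounds
    residual = y - F             # negative gradient of squared loss
    h = fit_stump(x, residual)   # weak learner fit to the residuals
    F += lr * h(x)               # staged additive update

print(((y - F) ** 2).mean())     # MSE shrinks toward zero as rounds accumulate
```

Each stump alone is a terrible model of sin(x); the staged sum of many of them fits it closely, which is the "weak learners into a strong ensemble" claim in miniature.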
AdaBoost¶
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
X_train, X_test, Y_train, Y_test = train_test_split(X_res, Y_res, test_size=0.2, random_state=42)
n_estimators_list = [50, 100, 150, 200, 300, 400, 500]
accuracies = []
for n in n_estimators_list:
    model = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=n,
        learning_rate=1.0,
        random_state=42
    )
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    acc = accuracy_score(Y_test, Y_pred)
    accuracies.append(acc)
plt.figure(figsize=(8,5))
plt.plot(n_estimators_list, accuracies, marker='o', linestyle='--', color='r')
plt.xlabel("Number of Trees (n_estimators)")
plt.ylabel("Accuracy")
plt.title("AdaBoost Accuracy vs Number of Trees")
plt.grid(True)
plt.show()
best_n = n_estimators_list[accuracies.index(max(accuracies))]
print(f"Best n_estimators: {best_n}, Accuracy: {max(accuracies):.4f}")
best_model = AdaBoostClassifier(
estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=best_n,
learning_rate=1.0,
random_state=42
)
best_model.fit(X_train, Y_train)
Y_pred_best = best_model.predict(X_test)
print(classification_report(Y_test, Y_pred_best))
Best n_estimators: 50, Accuracy: 0.8879
precision recall f1-score support
GALAXY 0.88 0.77 0.82 11806
QSO 0.80 0.89 0.84 11865
STAR 0.99 1.00 1.00 11996
accuracy 0.89 35667
macro avg 0.89 0.89 0.89 35667
weighted avg 0.89 0.89 0.89 35667
We trained an AdaBoost Classifier on the SDSS dataset using 50 boosting rounds (n_estimators=50) with Decision Stumps (trees of max_depth=1) as base learners.
Performance:
- Accuracy: 0.8879 (~88.79%)
Interpretation:
- AdaBoost performs well for STAR, but GALAXY recall (0.77) and QSO precision (0.80) are noticeably lower because the depth-1 base learners are weak.
- Overall accuracy (~89%) is lower than Random Forest and XGBoost.
- AdaBoost combines multiple weak learners sequentially, focusing on misclassified samples to improve performance, but is sensitive to noisy data.
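The "focusing on misclassified samples" step can be sketched with the multiclass (SAMME) re-weighting rule that sklearn's AdaBoostClassifier applies after each weak learner. The labels and predictions below are illustrative:

```python
import numpy as np

K = 3                                        # classes: GALAXY / QSO / STAR
y_true = np.array([0, 1, 2, 1, 0, 2])
y_pred = np.array([0, 1, 2, 0, 0, 2])        # one stump's output (one mistake)
w = np.full(len(y_true), 1 / len(y_true))    # start from uniform sample weights

miss = y_pred != y_true
err = w[miss].sum()                              # weighted error of this learner
alpha = np.log((1 - err) / err) + np.log(K - 1)  # this learner's vote weight
w[miss] *= np.exp(alpha)                         # up-weight the mistakes
w /= w.sum()                                     # renormalise to sum to 1

print(round(float(alpha), 3), w.round(3))
```

With one mistake out of six, the misclassified sample ends up carrying 2/3 of the total weight, so the next stump is pushed hard toward getting it right — and, as noted above, the same mechanism amplifies noisy or mislabeled points.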
Explainability of Features for Classification¶
SHAP and Model Interpretation¶
SHAP (SHapley Additive exPlanations) is a method used to interpret machine learning models by explaining how each feature contributes to a prediction.
It is based on Shapley values from game theory, which fairly distribute the "credit" of a model’s output among its input features.
Shapley Values¶
In simple terms, Shapley values measure the average contribution of each feature across all possible combinations of features.
For a model prediction $f(x)$:
$$ f(x) = \phi_0 + \sum_{i=1}^{M} \phi_i $$
- $\phi_0$: base value (the average model output)
- $\phi_i$: SHAP value (the contribution of feature $i$)
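The additivity property above can be checked exactly on a toy model — here an assumed two-feature linear model with a zero baseline, where Shapley values are computed by brute-force averaging of marginal contributions over all feature orderings:

```python
from itertools import permutations
from math import factorial

# Assumed toy model: f(x) = 3*x1 + 5*x2, with "missing" features set to 0,
# so the empty-coalition value (phi_0) is 0.
x = {"x1": 2.0, "x2": 1.0}

def v(S):
    """Model output when only the features in S are 'present'."""
    return 3 * (x["x1"] if "x1" in S else 0) + 5 * (x["x2"] if "x2" in S else 0)

features = list(x)
phi = {f: 0.0 for f in features}
for order in permutations(features):   # average marginal contribution per order
    S = set()
    for f in order:
        phi[f] += (v(S | {f}) - v(S)) / factorial(len(features))
        S.add(f)

base = v(set())  # phi_0: prediction with no features present
print(phi)                                          # → {'x1': 6.0, 'x2': 5.0}
print(base + sum(phi.values()) == v(set(features))) # additivity: → True
```

For this linear model the Shapley values are just each feature's own term (3·2 = 6 and 5·1 = 5), and base + Σφᵢ reconstructs f(x) exactly — the same decomposition the SHAP plots below visualize for the XGBoost classifier.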
Why SHAP is Useful¶
- Local explanations: show how each feature influences one prediction.
- Global explanations: show which features are most important overall.
- Fair & consistent: based on solid mathematical foundations.
- Visual: provides intuitive plots like summary and beeswarm plots.
In This Project¶
SHAP was used to interpret the model classifying stars, galaxies, and quasars.
It revealed that redshift was the most influential feature, followed by color indices like g_r and u_g, confirming the model’s decisions align with astrophysical reasoning.
SHAP on XGBoost¶
import shap
import matplotlib.pyplot as plt
import numpy as np
# Use model.predict_proba as callable for SHAP
explainer_shap = shap.Explainer(best_model.predict_proba, X_train, feature_names=X_train.columns)
shap_values = explainer_shap(X_test)
PermutationExplainer explainer: 20001it [1:25:55, 3.87it/s]
# GLOBAL FEATURE IMPORTANCE
# best_model.classes_ holds the encoded integer labels; map them back to
# the original class names (LabelEncoder sorts classes alphabetically)
class_names = ["GALAXY", "QSO", "STAR"]
print("Class names:", class_names)
print("Generating SHAP summary plot for all classes...")
# Create the summary plot
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns, show=False)
# Replace legend labels safely
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
# Replace generic class labels with actual class names
new_labels = []
for label in labels:
    for i, cname in enumerate(class_names):
        label = label.replace(f"Class {i}", cname)
    new_labels.append(label)
# Rebuild the legend
plt.legend(handles, new_labels, loc='lower right')
plt.title("SHAP Summary Plot with Actual Class Labels")
plt.show()
# LOCAL EXPLANATION for one sample
sample_to_explain = 0
pred_class = np.argmax(best_model.predict_proba(X_test.iloc[[sample_to_explain]])[0])  # keep DataFrame shape/feature names
true_label = le.inverse_transform([Y_test[sample_to_explain]])[0]
pred_label = le.inverse_transform([pred_class])[0]
print(f"\nSHAP Explanation for sample #{sample_to_explain} (True: {true_label}, Predicted: {pred_label})")
# Waterfall plot for the predicted class
shap.plots.waterfall(shap_values[sample_to_explain, :, pred_class])
Class names: ['GALAXY', 'QSO', 'STAR']
Generating SHAP summary plot for all classes...
SHAP Explanation for sample #0 (True: GALAXY, Predicted: GALAXY)
SHAP Summary Plot — Overall Feature Importance¶
The SHAP summary plot aggregates the average absolute SHAP values for all features across all samples, showing how strongly each feature impacts the model’s predictions for different classes (Galaxy, QSO, and Star).
Key Observations:
- Redshift is overwhelmingly the most influential feature across all classes.
- It contributes the majority of the total SHAP magnitude.
- The stacked bar shows distinct contributions by class:
- Galaxy → moderate redshift range
- QSO → very high redshift
- Star → very low redshift
- This confirms that cosmological distance (redshift) is the dominant factor differentiating these classes.
- Color indices (`g_r`, `g`, `r_i`, `u_g`) have smaller but noticeable impacts. They help refine boundaries where redshift values alone are ambiguous.
- Other features (`u`, `i`, `i_z`, `alpha`, `z`, `r`, `delta`) show negligible SHAP importance, indicating they do not substantially affect the classification.
Inference:
The summary plot highlights that the model heavily depends on redshift to distinguish between galaxies, quasars, and stars.
Color features add subtle spectral differentiation, while positional or auxiliary attributes play minimal roles.
SHAP Waterfall Plot — Single Sample (Galaxy Class)¶
This SHAP waterfall plot illustrates how each feature contributed to the prediction for one particular sample that was correctly classified as a Galaxy.
Interpretation of Plot:
- Base value (`E[f(X)] = 0.654`): the model's average predicted probability for this class before any features are considered.
- Final output (`f(x) = 0.986`): the model's confidence that this object is a Galaxy after considering its specific features.
Feature Contributions:
| Feature | Value | SHAP Impact | Effect on Prediction |
|---|---|---|---|
| `redshift` | 0.506 | +0.21 | 🔺 Strongly increases Galaxy probability |
| `g_r` | 1.47 | +0.05 | 🔺 Moderate positive effect |
| `r_i` | 0.878 | +0.05 | 🔺 Moderate positive effect |
| `g` | 21.951 | +0.03 | 🔺 Small positive contribution |
| `i` | 19.603 | −0.01 | 🔻 Slightly decreases probability |
| `z` | 19.131 | +0.01 | 🔺 Very minor positive impact |
| Other features | — | ≈0 | Negligible influence |
Step-by-Step Flow:
- The model starts from a baseline probability (≈0.654).
- Redshift adds the largest positive SHAP value (+0.21), pushing the prediction toward the Galaxy class.
- Color indices (`g_r`, `r_i`, `g`) further reinforce this prediction.
- Small opposing effects from `i` and other minor features are negligible.
- The final predicted probability reaches 0.986, indicating high confidence in the Galaxy classification.
Inference:
This single-sample explanation clearly shows that moderate redshift combined with specific color indices (g_r, r_i) strongly supports the identification of the object as a Galaxy.
The waterfall structure visually confirms how the model’s decision builds additively from feature-level influences.
Overall Interpretation¶
| Aspect | Summary |
|---|---|
| Most Influential Feature | redshift dominates both global and local explanations. |
| Supporting Features | g_r, r_i, and g refine decisions for ambiguous cases. |
| Least Influential | Positional or auxiliary features (alpha, delta, r, etc.) have near-zero impact. |
| Consistency | Global (summary) and local (waterfall) analyses both validate that redshift + color information are the core drivers behind the model’s astrophysical classification decisions. |
# For each class (0,1,2)
for class_idx, class_name in enumerate(le.classes_):
print(f"=== SHAP Summary for {class_name} ===")
shap.summary_plot(shap_values[:,:,class_idx], X_test, feature_names=X_test.columns)
=== SHAP Summary for GALAXY ===
=== SHAP Summary for QSO ===
=== SHAP Summary for STAR ===
SHAP Beeswarm Plots — Feature Impact Analysis¶
This section summarizes how different features influence the model’s prediction for each astrophysical class: Galaxy, Quasar (QSO), and Star.
Each SHAP beeswarm plot shows:
- X-axis: SHAP value → impact on model output
- Color: Feature value (🔵 low → 🔴 high)
- Each point: A single observation
Class: Galaxy¶
- Dominant Feature: `redshift`
  - High redshift values (🔴) → strong positive SHAP values.
  - Increases the likelihood of being classified as a galaxy.
  - Matches astrophysical understanding that distant galaxies exhibit high redshifts.
- Moderately Important Features: `g_r`, `r_i`, `g`
  - Represent color indices that describe spectral energy distributions.
  - Provide additional separation among objects.
- Low-Impact Features: `u_g`, `u`, `i_z`, `i`, `alpha`, `z`, `delta`, `r`
  - SHAP values near zero → minimal effect on classification.
Inference:
The model primarily relies on redshift to identify galaxies, supported by color-based features. Objects with higher redshifts and distinct color patterns are confidently classified as galaxies.
Class: Quasar (QSO)¶
- Dominant Feature: `redshift`
  - Very high redshift values (🔴) → strongly positive SHAP impact.
  - Quasars, being extremely distant, naturally exhibit very high redshifts.
- Secondary Features: `g_r`, `r_i`, `g`
  - Subtle color indices refine the model's prediction.
  - High `g_r` and low `r_i` values typically push predictions toward the QSO class.
- Minor Contributors: `u_g`, `u`, `i_z`
  - Small SHAP spread; these help fine-tune the classification.
Inference:
The classifier recognizes quasars through extremely high redshift values combined with distinct color signatures. This reflects the true astrophysical nature of quasars as distant, highly redshifted sources.
Class: Star¶
- Dominant Feature: `redshift`
  - Low redshift values (🔵) → high positive SHAP values.
  - Strongly indicates a star, as stars are nearby and exhibit negligible redshift.
- Minor Features: `r_i`, `u_g`, `g_r`
  - Provide slight refinement but have near-zero SHAP distributions overall.
- Negligible Features: `z`, `i_z`, `g`, `i`, `u`, `delta`, `r`, `alpha`
  - Minimal impact on the model's star classification.
Inference:
The model identifies stars based on very low redshift values, effectively distinguishing them from galaxies and quasars.
Color features play a minor role, consistent with the astrophysical fact that stars are local, non-redshifted sources.
Comparative Summary¶
| Feature | Galaxy Impact | Quasar (QSO) Impact | Star Impact | Key Observation |
|---|---|---|---|---|
| redshift | 🔺 High (positive for high values) | 🔺 Very High (positive for very high values) | 🔻 High (positive for low values) | Most critical feature for all classes |
| g_r | Moderate | Moderate | Low | Helps differentiate spectral energy distributions |
| r_i | Moderate | Moderate | Low | Color index supporting classification |
| u_g | Low | Low | Low | Minor refinement feature |
| Other features (`z`, `i_z`, `g`, `i`, `u`, `delta`, `r`, `alpha`) | Negligible | Negligible | Negligible | Minimal model contribution |
Overall Interpretation¶
- Redshift is the dominant discriminator across all classes:
- High → Galaxy or Quasar
- Low → Star
- Color indices (`g_r`, `r_i`, `u_g`) provide secondary refinement, capturing spectral differences.
- Other photometric/positional features contribute minimally, confirming that the model primarily relies on spectral and redshift-based cues to distinguish between stars, galaxies, and quasars.
Inference From Redshift Density & Bias Analysis¶
1. Density Alignment Between True and Predicted Classes¶
Y_test_labels = le.inverse_transform(Y_test)
Y_pred_labels = le.inverse_transform(Y_pred_best)
redshift_test = df.loc[X_test.index, 'redshift']
# Create result DataFrame
df_results = pd.DataFrame({
'True_Class': Y_test_labels,
'Pred_Class': Y_pred_labels,
'Redshift': redshift_test
})
plt.style.use('seaborn-v0_8-whitegrid')
for cls in le.classes_:
    subset_true = df_results[df_results['True_Class'] == cls]
    subset_pred = df_results[df_results['Pred_Class'] == cls]
    plt.figure(figsize=(8, 4))
    sns.kdeplot(subset_true['Redshift'], label='True Class', fill=True, alpha=0.4, linewidth=1.5)
    sns.kdeplot(subset_pred['Redshift'], label='Predicted Class', fill=True, alpha=0.4, linewidth=1.5)
    plt.title(f'Redshift Density Comparison — {cls}')
    plt.xlabel('Redshift (z)')
    plt.ylabel('Density')
    plt.legend()
    plt.show()
Across all three classes (GALAXY, QSO, STAR), the predicted redshift distributions closely follow the true distributions.
- GALAXY: Predicted KDE almost perfectly overlaps the true density, capturing both peaks.
- QSO: Slightly sharper peak for predictions around z ≈ 1–2, but overall well-aligned.
- STAR: Extremely tight distribution near z ≈ 0, and predictions match almost identically.
This shows that the classifier is not systematically shifting the redshift distribution when assigning classes.
2. Redshift Bias Is Very Small Across Bins¶
plt.style.use('seaborn-v0_8-whitegrid')
# Define bins for redshift
bins = np.linspace(df_results['Redshift'].min(), df_results['Redshift'].max(), 20)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
# Analyze bias per predicted class
for cls in df_results['Pred_Class'].unique():
    subset = df_results[df_results['Pred_Class'] == cls]
    bias = []
    for i in range(len(bins) - 1):
        mask = (subset['Redshift'] >= bins[i]) & (subset['Redshift'] < bins[i+1])
        if np.any(mask):
            # For this classification-based case, "bias" = mean redshift of the
            # predicted class minus mean redshift of the true class in the bin
            true_mean = df_results[(df_results['True_Class'] == cls) &
                                   (df_results['Redshift'] >= bins[i]) &
                                   (df_results['Redshift'] < bins[i+1])]['Redshift'].mean()
            pred_mean = subset.loc[mask, 'Redshift'].mean()
            bias.append(pred_mean - true_mean if not np.isnan(true_mean) else np.nan)
        else:
            bias.append(np.nan)
    # Plot the per-bin bias for this class
    plt.figure(figsize=(8, 4))
    plt.plot(bin_centers, bias, marker='o')
    plt.axhline(0, color='k', linestyle='--', linewidth=1)
    plt.title(f'Redshift Bias per Bin — Predicted Class: {cls}')
    plt.xlabel('Redshift (bin center)')
    plt.ylabel('Mean Bias (Predicted − True)')
    plt.grid(True, alpha=0.3)
    plt.show()
The bias plots (Predicted − True redshift mean per bin) show:
- GALAXY: Almost zero bias for low–mid redshift; slight negative bias at higher redshifts (classifier slightly underestimates high-z GALAXY cases).
- STAR: Bias is essentially zero, showing perfect consistency.
- QSO: Small positive bias at low redshift, then flat near zero across the full redshift range.
Overall, the classifier introduces minimal redshift bias, with only tiny deviations at extreme ranges.
Attempt at Clustering¶
First, a 2D scatter of sky positions (RA vs. Dec) to see whether the classes form visible spatial clusters:
plt.figure(figsize=(10, 6))
classes = df['class'].unique()
colors = ['tab:blue', 'tab:orange', 'tab:red']
for c, col in zip(classes, colors):
    subset = df[df['class'] == c]
    plt.scatter(subset['alpha'], subset['delta'],
                s=10, alpha=0.6, label=c, color=col)
plt.title("Sky Distribution of Objects (SDSS DR17)")
plt.xlabel("Right Ascension (degrees)")
plt.ylabel("Declination (degrees)")
plt.legend(title="Class")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import f1_score
from itertools import permutations
model = KMeans(n_clusters=3, random_state=42)
model.fit(X)
labels = model.labels_
unique_classes = np.unique(Y)
class_to_num = {cls: i for i, cls in enumerate(unique_classes)}
y_true_numeric = np.array([class_to_num[cls] for cls in Y])
best_f1 = -1
best_mapping = None
best_labels_mapped = None
# Generate all possible permutations of [0, 1, 2]
for perm in permutations([0, 1, 2]):
    mapping = {i: perm[i] for i in range(3)}
    labels_mapped = np.array([mapping[label] for label in labels])
    f1 = f1_score(y_true_numeric, labels_mapped, average='weighted')
    if f1 > best_f1:
        best_f1 = f1
        best_mapping = mapping
        best_labels_mapped = labels_mapped.copy()
print(f"Best F1-score: {best_f1:.4f}")
print(f"Best mapping: {best_mapping}")
Best F1-score: 0.4064
Best mapping: {0: 2, 1: 1, 2: 0}
Even under the best cluster-to-class assignment, K-Means reaches only ~0.41 weighted F1: the classes are not separable as unsupervised clusters in the raw feature space, so supervised learning is genuinely needed here.
Conclusion¶
This project successfully developed a highly accurate machine learning model to classify stars, galaxies, and quasars from the SDSS DR17 dataset, achieving a remarkable 98% accuracy with XGBoost.
The model's exceptional performance is both reliable and physically sound. Through SHAP analysis, we confirmed that the classifier makes decisions based on core astrophysical principles, primarily relying on redshift as the dominant feature, supported by photometric color indices.
Furthermore, the bias analysis showed the model introduces negligible redshift bias, closely preserving the true redshift distributions of each celestial class. This work therefore delivers not just a high-performing classifier, but a robust and interpretable tool that aligns with established astronomical knowledge.