Gaussian Discriminant Analysis¶
Real-World Scenario: Cancer Subtype Classification from Blood Biomarkers¶
An oncology lab receives blood samples from patients with suspected non-small cell lung cancer (NSCLC). Using a panel of two protein biomarkers — CEA (carcinoembryonic antigen) and CYFRA 21-1 (cytokeratin fragment) — they want to classify tumors into three molecular subtypes:
- Adenocarcinoma — the most common subtype, typically elevated CEA
- Squamous Cell Carcinoma — often elevated CYFRA 21-1
- Large Cell Carcinoma — rarer, intermediate biomarker profile
Each subtype has a different biomarker distribution. We'll build a generative classifier by modeling the class-conditional densities $p(\mathbf{x} | y = c)$ as multivariate Gaussians and using Bayes' rule to compute the posterior $p(y = c | \mathbf{x})$. Depending on whether we allow each class its own covariance matrix or force a shared covariance, we get quadratic (QDA) or linear (LDA) decision boundaries.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy.stats import multivariate_normal
from matplotlib.colors import ListedColormap
np.random.seed(42)
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams['font.family'] = 'DejaVu Sans'
Key Formulas from PML Chapter 9.2¶
Generative classifier (Eq. 9.1 — Bayes' rule for classification):
$$p(y = c | \mathbf{x}, \boldsymbol{\theta}) = \frac{p(\mathbf{x} | y = c, \boldsymbol{\theta})\, p(y = c | \boldsymbol{\theta})}{\sum_{c'} p(\mathbf{x} | y = c', \boldsymbol{\theta})\, p(y = c' | \boldsymbol{\theta})}$$
Gaussian class-conditional density (Eq. 9.2):
$$p(\mathbf{x} | y = c, \boldsymbol{\theta}) = \mathcal{N}(\mathbf{x} | \boldsymbol{\mu}_c, \boldsymbol{\Sigma}_c)$$
Log-posterior / discriminant function (Eq. 9.4):
$$\log p(y = c | \mathbf{x}, \boldsymbol{\theta}) = \log \pi_c - \frac{1}{2} \log |2\pi \boldsymbol{\Sigma}_c| - \frac{1}{2}(\mathbf{x} - \boldsymbol{\mu}_c)^\top \boldsymbol{\Sigma}_c^{-1}(\mathbf{x} - \boldsymbol{\mu}_c) + \text{const}$$
With tied covariances $\boldsymbol{\Sigma}_c = \boldsymbol{\Sigma}$, the discriminant simplifies to a linear function (Eq. 9.7):
$$\log p(y = c | \mathbf{x}, \boldsymbol{\theta}) = \gamma_c + \mathbf{x}^\top \boldsymbol{\beta}_c + \kappa$$
where $\boldsymbol{\beta}_c = \boldsymbol{\Sigma}^{-1} \boldsymbol{\mu}_c$ and $\gamma_c = \log \pi_c - \frac{1}{2} \boldsymbol{\mu}_c^\top \boldsymbol{\Sigma}^{-1} \boldsymbol{\mu}_c$.
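Before touching real data, a quick numeric sketch of Eqs. 9.1–9.2 (with hypothetical parameters, not the clinic's) shows how the posterior falls out of the two ingredients, priors and class-conditional densities:

```python
# Minimal sketch of Bayes' rule for classification (Eqs. 9.1-9.2).
# All parameters here are made up for illustration.
import numpy as np
from scipy.stats import multivariate_normal

pis = np.array([0.6, 0.4])                                # priors p(y=c)
mus = [np.array([0.0, 0.0]), np.array([2.0, 1.0])]        # class means
Sigmas = [np.eye(2), np.array([[1.0, 0.3], [0.3, 0.8]])]  # class covariances

x = np.array([1.0, 0.5])
# Class-conditional likelihoods p(x | y=c)
lik = np.array([multivariate_normal(m, S).pdf(x) for m, S in zip(mus, Sigmas)])
# Posterior via Eq. 9.1: prior * likelihood, normalized over classes
posterior = pis * lik / np.sum(pis * lik)
print(posterior, posterior.sum())  # posterior sums to 1
```

The denominator in Eq. 9.1 is just this normalization over classes; everything that follows in the notebook is about choosing and fitting the Gaussian densities in the numerator.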
1. Generate Synthetic Biomarker Data¶
We create realistic 2D data for three NSCLC subtypes. Each subtype has its own mean biomarker profile and covariance structure (reflecting different biomarker correlations).
# True parameters for each cancer subtype
# Feature 1: CEA (ng/mL, log-transformed)
# Feature 2: CYFRA 21-1 (ng/mL, log-transformed)
class_names = ['Adenocarcinoma', 'Squamous Cell', 'Large Cell']
colors = ['#2196F3', '#F44336', '#4CAF50'] # Blue, Red, Green
# Adenocarcinoma: high CEA, moderate CYFRA, positive correlation
mu1 = np.array([3.5, 1.0])
Sigma1 = np.array([[1.2, 0.4],
[0.4, 0.8]])
# Squamous Cell: moderate CEA, high CYFRA, negative correlation
mu2 = np.array([0.5, 4.0])
Sigma2 = np.array([[0.8, -0.3],
[-0.3, 1.5]])
# Large Cell: intermediate, nearly spherical
mu3 = np.array([-2.0, 0.5])
Sigma3 = np.array([[1.0, 0.1],
[0.1, 0.6]])
# Class priors (prevalence in clinic)
true_pi = np.array([0.45, 0.35, 0.20]) # Adeno most common
# Generate samples
N = 300
n1 = int(N * true_pi[0])
n2 = int(N * true_pi[1])
n3 = N - n1 - n2
X1 = np.random.multivariate_normal(mu1, Sigma1, n1)
X2 = np.random.multivariate_normal(mu2, Sigma2, n2)
X3 = np.random.multivariate_normal(mu3, Sigma3, n3)
X = np.vstack([X1, X2, X3])
y = np.concatenate([np.zeros(n1), np.ones(n2), 2*np.ones(n3)]).astype(int)
true_mus = [mu1, mu2, mu3]
true_Sigmas = [Sigma1, Sigma2, Sigma3]
print(f"Generated {N} patient samples:")
for c in range(3):
print(f" {class_names[c]:20s}: {(y == c).sum()} samples ({(y == c).mean():.0%})")
Generated 300 patient samples:
  Adenocarcinoma      : 135 samples (45%)
  Squamous Cell       : 105 samples (35%)
  Large Cell          : 60 samples (20%)
# Visualize the data
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: raw data with class labels
for c in range(3):
mask = y == c
axes[0].scatter(X[mask, 0], X[mask, 1], c=colors[c], alpha=0.5, s=25,
label=class_names[c], edgecolors='white', linewidths=0.3)
axes[0].set_xlabel('log(CEA)', fontsize=12)
axes[0].set_ylabel('log(CYFRA 21-1)', fontsize=12)
axes[0].set_title('NSCLC Patient Biomarker Data', fontsize=14)
axes[0].legend(fontsize=10)
# Right: data overlaid with the true class-conditional Gaussians
for c in range(3):
mask = y == c
axes[1].scatter(X[mask, 0], X[mask, 1], c=colors[c], alpha=0.4, s=20,
edgecolors='white', linewidths=0.3)
# Draw contours of class-conditional Gaussians
x_min, x_max = X[:, 0].min() - 2, X[:, 0].max() + 2
y_min, y_max = X[:, 1].min() - 2, X[:, 1].max() + 2
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
np.linspace(y_min, y_max, 200))
grid = np.column_stack([xx.ravel(), yy.ravel()])
for c, (mu, Sigma, color) in enumerate(zip(true_mus, true_Sigmas, colors)):
rv = multivariate_normal(mu, Sigma)
zz = rv.pdf(grid).reshape(xx.shape)
axes[1].contour(xx, yy, zz, levels=4, colors=[color], alpha=0.8, linewidths=1.5)
axes[1].plot(*mu, 'x', color=color, markersize=12, markeredgewidth=3)
axes[1].set_xlabel('log(CEA)', fontsize=12)
axes[1].set_ylabel('log(CYFRA 21-1)', fontsize=12)
axes[1].set_title('True Class-Conditional Gaussians', fontsize=14)
plt.tight_layout()
plt.show()
2. Model Fitting via Maximum Likelihood (Section 9.2.4)¶
The log-likelihood of the GDA model is (Eq. 9.18):
$$\log p(\mathcal{D}|\boldsymbol{\theta}) = \left[\sum_{n=1}^{N} \sum_{c=1}^{C} \mathbb{1}(y_n = c) \log \pi_c\right] + \sum_{c=1}^{C} \left[\sum_{n: y_n = c} \log \mathcal{N}(\mathbf{x}_n | \boldsymbol{\mu}_c, \boldsymbol{\Sigma}_c)\right]$$
The MLE estimates are (Eqs. 9.19–9.20):
$$\hat{\pi}_c = \frac{N_c}{N}, \quad \hat{\boldsymbol{\mu}}_c = \frac{1}{N_c} \sum_{n: y_n = c} \mathbf{x}_n, \quad \hat{\boldsymbol{\Sigma}}_c = \frac{1}{N_c} \sum_{n: y_n = c} (\mathbf{x}_n - \hat{\boldsymbol{\mu}}_c)(\mathbf{x}_n - \hat{\boldsymbol{\mu}}_c)^\top$$
def fit_gda(X, y, C):
"""
Fit GDA model via MLE (Eqs. 9.19-9.20).
Returns class priors, means, and per-class covariances.
"""
N = len(y)
pi_hat = np.zeros(C)
mu_hat = []
Sigma_hat = []
for c in range(C):
mask = y == c
Nc = mask.sum()
# Class prior: pi_c = N_c / N
pi_hat[c] = Nc / N
# Class mean: mu_c = (1/N_c) * sum of x_n for y_n = c
mu_c = X[mask].mean(axis=0)
mu_hat.append(mu_c)
# Class covariance: Sigma_c = (1/N_c) * sum of (x_n - mu_c)(x_n - mu_c)^T
diff = X[mask] - mu_c
Sigma_c = (diff.T @ diff) / Nc
Sigma_hat.append(Sigma_c)
return pi_hat, mu_hat, Sigma_hat
def fit_gda_tied(X, y, C):
"""
Fit GDA with tied (shared) covariance (Eq. 9.21).
Returns class priors, means, and a single shared covariance.
"""
N = len(y)
pi_hat = np.zeros(C)
mu_hat = []
for c in range(C):
mask = y == c
pi_hat[c] = mask.sum() / N
mu_hat.append(X[mask].mean(axis=0))
# Tied covariance: pool all samples
# Sigma = (1/N) * sum_c sum_{n: y_n=c} (x_n - mu_c)(x_n - mu_c)^T
Sigma_tied = np.zeros((X.shape[1], X.shape[1]))
for c in range(C):
mask = y == c
diff = X[mask] - mu_hat[c]
Sigma_tied += diff.T @ diff
Sigma_tied /= N
return pi_hat, mu_hat, Sigma_tied
# Fit both models
C = 3
pi_qda, mu_qda, Sigma_qda = fit_gda(X, y, C)
pi_lda, mu_lda, Sigma_lda = fit_gda_tied(X, y, C)
print("=" * 60)
print("MLE PARAMETER ESTIMATES")
print("=" * 60)
for c in range(C):
print(f"\n{class_names[c].upper()}")
print(f" Prior pi_c: {pi_qda[c]:.3f} (true: {true_pi[c]:.3f})")
print(f" Mean mu_c: [{mu_qda[c][0]:+.2f}, {mu_qda[c][1]:+.2f}] "
f"(true: [{true_mus[c][0]:+.2f}, {true_mus[c][1]:+.2f}])")
print(f"\nTied covariance (LDA):")
print(f" [[{Sigma_lda[0,0]:.3f}, {Sigma_lda[0,1]:.3f}],")
print(f" [{Sigma_lda[1,0]:.3f}, {Sigma_lda[1,1]:.3f}]]")
============================================================
MLE PARAMETER ESTIMATES
============================================================

ADENOCARCINOMA
  Prior pi_c: 0.450 (true: 0.450)
  Mean mu_c: [+3.57, +1.09] (true: [+3.50, +1.00])

SQUAMOUS CELL
  Prior pi_c: 0.350 (true: 0.350)
  Mean mu_c: [+0.43, +4.16] (true: [+0.50, +4.00])

LARGE CELL
  Prior pi_c: 0.200 (true: 0.200)
  Mean mu_c: [-1.88, +0.39] (true: [-2.00, +0.50])

Tied covariance (LDA):
  [[0.957, -0.004],
   [-0.004, 0.953]]
3. Quadratic Discriminant Analysis (Section 9.2.1)¶
When each class has its own covariance $\boldsymbol{\Sigma}_c$, the discriminant function (Eq. 9.4) is:
$$\delta_c(\mathbf{x}) = \log \pi_c - \frac{1}{2} \log |\boldsymbol{\Sigma}_c| - \frac{1}{2} (\mathbf{x} - \boldsymbol{\mu}_c)^\top \boldsymbol{\Sigma}_c^{-1} (\mathbf{x} - \boldsymbol{\mu}_c)$$
The term $(\mathbf{x} - \boldsymbol{\mu}_c)^\top \boldsymbol{\Sigma}_c^{-1} (\mathbf{x} - \boldsymbol{\mu}_c)$ is the Mahalanobis distance — a quadratic function of $\mathbf{x}$. Since this differs across classes (different $\boldsymbol{\Sigma}_c$), the decision boundaries are quadratic curves (ellipses, parabolas, or hyperbolas).
def discriminant_qda(x, pi, mu, Sigma):
"""
QDA discriminant function (Eq. 9.4):
delta_c(x) = log(pi_c) - 0.5*log|Sigma_c| - 0.5*(x-mu_c)^T Sigma_c^{-1} (x-mu_c)
"""
diff = x - mu
Sigma_inv = np.linalg.inv(Sigma)
mahal = diff @ Sigma_inv @ diff
log_det = np.log(np.linalg.det(Sigma))
return np.log(pi) - 0.5 * log_det - 0.5 * mahal
def predict_qda(X_grid, pi, mus, Sigmas, C):
"""Classify each point by argmax of QDA discriminant."""
N = X_grid.shape[0]
scores = np.zeros((N, C))
for c in range(C):
for i in range(N):
scores[i, c] = discriminant_qda(X_grid[i], pi[c], mus[c], Sigmas[c])
return np.argmax(scores, axis=1)
# Classify on a grid
preds_qda = predict_qda(grid, pi_qda, mu_qda, Sigma_qda, C)
print("QDA: Each class has its own covariance — decision boundaries are quadratic.")
QDA: Each class has its own covariance — decision boundaries are quadratic.
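As a sanity check on the discriminant above, it can be compared against `scipy.stats.multivariate_normal.logpdf`: the two differ only by the class-independent Gaussian normalizer $-\frac{D}{2}\log(2\pi)$, which drops out of the argmax. A minimal sketch with made-up parameters:

```python
# Check Eq. 9.4 against scipy's logpdf. Parameters are hypothetical.
import numpy as np
from scipy.stats import multivariate_normal

pi_c = 0.45
mu_c = np.array([3.5, 1.0])
Sigma_c = np.array([[1.2, 0.4], [0.4, 0.8]])
x = np.array([2.0, 1.5])

# Hand-rolled discriminant (Eq. 9.4)
diff = x - mu_c
delta = (np.log(pi_c)
         - 0.5 * np.log(np.linalg.det(Sigma_c))
         - 0.5 * diff @ np.linalg.inv(Sigma_c) @ diff)

# scipy's logpdf includes the -D/2 * log(2*pi) normalizer that the
# discriminant drops (it is shared across all classes)
ref = np.log(pi_c) + multivariate_normal(mu_c, Sigma_c).logpdf(x)
print(np.isclose(delta - ref, np.log(2 * np.pi)))  # D=2, so D/2 * log(2*pi) = log(2*pi)
```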
4. Linear Discriminant Analysis (Section 9.2.2)¶
When the covariance is tied (shared) across classes, $\boldsymbol{\Sigma}_c = \boldsymbol{\Sigma}$, the discriminant simplifies. Starting from Eq. 9.5:
$$\log p(y = c | \mathbf{x}) = \log \pi_c - \frac{1}{2}(\mathbf{x} - \boldsymbol{\mu}_c)^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x} - \boldsymbol{\mu}_c) + \text{const}$$
Expanding (Eq. 9.6):
$$= \underbrace{\log \pi_c - \frac{1}{2}\boldsymbol{\mu}_c^\top \boldsymbol{\Sigma}^{-1}\boldsymbol{\mu}_c}_{\gamma_c} + \underbrace{\mathbf{x}^\top \boldsymbol{\Sigma}^{-1}\boldsymbol{\mu}_c}_{\mathbf{x}^\top \boldsymbol{\beta}_c} + \underbrace{\text{const} - \frac{1}{2}\mathbf{x}^\top \boldsymbol{\Sigma}^{-1}\mathbf{x}}_{\kappa}$$
The $\kappa$ term is independent of $c$ and cancels when comparing classes. So the decision boundary between classes $c$ and $c'$ is:
$$\gamma_c + \mathbf{x}^\top \boldsymbol{\beta}_c = \gamma_{c'} + \mathbf{x}^\top \boldsymbol{\beta}_{c'}$$
This is a linear equation in $\mathbf{x}$ — hence linear discriminant analysis.
def discriminant_lda(x, pi, mu, Sigma_inv):
"""
LDA discriminant function (Eq. 9.7):
delta_c(x) = gamma_c + x^T beta_c
where beta_c = Sigma^{-1} mu_c
gamma_c = log(pi_c) - 0.5 * mu_c^T Sigma^{-1} mu_c
"""
beta = Sigma_inv @ mu
gamma = np.log(pi) - 0.5 * mu @ Sigma_inv @ mu
return gamma + x @ beta
def predict_lda(X_grid, pi, mus, Sigma, C):
"""Classify each point by argmax of LDA discriminant."""
Sigma_inv = np.linalg.inv(Sigma)
N = X_grid.shape[0]
scores = np.zeros((N, C))
for c in range(C):
for i in range(N):
scores[i, c] = discriminant_lda(X_grid[i], pi[c], mus[c], Sigma_inv)
return np.argmax(scores, axis=1)
# Classify on the same grid
preds_lda = predict_lda(grid, pi_lda, mu_lda, Sigma_lda, C)
# Show the beta and gamma parameters explicitly
Sigma_inv = np.linalg.inv(Sigma_lda)
print("LDA Parameters (Eq. 9.7)")
print("=" * 50)
for c in range(C):
beta_c = Sigma_inv @ mu_lda[c]
gamma_c = np.log(pi_lda[c]) - 0.5 * mu_lda[c] @ Sigma_inv @ mu_lda[c]
print(f"\n{class_names[c]}:")
print(f" beta_c = [{beta_c[0]:+.3f}, {beta_c[1]:+.3f}]")
print(f" gamma_c = {gamma_c:+.3f}")
print(f" delta_c(x) = {gamma_c:+.3f} + {beta_c[0]:+.3f}*x1 + {beta_c[1]:+.3f}*x2")
LDA Parameters (Eq. 9.7)
==================================================

Adenocarcinoma:
  beta_c = [+3.738, +1.160]
  gamma_c = -8.107
  delta_c(x) = -8.107 + +3.738*x1 + +1.160*x2

Squamous Cell:
  beta_c = [+0.461, +4.369]
  gamma_c = -10.238
  delta_c(x) = -10.238 + +0.461*x1 + +4.369*x2

Large Cell:
  beta_c = [-1.961, +0.405]
  gamma_c = -3.529
  delta_c(x) = -3.529 + -1.961*x1 + +0.405*x2
# Compare QDA vs LDA decision boundaries
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
cmap_bg = ListedColormap(['#BBDEFB', '#FFCDD2', '#C8E6C9']) # Light versions
for ax, preds, title in [(axes[0], preds_qda, 'QDA (Unconstrained Covariances)'),
(axes[1], preds_lda, 'LDA (Tied Covariance)')]:
# Decision regions
ax.contourf(xx, yy, preds.reshape(xx.shape), alpha=0.3, cmap=cmap_bg, levels=[-0.5, 0.5, 1.5, 2.5])
# Decision boundaries
ax.contour(xx, yy, preds.reshape(xx.shape), colors='black', linewidths=1.5, levels=[0.5, 1.5])
# Data points
for c in range(3):
mask = y == c
ax.scatter(X[mask, 0], X[mask, 1], c=colors[c], alpha=0.6, s=25,
label=class_names[c], edgecolors='white', linewidths=0.3)
ax.set_xlabel('log(CEA)', fontsize=12)
ax.set_ylabel('log(CYFRA 21-1)', fontsize=12)
ax.set_title(title, fontsize=14)
ax.legend(fontsize=9, loc='upper left')
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
plt.tight_layout()
plt.show()
print("Left: QDA uses separate covariances — curved decision boundaries")
print("Right: LDA ties covariances — straight decision boundaries")
Left: QDA uses separate covariances — curved decision boundaries
Right: LDA ties covariances — straight decision boundaries
5. Step-by-Step: Deriving the LDA Decision Boundary (Binary Case)¶
To build intuition, let's derive the decision boundary between two classes (e.g., Adenocarcinoma vs Squamous Cell). From Eq. 9.13–9.15, in the binary case the boundary is where:
$$p(y = 1 | \mathbf{x}) = 0.5$$
which gives:
$$\mathbf{w}^\top (\mathbf{x} - \mathbf{x}_0) = 0$$
where the weight vector $\mathbf{w}$ and threshold point $\mathbf{x}_0$ are (Eqs. 9.13–9.14):
$$\mathbf{w} = \boldsymbol{\Sigma}^{-1}(\boldsymbol{\mu}_1 - \boldsymbol{\mu}_0)$$
$$\mathbf{x}_0 = \frac{1}{2}(\boldsymbol{\mu}_1 + \boldsymbol{\mu}_0) - (\boldsymbol{\mu}_1 - \boldsymbol{\mu}_0) \frac{\log(\pi_1/\pi_0)}{(\boldsymbol{\mu}_1 - \boldsymbol{\mu}_0)^\top \boldsymbol{\Sigma}^{-1}(\boldsymbol{\mu}_1 - \boldsymbol{\mu}_0)}$$
# Binary LDA: Adenocarcinoma (class 0) vs Squamous Cell (class 1)
# Use only these two classes to demonstrate the geometry
mask_binary = y < 2
X_bin = X[mask_binary]
y_bin = y[mask_binary]
# Fit tied covariance on just these two classes
pi_bin, mu_bin, Sigma_bin = fit_gda_tied(X_bin, y_bin, 2)
Sigma_bin_inv = np.linalg.inv(Sigma_bin)
# Compute w and x0 (Eqs. 9.13-9.14)
mu_diff = mu_bin[1] - mu_bin[0] # mu_1 - mu_0
w = Sigma_bin_inv @ mu_diff
midpoint = 0.5 * (mu_bin[1] + mu_bin[0])
log_prior_ratio = np.log(pi_bin[1] / pi_bin[0])
mahal_sq = mu_diff @ Sigma_bin_inv @ mu_diff
x0 = midpoint - mu_diff * log_prior_ratio / mahal_sq
print("Binary LDA: Adenocarcinoma vs Squamous Cell")
print("=" * 50)
print(f"\nWeight vector w = Sigma^{{-1}}(mu_1 - mu_0):")
print(f" w = [{w[0]:+.3f}, {w[1]:+.3f}]")
print(f"\nDecision threshold point x0:")
print(f" x0 = [{x0[0]:+.3f}, {x0[1]:+.3f}]")
print(f"\nMidpoint of means: [{midpoint[0]:+.3f}, {midpoint[1]:+.3f}]")
print(f"Log prior ratio log(pi_1/pi_0): {log_prior_ratio:+.3f}")
print(f"\nThe boundary shifts from the midpoint because Adenocarcinoma")
print(f"is more prevalent (pi_0={pi_bin[0]:.2f} > pi_1={pi_bin[1]:.2f}).")
Binary LDA: Adenocarcinoma vs Squamous Cell
==================================================
Weight vector w = Sigma^{-1}(mu_1 - mu_0):
w = [-3.182, +2.871]
Decision threshold point x0:
x0 = [+1.956, +2.667]
Midpoint of means: [+1.998, +2.626]
Log prior ratio log(pi_1/pi_0): -0.251
The boundary shifts from the midpoint because Adenocarcinoma
is more prevalent (pi_0=0.56 > pi_1=0.44).
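A quick check that Eq. 9.14 does what it claims: at $\mathbf{x}_0$ the two class discriminants are equal, so the posterior is exactly 0.5. A self-contained sketch with hypothetical parameters (not the fitted values above):

```python
# Verify that the LDA discriminants are equal at the threshold point x0.
# Parameters are made up for illustration.
import numpy as np

pi0, pi1 = 0.56, 0.44
mu0, mu1 = np.array([3.6, 1.1]), np.array([0.4, 4.2])
Sigma_inv = np.linalg.inv(np.array([[1.0, 0.2], [0.2, 0.9]]))

d = mu1 - mu0
# Threshold point (Eq. 9.14)
x0 = 0.5 * (mu1 + mu0) - d * np.log(pi1 / pi0) / (d @ Sigma_inv @ d)

def delta(x, pi, mu):
    # Tied-covariance LDA discriminant (Eq. 9.7), dropping shared terms
    return np.log(pi) + x @ Sigma_inv @ mu - 0.5 * mu @ Sigma_inv @ mu

print(np.isclose(delta(x0, pi0, mu0), delta(x0, pi1, mu1)))  # equal at the boundary
```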
# Visualize the binary LDA geometry (cf. Figure 9.3 in PML)
fig, ax = plt.subplots(1, 1, figsize=(9, 7))
# Plot data points
for c, name, color in [(0, 'Adenocarcinoma', colors[0]), (1, 'Squamous Cell', colors[1])]:
mask = y_bin == c
ax.scatter(X_bin[mask, 0], X_bin[mask, 1], c=color, alpha=0.4, s=25,
label=name, edgecolors='white', linewidths=0.3)
# Plot class means
ax.plot(*mu_bin[0], 'o', color=colors[0], markersize=14, markeredgecolor='black',
markeredgewidth=2, label='$\\mu_0$ (Adeno)', zorder=5)
ax.plot(*mu_bin[1], 'o', color=colors[1], markersize=14, markeredgecolor='black',
markeredgewidth=2, label='$\\mu_1$ (Squamous)', zorder=5)
# Plot decision boundary: w^T(x - x0) = 0
# This is a line perpendicular to w, passing through x0
# Direction perpendicular to w
w_perp = np.array([-w[1], w[0]])
w_perp = w_perp / np.linalg.norm(w_perp)
t_range = np.linspace(-6, 6, 100)
boundary_line = x0[:, np.newaxis] + w_perp[:, np.newaxis] * t_range
ax.plot(boundary_line[0], boundary_line[1], 'k-', linewidth=2.5, label='Decision boundary')
# Plot the w vector from x0
w_norm = w / np.linalg.norm(w)
ax.annotate('', xy=x0 + 1.2 * w_norm, xytext=x0,
arrowprops=dict(arrowstyle='->', color='black', lw=2.5))
ax.text(x0[0] + 1.3 * w_norm[0] + 0.15, x0[1] + 1.3 * w_norm[1],
'$\\mathbf{w}$', fontsize=16, fontweight='bold')
# Plot x0
ax.plot(*x0, 's', color='gold', markersize=12, markeredgecolor='black',
markeredgewidth=2, label='$\\mathbf{x}_0$ (threshold)', zorder=5)
# Plot midpoint
ax.plot(*midpoint, 'D', color='gray', markersize=8, markeredgecolor='black',
markeredgewidth=1.5, alpha=0.7, label='Midpoint', zorder=5)
# Line connecting means
ax.plot([mu_bin[0][0], mu_bin[1][0]], [mu_bin[0][1], mu_bin[1][1]],
'--', color='gray', linewidth=1, alpha=0.6)
ax.set_xlabel('log(CEA)', fontsize=12)
ax.set_ylabel('log(CYFRA 21-1)', fontsize=12)
ax.set_title('Geometry of Binary LDA (cf. PML Figure 9.3)', fontsize=14)
ax.legend(fontsize=9, loc='lower right')
ax.set_xlim(-3, 7)
ax.set_ylim(-2.5, 7.5)
plt.tight_layout()
plt.show()
print("The weight vector w is perpendicular to the decision boundary.")
print("A new patient is classified by projecting their biomarker vector onto w")
print("and checking which side of x0 the projection falls on.")
The weight vector w is perpendicular to the decision boundary.
A new patient is classified by projecting their biomarker vector onto w
and checking which side of x0 the projection falls on.
6. Connection to Logistic Regression (Section 9.2.3)¶
The LDA posterior has the same functional form as multinomial logistic regression (Eq. 9.8):
$$p(y = c | \mathbf{x}, \boldsymbol{\theta}) = \frac{e^{\boldsymbol{\beta}_c^\top \mathbf{x} + \gamma_c}}{\sum_{c'} e^{\boldsymbol{\beta}_{c'}^\top \mathbf{x} + \gamma_{c'}}} = \frac{e^{\mathbf{w}_c^\top [1, \mathbf{x}]}}{\sum_{c'} e^{\mathbf{w}_{c'}^\top [1, \mathbf{x}]}}$$
where $\mathbf{w}_c = [\gamma_c, \boldsymbol{\beta}_c]$.
Key difference: LDA fits the Gaussians generatively (joint likelihood $p(\mathbf{x}, y)$), then derives $\mathbf{w}$ from $\boldsymbol{\theta}$. Logistic regression fits $\mathbf{w}$ directly by maximizing $p(y | \mathbf{x}, \mathbf{w})$.
In the binary case (Eq. 9.15):
$$p(y = 1 | \mathbf{x}) = \sigma(\mathbf{w}^\top (\mathbf{x} - \mathbf{x}_0))$$
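To make the generative-vs-discriminative contrast concrete, one can fit logistic regression directly and check that its weight direction agrees with the generative one. A self-contained sketch using synthetic data and plain gradient descent (these variables are not the notebook's):

```python
# Compare the LDA-derived weight direction (Eq. 9.13) with weights fit
# discriminatively by logistic regression. Synthetic illustrative data.
import numpy as np

rng = np.random.default_rng(0)
Sigma = np.array([[1.0, 0.3], [0.3, 1.0]])
mu0, mu1 = np.array([-1.0, 0.0]), np.array([1.0, 0.5])
X_demo = np.vstack([rng.multivariate_normal(mu0, Sigma, 500),
                    rng.multivariate_normal(mu1, Sigma, 500)])
y_demo = np.repeat([0.0, 1.0], 500)

# Generative route: w = Sigma^{-1}(mu_1 - mu_0), using the true parameters
w_gen = np.linalg.solve(Sigma, mu1 - mu0)

# Discriminative route: maximize p(y | x, w) by gradient descent
Xb = np.column_stack([np.ones(len(X_demo)), X_demo])  # prepend bias column
w = np.zeros(3)
for _ in range(2000):
    p = 1.0 / (1.0 + np.exp(-Xb @ w))                 # predicted probabilities
    w -= 0.1 * Xb.T @ (p - y_demo) / len(y_demo)      # gradient of the NLL

# The two weight directions should roughly agree
cos = w[1:] @ w_gen / (np.linalg.norm(w[1:]) * np.linalg.norm(w_gen))
print(f"cosine similarity of weight directions: {cos:.3f}")
```

When the Gaussian assumption holds, both routes recover (approximately) the same decision boundary; they differ in how the weights are estimated and in how robust they are when the assumption fails.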
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
# LDA-derived posterior for the binary case (Eq. 9.15)
# p(y=1|x) = sigma(w^T(x - x0))
def lda_posterior_binary(X_pts, w, x0):
return sigmoid(X_pts @ w - w @ x0)
# Compute posterior on the grid
xx_bin, yy_bin = np.meshgrid(np.linspace(-3, 7, 200), np.linspace(-2.5, 7.5, 200))
grid_bin = np.column_stack([xx_bin.ravel(), yy_bin.ravel()])
posterior_squamous = lda_posterior_binary(grid_bin, w, x0).reshape(xx_bin.shape)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: posterior heatmap
im = axes[0].contourf(xx_bin, yy_bin, posterior_squamous, levels=20, cmap='RdBu_r', alpha=0.8)
plt.colorbar(im, ax=axes[0], label=r'$p(y = \mathrm{Squamous} | \mathbf{x})$')
axes[0].contour(xx_bin, yy_bin, posterior_squamous, levels=[0.5], colors='black', linewidths=2)
for c, color in [(0, colors[0]), (1, colors[1])]:
mask = y_bin == c
axes[0].scatter(X_bin[mask, 0], X_bin[mask, 1], c=color, alpha=0.4, s=20,
edgecolors='white', linewidths=0.3)
axes[0].set_xlabel('log(CEA)', fontsize=12)
axes[0].set_ylabel('log(CYFRA 21-1)', fontsize=12)
axes[0].set_title(r'LDA Posterior $p(y = \mathrm{Squamous} | \mathbf{x})$', fontsize=13)
# Right: sigmoid curve along the w direction
# Project all points onto w direction
projections = X_bin @ w - w @ x0
t = np.linspace(-8, 8, 200)
axes[1].plot(t, sigmoid(t), 'k-', linewidth=2, label='$\\sigma(\\mathbf{w}^\\top (\\mathbf{x} - \\mathbf{x}_0))$')
for c, name, color in [(0, 'Adeno', colors[0]), (1, 'Squamous', colors[1])]:
mask = y_bin == c
# Add jitter to y for visibility
jitter = 0.03 * np.random.randn(mask.sum())
axes[1].scatter(projections[mask], y_bin[mask] + jitter, c=color, alpha=0.3, s=15)
axes[1].axvline(0, color='gray', linestyle='--', alpha=0.5)
axes[1].axhline(0.5, color='gray', linestyle='--', alpha=0.5)
axes[1].set_xlabel('Projection $\\mathbf{w}^\\top (\\mathbf{x} - \\mathbf{x}_0)$', fontsize=12)
axes[1].set_ylabel(r'$p(y = \mathrm{Squamous} | \mathbf{x})$', fontsize=12)
axes[1].set_title('Sigmoid Along Discriminant Direction', fontsize=13)
axes[1].legend(fontsize=10)
plt.tight_layout()
plt.show()
print("LDA produces a logistic regression posterior with weights derived")
print("from the Gaussian parameters — same functional form, different fitting procedure.")
LDA produces a logistic regression posterior with weights derived
from the Gaussian parameters — same functional form, different fitting procedure.
7. Nearest Centroid Classifier (Section 9.2.5)¶
With uniform priors ($\pi_c = 1/C$) and tied covariance, the MAP decision rule (Eq. 9.23) simplifies to:
$$\hat{y}(\mathbf{x}) = \underset{c}{\text{argmin}}\; (\mathbf{x} - \boldsymbol{\mu}_c)^\top \boldsymbol{\Sigma}^{-1}(\mathbf{x} - \boldsymbol{\mu}_c)$$
This assigns $\mathbf{x}$ to the class with the closest centroid, measured by Mahalanobis distance.
If additionally $\boldsymbol{\Sigma} \propto \mathbf{I}$ (isotropic), the Mahalanobis distance reduces to Euclidean distance, and the rule simply assigns each point to the nearest class mean — one of the simplest classifiers there is.
def predict_nearest_centroid(X_grid, mus, Sigma_inv, C):
"""Nearest centroid classifier using Mahalanobis distance (Eq. 9.23)."""
N = X_grid.shape[0]
dists = np.zeros((N, C))
for c in range(C):
diff = X_grid - mus[c]
dists[:, c] = np.sum(diff @ Sigma_inv * diff, axis=1)
return np.argmin(dists, axis=1)
def predict_nearest_centroid_euclidean(X_grid, mus, C):
"""Nearest centroid classifier using Euclidean distance."""
N = X_grid.shape[0]
dists = np.zeros((N, C))
for c in range(C):
dists[:, c] = np.sum((X_grid - mus[c])**2, axis=1)
return np.argmin(dists, axis=1)
preds_ncm_mahal = predict_nearest_centroid(grid, mu_lda, np.linalg.inv(Sigma_lda), C)
preds_ncm_euclid = predict_nearest_centroid_euclidean(grid, mu_lda, C)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, preds, title in [(axes[0], preds_ncm_mahal, 'NCM (Mahalanobis Distance)'),
(axes[1], preds_ncm_euclid, 'NCM (Euclidean Distance)')]:
ax.contourf(xx, yy, preds.reshape(xx.shape), alpha=0.3, cmap=cmap_bg, levels=[-0.5, 0.5, 1.5, 2.5])
ax.contour(xx, yy, preds.reshape(xx.shape), colors='black', linewidths=1.5, levels=[0.5, 1.5])
for c in range(3):
mask = y == c
ax.scatter(X[mask, 0], X[mask, 1], c=colors[c], alpha=0.5, s=25,
label=class_names[c], edgecolors='white', linewidths=0.3)
ax.plot(*mu_lda[c], 'X', color=colors[c], markersize=14, markeredgecolor='black',
markeredgewidth=2, zorder=5)
ax.set_xlabel('log(CEA)', fontsize=12)
ax.set_ylabel('log(CYFRA 21-1)', fontsize=12)
ax.set_title(title, fontsize=14)
ax.legend(fontsize=9, loc='upper left')
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
plt.tight_layout()
plt.show()
print("Left: Mahalanobis distance accounts for covariance structure (tilted boundaries).")
print("Right: Euclidean distance assumes isotropic clusters (perpendicular bisectors).")
Left: Mahalanobis distance accounts for covariance structure (tilted boundaries).
Right: Euclidean distance assumes isotropic clusters (perpendicular bisectors).
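The equivalence claimed above is easy to verify numerically: with uniform priors, the tied-covariance LDA rule and the Mahalanobis nearest-centroid rule pick the same class for every query point. A self-contained sketch with made-up centroids and covariance:

```python
# Verify Eq. 9.23: uniform-prior LDA == Mahalanobis nearest centroid.
# Centroids, covariance, and query points are synthetic.
import numpy as np

rng = np.random.default_rng(1)
mus = [np.array([0.0, 0.0]), np.array([3.0, 1.0]), np.array([1.0, 4.0])]
Sigma_inv = np.linalg.inv(np.array([[1.0, 0.4], [0.4, 1.5]]))
Xq = rng.multivariate_normal([1.0, 1.5], 4 * np.eye(2), 200)  # query points

# LDA scores with uniform priors (the common log(1/C) term is dropped)
scores = np.stack([Xq @ (Sigma_inv @ mu) - 0.5 * mu @ Sigma_inv @ mu
                   for mu in mus], axis=1)
pred_lda = scores.argmax(axis=1)

# Mahalanobis distance to each centroid
dists = np.stack([np.sum((Xq - mu) @ Sigma_inv * (Xq - mu), axis=1)
                  for mu in mus], axis=1)
pred_ncm = dists.argmin(axis=1)

print("rules agree on all query points:", bool(np.all(pred_lda == pred_ncm)))
```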
8. Regularized Discriminant Analysis (Section 9.2.4.3)¶
QDA can overfit when $N_c$ is small relative to the feature dimension $D$, because each $\hat{\boldsymbol{\Sigma}}_c$ has $O(D^2)$ parameters. LDA avoids this by tying covariances but may underfit.
Regularized discriminant analysis (RDA) interpolates between them (Eq. 9.22):
$$\hat{\boldsymbol{\Sigma}}_{\text{RDA}} = \lambda \, \text{diag}(\hat{\boldsymbol{\Sigma}}_{\text{MLE}}) + (1 - \lambda) \, \hat{\boldsymbol{\Sigma}}_{\text{MLE}}$$
where $\lambda \in [0, 1]$ controls regularization:
- $\lambda = 0$: full MLE covariance (QDA)
- $\lambda = 1$: diagonal covariance (Naive Bayes assumption)
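The diagonal-shrinkage form of Eq. 9.22 is not implemented in the code below (which focuses on the $\alpha$ interpolation), but it is a one-liner; a minimal sketch:

```python
# Minimal sketch of the diagonal-shrinkage form of RDA (Eq. 9.22):
# interpolate an MLE covariance toward its own diagonal.
import numpy as np

def shrink_to_diagonal(Sigma_mle, lam):
    """lam=0: full MLE covariance (QDA); lam=1: diagonal only (naive Bayes)."""
    return lam * np.diag(np.diag(Sigma_mle)) + (1 - lam) * Sigma_mle

Sigma_mle = np.array([[1.2, 0.4],
                      [0.4, 0.8]])
print(shrink_to_diagonal(Sigma_mle, 0.5))
# off-diagonal entries are halved; diagonal entries are unchanged
```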
We can also interpolate between per-class and tied covariance:
$$\hat{\boldsymbol{\Sigma}}_c^{\text{RDA}} = \alpha \, \hat{\boldsymbol{\Sigma}}_c + (1 - \alpha) \, \hat{\boldsymbol{\Sigma}}_{\text{tied}}$$
- $\alpha = 1$: QDA (separate covariances)
- $\alpha = 0$: LDA (tied covariance)
def fit_rda(X, y, C, alpha=0.5):
"""
Regularized Discriminant Analysis.
Interpolates between QDA (alpha=1) and LDA (alpha=0).
"""
# Fit QDA and LDA components
pi_hat, mu_hat, Sigma_hat = fit_gda(X, y, C)
_, _, Sigma_tied = fit_gda_tied(X, y, C)
# Interpolate
Sigma_rda = []
for c in range(C):
Sigma_c = alpha * Sigma_hat[c] + (1 - alpha) * Sigma_tied
Sigma_rda.append(Sigma_c)
return pi_hat, mu_hat, Sigma_rda
# Show the spectrum from LDA to QDA
alphas = [0.0, 0.33, 0.67, 1.0]
labels_alpha = ['$\\alpha=0$ (LDA)', '$\\alpha=0.33$', '$\\alpha=0.67$', '$\\alpha=1$ (QDA)']
fig, axes = plt.subplots(1, 4, figsize=(20, 4.5))
for ax, alpha, label in zip(axes, alphas, labels_alpha):
pi_r, mu_r, Sigma_r = fit_rda(X, y, C, alpha=alpha)
preds_r = predict_qda(grid, pi_r, mu_r, Sigma_r, C)
ax.contourf(xx, yy, preds_r.reshape(xx.shape), alpha=0.3, cmap=cmap_bg, levels=[-0.5, 0.5, 1.5, 2.5])
ax.contour(xx, yy, preds_r.reshape(xx.shape), colors='black', linewidths=1.5, levels=[0.5, 1.5])
for c in range(3):
mask = y == c
ax.scatter(X[mask, 0], X[mask, 1], c=colors[c], alpha=0.4, s=15,
edgecolors='white', linewidths=0.3)
ax.set_xlabel('log(CEA)', fontsize=11)
if ax == axes[0]:
ax.set_ylabel('log(CYFRA 21-1)', fontsize=11)
ax.set_title(label, fontsize=13)
ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
plt.suptitle('Regularized Discriminant Analysis: LDA ↔ QDA Spectrum', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
print("As alpha increases from 0 to 1, boundaries transition from linear to quadratic.")
print("RDA lets us tune the bias-variance tradeoff via cross-validation.")
As alpha increases from 0 to 1, boundaries transition from linear to quadratic.
RDA lets us tune the bias-variance tradeoff via cross-validation.
9. Fisher's Linear Discriminant Analysis (Section 9.2.6)¶
FLDA takes a dimensionality reduction perspective: find the projection $\mathbf{w}$ that maximizes class separation. The objective (Eq. 9.30) is the Fisher criterion:
$$J(\mathbf{w}) = \frac{\mathbf{w}^\top \mathbf{S}_B \mathbf{w}}{\mathbf{w}^\top \mathbf{S}_W \mathbf{w}}$$
where:
- $\mathbf{S}_B = (\boldsymbol{\mu}_2 - \boldsymbol{\mu}_1)(\boldsymbol{\mu}_2 - \boldsymbol{\mu}_1)^\top$ is the between-class scatter (Eq. 9.31)
- $\mathbf{S}_W = \sum_{n:y_n=1}(\mathbf{x}_n - \boldsymbol{\mu}_1)(\mathbf{x}_n - \boldsymbol{\mu}_1)^\top + \sum_{n:y_n=2}(\mathbf{x}_n - \boldsymbol{\mu}_2)(\mathbf{x}_n - \boldsymbol{\mu}_2)^\top$ is the within-class scatter (Eq. 9.32)
The optimal solution (Eq. 9.42):
$$\mathbf{w} = \mathbf{S}_W^{-1}(\boldsymbol{\mu}_2 - \boldsymbol{\mu}_1)$$
This maximizes the ratio of between-class variance to within-class variance in the projected space. Compare this to PCA, which maximizes total variance without considering class labels.
# Fisher's LDA: compare projection onto FLDA vs PCA direction
# Using the binary case (Adeno vs Squamous)
# Compute within-class scatter matrix S_W (Eq. 9.32)
S_W = np.zeros((2, 2))
for c in range(2):
mask = y_bin == c
diff = X_bin[mask] - mu_bin[c]
S_W += diff.T @ diff
# Between-class scatter S_B (Eq. 9.31)
mu_diff_fisher = mu_bin[1] - mu_bin[0]
S_B = np.outer(mu_diff_fisher, mu_diff_fisher)
# FLDA direction (Eq. 9.42): w = S_W^{-1} (mu_2 - mu_1)
w_flda = np.linalg.inv(S_W) @ mu_diff_fisher
w_flda = w_flda / np.linalg.norm(w_flda) # Normalize
# PCA direction: leading eigenvector of total covariance
Sigma_total = np.cov(X_bin.T)
eigvals, eigvecs = np.linalg.eigh(Sigma_total)
w_pca = eigvecs[:, -1] # Largest eigenvalue
# Fisher criterion for both directions
def fisher_criterion(w, S_B, S_W):
return (w @ S_B @ w) / (w @ S_W @ w)
J_flda = fisher_criterion(w_flda, S_B, S_W)
J_pca = fisher_criterion(w_pca, S_B, S_W)
print("Fisher's LDA vs PCA")
print("=" * 50)
print(f"\nFLDA direction: [{w_flda[0]:+.3f}, {w_flda[1]:+.3f}]")
print(f"PCA direction: [{w_pca[0]:+.3f}, {w_pca[1]:+.3f}]")
print(f"\nFisher criterion J(w):")
print(f" FLDA: {J_flda:.3f}")
print(f" PCA: {J_pca:.3f}")
print(f"\nFisher criterion ratio J_FLDA / J_PCA: {J_flda/J_pca:.2f}")
print("For this data the two directions nearly coincide, so PCA happens")
print("to separate the classes almost as well as FLDA.")
Fisher's LDA vs PCA
==================================================

FLDA direction: [-0.742, +0.670]
PCA direction:  [-0.710, +0.704]

Fisher criterion J(w):
  FLDA: 0.078
  PCA:  0.078

Fisher criterion ratio J_FLDA / J_PCA: 1.00
For this data the two directions nearly coincide, so PCA happens
to separate the classes almost as well as FLDA.
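A related identity: with two classes, $\mathbf{S}_W$ is $N$ times the tied MLE covariance, so the FLDA direction $\mathbf{S}_W^{-1}(\boldsymbol{\mu}_1 - \boldsymbol{\mu}_0)$ is parallel to the binary LDA weight $\mathbf{w} = \boldsymbol{\Sigma}^{-1}(\boldsymbol{\mu}_1 - \boldsymbol{\mu}_0)$ from Section 5. A sketch with synthetic stand-in data:

```python
# For two classes, S_W = N * Sigma_tied (MLE), so the FLDA and binary
# LDA weight directions coincide. Synthetic data for illustration.
import numpy as np

rng = np.random.default_rng(2)
Sigma_true = np.array([[1.0, 0.3], [0.3, 0.8]])
X0 = rng.multivariate_normal([0.0, 0.0], Sigma_true, 150)
X1 = rng.multivariate_normal([2.0, 1.0], Sigma_true, 150)
m0, m1 = X0.mean(axis=0), X1.mean(axis=0)

# Within-class scatter and the tied MLE covariance differ only by a factor of N
S_W = (X0 - m0).T @ (X0 - m0) + (X1 - m1).T @ (X1 - m1)
Sigma_tied = S_W / (len(X0) + len(X1))

w_flda = np.linalg.solve(S_W, m1 - m0)        # Eq. 9.42
w_lda = np.linalg.solve(Sigma_tied, m1 - m0)  # Eq. 9.13

cos = w_flda @ w_lda / (np.linalg.norm(w_flda) * np.linalg.norm(w_lda))
print(f"{cos:.6f}")  # the directions coincide
```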
# Visualize: PCA vs FLDA projections (cf. Figure 9.4 in PML)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
directions = [(w_pca, 'PCA'), (w_flda, 'FLDA')]
for col, (w_dir, name) in enumerate(directions):
ax_scatter = axes[0, col]
ax_hist = axes[1, col]
# Top: scatter with projection direction
for c, cname, color in [(0, 'Adeno', colors[0]), (1, 'Squamous', colors[1])]:
mask = y_bin == c
ax_scatter.scatter(X_bin[mask, 0], X_bin[mask, 1], c=color, alpha=0.4, s=20,
label=cname, edgecolors='white', linewidths=0.3)
# Draw projection direction through the data center
center = X_bin.mean(axis=0)
t = np.linspace(-6, 6, 100)
line = center[:, np.newaxis] + w_dir[:, np.newaxis] * t
ax_scatter.plot(line[0], line[1], 'k--', linewidth=2, alpha=0.7, label=f'{name} direction')
ax_scatter.set_xlabel('log(CEA)', fontsize=12)
ax_scatter.set_ylabel('log(CYFRA 21-1)', fontsize=12)
ax_scatter.set_title(f'{name} Direction', fontsize=14)
ax_scatter.legend(fontsize=9)
ax_scatter.set_xlim(-3, 7)
ax_scatter.set_ylim(-2.5, 7.5)
# Bottom: histogram of projections
for c, cname, color in [(0, 'Adeno', colors[0]), (1, 'Squamous', colors[1])]:
mask = y_bin == c
projections = X_bin[mask] @ w_dir
ax_hist.hist(projections, bins=20, alpha=0.6, color=color, label=cname, density=True)
ax_hist.set_xlabel(f'Projection onto {name} direction', fontsize=12)
ax_hist.set_ylabel('Density', fontsize=12)
ax_hist.set_title(f'Projected Distributions ({name})', fontsize=14)
ax_hist.legend(fontsize=10)
plt.tight_layout()
plt.show()
print("Top row: the dashed line shows the direction each method projects onto.")
print("Bottom row: FLDA maximizes the Fisher criterion, though for this data")
print("the PCA projection separates the classes almost as well.")
print("\nPCA captures maximum variance (spread) without using class labels,")
print("so classes may overlap. FLDA explicitly maximizes the between-class /")
print("within-class variance ratio.")
Top row: the dashed line shows the direction each method projects onto.
Bottom row: FLDA maximizes the Fisher criterion, though for this data
the PCA projection separates the classes almost as well.

PCA captures maximum variance (spread) without using class labels,
so classes may overlap. FLDA explicitly maximizes the between-class /
within-class variance ratio.
10. Comparing All Methods¶
Let's evaluate classification accuracy on held-out test data.
# Generate a test set from the same true distribution
N_test = 500
n1_t = int(N_test * true_pi[0])
n2_t = int(N_test * true_pi[1])
n3_t = N_test - n1_t - n2_t
X_test = np.vstack([
np.random.multivariate_normal(mu1, Sigma1, n1_t),
np.random.multivariate_normal(mu2, Sigma2, n2_t),
np.random.multivariate_normal(mu3, Sigma3, n3_t)
])
y_test = np.concatenate([np.zeros(n1_t), np.ones(n2_t), 2*np.ones(n3_t)]).astype(int)
# Evaluate each method
pi_rda, mu_rda, Sigma_rda = fit_rda(X, y, C, alpha=0.5)  # fit RDA once, reuse below
methods = [
    ('QDA', predict_qda(X_test, pi_qda, mu_qda, Sigma_qda, C)),
    ('LDA', predict_lda(X_test, pi_lda, mu_lda, Sigma_lda, C)),
    ('RDA (alpha=0.5)', predict_qda(X_test, pi_rda, mu_rda, Sigma_rda, C)),
    ('NCM (Mahalanobis)', predict_nearest_centroid(X_test, mu_lda, np.linalg.inv(Sigma_lda), C)),
    ('NCM (Euclidean)', predict_nearest_centroid_euclidean(X_test, mu_lda, C)),
]
print("=" * 60)
print("CLASSIFICATION ACCURACY ON TEST SET")
print("=" * 60)
print(f"{'Method':25s} {'Accuracy':>10s}")
print("-" * 37)
for name, preds in methods:
acc = (preds == y_test).mean()
print(f"{name:25s} {acc:10.1%}")
print("\nWith these well-separated classes, QDA, LDA, and RDA score almost")
print("identically here. In high dimensions with limited data, LDA or RDA")
print("would likely win due to the bias-variance tradeoff.")
============================================================
CLASSIFICATION ACCURACY ON TEST SET
============================================================
Method                      Accuracy
-------------------------------------
QDA                            98.0%
LDA                            98.0%
RDA (alpha=0.5)                98.0%
NCM (Mahalanobis)              97.4%
NCM (Euclidean)                97.6%

With these well-separated classes, QDA, LDA, and RDA score almost
identically here. In high dimensions with limited data, LDA or RDA
would likely win due to the bias-variance tradeoff.
Summary¶
Key Takeaways¶
- Gaussian Discriminant Analysis (GDA) is a generative classifier: model $p(\mathbf{x} | y = c)$ as a Gaussian, then use Bayes' rule to get $p(y | \mathbf{x})$
- QDA (unconstrained $\boldsymbol{\Sigma}_c$) gives quadratic decision boundaries — more flexible but more parameters to estimate
- LDA (tied $\boldsymbol{\Sigma}$) gives linear decision boundaries — fewer parameters, same form as logistic regression
- RDA interpolates between QDA and LDA via regularization parameter $\alpha$
- Fisher's LDA finds the projection direction that maximizes class separation (between-class / within-class scatter)
- Nearest Centroid Classifier classifies by Mahalanobis distance to the nearest class mean
Key Formulas from Chapter 9.2¶
| Concept | Formula |
|---|---|
| GDA model | $p(\mathbf{x} \| y=c) = \mathcal{N}(\mathbf{x} \| \boldsymbol{\mu}_c, \boldsymbol{\Sigma}_c)$ |
| QDA discriminant | $\delta_c(\mathbf{x}) = \log \pi_c - \frac{1}{2}\log\|\boldsymbol{\Sigma}_c\| - \frac{1}{2}(\mathbf{x}-\boldsymbol{\mu}_c)^\top \boldsymbol{\Sigma}_c^{-1}(\mathbf{x}-\boldsymbol{\mu}_c)$ |
| LDA discriminant | $\delta_c(\mathbf{x}) = \gamma_c + \mathbf{x}^\top \boldsymbol{\beta}_c$ |
| Binary LDA weight | $\mathbf{w} = \boldsymbol{\Sigma}^{-1}(\boldsymbol{\mu}_1 - \boldsymbol{\mu}_0)$ |
| Fisher criterion | $J(\mathbf{w}) = \frac{\mathbf{w}^\top \mathbf{S}_B \mathbf{w}}{\mathbf{w}^\top \mathbf{S}_W \mathbf{w}}$ |
| RDA covariance | $\hat{\boldsymbol{\Sigma}}_c = \alpha \hat{\boldsymbol{\Sigma}}_c^{\text{MLE}} + (1-\alpha)\hat{\boldsymbol{\Sigma}}_{\text{tied}}$ |