Lasso Regression¶
Real-World Scenario: Identifying Biomarkers for Antibiotic Resistance¶
In precision medicine, predicting antibiotic resistance from bacterial whole-genome expression profiles is critical for choosing effective treatments. With thousands of genes measured but only dozens of bacterial isolates, we need a method that selects a small subset of truly relevant genes — the resistance biomarkers — while ignoring the vast majority of irrelevant ones.
Lasso regression (ℓ₁ regularization) achieves this by placing a Laplace prior on the weights, which drives most coefficients to exactly zero. This notebook implements the key ideas from PML Chapter 11.4:
- MAP estimation with a Laplace prior (ℓ₁ penalty)
- Why ℓ₁ regularization yields sparse solutions (geometry)
- Soft thresholding vs hard thresholding
- Coordinate descent (the shooting algorithm)
- The regularization path
- Comparison of OLS, Ridge, Lasso, and Best Subset
- Variable selection consistency and debiasing
- Elastic net (combining ℓ₁ and ℓ₂)
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.patches import FancyArrowPatch
from itertools import combinations
np.random.seed(42)
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams['font.family'] = 'DejaVu Sans'
Key Formulas from PML Chapter 11.4¶
Laplace prior on the weights (Eq. 11.73):
$$p(\mathbf{w} \mid \lambda) = \prod_{d=1}^{D} \text{Laplace}(w_d \mid 0, 1/\lambda) \propto \prod_{d=1}^{D} e^{-\lambda |w_d|}$$
Lasso objective — MAP estimate minimizes (Eq. 11.75):
$$\text{PNLL}(\mathbf{w}) = \|\mathbf{X}\mathbf{w} - \mathbf{y}\|_2^2 + \lambda \|\mathbf{w}\|_1$$
where $\|\mathbf{w}\|_1 = \sum_{d=1}^{D} |w_d|$ is the ℓ₁ norm.
Soft thresholding solution for coordinate descent (Eq. 11.88–11.89):
$$\hat{w}_d = \text{SoftThreshold}\!\left(\frac{c_d}{a_d},\, \frac{\lambda}{a_d}\right), \quad \text{SoftThreshold}(x, \delta) = \text{sign}(x)(|x| - \delta)_+$$
where $a_d = \sum_n x_{nd}^2$ and $c_d = \sum_n x_{nd}(y_n - \mathbf{w}_{-d}^\top \mathbf{x}_{n,-d})$.
Elastic net objective (Eq. 11.101):
$$L(\mathbf{w}) = \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2 + \lambda_2 \|\mathbf{w}\|_2^2 + \lambda_1 \|\mathbf{w}\|_1$$
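As a quick numeric sanity check of the soft-thresholding operator defined above (a minimal sketch; the notebook re-implements this operator in Section 4):

```python
import numpy as np

def soft_threshold(x, delta):
    # SoftThreshold(x, δ) = sign(x)·(|x| − δ)₊ : anything in [−δ, δ] maps to 0
    return np.sign(x) * np.maximum(np.abs(x) - delta, 0.0)

vals = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(soft_threshold(vals, 1.0))  # inputs within [-1, 1] are zeroed, rest shrunk by 1
```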
1. Generate Synthetic Antibiotic Resistance Data¶
We simulate a scenario where bacterial gene expression profiles predict minimum inhibitory concentration (MIC) of an antibiotic. Only a handful of genes (efflux pumps, β-lactamases, porin mutations) truly drive resistance — the rest are irrelevant.
N, D = 80, 50 # 80 bacterial isolates, 50 gene expression features
sigma_noise = 1.5
# True weights: only 6 out of 50 genes drive resistance
w_true = np.zeros(D)
w_true[0] = 4.0 # mecA — methicillin resistance
w_true[3] = -2.5 # ompF — porin downregulation
w_true[7] = 3.0 # blaZ — β-lactamase
w_true[12] = -1.5 # acrB — efflux pump
w_true[20] = 2.0 # vanA — vancomycin resistance
w_true[35] = -1.0 # tetM — tetracycline resistance
# Gene expression data (standardized)
X = np.random.randn(N, D)
y = X @ w_true + np.random.randn(N) * sigma_noise
# Hold-out test set
N_test = 200
X_test = np.random.randn(N_test, D)
y_test = X_test @ w_true + np.random.randn(N_test) * sigma_noise
# Center y (intercept handling)
y_mean = y.mean()
y_c = y - y_mean
y_test_c = y_test - y_mean
gene_names = (
['mecA', 'gyrA', 'rpoB', 'ompF', 'folA', 'dfrA', 'sul1', 'blaZ',
'ermB', 'mphA', 'catA', 'floR', 'acrB', 'tolC', 'marA', 'soxS',
'ramA', 'parC', 'parE', 'gyrB', 'vanA', 'vanB', 'pbp2a', 'murA',
'ddl', 'fusA', 'rpsL', 'rrs', 'inhA', 'katG', 'embB', 'pncA',
'rpoC', 'fabI', 'mupA', 'tetM', 'tetK', 'msrA', 'linA', 'cfr',
'optrA', 'qnrA', 'qnrB', 'fosA', 'mcr1', 'arnT', 'pmrA', 'pmrB',
'mgrB', 'lpxC']
)
true_support = np.where(w_true != 0)[0]
print(f"Data: N={N} isolates, D={D} genes")
print(f"True resistance genes ({len(true_support)} / {D}):")
for idx in true_support:
print(f" {gene_names[idx]:>6s} (w = {w_true[idx]:+.1f})")
print(f"\nNoise σ = {sigma_noise}, Bayes-optimal test MSE = σ² = {sigma_noise**2:.2f}")
Data: N=80 isolates, D=50 genes
True resistance genes (6 / 50):
mecA (w = +4.0)
ompF (w = -2.5)
blaZ (w = +3.0)
acrB (w = -1.5)
vanA (w = +2.0)
tetM (w = -1.0)
Noise σ = 1.5, Bayes-optimal test MSE = σ² = 2.25
2. Laplace vs Gaussian Prior (Section 11.4.1)¶
The key insight behind lasso is the Laplace prior, which concentrates more mass at zero and has heavier tails than the Gaussian. This encourages exact sparsity: many weights are driven to exactly zero.
w_grid = np.linspace(-5, 5, 500)
# Match variance: Laplace(0, b) has var = 2b², Gaussian(0, σ²) has var = σ²
b_laplace = 1.0
sigma_gauss = np.sqrt(2) * b_laplace # same variance
pdf_laplace = (1 / (2 * b_laplace)) * np.exp(-np.abs(w_grid) / b_laplace)
pdf_gauss = (1 / (sigma_gauss * np.sqrt(2 * np.pi))) * np.exp(-w_grid**2 / (2 * sigma_gauss**2))
fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
# PDF comparison
ax = axes[0]
ax.plot(w_grid, pdf_laplace, 'C3', lw=2.5, label=f'Laplace(0, b={b_laplace})')
ax.plot(w_grid, pdf_gauss, 'C0', lw=2.5, ls='--', label=f'Gaussian(0, σ²={2*b_laplace**2:.1f})')
ax.fill_between(w_grid, pdf_laplace, alpha=0.15, color='C3')
ax.set_xlabel('w')
ax.set_ylabel('p(w)')
ax.set_title('Laplace vs Gaussian Prior (same variance)')
ax.legend(fontsize=10)
ax.set_xlim(-5, 5)
# Log-PDF comparison (shows tail behavior)
ax = axes[1]
ax.plot(w_grid, np.log(pdf_laplace), 'C3', lw=2.5, label='log Laplace')
ax.plot(w_grid, np.log(pdf_gauss), 'C0', lw=2.5, ls='--', label='log Gaussian')
ax.set_xlabel('w')
ax.set_ylabel('log p(w)')
ax.set_title('Log-scale: Laplace has sharp peak + heavier tails')
ax.legend(fontsize=10)
ax.set_xlim(-5, 5)
plt.tight_layout()
plt.show()
print("Key observations:")
print("• Laplace has a sharp peak at 0 → more prior mass on w=0 → encourages sparsity")
print("• Laplace has heavier tails → large weights are less penalized than with Gaussian")
print("• Log Laplace is V-shaped (|w| penalty) vs parabolic (w² penalty) for Gaussian")
Key observations:
• Laplace has a sharp peak at 0 → more prior mass on w=0 → encourages sparsity
• Laplace has heavier tails → large weights are less penalized than with Gaussian
• Log Laplace is V-shaped (|w| penalty) vs parabolic (w² penalty) for Gaussian
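One way to make "more prior mass near zero" concrete is to sample from the two variance-matched priors and compare the probability of landing in a small window around zero (a minimal Monte Carlo sketch; for the Laplace the exact value is $1 - e^{-\epsilon/b}$):

```python
import numpy as np

rng = np.random.default_rng(0)
b = 1.0                     # Laplace scale
sigma = np.sqrt(2) * b      # Gaussian std matching the Laplace variance 2b²
n = 200_000
lap = rng.laplace(0.0, b, size=n)
gau = rng.normal(0.0, sigma, size=n)

eps = 0.1
p_lap = np.mean(np.abs(lap) < eps)   # exact: 1 - exp(-eps/b) ≈ 0.095
p_gau = np.mean(np.abs(gau) < eps)   # ≈ 0.056
print(f"P(|w| < {eps}): Laplace {p_lap:.3f} vs Gaussian {p_gau:.3f}")
```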
3. Why ℓ₁ Yields Sparse Solutions — Geometry (Section 11.4.2)¶
The lasso objective can be written as a constrained optimization problem (Eq. 11.79):
$$\min_{\mathbf{w}} \text{NLL}(\mathbf{w}) \quad \text{s.t.} \quad \|\mathbf{w}\|_1 \leq B$$
The ℓ₁ ball is a diamond with corners on the coordinate axes. As the NLL contours (ellipses) expand outward from the unconstrained optimum, they typically first touch the diamond at one of its corners, which corresponds to a sparse solution where some $w_d = 0$. The ℓ₂ ball (circle) has no corners, so the point of tangency can fall anywhere on its boundary and is generically dense.
fig, axes = plt.subplots(1, 2, figsize=(13, 5.5))
# Create NLL contours (elliptical)
w1_grid = np.linspace(-3, 3, 300)
w2_grid = np.linspace(-3, 3, 300)
W1, W2 = np.meshgrid(w1_grid, w2_grid)
# NLL: (w - w_ols)^T H (w - w_ols), with tilted ellipse
w_ols = np.array([1.8, 1.2])
H = np.array([[2.0, 1.2], [1.2, 1.5]]) # positive definite
dW = np.stack([W1 - w_ols[0], W2 - w_ols[1]], axis=-1)
NLL = np.einsum('...i,ij,...j', dW, H, dW)
titles = ['ℓ₁ constraint (Lasso)', 'ℓ₂ constraint (Ridge)']
B_vals = [1.2, 1.2]
for ax_idx, (ax, title, B) in enumerate(zip(axes, titles, B_vals)):
# NLL contours
levels = [0.5, 2, 5, 10, 18]
cs = ax.contour(W1, W2, NLL, levels=levels, colors='C0', linewidths=1.2, alpha=0.7)
    if ax_idx == 0:  # L1 diamond, drawn from its four corners
        corners = np.array([[B, 0], [0, B], [-B, 0], [0, -B], [B, 0]])
        ax.fill(corners[:, 0], corners[:, 1], alpha=0.2, color='C3')
        ax.plot(corners[:, 0], corners[:, 1], 'C3', lw=2.5)
# Optimal point is at corner (sparse!)
# Find where the NLL contour touches the diamond
# For this geometry, it touches at (B, 0)
opt = np.array([B, 0])
ax.plot(*opt, 'C3', marker='*', ms=18, zorder=5)
ax.annotate(f'ŵ = ({opt[0]:.1f}, {opt[1]:.1f})\n(sparse: w₂ = 0)',
xy=opt, xytext=(opt[0]+0.3, opt[1]+0.7),
fontsize=10, color='C3', fontweight='bold',
arrowprops=dict(arrowstyle='->', color='C3'))
else: # L2 circle
theta = np.linspace(0, 2 * np.pi, 200)
ax.fill(B * np.cos(theta), B * np.sin(theta), alpha=0.2, color='C0')
ax.plot(B * np.cos(theta), B * np.sin(theta), 'C0', lw=2.5)
        # Optimal point on circle (not sparse): numerically minimize
        # the NLL over the circle boundary to find the tangency point
        from scipy.optimize import minimize_scalar
def nll_on_circle(angle):
w = B * np.array([np.cos(angle), np.sin(angle)])
d = w - w_ols
return d @ H @ d
res = minimize_scalar(nll_on_circle, bounds=(0, np.pi/2), method='bounded')
opt = B * np.array([np.cos(res.x), np.sin(res.x)])
ax.plot(*opt, 'C0', marker='*', ms=18, zorder=5)
ax.annotate(f'ŵ = ({opt[0]:.2f}, {opt[1]:.2f})\n(dense: both w ≠ 0)',
xy=opt, xytext=(opt[0]+0.3, opt[1]+0.7),
fontsize=10, color='C0', fontweight='bold',
arrowprops=dict(arrowstyle='->', color='C0'))
ax.plot(*w_ols, 'k+', ms=14, mew=2.5, zorder=5)
ax.annotate('ŵ_OLS', xy=w_ols, xytext=(w_ols[0]+0.15, w_ols[1]+0.15), fontsize=10)
ax.set_xlabel('w₁', fontsize=12)
ax.set_ylabel('w₂', fontsize=12)
ax.set_title(title, fontsize=13)
ax.set_xlim(-2.5, 3)
ax.set_ylim(-2, 2.5)
ax.axhline(0, color='gray', lw=0.5)
ax.axvline(0, color='gray', lw=0.5)
ax.set_aspect('equal')
plt.tight_layout()
plt.show()
print("The ℓ₁ diamond's corners 'stick out' along axes → NLL contours hit corners first → sparse solutions")
print("The ℓ₂ circle is smooth everywhere → no preference for axes → dense solutions")
The ℓ₁ diamond's corners 'stick out' along axes → NLL contours hit corners first → sparse solutions
The ℓ₂ circle is smooth everywhere → no preference for axes → dense solutions
4. Soft Thresholding vs Hard Thresholding (Section 11.4.3)¶
For the lasso with orthonormal features ($\mathbf{X}^\top\mathbf{X} = \mathbf{I}$), the per-coordinate solution is (Eq. 11.87):
$$\hat{w}_d^{\text{lasso}} = \text{SoftThreshold}(\hat{w}_d^{\text{MLE}},\, \lambda) = \text{sign}(\hat{w}_d^{\text{MLE}})\left(|\hat{w}_d^{\text{MLE}}| - \lambda\right)_+$$
This zeros out small coefficients and shrinks the large ones toward zero (biased). Compare with:
- Ridge: $\hat{w}_d^{\text{ridge}} = \hat{w}_d^{\text{MLE}} / (1 + \lambda)$ — shrinks proportionally, never zeros
- Hard thresholding (subset selection): zeros out small coefficients, but doesn't shrink the kept ones
def soft_threshold(x, delta):
"""SoftThreshold(x, δ) = sign(x)(|x| - δ)₊"""
return np.sign(x) * np.maximum(np.abs(x) - delta, 0)
def hard_threshold(x, delta):
"""HardThreshold: zero out if |x| ≤ δ, keep otherwise."""
return x * (np.abs(x) > delta)
c_grid = np.linspace(-5, 5, 500)
lam = 1.5
fig, axes = plt.subplots(1, 3, figsize=(15, 4.5))
# Soft thresholding (Lasso)
ax = axes[0]
ax.plot(c_grid, c_grid, 'k--', lw=1, alpha=0.4, label='OLS (no penalty)')
ax.plot(c_grid, soft_threshold(c_grid, lam), 'C3', lw=2.5, label=f'Soft threshold (λ={lam})')
ax.axhspan(-0.05, 0.05, xmin=0.2, xmax=0.8, alpha=0.1, color='C3')
ax.axvline(-lam, color='gray', ls=':', lw=1, alpha=0.5)
ax.axvline(lam, color='gray', ls=':', lw=1, alpha=0.5)
ax.set_xlabel('c_d (residual correlation)', fontsize=11)
ax.set_ylabel('ŵ_d', fontsize=11)
ax.set_title('Soft Thresholding (Lasso)', fontsize=12)
ax.legend(fontsize=9)
ax.text(0, -0.8, f'[−λ, λ] = [{-lam}, {lam}]', ha='center', fontsize=9, color='gray')
# Hard thresholding (Subset selection)
ax = axes[1]
ax.plot(c_grid, c_grid, 'k--', lw=1, alpha=0.4, label='OLS (no penalty)')
ax.plot(c_grid, hard_threshold(c_grid, lam), 'C2', lw=2.5, label=f'Hard threshold (λ={lam})')
ax.axvline(-lam, color='gray', ls=':', lw=1, alpha=0.5)
ax.axvline(lam, color='gray', ls=':', lw=1, alpha=0.5)
ax.set_xlabel('c_d (residual correlation)', fontsize=11)
ax.set_ylabel('ŵ_d', fontsize=11)
ax.set_title('Hard Thresholding (Subset Selection)', fontsize=12)
ax.legend(fontsize=9)
# Ridge (proportional shrinkage)
ax = axes[2]
ax.plot(c_grid, c_grid, 'k--', lw=1, alpha=0.4, label='OLS (no penalty)')
ax.plot(c_grid, c_grid / (1 + lam), 'C0', lw=2.5, label=f'Ridge (λ={lam})')
ax.set_xlabel('c_d (residual correlation)', fontsize=11)
ax.set_ylabel('ŵ_d', fontsize=11)
ax.set_title('Proportional Shrinkage (Ridge)', fontsize=12)
ax.legend(fontsize=9)
for ax in axes:
ax.axhline(0, color='gray', lw=0.5)
ax.axvline(0, color='gray', lw=0.5)
ax.set_xlim(-5, 5)
ax.set_ylim(-5, 5)
plt.tight_layout()
plt.show()
print("Soft thresholding (Lasso): zeros out |c_d| ≤ λ AND shrinks the survivors → biased")
print("Hard thresholding (Subset): zeros out |c_d| ≤ λ, keeps survivors unchanged → unbiased but discontinuous")
print("Ridge: shrinks all coefficients by factor 1/(1+λ), NEVER zeros any → dense")
Soft thresholding (Lasso): zeros out |c_d| ≤ λ AND shrinks the survivors → biased
Hard thresholding (Subset): zeros out |c_d| ≤ λ, keeps survivors unchanged → unbiased but discontinuous
Ridge: shrinks all coefficients by factor 1/(1+λ), NEVER zeros any → dense
5. Coordinate Descent — The Shooting Algorithm (Section 11.4.9.1)¶
The lasso objective is non-smooth (the ℓ₁ norm has a kink at zero), so we can't use standard gradient descent. However, we can optimize one coordinate at a time analytically using soft thresholding (Eq. 11.87).
Algorithm 11.1 (Coordinate descent for lasso):
- Initialize $\mathbf{w}$ (e.g., ridge solution)
- For each coordinate $d = 1, \ldots, D$:
- Compute $a_d = \sum_n x_{nd}^2$ and $c_d = \sum_n x_{nd}(y_n - \mathbf{w}_{-d}^\top \mathbf{x}_{n,-d})$
- Update $w_d = \text{SoftThreshold}(c_d / a_d,\, \lambda / a_d)$
- Repeat until convergence
Building intuition: $a_d$, $c_d$, $\mathbf{r}_{-d}$ and the subgradient¶
The lasso objective is $L(\mathbf{w}) = \text{NLL}(\mathbf{w}) + \lambda\|\mathbf{w}\|_1$. To derive the coordinate-wise update, we take the gradient of the smooth NLL part w.r.t. a single coordinate $w_d$ (Eq. 11.81):
$$\frac{\partial}{\partial w_d} \text{NLL}(\mathbf{w}) = a_d w_d - c_d$$
where:
- $a_d = \sum_{n=1}^{N} x_{nd}^2 = \|\mathbf{x}_{:,d}\|_2^2$ — the squared norm of feature $d$'s column (Eq. 11.82). This is a fixed scaling factor.
- $\mathbf{r}_{-d} = \mathbf{y} - \mathbf{X}_{:,-d}\mathbf{w}_{-d}$ — the residual from predicting $\mathbf{y}$ using all features except $d$
- $c_d = \sum_{n=1}^{N} x_{nd}(y_n - \mathbf{w}_{-d}^\top \mathbf{x}_{n,-d}) = \mathbf{x}_{:,d}^\top \mathbf{r}_{-d}$ — the correlation of feature $d$ with that residual (Eq. 11.83). Large $|c_d|$ means feature $d$ is relevant for explaining what the other features cannot.
Without ℓ₁ (OLS update): setting the gradient to zero gives $\hat{w}_d = c_d / a_d$, the projection of $\mathbf{r}_{-d}$ onto $\mathbf{x}_{:,d}$ (Eq. 11.84).
With ℓ₁ (subgradient, Eq. 11.85–11.86): the ℓ₁ term adds $\lambda \cdot \partial|w_d|$ which is $\{-\lambda\}$ if $w_d < 0$, $[-\lambda, +\lambda]$ if $w_d = 0$, or $\{+\lambda\}$ if $w_d > 0$. Setting the subgradient to zero gives three cases:
$$\hat{w}_d = \begin{cases} (c_d + \lambda)/a_d & \text{if } c_d < -\lambda \quad \text{(strongly negative → keep)} \\ 0 & \text{if } c_d \in [-\lambda, \lambda] \quad \text{(too weak → kill)} \\ (c_d - \lambda)/a_d & \text{if } c_d > \lambda \quad \text{(strongly positive → keep)} \end{cases}$$
This is exactly the soft thresholding operator $\hat{w}_d = \text{SoftThreshold}(c_d/a_d,\; \lambda/a_d)$ from Eq. 11.88.
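The three-case solution can be verified by brute force: minimize the one-dimensional objective $g(w) = \tfrac{1}{2}a_d w^2 - c_d w + \lambda|w|$ (whose smooth part has gradient $a_d w - c_d$, matching Eq. 11.81) on a fine grid and compare against the closed form. A minimal, self-contained sketch:

```python
import numpy as np

def soft_threshold(x, delta):
    return np.sign(x) * np.maximum(np.abs(x) - delta, 0.0)

rng = np.random.default_rng(1)
w_grid = np.linspace(-10, 10, 2_000_001)  # grid spacing 1e-5
for _ in range(5):
    a_d = rng.uniform(1.0, 5.0)
    c_d = rng.uniform(-8.0, 8.0)
    lam = rng.uniform(0.1, 3.0)
    # g(w) = 0.5·a_d·w² − c_d·w + λ|w|
    g = 0.5 * a_d * w_grid**2 - c_d * w_grid + lam * np.abs(w_grid)
    w_brute = w_grid[np.argmin(g)]
    w_closed = soft_threshold(c_d / a_d, lam / a_d)
    assert abs(w_brute - w_closed) < 1e-4
print("closed-form soft threshold matches the brute-force minimizer")
```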
# Let's compute a_d, c_d, r_{-d} for a concrete example
# Use a simple initial w (e.g., ridge estimate) to illustrate one coordinate update
w_init = np.linalg.solve(X.T @ X + 10.0 * np.eye(D), X.T @ y_c)
lam_ex = 10.0
print("=" * 70)
print("Step-by-step coordinate update for THREE example genes")
print("=" * 70)
for d, gene in [(0, 'mecA (true w=+4.0)'), (7, 'blaZ (true w=+3.0)'), (5, 'dfrA (true w=0.0)')]:
# a_d = ||x_{:,d}||² — squared norm of feature column
a_d = np.sum(X[:, d]**2)
# r_{-d} = y - X @ w + x_d * w_d (residual without feature d's contribution)
r_minus_d = y_c - X @ w_init + X[:, d] * w_init[d]
# c_d = x_{:,d}^T r_{-d} (correlation of feature d with residual)
c_d = X[:, d] @ r_minus_d
# OLS update (no penalty): w_d = c_d / a_d
w_ols_d = c_d / a_d
# Lasso update: SoftThreshold(c_d / a_d, λ / a_d)
w_lasso_d = soft_threshold(c_d / a_d, lam_ex / a_d)
print(f"\n--- Gene {d}: {gene} ---")
print(f" a_d = ||x_{{:,{d}}}||² = {a_d:.2f} (column norm²)")
print(f" r_{{-d}} shape: {r_minus_d.shape} (residual without gene {d})")
print(f" c_d = x_{{:,{d}}}ᵀ r_{{-d}} = {c_d:+.3f} (correlation with residual)")
print(f" λ = {lam_ex}")
print()
print(f" OLS update: ŵ_d = c_d/a_d = {c_d:.3f}/{a_d:.2f} = {w_ols_d:+.4f}")
print(f" Lasso update: ŵ_d = SoftThreshold({w_ols_d:+.4f}, {lam_ex/a_d:.4f}) = {w_lasso_d:+.4f}")
if abs(c_d) < lam_ex:
print(f" → |c_d| = {abs(c_d):.3f} < λ = {lam_ex} ⟹ KILLED (set to 0)")
elif c_d > lam_ex:
print(f" → c_d = {c_d:.3f} > λ = {lam_ex} ⟹ KEPT but shrunk by λ/a_d = {lam_ex/a_d:.4f}")
else:
print(f" → c_d = {c_d:.3f} < -λ = {-lam_ex} ⟹ KEPT but shrunk by λ/a_d = {lam_ex/a_d:.4f}")
# Show all c_d values to see which features survive the threshold
print("\n" + "=" * 70)
print("All c_d values vs λ threshold — which genes survive?")
print("=" * 70)
a_all = np.sum(X**2, axis=0)
c_all = np.zeros(D)
for d in range(D):
r_d = y_c - X @ w_init + X[:, d] * w_init[d]
c_all[d] = X[:, d] @ r_d
fig, ax = plt.subplots(figsize=(14, 5))
colors = ['C3' if w_true[d] != 0 else 'C7' for d in range(D)]
bars = ax.bar(range(D), c_all, color=colors, alpha=0.7)
# Threshold bands
ax.axhline(lam_ex, color='C2', ls='--', lw=2)
ax.axhline(-lam_ex, color='C2', ls='--', lw=2)
ax.axhspan(-lam_ex, lam_ex, alpha=0.08, color='C2')
# Label the true genes
for d in true_support:
ax.annotate(gene_names[d], xy=(d, c_all[d]),
xytext=(d, c_all[d] + np.sign(c_all[d]) * 15),
fontsize=8, ha='center', color='C3', fontweight='bold',
arrowprops=dict(arrowstyle='->', color='C3', lw=0.8))
ax.set_xlabel('Gene index (d)', fontsize=11)
ax.set_ylabel('c_d = x_{:,d}ᵀ r_{-d}', fontsize=11)
ax.set_title(f'Residual Correlation c_d for Each Gene (λ = {lam_ex})', fontsize=13)
from matplotlib.patches import Patch
ax.legend(handles=[
Patch(facecolor='C3', alpha=0.7, label='True resistance gene'),
Patch(facecolor='C7', alpha=0.7, label='Noise gene'),
plt.Line2D([0], [0], color='C2', ls='--', lw=2, label=f'±λ = ±{lam_ex}'),
Patch(facecolor='C2', alpha=0.08, label='Dead zone → ŵ_d = 0'),
], fontsize=9, loc='upper right')
plt.tight_layout()
plt.show()
n_surviving = np.sum(np.abs(c_all) > lam_ex)
print(f"\n{n_surviving} genes have |c_d| > λ and survive the threshold")
print(f"The true resistance genes have large |c_d| because they genuinely correlate with the response")
======================================================================
Step-by-step coordinate update for THREE example genes
======================================================================
--- Gene 0: mecA (true w=+4.0) ---
a_d = ||x_{:,0}||² = 73.59 (column norm²)
r_{-d} shape: (80,) (residual without gene 0)
c_d = x_{:,0}ᵀ r_{-d} = +268.963 (correlation with residual)
λ = 10.0
OLS update: ŵ_d = c_d/a_d = 268.963/73.59 = +3.6550
Lasso update: ŵ_d = SoftThreshold(+3.6550, 0.1359) = +3.5191
→ c_d = 268.963 > λ = 10.0 ⟹ KEPT but shrunk by λ/a_d = 0.1359
--- Gene 7: blaZ (true w=+3.0) ---
a_d = ||x_{:,7}||² = 82.56 (column norm²)
r_{-d} shape: (80,) (residual without gene 7)
c_d = x_{:,7}ᵀ r_{-d} = +214.039 (correlation with residual)
λ = 10.0
OLS update: ŵ_d = c_d/a_d = 214.039/82.56 = +2.5925
Lasso update: ŵ_d = SoftThreshold(+2.5925, 0.1211) = +2.4714
→ c_d = 214.039 > λ = 10.0 ⟹ KEPT but shrunk by λ/a_d = 0.1211
--- Gene 5: dfrA (true w=0.0) ---
a_d = ||x_{:,5}||² = 94.25 (column norm²)
r_{-d} shape: (80,) (residual without gene 5)
c_d = x_{:,5}ᵀ r_{-d} = +29.442 (correlation with residual)
λ = 10.0
OLS update: ŵ_d = c_d/a_d = 29.442/94.25 = +0.3124
Lasso update: ŵ_d = SoftThreshold(+0.3124, 0.1061) = +0.2063
→ c_d = 29.442 > λ = 10.0 ⟹ KEPT but shrunk by λ/a_d = 0.1061
======================================================================
All c_d values vs λ threshold — which genes survive?
======================================================================
34 genes have |c_d| > λ and survive the threshold
The true resistance genes have large |c_d| because they genuinely correlate with the response
def lasso_coordinate_descent(X, y, lam, max_iter=1000, tol=1e-6):
    """Coordinate descent (shooting algorithm) for lasso.

    Minimizes (1/2)||Xw - y||² + lam * ||w||₁ — the scaling under which
    the exact per-coordinate update is SoftThreshold(c_d/a_d, lam/a_d).
    """
    N, D = X.shape
    # Precompute a_d = ||x_d||²
    a = np.sum(X**2, axis=0)  # (D,)
    # Initialize with ridge solution
    w = np.linalg.solve(X.T @ X + lam * np.eye(D), X.T @ y)
    objective_history = []
    for iteration in range(max_iter):
        w_old = w.copy()
        for d in range(D):
            # Residual excluding feature d's contribution
            r_minus_d = y - X @ w + X[:, d] * w[d]  # (N,)
            # c_d = correlation of feature d with residual
            c_d = X[:, d] @ r_minus_d  # scalar
            # Soft threshold update
            w[d] = soft_threshold(c_d / a[d], lam / a[d])
        obj = 0.5 * np.sum((X @ w - y)**2) + lam * np.sum(np.abs(w))
        objective_history.append(obj)
        if np.max(np.abs(w - w_old)) < tol:
            break
    return w, objective_history
# Fit lasso at a moderate λ
lam_demo = 5.0
w_lasso, obj_hist = lasso_coordinate_descent(X, y_c, lam_demo)
n_nonzero = np.sum(np.abs(w_lasso) > 1e-8)
print(f"Lasso (λ={lam_demo}): {n_nonzero} nonzero coefficients out of {D}")
print(f"Converged in {len(obj_hist)} iterations\n")
print("Selected genes and their coefficients:")
print(f"{'Gene':>8s} {'ŵ_lasso':>8s} {'w_true':>8s}")
print("-" * 30)
for d in range(D):
if np.abs(w_lasso[d]) > 1e-8 or w_true[d] != 0:
marker = '✓' if w_true[d] != 0 else '✗'
print(f"{gene_names[d]:>8s} {w_lasso[d]:>+8.3f} {w_true[d]:>+8.3f} {marker}")
Lasso (λ=5.0): 36 nonzero coefficients out of 50
Converged in 32 iterations
Selected genes and their coefficients:
Gene ŵ_lasso w_true
------------------------------
mecA +3.962 +4.000 ✓
gyrA +0.061 +0.000 ✗
rpoB -0.257 +0.000 ✗
ompF -2.681 -2.500 ✓
folA +0.049 +0.000 ✗
dfrA +0.021 +0.000 ✗
sul1 -0.085 +0.000 ✗
blaZ +2.805 +3.000 ✓
ermB +0.095 +0.000 ✗
mphA -0.044 +0.000 ✗
floR +0.023 +0.000 ✗
acrB -1.401 -1.500 ✓
marA -0.043 +0.000 ✗
soxS +0.112 +0.000 ✗
parC -0.012 +0.000 ✗
parE +0.117 +0.000 ✗
vanA +2.033 +2.000 ✓
vanB -0.043 +0.000 ✗
pbp2a -0.097 +0.000 ✗
murA +0.096 +0.000 ✗
ddl -0.022 +0.000 ✗
fusA +0.186 +0.000 ✗
rrs -0.312 +0.000 ✗
inhA -0.017 +0.000 ✗
pncA -0.135 +0.000 ✗
rpoC +0.106 +0.000 ✗
fabI -0.177 +0.000 ✗
tetM -0.982 -1.000 ✓
msrA -0.068 +0.000 ✗
fosA -0.043 +0.000 ✗
mcr1 +0.197 +0.000 ✗
arnT -0.107 +0.000 ✗
pmrA +0.042 +0.000 ✗
pmrB -0.057 +0.000 ✗
mgrB +0.119 +0.000 ✗
lpxC +0.096 +0.000 ✗
# Convergence of coordinate descent
fig, axes = plt.subplots(1, 2, figsize=(13, 4.5))
ax = axes[0]
ax.plot(obj_hist, 'C3', lw=2)
ax.set_xlabel('Iteration')
ax.set_ylabel('Objective')
ax.set_title('Lasso Objective During Coordinate Descent')
if obj_hist[0] > 0:
    ax.set_yscale('log')
ax = axes[1]
nonzero_mask = np.abs(w_lasso) > 1e-8
colors = ['C3' if w_true[d] != 0 else 'C7' for d in range(D)]
ax.bar(range(D), w_lasso, color=colors, alpha=0.7)
# Mark true nonzeros
for d in true_support:
ax.axvline(d, color='C0', ls=':', lw=0.8, alpha=0.5)
ax.set_xlabel('Gene index')
ax.set_ylabel('ŵ_d')
ax.set_title(f'Lasso Coefficients (λ={lam_demo})')
ax.axhline(0, color='gray', lw=0.5)
# Custom legend
from matplotlib.patches import Patch
ax.legend(handles=[
Patch(facecolor='C3', alpha=0.7, label='True resistance gene'),
Patch(facecolor='C7', alpha=0.7, label='False positive'),
plt.Line2D([0], [0], color='C0', ls=':', label='True support')
], fontsize=9)
plt.tight_layout()
plt.show()
6. The Regularization Path (Section 11.4.4)¶
As $\lambda$ decreases from $\lambda_{\max}$ to 0, coefficients progressively "turn on". The lasso path is piecewise linear in $\lambda$ — at each critical value, the active set changes.
$$\lambda_{\max} = \|\mathbf{X}^\top \mathbf{y}\|_\infty = \max_d |\mathbf{x}_{:,d}^\top \mathbf{y}|$$
For $\lambda \geq \lambda_{\max}$, all weights are zero.
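The $\lambda_{\max}$ claim is easy to verify numerically: starting from $\mathbf{w} = 0$, every coordinate's $c_d$ equals $\mathbf{x}_{:,d}^\top\mathbf{y}$, so one sweep of soft-threshold updates leaves all weights at zero whenever $\lambda > \|\mathbf{X}^\top\mathbf{y}\|_\infty$. A minimal sketch on a small synthetic problem (independent of the data above):

```python
import numpy as np

def soft_threshold(x, delta):
    return np.sign(x) * np.maximum(np.abs(x) - delta, 0.0)

rng = np.random.default_rng(0)
Xs = rng.standard_normal((40, 10))
ys = Xs @ np.concatenate([[2.0], np.zeros(9)]) + 0.1 * rng.standard_normal(40)
ys -= ys.mean()
a = np.sum(Xs**2, axis=0)
lam_max = np.max(np.abs(Xs.T @ ys))

def sweep_from_zero(lam):
    # one full coordinate-descent pass starting at w = 0
    w = np.zeros(10)
    for d in range(10):
        r = ys - Xs @ w + Xs[:, d] * w[d]
        w[d] = soft_threshold(Xs[:, d] @ r / a[d], lam / a[d])
    return w

print(np.count_nonzero(sweep_from_zero(1.01 * lam_max)))  # 0: every coordinate killed
print(np.count_nonzero(sweep_from_zero(0.99 * lam_max)))  # ≥ 1: something turns on
```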
# Compute regularization path
lam_max = np.max(np.abs(X.T @ y_c))
n_lambdas = 80
lambdas = np.logspace(np.log10(lam_max), np.log10(lam_max * 0.001), n_lambdas)
# Warm-start: use previous solution as initialization
path_weights = np.zeros((n_lambdas, D))
w_warm = np.zeros(D)
for i, lam_i in enumerate(lambdas):
# Warm-start coordinate descent
N_data = X.shape[0]
a = np.sum(X**2, axis=0)
w = w_warm.copy()
for _ in range(2000):
w_old = w.copy()
for d in range(D):
r = y_c - X @ w + X[:, d] * w[d]
c_d = X[:, d] @ r
w[d] = soft_threshold(c_d / a[d], lam_i / a[d])
if np.max(np.abs(w - w_old)) < 1e-7:
break
path_weights[i] = w
w_warm = w.copy()
# Count nonzeros along the path
n_nonzeros = np.sum(np.abs(path_weights) > 1e-8, axis=1)
print(f"λ_max = {lam_max:.2f} (all weights zero above this)")
print(f"Path: {n_lambdas} values of λ from {lambdas[0]:.2f} to {lambdas[-1]:.4f}")
print(f"Number of nonzero coefficients: {n_nonzeros[0]} → {n_nonzeros[-1]}")
λ_max = 253.06 (all weights zero above this)
Path: 80 values of λ from 253.06 to 0.2531
Number of nonzero coefficients: 0 → 47
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Lasso regularization path
ax = axes[0]
log_lam = np.log10(lambdas)
for d in range(D):
if np.max(np.abs(path_weights[:, d])) > 1e-8:
lw = 2.5 if d in true_support else 0.8
alpha = 1.0 if d in true_support else 0.4
color = 'C3' if d in true_support else 'C7'
label = gene_names[d] if d in true_support else None
ax.plot(log_lam, path_weights[:, d], color=color, lw=lw, alpha=alpha, label=label)
ax.axhline(0, color='gray', lw=0.5)
ax.set_xlabel('log₁₀(λ)', fontsize=11)
ax.set_ylabel('ŵ_d(λ)', fontsize=11)
ax.set_title('Lasso Regularization Path', fontsize=13)
ax.legend(fontsize=9, loc='lower left')
ax.invert_xaxis()
# Number of nonzero coefficients vs λ
ax = axes[1]
ax.plot(log_lam, n_nonzeros, 'C3', lw=2.5)
ax.axhline(len(true_support), color='C0', ls='--', lw=1.5, label=f'True sparsity = {len(true_support)}')
ax.set_xlabel('log₁₀(λ)', fontsize=11)
ax.set_ylabel('Number of nonzero coefficients', fontsize=11)
ax.set_title('Model Complexity vs Regularization', fontsize=13)
ax.legend(fontsize=10)
ax.invert_xaxis()
plt.tight_layout()
plt.show()
print("As λ decreases (left to right): coefficients 'turn on' one by one")
print("The true resistance genes (colored) emerge first — they have the strongest signal")
As λ decreases (left to right): coefficients 'turn on' one by one
The true resistance genes (colored) emerge first — they have the strongest signal
7. Cross-Validation for λ Selection¶
We use K-fold cross-validation to select $\lambda$ that minimizes prediction error. Note: the CV-optimal $\lambda$ is tuned for prediction, not necessarily for recovering the true support (Section 11.4.6).
def lasso_cv(X, y, lambdas, K=5):
"""K-fold cross-validation for lasso."""
N = X.shape[0]
indices = np.arange(N)
np.random.shuffle(indices)
folds = np.array_split(indices, K)
mse_folds = np.zeros((K, len(lambdas)))
for k in range(K):
val_idx = folds[k]
train_idx = np.concatenate([folds[j] for j in range(K) if j != k])
X_tr, y_tr = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
w_warm = np.zeros(X.shape[1])
a = np.sum(X_tr**2, axis=0)
for i, lam_i in enumerate(lambdas):
w = w_warm.copy()
for _ in range(1000):
w_old = w.copy()
for d in range(X.shape[1]):
r = y_tr - X_tr @ w + X_tr[:, d] * w[d]
c_d = X_tr[:, d] @ r
w[d] = soft_threshold(c_d / a[d], lam_i / a[d])
if np.max(np.abs(w - w_old)) < 1e-7:
break
mse_folds[k, i] = np.mean((X_val @ w - y_val)**2)
w_warm = w.copy()
return mse_folds.mean(axis=0), mse_folds.std(axis=0) / np.sqrt(K)
# Run CV
cv_lambdas = np.logspace(np.log10(lam_max), np.log10(lam_max * 0.005), 50)
cv_mean, cv_se = lasso_cv(X, y_c, cv_lambdas, K=5)
# Best λ and 1-SE rule
best_idx = np.argmin(cv_mean)
lam_best = cv_lambdas[best_idx]
# 1-SE rule: largest λ within 1 SE of the minimum
threshold = cv_mean[best_idx] + cv_se[best_idx]
lam_1se_idx = np.where(cv_mean <= threshold)[0][0]
lam_1se = cv_lambdas[lam_1se_idx]
print(f"Best λ (min CV): {lam_best:.4f}")
print(f"1-SE rule λ: {lam_1se:.4f} (sparser model)")
Best λ (min CV): 10.9998
1-SE rule λ: 23.4478 (sparser model)
fig, ax = plt.subplots(figsize=(10, 5))
log_cv_lam = np.log10(cv_lambdas)
ax.plot(log_cv_lam, cv_mean, 'C3', lw=2)
ax.fill_between(log_cv_lam, cv_mean - cv_se, cv_mean + cv_se, color='C3', alpha=0.2)
ax.axvline(np.log10(lam_best), color='C0', ls='--', lw=1.5, label=f'λ_min = {lam_best:.3f}')
ax.axvline(np.log10(lam_1se), color='C2', ls='--', lw=1.5, label=f'λ_1SE = {lam_1se:.3f}')
ax.set_xlabel('log₁₀(λ)', fontsize=12)
ax.set_ylabel('Cross-Validated MSE', fontsize=12)
ax.set_title('5-Fold Cross-Validation for Lasso', fontsize=13)
ax.legend(fontsize=11)
ax.invert_xaxis()
# Add number of nonzeros on top
ax2 = ax.twiny()
# Find nonzeros at each CV lambda
cv_nnz = []
w_tmp = np.zeros(D)
a_tmp = np.sum(X**2, axis=0)
for lam_i in cv_lambdas:
w = w_tmp.copy()
for _ in range(1000):
w_old = w.copy()
for d in range(D):
r = y_c - X @ w + X[:, d] * w[d]
c_d = X[:, d] @ r
w[d] = soft_threshold(c_d / a_tmp[d], lam_i / a_tmp[d])
if np.max(np.abs(w - w_old)) < 1e-7:
break
cv_nnz.append(np.sum(np.abs(w) > 1e-8))
w_tmp = w.copy()
# Show selected tick positions
tick_positions = np.linspace(0, len(cv_lambdas)-1, 8, dtype=int)
ax2.set_xlim(ax.get_xlim())
ax2.set_xticks(log_cv_lam[tick_positions])
ax2.set_xticklabels([str(cv_nnz[i]) for i in tick_positions])
ax2.set_xlabel('Number of nonzero coefficients', fontsize=11)
plt.tight_layout()
plt.show()
8. Comparison: OLS, Ridge, Lasso, Forward Selection (Section 11.4.5)¶
For orthonormal features, the estimators have simple closed forms (Eq. 11.93–11.96):
| Method | Formula | Sparsity |
|---|---|---|
| OLS | $\hat{w}_d^{\text{MLE}} = c_d$ | Dense |
| Ridge | $\hat{w}_d^{\text{MLE}} / (1 + \lambda)$ | Dense (shrunk) |
| Lasso | $\text{sign}(\hat{w}_d^{\text{MLE}})\left(|\hat{w}_d^{\text{MLE}}| - \lambda\right)_+$ | Sparse (shrunk) |
| Subset | $\hat{w}_d^{\text{MLE}} \cdot \mathbb{1}(|\hat{w}_d^{\text{MLE}}| > \lambda)$ | Sparse (unbiased) |
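With truly orthonormal columns (obtainable from a QR decomposition), the coordinates decouple and the table's closed forms can be checked directly. A minimal sketch under that assumption (variable names are illustrative, not from the cells above):

```python
import numpy as np

def soft_threshold(x, delta):
    return np.sign(x) * np.maximum(np.abs(x) - delta, 0.0)

rng = np.random.default_rng(2)
Q, _ = np.linalg.qr(rng.standard_normal((30, 5)))  # Q has orthonormal columns
w_star = np.array([2.0, -1.0, 0.0, 0.8, 0.0])
yq = Q @ w_star + 0.1 * rng.standard_normal(30)

w_mle = Q.T @ yq   # OLS when QᵀQ = I (equals c_d, since a_d = 1)
lam = 0.4
w_lasso = soft_threshold(w_mle, lam)          # sparse + shrunk
w_ridge = w_mle / (1 + lam)                   # dense, proportionally shrunk
w_subset = w_mle * (np.abs(w_mle) > lam)      # sparse, survivors unshrunk

print("lasso zeros :", np.sum(w_lasso == 0))
print("ridge zeros :", np.sum(w_ridge == 0))
```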
def ridge_solve(X, y, lam):
D = X.shape[1]
return np.linalg.solve(X.T @ X + lam * np.eye(D), X.T @ y)
def forward_stepwise(X, y, k):
"""Greedy forward selection of k features (tractable approximation to best subset)."""
N, D = X.shape
selected = []
remaining = list(range(D))
w = np.zeros(D)
for _ in range(k):
best_rss = np.inf
best_feat = None
for d in remaining:
subset = selected + [d]
X_sub = X[:, subset]
w_sub = np.linalg.lstsq(X_sub, y, rcond=None)[0]
rss = np.sum((X_sub @ w_sub - y)**2)
if rss < best_rss:
best_rss = rss
best_feat = d
selected.append(best_feat)
remaining.remove(best_feat)
X_sub = X[:, selected]
w[selected] = np.linalg.lstsq(X_sub, y, rcond=None)[0]
return w
# Fit all methods
w_ols = np.linalg.lstsq(X, y_c, rcond=None)[0]
w_ridge = ridge_solve(X, y_c, lam=10.0)
w_lasso_cv, _ = lasso_coordinate_descent(X, y_c, lam=lam_1se)
w_fwd = forward_stepwise(X, y_c, k=len(true_support))
# Test MSE
methods = {
'OLS': w_ols,
'Ridge': w_ridge,
'Lasso': w_lasso_cv,
'Forward Select': w_fwd,
}
print(f"{'Method':<16s} {'Nonzeros':>8s} {'||ŵ-w*||':>10s} {'Test MSE':>10s}")
print("=" * 48)
for name, w_hat in methods.items():
nnz = np.sum(np.abs(w_hat) > 1e-8)
w_err = np.linalg.norm(w_hat - w_true)
test_mse = np.mean((X_test @ w_hat - y_test_c)**2)
print(f"{name:<16s} {nnz:>8d} {w_err:>10.3f} {test_mse:>10.3f}")
print(f"\nBayes optimal MSE = σ² = {sigma_noise**2:.2f}")
Method           Nonzeros   ||ŵ-w*||   Test MSE
================================================
OLS                    50      2.148      6.705
Ridge                  50      1.748      5.256
Lasso                   7      0.732      2.876
Forward Select          6      0.363      2.782

Bayes optimal MSE = σ² = 2.25
fig, axes = plt.subplots(2, 2, figsize=(14, 9))
for ax, (name, w_hat) in zip(axes.ravel(), methods.items()):
colors = []
for d in range(D):
if w_true[d] != 0 and np.abs(w_hat[d]) > 1e-8:
colors.append('C2') # true positive
elif w_true[d] != 0:
colors.append('C3') # false negative
elif np.abs(w_hat[d]) > 1e-8:
colors.append('C7') # false positive
else:
colors.append('C7') # true negative (won't show)
ax.bar(range(D), w_hat, color=colors, alpha=0.7)
# Show true weights as markers
for d in true_support:
ax.plot(d, w_true[d], 'kx', ms=10, mew=2)
nnz = np.sum(np.abs(w_hat) > 1e-8)
ax.set_title(f'{name} ({nnz} nonzero)', fontsize=12)
ax.set_xlabel('Gene index')
ax.set_ylabel('Coefficient')
ax.axhline(0, color='gray', lw=0.5)
ax.set_ylim(-5, 6)
# Legend in first subplot
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
axes[0, 0].legend(handles=[
Patch(facecolor='C2', alpha=0.7, label='True positive'),
Patch(facecolor='C7', alpha=0.7, label='False positive'),
Line2D([0], [0], marker='x', color='k', ls='', ms=10, mew=2, label='True w*'),
], fontsize=9, loc='upper right')
plt.suptitle('Coefficient Estimates: OLS vs Ridge vs Lasso vs Forward Selection', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()
print("OLS: dense, overfits to noise features")
print("Ridge: shrinks all coefficients, none exactly zero")
print("Lasso: sparse — selects a subset and shrinks survivors (slightly biased)")
print("Forward Selection: sparse with unbiased coefficients on selected features")
OLS: dense, overfits to noise features
Ridge: shrinks all coefficients, none exactly zero
Lasso: sparse — selects a subset and shrinks survivors (slightly biased)
Forward Selection: sparse with unbiased coefficients on selected features
9. Variable Selection Consistency and Debiasing (Section 11.4.6)¶
Lasso performs selection + shrinkage simultaneously. The shrinkage biases the selected coefficients toward zero. A simple fix is debiasing: after lasso selects the support, refit with OLS on just the selected features.
$$\hat{\mathbf{w}}^{\text{debiased}}_S = (\mathbf{X}_S^\top \mathbf{X}_S)^{-1} \mathbf{X}_S^\top \mathbf{y}$$
where $S = \{d : \hat{w}_d^{\text{lasso}} \neq 0\}$ is the estimated support.
# Debiasing: refit OLS on the lasso-selected support
support_lasso = np.where(np.abs(w_lasso_cv) > 1e-8)[0]
w_debiased = np.zeros(D)
if len(support_lasso) > 0:
X_sub = X[:, support_lasso]
w_debiased[support_lasso] = np.linalg.lstsq(X_sub, y_c, rcond=None)[0]
# Compare
print(f"{'Method':<16s} {'Nonzeros':>8s} {'||ŵ-w*||':>10s} {'Test MSE':>10s}")
print("=" * 48)
for name, w_hat in [('Lasso', w_lasso_cv), ('Debiased Lasso', w_debiased)]:
nnz = np.sum(np.abs(w_hat) > 1e-8)
w_err = np.linalg.norm(w_hat - w_true)
test_mse = np.mean((X_test @ w_hat - y_test_c)**2)
print(f"{name:<16s} {nnz:>8d} {w_err:>10.3f} {test_mse:>10.3f}")
print(f"\nLasso support: {[gene_names[d] for d in support_lasso]}")
print(f"True support: {[gene_names[d] for d in true_support]}")
# Show coefficient comparison
fig, ax = plt.subplots(figsize=(10, 4.5))
width = 0.3
idx_show = np.union1d(support_lasso, true_support)
x_pos = np.arange(len(idx_show))
ax.bar(x_pos - width, w_true[idx_show], width, label='True w*', color='C0', alpha=0.7)
ax.bar(x_pos, w_lasso_cv[idx_show], width, label='Lasso', color='C3', alpha=0.7)
ax.bar(x_pos + width, w_debiased[idx_show], width, label='Debiased Lasso', color='C2', alpha=0.7)
ax.set_xticks(x_pos)
ax.set_xticklabels([gene_names[d] for d in idx_show], rotation=45, ha='right')
ax.set_ylabel('Coefficient value')
ax.set_title('Debiasing: OLS Refit on Lasso-Selected Genes')
ax.legend()
ax.axhline(0, color='gray', lw=0.5)
plt.tight_layout()
plt.show()
print("\nDebiasing removes the shrinkage bias — coefficients are closer to the true values")
print("But it only helps if the support was correctly identified!")
Method           Nonzeros   ||ŵ-w*||   Test MSE
================================================
Lasso                   7      0.732      2.876
Debiased Lasso          7      0.418      2.802

Lasso support: ['mecA', 'ompF', 'blaZ', 'acrB', 'vanA', 'fabI', 'tetM']
True support: ['mecA', 'ompF', 'blaZ', 'acrB', 'vanA', 'tetM']
Debiasing removes the shrinkage bias — coefficients are closer to the true values
But it only helps if the support was correctly identified!
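The second caveat can be made concrete with a tiny synthetic check (everything here — the `refit` helper, the sizes, the seed — is illustrative, not the notebook's data): refitting OLS on a support padded with spurious features typically inflates the estimation error, because each extra regressor adds variance.

```python
import numpy as np

rng = np.random.default_rng(0)
N, D = 40, 12
w_star = np.zeros(D)
w_star[:3] = [3.0, -2.0, 1.5]                    # true support = {0, 1, 2}
X = rng.standard_normal((N, D))
y = X @ w_star + 0.1 * rng.standard_normal(N)

def refit(support):
    """OLS restricted to a candidate support; returns ||ŵ - w*||."""
    w = np.zeros(D)
    w[support] = np.linalg.lstsq(X[:, support], y, rcond=None)[0]
    return np.linalg.norm(w - w_star)

err_true   = refit([0, 1, 2])                    # correct support
err_padded = refit([0, 1, 2, 7, 8, 9, 10])       # spurious features included
print(f"refit on true support:   {err_true:.3f}")
print(f"refit on padded support: {err_padded:.3f}")
```

With a correct support the refit is unbiased and low-variance; the padded refit spends degrees of freedom fitting noise on the spurious coordinates.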
10. Elastic Net — Combining ℓ₁ and ℓ₂ (Section 11.4.8)¶
The elastic net combines lasso and ridge penalties (Eq. 11.101):
$$L(\mathbf{w}) = \|\mathbf{y} - \mathbf{X}\mathbf{w}\|_2^2 + \lambda_2 \|\mathbf{w}\|_2^2 + \lambda_1 \|\mathbf{w}\|_1$$
Advantages over pure lasso:
- Grouping effect: correlated features get similar coefficients (lasso may arbitrarily pick one)
- Can select more than N variables when $D > N$
- The objective is strictly convex (unique solution)
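The grouping effect has a one-line geometric explanation: with two identical features, every same-sign split of the total weight has the same ℓ₁ penalty, so lasso's minimizer is not unique, while the ℓ₂ term breaks the tie at the even split. A minimal sketch (illustrative only, not the notebook's data):

```python
import numpy as np

# Split a fixed total weight of 1 across two duplicated features:
# w1 = t, w2 = 1 - t. The fit term is identical for every split.
t = np.linspace(0.0, 1.0, 101)
l1 = np.abs(t) + np.abs(1 - t)      # constant (= 1) on [0, 1] — lasso is indifferent
l2 = t**2 + (1 - t)**2              # strictly convex — minimized at the even split
print("ℓ1 penalty is flat:", np.allclose(l1, 1.0))
print("ℓ2 minimizer: t =", t[np.argmin(l2)])
```

The strictly convex ℓ₂ term is what makes the elastic net objective have a unique solution and pull correlated features toward equal weights.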
def elastic_net_cd(X, y, lam1, lam2, max_iter=1000, tol=1e-6):
"""Coordinate descent for elastic net.
Minimizes: ||Xw - y||² + lam2 * ||w||² + lam1 * ||w||₁
"""
N, D = X.shape
a = np.sum(X**2, axis=0) + lam2 # modified: add ℓ₂ penalty to denominator
w = np.zeros(D)
for _ in range(max_iter):
w_old = w.copy()
for d in range(D):
r = y - X @ w + X[:, d] * w[d]
c_d = X[:, d] @ r
w[d] = soft_threshold(c_d / a[d], lam1 / a[d])
if np.max(np.abs(w - w_old)) < tol:
break
return w
# Demonstrate grouping effect with correlated features
# Add a duplicate of gene 0 (mecA) with slight noise
X_corr = np.column_stack([X, X[:, 0] + np.random.randn(N) * 0.1]) # gene 50 ≈ mecA copy
X_test_corr = np.column_stack([X_test, X_test[:, 0] + np.random.randn(N_test) * 0.1])
# Lasso: will pick one of the two correlated features
w_lasso_corr, _ = lasso_coordinate_descent(X_corr, y_c, lam=lam_1se)
# Elastic net: will assign similar weights to both
w_enet_corr = elastic_net_cd(X_corr, y_c, lam1=lam_1se, lam2=5.0)
print("Grouping effect with correlated features:")
print(f"Gene 0 (mecA) and Gene 50 (mecA copy) are nearly identical\n")
print(f"{'Method':<14s} {'w[mecA]':>10s} {'w[copy]':>10s}")
print("-" * 36)
print(f"{'Lasso':<14s} {w_lasso_corr[0]:>+10.3f} {w_lasso_corr[50]:>+10.3f}")
print(f"{'Elastic Net':<14s} {w_enet_corr[0]:>+10.3f} {w_enet_corr[50]:>+10.3f}")
print(f"{'True':<14s} {w_true[0]:>+10.3f} {'N/A':>10s}")
print("\nLasso arbitrarily picks one; elastic net shares the weight across both")
Grouping effect with correlated features:
Gene 0 (mecA) and Gene 50 (mecA copy) are nearly identical

Method            w[mecA]    w[copy]
------------------------------------
Lasso              +0.000     +3.600
Elastic Net        +1.511     +1.929
True               +4.000        N/A

Lasso arbitrarily picks one; elastic net shares the weight across both
# Visualize constraint regions for ℓ₁, ℓ₂, and elastic net
fig, axes = plt.subplots(1, 3, figsize=(14, 4.5))
theta = np.linspace(0, 2 * np.pi, 500)
B = 1.0
# ℓ₁ ball (diamond)
ax = axes[0]
corners = np.array([[B, 0], [0, B], [-B, 0], [0, -B], [B, 0]])
ax.fill(corners[:, 0], corners[:, 1], alpha=0.3, color='C3')
ax.plot(corners[:, 0], corners[:, 1], 'C3', lw=2)
ax.set_title('ℓ₁ ball (Lasso)', fontsize=12)
# ℓ₂ ball (circle)
ax = axes[1]
ax.fill(B * np.cos(theta), B * np.sin(theta), alpha=0.3, color='C0')
ax.plot(B * np.cos(theta), B * np.sin(theta), 'C0', lw=2)
ax.set_title('ℓ₂ ball (Ridge)', fontsize=12)
# Elastic net (blend of both)
ax = axes[2]
alpha_mix = 0.5 # mixing parameter
# Elastic net constraint: α||w||₁ + (1-α)||w||₂² ≤ B
# Compute boundary numerically
w1_grid_en = np.linspace(-B, B, 500)
w2_pos = []
w2_neg = []
for w1 in w1_grid_en:
# Solve: α|w1| + α|w2| + (1-α)(w1² + w2²) ≤ B
# → (1-α)w2² + α|w2| + [(1-α)w1² + α|w1| - B] ≤ 0
a_coeff = 1 - alpha_mix
b_coeff = alpha_mix
c_coeff = (1 - alpha_mix) * w1**2 + alpha_mix * np.abs(w1) - B
disc = b_coeff**2 - 4 * a_coeff * c_coeff
if disc >= 0:
w2_max = (-b_coeff + np.sqrt(disc)) / (2 * a_coeff)
if w2_max >= 0:
w2_pos.append((w1, w2_max))
w2_neg.append((w1, -w2_max))
w2_pos = np.array(w2_pos)
w2_neg = np.array(w2_neg)[::-1]
boundary = np.vstack([w2_pos, w2_neg])
ax.fill(boundary[:, 0], boundary[:, 1], alpha=0.3, color='C4')
ax.plot(boundary[:, 0], boundary[:, 1], 'C4', lw=2)
ax.set_title('Elastic Net (α=0.5)', fontsize=12)
for ax in axes:
ax.set_xlim(-1.5, 1.5)
ax.set_ylim(-1.5, 1.5)
ax.set_aspect('equal')
ax.axhline(0, color='gray', lw=0.5)
ax.axvline(0, color='gray', lw=0.5)
ax.set_xlabel('w₁')
ax.set_ylabel('w₂')
plt.tight_layout()
plt.show()
print("Elastic net constraint region: rounded diamond — corners for sparsity, curvature for grouping")
Elastic net constraint region: rounded diamond — corners for sparsity, curvature for grouping
11. Proximal Gradient Descent — ISTA and FISTA (Section 11.4.9.3)¶
An alternative to coordinate descent is proximal gradient descent (Section 8.6). The update is (Eq. 11.105):
$$\mathbf{w}_{t+1} = \text{SoftThreshold}\!\left(\mathbf{w}_t - \eta_t \nabla \text{NLL}(\mathbf{w}_t),\, \eta_t \lambda\right)$$
This is the Iterative Soft Thresholding Algorithm (ISTA). Adding Nesterov momentum gives FISTA, which converges as $O(1/t^2)$ vs $O(1/t)$ for ISTA.
def ista(X, y, lam, max_iter=500, tol=1e-7):
"""Iterative Soft Thresholding Algorithm (proximal gradient descent)."""
N, D = X.shape
# Step size: 1 / L where L = largest eigenvalue of 2*X'X
L = 2 * np.linalg.eigvalsh(X.T @ X)[-1]
eta = 1.0 / L
w = np.zeros(D)
obj_history = []
for t in range(max_iter):
grad = 2 * X.T @ (X @ w - y) # gradient of NLL
w = soft_threshold(w - eta * grad, eta * lam)
obj = np.sum((X @ w - y)**2) + lam * np.sum(np.abs(w))
obj_history.append(obj)
if t > 0 and abs(obj_history[-2] - obj_history[-1]) < tol:
break
return w, obj_history
def fista(X, y, lam, max_iter=500, tol=1e-7):
"""Fast ISTA (with Nesterov momentum)."""
N, D = X.shape
L = 2 * np.linalg.eigvalsh(X.T @ X)[-1]
eta = 1.0 / L
w = np.zeros(D)
w_prev = w.copy()
t_k = 1.0
obj_history = []
for k in range(max_iter):
# Nesterov momentum
t_k_new = (1 + np.sqrt(1 + 4 * t_k**2)) / 2
momentum = (t_k - 1) / t_k_new
v = w + momentum * (w - w_prev)
# Proximal gradient step
grad = 2 * X.T @ (X @ v - y)
w_new = soft_threshold(v - eta * grad, eta * lam)
obj = np.sum((X @ w_new - y)**2) + lam * np.sum(np.abs(w_new))
obj_history.append(obj)
w_prev = w
w = w_new
t_k = t_k_new
if k > 0 and abs(obj_history[-2] - obj_history[-1]) < tol:
break
return w, obj_history
# Compare convergence: coordinate descent vs ISTA vs FISTA
lam_compare = lam_1se
_, obj_cd = lasso_coordinate_descent(X, y_c, lam_compare, max_iter=200)
_, obj_ista = ista(X, y_c, lam_compare, max_iter=500)
_, obj_fista = fista(X, y_c, lam_compare, max_iter=500)
# Find optimal objective (best achieved)
obj_star = min(obj_cd[-1], obj_ista[-1], obj_fista[-1])
fig, ax = plt.subplots(figsize=(10, 5))
ax.semilogy(np.array(obj_cd) - obj_star + 1e-12, 'C0', lw=2, label='Coordinate Descent')
ax.semilogy(np.array(obj_ista) - obj_star + 1e-12, 'C3', lw=2, label='ISTA')
ax.semilogy(np.array(obj_fista) - obj_star + 1e-12, 'C2', lw=2, label='FISTA')
ax.set_xlabel('Iteration', fontsize=12)
ax.set_ylabel('Objective − Optimal + ε', fontsize=12)
ax.set_title('Convergence: Coordinate Descent vs ISTA vs FISTA', fontsize=13)
ax.legend(fontsize=11)
ax.set_xlim(0, 200)
plt.tight_layout()
plt.show()
print(f"Coordinate descent: {len(obj_cd)} iterations")
print(f"ISTA: {len(obj_ista)} iterations")
print(f"FISTA: {len(obj_fista)} iterations")
print("\nCoordinate descent converges fastest for lasso (each step has a closed-form solution)")
print("FISTA's O(1/t²) rate beats ISTA's O(1/t), but both are slower than CD for this problem")
Coordinate descent: 9 iterations
ISTA: 74 iterations
FISTA: 68 iterations

Coordinate descent converges fastest for lasso (each step has a closed-form solution)
FISTA's O(1/t²) rate beats ISTA's O(1/t), but both are slower than CD for this problem
12. Sparse Signal Recovery Demo (Section 11.4.6)¶
A classic application: recover a sparse signal $\mathbf{w}^*$ of dimension $D = 512$ with only $K = 20$ nonzero entries, from $N = 128$ compressed measurements $\mathbf{y} = \mathbf{X}\mathbf{w}^* + \boldsymbol{\epsilon}$. This is the compressed sensing setup.
# Compressed sensing demo
D_cs = 512
N_cs = 128
K_cs = 20 # number of nonzero spikes
# True sparse signal with random ±1 spikes
w_star = np.zeros(D_cs)
spike_idx = np.random.choice(D_cs, K_cs, replace=False)
w_star[spike_idx] = np.random.choice([-1, 1], K_cs) * (1 + np.random.rand(K_cs))
# Random Gaussian design matrix
X_cs = np.random.randn(N_cs, D_cs) / np.sqrt(N_cs)
y_cs = X_cs @ w_star + np.random.randn(N_cs) * 0.01
# L1 recovery
lam_cs_max = np.max(np.abs(X_cs.T @ y_cs))
w_l1, _ = lasso_coordinate_descent(X_cs, y_cs, lam=0.05 * lam_cs_max, max_iter=5000)
# Debiased recovery
support_l1 = np.where(np.abs(w_l1) > 1e-6)[0]
w_debiased_cs = np.zeros(D_cs)
if len(support_l1) > 0 and len(support_l1) < N_cs:
X_sub = X_cs[:, support_l1]
w_debiased_cs[support_l1] = np.linalg.lstsq(X_sub, y_cs, rcond=None)[0]
# Minimum-norm OLS (dense, fails)
w_ols_cs = X_cs.T @ np.linalg.solve(X_cs @ X_cs.T, y_cs)
fig, axes = plt.subplots(4, 1, figsize=(14, 10), sharex=True)
titles = [
f'Original (D={D_cs}, K={K_cs} nonzeros)',
f'ℓ₁ Recovery (MSE = {np.mean((w_l1 - w_star)**2):.4f})',
f'Debiased (MSE = {np.mean((w_debiased_cs - w_star)**2):.6f})',
f'Min-norm OLS (MSE = {np.mean((w_ols_cs - w_star)**2):.4f})'
]
signals = [w_star, w_l1, w_debiased_cs, w_ols_cs]
colors_list = ['C0', 'C3', 'C2', 'C7']
for ax, title, signal, color in zip(axes, titles, signals, colors_list):
ax.stem(np.arange(D_cs), signal, linefmt=color+'-', markerfmt=color+'o',
basefmt='gray', label=title)
ax.set_ylabel('w')
ax.set_title(title, fontsize=11)
ax.set_ylim(-2.5, 2.5)
axes[-1].set_xlabel('Component index')
plt.tight_layout()
plt.show()
print(f"L1 recovery found {len(support_l1)} nonzeros (true: {K_cs})")
print(f"Support overlap: {len(set(support_l1) & set(spike_idx))} / {K_cs} true spikes recovered")
print("Debiasing removes shrinkage bias → near-perfect recovery")
print("Min-norm OLS is dense and noisy — no single threshold can recover the support")
L1 recovery found 44 nonzeros (true: 20)
Support overlap: 20 / 20 true spikes recovered
Debiasing removes shrinkage bias → near-perfect recovery
Min-norm OLS is dense and noisy — no single threshold can recover the support
Summary¶
| Concept | Key Idea |
|---|---|
| Laplace prior | Peaked at zero with heavy tails → encourages exact sparsity |
| ℓ₁ geometry | Diamond constraint has corners on axes → optimal solutions tend to be sparse |
| Soft thresholding | Zeros out small coefficients AND shrinks large ones (biased) |
| Coordinate descent | Analytically solve one coordinate at a time; fast for lasso |
| Regularization path | Piecewise linear in λ; coefficients "turn on" as λ decreases |
| Debiasing | Refit OLS on lasso support to remove shrinkage bias |
| Elastic net | ℓ₁ + ℓ₂ penalty: sparse solutions with grouping for correlated features |
| ISTA / FISTA | Proximal gradient methods; FISTA adds Nesterov acceleration |
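As a closing sanity check on the soft-thresholding row of the table, the closed form can be verified against brute-force grid search on a scalar lasso problem. Note one convention caveat: with the quadratic term written as $(w - c)^2$, the threshold is $\lambda/2$; formulations that fold the factor of 2 into $a_d$ and $c_d$ absorb the $1/2$.

```python
import numpy as np

def soft_threshold(x, delta):
    # SoftThreshold(x, δ) = sign(x) * (|x| - δ)_+
    return np.sign(x) * np.maximum(np.abs(x) - delta, 0.0)

# Scalar lasso: minimize (w - c)^2 + lam * |w|.
# Setting the subgradient to zero gives w* = SoftThreshold(c, lam / 2).
c, lam = 1.7, 1.0
grid = np.linspace(-3.0, 3.0, 200001)
w_grid = grid[np.argmin((grid - c) ** 2 + lam * np.abs(grid))]
w_closed = soft_threshold(c, lam / 2)
print(f"grid search: {w_grid:.4f}   closed form: {w_closed:.4f}")  # both ≈ 1.2
```

The agreement confirms the closed-form prox map: subtract the threshold from $|c|$, clip at zero, and restore the sign.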