Least Squares Linear Regression¶
Real-World Scenario: Predicting Protein Abundance from mRNA Expression¶
In whole cell modeling, a fundamental challenge is predicting protein abundance from mRNA expression levels. While the central dogma says mRNA → protein, the relationship is noisy: translation efficiency, protein degradation rates, and post-transcriptional regulation all add variability.
We'll use linear regression to model this relationship, building up from simple 1D regression (one gene) to multiple regression (multiple predictors), implementing the key ideas from PML Chapter 11.2:
- Ordinary Least Squares (OLS) and the normal equations
- Geometric interpretation as orthogonal projection
- Simple linear regression (1D) and partial regression
- Polynomial regression with nonlinear features
- Weighted least squares for heteroskedastic noise
- Online/recursive estimation
- Goodness of fit: residual plots and R²
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
np.random.seed(42)
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams['font.family'] = 'DejaVu Sans'
Key Formulas from PML Chapter 11.2¶
Linear regression model (Eq. 11.1):
$$p(y \mid \mathbf{x}, \boldsymbol{\theta}) = \mathcal{N}(y \mid w_0 + \mathbf{w}^\top \mathbf{x},\; \sigma^2)$$
where $\boldsymbol{\theta} = (w_0, \mathbf{w}, \sigma^2)$.
Residual sum of squares (Eq. 11.6):
$$\text{RSS}(\mathbf{w}) = \frac{1}{2} \|\mathbf{X}\mathbf{w} - \mathbf{y}\|_2^2$$
Normal equations (Eq. 11.8):
$$\mathbf{X}^\top \mathbf{X} \hat{\mathbf{w}} = \mathbf{X}^\top \mathbf{y}$$
OLS solution (Eq. 11.9):
$$\hat{\mathbf{w}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y}$$
Projection matrix (Eq. 11.16):
$$\text{Proj}(\mathbf{X}) = \mathbf{X}(\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top$$
R² coefficient of determination (Eq. 11.51):
$$R^2 = 1 - \frac{\text{RSS}}{\text{TSS}} = 1 - \frac{\sum (y_n - \hat{y}_n)^2}{\sum (y_n - \bar{y})^2}$$
1. Generate Synthetic mRNA → Protein Data¶
We simulate a cell where protein abundance depends on mRNA expression plus noise from translation variability.
# --- Simple (1D) data: one mRNA predictor ---
N = 100
# True parameters: protein = w0 + w1 * mRNA + noise
w0_true = 5.0 # basal protein level (constitutive translation)
w1_true = 2.5 # translation efficiency (proteins per mRNA unit)
sigma_true = 3.0 # noise from stochastic translation
# mRNA expression levels (log-scale, centered)
mRNA = np.random.uniform(0, 10, N)
protein = w0_true + w1_true * mRNA + np.random.randn(N) * sigma_true
print(f"Generated {N} gene expression measurements")
print(f"True model: protein = {w0_true} + {w1_true} × mRNA + ε, ε ~ N(0, {sigma_true}²)")
print(f"mRNA range: [{mRNA.min():.1f}, {mRNA.max():.1f}]")
print(f"Protein range: [{protein.min():.1f}, {protein.max():.1f}]")
Generated 100 gene expression measurements
True model: protein = 5.0 + 2.5 × mRNA + ε, ε ~ N(0, 3.0²)
mRNA range: [0.1, 9.9]
Protein range: [3.4, 32.2]
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(mRNA, protein, alpha=0.6, s=30, c='#2196F3', edgecolors='white', linewidths=0.3)
x_line = np.linspace(0, 10, 100)
ax.plot(x_line, w0_true + w1_true * x_line, 'k--', linewidth=2,
label=f'True: y = {w0_true} + {w1_true}x')
ax.set_xlabel('mRNA Expression (log₂ TPM)', fontsize=12)
ax.set_ylabel('Protein Abundance (log₂ intensity)', fontsize=12)
ax.set_title('mRNA → Protein Relationship', fontsize=14)
ax.legend(fontsize=11)
plt.tight_layout()
plt.show()
2. Simple Linear Regression — 1D (Section 11.2.3.2)¶
For scalar input $x$, the model is $f(x; \mathbf{w}) = w_0 + w_1 x$. The closed-form MLE is (Eq. 11.27–11.28):
$$\hat{w}_1 = \frac{C_{xy}}{C_{xx}} = \frac{\text{Cov}[X, Y]}{\text{Var}[X]} = \rho_{xy} \frac{\sigma_y}{\sigma_x}$$
$$\hat{w}_0 = \bar{y} - \hat{w}_1 \bar{x}$$
The prediction can be read in standardized units: compute how many standard deviations $x$ lies from $\bar{x}$, then predict $y$ as $\bar{y}$ plus $\rho$ times that many standard deviations of $y$.
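This standardized-units reading can be checked directly: after z-scoring both variables, the least-squares slope is exactly the correlation coefficient. A self-contained sketch on synthetic data (the variable names here are illustrative, not the notebook's):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, 500)
y = 5.0 + 2.5 * x + rng.standard_normal(500) * 3.0

# Standardize both variables (z-scores)
zx = (x - x.mean()) / x.std()
zy = (y - y.mean()) / y.std()

# Regression of zy on zx: both are centered with unit variance,
# so the least-squares slope reduces to mean(zx * zy) = rho
slope_z = np.mean(zx * zy)
rho = np.corrcoef(x, y)[0, 1]
print(slope_z, rho)  # identical up to floating point
```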
# Simple linear regression using the closed-form (Eq. 11.27-11.28)
x_bar = np.mean(mRNA)
y_bar = np.mean(protein)
# Covariance and variance
Cxy = np.mean((mRNA - x_bar) * (protein - y_bar))
Cxx = np.mean((mRNA - x_bar) ** 2)
Cyy = np.mean((protein - y_bar) ** 2)
# MLE estimates (Eq. 11.27-11.28)
w1_hat = Cxy / Cxx
w0_hat = y_bar - w1_hat * x_bar
# Correlation coefficient
sigma_x = np.sqrt(Cxx)
sigma_y = np.sqrt(Cyy)
rho_xy = Cxy / (sigma_x * sigma_y)
print("Simple Linear Regression (1D)")
print("=" * 45)
print(f" x̄ = {x_bar:.3f}, ȳ = {y_bar:.3f}")
print(f" Cov(X,Y) = {Cxy:.3f}")
print(f" Var(X) = {Cxx:.3f}")
print(f" ρ(X,Y) = {rho_xy:.3f}")
print(f"\n ŵ₁ = Cxy/Cxx = {w1_hat:.3f} (true: {w1_true})")
print(f" ŵ₀ = ȳ - ŵ₁x̄ = {w0_hat:.3f} (true: {w0_true})")
print(f"\nAlternative: ŵ₁ = ρ × σ_y/σ_x = {rho_xy:.3f} × {sigma_y:.3f}/{sigma_x:.3f} = {rho_xy * sigma_y / sigma_x:.3f}")
Simple Linear Regression (1D)
=============================================
 x̄ = 4.702, ȳ = 16.751
 Cov(X,Y) = 20.695
 Var(X) = 8.761
 ρ(X,Y) = 0.933

 ŵ₁ = Cxy/Cxx = 2.362 (true: 2.5)
 ŵ₀ = ȳ - ŵ₁x̄ = 5.645 (true: 5.0)

Alternative: ŵ₁ = ρ × σ_y/σ_x = 0.933 × 7.493/2.960 = 2.362
3. Ordinary Least Squares — Matrix Form (Section 11.2.2.1)¶
For $D$-dimensional inputs, we absorb the bias into $\mathbf{w}$ by prepending a column of ones to $\mathbf{X}$. The OLS solution minimizes the RSS (Eq. 11.6):
$$\text{RSS}(\mathbf{w}) = \frac{1}{2}(\mathbf{X}\mathbf{w} - \mathbf{y})^\top(\mathbf{X}\mathbf{w} - \mathbf{y})$$
Setting the gradient to zero (Eq. 11.7):
$$\nabla_{\mathbf{w}} \text{RSS} = \mathbf{X}^\top \mathbf{X} \mathbf{w} - \mathbf{X}^\top \mathbf{y} = \mathbf{0}$$
gives the normal equations $\mathbf{X}^\top \mathbf{X} \hat{\mathbf{w}} = \mathbf{X}^\top \mathbf{y}$, with solution:
$$\hat{\mathbf{w}} = (\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{y}$$
The Hessian $\mathbf{H} = \mathbf{X}^\top \mathbf{X}$ is positive definite when $\mathbf{X}$ has full column rank, so the solution is a unique global minimum (Eq. 11.10–11.11).
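A quick way to trust the gradient in Eq. 11.7 is a finite-difference check. The following sketch uses its own small synthetic problem (not the notebook's variables); since the RSS is quadratic, central differences should match the analytic gradient almost exactly:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((30, 3))
y = rng.standard_normal(30)
w = rng.standard_normal(3)

def rss(w):
    return 0.5 * np.sum((X @ w - y) ** 2)

# Analytic gradient from Eq. 11.7
grad_analytic = X.T @ X @ w - X.T @ y

# Central finite differences, one coordinate at a time
eps = 1e-6
grad_fd = np.array([(rss(w + eps * e) - rss(w - eps * e)) / (2 * eps)
                    for e in np.eye(3)])
print(np.max(np.abs(grad_analytic - grad_fd)))
```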
# Design matrix with bias column
X = np.column_stack([np.ones(N), mRNA]) # [1, x]
y = protein
# OLS via normal equations (Eq. 11.9)
XtX = X.T @ X
Xty = X.T @ y
w_ols = np.linalg.solve(XtX, Xty) # more stable than inverting
# Verify: also compute via pseudo-inverse
w_pinv = np.linalg.pinv(X) @ y
# Predictions
y_hat = X @ w_ols
print("OLS Solution (matrix form)")
print("=" * 45)
print(f" ŵ = (X'X)⁻¹X'y = [{w_ols[0]:.3f}, {w_ols[1]:.3f}]")
print(f" Pseudo-inverse: [{w_pinv[0]:.3f}, {w_pinv[1]:.3f}]")
print(f" True: [{w0_true:.3f}, {w1_true:.3f}]")
print(f"\n X'X shape: {XtX.shape}")
print(f" X'X is PD: eigenvalues = {np.linalg.eigvalsh(XtX)}")
print(f" → Unique global minimum confirmed.")
OLS Solution (matrix form)
=============================================
 ŵ = (X'X)⁻¹X'y = [5.645, 2.362]
 Pseudo-inverse: [5.645, 2.362]
 True: [5.000, 2.500]

 X'X shape: (2, 2)
 X'X is PD: eigenvalues = [ 27.73401912 3159.11479702]
 → Unique global minimum confirmed.
# Visualize the RSS surface and the OLS solution (cf. PML Figure 11.2)
w0_range = np.linspace(-5, 15, 100)
w1_range = np.linspace(0, 5, 100)
W0, W1 = np.meshgrid(w0_range, w1_range)
RSS_surface = np.zeros_like(W0)
for i in range(W0.shape[0]):
    for j in range(W0.shape[1]):
        w_ij = np.array([W0[i, j], W1[i, j]])
        residuals = X @ w_ij - y
        RSS_surface[i, j] = 0.5 * np.sum(residuals ** 2)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: contour plot
cs = axes[0].contour(W0, W1, RSS_surface, levels=20, cmap='viridis')
axes[0].clabel(cs, inline=True, fontsize=7)
axes[0].plot(w_ols[0], w_ols[1], 'r*', markersize=15, label=r'OLS $\hat{\mathbf{w}}$', zorder=5)
axes[0].plot(w0_true, w1_true, 'k+', markersize=12, markeredgewidth=2, label=r'True $\mathbf{w}$', zorder=5)
axes[0].set_xlabel('$w_0$ (intercept)', fontsize=12)
axes[0].set_ylabel('$w_1$ (slope)', fontsize=12)
axes[0].set_title('RSS Contours', fontsize=14)
axes[0].legend(fontsize=11)
# Right: surface plot
ax3d = fig.add_subplot(122, projection='3d')
ax3d.plot_surface(W0, W1, RSS_surface, cmap='viridis', alpha=0.8, edgecolor='none')
ax3d.scatter([w_ols[0]], [w_ols[1]], [0.5 * np.sum((X @ w_ols - y)**2)],
c='red', s=100, marker='*', zorder=5)
ax3d.set_xlabel('$w_0$', fontsize=10)
ax3d.set_ylabel('$w_1$', fontsize=10)
ax3d.set_zlabel('RSS', fontsize=10)
ax3d.set_title('RSS Surface', fontsize=14)
ax3d.view_init(elev=25, azim=-50)
# Remove auto-generated axes[1]
axes[1].remove()
plt.tight_layout()
plt.show()
print("The RSS is a convex quadratic bowl — the OLS solution is the unique global minimum.")
The RSS is a convex quadratic bowl — the OLS solution is the unique global minimum.
4. Geometric Interpretation — Orthogonal Projection (Section 11.2.2.2)¶
The OLS solution projects $\mathbf{y}$ onto the column space of $\mathbf{X}$. The predicted vector $\hat{\mathbf{y}} = \mathbf{X}\hat{\mathbf{w}}$ is the closest point in $\text{span}(\mathbf{X})$ to $\mathbf{y}$, and the residual $\mathbf{y} - \hat{\mathbf{y}}$ is orthogonal to every column of $\mathbf{X}$ (Eq. 11.14):
$$\mathbf{X}^\top(\mathbf{y} - \hat{\mathbf{y}}) = \mathbf{0}$$
The hat matrix (Eq. 11.16) maps observations to predictions:
$$\hat{\mathbf{y}} = \underbrace{\mathbf{X}(\mathbf{X}^\top \mathbf{X})^{-1}\mathbf{X}^\top}_{\text{Proj}(\mathbf{X})} \mathbf{y}$$
We illustrate with $N=3$ observations and $D=2$ features (including bias), so $\mathbf{y} \in \mathbb{R}^3$ is projected onto a 2D subspace.
# Geometric illustration: project y in R^3 onto 2D column space of X
# Use 3 data points with 2 features (bias + one predictor)
X_geo = np.array([[1, 1.0],
[1, 3.0],
[1, 2.0]]) # 3×2 design matrix
y_geo = np.array([3.5, 8.0, 7.0]) # observations in R^3
# OLS solution
w_geo = np.linalg.solve(X_geo.T @ X_geo, X_geo.T @ y_geo)
y_hat_geo = X_geo @ w_geo
residual = y_geo - y_hat_geo
# Hat matrix
H_mat = X_geo @ np.linalg.inv(X_geo.T @ X_geo) @ X_geo.T
# Verify orthogonality: X^T (y - y_hat) = 0
orth_check = X_geo.T @ residual
print("Geometric Interpretation (N=3, D=2)")
print("=" * 45)
print(f" y = {y_geo}")
print(f" ŷ = {y_hat_geo}")
print(f" y - ŷ = {residual} (residual)")
print(f"\n X'(y - ŷ) = {orth_check} (≈ 0, orthogonality confirmed)")
print(f" ||y - ŷ|| = {np.linalg.norm(residual):.3f}")
Geometric Interpretation (N=3, D=2)
=============================================
 y = [3.5 8. 7. ]
 ŷ = [3.91666667 8.41666667 6.16666667]
 y - ŷ = [-0.41666667 -0.41666667 0.83333333] (residual)

 X'(y - ŷ) = [1.33226763e-15 3.10862447e-15] (≈ 0, orthogonality confirmed)
 ||y - ŷ|| = 1.021
# 3D visualization of the projection
fig = plt.figure(figsize=(9, 7))
ax = fig.add_subplot(111, projection='3d')
# Column space of X: spanned by x_1 = [1,1,1] and x_2 = [1,3,2]
col1 = X_geo[:, 0] # bias column
col2 = X_geo[:, 1] # feature column
# Draw the 2D plane (column space)
s_range = np.linspace(-0.5, 1.5, 10)
t_range = np.linspace(-0.3, 0.8, 10)
S, T = np.meshgrid(s_range, t_range)
plane_x = S * col1[0] + T * col2[0]
plane_y = S * col1[1] + T * col2[1]
plane_z = S * col1[2] + T * col2[2]
ax.plot_surface(plane_x, plane_y, plane_z, alpha=0.15, color='#2196F3')
# Column vectors
origin = np.zeros(3)
ax.quiver(*origin, *col1, color='#2196F3', linewidth=2.5, arrow_length_ratio=0.08,
label=r'$\mathbf{x}_{:,1}$ (bias column)')
ax.quiver(*origin, *col2, color='#4CAF50', linewidth=2.5, arrow_length_ratio=0.08,
label=r'$\mathbf{x}_{:,2}$ (mRNA column)')
# Target vector y
ax.quiver(*origin, *y_geo, color='#F44336', linewidth=2.5, arrow_length_ratio=0.05,
label=r'$\mathbf{y}$ (protein)')
# Projection y_hat
ax.quiver(*origin, *y_hat_geo, color='#FF9800', linewidth=2.5, arrow_length_ratio=0.06,
label=r'$\hat{\mathbf{y}}$ (projection)')
# Residual vector (y - y_hat), drawn from y_hat to y
ax.quiver(*y_hat_geo, *residual, color='gray', linewidth=2, linestyle='--',
arrow_length_ratio=0.1, label=r'$\mathbf{y} - \hat{\mathbf{y}}$ (residual)')
# Mark the right angle
ax.scatter(*y_hat_geo, color='#FF9800', s=80, zorder=5)
ax.set_xlabel('Obs 1', fontsize=11)
ax.set_ylabel('Obs 2', fontsize=11)
ax.set_zlabel('Obs 3', fontsize=11)
ax.set_title(r'OLS as Orthogonal Projection' + '\n' + r'$\hat{\mathbf{y}} = \mathrm{Proj}(\mathbf{X})\,\mathbf{y}$',
fontsize=14)
ax.legend(fontsize=9, loc='upper left')
ax.view_init(elev=20, azim=-60)
plt.tight_layout()
plt.show()
print("The residual y − ŷ is perpendicular to the column space of X.")
print("OLS finds the point in span(X) closest to y in Euclidean distance.")
The residual y − ŷ is perpendicular to the column space of X.
OLS finds the point in span(X) closest to y in Euclidean distance.
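Two projector properties the cells above do not verify, symmetry and idempotence, can be confirmed on the same small design matrix. A minimal check (the trace identity is a standard fact, not from the notebook):

```python
import numpy as np

# Same 3x2 design as above
X = np.array([[1, 1.0],
              [1, 3.0],
              [1, 2.0]])
P = X @ np.linalg.inv(X.T @ X) @ X.T  # hat matrix Proj(X)

print(np.allclose(P, P.T))    # symmetric
print(np.allclose(P @ P, P))  # idempotent: projecting twice changes nothing
print(np.trace(P))            # trace = rank(X) = number of fitted parameters = 2
```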
5. Solving the Normal Equations: Direct vs QR vs SVD (Section 11.2.2.3)¶
While $(\mathbf{X}^\top \mathbf{X})^{-1}$ is the textbook formula, directly inverting $\mathbf{X}^\top \mathbf{X}$ can be numerically unstable. Better approaches:
QR decomposition (Eq. 11.19–11.21): Let $\mathbf{X} = \mathbf{QR}$ where $\mathbf{Q}^\top\mathbf{Q} = \mathbf{I}$. Then:
$$\hat{\mathbf{w}} = \mathbf{R}^{-1}(\mathbf{Q}^\top \mathbf{y})$$
Since $\mathbf{R}$ is upper triangular, we solve via backsubstitution — no matrix inversion needed.
SVD: The most numerically robust approach, used by sklearn and scipy.linalg.lstsq.
from scipy.linalg import lstsq, solve_triangular
# Method 1: Normal equations (direct inverse)
w_direct = np.linalg.inv(X.T @ X) @ X.T @ y
# Method 2: QR decomposition (Eq. 11.19-11.21)
Q, R = np.linalg.qr(X)
w_qr = solve_triangular(R, Q.T @ y) # backsubstitution
# Method 3: SVD (via scipy.linalg.lstsq, which calls LAPACK DGELSD)
w_svd, residuals_svd, rank, sv = lstsq(X, y)
# Method 4: np.linalg.solve (LU decomposition of X'X)
w_solve = np.linalg.solve(X.T @ X, X.T @ y)
print("Comparison of OLS Solvers")
print("=" * 55)
print(f"{'Method':<25s} {'w₀':>10s} {'w₁':>10s}")
print("-" * 55)
print(f"{'True':25s} {w0_true:10.4f} {w1_true:10.4f}")
print(f"{'Direct inverse':25s} {w_direct[0]:10.4f} {w_direct[1]:10.4f}")
print(f"{'QR decomposition':25s} {w_qr[0]:10.4f} {w_qr[1]:10.4f}")
print(f"{'SVD (lstsq)':25s} {w_svd[0]:10.4f} {w_svd[1]:10.4f}")
print(f"{'LU (np.linalg.solve)':25s} {w_solve[0]:10.4f} {w_solve[1]:10.4f}")
print("-" * 55)
print(f"\nSingular values of X: {sv}")
print(f"Condition number: {sv[0]/sv[1]:.1f}")
print("All methods agree — QR and SVD are preferred for numerical stability.")
Comparison of OLS Solvers
=======================================================
Method                            w₀         w₁
-------------------------------------------------------
True                          5.0000     2.5000
Direct inverse                5.6453     2.3621
QR decomposition              5.6453     2.3621
SVD (lstsq)                   5.6453     2.3621
LU (np.linalg.solve)          5.6453     2.3621
-------------------------------------------------------

Singular values of X: [56.20600321 5.26630982]
Condition number: 10.7
All methods agree — QR and SVD are preferred for numerical stability.
6. Multiple Linear Regression (Section 11.2.1)¶
In whole cell modeling, protein abundance depends on multiple factors: mRNA level, ribosome availability, codon adaptation, and chaperone expression. With $D$ predictors:
$$y = w_0 + w_1 x_1 + w_2 x_2 + \cdots + w_D x_D + \varepsilon$$
Each coefficient $w_d$ represents the change in protein abundance per unit change in predictor $x_d$, holding all other predictors constant (Section 11.2.3.3).
# Multiple regression: protein ~ mRNA + ribosome_density + codon_score
D = 3
w_true_multi = np.array([3.0, # w0: basal expression
2.0, # w1: mRNA level
1.5, # w2: ribosome density
0.8]) # w3: codon adaptation index
feature_names = ['mRNA level', 'Ribosome density', 'Codon adaptation']
# Generate correlated predictors (realistic: mRNA and ribosomes are correlated)
mean_feat = np.array([5.0, 3.0, 0.5])
cov_feat = np.array([[2.0, 0.5, 0.1],
[0.5, 1.0, 0.2],
[0.1, 0.2, 0.3]])
X_multi_raw = np.random.multivariate_normal(mean_feat, cov_feat, N)
# Design matrix with bias
X_multi = np.column_stack([np.ones(N), X_multi_raw])
y_multi = X_multi @ w_true_multi + np.random.randn(N) * 2.0
# OLS
w_multi_hat = np.linalg.solve(X_multi.T @ X_multi, X_multi.T @ y_multi)
print("Multiple Linear Regression")
print("=" * 50)
print(f"{'Feature':<25s} {'True w':>10s} {'Estimated ŵ':>12s}")
print("-" * 50)
print(f"{'Intercept':<25s} {w_true_multi[0]:10.3f} {w_multi_hat[0]:12.3f}")
for i, name in enumerate(feature_names):
    print(f"{name:<25s} {w_true_multi[i+1]:10.3f} {w_multi_hat[i+1]:12.3f}")
print("\nEach coefficient = change in protein per unit change in that predictor,")
print("holding all others constant (partial regression coefficient, Eq. 11.30).")
Multiple Linear Regression
==================================================
Feature                       True w  Estimated ŵ
--------------------------------------------------
Intercept                      3.000        2.497
mRNA level                     2.000        2.246
Ribosome density               1.500        1.153
Codon adaptation               0.800        0.935

Each coefficient = change in protein per unit change in that predictor,
holding all others constant (partial regression coefficient, Eq. 11.30).
7. Partial Regression (Section 11.2.3.3)¶
The partial regression coefficient $w_1 = R_{YX_1 \cdot X_2}$ measures the effect of $X_1$ on $Y$ after removing the effect of $X_2$ from both.
This is the Frisch-Waugh-Lovell theorem: regress $Y$ on $X_2$, regress $X_1$ on $X_2$, then regress the residuals of $Y$ on the residuals of $X_1$. The resulting slope equals $w_1$ from the full model.
# Demonstrate partial regression for the first predictor (mRNA)
# Full model: y = w0 + w1*mRNA + w2*ribosome + w3*codon + eps
# Partial regression of Y on X1, controlling for X2, X3:
# 1. Regress Y on (1, X2, X3) → residuals e_Y
# 2. Regress X1 on (1, X2, X3) → residuals e_X1
# 3. Regress e_Y on e_X1 → slope = w1
# Step 1: Regress Y on confounders (intercept + ribosome + codon)
X_confounders = np.column_stack([np.ones(N), X_multi_raw[:, 1], X_multi_raw[:, 2]])
w_y_conf = np.linalg.solve(X_confounders.T @ X_confounders, X_confounders.T @ y_multi)
e_Y = y_multi - X_confounders @ w_y_conf
# Step 2: Regress X1 (mRNA) on confounders
w_x1_conf = np.linalg.solve(X_confounders.T @ X_confounders, X_confounders.T @ X_multi_raw[:, 0])
e_X1 = X_multi_raw[:, 0] - X_confounders @ w_x1_conf
# Step 3: Regress e_Y on e_X1 (simple regression through origin)
w1_partial = np.sum(e_X1 * e_Y) / np.sum(e_X1 ** 2)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left: naive scatter (mRNA vs protein, ignoring confounders)
axes[0].scatter(X_multi_raw[:, 0], y_multi, alpha=0.5, s=30, c='#2196F3',
edgecolors='white', linewidths=0.3)
# Simple regression line
X_simple = np.column_stack([np.ones(N), X_multi_raw[:, 0]])
w_simple = np.linalg.solve(X_simple.T @ X_simple, X_simple.T @ y_multi)
x_line = np.linspace(X_multi_raw[:, 0].min(), X_multi_raw[:, 0].max(), 100)
axes[0].plot(x_line, w_simple[0] + w_simple[1] * x_line, 'r-', linewidth=2,
label=f'Simple: slope = {w_simple[1]:.3f}')
axes[0].set_xlabel('mRNA Level', fontsize=12)
axes[0].set_ylabel('Protein Abundance', fontsize=12)
axes[0].set_title('Simple Regression (ignores confounders)', fontsize=13)
axes[0].legend(fontsize=10)
# Right: partial regression (residualized)
axes[1].scatter(e_X1, e_Y, alpha=0.5, s=30, c='#4CAF50',
edgecolors='white', linewidths=0.3)
e_line = np.linspace(e_X1.min(), e_X1.max(), 100)
axes[1].plot(e_line, w1_partial * e_line, 'r-', linewidth=2,
label=f'Partial: slope = {w1_partial:.3f}')
axes[1].axhline(0, color='gray', linestyle='--', alpha=0.3)
axes[1].axvline(0, color='gray', linestyle='--', alpha=0.3)
axes[1].set_xlabel('mRNA residual (effect of other predictors removed)', fontsize=11)
axes[1].set_ylabel('Protein residual', fontsize=11)
axes[1].set_title('Partial Regression Plot (Frisch-Waugh)', fontsize=13)
axes[1].legend(fontsize=10)
plt.tight_layout()
plt.show()
print(f"Simple regression slope for mRNA: {w_simple[1]:.4f} (confounded)")
print(f"Partial regression slope for mRNA: {w1_partial:.4f}")
print(f"Full OLS coefficient for mRNA: {w_multi_hat[1]:.4f}")
print(f"True coefficient: {w_true_multi[1]:.4f}")
print("\nPartial regression = full OLS coefficient (Frisch-Waugh theorem).")
Simple regression slope for mRNA: 2.6139 (confounded)
Partial regression slope for mRNA: 2.2464
Full OLS coefficient for mRNA: 2.2464
True coefficient: 2.0000

Partial regression = full OLS coefficient (Frisch-Waugh theorem).
8. Polynomial Regression — Nonlinear Features (Section 11.2.1)¶
When the mRNA → protein relationship is nonlinear (e.g., saturation at high expression), we can use a feature transformation $\boldsymbol{\phi}(x) = [1, x, x^2, \ldots, x^d]$ (Eq. 11.3):
$$p(y \mid x, \boldsymbol{\theta}) = \mathcal{N}(y \mid \mathbf{w}^\top \boldsymbol{\phi}(x),\; \sigma^2)$$
The model is linear in the parameters $\mathbf{w}$, but nonlinear in the input $x$.
# Generate nonlinear data: protein response flattens at high mRNA
# (a concave quadratic as a rough stand-in for Michaelis-Menten-like saturation)
np.random.seed(42)
N_poly = 80
x_poly = np.random.uniform(0.5, 8, N_poly)
y_poly = 5.0 + 4.0 * x_poly - 0.3 * x_poly**2 + np.random.randn(N_poly) * 2.0
def polynomial_features(x, degree):
    """Create polynomial design matrix [1, x, x², ..., x^d]."""
    return np.column_stack([x**d for d in range(degree + 1)])
# Fit polynomials of different degrees
x_plot = np.linspace(0, 9, 200)
degrees = [1, 2, 5, 15]
fig, axes = plt.subplots(1, 4, figsize=(18, 4))
for ax, deg in zip(axes, degrees):
    Phi = polynomial_features(x_poly, deg)
    Phi_plot = polynomial_features(x_plot, deg)
    # OLS fit
    w_poly = np.linalg.lstsq(Phi, y_poly, rcond=None)[0]
    y_pred = Phi_plot @ w_poly
    # RSS and R²
    y_hat_poly = Phi @ w_poly
    rss = np.sum((y_poly - y_hat_poly)**2)
    tss = np.sum((y_poly - np.mean(y_poly))**2)
    r2 = 1 - rss / tss
    ax.scatter(x_poly, y_poly, alpha=0.5, s=25, c='#2196F3',
               edgecolors='white', linewidths=0.3)
    ax.plot(x_plot, y_pred, 'r-', linewidth=2)
    ax.set_title(f'Degree {deg} (R² = {r2:.3f})', fontsize=12)
    ax.set_xlabel('mRNA', fontsize=10)
    if deg == degrees[0]:
        ax.set_ylabel('Protein', fontsize=10)
    ax.set_ylim(y_poly.min() - 5, y_poly.max() + 5)
plt.suptitle('Polynomial Regression: Underfitting vs Overfitting (cf. PML Figure 11.1)',
fontsize=13, y=1.02)
plt.tight_layout()
plt.show()
print("Degree 1: underfits (misses the curvature).")
print("Degree 2: good fit (captures the saturation).")
print("Degree 15: overfits (wiggles through noise, high R² on training data).")
Degree 1: underfits (misses the curvature).
Degree 2: good fit (captures the saturation).
Degree 15: overfits (wiggles through noise, high R² on training data).
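The training-set R² in the panel titles never penalizes overfitting; a held-out split makes the point quantitatively. A self-contained sketch on the same kind of quadratic data (the split and seed are arbitrary choices, not part of the notebook):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0.5, 8, 80)
y = 5.0 + 4.0 * x - 0.3 * x**2 + rng.standard_normal(80) * 2.0
train, test = np.arange(60), np.arange(60, 80)  # simple holdout split

def poly_design(x, degree):
    """Design matrix [1, x, ..., x^degree]."""
    return np.column_stack([x**d for d in range(degree + 1)])

def r2(y_true, y_pred):
    return 1 - np.sum((y_true - y_pred)**2) / np.sum((y_true - y_true.mean())**2)

results = {}
for deg in (1, 2, 15):
    Phi_tr = poly_design(x[train], deg)
    w = np.linalg.lstsq(Phi_tr, y[train], rcond=None)[0]
    results[deg] = (r2(y[train], Phi_tr @ w),
                    r2(y[test], poly_design(x[test], deg) @ w))
    print(f"degree {deg:2d}: train R2 = {results[deg][0]:.3f}, "
          f"test R2 = {results[deg][1]:.3f}")
```

Training R² can only increase with degree (the models are nested); the held-out score is what exposes the degree-15 fit.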
9. Offset and Slope Separately (Section 11.2.3.1)¶
Instead of absorbing the bias into $\mathbf{w}$, we can solve for offset and slope separately using centered data (Eq. 11.25–11.26):
$$\hat{\mathbf{w}} = (\mathbf{X}_c^\top \mathbf{X}_c)^{-1} \mathbf{X}_c^\top \mathbf{y}_c$$
$$\hat{w}_0 = \bar{y} - \bar{\mathbf{x}}^\top \hat{\mathbf{w}}$$
where $\mathbf{X}_c$ contains centered rows $\mathbf{x}_n - \bar{\mathbf{x}}$ and $\mathbf{y}_c = \mathbf{y} - \bar{y}$.
# Solve offset and slope separately using centered data (Eq. 11.25-11.26)
# Using the multiple regression data
x_bar_multi = np.mean(X_multi_raw, axis=0) # feature means
y_bar_multi = np.mean(y_multi) # target mean
# Center
Xc = X_multi_raw - x_bar_multi
yc = y_multi - y_bar_multi
# Slope from centered data (Eq. 11.25)
w_slope = np.linalg.solve(Xc.T @ Xc, Xc.T @ yc)
# Intercept (Eq. 11.26)
w_intercept = y_bar_multi - x_bar_multi @ w_slope
print("Solving Offset and Slope Separately (centered data)")
print("=" * 55)
print(f"{'':25s} {'Centered':>12s} {'Full OLS':>12s} {'True':>10s}")
print("-" * 55)
print(f"{'w₀ (intercept)':<25s} {w_intercept:12.4f} {w_multi_hat[0]:12.4f} {w_true_multi[0]:10.4f}")
for i, name in enumerate(feature_names):
    print(f"{name:<25s} {w_slope[i]:12.4f} {w_multi_hat[i+1]:12.4f} {w_true_multi[i+1]:10.4f}")
print("\nBoth approaches give identical results.")
Solving Offset and Slope Separately (centered data)
=======================================================
Centered Full OLS True
-------------------------------------------------------
w₀ (intercept) 2.4969 2.4969 3.0000
mRNA level 2.2464 2.2464 2.0000
Ribosome density 1.1532 1.1532 1.5000
Codon adaptation 0.9347 0.9347 0.8000
Both approaches give identical results.
10. Weighted Least Squares (Section 11.2.2.4)¶
In many biological systems, measurement noise is heteroskedastic — variance depends on the signal level. For example, protein quantification is noisier at high abundance.
The heteroskedastic model (Eq. 11.22):
$$p(y \mid \mathbf{x}; \boldsymbol{\theta}) = \mathcal{N}(y \mid \mathbf{w}^\top \mathbf{x},\; \sigma^2(\mathbf{x}))$$
The WLS estimate (Eq. 11.24):
$$\hat{\mathbf{w}}_{\text{WLS}} = (\mathbf{X}^\top \boldsymbol{\Lambda} \mathbf{X})^{-1} \mathbf{X}^\top \boldsymbol{\Lambda} \mathbf{y}$$
where $\boldsymbol{\Lambda} = \text{diag}(1/\sigma^2(\mathbf{x}_n))$. Points with lower variance (more reliable) get higher weight.
# Generate heteroskedastic data: noise increases with mRNA level
np.random.seed(42)
N_wls = 150
x_wls = np.random.uniform(0, 10, N_wls)
sigma_x = 1.0 + 1.5 * x_wls # noise increases with x
y_wls = 5.0 + 2.5 * x_wls + np.random.randn(N_wls) * sigma_x
# Design matrix
X_wls = np.column_stack([np.ones(N_wls), x_wls])
# OLS (ignores heteroskedasticity)
w_ols_h = np.linalg.solve(X_wls.T @ X_wls, X_wls.T @ y_wls)
# WLS with known variance (Eq. 11.24)
Lambda = np.diag(1.0 / sigma_x**2) # precision weights
w_wls = np.linalg.solve(X_wls.T @ Lambda @ X_wls, X_wls.T @ Lambda @ y_wls)
# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
x_line = np.linspace(0, 10, 100)
# Left: data with OLS and WLS fits
axes[0].scatter(x_wls, y_wls, alpha=0.4, s=25, c='#2196F3',
edgecolors='white', linewidths=0.3, label='Data')
axes[0].plot(x_line, w_ols_h[0] + w_ols_h[1] * x_line, 'r-', linewidth=2,
label=f'OLS: y = {w_ols_h[0]:.1f} + {w_ols_h[1]:.2f}x')
axes[0].plot(x_line, w_wls[0] + w_wls[1] * x_line, 'g-', linewidth=2,
label=f'WLS: y = {w_wls[0]:.1f} + {w_wls[1]:.2f}x')
axes[0].plot(x_line, 5.0 + 2.5 * x_line, 'k--', linewidth=1.5, alpha=0.5,
label='True: y = 5.0 + 2.50x')
axes[0].fill_between(x_line, 5.0 + 2.5 * x_line - 2*(1.0 + 1.5*x_line),
5.0 + 2.5 * x_line + 2*(1.0 + 1.5*x_line),
alpha=0.1, color='gray', label='±2σ(x)')
axes[0].set_xlabel('mRNA Expression', fontsize=12)
axes[0].set_ylabel('Protein Abundance', fontsize=12)
axes[0].set_title('Heteroskedastic Data: OLS vs WLS', fontsize=13)
axes[0].legend(fontsize=9)
# Right: weights used in WLS
weights = 1.0 / sigma_x**2
sc = axes[1].scatter(x_wls, y_wls, c=weights, cmap='YlOrRd', s=30, alpha=0.7,
edgecolors='white', linewidths=0.3)
plt.colorbar(sc, ax=axes[1], label=r'Weight $1/\sigma^2(x)$')
axes[1].set_xlabel('mRNA Expression', fontsize=12)
axes[1].set_ylabel('Protein Abundance', fontsize=12)
axes[1].set_title('WLS Weights (low noise → high weight)', fontsize=13)
plt.tight_layout()
plt.show()
print(f"True parameters: w₀ = 5.00, w₁ = 2.50")
print(f"OLS estimates: w₀ = {w_ols_h[0]:.2f}, w₁ = {w_ols_h[1]:.2f}")
print(f"WLS estimates: w₀ = {w_wls[0]:.2f}, w₁ = {w_wls[1]:.2f}")
print("\nWLS gives more accurate estimates by downweighting noisy high-expression points.")
True parameters: w₀ = 5.00, w₁ = 2.50
OLS estimates: w₀ = 4.36, w₁ = 2.76
WLS estimates: w₀ = 5.04, w₁ = 2.57

WLS gives more accurate estimates by downweighting noisy high-expression points.
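One way to see why Eq. 11.24 works: WLS is just OLS after whitening, i.e. scaling each row of $\mathbf{X}$ and $\mathbf{y}$ by $1/\sigma(\mathbf{x}_n)$. A self-contained check on synthetic heteroskedastic data (a standard identity, stated here as a sketch rather than part of the notebook):

```python
import numpy as np

rng = np.random.default_rng(1)
x = rng.uniform(0, 10, 200)
sig = 1.0 + 1.5 * x                      # known noise std per point
y = 2.0 + 3.0 * x + rng.standard_normal(200) * sig
X = np.column_stack([np.ones_like(x), x])

# WLS via the weighted normal equations (Eq. 11.24)
Lam = np.diag(1.0 / sig**2)
w_wls = np.linalg.solve(X.T @ Lam @ X, X.T @ Lam @ y)

# Same answer via ordinary lstsq on whitened data:
# scale each row of X and each target by 1/sigma_n
w_white = np.linalg.lstsq(X / sig[:, None], y / sig, rcond=None)[0]
print(w_wls, w_white)  # agree to floating-point precision
```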
11. Recursive (Online) MLE (Section 11.2.3.4)¶
In streaming experiments (e.g., real-time protein quantification), we want to update estimates as new data arrives. The sufficient statistics can be updated recursively (Eq. 11.33–11.40):
Mean update (Eq. 11.35): $$\bar{x}^{(n+1)} = \bar{x}^{(n)} + \frac{1}{n+1}(x_{n+1} - \bar{x}^{(n)})$$
Covariance update (Eq. 11.40): $$C_{xy}^{(n+1)} = \frac{1}{n+1}\left[x_{n+1}y_{n+1} + nC_{xy}^{(n)} + n\bar{x}^{(n)}\bar{y}^{(n)} - (n+1)\bar{x}^{(n+1)}\bar{y}^{(n+1)}\right]$$
# Online/recursive estimation of simple linear regression (cf. PML Figure 11.4)
def recursive_linreg(x_stream, y_stream):
    """Recursively estimate w0, w1 for simple linear regression."""
    N = len(x_stream)
    w0_history = np.zeros(N)
    w1_history = np.zeros(N)
    # Initialize sufficient statistics with the first point
    x_bar = x_stream[0]
    y_bar = y_stream[0]
    sum_xy = x_stream[0] * y_stream[0]
    sum_xx = x_stream[0] ** 2
    w0_history[0] = y_bar
    w1_history[0] = 0.0
    for n in range(1, N):
        # Update means (Eq. 11.35)
        x_bar_new = x_bar + (x_stream[n] - x_bar) / (n + 1)
        y_bar_new = y_bar + (y_stream[n] - y_bar) / (n + 1)
        # Update sufficient statistics in O(1) per step (Eq. 11.40);
        # no pass over earlier data is needed
        sum_xy += x_stream[n] * y_stream[n]
        sum_xx += x_stream[n] ** 2
        Cxy = (sum_xy - (n + 1) * x_bar_new * y_bar_new) / (n + 1)
        Cxx = (sum_xx - (n + 1) * x_bar_new ** 2) / (n + 1)
        x_bar = x_bar_new
        y_bar = y_bar_new
        # Update regression coefficients
        if Cxx > 1e-10:
            w1_history[n] = Cxy / Cxx
            w0_history[n] = y_bar - w1_history[n] * x_bar
        else:
            w1_history[n] = 0.0
            w0_history[n] = y_bar
    return w0_history, w1_history
# Run online estimation on the 1D data
w0_online, w1_online = recursive_linreg(mRNA, protein)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
n_range = np.arange(1, N + 1)
# Left: w0 over time
axes[0].plot(n_range, w0_online, 'b-', linewidth=1.5, label=r'Online $\hat{w}_0$')
axes[0].axhline(w0_true, color='r', linestyle='--', linewidth=1.5, label=f'True $w_0$ = {w0_true}')
axes[0].axhline(w_ols[0], color='g', linestyle=':', linewidth=1.5, label=rf'Batch OLS $\hat{{w}}_0$ = {w_ols[0]:.2f}')
axes[0].set_xlabel('Number of Observations $n$', fontsize=12)
axes[0].set_ylabel(r'$\hat{w}_0$ (intercept)', fontsize=12)
axes[0].set_title('Online Estimation of Intercept', fontsize=13)
axes[0].legend(fontsize=10)
# Right: w1 over time
axes[1].plot(n_range, w1_online, 'b-', linewidth=1.5, label=r'Online $\hat{w}_1$')
axes[1].axhline(w1_true, color='r', linestyle='--', linewidth=1.5, label=f'True $w_1$ = {w1_true}')
axes[1].axhline(w_ols[1], color='g', linestyle=':', linewidth=1.5, label=rf'Batch OLS $\hat{{w}}_1$ = {w_ols[1]:.2f}')
axes[1].set_xlabel('Number of Observations $n$', fontsize=12)
axes[1].set_ylabel(r'$\hat{w}_1$ (slope)', fontsize=12)
axes[1].set_title('Online Estimation of Slope', fontsize=13)
axes[1].legend(fontsize=10)
plt.tight_layout()
plt.show()
print(f"After all {N} samples, online estimates converge to batch OLS.")
print(f"Online: w₀ = {w0_online[-1]:.4f}, w₁ = {w1_online[-1]:.4f}")
print(f"Batch: w₀ = {w_ols[0]:.4f}, w₁ = {w_ols[1]:.4f}")
After all 100 samples, online estimates converge to batch OLS.
Online: w₀ = 5.6453, w₁ = 2.3621
Batch:  w₀ = 5.6453, w₁ = 2.3621
12. MLE for σ² (Section 11.2.3.6)¶
After estimating $\hat{\mathbf{w}}$, the MLE for the noise variance is simply the mean squared residual (Eq. 11.49):
$$\hat{\sigma}^2_{\text{MLE}} = \frac{1}{N} \sum_{n=1}^{N} (y_n - \mathbf{x}_n^\top \hat{\mathbf{w}})^2$$
# MLE for sigma^2 (Eq. 11.49)
residuals_1d = y - X @ w_ols
sigma2_mle = np.mean(residuals_1d**2)
sigma_mle = np.sqrt(sigma2_mle)
# Unbiased estimate (for reference)
sigma2_unbiased = np.sum(residuals_1d**2) / (N - 2) # N - D degrees of freedom
print("MLE for Noise Variance")
print("=" * 40)
print(f" True σ²: {sigma_true**2:.3f} (σ = {sigma_true:.3f})")
print(f" MLE σ²: {sigma2_mle:.3f} (σ = {sigma_mle:.3f})")
print(f" Unbiased σ²: {sigma2_unbiased:.3f} (σ = {np.sqrt(sigma2_unbiased):.3f})")
print(f"\nThe MLE is biased downward (divides by N, not N-D).")
print(f"The unbiased estimate divides by N - D = {N} - 2 = {N-2}.")
MLE for Noise Variance
========================================
  True σ²:     9.000 (σ = 3.000)
  MLE σ²:      7.259 (σ = 2.694)
  Unbiased σ²: 7.407 (σ = 2.722)

The MLE is biased downward (divides by N, not N-D).
The unbiased estimate divides by N - D = 100 - 2 = 98.
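This downward bias is easy to check by simulation: generate many datasets from a known linear model and average the two estimators. A minimal sketch, with toy data and names (`mle_vals`, `unbiased_vals`) that are illustrative rather than the notebook's variables:

```python
import numpy as np

rng = np.random.default_rng(0)
N, sigma_true = 30, 3.0
mle_vals, unbiased_vals = [], []
for _ in range(2000):
    x = rng.uniform(0, 10, N)
    y = 5.0 + 2.5 * x + rng.normal(0, sigma_true, N)
    X = np.column_stack([np.ones(N), x])
    w = np.linalg.lstsq(X, y, rcond=None)[0]
    rss = np.sum((y - X @ w) ** 2)
    mle_vals.append(rss / N)             # MLE: divide by N
    unbiased_vals.append(rss / (N - 2))  # unbiased: divide by N - D

print(f"True σ² = {sigma_true**2:.2f}")
print(f"Mean MLE estimate      = {np.mean(mle_vals):.2f}")       # systematically below 9
print(f"Mean unbiased estimate = {np.mean(unbiased_vals):.2f}")  # close to 9
```

The gap shrinks as $N$ grows, since the MLE differs from the unbiased estimate only by a factor of $(N-D)/N$.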
13. Goodness of Fit (Section 11.2.4)¶
Residual Plots (Section 11.2.4.1)¶
Plot residuals $r_n = y_n - \hat{y}_n$ vs input $x_n$. Under a good model, residuals should look like a random cloud around zero with no systematic patterns.
R² — Coefficient of Determination (Section 11.2.4.2)¶
$$R^2 = 1 - \frac{\text{RSS}}{\text{TSS}} = 1 - \frac{\sum (y_n - \hat{y}_n)^2}{\sum (y_n - \bar{y})^2}$$
- $R^2 = 0$: model predicts no better than the mean $\bar{y}$
- $R^2 = 1$: model perfectly fits the data
- On held-out data, $R^2$ can even be negative: the model predicts worse than the constant $\bar{y}$
# Residual plots for degree 1 and 2 polynomial fits (cf. PML Figure 11.5)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for col, deg in enumerate([1, 2]):
Phi = polynomial_features(x_poly, deg)
w_fit = np.linalg.lstsq(Phi, y_poly, rcond=None)[0]
y_fit = Phi @ w_fit
residuals = y_poly - y_fit
# Compute R²
rss = np.sum(residuals**2)
tss = np.sum((y_poly - np.mean(y_poly))**2)
r2 = 1 - rss / tss
rmse = np.sqrt(rss / N_poly)
# Top row: residual plots (cf. PML Figure 11.5)
axes[0, col].scatter(x_poly, residuals, alpha=0.6, s=30, c='#2196F3',
edgecolors='white', linewidths=0.3)
axes[0, col].axhline(0, color='black', linestyle='-', linewidth=1)
axes[0, col].fill_between([0, 9], -2*np.std(residuals), 2*np.std(residuals),
alpha=0.1, color='gray')
axes[0, col].set_xlabel('mRNA Expression', fontsize=11)
axes[0, col].set_ylabel(r'Residual $r_n = y_n - \hat{y}_n$', fontsize=11)
axes[0, col].set_title(f'Residual Plot (degree {deg})', fontsize=13)
axes[0, col].text(0.02, 0.95, f'RMSE = {rmse:.2f}',
transform=axes[0, col].transAxes, fontsize=10,
verticalalignment='top',
bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
# Bottom row: fit vs actual plots (cf. PML Figure 11.6)
axes[1, col].scatter(y_fit, y_poly, alpha=0.6, s=30, c='#4CAF50',
edgecolors='white', linewidths=0.3)
lims = [min(y_poly.min(), y_fit.min()) - 1, max(y_poly.max(), y_fit.max()) + 1]
axes[1, col].plot(lims, lims, 'k--', linewidth=1.5, alpha=0.5,
label='Perfect fit')
axes[1, col].set_xlabel(r'Predicted $\hat{y}_n$', fontsize=11)
axes[1, col].set_ylabel('Actual $y_n$', fontsize=11)
axes[1, col].set_title(f'Fit vs Actual (degree {deg}, R² = {r2:.3f})', fontsize=13)
axes[1, col].legend(fontsize=10)
axes[1, col].set_xlim(lims)
axes[1, col].set_ylim(lims)
axes[1, col].set_aspect('equal')
plt.tight_layout()
plt.show()
print("Degree 1: residuals show a curved pattern → model misses the nonlinearity.")
print("Degree 2: residuals are a random cloud → good fit.")
print("The fit-vs-actual plot should show points on the diagonal for a good model.")
Degree 1: residuals show a curved pattern → model misses the nonlinearity.
Degree 2: residuals are a random cloud → good fit.
The fit-vs-actual plot should show points on the diagonal for a good model.
# R² for increasing polynomial degree: training vs test
np.random.seed(42)
x_test_poly = np.random.uniform(0.5, 8, 50)
y_test_poly = 5.0 + 4.0 * x_test_poly - 0.3 * x_test_poly**2 + np.random.randn(50) * 2.0
max_degree = 15
r2_train = []
r2_test = []
for deg in range(1, max_degree + 1):
Phi_train = polynomial_features(x_poly, deg)
Phi_test = polynomial_features(x_test_poly, deg)
w_d = np.linalg.lstsq(Phi_train, y_poly, rcond=None)[0]
# Training R²
y_hat_tr = Phi_train @ w_d
r2_tr = 1 - np.sum((y_poly - y_hat_tr)**2) / np.sum((y_poly - np.mean(y_poly))**2)
r2_train.append(r2_tr)
# Test R²
y_hat_te = Phi_test @ w_d
r2_te = 1 - np.sum((y_test_poly - y_hat_te)**2) / np.sum((y_test_poly - np.mean(y_test_poly))**2)
r2_test.append(r2_te)
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(range(1, max_degree + 1), r2_train, 'b-o', linewidth=2, markersize=6, label='Training R²')
ax.plot(range(1, max_degree + 1), r2_test, 'r-s', linewidth=2, markersize=6, label='Test R²')
ax.axhline(1.0, color='gray', linestyle='--', alpha=0.3)
ax.set_xlabel('Polynomial Degree', fontsize=12)
ax.set_ylabel('R²', fontsize=12)
ax.set_title('Training vs Test R² by Polynomial Degree', fontsize=14)
ax.legend(fontsize=11)
ax.set_xticks(range(1, max_degree + 1))
plt.tight_layout()
plt.show()
best_deg = np.argmax(r2_test) + 1
print(f"Training R² always increases with degree (more flexibility).")
print(f"Test R² peaks at degree {best_deg} — beyond that, overfitting degrades generalization.")
Training R² always increases with degree (more flexibility).
Test R² peaks at degree 11 — beyond that, overfitting degrades generalization.
14. Deriving the MLE from a Generative Perspective (Section 11.2.3.5)¶
Instead of fitting the conditional $p(y \mid \mathbf{x})$ directly, we can fit a joint MVN $p(\mathbf{x}, y)$ and then condition. From Eq. 11.46–11.48:
$$\mathbb{E}[y \mid \mathbf{x}] = \mu_y + \boldsymbol{\Sigma}_{xy}^\top \boldsymbol{\Sigma}_{xx}^{-1} (\mathbf{x} - \boldsymbol{\mu}_x)$$
This gives the same weights as OLS:
$$\mathbf{w} = \boldsymbol{\Sigma}_{xx}^{-1} \boldsymbol{\Sigma}_{xy} = (\mathbf{X}_c^\top \mathbf{X}_c)^{-1} \mathbf{X}_c^\top \mathbf{y}_c$$
# Derive MLE from joint MVN (generative approach)
# Fit p(x, y) as a joint Gaussian, then condition on x
# Using the simple 1D data for clarity
# Joint sufficient statistics (Eq. 11.42-11.45)
mu_x = np.mean(mRNA)
mu_y = np.mean(protein)
Sigma_xx = np.mean((mRNA - mu_x)**2) # scalar variance
Sigma_xy = np.mean((mRNA - mu_x) * (protein - mu_y)) # scalar covariance
# Conditional expectation (Eq. 11.46): E[y|x] = mu_y + Sigma_xy/Sigma_xx * (x - mu_x)
w_gen = Sigma_xy / Sigma_xx # = Sigma_xx^{-1} Sigma_xy
w0_gen = mu_y - w_gen * mu_x
print("Generative vs Discriminative MLE")
print("=" * 45)
print(f" Generative (joint MVN): w₀ = {w0_gen:.4f}, w₁ = {w_gen:.4f}")
print(f" Discriminative (OLS): w₀ = {w_ols[0]:.4f}, w₁ = {w_ols[1]:.4f}")
print(f" True: w₀ = {w0_true:.4f}, w₁ = {w1_true:.4f}")
print(f"\n Max difference: {max(abs(w0_gen - w_ols[0]), abs(w_gen - w_ols[1])):.2e}")
print("\nFor Gaussian models, fitting the joint and conditioning")
print("gives the same result as fitting the conditional directly.")
Generative vs Discriminative MLE
=============================================
  Generative (joint MVN): w₀ = 5.6453, w₁ = 2.3621
  Discriminative (OLS):   w₀ = 5.6453, w₁ = 2.3621
  True:                   w₀ = 5.0000, w₁ = 2.5000

  Max difference: 8.88e-16

For Gaussian models, fitting the joint and conditioning
gives the same result as fitting the conditional directly.
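The same equivalence holds with more than one predictor: the slope vector is $\boldsymbol{\Sigma}_{xx}^{-1}\boldsymbol{\Sigma}_{xy}$ and the intercept is $\mu_y - \mathbf{w}^\top\boldsymbol{\mu}_x$. A self-contained sketch on synthetic two-feature data (the names `X2`, `y2`, `w_gen` are illustrative, not the notebook's variables):

```python
import numpy as np

rng = np.random.default_rng(1)
N = 500
X2 = rng.normal(size=(N, 2))                                 # two predictors
y2 = 1.0 + X2 @ np.array([2.0, -0.5]) + rng.normal(0, 0.3, N)

# Generative route: joint-Gaussian sufficient statistics, then condition on x
mu_x, mu_y = X2.mean(axis=0), y2.mean()
Xc, yc = X2 - mu_x, y2 - mu_y
Sigma_xx = Xc.T @ Xc / N
Sigma_xy = Xc.T @ yc / N
w_gen = np.linalg.solve(Sigma_xx, Sigma_xy)                  # Σ_xx^{-1} Σ_xy
w0_gen = mu_y - w_gen @ mu_x

# Discriminative route: OLS with an explicit intercept column
A = np.column_stack([np.ones(N), X2])
w_ols2 = np.linalg.lstsq(A, y2, rcond=None)[0]

print(np.allclose([w0_gen, *w_gen], w_ols2))  # True
```

The two routes agree to machine precision because centering the data absorbs the intercept, leaving identical normal equations for the slopes.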
Summary¶
Key Takeaways¶
- Linear regression models $p(y \mid \mathbf{x}) = \mathcal{N}(y \mid \mathbf{w}^\top \mathbf{x}, \sigma^2)$ — expected output is linear in $\mathbf{x}$
- The OLS solution $\hat{\mathbf{w}} = (\mathbf{X}^\top\mathbf{X})^{-1}\mathbf{X}^\top\mathbf{y}$ minimizes the RSS; the minimum is unique when $\mathbf{X}$ has full column rank
- Geometrically, OLS projects $\mathbf{y}$ onto the column space of $\mathbf{X}$ — the residual is orthogonal to all columns
- For numerical stability, use QR or SVD rather than directly inverting $\mathbf{X}^\top\mathbf{X}$
- Polynomial features $\phi(x) = [1, x, x^2, \ldots]$ enable nonlinear fits while staying linear in parameters
- Partial regression coefficients isolate the effect of each predictor, controlling for all others
- Weighted least squares handles heteroskedastic noise by downweighting unreliable observations
- Online estimation updates $\hat{\mathbf{w}}$ incrementally as new data arrives
- Goodness of fit: residual plots reveal model misspecification; R² quantifies explained variance
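The stability point can be demonstrated directly: forming $\mathbf{X}^\top\mathbf{X}$ squares the condition number, while QR and SVD factorizations work with $\mathbf{X}$ itself. A minimal sketch on an ill-conditioned polynomial design (synthetic data, for illustration only):

```python
import numpy as np

rng = np.random.default_rng(2)
x = rng.uniform(0, 1, 50)
X = np.vander(x, 8, increasing=True)   # monomial features: ill-conditioned design
y = np.sin(2 * np.pi * x) + rng.normal(0, 0.1, 50)

# Normal equations: explicitly forms X^T X (condition number gets squared)
w_normal = np.linalg.solve(X.T @ X, X.T @ y)

# QR: X = QR, then solve the triangular system R w = Q^T y
Q, R = np.linalg.qr(X)
w_qr = np.linalg.solve(R, Q.T @ y)

# SVD (what np.linalg.lstsq uses internally)
w_svd = np.linalg.lstsq(X, y, rcond=None)[0]

print(f"cond(X)     = {np.linalg.cond(X):.1e}")
print(f"cond(X^T X) = {np.linalg.cond(X.T @ X):.1e}")  # ≈ cond(X)^2
print(f"QR vs SVD agree: {np.allclose(w_qr, w_svd, rtol=1e-4)}")
```

QR and SVD agree closely here, while the normal-equation solve inherits the squared condition number and can lose several digits of accuracy on higher-degree designs.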
Key Formulas from Chapter 11.2¶
| Concept | Formula |
|---|---|
| Model | $p(y \mid \mathbf{x}) = \mathcal{N}(y \mid w_0 + \mathbf{w}^\top\mathbf{x}, \sigma^2)$ |
| RSS | $\frac{1}{2}\|\mathbf{Xw} - \mathbf{y}\|_2^2$ |
| Normal equations | $\mathbf{X}^\top\mathbf{X}\hat{\mathbf{w}} = \mathbf{X}^\top\mathbf{y}$ |
| OLS solution | $\hat{\mathbf{w}} = (\mathbf{X}^\top\mathbf{X})^{-1}\mathbf{X}^\top\mathbf{y}$ |
| Projection | $\hat{\mathbf{y}} = \mathbf{X}(\mathbf{X}^\top\mathbf{X})^{-1}\mathbf{X}^\top\mathbf{y}$ |
| Simple regression | $\hat{w}_1 = C_{xy}/C_{xx} = \rho_{xy}\,\sigma_y/\sigma_x$ |
| WLS | $\hat{\mathbf{w}} = (\mathbf{X}^\top\boldsymbol{\Lambda}\mathbf{X})^{-1}\mathbf{X}^\top\boldsymbol{\Lambda}\mathbf{y}$ |
| Noise MLE | $\hat{\sigma}^2 = \frac{1}{N}\sum(y_n - \hat{y}_n)^2$ |
| R² | $1 - \text{RSS}/\text{TSS}$ |
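Several rows of this table can be checked numerically in a few lines, for instance that the projection matrix is idempotent and symmetric and that the residual is orthogonal to every column of $\mathbf{X}$ (a sketch on synthetic data):

```python
import numpy as np

rng = np.random.default_rng(3)
X = np.column_stack([np.ones(40), rng.normal(size=(40, 2))])  # intercept + 2 features
y = rng.normal(size=40)

P = X @ np.linalg.solve(X.T @ X, X.T)   # Proj(X) = X (X^T X)^{-1} X^T (Eq. 11.16)
y_hat = P @ y                            # fitted values = projection of y onto col(X)
r = y - y_hat                            # residual vector

print(np.allclose(P @ P, P))             # idempotent: projecting twice changes nothing
print(np.allclose(P, P.T))               # symmetric
print(np.allclose(X.T @ r, 0))           # residual ⟂ every column of X
print(np.isclose(np.trace(P), X.shape[1]))  # trace equals the number of parameters
```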