Cholesky Sampling from a Multivariate Normal¶
Section 7.6.3.1 presents a simple recipe for sampling from a multivariate Gaussian $\mathbf{y} \sim \mathcal{N}(\boldsymbol{\mu}, \boldsymbol{\Sigma})$:
- Compute the Cholesky decomposition $\boldsymbol{\Sigma} = \mathbf{L} \mathbf{L}^T$
- Sample $\mathbf{x} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ — just $d$ independent standard normals
- Set $\mathbf{y} = \mathbf{L} \mathbf{x} + \boldsymbol{\mu}$
This works because:
$$\text{Cov}[\mathbf{y}] = \mathbf{L}\, \text{Cov}[\mathbf{x}]\, \mathbf{L}^T = \mathbf{L}\, \mathbf{I}\, \mathbf{L}^T = \mathbf{L} \mathbf{L}^T = \boldsymbol{\Sigma}$$
The key insight: sampling from a complicated, correlated distribution reduces to sampling independent normals and multiplying by a triangular matrix.
Real-World Scenario: Clinical Trial Simulation¶
In biotech, before running expensive clinical trials, companies simulate virtual patient cohorts with realistic correlated biomarkers (e.g., blood glucose, insulin, HbA1c). Generating thousands of correlated samples efficiently and accurately is critical for power analysis and trial design.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
np.random.seed(42)
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11
1. The Recipe¶
We want to sample from $\mathcal{N}(\boldsymbol{\mu}, \boldsymbol{\Sigma})$. The Cholesky decomposition gives us a lower triangular matrix $\mathbf{L}$ such that $\boldsymbol{\Sigma} = \mathbf{L} \mathbf{L}^T$. Then the transformation:
$$\mathbf{y} = \mathbf{L} \mathbf{x} + \boldsymbol{\mu}, \quad \mathbf{x} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$$
produces samples with the correct distribution. The proof is a direct application of the $\mathbf{A} \mathbf{B} \mathbf{A}^T$ pattern (see the ABA^T notebook):
$$\text{Cov}[\mathbf{y}] = \text{Cov}[\mathbf{L}\mathbf{x}] = \mathbf{L}\, \text{Cov}[\mathbf{x}]\, \mathbf{L}^T = \mathbf{L} \mathbf{I} \mathbf{L}^T = \mathbf{L} \mathbf{L}^T = \boldsymbol{\Sigma}$$
Let's see this in action with a 2D example: fasting blood glucose and HbA1c.
# Two correlated biomarkers
# x1 = fasting blood glucose (mg/dL), x2 = HbA1c (%)
mu = np.array([100, 5.5])
# Build covariance from correlation: cov(x1,x2) = ρ * σ1 * σ2 = 0.85 * 15 * 0.6 = 7.65
Sigma = np.array([[225, 7.65], # std(glucose) = 15 mg/dL
[ 7.65, 0.36]]) # std(HbA1c) = 0.6 %, ρ = 0.85
# Step 1: Cholesky decomposition
L = np.linalg.cholesky(Sigma)
print("Covariance matrix Σ:")
print(Sigma)
print(f"\nCorrelation: ρ = {Sigma[0,1] / (np.sqrt(Sigma[0,0]) * np.sqrt(Sigma[1,1])):.2f}")
print(f"\nCholesky factor L (lower triangular):")
print(L.round(4))
print(f"\nVerify: L L^T = Σ? {np.allclose(L @ L.T, Sigma)}")
Covariance matrix Σ:
[[225.     7.65]
 [  7.65   0.36]]

Correlation: ρ = 0.85

Cholesky factor L (lower triangular):
[[15.      0.    ]
 [ 0.51    0.3161]]

Verify: L L^T = Σ? True
n_samples = 2000
# Step 2: sample independent standard normals
X_indep = np.random.randn(n_samples, 2)
# Step 3: transform
Y = (L @ X_indep.T).T + mu # y = Lx + μ
# Helper to draw confidence ellipse
def draw_ellipse(ax, mu, Sigma, n_std=2, **kwargs):
    evals, evecs = np.linalg.eigh(Sigma)
    angle = np.degrees(np.arctan2(evecs[1, 1], evecs[0, 1]))
    w = 2 * n_std * np.sqrt(evals[1])
    h = 2 * n_std * np.sqrt(evals[0])
    ell = Ellipse(mu, w, h, angle=angle, fill=False, **kwargs)
    ax.add_patch(ell)
# Visualize: before and after
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax = axes[0]
ax.scatter(X_indep[:, 0], X_indep[:, 1], s=3, alpha=0.3, c='gray')
draw_ellipse(ax, [0, 0], np.eye(2), color='black', lw=2, ls='--')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_title(r'Independent samples $\mathbf{x} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$', fontsize=12)
ax.set_aspect('equal')
ax.grid(True, alpha=0.3)
ax.set_xlim(-4, 4)
ax.set_ylim(-4, 4)
ax = axes[1]
ax.scatter(Y[:, 0], Y[:, 1], s=3, alpha=0.3, c='#2980b9')
draw_ellipse(ax, mu, Sigma, color='black', lw=2, ls='--')
ax.scatter(*mu, c='red', s=100, marker='+', zorder=5, linewidths=2)
ax.set_xlabel('Glucose (mg/dL)')
ax.set_ylabel('HbA1c (%)')
ax.set_title(r'After Cholesky transform $\mathbf{y} = \mathbf{L}\mathbf{x} + \boldsymbol{\mu}$', fontsize=12)
ax.grid(True, alpha=0.3)
plt.suptitle(r'Independent $\rightarrow$ Correlated via Cholesky', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
# Verify statistics
print(f"{'':>15} {'True':>12} {'Sample':>12}")
print(f"{'Mean glucose':>15} {mu[0]:>12.1f} {Y[:,0].mean():>12.1f}")
print(f"{'Mean HbA1c':>15} {mu[1]:>12.2f} {Y[:,1].mean():>12.2f}")
print(f"{'Var glucose':>15} {Sigma[0,0]:>12.1f} {np.var(Y[:,0]):>12.1f}")
print(f"{'Var HbA1c':>15} {Sigma[1,1]:>12.2f} {np.var(Y[:,1]):>12.2f}")
print(f"{'Cov':>15} {Sigma[0,1]:>12.2f} {np.cov(Y.T)[0,1]:>12.2f}")
True Sample
Mean glucose 100.0 100.2
Mean HbA1c 5.50 5.51
Var glucose 225.0 218.3
Var HbA1c 0.36 0.36
Cov 7.65 7.52
2. Why It Works: The Triangular Structure¶
The lower-triangular structure of $\mathbf{L}$ reveals how correlation is introduced:
$$\begin{pmatrix} y_1 \\ y_2 \end{pmatrix} = \begin{pmatrix} L_{11} & 0 \\ L_{21} & L_{22} \end{pmatrix} \begin{pmatrix} x_1 \\ x_2 \end{pmatrix} + \begin{pmatrix} \mu_1 \\ \mu_2 \end{pmatrix}$$
Expanding:
- $y_1 = L_{11}\, x_1 + \mu_1$ — depends only on $x_1$ (just rescaled)
- $y_2 = L_{21}\, x_1 + L_{22}\, x_2 + \mu_2$ — mixes $x_1$ into $y_2$, creating correlation with $y_1$
The correlation comes from the shared dependence on $x_1$. The off-diagonal entry $L_{21}$ controls how much of $x_1$ bleeds into $y_2$. In higher dimensions, each $y_i$ depends on $x_1, \ldots, x_i$ — building up the full correlation structure sequentially.
print(f"L = ")
print(L.round(3))
print(f"\ny₁ = {L[0,0]:.1f} · x₁ + {mu[0]:.1f}")
print(f"y₂ = {L[1,0]:.3f} · x₁ + {L[1,1]:.3f} · x₂ + {mu[1]:.1f}")
print(f"\nL₂₁ = {L[1,0]:.3f} controls the correlation: ")
print(f" each unit of x₁ shifts glucose by {L[0,0]:.1f} and HbA1c by {L[1,0]:.3f}")
n = 1500
X = np.random.randn(n, 2)
# Build up the transformation step by step
step0 = X # independent N(0,I)
step1 = np.column_stack([L[0,0]*X[:,0], # scale x₁ → glucose
L[1,0]*X[:,0] + L[1,1]*X[:,1]]) # mix → HbA1c
step2 = step1 + mu # shift to population mean
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
data = [step0, step1, step2]
titles = [
r'Independent $\mathbf{x} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$',
r'After $\mathbf{L}\mathbf{x}$: correlation introduced',
r'After $+ \boldsymbol{\mu}$: shifted to population mean'
]
xlabels = ['$x_1$', r'$L_{11}\, x_1$', 'Glucose (mg/dL)']
ylabels = ['$x_2$', r'$L_{21}\, x_1 + L_{22}\, x_2$', 'HbA1c (%)']
for ax, d, title, xl, yl in zip(axes, data, titles, xlabels, ylabels):
    ax.scatter(d[:, 0], d[:, 1], s=2, alpha=0.3, c='#2980b9')
    ax.set_title(title, fontsize=11)
    ax.set_xlabel(xl)
    ax.set_ylabel(yl)
    ax.grid(True, alpha=0.3)
    ax.set_aspect('auto')
    mean = d.mean(axis=0)
    ax.scatter(*mean, c='red', s=80, marker='+', zorder=5, linewidths=2)
plt.suptitle('Cholesky builds correlation step by step', fontsize=13, y=1.02)
plt.tight_layout()
plt.show()
L =
[[15.     0.   ]
 [ 0.51   0.316]]

y₁ = 15.0 · x₁ + 100.0
y₂ = 0.510 · x₁ + 0.316 · x₂ + 5.5

L₂₁ = 0.510 controls the correlation:
  each unit of x₁ shifts glucose by 15.0 and HbA1c by 0.510
3. Bonus: Built-In Positive Definiteness Check¶
A practical benefit of Cholesky: it succeeds only if the matrix is positive definite (note that `np.linalg.cholesky` reads only the lower triangle, so symmetry itself is assumed rather than verified). If the decomposition fails, you know immediately that your covariance matrix is invalid — e.g., due to numerical errors or incorrect construction.
# A matrix that looks like a covariance but isn't positive definite
# (can happen with rounding, subsetting, or manual construction)
bad_Sigma = np.array([[1.0, 0.9, 0.9],
[0.9, 1.0, 0.0],
[0.9, 0.0, 1.0]])
print("Suspicious matrix:")
print(bad_Sigma)
print(f"Symmetric? {np.allclose(bad_Sigma, bad_Sigma.T)}")
print(f"Diagonal all positive? {np.all(np.diag(bad_Sigma) > 0)}")
print(f"Looks like a valid correlation matrix...")
print(f"\nAttempting Cholesky...")
try:
    L_bad = np.linalg.cholesky(bad_Sigma)
    print("Cholesky succeeded")
except np.linalg.LinAlgError as e:
    print(f"FAILED: {e}")
    print("Cholesky immediately caught that this matrix is NOT positive definite.")
    evals = np.linalg.eigvalsh(bad_Sigma)
    print(f"Eigenvalues confirm: {evals.round(4)} (negative eigenvalue present)")
Suspicious matrix:
[[1.  0.9 0.9]
 [0.9 1.  0. ]
 [0.9 0.  1. ]]
Symmetric? True
Diagonal all positive? True
Looks like a valid correlation matrix...

Attempting Cholesky...
FAILED: Matrix is not positive definite
Cholesky immediately caught that this matrix is NOT positive definite.
Eigenvalues confirm: [-0.2728  1.      2.2728] (negative eigenvalue present)
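When a nearly-valid covariance fails this check (e.g., after rounding or subsetting), one common repair heuristic — not covered in this section, and the helper name `nearest_pd_clip` is our own — is to clip negative eigenvalues to a small positive floor and reassemble the matrix. A minimal sketch:

```python
import numpy as np

def nearest_pd_clip(A, eps=1e-6):
    """Heuristic repair: floor the spectrum of a symmetric matrix at eps.

    Returns a symmetric positive definite matrix that is only
    approximately equal to the input.
    """
    A_sym = (A + A.T) / 2                      # enforce symmetry first
    evals, evecs = np.linalg.eigh(A_sym)
    evals_clipped = np.clip(evals, eps, None)  # clip negative eigenvalues
    return evecs @ np.diag(evals_clipped) @ evecs.T

bad_Sigma = np.array([[1.0, 0.9, 0.9],
                      [0.9, 1.0, 0.0],
                      [0.9, 0.0, 1.0]])

fixed = nearest_pd_clip(bad_Sigma)
L_fixed = np.linalg.cholesky(fixed)        # now succeeds
print(np.linalg.eigvalsh(fixed).round(4))  # all eigenvalues positive
```

The repaired matrix is no longer the matrix you started from, so this is a last resort; the clean fix is to construct a valid covariance in the first place.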
4. Application: Clinical Trial Simulation¶
A biotech company is planning a Phase II trial for a diabetes drug. Before recruiting patients, they want to simulate 10,000 virtual patients with realistic, correlated biomarker profiles to power-analyze their study.
The 5 biomarkers are:
- Fasting blood glucose (mg/dL)
- HbA1c (%)
- Fasting insulin (μU/mL)
- BMI (kg/m²)
- Triglycerides (mg/dL)
These are correlated — e.g., higher glucose correlates with higher HbA1c and insulin.
# Population parameters for Type 2 diabetes patients (realistic but synthetic)
biomarker_names = ['Glucose\n(mg/dL)', 'HbA1c\n(%)', 'Insulin\n(μU/mL)', 'BMI\n(kg/m²)', 'Triglycerides\n(mg/dL)']
short_names = ['Glucose', 'HbA1c', 'Insulin', 'BMI', 'Triglycerides']
mu_patient = np.array([140, 7.5, 18, 31, 180]) # typical T2D population means
# Standard deviations
stds = np.array([30, 1.2, 8, 5, 60])
# Correlation matrix (based on typical clinical correlations)
corr = np.array([
[1.00, 0.85, 0.55, 0.30, 0.35], # Glucose
[0.85, 1.00, 0.50, 0.25, 0.30], # HbA1c
[0.55, 0.50, 1.00, 0.45, 0.40], # Insulin
[0.30, 0.25, 0.45, 1.00, 0.50], # BMI
[0.35, 0.30, 0.40, 0.50, 1.00], # Triglycerides
])
# Convert to covariance: Σ = diag(σ) @ R @ diag(σ)
Sigma_patient = np.diag(stds) @ corr @ np.diag(stds)
print("Population parameters for T2D patients")
print("=" * 50)
for name, m, s in zip(short_names, mu_patient, stds):
    print(f"  {name:<15} μ = {m:>6.1f}, σ = {s:>5.1f}")
print(f"\nCorrelation matrix:")
print(corr.round(2))
Population parameters for T2D patients
==================================================
  Glucose         μ =  140.0, σ =  30.0
  HbA1c           μ =    7.5, σ =   1.2
  Insulin         μ =   18.0, σ =   8.0
  BMI             μ =   31.0, σ =   5.0
  Triglycerides   μ =  180.0, σ =  60.0

Correlation matrix:
[[1.   0.85 0.55 0.3  0.35]
 [0.85 1.   0.5  0.25 0.3 ]
 [0.55 0.5  1.   0.45 0.4 ]
 [0.3  0.25 0.45 1.   0.5 ]
 [0.35 0.3  0.4  0.5  1.  ]]
# The Cholesky recipe from Section 7.6.3.1
L_patient = np.linalg.cholesky(Sigma_patient)
print("Cholesky factor L (lower triangular):")
print("")
header = f"{'':>14}" + "".join(f"{n:>14}" for n in short_names)
print(header)
for i, name in enumerate(short_names):
    row = f"{name:>14}" + "".join(f"{L_patient[i,j]:>14.3f}" for j in range(5))
    print(row)
print(f"\nBelow the diagonal: how each biomarker inherits correlation from earlier ones")
print(f"Diagonal: the 'own' standard deviation after accounting for correlations")
Cholesky factor L (lower triangular):
Glucose HbA1c Insulin BMI Triglycerides
Glucose 30.000 0.000 0.000 0.000 0.000
HbA1c 1.020 0.632 0.000 0.000 0.000
Insulin 4.400 0.494 6.663 0.000 0.000
BMI 1.500 -0.047 1.714 4.451 0.000
Triglycerides 21.000 0.285 14.927 20.878 50.002
Below the diagonal: how each biomarker inherits correlation from earlier ones
Diagonal: the 'own' standard deviation after accounting for correlations
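The "own standard deviation" reading can be checked directly: since $\boldsymbol{\Sigma} = \mathbf{L}\mathbf{L}^T$, each marginal variance $\Sigma_{ii}$ equals the squared norm of row $i$ of $\mathbf{L}$, so the row norms of $\mathbf{L}$ must recover the marginal standard deviations. A quick self-contained check using the 2-biomarker covariance from earlier:

```python
import numpy as np

# Covariance of (glucose, HbA1c) from the 2D example
Sigma = np.array([[225.0, 7.65],
                  [7.65, 0.36]])
L = np.linalg.cholesky(Sigma)

# Σ_ii = Σ_j L_ij², so row norms of L are the marginal stds
row_norms = np.sqrt((L ** 2).sum(axis=1))
print(row_norms)                # [15.   0.6] — the marginal stds
print(np.sqrt(np.diag(Sigma)))  # matches
```

The diagonal entry $L_{ii}$ is then the part of that norm not already explained by the earlier noise sources $x_1, \ldots, x_{i-1}$.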
# Generate virtual patient cohort
n_patients = 10_000
X_indep = np.random.randn(n_patients, 5)           # Step 2: sample independent standard normals
patients = (L_patient @ X_indep.T).T + mu_patient  # Step 3: transform y = Lx + μ
# Verify sample statistics
sample_mean = patients.mean(axis=0)
sample_cov = np.cov(patients.T)
sample_corr = np.corrcoef(patients.T)
print("Sample verification (10,000 virtual patients)")
print("=" * 55)
print(f"{'Biomarker':<15} {'True μ':>8} {'Sample μ':>10} {'True σ':>8} {'Sample σ':>10}")
print("-" * 55)
for i, name in enumerate(short_names):
    print(f"{name:<15} {mu_patient[i]:>8.1f} {sample_mean[i]:>10.1f} {stds[i]:>8.1f} {np.sqrt(sample_cov[i,i]):>10.1f}")
Sample verification (10,000 virtual patients)
=======================================================
Biomarker         True μ   Sample μ   True σ   Sample σ
-------------------------------------------------------
Glucose            140.0      140.0     30.0       30.5
HbA1c                7.5        7.5      1.2        1.2
Insulin             18.0       17.9      8.0        8.1
BMI                 31.0       31.0      5.0        5.0
Triglycerides      180.0      179.8     60.0       60.1
# Visualize the full correlation structure of the virtual cohort
fig, axes = plt.subplots(5, 5, figsize=(14, 14))
for i in range(5):
    for j in range(5):
        ax = axes[i, j]
        if i == j:
            ax.hist(patients[:, i], bins=40, color='#2980b9', alpha=0.7, density=True)
            ax.axvline(mu_patient[i], color='red', lw=2, ls='--')
        elif i > j:
            ax.scatter(patients[:, j], patients[:, i], s=1, alpha=0.05, c='#2980b9')
        else:
            ax.text(0.5, 0.5, f'r = {sample_corr[i,j]:.2f}\n(true: {corr[i,j]:.2f})',
                    ha='center', va='center', fontsize=10, transform=ax.transAxes,
                    fontweight='bold',
                    color='#27ae60' if abs(sample_corr[i,j] - corr[i,j]) < 0.03 else '#e74c3c')
            ax.set_xlim(0, 1); ax.set_ylim(0, 1)
        if j == 0:
            ax.set_ylabel(biomarker_names[i], fontsize=9)
        if i == 4:
            ax.set_xlabel(biomarker_names[j], fontsize=9)
        if i != 4:
            ax.set_xticklabels([])
        if j != 0:
            ax.set_yticklabels([])
plt.suptitle('Virtual Patient Cohort (10,000 patients)\nGenerated via Cholesky sampling', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()
print("Diagonal: marginal distributions (red line = true mean)")
print("Lower triangle: pairwise scatter plots showing correlations")
print("Upper triangle: sample vs true correlation coefficients")
Diagonal: marginal distributions (red line = true mean)
Lower triangle: pairwise scatter plots showing correlations
Upper triangle: sample vs true correlation coefficients
Key Takeaways¶
- **Sampling correlated normals is easy:** decompose $\boldsymbol{\Sigma} = \mathbf{L}\mathbf{L}^T$, sample $\mathbf{x} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$, return $\mathbf{y} = \mathbf{L}\mathbf{x} + \boldsymbol{\mu}$
- **The proof is the $\mathbf{A}\mathbf{B}\mathbf{A}^T$ pattern:** $\text{Cov}[\mathbf{L}\mathbf{x}] = \mathbf{L}\,\mathbf{I}\,\mathbf{L}^T = \boldsymbol{\Sigma}$
- **Triangular structure = sequential correlation:** $y_i$ depends on $x_1, \ldots, x_i$; the off-diagonal entries of $\mathbf{L}$ control how much earlier noise sources bleed into later dimensions
- **Built-in safety check:** Cholesky fails on non-positive-definite matrices, immediately flagging invalid covariance matrices
- **Practical and scalable:** the same three-line recipe works in any dimension, from 2 biomarkers to hundreds
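As a closing sketch, the recipe can be packaged as a small reusable helper (the function name `sample_mvn` and the use of the newer `np.random.default_rng` API are our own choices, not from the section):

```python
import numpy as np

def sample_mvn(mu, Sigma, n, rng=None):
    """Draw n samples from N(mu, Sigma) via the Cholesky recipe."""
    rng = np.random.default_rng(rng)
    L = np.linalg.cholesky(Sigma)          # Σ = L Lᵀ (fails fast if Σ is not PD)
    x = rng.standard_normal((n, len(mu)))  # n rows of independent N(0, I) draws
    return x @ L.T + mu                    # y = L x + μ, vectorized over rows

# Usage: the 2-biomarker example from above
mu = np.array([100.0, 5.5])
Sigma = np.array([[225.0, 7.65],
                  [7.65, 0.36]])
Y = sample_mvn(mu, Sigma, 50_000, rng=0)
print(np.cov(Y.T).round(2))  # close to Sigma
```

Note that `x @ L.T` applies $\mathbf{L}$ to each row at once, which is equivalent to the per-sample $\mathbf{L}\mathbf{x}$ used in the text.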