8.1 The Expectation-Maximization (EM) Algorithm¶
This notebook provides a step-by-step explanation of the EM algorithm, one of the most important optimization techniques for models with latent variables. We'll build intuition through a medical diagnosis example.
Real-World Scenario¶
A hospital wants to identify two types of a disease (Type A and Type B) based on patient biomarkers. However:
- The disease type is not directly observable (latent variable)
- Each type has different biomarker distributions
- We only observe the biomarkers, not the underlying disease type
This is a classic incomplete data problem where EM shines.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy.stats import norm, multivariate_normal
from matplotlib.patches import Ellipse
np.random.seed(42)
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams['font.family'] = 'DejaVu Sans'
1. The Problem: Maximum Likelihood with Latent Variables¶
Why Do We Need EM?¶
Consider data $\mathbf{Y} = \{\mathbf{y}_1, ..., \mathbf{y}_N\}$ generated by a model with:
- Observable variables: $\mathbf{y}_n$ (what we measure)
- Latent variables: $\mathbf{z}_n$ (hidden/unobserved)
- Parameters: $\boldsymbol{\theta}$ (what we want to estimate)
The log-likelihood we want to maximize is:
$$\ell(\boldsymbol{\theta}) = \log p(\mathbf{Y}|\boldsymbol{\theta}) = \sum_{n=1}^{N} \log p(\mathbf{y}_n|\boldsymbol{\theta})$$
The problem: each term involves marginalizing over the latent variable:
$$\log p(\mathbf{y}_n|\boldsymbol{\theta}) = \log \sum_{\mathbf{z}_n} p(\mathbf{y}_n, \mathbf{z}_n|\boldsymbol{\theta})$$
This log of a sum is difficult to optimize directly!
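To see the structure concretely, here is a minimal sketch (with made-up mixture parameters) that evaluates this marginal log-likelihood via `scipy.special.logsumexp`. Note how the sum over components sits inside the log, so the expression does not split into one term per component:

```python
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

# Made-up parameters for a 2-component Gaussian mixture (illustration only)
y = np.array([1.5, 4.0, 6.2])
log_w = np.log(np.array([0.4, 0.6]))   # log mixing weights
mus = np.array([2.0, 6.0])
sigmas = np.array([0.8, 1.2])

# log p(y_n) = log sum_k exp( log pi_k + log N(y_n | mu_k, sigma_k^2) )
log_joint = log_w + norm.logpdf(y[:, None], mus, sigmas)   # shape (N, K)
log_marginal = logsumexp(log_joint, axis=1)                # shape (N,)
print(log_marginal.sum())   # total log-likelihood
```

Working with `logsumexp` over log-densities is also the numerically stable way to evaluate mixture likelihoods when densities underflow.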
2. A Simple Example: 1D Gaussian Mixture¶
Let's start with the simplest case to build intuition: a mixture of two 1D Gaussians.
The Model¶
$$p(y|\boldsymbol{\theta}) = \pi \cdot \mathcal{N}(y|\mu_1, \sigma_1^2) + (1-\pi) \cdot \mathcal{N}(y|\mu_2, \sigma_2^2)$$
Parameters: $\boldsymbol{\theta} = (\pi, \mu_1, \sigma_1, \mu_2, \sigma_2)$
Latent variable: $z \in \{1, 2\}$ indicates which component generated the data.
# Generate data from a mixture of two Gaussians
# True parameters (unknown to our algorithm)
true_pi = 0.4 # 40% from component 1
true_mu1, true_sigma1 = 2.0, 0.8 # Component 1: low biomarker
true_mu2, true_sigma2 = 6.0, 1.2 # Component 2: high biomarker
# Generate samples
N = 200
z_true = np.random.binomial(1, 1 - true_pi, N) # 0 = comp 1, 1 = comp 2
y = np.where(z_true == 0,
             np.random.normal(true_mu1, true_sigma1, N),
             np.random.normal(true_mu2, true_sigma2, N))
print(f"Generated {N} patients")
print(f"True Type A (z=0): {(z_true == 0).sum()} patients")
print(f"True Type B (z=1): {(z_true == 1).sum()} patients")
Generated 200 patients
True Type A (z=0): 78 patients
True Type B (z=1): 122 patients
# Visualize the data and true model
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
# Left: What we observe (no labels)
axes[0].hist(y, bins=30, density=True, alpha=0.7, color='steelblue', edgecolor='white')
axes[0].set_xlabel('Biomarker Level', fontsize=12)
axes[0].set_ylabel('Density', fontsize=12)
axes[0].set_title('Observed Data (Disease Type Unknown)', fontsize=14)
# Right: True underlying structure
x_plot = np.linspace(-2, 10, 200)
pdf1 = true_pi * norm.pdf(x_plot, true_mu1, true_sigma1)
pdf2 = (1 - true_pi) * norm.pdf(x_plot, true_mu2, true_sigma2)
pdf_mixture = pdf1 + pdf2
axes[1].hist(y[z_true == 0], bins=15, density=True, alpha=0.5, color='#e74c3c', label='Type A')
axes[1].hist(y[z_true == 1], bins=15, density=True, alpha=0.5, color='#3498db', label='Type B')
axes[1].plot(x_plot, pdf1, 'r-', linewidth=2, label=f'Type A: N({true_mu1}, {true_sigma1}²)')
axes[1].plot(x_plot, pdf2, 'b-', linewidth=2, label=f'Type B: N({true_mu2}, {true_sigma2}²)')
axes[1].plot(x_plot, pdf_mixture, 'k--', linewidth=2, label='Mixture')
axes[1].set_xlabel('Biomarker Level', fontsize=12)
axes[1].set_ylabel('Density', fontsize=12)
axes[1].set_title('True Structure (Hidden)', fontsize=14)
axes[1].legend()
plt.tight_layout()
plt.show()
print("\nLeft: What the hospital observes - just biomarker values")
print("Right: The hidden truth - two disease types with different biomarker distributions")
Left: What the hospital observes - just biomarker values
Right: The hidden truth - two disease types with different biomarker distributions
3. The EM Algorithm: Core Idea¶
The Key Insight: Lower Bound Optimization¶
Instead of directly maximizing the difficult log-likelihood, EM constructs and maximizes a lower bound.
For any distribution $q(\mathbf{z})$ over latent variables:
$$\log p(\mathbf{y}|\boldsymbol{\theta}) \geq \underbrace{\mathbb{E}_{q(\mathbf{z})}\left[\log p(\mathbf{y}, \mathbf{z}|\boldsymbol{\theta})\right] - \mathbb{E}_{q(\mathbf{z})}\left[\log q(\mathbf{z})\right]}_{\text{Evidence Lower Bound (ELBO)}}$$
This is called the ELBO (Evidence Lower BOund), often written as $\mathcal{L}(q, \boldsymbol{\theta})$.
The Two Steps¶
EM alternates between:
- E-Step: Find the optimal $q(\mathbf{z})$ that makes the bound tight
- M-Step: Maximize the bound with respect to $\boldsymbol{\theta}$
4. E-Step: Computing Responsibilities¶
The Optimal $q(\mathbf{z})$¶
The bound becomes tight (equals the log-likelihood) when:
$$q(\mathbf{z}) = p(\mathbf{z}|\mathbf{y}, \boldsymbol{\theta})$$
This is the posterior distribution of the latent variables given the data and current parameters.
For Gaussian Mixtures: Responsibilities¶
The posterior probability that data point $y_n$ came from component $k$ is:
$$r_{nk} \triangleq p(z_n = k | y_n, \boldsymbol{\theta}) = \frac{\pi_k \, \mathcal{N}(y_n|\mu_k, \sigma_k^2)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(y_n|\mu_j, \sigma_j^2)}$$
These are called responsibilities - they tell us how "responsible" each component is for each data point.
def e_step(y, pi, mu1, sigma1, mu2, sigma2):
    """
    E-Step: Compute responsibilities.
    Returns r1: probability each point belongs to component 1
    """
    # Weighted likelihoods
    w1 = pi * norm.pdf(y, mu1, sigma1)
    w2 = (1 - pi) * norm.pdf(y, mu2, sigma2)
    # Normalize to get responsibilities
    r1 = w1 / (w1 + w2)
    r2 = 1 - r1
    return r1, r2
# Example: compute responsibilities with true parameters
r1_true, r2_true = e_step(y, true_pi, true_mu1, true_sigma1, true_mu2, true_sigma2)
print("Responsibilities for first 10 patients (using true parameters):")
print("Patient | y value | P(Type A) | P(Type B) | True Type")
print("-" * 60)
for i in range(10):
    true_type = 'A' if z_true[i] == 0 else 'B'
    print(f" {i+1:2d} | {y[i]:5.2f} | {r1_true[i]:5.1%} | {r2_true[i]:5.1%} | {true_type}")
Responsibilities for first 10 patients (using true parameters):
Patient | y value | P(Type A) | P(Type B) | True Type
------------------------------------------------------------
1 | 7.03 | 0.0% | 100.0% | B
2 | 2.19 | 99.3% | 0.7% | A
3 | 2.23 | 99.2% | 0.8% | A
4 | 4.80 | 0.4% | 99.6% | B
5 | 5.98 | 0.0% | 100.0% | B
6 | 5.65 | 0.0% | 100.0% | B
7 | 6.39 | 0.0% | 100.0% | B
8 | 2.53 | 98.2% | 1.8% | A
9 | 1.22 | 99.9% | 0.1% | A
10 | 2.63 | 97.4% | 2.6% | A
# Visualize responsibilities
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
# Sort by y value for cleaner visualization
sort_idx = np.argsort(y)
y_sorted = y[sort_idx]
r1_sorted = r1_true[sort_idx]
# Left: Responsibilities as function of y
axes[0].scatter(y_sorted, r1_sorted, c=r1_sorted, cmap='RdBu', s=30, alpha=0.7)
axes[0].axhline(0.5, color='gray', linestyle='--', alpha=0.5, label='Decision boundary')
axes[0].set_xlabel('Biomarker Level (y)', fontsize=12)
axes[0].set_ylabel('P(Type A | y)', fontsize=12)
axes[0].set_title('Responsibilities: Probability of Type A', fontsize=14)
axes[0].legend()
# Right: Soft vs hard assignment
colors = plt.cm.RdBu(r1_true)
axes[1].scatter(y, np.zeros_like(y), c=colors, s=50, alpha=0.8)
axes[1].set_xlabel('Biomarker Level', fontsize=12)
axes[1].set_yticks([])
axes[1].set_title('Soft Clustering (Red=Type A, Blue=Type B)', fontsize=14)
# Add colorbar
sm = plt.cm.ScalarMappable(cmap='RdBu', norm=plt.Normalize(0, 1))
sm.set_array([])
cbar = plt.colorbar(sm, ax=axes[1], orientation='horizontal', pad=0.2)
cbar.set_label('P(Type A)')
plt.tight_layout()
plt.show()
print("\nKey insight: Responsibilities give SOFT assignments.")
print("Patients with intermediate biomarker levels have uncertain type assignment.")
Key insight: Responsibilities give SOFT assignments.
Patients with intermediate biomarker levels have uncertain type assignment.
5. M-Step: Updating Parameters¶
Maximizing the Expected Complete-Data Log-Likelihood¶
In the M-step, we maximize:
$$Q(\boldsymbol{\theta}, \boldsymbol{\theta}^{(t)}) = \mathbb{E}_{p(\mathbf{Z}|\mathbf{Y}, \boldsymbol{\theta}^{(t)})}\left[\log p(\mathbf{Y}, \mathbf{Z}|\boldsymbol{\theta})\right]$$
For Gaussian mixtures, this leads to intuitive update formulas:
Update Formulas¶
Effective counts: $$N_k = \sum_{n=1}^{N} r_{nk}$$
Mixing weight: $$\pi_k^{\text{new}} = \frac{N_k}{N}$$
Mean (responsibility-weighted average): $$\mu_k^{\text{new}} = \frac{1}{N_k} \sum_{n=1}^{N} r_{nk} \, y_n$$
Variance (responsibility-weighted): $$\sigma_k^{2,\text{new}} = \frac{1}{N_k} \sum_{n=1}^{N} r_{nk} \, (y_n - \mu_k^{\text{new}})^2$$
These are just weighted versions of the standard ML estimates!
def m_step(y, r1, r2):
    """
    M-Step: Update parameters using responsibilities.
    """
    N = len(y)
    # Effective counts
    N1 = r1.sum()
    N2 = r2.sum()
    # Update mixing weight
    pi_new = N1 / N
    # Update means (weighted average)
    mu1_new = (r1 * y).sum() / N1
    mu2_new = (r2 * y).sum() / N2
    # Update standard deviations (weighted)
    sigma1_new = np.sqrt((r1 * (y - mu1_new)**2).sum() / N1)
    sigma2_new = np.sqrt((r2 * (y - mu2_new)**2).sum() / N2)
    return pi_new, mu1_new, sigma1_new, mu2_new, sigma2_new
# Demonstrate M-step
print("M-Step demonstration (using true responsibilities):")
print("\nEffective counts:")
print(f" N1 (Type A): {r1_true.sum():.1f}")
print(f" N2 (Type B): {r2_true.sum():.1f}")
pi_demo, mu1_demo, sig1_demo, mu2_demo, sig2_demo = m_step(y, r1_true, r2_true)
print(f"\nEstimated vs True parameters:")
print(f" pi: {pi_demo:.3f} (true: {true_pi})")
print(f" mu1: {mu1_demo:.3f} (true: {true_mu1})")
print(f" sigma1: {sig1_demo:.3f} (true: {true_sigma1})")
print(f" mu2: {mu2_demo:.3f} (true: {true_mu2})")
print(f" sigma2: {sig2_demo:.3f} (true: {true_sigma2})")
M-Step demonstration (using true responsibilities):

Effective counts:
 N1 (Type A): 79.5
 N2 (Type B): 120.5

Estimated vs True parameters:
 pi: 0.398 (true: 0.4)
 mu1: 2.139 (true: 2.0)
 sigma1: 0.718 (true: 0.8)
 mu2: 5.887 (true: 6.0)
 sigma2: 1.239 (true: 1.2)
6. The Complete EM Algorithm¶
Now let's put it all together and run EM from scratch, starting from a deliberately crude initial guess (percentile-based means, equal weights).
def compute_log_likelihood(y, pi, mu1, sigma1, mu2, sigma2):
    """Compute the log-likelihood of the data."""
    likelihood = pi * norm.pdf(y, mu1, sigma1) + (1 - pi) * norm.pdf(y, mu2, sigma2)
    return np.sum(np.log(likelihood + 1e-10))
def em_algorithm(y, max_iter=100, tol=1e-6, verbose=True):
    """
    Full EM algorithm for 1D Gaussian mixture with 2 components.
    """
    N = len(y)
    # Initialize parameters (poor initial guess)
    pi = 0.5
    mu1, mu2 = np.percentile(y, 25), np.percentile(y, 75)
    sigma1 = sigma2 = np.std(y)
    log_likelihoods = []
    history = []
    for iteration in range(max_iter):
        # Store history
        history.append({
            'pi': pi, 'mu1': mu1, 'sigma1': sigma1,
            'mu2': mu2, 'sigma2': sigma2
        })
        # E-Step
        r1, r2 = e_step(y, pi, mu1, sigma1, mu2, sigma2)
        # Compute log-likelihood
        ll = compute_log_likelihood(y, pi, mu1, sigma1, mu2, sigma2)
        log_likelihoods.append(ll)
        if verbose and iteration % 5 == 0:
            print(f"Iter {iteration:3d}: LL = {ll:.2f}, pi = {pi:.3f}, "
                  f"mu1 = {mu1:.2f}, mu2 = {mu2:.2f}")
        # Check convergence
        if iteration > 0 and abs(log_likelihoods[-1] - log_likelihoods[-2]) < tol:
            if verbose:
                print(f"\nConverged at iteration {iteration}")
            break
        # M-Step
        pi, mu1, sigma1, mu2, sigma2 = m_step(y, r1, r2)
    return pi, mu1, sigma1, mu2, sigma2, log_likelihoods, history, r1, r2
# Run EM
print("Running EM Algorithm...\n")
pi_em, mu1_em, sig1_em, mu2_em, sig2_em, lls, history, r1_final, r2_final = em_algorithm(y)
Running EM Algorithm...

Iter   0: LL = -446.14, pi = 0.500, mu1 = 2.43, mu2 = 6.10
Iter   5: LL = -413.06, pi = 0.465, mu1 = 2.50, mu2 = 6.04
Iter  10: LL = -404.42, pi = 0.403, mu1 = 2.16, mu2 = 5.90
Iter  15: LL = -403.80, pi = 0.384, mu1 = 2.10, mu2 = 5.83
Iter  20: LL = -403.79, pi = 0.381, mu1 = 2.09, mu2 = 5.82
Iter  25: LL = -403.79, pi = 0.380, mu1 = 2.09, mu2 = 5.81

Converged at iteration 29
# Compare estimated vs true parameters
print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)
print(f"\n{'Parameter':<12} {'Estimated':>12} {'True':>12} {'Error':>12}")
print("-" * 50)
# Note: Components might be swapped, check and swap if needed
if mu1_em > mu2_em:  # Swap if needed
    pi_em, mu1_em, sig1_em, mu2_em, sig2_em = 1-pi_em, mu2_em, sig2_em, mu1_em, sig1_em
    r1_final, r2_final = r2_final, r1_final
params = [
    ('pi', pi_em, true_pi),
    ('mu1', mu1_em, true_mu1),
    ('sigma1', sig1_em, true_sigma1),
    ('mu2', mu2_em, true_mu2),
    ('sigma2', sig2_em, true_sigma2)
]
for name, est, true in params:
    error = abs(est - true)
    print(f"{name:<12} {est:>12.3f} {true:>12.3f} {error:>12.3f}")
print("\nEM successfully recovered the true parameters!")
============================================================
FINAL RESULTS
============================================================

Parameter       Estimated         True        Error
--------------------------------------------------
pi                  0.380        0.400        0.020
mu1                 2.089        2.000        0.089
sigma1              0.678        0.800        0.122
mu2                 5.813        6.000        0.187
sigma2              1.302        1.200        0.102

EM successfully recovered the true parameters!
7. Visualizing EM Convergence¶
Let's watch how EM evolves through iterations.
# Plot convergence
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
# Log-likelihood
axes[0].plot(lls, 'b-', linewidth=2)
axes[0].set_xlabel('Iteration', fontsize=12)
axes[0].set_ylabel('Log-likelihood', fontsize=12)
axes[0].set_title('EM Convergence', fontsize=14)
axes[0].axhline(lls[-1], color='r', linestyle='--', alpha=0.5)
# Parameter evolution: means
mu1_hist = [h['mu1'] for h in history]
mu2_hist = [h['mu2'] for h in history]
axes[1].plot(mu1_hist, 'r-', linewidth=2, label='μ₁ (estimated)')
axes[1].plot(mu2_hist, 'b-', linewidth=2, label='μ₂ (estimated)')
axes[1].axhline(true_mu1, color='r', linestyle='--', alpha=0.5, label='μ₁ (true)')
axes[1].axhline(true_mu2, color='b', linestyle='--', alpha=0.5, label='μ₂ (true)')
axes[1].set_xlabel('Iteration', fontsize=12)
axes[1].set_ylabel('Mean', fontsize=12)
axes[1].set_title('Mean Evolution', fontsize=14)
axes[1].legend()
# Parameter evolution: mixing weight
pi_hist = [h['pi'] for h in history]
axes[2].plot(pi_hist, 'g-', linewidth=2, label='π (estimated)')
axes[2].axhline(true_pi, color='g', linestyle='--', alpha=0.5, label='π (true)')
axes[2].set_xlabel('Iteration', fontsize=12)
axes[2].set_ylabel('Mixing weight', fontsize=12)
axes[2].set_title('Mixing Weight Evolution', fontsize=14)
axes[2].legend()
axes[2].set_ylim(0, 1)
plt.tight_layout()
plt.show()
# Animate EM iterations
iterations_to_show = [0, 1, 3, 5, 10, len(history)-1]
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()
x_plot = np.linspace(-2, 10, 200)
for ax_idx, iter_idx in enumerate(iterations_to_show):
    ax = axes[ax_idx]
    h = history[iter_idx]
    # Plot data histogram
    ax.hist(y, bins=30, density=True, alpha=0.3, color='gray')
    # Plot current model
    pdf1 = h['pi'] * norm.pdf(x_plot, h['mu1'], h['sigma1'])
    pdf2 = (1 - h['pi']) * norm.pdf(x_plot, h['mu2'], h['sigma2'])
    ax.plot(x_plot, pdf1, 'r-', linewidth=2, label='Component 1')
    ax.plot(x_plot, pdf2, 'b-', linewidth=2, label='Component 2')
    ax.plot(x_plot, pdf1 + pdf2, 'k--', linewidth=2, label='Mixture')
    # Mark means
    ax.axvline(h['mu1'], color='r', linestyle=':', alpha=0.7)
    ax.axvline(h['mu2'], color='b', linestyle=':', alpha=0.7)
    ax.set_title(f'Iteration {iter_idx}', fontsize=12)
    ax.set_xlabel('Biomarker Level')
    if ax_idx % 3 == 0:
        ax.set_ylabel('Density')
    ax.set_ylim(0, 0.5)
    if ax_idx == 0:
        ax.legend(fontsize=9)
plt.suptitle('EM Algorithm: Fitting a Gaussian Mixture Model', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
8. Why Does EM Work? The ELBO Perspective¶
The Evidence Lower Bound (ELBO)¶
EM maximizes a lower bound on the log-likelihood. Starting from:
$$\log p(\mathbf{y}|\boldsymbol{\theta}) = \log \sum_{\mathbf{z}} p(\mathbf{y}, \mathbf{z}|\boldsymbol{\theta})$$
For any distribution $q(\mathbf{z})$, we can write:
$$\log p(\mathbf{y}|\boldsymbol{\theta}) = \underbrace{\mathbb{E}_{q}\left[\log \frac{p(\mathbf{y}, \mathbf{z}|\boldsymbol{\theta})}{q(\mathbf{z})}\right]}_{\text{ELBO: } \mathcal{L}(q, \boldsymbol{\theta})} + \underbrace{\text{KL}(q(\mathbf{z}) \| p(\mathbf{z}|\mathbf{y}, \boldsymbol{\theta}))}_{\geq 0}$$
Since KL divergence is non-negative, ELBO ≤ log-likelihood.
E-Step Makes the Bound Tight¶
When $q(\mathbf{z}) = p(\mathbf{z}|\mathbf{y}, \boldsymbol{\theta})$, the KL term becomes zero: $$\text{ELBO} = \log p(\mathbf{y}|\boldsymbol{\theta})$$
M-Step Increases the Bound¶
With $q$ fixed, we maximize over $\boldsymbol{\theta}$, which increases ELBO (and thus log-likelihood).
def compute_elbo(y, q_z1, pi, mu1, sigma1, mu2, sigma2):
    """
    Compute the ELBO for the 1D GMM.
    q_z1: probability that z=1 under q distribution
    """
    q_z2 = 1 - q_z1
    N = len(y)
    # Expected complete log-likelihood
    log_p_y_z1 = np.log(pi + 1e-10) + norm.logpdf(y, mu1, sigma1)
    log_p_y_z2 = np.log(1 - pi + 1e-10) + norm.logpdf(y, mu2, sigma2)
    E_log_p = np.sum(q_z1 * log_p_y_z1 + q_z2 * log_p_y_z2)
    # Entropy of q
    H_q = -np.sum(q_z1 * np.log(q_z1 + 1e-10) + q_z2 * np.log(q_z2 + 1e-10))
    return E_log_p + H_q
# Visualize ELBO vs log-likelihood
fig, ax = plt.subplots(figsize=(10, 5))
# Compute ELBO at each iteration
elbos = []
for i, h in enumerate(history):
    r1, _ = e_step(y, h['pi'], h['mu1'], h['sigma1'], h['mu2'], h['sigma2'])
    elbo = compute_elbo(y, r1, h['pi'], h['mu1'], h['sigma1'], h['mu2'], h['sigma2'])
    elbos.append(elbo)
ax.plot(lls, 'b-o', linewidth=2, markersize=6, label='Log-likelihood')
ax.plot(elbos, 'r--s', linewidth=2, markersize=6, label='ELBO')
ax.set_xlabel('Iteration', fontsize=12)
ax.set_ylabel('Value', fontsize=12)
ax.set_title('ELBO vs Log-Likelihood (They Match!)', fontsize=14)
ax.legend(fontsize=12)
plt.tight_layout()
plt.show()
print("Key observation: After each E-step, ELBO = Log-likelihood")
print("This happens because we set q(z) = p(z|y,θ), making KL = 0")
Key observation: After each E-step, ELBO = Log-likelihood
This happens because we set q(z) = p(z|y,θ), making KL = 0
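As a sanity check on the decomposition, here is a small self-contained sketch (toy data and arbitrary parameters of our own choosing) verifying that ELBO + KL equals the log-likelihood for any q, and that the KL term vanishes exactly when q is the posterior:

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
y = rng.normal(3.0, 1.0, 50)                     # toy data
pi, mu1, s1, mu2, s2 = 0.3, 2.0, 1.0, 5.0, 1.5   # arbitrary parameters

w1 = pi * norm.pdf(y, mu1, s1)                   # p(y, z=1)
w2 = (1 - pi) * norm.pdf(y, mu2, s2)             # p(y, z=2)
log_lik = np.log(w1 + w2).sum()                  # log p(y)
post = w1 / (w1 + w2)                            # posterior p(z=1 | y)

def elbo_and_kl(q):
    """ELBO for Bernoulli q over z, and KL = log p(y) - ELBO."""
    q = np.clip(q, 1e-12, 1 - 1e-12)
    e_log_p = q * np.log(w1) + (1 - q) * np.log(w2)   # E_q[log p(y, z)]
    entropy = -(q * np.log(q) + (1 - q) * np.log(1 - q))
    elbo = (e_log_p + entropy).sum()
    return elbo, log_lik - elbo

elbo_post, kl_post = elbo_and_kl(post)                  # q = posterior: KL ≈ 0
elbo_unif, kl_unif = elbo_and_kl(np.full_like(y, 0.5))  # other q: KL > 0
```

The uniform q gives a strictly looser bound here because the posterior is far from 0.5 for most points.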
9. Extension to 2D: Multivariate Gaussian Mixture¶
Let's extend our medical example to include two biomarkers.
# True parameters for 2D GMM
true_pi_2d = 0.4
true_mu1_2d = np.array([2.0, 3.0]) # Type A: low on both markers
true_mu2_2d = np.array([6.0, 7.0]) # Type B: high on both markers
true_Sigma1 = np.array([[1.0, 0.3], [0.3, 0.8]])
true_Sigma2 = np.array([[1.5, -0.4], [-0.4, 1.2]])
# Generate 2D data
N_2d = 300
z_true_2d = np.random.binomial(1, 1 - true_pi_2d, N_2d)
Y_2d = np.zeros((N_2d, 2))
Y_2d[z_true_2d == 0] = np.random.multivariate_normal(true_mu1_2d, true_Sigma1, (z_true_2d == 0).sum())
Y_2d[z_true_2d == 1] = np.random.multivariate_normal(true_mu2_2d, true_Sigma2, (z_true_2d == 1).sum())
print(f"Generated {N_2d} patients with 2 biomarkers")
Generated 300 patients with 2 biomarkers
def em_2d(Y, K=2, max_iter=100, tol=1e-6):
    """
    EM for 2D Gaussian Mixture Model.
    """
    N, D = Y.shape
    # Initialize
    pi = np.ones(K) / K
    mus = [Y[np.random.choice(N)] for _ in range(K)]
    Sigmas = [np.eye(D) for _ in range(K)]
    history_2d = []
    log_likelihoods_2d = []
    for iteration in range(max_iter):
        history_2d.append({'mus': [m.copy() for m in mus],
                           'Sigmas': [S.copy() for S in Sigmas],
                           'pi': pi.copy()})
        # E-Step
        weighted_likelihoods = np.zeros((N, K))
        for k in range(K):
            rv = multivariate_normal(mus[k], Sigmas[k])
            weighted_likelihoods[:, k] = pi[k] * rv.pdf(Y)
        total = weighted_likelihoods.sum(axis=1)
        # np.newaxis reshapes total from (N,) to (N,1) so it broadcasts across K columns
        R = weighted_likelihoods / total[:, np.newaxis]
        ll = np.sum(np.log(total + 1e-10))
        log_likelihoods_2d.append(ll)
        if iteration > 0 and abs(log_likelihoods_2d[-1] - log_likelihoods_2d[-2]) < tol:
            break
        # M-Step
        N_k = R.sum(axis=0)
        pi = N_k / N
        for k in range(K):
            # Note: R[:, k:k+1] (slice) keeps 2D shape (N,1), while R[:, k] (index)
            # would collapse to 1D (N,). The 2D shape is needed because:
            # - .T on (N,1) gives (1,N), so (1,N) @ (N,D) -> (1,D) for the mean
            # - .T on 1D (N,) is a no-op, so the matmul would give a scalar
            # .flatten() converts (1,D) -> (D,) so mus[k] stays 1D
            mus[k] = (R[:, k:k+1].T @ Y).flatten() / N_k[k]
            diff = Y - mus[k]
            # Same here: (N,1)*(N,D) broadcasts to (N,D), then (D,N)@(N,D) -> (D,D)
            Sigmas[k] = (R[:, k:k+1] * diff).T @ diff / N_k[k]
            Sigmas[k] += 1e-6 * np.eye(D)  # Regularization
    return pi, mus, Sigmas, R, log_likelihoods_2d, history_2d
# Run 2D EM
pi_2d, mus_2d, Sigmas_2d, R_2d, lls_2d, history_2d = em_2d(Y_2d)
print(f"Converged in {len(lls_2d)} iterations")
Converged in 18 iterations
def plot_ellipse(ax, mean, cov, color, n_std=2.0, **kwargs):
    """Plot covariance ellipse."""
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    order = eigenvalues.argsort()[::-1]
    eigenvalues, eigenvectors = eigenvalues[order], eigenvectors[:, order]
    angle = np.degrees(np.arctan2(*eigenvectors[:, 0][::-1]))
    width, height = 2 * n_std * np.sqrt(eigenvalues)
    ellipse = Ellipse(xy=mean, width=width, height=height, angle=angle,
                      facecolor='none', edgecolor=color, linewidth=2, **kwargs)
    ax.add_patch(ellipse)
# Visualize 2D EM evolution
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
iterations_to_show_2d = [0, 2, 5, 10, 20, len(history_2d)-1]
for ax_idx, iter_idx in enumerate(iterations_to_show_2d):
    ax = axes.flatten()[ax_idx]
    h = history_2d[min(iter_idx, len(history_2d)-1)]
    # Compute responsibilities for coloring
    weighted = np.zeros((N_2d, 2))
    for k in range(2):
        rv = multivariate_normal(h['mus'][k], h['Sigmas'][k])
        weighted[:, k] = h['pi'][k] * rv.pdf(Y_2d)
    R_iter = weighted / weighted.sum(axis=1, keepdims=True)
    # Color by responsibility
    colors_plot = plt.cm.RdBu(R_iter[:, 0])
    ax.scatter(Y_2d[:, 0], Y_2d[:, 1], c=colors_plot, s=20, alpha=0.6)
    # Plot ellipses and means
    for k, color in enumerate(['red', 'blue']):
        plot_ellipse(ax, h['mus'][k], h['Sigmas'][k], color)
        ax.scatter(*h['mus'][k], c=color, s=150, marker='*', edgecolors='black', zorder=10)
    ax.set_xlim(-1, 10)
    ax.set_ylim(0, 11)
    ax.set_title(f'Iteration {iter_idx}', fontsize=12)
    ax.set_xlabel('Biomarker 1')
    ax.set_ylabel('Biomarker 2')
plt.suptitle('2D EM Algorithm: Evolution of Cluster Discovery', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
10. Key Properties of EM¶
Guaranteed Properties¶
Monotonic Increase: Log-likelihood never decreases $$\ell(\boldsymbol{\theta}^{(t+1)}) \geq \ell(\boldsymbol{\theta}^{(t)})$$
Convergence: EM converges to a stationary point (local maximum or saddle point)
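The monotonicity guarantee is easy to verify empirically. The following self-contained sketch reruns the two-component E- and M-step updates from Sections 4-5 on fresh synthetic data (from a deliberately poor start) and checks that the log-likelihood trace never decreases:

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(1)
y = np.concatenate([rng.normal(2.0, 0.8, 80), rng.normal(6.0, 1.2, 120)])

pi, mu1, s1, mu2, s2 = 0.5, 1.0, 1.0, 8.0, 1.0   # deliberately poor start
lls = []
for _ in range(50):
    # E-step: responsibilities for component 1
    w1 = pi * norm.pdf(y, mu1, s1)
    w2 = (1 - pi) * norm.pdf(y, mu2, s2)
    lls.append(np.log(w1 + w2).sum())
    r1 = w1 / (w1 + w2)
    # M-step: responsibility-weighted ML updates
    N1, N2 = r1.sum(), (1 - r1).sum()
    pi = N1 / len(y)
    mu1, mu2 = (r1 * y).sum() / N1, ((1 - r1) * y).sum() / N2
    s1 = np.sqrt((r1 * (y - mu1) ** 2).sum() / N1)
    s2 = np.sqrt(((1 - r1) * (y - mu2) ** 2).sum() / N2)

assert np.all(np.diff(lls) >= -1e-8)   # log-likelihood never decreases
```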
Limitations¶
- Local Optima: May converge to local (not global) maximum
- Initialization Sensitive: Different starting points may give different solutions
- Slow Convergence: Can be slow near the optimum
Practical Tips¶
- Multiple Restarts: Run EM several times with different initializations
- Good Initialization: Use k-means or heuristics to initialize
- Regularization: Add small values to covariance diagonals for stability
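For the initialization tip, a cheap 1D k-means pass (Lloyd's algorithm) can supply starting weights, means, and spreads for EM. This is a sketch; the helper name `kmeans_init_1d` and its return convention are our own choices:

```python
import numpy as np

def kmeans_init_1d(y, K=2, n_iter=20, seed=0):
    """Cheap 1D k-means (Lloyd's algorithm) to seed EM with
    mixing weights, means, and standard deviations."""
    rng = np.random.default_rng(seed)
    centers = rng.choice(y, size=K, replace=False).astype(float)
    for _ in range(n_iter):
        # Assign each point to its nearest center, then recompute centers
        labels = np.argmin(np.abs(y[:, None] - centers[None, :]), axis=1)
        for k in range(K):
            if np.any(labels == k):
                centers[k] = y[labels == k].mean()
    pi = np.array([(labels == k).mean() for k in range(K)])
    sigmas = np.array([y[labels == k].std() if np.any(labels == k) else y.std()
                       for k in range(K)])
    return pi, centers, sigmas

# Toy usage on bimodal data
rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(2.0, 0.8, 80), rng.normal(6.0, 1.2, 120)])
pi0, mu0, sig0 = kmeans_init_1d(data)
```

Seeding EM from these hard-assignment estimates typically puts the starting point in the basin of a good optimum, so fewer restarts are needed.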
# Demonstrate sensitivity to initialization
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
np.random.seed(None) # Different runs
final_lls = []
for run in range(3):
    pi_run, mus_run, Sigmas_run, R_run, lls_run, _ = em_2d(Y_2d)
    final_lls.append(lls_run[-1])
    ax = axes[run]
    labels_run = np.argmax(R_run, axis=1)
    for k in range(2):
        mask = labels_run == k
        ax.scatter(Y_2d[mask, 0], Y_2d[mask, 1], alpha=0.5, s=20)
        plot_ellipse(ax, mus_run[k], Sigmas_run[k], f'C{k}')
        ax.scatter(*mus_run[k], s=150, marker='*', c=f'C{k}', edgecolors='black')
    ax.set_title(f'Run {run+1}: Final LL = {lls_run[-1]:.1f}', fontsize=12)
    ax.set_xlabel('Biomarker 1')
    ax.set_ylabel('Biomarker 2')
    ax.set_xlim(-1, 10)
    ax.set_ylim(0, 11)
plt.suptitle('EM with Different Random Initializations', fontsize=14)
plt.tight_layout()
plt.show()
print(f"\nFinal log-likelihoods: {final_lls}")
print(f"Best run: {np.argmax(final_lls) + 1}")
print("\nPractice: Run multiple times and keep the solution with highest likelihood!")
Final log-likelihoods: [np.float64(-1063.2227539097805), np.float64(-1063.2227538843836), np.float64(-1063.2227539068115)]
Best run: 2

Practice: Run multiple times and keep the solution with highest likelihood!
11. Summary: The EM Algorithm¶
Algorithm Overview¶
Initialize parameters θ⁽⁰⁾
Repeat until convergence:
E-Step: Compute q(z) = p(z|y, θ⁽ᵗ⁾)
M-Step: θ⁽ᵗ⁺¹⁾ = argmax_θ E_q[log p(y,z|θ)]
Key Formulas for Gaussian Mixtures¶
| Step | Formula |
|---|---|
| E-Step | $r_{nk} = \frac{\pi_k \mathcal{N}(y_n\|\mu_k, \Sigma_k)}{\sum_j \pi_j \mathcal{N}(y_n\|\mu_j, \Sigma_j)}$ |
| M-Step: weights | $\pi_k = \frac{1}{N}\sum_n r_{nk}$ |
| M-Step: means | $\mu_k = \frac{\sum_n r_{nk} y_n}{\sum_n r_{nk}}$ |
| M-Step: covariances | $\Sigma_k = \frac{\sum_n r_{nk}(y_n-\mu_k)(y_n-\mu_k)^T}{\sum_n r_{nk}}$ |
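The table's formulas translate almost line-for-line into vectorized NumPy. Here is a sketch of one EM iteration for a 1D, K-component mixture; the function name and shape conventions are our own choices:

```python
import numpy as np
from scipy.stats import norm

def em_step_1d(y, pi, mus, sigmas):
    """One EM iteration for a 1D, K-component Gaussian mixture.
    y: (N,), pi/mus/sigmas: (K,)."""
    # E-step: r_{nk} proportional to pi_k * N(y_n | mu_k, sigma_k^2)
    W = pi * norm.pdf(y[:, None], mus, sigmas)   # (N, K)
    R = W / W.sum(axis=1, keepdims=True)
    # M-step: responsibility-weighted ML estimates
    Nk = R.sum(axis=0)                           # effective counts
    pi = Nk / len(y)
    mus = (R * y[:, None]).sum(axis=0) / Nk
    sigmas = np.sqrt((R * (y[:, None] - mus) ** 2).sum(axis=0) / Nk)
    return pi, mus, sigmas

# Toy usage: iterate to convergence on bimodal data
rng = np.random.default_rng(0)
y = np.concatenate([rng.normal(0.0, 1.0, 200), rng.normal(8.0, 1.0, 200)])
pi, mus, sigmas = np.array([0.5, 0.5]), np.array([1.0, 6.0]), np.array([2.0, 2.0])
for _ in range(100):
    pi, mus, sigmas = em_step_1d(y, pi, mus, sigmas)
```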
When to Use EM¶
- Mixture models (GMM, mixture of experts)
- Hidden Markov Models (Baum-Welch algorithm)
- Factor analysis and PCA with missing data
- Latent variable models in general
Key Takeaways¶
- EM handles incomplete data elegantly by iterating between inferring latent variables and updating parameters
- Each iteration is guaranteed to increase (or maintain) the likelihood
- The algorithm provides both parameter estimates and soft assignments (responsibilities)
- Use multiple random restarts to avoid local optima