import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats
from scipy.stats import multivariate_normal
import seaborn as sns

# Seed NumPy's global random generator so the synthetic samples drawn with
# np.random.multivariate_normal further down are reproducible between runs.
np.random.seed(42)
# Apply one style sheet and one font family to every figure in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams['font.family'] = 'DejaVu Sans'
# IPython magic: renders figures inline in a notebook. It is not Python syntax,
# so this line must be removed before running the code as a plain script.
%matplotlib inline

# Parameters of the joint Gaussian over (y1, y2) for the EGFR biomarker
# y1 = mRNA expression (log2 TPM), y2 = protein concentration (log2 intensity)

# Mean vector: typical EGFR expression across tumor samples
mu = np.array([7.5, 5.2])  # [log2 TPM, log2 intensity]

# mRNA-protein correlation (realistic range: 0.4-0.8 for well-expressed genes)
rho = 0.78

# Spread of each variable across patient samples
sigma_mrna = 1.8     # std of mRNA expression (log2 TPM)
sigma_protein = 1.4  # std of protein concentration (log2 intensity)

# Covariance matrix: variances on the diagonal, shared covariance off the diagonal
cross_cov = rho * sigma_mrna * sigma_protein
Sigma = np.array([
    [sigma_mrna**2, cross_cov],
    [cross_cov, sigma_protein**2],
])

# Emit the whole parameter summary in one call, one item per line
print(
    "Joint Distribution Parameters (EGFR biomarker):",
    f"Mean μ = {mu}",
    "\nCovariance matrix Σ:",
    Sigma,
    f"\nCorrelation coefficient ρ = {rho}",
    sep="\n",
)

Joint Distribution Parameters (EGFR biomarker):
Mean μ = [7.5 5.2]

Covariance matrix Σ:
[[3.24   1.9656]
 [1.9656 1.96  ]]

Correlation coefficient ρ = 0.78

# Draw 100 synthetic tumor samples from the joint Gaussian
n_samples = 100
data = np.random.multivariate_normal(mu, Sigma, n_samples)
# Column 0 holds mRNA values, column 1 holds protein values
mrna_data, protein_data = data[:, 0], data[:, 1]

# Preview the first five samples as a two-column table
table_header = f"{'mRNA (log2 TPM)':>16} | {'Protein (log2 int)':>18}"
print("Sample tumor biopsies (first 5):")
print(table_header)
print("-" * 40)
for mrna_value, protein_value in zip(mrna_data[:5], protein_data[:5]):
    print(f"{mrna_value:>16.2f} | {protein_value:>18.2f}")

Sample tumor biopsies (first 5):
 mRNA (log2 TPM) | Protein (log2 int)
----------------------------------------
            6.69 |               4.49
            5.71 |               5.28
            8.01 |               5.36
            4.41 |               3.65
            8.09 |               6.12

# One figure with three side-by-side panels: raw samples, density contours,
# and a 3D view of the joint density.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Plot 1: Scatter plot with data
ax1 = axes[0]
ax1.scatter(mrna_data, protein_data, alpha=0.6, edgecolors='black', linewidth=0.5)
ax1.set_xlabel('mRNA Expression (log2 TPM)', fontsize=12)
ax1.set_ylabel('Protein Concentration (log2 intensity)', fontsize=12)
ax1.set_title('EGFR: mRNA vs Protein', fontsize=14)

# Plot 2: Joint distribution contours
ax2 = axes[1]
# Evaluate the joint density on a 100 x 100 grid
# (mRNA axis from 2 to 13, protein axis from 0 to 10).
x_grid = np.linspace(2, 13, 100)
y_grid = np.linspace(0, 10, 100)
X, Y = np.meshgrid(x_grid, y_grid)
# Stack the coordinates along a trailing axis so every grid point is an
# (mRNA, protein) pair that the frozen distribution can evaluate.
pos = np.dstack((X, Y))
rv = multivariate_normal(mu, Sigma)
Z = rv.pdf(pos)

# Contours of the density with the sampled points overlaid in red
ax2.contour(X, Y, Z, levels=10, cmap='viridis')
ax2.scatter(mrna_data, protein_data, alpha=0.4, s=20, color='red')
ax2.set_xlabel('mRNA Expression (log2 TPM)', fontsize=12)
ax2.set_ylabel('Protein Concentration (log2 intensity)', fontsize=12)
ax2.set_title('Joint Distribution Contours', fontsize=14)

# Plot 3: 3D surface
# plt.subplots created a 2D axes in the third slot; a separate 3D axes is
# added at the same grid position to hold the surface.
ax3 = fig.add_subplot(1, 3, 3, projection='3d')
ax3.plot_surface(X, Y, Z, cmap='viridis', alpha=0.8)
ax3.set_xlabel('mRNA (log2 TPM)')
ax3.set_ylabel('Protein (log2 int)')
ax3.set_zlabel('Density')
ax3.set_title('Joint PDF Surface', fontsize=14)

# Drop the unused 2D placeholder in the third slot so only the 3D axes remains.
axes[2].remove()

plt.tight_layout()
plt.show()

def get_marginal(mu, Sigma, idx):
    """
    Return the mean and standard deviation of a single marginal.

    For a multivariate normal the marginal of one component is read
    directly off the joint parameters (Eq. 3.27):
    p(y_i) = N(y_i | μ_i, Σ_ii)

    Parameters:
    -----------
    mu : array, mean vector
    Sigma : array, covariance matrix
    idx : int, position of the variable of interest (0 or 1 in the 2D case)

    Returns:
    --------
    mean, std : marginal mean and standard deviation (square root of Σ_ii)
    """
    variance = Sigma[idx, idx]
    return mu[idx], np.sqrt(variance)


def get_conditional(mu, Sigma, y2, idx_1, idx_2):
    """
    Get conditional distribution p(y_1 | y_2 = y2).

    From Eq. 3.28:
    μ_{1|2} = μ_1 + Σ_12 * Σ_22^{-1} * (y_2 - μ_2)
    Σ_{1|2} = Σ_11 - Σ_12 * Σ_22^{-1} * Σ_21

    Only scalar (single-index) blocks are handled, so Σ_22^{-1} is a plain
    division by the observed variable's variance.

    Parameters:
    -----------
    mu : array, mean vector
    Sigma : array, covariance matrix
    y2 : float, observed value of variable 2
    idx_1 : int, index of variable to predict
    idx_2 : int, index of observed variable

    Returns:
    --------
    mu_cond, sigma_cond : conditional mean and std

    Raises:
    -------
    ValueError : if the observed variable has non-positive variance, or if
        the conditional variance is negative beyond rounding error (the
        covariance matrix is then not positive semi-definite).
    """
    mu_1 = mu[idx_1]
    mu_2 = mu[idx_2]
    Sigma_11 = Sigma[idx_1, idx_1]
    Sigma_12 = Sigma[idx_1, idx_2]
    Sigma_21 = Sigma[idx_2, idx_1]
    Sigma_22 = Sigma[idx_2, idx_2]

    # A zero (or negative) observed variance would make the division below
    # produce inf/NaN; fail loudly instead of returning garbage.
    if Sigma_22 <= 0:
        raise ValueError(
            f"Variance of the observed variable (index {idx_2}) must be "
            f"positive, got {Sigma_22}"
        )

    # Conditional mean (Eq. 3.28)
    mu_cond = mu_1 + Sigma_12 / Sigma_22 * (y2 - mu_2)

    # Conditional variance (Eq. 3.28)
    Sigma_cond = Sigma_11 - Sigma_12 * Sigma_21 / Sigma_22
    if Sigma_cond < 0:
        # With (near-)perfect correlation the subtraction can land a hair
        # below zero through floating-point rounding; treat that as exactly
        # zero. Anything larger means Sigma is not a valid covariance matrix.
        rounding_tolerance = 1e-9 * abs(Sigma_11)
        if -Sigma_cond > rounding_tolerance:
            raise ValueError(
                "Conditional variance is negative "
                f"({Sigma_cond}); Sigma is not positive semi-definite"
            )
        Sigma_cond = 0.0
    sigma_cond = np.sqrt(Sigma_cond)

    return mu_cond, sigma_cond

# Scenario: a tumor sample with high EGFR mRNA expression
observed_mrna = 10.0  # log2 TPM

# p(protein | mRNA = 10.0): predict index 1 (protein) from index 0 (mRNA)
mu_protein_given_mrna, sigma_protein_given_mrna = get_conditional(
    mu, Sigma, y2=observed_mrna, idx_1=1, idx_2=0
)

# 95% interval: conditional mean +/- 1.96 conditional standard deviations
ci_low = mu_protein_given_mrna - 1.96 * sigma_protein_given_mrna
ci_high = mu_protein_given_mrna + 1.96 * sigma_protein_given_mrna

# Print the full report in one call, one entry per line
print(
    "=" * 60,
    "PROTEIN PREDICTION GIVEN mRNA EXPRESSION",
    "=" * 60,
    f"\nObserved: EGFR mRNA = {observed_mrna:.1f} log2 TPM",
    f"\nConditional Distribution p(Protein | mRNA = {observed_mrna}):",
    f"  Expected Protein: {mu_protein_given_mrna:.2f} log2 intensity",
    f"  Std Deviation:    {sigma_protein_given_mrna:.2f} log2 intensity",
    "\n95% Credible Interval:",
    f"  [{ci_low:.2f}, {ci_high:.2f}] log2 intensity",
    sep="\n",
)

============================================================
PROTEIN PREDICTION GIVEN mRNA EXPRESSION
============================================================

Observed: EGFR mRNA = 10.0 log2 TPM

Conditional Distribution p(Protein | mRNA = 10.0):
  Expected Protein: 6.72 log2 intensity
  Std Deviation:    0.88 log2 intensity

95% Credible Interval:
  [5.00, 8.43] log2 intensity

# Visualize the conditional distribution
# Two panels: where the conditioning happens on the joint density (left) and
# the resulting 1D conditional density (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Show the "slice" through the joint distribution
# Reuses the density grid X, Y, Z computed for the joint-distribution figure.
ax1 = axes[0]
ax1.contour(X, Y, Z, levels=10, cmap='viridis', alpha=0.7)
ax1.scatter(mrna_data, protein_data, alpha=0.3, s=20, color='gray')
# Vertical line marks the observed mRNA value being conditioned on.
ax1.axvline(x=observed_mrna, color='red', linewidth=2, linestyle='--',
            label=f'Observed mRNA = {observed_mrna:.1f}')
# Horizontal line marks the conditional mean of protein at that mRNA value.
ax1.axhline(y=mu_protein_given_mrna, color='blue', linewidth=2, linestyle='--',
            label=f'Predicted Protein = {mu_protein_given_mrna:.2f}')
ax1.set_xlabel('mRNA Expression (log2 TPM)', fontsize=12)
ax1.set_ylabel('Protein Concentration (log2 intensity)', fontsize=12)
ax1.set_title('Joint Distribution with Conditioning Line', fontsize=14)
ax1.legend()

# Right: The conditional distribution p(protein | mRNA)
# A univariate normal with the conditional mean and std, evaluated on [0, 10].
ax2 = axes[1]
protein_range = np.linspace(0, 10, 200)
conditional_pdf = stats.norm.pdf(protein_range, mu_protein_given_mrna, sigma_protein_given_mrna)
ax2.plot(protein_range, conditional_pdf, 'b-', linewidth=2, label='p(Protein | mRNA)')
ax2.fill_between(protein_range, conditional_pdf, alpha=0.3)
ax2.axvline(x=mu_protein_given_mrna, color='red', linewidth=2, linestyle='--',
            label=f'E[Protein|mRNA] = {mu_protein_given_mrna:.2f}')

# Show 95% CI
# ci_low / ci_high come from the protein-prediction step (mean +/- 1.96 std).
ax2.axvspan(ci_low, ci_high, alpha=0.2, color='green', label='95% CI')

ax2.set_xlabel('Protein Concentration (log2 intensity)', fontsize=12)
ax2.set_ylabel('Density', fontsize=12)
ax2.set_title(f'Conditional: p(Protein | mRNA = {observed_mrna})', fontsize=14)
ax2.legend()

plt.tight_layout()
plt.show()

# Reverse scenario: the protein level is measured at 7.0 log2 intensity
observed_protein = 7.0

# p(mRNA | protein = 7.0): predict index 0 (mRNA) from index 1 (protein)
mu_mrna_given_protein, sigma_mrna_given_protein = get_conditional(
    mu, Sigma, y2=observed_protein, idx_1=0, idx_2=1
)

# 95% interval: conditional mean +/- 1.96 conditional standard deviations
ci_low_mrna = mu_mrna_given_protein - 1.96 * sigma_mrna_given_protein
ci_high_mrna = mu_mrna_given_protein + 1.96 * sigma_mrna_given_protein

# Assemble the report line by line, then print it in one go
mrna_report = [
    "=" * 60,
    "mRNA PREDICTION GIVEN PROTEIN LEVEL",
    "=" * 60,
    f"\nObserved: EGFR protein = {observed_protein:.1f} log2 intensity",
    f"\nConditional Distribution p(mRNA | Protein = {observed_protein}):",
    f"  Expected mRNA: {mu_mrna_given_protein:.2f} log2 TPM",
    f"  Std Deviation: {sigma_mrna_given_protein:.2f} log2 TPM",
    "\n95% Credible Interval:",
    f"  [{ci_low_mrna:.2f}, {ci_high_mrna:.2f}] log2 TPM",
]
print("\n".join(mrna_report))

============================================================
mRNA PREDICTION GIVEN PROTEIN LEVEL
============================================================

Observed: EGFR protein = 7.0 log2 intensity

Conditional Distribution p(mRNA | Protein = 7.0):
  Expected mRNA: 9.31 log2 TPM
  Std Deviation: 1.13 log2 TPM

95% Credible Interval:
  [7.10, 11.51] log2 TPM

# Marginal (unconditioned) mean and std of each variable
mu_mrna_marginal, sigma_mrna_marginal = get_marginal(mu, Sigma, idx=0)
mu_protein_marginal, sigma_protein_marginal = get_marginal(mu, Sigma, idx=1)

# Fraction of the marginal variance removed by conditioning on the other variable
variance_reduction_protein = 1 - (sigma_protein_given_mrna**2 / sigma_protein_marginal**2)
variance_reduction_mrna = 1 - (sigma_mrna_given_protein**2 / sigma_mrna_marginal**2)

# Build the side-by-side comparison and print it as a single block of text
comparison_lines = [
    "=" * 60,
    "MARGINAL VS CONDITIONAL: UNCERTAINTY REDUCTION",
    "=" * 60,
    "\n--- PROTEIN DISTRIBUTION ---",
    "Marginal p(Protein):",
    f"  Mean: {mu_protein_marginal:.2f}, Std: {sigma_protein_marginal:.2f} log2 intensity",
    f"\nConditional p(Protein | mRNA = {observed_mrna}):",
    f"  Mean: {mu_protein_given_mrna:.2f}, Std: {sigma_protein_given_mrna:.2f} log2 intensity",
    f"\n  Variance reduction: {variance_reduction_protein * 100:.1f}%",
    "\n--- mRNA DISTRIBUTION ---",
    "Marginal p(mRNA):",
    f"  Mean: {mu_mrna_marginal:.2f}, Std: {sigma_mrna_marginal:.2f} log2 TPM",
    f"\nConditional p(mRNA | Protein = {observed_protein}):",
    f"  Mean: {mu_mrna_given_protein:.2f}, Std: {sigma_mrna_given_protein:.2f} log2 TPM",
    f"\n  Variance reduction: {variance_reduction_mrna * 100:.1f}%",
]
print("\n".join(comparison_lines))

============================================================
MARGINAL VS CONDITIONAL: UNCERTAINTY REDUCTION
============================================================

--- PROTEIN DISTRIBUTION ---
Marginal p(Protein):
  Mean: 5.20, Std: 1.40 log2 intensity

Conditional p(Protein | mRNA = 10.0):
  Mean: 6.72, Std: 0.88 log2 intensity

  Variance reduction: 60.8%

--- mRNA DISTRIBUTION ---
Marginal p(mRNA):
  Mean: 7.50, Std: 1.80 log2 TPM

Conditional p(mRNA | Protein = 7.0):
  Mean: 9.31, Std: 1.13 log2 TPM

  Variance reduction: 60.8%

# Visualize marginal vs conditional
# One panel per variable; each overlays the marginal density (blue) and the
# conditional density given the other variable's observed value (red).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Protein comparison
ax1 = axes[0]
protein_range = np.linspace(0, 10, 200)
# Both curves are univariate normals evaluated on the same protein grid.
marginal_pdf = stats.norm.pdf(protein_range, mu_protein_marginal, sigma_protein_marginal)
conditional_pdf = stats.norm.pdf(protein_range, mu_protein_given_mrna, sigma_protein_given_mrna)

ax1.plot(protein_range, marginal_pdf, 'b-', linewidth=2, label='Marginal p(Protein)')
ax1.plot(protein_range, conditional_pdf, 'r-', linewidth=2,
         label=f'Conditional p(Protein|mRNA={observed_mrna})')
ax1.fill_between(protein_range, marginal_pdf, alpha=0.2, color='blue')
ax1.fill_between(protein_range, conditional_pdf, alpha=0.2, color='red')
ax1.set_xlabel('Protein Concentration (log2 intensity)', fontsize=12)
ax1.set_ylabel('Density', fontsize=12)
ax1.set_title('Marginal vs Conditional: Protein', fontsize=14)
ax1.legend()

# mRNA comparison
ax2 = axes[1]
# The mRNA grid spans 1 to 14, wider than the protein grid.
mrna_range = np.linspace(1, 14, 200)
marginal_pdf_mrna = stats.norm.pdf(mrna_range, mu_mrna_marginal, sigma_mrna_marginal)
conditional_pdf_mrna = stats.norm.pdf(mrna_range, mu_mrna_given_protein, sigma_mrna_given_protein)

ax2.plot(mrna_range, marginal_pdf_mrna, 'b-', linewidth=2, label='Marginal p(mRNA)')
ax2.plot(mrna_range, conditional_pdf_mrna, 'r-', linewidth=2,
         label=f'Conditional p(mRNA|Protein={observed_protein})')
ax2.fill_between(mrna_range, marginal_pdf_mrna, alpha=0.2, color='blue')
ax2.fill_between(mrna_range, conditional_pdf_mrna, alpha=0.2, color='red')
ax2.set_xlabel('mRNA Expression (log2 TPM)', fontsize=12)
ax2.set_ylabel('Density', fontsize=12)
ax2.set_title('Marginal vs Conditional: mRNA', fontsize=14)
ax2.legend()

plt.tight_layout()
plt.show()

# Verify using the simplified 2D formula
def conditional_2d_formula(mu_1, mu_2, sigma_1, sigma_2, rho, y2):
    """
    Conditional of a bivariate normal in correlation form (Eq. 3.31).

    p(y_1|y_2) = N(y_1 | mu_1 + rho*(sigma_1/sigma_2)*(y_2 - mu_2), sigma_1^2*(1-rho^2))

    Returns the conditional mean and the conditional standard deviation
    (square root of the variance above).
    """
    # Regression slope of y_1 on y_2
    slope = rho * (sigma_1 / sigma_2)
    shift = slope * (y2 - mu_2)

    # Share of y_1's variance that remains after observing y_2
    unexplained_fraction = 1 - rho**2
    conditional_variance = sigma_1**2 * unexplained_fraction

    return mu_1 + shift, np.sqrt(conditional_variance)

# Cross-check: recompute p(protein | mRNA) with the correlation form (Eq. 3.31)
# Argument order: target (protein) parameters first, observed (mRNA) second
mu_cond_verify, sigma_cond_verify = conditional_2d_formula(
    mu[1], mu[0], sigma_protein, sigma_mrna, rho, observed_mrna
)

# Print both results next to each other so they can be compared by eye
verification_lines = [
    "=" * 60,
    "VERIFICATION: Using Eq. 3.31 (correlation form)",
    "=" * 60,
    f"\nFrom matrix formula: mu = {mu_protein_given_mrna:.4f}, sigma = {sigma_protein_given_mrna:.4f}",
    f"From Eq. 3.31:       mu = {mu_cond_verify:.4f}, sigma = {sigma_cond_verify:.4f}",
    f"\nVariance reduction factor: 1 - rho^2 = 1 - {rho}^2 = {1 - rho**2:.4f}",
    f"This means {rho**2*100:.1f}% of protein variance is explained by knowing mRNA!",
]
print("\n".join(verification_lines))

============================================================
VERIFICATION: Using Eq. 3.31 (correlation form)
============================================================

From matrix formula: mu = 6.7167, sigma = 0.8761
From Eq. 3.31:       mu = 6.7167, sigma = 0.8761

Variance reduction factor: 1 - rho^2 = 1 - 0.78^2 = 0.3916
This means 60.8% of protein variance is explained by knowing mRNA!

# Explore different correlation strengths
# In biology, mRNA-protein correlation varies by gene:
# ~0.4 for poorly correlated genes, ~0.8+ for well-correlated ones
# The list includes 0.0 (no information from mRNA) and the 0.78 used above.
correlations = [0.0, 0.3, 0.6, 0.78, 0.95]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: Effect on conditional distribution
ax1 = axes[0]
protein_range = np.linspace(0, 10, 200)
# One evenly spaced viridis colour per correlation value
colors = plt.cm.viridis(np.linspace(0, 1, len(correlations)))

for rho_i, color in zip(correlations, colors):
    # Only rho changes between curves; means, stds and the observed mRNA value
    # stay fixed. With rho = 0 the result equals the protein marginal.
    mu_cond_i, sigma_cond_i = conditional_2d_formula(
        mu_1=mu[1], mu_2=mu[0],
        sigma_1=sigma_protein, sigma_2=sigma_mrna,
        rho=rho_i,
        y2=observed_mrna
    )
    pdf_i = stats.norm.pdf(protein_range, mu_cond_i, sigma_cond_i)
    ax1.plot(protein_range, pdf_i, color=color, linewidth=2,
             label=f'rho = {rho_i}')

ax1.set_xlabel('Protein Concentration (log2 intensity)', fontsize=12)
ax1.set_ylabel('Density', fontsize=12)
ax1.set_title(f'Effect of Correlation on p(Protein | mRNA = {observed_mrna})', fontsize=14)
ax1.legend()

# Right: Variance reduction vs correlation
ax2 = axes[1]
# The curve is drawn for rho from 0 up to 0.99.
rho_range = np.linspace(0, 0.99, 100)
# Variance reduction expressed as a percentage: rho^2 * 100
variance_reduction = rho_range**2 * 100

ax2.plot(rho_range, variance_reduction, 'b-', linewidth=2)
ax2.fill_between(rho_range, variance_reduction, alpha=0.3)

# Mark our actual correlation
# zorder=5 keeps the marker drawn above the curve and its fill.
ax2.scatter([rho], [rho**2 * 100], color='red', s=100, zorder=5,
            label=f'EGFR: rho={rho}, reduction={rho**2*100:.1f}%')

ax2.set_xlabel('Correlation coefficient rho', fontsize=12)
ax2.set_ylabel('Variance reduction (%)', fontsize=12)
ax2.set_title('Variance Reduction = rho^2 (Eq. 3.32)', fontsize=14)
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

def _conditional_summary(observed_value, target_idx, observed_idx):
    """Summarise p(target | observed = observed_value) as mean, std and 95% interval.

    Reads the module-level ``mu`` and ``Sigma`` at call time and delegates the
    conditioning itself to ``get_conditional``.
    """
    mu_pred, sigma_pred = get_conditional(
        mu, Sigma, y2=observed_value, idx_1=target_idx, idx_2=observed_idx
    )
    return {
        'expected': mu_pred,
        'std': sigma_pred,
        # 95% interval: mean +/- 1.96 standard deviations
        'ci_95_low': mu_pred - 1.96 * sigma_pred,
        'ci_95_high': mu_pred + 1.96 * sigma_pred,
    }


def predict_protein(mrna_value):
    """Predict protein concentration given mRNA expression.

    Parameters:
    -----------
    mrna_value : float, observed mRNA expression (index 0 of the joint)

    Returns:
    --------
    dict with keys 'expected', 'std', 'ci_95_low', 'ci_95_high' describing
    the conditional distribution of protein (index 1).
    """
    return _conditional_summary(mrna_value, target_idx=1, observed_idx=0)


def predict_mrna(protein_value):
    """Predict mRNA expression given protein concentration.

    Parameters:
    -----------
    protein_value : float, observed protein concentration (index 1 of the joint)

    Returns:
    --------
    dict with keys 'expected', 'std', 'ci_95_low', 'ci_95_high' describing
    the conditional distribution of mRNA (index 0).
    """
    return _conditional_summary(protein_value, target_idx=0, observed_idx=1)

# Demonstrate both prediction directions on a handful of example inputs
print("=" * 60)
print("EGFR BIOMARKER PREDICTION TOOL")
print("=" * 60)

# Forward direction: mRNA observed, protein predicted
test_mrna = [5.0, 7.5, 10.0, 12.0]
print("\nGiven mRNA Expression -> Predict Protein Level:")
print("-" * 55)
for mrna_level in test_mrna:
    prediction = predict_protein(mrna_level)
    interval = f"(95% CI: [{prediction['ci_95_low']:.2f}, {prediction['ci_95_high']:.2f}])"
    print(f"  mRNA = {mrna_level:>5.1f} log2 TPM -> Protein = {prediction['expected']:.2f} " + interval)

# Reverse direction: protein observed, mRNA predicted
test_protein = [3.0, 5.0, 7.0, 8.5]
print("\nGiven Protein Level -> Predict mRNA Expression:")
print("-" * 55)
for protein_level in test_protein:
    prediction = predict_mrna(protein_level)
    interval = f"(95% CI: [{prediction['ci_95_low']:.2f}, {prediction['ci_95_high']:.2f}])"
    print(f"  Protein = {protein_level:>4.1f} log2 int -> mRNA = {prediction['expected']:.2f} " + interval)

============================================================
EGFR BIOMARKER PREDICTION TOOL
============================================================

Given mRNA Expression -> Predict Protein Level:
-------------------------------------------------------
  mRNA =   5.0 log2 TPM -> Protein = 3.68 (95% CI: [1.97, 5.40])
  mRNA =   7.5 log2 TPM -> Protein = 5.20 (95% CI: [3.48, 6.92])
  mRNA =  10.0 log2 TPM -> Protein = 6.72 (95% CI: [5.00, 8.43])
  mRNA =  12.0 log2 TPM -> Protein = 7.93 (95% CI: [6.21, 9.65])

Given Protein Level -> Predict mRNA Expression:
-------------------------------------------------------
  Protein =  3.0 log2 int -> mRNA = 5.29 (95% CI: [3.09, 7.50])
  Protein =  5.0 log2 int -> mRNA = 7.30 (95% CI: [5.09, 9.51])
  Protein =  7.0 log2 int -> mRNA = 9.31 (95% CI: [7.10, 11.51])
  Protein =  8.5 log2 int -> mRNA = 10.81 (95% CI: [8.60, 13.02])

Marginals and Conditionals of a Multivariate Normal Distribution

Real-World Scenario: Predicting Protein Levels from Gene Expression

Key Formulas from PML 3.2.3

Marginals (Eq. 3.27)

Conditionals (Eq. 3.28)

Step 1: Generate Synthetic Biomarker Data

Step 2: Visualize the Joint Distribution

Step 3: Implement Marginal and Conditional Functions

Step 4: Predict Protein Level Given mRNA Expression

Step 5: Reverse Problem - Predict mRNA Given Protein Level

Step 6: Comparing Marginal vs Conditional Distributions

Step 7: The 2D Case Formula (Eq. 3.31-3.32)

Step 8: Effect of Correlation Strength

Step 9: Biomarker Prediction Tool

Summary

Key Takeaways from PML 3.2.3:

Biotech Application: mRNA-Protein Prediction