import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)
plt.rcParams['figure.figsize'] = [10, 6]

# Naive approach: random matrix
n = 5  # 5 genes
random_matrix = np.random.randn(n, n)

print("Random matrix:")
print(random_matrix.round(2))
print(f"\nIs symmetric? {np.allclose(random_matrix, random_matrix.T)}")

# Check eigenvalues
eigenvalues = np.linalg.eigvalsh(random_matrix)
print(f"Eigenvalues: {eigenvalues.round(2)}")
print(f"Has negative eigenvalues? {any(eigenvalues < 0)}")
print("\nThis is NOT a valid covariance matrix!")

Random matrix:
[[ 0.5  -0.14  0.65  1.52 -0.23]
 [-0.23  1.58  0.77 -0.47  0.54]
 [-0.46 -0.47  0.24 -1.91 -1.72]
 [-0.56 -1.01  0.31 -0.91 -1.41]
 [ 1.47 -0.23  0.07 -1.42 -0.54]]

Is symmetric? False
Eigenvalues: [-2.48 -1.02  0.15  1.89  2.33]
Has negative eigenvalues? True

This is NOT a valid covariance matrix!

# Let's build intuition with a tiny example
# 4 genes, 2 regulatory pathways

genes = ['TP53', 'BRCA1', 'IL6', 'TNF']
pathways = ['p53 Pathway', 'NF-κB Pathway']

# Pathway loadings: how much each gene responds to each pathway
# Rows = genes, Columns = pathways
B = np.array([
    [0.8, 0.0],   # TP53: high p53 response, no NF-κB response
    [0.7, 0.0],   # BRCA1: high p53 response, no NF-κB response
    [0.1, 0.9],   # IL6: low p53, high NF-κB response
    [0.1, 0.8],   # TNF: low p53, high NF-κB response
])

print("Pathway Loading Matrix B:")
print("=" * 40)
print(f"{'Gene':<12} {'p53 Pathway':>12} {'NF-κB':>12}")
print("-" * 40)
for i, gene in enumerate(genes):
    print(f"{gene:<12} {B[i, 0]:>12.1f} {B[i, 1]:>12.1f}")

Pathway Loading Matrix B:
========================================
Gene          p53 Pathway        NF-κB
----------------------------------------
TP53                  0.8          0.0
BRCA1                 0.7          0.0
IL6                   0.1          0.9
TNF                   0.1          0.8

# Visualize pathway loadings
fig, ax = plt.subplots(figsize=(8, 5))
im = ax.imshow(B, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)

ax.set_xticks(range(len(pathways)))
ax.set_xticklabels(pathways, fontsize=12)
ax.set_yticks(range(len(genes)))
ax.set_yticklabels(genes, fontsize=12)
ax.set_title('Pathway Loading Matrix B\n(How strongly each gene responds to each pathway)', fontsize=14)

# Add values on cells
for i in range(len(genes)):
    for j in range(len(pathways)):
        ax.text(j, i, f'{B[i, j]:.1f}', ha='center', va='center', fontsize=14, fontweight='bold')

plt.colorbar(im, label='Loading')
plt.tight_layout()
plt.show()

print("\nInterpretation:")
print("   TP53 & BRCA1 have HIGH p53 pathway loading (0.8, 0.7)")
print("   IL6 & TNF have HIGH NF-κB pathway loading (0.9, 0.8)")
print("   Tumor suppressors have ZERO NF-κB loading (different regulatory program)")

Interpretation:
   TP53 & BRCA1 have HIGH p53 pathway loading (0.8, 0.7)
   IL6 & TNF have HIGH NF-κB pathway loading (0.9, 0.8)
   Tumor suppressors have ZERO NF-κB loading (different regulatory program)

# Pathway covariance: assume pathways are independent with unit variance
K = 2  # number of pathways
F = np.eye(K)

print("Pathway Covariance Matrix F:")
print(F)
print("\nInterpretation:")
print("   Each pathway has variance = 1 (diagonal)")
print("   Pathways are uncorrelated (off-diagonal = 0)")
print("   p53 and NF-κB pathways operate independently")

Pathway Covariance Matrix F:
[[1. 0.]
 [0. 1.]]

Interpretation:
   Each pathway has variance = 1 (diagonal)
   Pathways are uncorrelated (off-diagonal = 0)
   p53 and NF-κB pathways operate independently

# Compute systematic covariance
systematic_cov = B @ F @ B.T

print("Systematic Covariance (B @ F @ B^T):")
print("=" * 50)
print(f"{'':12}", end='')
for gene in genes:
    print(f"{gene:>10}", end='')
print()
print("-" * 50)
for i, gene in enumerate(genes):
    print(f"{gene:<12}", end='')
    for j in range(len(genes)):
        print(f"{systematic_cov[i, j]:>10.2f}", end='')
    print()

Systematic Covariance (B @ F @ B^T):
==================================================
                  TP53     BRCA1       IL6       TNF
--------------------------------------------------
TP53              0.64      0.56      0.08      0.08
BRCA1             0.56      0.49      0.07      0.07
IL6               0.08      0.07      0.82      0.73
TNF               0.08      0.07      0.73      0.65

# Visualize systematic covariance
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(systematic_cov, cmap='coolwarm', vmin=-0.2, vmax=1)

ax.set_xticks(range(len(genes)))
ax.set_xticklabels(genes, fontsize=11, rotation=45, ha='right')
ax.set_yticks(range(len(genes)))
ax.set_yticklabels(genes, fontsize=11)
ax.set_title('Systematic Covariance: B @ F @ B$^T$\n(Co-expression from shared pathway regulation)', fontsize=14)

for i in range(len(genes)):
    for j in range(len(genes)):
        ax.text(j, i, f'{systematic_cov[i, j]:.2f}', ha='center', va='center', fontsize=12, fontweight='bold')

plt.colorbar(im, label='Covariance')
plt.tight_layout()
plt.show()

print("\nKey Observations:")
print(f"   TP53–BRCA1 covariance: {systematic_cov[0,1]:.2f} (both regulated by p53)")
print(f"   IL6–TNF covariance: {systematic_cov[2,3]:.2f} (both regulated by NF-κB)")
print(f"   TP53–IL6 covariance: {systematic_cov[0,2]:.2f} (different pathways -> low)")

Key Observations:
   TP53–BRCA1 covariance: 0.56 (both regulated by p53)
   IL6–TNF covariance: 0.73 (both regulated by NF-κB)
   TP53–IL6 covariance: 0.08 (different pathways -> low)

# Gene-specific noise variances
idio_variances = np.array([0.04, 0.03, 0.05, 0.06])
D = np.diag(idio_variances)

print("Gene-Specific Noise Variance Matrix D:")
print("=" * 50)
print(f"{'':12}", end='')
for gene in genes:
    print(f"{gene:>10}", end='')
print()
print("-" * 50)
for i, gene in enumerate(genes):
    print(f"{gene:<12}", end='')
    for j in range(len(genes)):
        print(f"{D[i, j]:>10.2f}", end='')
    print()

print("\nInterpretation:")
print("   Diagonal = each gene's unique expression variance")
print("   Off-diagonal = 0 (gene-specific noise is independent)")
print("   TNF (0.06) has more intrinsic noise than BRCA1 (0.03)")

Gene-Specific Noise Variance Matrix D:
==================================================
                  TP53     BRCA1       IL6       TNF
--------------------------------------------------
TP53              0.04      0.00      0.00      0.00
BRCA1             0.00      0.03      0.00      0.00
IL6               0.00      0.00      0.05      0.00
TNF               0.00      0.00      0.00      0.06

Interpretation:
   Diagonal = each gene's unique expression variance
   Off-diagonal = 0 (gene-specific noise is independent)
   TNF (0.06) has more intrinsic noise than BRCA1 (0.03)

# Total covariance matrix
Sigma = systematic_cov + D

print("Total Covariance Matrix Sigma = B @ F @ B^T + D:")
print("=" * 50)
print(f"{'':12}", end='')
for gene in genes:
    print(f"{gene:>10}", end='')
print()
print("-" * 50)
for i, gene in enumerate(genes):
    print(f"{gene:<12}", end='')
    for j in range(len(genes)):
        print(f"{Sigma[i, j]:>10.2f}", end='')
    print()

Total Covariance Matrix Sigma = B @ F @ B^T + D:
==================================================
                  TP53     BRCA1       IL6       TNF
--------------------------------------------------
TP53              0.68      0.56      0.08      0.08
BRCA1             0.56      0.52      0.07      0.07
IL6               0.08      0.07      0.87      0.73
TNF               0.08      0.07      0.73      0.71

# Visualize the decomposition
fig, axes = plt.subplots(1, 4, figsize=(18, 4))

# Plot B @ F @ B^T
im0 = axes[0].imshow(systematic_cov, cmap='Blues', vmin=0, vmax=0.8)
axes[0].set_title('Systematic Covariance\n$B F B^T$', fontsize=12)

# Plus sign
axes[1].text(0.5, 0.5, '+', fontsize=50, ha='center', va='center')
axes[1].axis('off')

# Plot D
im2 = axes[2].imshow(D, cmap='Oranges', vmin=0, vmax=0.1)
axes[2].set_title('Gene-Specific Noise\n$D$', fontsize=12)

# Equals sign  
axes[3].text(-0.3, 0.5, '=', fontsize=50, ha='center', va='center', transform=axes[3].transAxes)

# Plot Sigma
im3 = axes[3].imshow(Sigma, cmap='Purples', vmin=0, vmax=0.8)
axes[3].set_title('Total Covariance\n$\Sigma$', fontsize=12)

for ax in [axes[0], axes[2], axes[3]]:
    ax.set_xticks(range(4))
    ax.set_xticklabels(genes, fontsize=9)
    ax.set_yticks(range(4))
    ax.set_yticklabels(genes, fontsize=9)

plt.tight_layout()
plt.show()

<>:21: SyntaxWarning: invalid escape sequence '\S'
<>:21: SyntaxWarning: invalid escape sequence '\S'
/var/folders/34/4mb6rzb52l76jcqm_pjx3fph0000gn/T/ipykernel_99891/3498595891.py:21: SyntaxWarning: invalid escape sequence '\S'
  axes[3].set_title('Total Covariance\n$\Sigma$', fontsize=12)

# Check properties
print("Validation Checks:")
print("=" * 40)

# 1. Symmetric?
is_symmetric = np.allclose(Sigma, Sigma.T)
print(f"1. Is symmetric (Σ = Σ^T)?  {'✅ YES' if is_symmetric else '❌ NO'}")

# 2. Positive semi-definite? (all eigenvalues >= 0)
eigenvalues = np.linalg.eigvalsh(Sigma)
is_psd = all(eigenvalues >= -1e-10)  # Small tolerance for numerical errors
print(f"2. Is positive semi-definite? {'✅ YES' if is_psd else '❌ NO'}")
print(f"   Eigenvalues: {eigenvalues.round(4)}")

# 3. Diagonal elements positive? (variances must be positive)
diag_positive = all(np.diag(Sigma) > 0)
print(f"3. Diagonal (variances) > 0? {'✅ YES' if diag_positive else '❌ NO'}")

print("\n🎉 This is a VALID covariance matrix!")

Validation Checks:
========================================
1. Is symmetric (Σ = Σ^T)?  ✅ YES
2. Is positive semi-definite? ✅ YES
   Eigenvalues: [0.0343 0.0556 1.1112 1.5789]
3. Diagonal (variances) > 0? ✅ YES

🎉 This is a VALID covariance matrix!

# Convert to correlation matrix
std_devs = np.sqrt(np.diag(Sigma))
correlation = Sigma / np.outer(std_devs, std_devs)

print("Correlation Matrix:")
print("=" * 50)
print(f"{'':12}", end='')
for gene in genes:
    print(f"{gene:>10}", end='')
print()
print("-" * 50)
for i, gene in enumerate(genes):
    print(f"{gene:<12}", end='')
    for j in range(len(genes)):
        print(f"{correlation[i, j]:>10.2f}", end='')
    print()

Correlation Matrix:
==================================================
                  TP53     BRCA1       IL6       TNF
--------------------------------------------------
TP53              1.00      0.94      0.10      0.12
BRCA1             0.94      1.00      0.10      0.12
IL6               0.10      0.10      1.00      0.93
TNF               0.12      0.12      0.93      1.00

# Visualize correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))

im = ax.imshow(correlation, cmap='RdBu_r', vmin=-1, vmax=1)
ax.set_xticks(range(len(genes)))
ax.set_xticklabels(genes, fontsize=12, rotation=45, ha='right')
ax.set_yticks(range(len(genes)))
ax.set_yticklabels(genes, fontsize=12)
ax.set_title('Co-Expression Correlation Matrix\n(Derived from Factor Model)', fontsize=14)

for i in range(len(genes)):
    for j in range(len(genes)):
        color = 'white' if abs(correlation[i, j]) > 0.5 else 'black'
        ax.text(j, i, f'{correlation[i, j]:.2f}', ha='center', va='center', 
                fontsize=12, fontweight='bold', color=color)

plt.colorbar(im, label='Correlation')
plt.tight_layout()
plt.show()

print("\nKey Insights:")
print(f"   TP53–BRCA1 correlation: {correlation[0,1]:.2f} (same pathway = high co-expression)")
print(f"   IL6–TNF correlation: {correlation[2,3]:.2f} (same pathway = high co-expression)")
print(f"   TP53–IL6 correlation: {correlation[0,2]:.2f} (different pathways = low)")
print(f"   Diagonal = 1.00 (each gene perfectly correlated with itself)")

Key Insights:
   TP53–BRCA1 correlation: 0.94 (same pathway = high co-expression)
   IL6–TNF correlation: 0.93 (same pathway = high co-expression)
   TP53–IL6 correlation: 0.10 (different pathways = low)
   Diagonal = 1.00 (each gene perfectly correlated with itself)

def generate_factor_covariance(n_genes, n_pathways, loading_scale=0.1, idio_range=(0.01, 0.05)):
    """
    Generate a realistic covariance matrix using a factor model.
    
    Parameters:
    -----------
    n_genes : int
        Number of genes (N)
    n_pathways : int
        Number of latent regulatory pathways (K)
    loading_scale : float
        Scale of pathway loadings (larger = stronger pathway influence)
    idio_range : tuple
        Range for gene-specific noise variances (min, max)
    
    Returns:
    --------
    Sigma : ndarray (N x N)
        Covariance matrix
    B : ndarray (N x K)
        Pathway loadings
    D : ndarray (N x N)
        Gene-specific noise covariance (diagonal)
    """
    # Step 1: Generate random pathway loadings
    B = np.random.randn(n_genes, n_pathways) * loading_scale
    
    # Step 2: Pathway covariance (identity = independent pathways)
    F = np.eye(n_pathways)
    
    # Step 3: Systematic covariance
    systematic_cov = B @ F @ B.T
    
    # Step 4: Gene-specific noise variances
    idio_var = np.random.uniform(idio_range[0], idio_range[1], n_genes)
    D = np.diag(idio_var)
    
    # Step 5: Total covariance
    Sigma = systematic_cov + D
    
    return Sigma, B, D

# Generate a larger example
N = 100  # 100 genes
K = 5    # 5 regulatory pathways

Sigma_large, B_large, D_large = generate_factor_covariance(N, K)

print(f"Generated covariance matrix for {N} genes using {K} pathways")
print(f"\nMatrix shape: {Sigma_large.shape}")
print(f"Pathway loadings shape: {B_large.shape}")

Generated covariance matrix for 100 genes using 5 pathways

Matrix shape: (100, 100)
Pathway loadings shape: (100, 5)

# Validate and visualize
eigenvalues_large = np.linalg.eigvalsh(Sigma_large)

fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Plot 1: Covariance matrix heatmap
im1 = axes[0].imshow(Sigma_large, cmap='viridis', aspect='auto')
axes[0].set_title(f'Covariance Matrix ({N}x{N})\nfrom {K}-Pathway Model', fontsize=12)
axes[0].set_xlabel('Gene')
axes[0].set_ylabel('Gene')
plt.colorbar(im1, ax=axes[0], label='Covariance')

# Plot 2: Pathway loadings
im2 = axes[1].imshow(B_large, cmap='RdBu_r', aspect='auto')
axes[1].set_title(f'Pathway Loadings ({N}x{K})', fontsize=12)
axes[1].set_xlabel('Pathway')
axes[1].set_ylabel('Gene')
plt.colorbar(im2, ax=axes[1], label='Loading')

# Plot 3: Eigenvalue distribution
axes[2].hist(eigenvalues_large, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[2].axvline(x=0, color='red', linestyle='--', label='Zero')
axes[2].set_title('Eigenvalue Distribution', fontsize=12)
axes[2].set_xlabel('Eigenvalue')
axes[2].set_ylabel('Count')
axes[2].legend()

plt.tight_layout()
plt.show()

print(f"\nAll eigenvalues positive: {all(eigenvalues_large > 0)}")
print(f"   Min eigenvalue: {eigenvalues_large.min():.6f}")
print(f"   Max eigenvalue: {eigenvalues_large.max():.6f}")

All eigenvalues positive: True
   Min eigenvalue: 0.011776
   Max eigenvalue: 1.222432

# Parameter count comparison
N = 500  # 500 genes
K = 10   # 10 pathways

# Full covariance: N(N+1)/2 unique parameters (symmetric)
full_params = N * (N + 1) // 2

# Factor model: N*K loadings + N gene-specific variances
factor_params = N * K + N

print("Parameter Count Comparison:")
print("=" * 50)
print(f"Gene panel size: {N} genes")
print(f"Number of pathways: {K}")
print(f"\nFull covariance matrix: {full_params:,} parameters")
print(f"Factor model:           {factor_params:,} parameters")
print(f"\nReduction: {(1 - factor_params/full_params)*100:.1f}% fewer parameters!")
print(f"\nThis matters in genomics: with ~200 patient samples")
print(f"we can't reliably estimate {full_params:,} parameters,")
print(f"but {factor_params:,} is feasible.")

Parameter Count Comparison:
==================================================
Gene panel size: 500 genes
Number of pathways: 10

Full covariance matrix: 125,250 parameters
Factor model:           5,500 parameters

Reduction: 95.6% fewer parameters!

This matters in genomics: with ~200 patient samples
we can't reliably estimate 125,250 parameters,
but 5,500 is feasible.

Factor (Pathway)	Description	Example Effect
p53 Pathway	Tumor suppressor / DNA damage response	TP53, BRCA1, CDKN1A activated together
NF-κB Signaling	Master inflammation regulator	IL6, TNF, IL1B co-expressed during inflammation
Cell Cycle	Mitosis and cell division control	CDK1, CCNB1, PLK1 peak together in dividing cells
Hypoxia Response	Low-oxygen adaptation	HIF1A, VEGFA, GLUT1 co-activated in tumors
Estrogen Signaling	Hormone receptor pathway	ESR1, PGR, GATA3 co-expressed in ER+ breast cancer

Benefit	Explanation
Guaranteed Valid	$BFB^T$ is always positive semi-definite, $D$ adds positive diagonal
Realistic Structure	Captures how genes cluster by regulatory pathway
Interpretable	Factors correspond to biological programs (pathways, TFs)
Efficient	Only need to estimate $N \times K + N$ parameters instead of $N \times N$
Stable	More robust when the number of samples is small relative to genes

Symbol	Name	Shape	Description
$\Sigma$	Covariance Matrix	N × N	Gene expression covariance we want to construct
$B$	Pathway Loadings	N × K	How each gene responds to each regulatory pathway
$F$	Pathway Covariance	K × K	How pathways co-vary (often $I$)
$D$	Gene-Specific Noise	N × N	Intrinsic noise per gene (diagonal)

Factor Models and Covariance Matrices¶

1. The Problem: Why Do We Need Covariance Matrices?¶

2. The Naive Approach (and Why It Fails)¶

3. The Factor Model: A Realistic Solution¶

The Key Insight¶

Real-World Example: What Are the Factors?¶

4. Building the Covariance Matrix: Step by Step¶

Step 4.1: Pathway Loading Matrix $B$¶

Step 4.2: Pathway Covariance Matrix $F$¶

Step 4.3: Systematic Covariance $B \cdot F \cdot B^T$¶

Step 4.4: Gene-Specific Noise Variance $D$¶

Step 4.5: Total Covariance Matrix $\Sigma = B F B^T + D$¶

Step 4.6: Verify It's a Valid Covariance Matrix¶

5. Converting to Correlation Matrix¶

6. Scaling Up: Large Covariance Matrices¶

7. Why Use Factor Models?¶

Advantages¶

8. Summary¶

The Factor Model Formula¶

Components¶

Key Insights¶