import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    BaggingClassifier,
    HistGradientBoostingClassifier,
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

# One seeded generator drives every random draw in the simulation, so the
# dataset is reproducible as long as the draw order below is left unchanged.
rng = np.random.default_rng(42)
plt.style.use('seaborn-v0_8-whitegrid')
mpl.rcParams['font.family'] = 'DejaVu Sans'

# Number of simulated cell lines (rows of the design matrix).
N = 500

# Gene expression: 10 named genes + 5 explicit noise genes, all drawn as
# independent standard-normal values.
gene_names = ['EGFR', 'KRAS_expr', 'TP53_expr', 'MYC', 'BRCA1',
              'BCL2', 'CDK4', 'MDM2', 'PTEN', 'RB1',
              'NOISE_1', 'NOISE_2', 'NOISE_3', 'NOISE_4', 'NOISE_5']
X_expr = rng.standard_normal((N, len(gene_names)))

# Mutation status (binary) — prevalences loosely based on TCGA pan-cancer.
# mut_probs[i] is the Bernoulli probability for mut_names[i].
mut_names = ['TP53_mut', 'KRAS_mut', 'EGFR_mut', 'BRAF_mut', 'PIK3CA_mut']
mut_probs = [0.35, 0.25, 0.15, 0.12, 0.20]
X_mut = np.column_stack([rng.binomial(1, p, N) for p in mut_probs])

# Tissue of origin — one-hot. The number of categories is derived from
# `tissues` so that adding or removing a tissue cannot desynchronise the
# sampled codes from the width of the one-hot block.
tissues = ['lung', 'breast', 'colon', 'skin']
n_tissues = len(tissues)
tissue_idx = rng.integers(0, n_tissues, N)
X_tissue = np.eye(n_tissues)[tissue_idx]

# Concatenate into the design matrix; feature_names follows the same
# expression | mutation | tissue column order.
X = np.column_stack([X_expr, X_mut, X_tissue])
feature_names = gene_names + mut_names + [f'tissue_{t}' for t in tissues]
D = X.shape[1]

# --- True biology ---
# Select the causal columns by name rather than by position, so reordering
# gene_names / mut_names / tissues cannot silently rewire the simulation.
EGFR   = X_expr[:, gene_names.index('EGFR')]
BCL2   = X_expr[:, gene_names.index('BCL2')]
TP53_mut = X_mut[:, mut_names.index('TP53_mut')]
KRAS_mut = X_mut[:, mut_names.index('KRAS_mut')]
EGFR_mut = X_mut[:, mut_names.index('EGFR_mut')]
is_lung  = X_tissue[:, tissues.index('lung')]

# Log-odds of being drug-sensitive. The (1 - mutation) factors switch a term
# off entirely whenever the corresponding mutation is present.
logit = (
    1.4 * np.clip(EGFR, -2, 2) * (1 - KRAS_mut)   # EGFR effect (clipped to [-2, 2]), killed by KRAS mut
    + 2.8 * EGFR_mut                               # EGFR mut → strongly sensitive
    + 1.3 * BCL2 * (1 - TP53_mut)                  # BCL2 effect, only when TP53 is wild-type
    + 0.6 * is_lung                                # tissue effect
    - 0.4                                          # intercept
)
# Logistic link turns log-odds into a probability; the label is one Bernoulli
# draw per cell line.
p_sens = 1 / (1 + np.exp(-logit))
y = rng.binomial(1, p_sens).astype(int)

# Summarise the simulated dataset before modelling.
n_expr, n_mut, n_tis = len(gene_names), len(mut_names), len(tissues)
tissue_counts = ', '.join(
    f'{label}={np.sum(tissue_idx==code)}' for code, label in enumerate(tissues)
)
print(f'N cell lines:         {N}')
print(f'N features:           {D}  ({n_expr} expr + {n_mut} mut + {n_tis} tissue)')
print(f'Sensitive fraction:   {y.mean():.2%}')
print('Tissue distribution:  ' + tissue_counts)

# Hold out 25% of the cell lines for testing; stratifying on y keeps the
# sensitive/resistant ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=0,
)
print(f'\nTrain: {len(y_train)}   Test: {len(y_test)}')

N cell lines:         500
N features:           24  (15 expr + 5 mut + 4 tissue)
Sensitive fraction:   50.00%
Tissue distribution:  lung=125, breast=124, colon=131, skin=120

Train: 375   Test: 125

# Scratch inspection cell: as the cell's only expression, X_mut is echoed by
# the notebook so the binary mutation matrix can be eyeballed.
X_mut
# Other objects worth inspecting the same way (uncomment one at a time):
# X_tissue
# np.eye(4)[1]
# y

array([[1, 0, 0, 0, 1],
       [0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0]], shape=(500, 5))

# A deliberately shallow tree: small enough to draw and read node by node.
tree_small = DecisionTreeClassifier(random_state=0, max_depth=3).fit(X_train, y_train)

small_tree_pred = tree_small.predict(X_test)
test_acc = accuracy_score(y_test, small_tree_pred)
print(f'Depth-3 tree test accuracy: {test_acc:.3f}')

# Rendering options for the tree diagram, collected in one place.
tree_plot_opts = dict(
    feature_names=feature_names,
    class_names=['resistant', 'sensitive'],
    filled=True,
    rounded=True,
    impurity=False,
    fontsize=9,
)

fig, ax = plt.subplots(figsize=(15, 6))
plot_tree(tree_small, ax=ax, **tree_plot_opts)
ax.set_title('Depth-3 decision tree on drug sensitivity data', fontsize=12)
plt.tight_layout()
plt.show()

Depth-3 tree test accuracy: 0.584

# Sweep the depth cap and record train vs. test accuracy for each tree.
depths = [1, 2, 3, 5, 8, 12, 20, None]  # None = unlimited
rows = []
for d in depths:
    t = DecisionTreeClassifier(max_depth=d, random_state=0).fit(X_train, y_train)
    rows.append((
        # For the unlimited tree, report the depth it actually grew to.
        d if d is not None else t.get_depth(),
        t.get_n_leaves(),
        accuracy_score(y_train, t.predict(X_train)),
        accuracy_score(y_test,  t.predict(X_test)),
    ))

# The table keeps the sweep order (unlimited tree last).
df_depth = pd.DataFrame(rows, columns=['depth', 'n_leaves', 'train_acc', 'test_acc'])
print(df_depth.to_string(index=False))

# The unlimited tree's realised depth can be smaller than the largest explicit
# cap, so the sweep order is not monotonic in depth. Sort before plotting,
# otherwise the connecting line doubles back along the x-axis.
df_plot = df_depth.sort_values('depth', kind='stable')

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(df_plot['depth'], df_plot['train_acc'], 'o-', label='train', color='#1f77b4')
ax.plot(df_plot['depth'], df_plot['test_acc'],  's-', label='test',  color='#d62728')
ax.set_xlabel('tree depth')
ax.set_ylabel('accuracy')
ax.set_title('A single tree overfits as it gets deeper')
ax.legend()
plt.tight_layout()
plt.show()

 depth  n_leaves  train_acc  test_acc
     1         2   0.610667     0.592
     2         4   0.674667     0.632
     3         8   0.696000     0.584
     5        23   0.797333     0.512
     8        46   0.941333     0.488
    12        66   0.997333     0.480
    20        67   1.000000     0.488
    13        67   1.000000     0.488

# Instability demo: fit the same depth-3 tree on three different 90%
# subsamples of the training set and compare the resulting structures.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
n_keep = int(0.9 * len(X_train))
for ax, seed in zip(axes, [1, 2, 3]):
    # Drop a random 10% of training data. The subsample is drawn from a
    # generator seeded with `seed`, so the seed shown in the panel title
    # really identifies (and reproduces) that panel's subset.
    sub = np.random.default_rng(seed).choice(len(X_train), size=n_keep, replace=False)
    t = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train[sub], y_train[sub])
    plot_tree(t, feature_names=feature_names,
              class_names=['R', 'S'], filled=True, rounded=True,
              impurity=False, fontsize=8, ax=ax)
    ax.set_title(f'Same algorithm, 90% of data (seed {seed})\n'
                 f'test acc = {accuracy_score(y_test, t.predict(X_test)):.3f}',
                 fontsize=10)
plt.tight_layout()
plt.show()

# Baseline: one fully grown tree (no depth cap).
tree_deep = DecisionTreeClassifier(random_state=0)
tree_deep.fit(X_train, y_train)

# Challenger: 50 fully grown trees, each fit on a bootstrap resample, with
# their predictions combined by the bagging ensemble.
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=0),
    n_estimators=50,
    bootstrap=True,
    random_state=0,
)
bag.fit(X_train, y_train)

# Score both models on the held-out set: hard-label accuracy, plus AUC from
# the predicted probability of the positive class.
deep_acc = accuracy_score(y_test, tree_deep.predict(X_test))
bag_acc = accuracy_score(y_test, bag.predict(X_test))
deep_auc = roc_auc_score(y_test, tree_deep.predict_proba(X_test)[:, 1])
bag_auc = roc_auc_score(y_test, bag.predict_proba(X_test)[:, 1])

print(f'Single deep tree  test acc: {deep_acc:.3f}')
print(f'Bag of 50 trees   test acc: {bag_acc:.3f}')
print(f'Single deep tree  test AUC: {deep_auc:.3f}')
print(f'Bag of 50 trees   test AUC: {bag_auc:.3f}')

Single deep tree  test acc: 0.488
Bag of 50 trees   test acc: 0.648
Single deep tree  test AUC: 0.488
Bag of 50 trees   test AUC: 0.703

# Random forest: 200 trees, each split considering sqrt(n_features) candidate
# features. oob_score=True also requests the out-of-bag accuracy estimate.
rf = RandomForestClassifier(
    random_state=0,
    oob_score=True,
    max_features='sqrt',
    n_estimators=200,
)
rf.fit(X_train, y_train)

rf_test_acc = accuracy_score(y_test, rf.predict(X_test))
rf_test_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

print(f'Random forest   test acc: {rf_test_acc:.3f}')
print(f'Random forest   test AUC: {rf_test_auc:.3f}')
print(f'Random forest   OOB acc:  {rf.oob_score_:.3f}   (estimated without touching the test set)')

Random forest   test acc: 0.632
Random forest   test AUC: 0.704
Random forest   OOB acc:  0.656   (estimated without touching the test set)

# Ensemble sizes to compare (log-spaced-ish, plotted on a log x-axis below).
n_list = [5, 10, 25, 50, 100, 200]


def _forest_errors(n_trees):
    """Fit a forest of `n_trees` trees; return (OOB error, test error)."""
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        max_features='sqrt',
        oob_score=True,
        random_state=0,
    )
    forest.fit(X_train, y_train)
    oob_err = 1 - forest.oob_score_
    test_err = 1 - accuracy_score(y_test, forest.predict(X_test))
    return oob_err, test_err


# NOTE: the smallest forests can make scikit-learn warn that some rows have no
# OOB score — with few trees, a row may never be left out of a bootstrap.
oob_curve, test_curve = [], []
for n_trees in n_list:
    oob_err, test_err = _forest_errors(n_trees)
    oob_curve.append(oob_err)
    test_curve.append(test_err)

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(n_list, oob_curve, 'o-', color='#2ca02c', label='OOB error (train-only)')
ax.plot(n_list, test_curve, 's-', color='#d62728', label='test error')
ax.set_xscale('log')
ax.set_xlabel('number of trees')
ax.set_ylabel('error rate')
ax.set_title('RF error vs. ensemble size — OOB tracks test error well')
ax.legend()
plt.tight_layout()
plt.show()

/Users/christophe/Sites/notebook/venv/lib/python3.13/site-packages/sklearn/ensemble/_forest.py:611: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.
  warn(
/Users/christophe/Sites/notebook/venv/lib/python3.13/site-packages/sklearn/ensemble/_forest.py:611: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates.
  warn(

def _noise_aware_colors(names):
    """Return one bar colour per feature: green for NOISE_* genes, blue otherwise."""
    return ['#2ca02c' if name.startswith('NOISE') else '#1f77b4' for name in names]


# 1. Impurity-based importance: read straight off the fitted forest and
#    sorted ascending, so the largest bar ends up at the top of the barh plot.
imp_gini = pd.Series(rf.feature_importances_, index=feature_names).sort_values()

# 2. Permutation importance, evaluated on the held-out test set with
#    10 shuffles per feature.
perm = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=0,
)
imp_perm = pd.Series(perm.importances_mean, index=feature_names).sort_values()
order = imp_perm.index
# Re-index the standard deviations so the error bars line up with the sorted means.
imp_perm_std = pd.Series(perm.importances_std, index=feature_names).loc[order]

fig, axes = plt.subplots(1, 2, figsize=(14, 6.5), sharey=False)
gini_ax, perm_ax = axes[0], axes[1]

colors_g = _noise_aware_colors(imp_gini.index)
gini_ax.barh(imp_gini.index, imp_gini.values, color=colors_g)
gini_ax.set_title('Impurity-based importance (built-in)', fontsize=11)
gini_ax.set_xlabel('mean Gini reduction')

colors_p = _noise_aware_colors(imp_perm.index)
perm_ax.barh(imp_perm.index, imp_perm.values, xerr=imp_perm_std.values,
             color=colors_p, ecolor='gray', capsize=2)
perm_ax.set_title('Permutation importance (accuracy drop on test)', fontsize=11)
perm_ax.set_xlabel('mean accuracy drop')
# Zero reference line: bars left of it mean shuffling the feature did not hurt.
perm_ax.axvline(0, color='black', lw=0.5)

plt.suptitle('Feature importance — noise genes in green, real in blue', y=1.01)
plt.tight_layout()
plt.show()

# Histogram-based gradient boosting with early stopping: up to 500 rounds are
# allowed, and 15% of the training rows are set aside as the validation
# fraction used to decide when to stop.
gbdt = HistGradientBoostingClassifier(
    learning_rate=0.05,
    max_depth=4,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.15,
    random_state=0,
)
gbdt.fit(X_train, y_train)

gbdt_test_acc = accuracy_score(y_test, gbdt.predict(X_test))
gbdt_test_auc = roc_auc_score(y_test, gbdt.predict_proba(X_test)[:, 1])

print(f'Gradient boosting test acc: {gbdt_test_acc:.3f}')
print(f'Gradient boosting test AUC: {gbdt_test_auc:.3f}')
print(f'Actual trees used:          {gbdt.n_iter_} (early-stopped from max_iter=500)')

Gradient boosting test acc: 0.680
Gradient boosting test AUC: 0.750
Actual trees used:          42 (early-stopped from max_iter=500)

# The four contenders, keyed by the label shown in the comparison table.
# Insertion order determines the row order of the table.
models = {
    'Single tree (depth 5)': DecisionTreeClassifier(max_depth=5, random_state=0),
    'Bagging (50 trees)': BaggingClassifier(
        DecisionTreeClassifier(random_state=0),
        n_estimators=50,
        random_state=0,
    ),
    'Random Forest (200)': RandomForestClassifier(
        n_estimators=200,
        max_features='sqrt',
        random_state=0,
    ),
    'Gradient Boosting': HistGradientBoostingClassifier(
        max_iter=500,
        learning_rate=0.05,
        max_depth=4,
        early_stopping=True,
        random_state=0,
    ),
}


def _fit_and_score(model):
    """Fit `model` on the training split; return (test accuracy, test AUC)."""
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    return acc, auc


rows = []
for label, clf in models.items():
    acc, auc = _fit_and_score(clf)
    rows.append((label, acc, auc))

df_cmp = pd.DataFrame(rows, columns=['model', 'test_acc', 'test_AUC'])
print(df_cmp.to_string(index=False))

                model  test_acc  test_AUC
Single tree (depth 5)     0.512  0.514081
   Bagging (50 trees)     0.648  0.702765
  Random Forest (200)     0.632  0.703661
    Gradient Boosting     0.720  0.742704

# log_loss is not among the notebook's top-level sklearn.metrics imports, so
# bring it in here, before first use, to keep this cell self-contained.
from sklearn.metrics import log_loss

# Fit without early stopping so we can see the full loss trajectory
gbdt_full = HistGradientBoostingClassifier(
    max_iter=400, learning_rate=0.05, max_depth=4,
    early_stopping=False, random_state=0,
)
# Carve our own validation slice out of the training data, so the loss can be
# scored on rows the model never saw while fitting.
X_tr, X_va, y_tr, y_va = train_test_split(X_train, y_train, test_size=0.2,
                                          stratify=y_train, random_state=1)

gbdt_full.fit(X_tr, y_tr)

# staged_predict_proba yields the predicted class probabilities after each
# boosting iteration; score every stage on both slices.
train_losses, val_losses = [], []
for p_tr, p_va in zip(gbdt_full.staged_predict_proba(X_tr),
                      gbdt_full.staged_predict_proba(X_va)):
    train_losses.append(log_loss(y_tr, p_tr))
    val_losses.append(log_loss(y_va, p_va))

# Stage k (0-based list index) is the model after k + 1 boosting iterations.
# Plot against 1-based iteration numbers so the x-axis and the "best iter"
# label both read as a number of boosting rounds.
iterations = np.arange(1, len(val_losses) + 1)
best_iter = int(np.argmin(val_losses)) + 1

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(iterations, train_losses, label='train log-loss', color='#1f77b4')
ax.plot(iterations, val_losses,   label='validation log-loss', color='#d62728')
ax.axvline(best_iter, ls='--', color='gray',
           label=f'best iter = {best_iter}')
ax.set_xlabel('boosting iteration')
ax.set_ylabel('log-loss')
ax.set_title('Train loss keeps dropping; val loss bottoms out — this is where to stop')
ax.legend()
plt.tight_layout()
plt.show()

# Small 2 x 2 grid over learning rate and tree depth, scored by 3-fold
# stratified cross-validated ROC AUC on the training split only.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
param_grid = [(lr, depth) for lr in [0.05, 0.1] for depth in [3, 6]]

results = []
for lr, depth in param_grid:
    booster = HistGradientBoostingClassifier(
        max_iter=300,
        learning_rate=lr,
        max_depth=depth,
        early_stopping=True,
        random_state=0,
    )
    fold_auc = cross_val_score(booster, X_train, y_train, cv=cv, scoring='roc_auc')
    results.append((lr, depth, fold_auc.mean(), fold_auc.std()))

# Rank the configurations, best mean AUC first.
df_grid = (
    pd.DataFrame(results, columns=['lr', 'depth', 'AUC_mean', 'AUC_std'])
    .sort_values('AUC_mean', ascending=False)
    .reset_index(drop=True)
)
print(df_grid.to_string(index=False))

best = df_grid.iloc[0]
print(f'\nBest config: lr={best.lr}, depth={int(best.depth)}')

  lr  depth  AUC_mean  AUC_std
0.05      3  0.731353 0.038930
0.10      3  0.728196 0.039492
0.10      6  0.711342 0.037370
0.05      6  0.698754 0.024319

Best config: lr=0.05, depth=3

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One 1D partial-dependence panel per (feature, panel title) pair, drawn left
# to right from the fitted random forest using the training rows.
pdp_panels = [
    ('EGFR', 'Effect of EGFR expression'),
    ('BCL2', 'Effect of BCL2 expression'),
    ('KRAS_mut', 'Effect of KRAS mutation status'),
]
for panel_ax, (feat, panel_title) in zip(axes, pdp_panels):
    PartialDependenceDisplay.from_estimator(
        rf, X_train,
        features=[feature_names.index(feat)],
        feature_names=feature_names,
        ax=panel_ax, line_kw={'color': '#1f77b4', 'lw': 2},
    )
    panel_ax.set_title(panel_title, fontsize=11)

plt.suptitle('1D partial dependence — the RF recovered the true biology', y=1.02)
plt.tight_layout()
plt.show()

# 2D PDP over the (EGFR expression, KRAS mutation) pair — the interaction
# that was written into the simulated log-odds.
egfr_col = feature_names.index('EGFR')
kras_col = feature_names.index('KRAS_mut')

fig, ax = plt.subplots(figsize=(6.5, 5))
PartialDependenceDisplay.from_estimator(
    rf,
    X_train,
    features=[(egfr_col, kras_col)],
    feature_names=feature_names,
    ax=ax,
)
ax.set_title('2D PDP: EGFR × KRAS_mut\n(high EGFR only helps if KRAS is wild-type)')
plt.tight_layout()
plt.show()

Hyperparameter	What it does	Typical range
`learning_rate` (ν)	Shrinkage — how much each tree contributes. Smaller = more trees needed, better generalization.	0.01 – 0.1
`n_estimators` / `max_iter`	Total number of boosting rounds. Use early stopping.	100 – 2000+
`max_depth`	Depth of each weak learner. Boosting wants shallow trees (3–8); deeper for complex interactions.	3 – 8

Situation	Pick
Small data (N < 200), mostly linear signal	Logistic / Ridge regression (with interpretable coefficients)
Tabular biotech data, N = 200 – 5 000	Gradient boosting (XGBoost / LightGBM / HistGBM)
Want a no-fuss baseline, accuracy not paramount	Random Forest (little tuning needed)
Very high-dim sparse data (genomics with $D \gg N$)	Lasso / Elastic Net first, trees second
Need calibrated probabilities	GBDT with log-loss or RF with isotonic calibration
Need to deploy with minimal tuning/risk	Random Forest — harder to shoot yourself in the foot than with boosting

Feature type	Measured from	What it tells you	Values
Gene expression	RNA (mRNA)	How actively the gene is being transcribed — a proxy for how much of its protein the cell is currently producing	Continuous (TPM, FPKM, log-counts)
Mutation status	DNA	Whether the gene's DNA sequence is altered (and may code a broken or hyperactive protein)	Usually binary (0 = wild-type, 1 = mutated)

Term	One-line definition
Oncogene	A gene whose mutation gains a cancer-driving function (e.g., KRAS, EGFR, MYC).
Tumor suppressor	A gene whose mutation loses a protective function (e.g., TP53, PTEN, RB1).
Wild-type	The normal, unmutated form of a gene.
Pathway	A chain of proteins passing signals to one another (e.g., EGFR → KRAS → MAPK → cell division).
Targeted therapy	A drug designed to block one specific protein (contrast with chemotherapy, which is non-specific).
Precision oncology	Choosing a drug based on a patient's specific molecular profile.
Biomarker	A measurable feature (gene expression, mutation, protein level) that predicts something clinical (diagnosis, prognosis, drug response).
Apoptosis	Programmed cell death — the cell's built-in self-destruct program.
Synthetic lethal	A combination of two genetic changes that is lethal only when both are present. A major framework for new cancer drugs.
Oncogene addiction	A cancer's dependence on a single mutated oncogene, which makes drugs targeting it devastatingly effective.
IC50	Drug concentration that reduces cell growth or viability by 50% — the standard sensitivity metric.

Tree Ensembles for Biotech: Random Forests and Gradient Boosting

Real-World Scenario: Predicting Cancer Cell Line Drug Sensitivity

1. Simulate a Multi-Omics Drug Sensitivity Dataset

2. A Single Decision Tree — Build Intuition First

The depth knob — underfit vs. overfit

The other big problem: trees are unstable

3. Bagging: Average Many Trees to Kill the Variance

4. Random Forests: Decorrelate the Trees

Out-of-bag (OOB) error — a free cross-validation

How many trees? More is better, up to diminishing returns.

5. Feature Importance — What Is the Model Actually Using?

Impurity-based importance (Gini importance)

Permutation importance (more robust)

6. Gradient Boosting — Usually the Accuracy Winner

Side-by-side: single tree vs. bag vs. RF vs. GBDT

7. Hyperparameter Tuning — The Three Knobs That Matter

Learning curves: spot overfitting visually

A quick cross-validated sweep

8. Partial Dependence Plots — What Did the Model Learn?

9. Practical Cheat Sheet

Which model to reach for first

The pitfalls that bite biotech practitioners

What's not covered here but worth knowing

Appendix — Biology Quick Start

What is a cancer cell line?

Two kinds of features: expression vs. mutation

The biology we baked into the simulated data

EGFR — the drug's target

KRAS — why the drug stops working

TP53 and BCL2 — the death machinery

Tissue of origin

The noise genes and noise mutations

How drug sensitivity is actually measured in the lab

Why tree models for this specifically

Glossary

Where to go next