08 — Results and Evaluation¶

Primary author: Victoria

Builds on:

  • 06_experiments_easy.ipynb (Victoria — Exp 1A/1B results on the easy random-distractor dataset, run_experiment() scaffolding, GroupKFold and hyperparameter tuning design)
  • 07_experiments_harder.ipynb (Victoria — Exp 2A/2B results on the harder cosine-similarity-distractor dataset, central misdirection hypothesis test via the A/B ablation)
  • 05_dataset_construction.ipynb (Victoria — easy and harder dataset construction, distractor strategies per Decisions 6 and 9)
  • 03_feature_engineering.ipynb (Victoria — 47-feature computation logic, feature group column lists extracted to scripts/feature_utils.py per Decision 18)
  • Hans's Hans_Supervised_Learning_Models.ipynb (KNN, LogReg, RF scaffolding — adapted for the full pipeline)

Prompt engineering: Victoria
AI assistance: Claude Code (Anthropic)
Environment: Local (CPU only)

This notebook implements PLAN.md Steps 9–12 (design doc Sections 10.1–10.6):

  • Step 9: Compile the summary results table (Table 8) comparing all experiments across both datasets, with Δ Easy and Δ Hard.
  • Step 10: Feature importance, ablation, and retrained best model.
  • Step 11: Sensitivity analysis.
  • Step 12: Failure analysis of misclassified examples.

Input:

  • outputs/results_easy.csv (NB 06)
  • outputs/results_harder.csv (NB 07)
  • outputs/results_harder_per_fold.csv (NB 07 — per-fold results with best hyperparameters for retraining)
  • data/dataset_harder.parquet (NB 05 — for retraining the best model)

Output:

  • outputs/results_summary.csv — Table 8 from the design doc
  • outputs/figures/results_summary_table.png — formatted table figure
  • outputs/figures/feature_importance_gini.png — Gini importance bar chart
  • outputs/figures/feature_importance_permutation.png — permutation importance
  • outputs/ablation_results.csv — group-level ablation results
  • outputs/figures/sensitivity_learning_curve.png — learning curve
  • Trained best model, test predictions, and predicted probabilities (stored in memory for downstream cells)

1. Configuration¶

SAMPLE_MODE: Set to True (default) for fast iteration with 20,000 rows when retraining the best model. Set to False for final runs on the full dataset. The results summary table (Step 9) always loads from the saved CSV files — it reflects whichever mode NB 06/07 were last run in.

In [ ]:
import ast
import sys
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore', category=FutureWarning)

# --- Environment Auto-Detection ---
# Same pattern as prior notebooks: detect Colab vs. local / Great Lakes.
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/'
                        'Milestone II - NLP Cryptic Crossword Clues/'
                        'clue_misdirection')
else:
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
FIGURES_DIR = OUTPUT_DIR / 'figures'
SCRIPTS_DIR = PROJECT_ROOT / 'scripts'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# --- Add scripts/ to sys.path so feature_utils is importable ---
# Decision 18: feature column lists are defined in feature_utils.py
# (extracted from NB 03) so all downstream notebooks use the same names.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

from feature_utils import (
    CONTEXT_INFORMED_COLS,
    RELATIONSHIP_COLS,
    SURFACE_COLS,
)

# --- Experiment parameters ---
RANDOM_SEED = 42
N_FOLDS = 5
SAMPLE_MODE = True   # <-- Set to False for final runs
SAMPLE_SIZE = 20_000

# --- Feature set for the harder dataset (Exp 2A) ---
# The harder dataset excludes the 15 context-free cosine features
# (Decision 6: artifacts of cosine-similarity-based distractor
# construction). The remaining 32 features are:
#   6 context-informed + 22 relationship + 4 surface
HARDER_FEATURE_COLS = (
    CONTEXT_INFORMED_COLS + list(RELATIONSHIP_COLS) + SURFACE_COLS
)

# --- Feature group definitions for ablation ---
# Each group maps to a conceptually distinct set of features.
# These are the same groups defined in feature_utils.py.
FEATURE_GROUPS = {
    'Context-Informed (6)': CONTEXT_INFORMED_COLS,
    'Relationship (22)': list(RELATIONSHIP_COLS),
    'Surface (4)': SURFACE_COLS,
}

print(f'Environment: {"Google Colab" if IS_COLAB else "Local / Great Lakes"}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Output directory: {OUTPUT_DIR}')
print(f'\nRandom seed: {RANDOM_SEED}')
print(f'Sample mode: {SAMPLE_MODE}'
      f'{f" ({SAMPLE_SIZE:,} rows)" if SAMPLE_MODE else " (full dataset)"}')
print(f'\nHarder feature set (Exp 2A): {len(HARDER_FEATURE_COLS)} features')
print(f'Feature groups:')
for name, cols in FEATURE_GROUPS.items():
    print(f'  {name}: {len(cols)} features')

2. Step 9 — Summary Results Table¶

This section compiles Table 8 from the design doc (Section 10.2): a single summary table comparing all three model families across both datasets and both feature-set conditions.

| Model               | Exp 1A | Exp 1B | Δ Easy | Exp 2A | Exp 2B | Δ Hard |
|---------------------|--------|--------|--------|--------|--------|--------|
| KNN                 | ...    | ...    | ...    | ...    | ...    | ...    |
| Logistic Regression | ...    | ...    | ...    | ...    | ...    | ...    |
| Random Forest       | ...    | ...    | ...    | ...    | ...    | ...    |
  • Exp 1A/1B (easy dataset, random distractors): 47 features / 41 features (1B drops the 6 context-informed features)
  • Exp 2A/2B (harder dataset, cosine-similarity distractors): 32 features / 26 features (2B likewise drops them)
  • Δ Easy = 1A − 1B and Δ Hard = 2A − 2B: the accuracy change attributable to the context-informed features

Values are mean accuracy ± SD from 5-fold GroupKFold CV.
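As a reminder of the guarantee behind these CV numbers, here is a minimal sketch (toy data, not the project's datasets) showing why GroupKFold is used: rows sharing a group key never straddle the train/test boundary, so a definition–answer pair seen in training can never reappear in the test fold.

```python
import numpy as np
from sklearn.model_selection import GroupKFold

# Toy data: 12 rows in 6 groups (think: two rows per definition-answer pair).
groups = np.repeat(['g0', 'g1', 'g2', 'g3', 'g4', 'g5'], 2)
X = np.arange(12).reshape(-1, 1)
y = np.tile([0, 1], 6)

gkf = GroupKFold(n_splits=3)
for train_idx, test_idx in gkf.split(X, y, groups=groups):
    # Every group lands entirely in train or entirely in test --
    # no group can leak across the split.
    assert set(groups[train_idx]).isdisjoint(groups[test_idx])
print('GroupKFold: no group spans the train/test boundary')
```

A plain KFold on the same data would offer no such guarantee, which is why the notebooks build the group key from `definition_wn` and `answer_wn`.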

In [ ]:
# ============================================================
# Load experiment results from NB 06 and NB 07
# ============================================================
results_easy_path = OUTPUT_DIR / 'results_easy.csv'
results_harder_path = OUTPUT_DIR / 'results_harder.csv'

assert results_easy_path.exists(), (
    f'Missing {results_easy_path}\n'
    f'Run 06_experiments_easy.ipynb first.'
)
assert results_harder_path.exists(), (
    f'Missing {results_harder_path}\n'
    f'Run 07_experiments_harder.ipynb first.'
)

results_easy = pd.read_csv(results_easy_path)
results_harder = pd.read_csv(results_harder_path)

print(f'Loaded results_easy.csv: {len(results_easy)} rows')
print(results_easy[['Experiment', 'Model', 'accuracy_mean', 'accuracy_std']].to_string(index=False))

print(f'\nLoaded results_harder.csv: {len(results_harder)} rows')
print(results_harder[['Experiment', 'Model', 'accuracy_mean', 'accuracy_std']].to_string(index=False))
In [ ]:
# ============================================================
# Build Table 8: Summary Results (design doc Section 10.2)
# ============================================================
# Rows: KNN, Logistic Regression, Random Forest
# Columns: Exp 1A, Exp 1B, Delta Easy, Exp 2A, Exp 2B, Delta Hard
# Values: mean accuracy +/- SD

model_names = ['KNN', 'Logistic Regression', 'Random Forest']

def _fmt(mean, std):
    """Format accuracy as 'mean +/- SD' with 4 decimal places."""
    return f'{mean:.4f} +/- {std:.4f}'

def _fmt_delta(delta):
    """Format delta with sign indicator."""
    return f'{delta:+.4f}'

table_rows = []

for model in model_names:
    row = {'Model': model}

    # --- Easy dataset ---
    e1a = results_easy[(results_easy['Experiment'] == 'Exp_1A') &
                       (results_easy['Model'] == model)].iloc[0]
    e1b = results_easy[(results_easy['Experiment'] == 'Exp_1B') &
                       (results_easy['Model'] == model)].iloc[0]

    row['Exp 1A'] = _fmt(e1a['accuracy_mean'], e1a['accuracy_std'])
    row['Exp 1B'] = _fmt(e1b['accuracy_mean'], e1b['accuracy_std'])
    delta_easy = e1a['accuracy_mean'] - e1b['accuracy_mean']
    row['Delta Easy'] = _fmt_delta(delta_easy)

    # --- Harder dataset ---
    e2a = results_harder[(results_harder['Experiment'] == 'Exp_2A') &
                         (results_harder['Model'] == model)].iloc[0]
    e2b = results_harder[(results_harder['Experiment'] == 'Exp_2B') &
                         (results_harder['Model'] == model)].iloc[0]

    row['Exp 2A'] = _fmt(e2a['accuracy_mean'], e2a['accuracy_std'])
    row['Exp 2B'] = _fmt(e2b['accuracy_mean'], e2b['accuracy_std'])
    delta_hard = e2a['accuracy_mean'] - e2b['accuracy_mean']
    row['Delta Hard'] = _fmt_delta(delta_hard)

    # Store numeric values for the CSV export
    row['_e1a_mean'] = e1a['accuracy_mean']
    row['_e1a_std'] = e1a['accuracy_std']
    row['_e1b_mean'] = e1b['accuracy_mean']
    row['_e1b_std'] = e1b['accuracy_std']
    row['_delta_easy'] = delta_easy
    row['_e2a_mean'] = e2a['accuracy_mean']
    row['_e2a_std'] = e2a['accuracy_std']
    row['_e2b_mean'] = e2b['accuracy_mean']
    row['_e2b_std'] = e2b['accuracy_std']
    row['_delta_hard'] = delta_hard

    table_rows.append(row)

table8 = pd.DataFrame(table_rows)

# --- Display formatted table ---
display_cols = ['Model', 'Exp 1A', 'Exp 1B', 'Delta Easy',
                'Exp 2A', 'Exp 2B', 'Delta Hard']
print('TABLE 8 \u2014 Summary Results: Mean Accuracy +/- SD (5-fold GroupKFold CV)')
print('=' * 110)
print(table8[display_cols].to_string(index=False))

# --- Save to CSV ---
# The CSV includes both the formatted strings (for human readability)
# and the raw numeric values (for downstream analysis).
save_cols = [
    'Model',
    '_e1a_mean', '_e1a_std', '_e1b_mean', '_e1b_std', '_delta_easy',
    '_e2a_mean', '_e2a_std', '_e2b_mean', '_e2b_std', '_delta_hard',
]
save_df = table8[save_cols].rename(columns={
    '_e1a_mean': 'exp_1a_accuracy_mean', '_e1a_std': 'exp_1a_accuracy_std',
    '_e1b_mean': 'exp_1b_accuracy_mean', '_e1b_std': 'exp_1b_accuracy_std',
    '_delta_easy': 'delta_easy',
    '_e2a_mean': 'exp_2a_accuracy_mean', '_e2a_std': 'exp_2a_accuracy_std',
    '_e2b_mean': 'exp_2b_accuracy_mean', '_e2b_std': 'exp_2b_accuracy_std',
    '_delta_hard': 'delta_hard',
})
summary_path = OUTPUT_DIR / 'results_summary.csv'
save_df.to_csv(summary_path, index=False)
print(f'\nSaved to {summary_path}')
In [ ]:
# ============================================================
# Render Table 8 as a matplotlib figure for report inclusion
# ============================================================
# A publication-quality table rendered as an image, suitable for
# pasting into the final report. The figure uses no axes — just
# a clean table with alternating row shading.

fig, ax = plt.subplots(figsize=(14, 2.8))
ax.axis('off')
ax.set_title('Table 8 \u2014 Summary Results: Mean Accuracy \u00b1 SD (5-fold GroupKFold CV)',
             fontsize=13, fontweight='bold', pad=12)

# --- Prepare cell text ---
col_labels = ['Model', 'Exp 1A', 'Exp 1B', '\u0394 Easy',
              'Exp 2A', 'Exp 2B', '\u0394 Hard']
cell_text = []
for _, r in table8.iterrows():
    cell_text.append([
        r['Model'],
        r['Exp 1A'].replace('+/-', '\u00b1'),
        r['Exp 1B'].replace('+/-', '\u00b1'),
        r['Delta Easy'],
        r['Exp 2A'].replace('+/-', '\u00b1'),
        r['Exp 2B'].replace('+/-', '\u00b1'),
        r['Delta Hard'],
    ])

tbl = ax.table(
    cellText=cell_text,
    colLabels=col_labels,
    loc='center',
    cellLoc='center',
)
tbl.auto_set_font_size(False)
tbl.set_fontsize(10)
tbl.scale(1.0, 1.6)

# --- Style: header row and alternating row colors ---
header_color = '#4472C4'
row_colors = ['#D9E2F3', '#FFFFFF', '#D9E2F3']

for (row_idx, col_idx), cell in tbl.get_celld().items():
    if row_idx == 0:
        # Header row
        cell.set_facecolor(header_color)
        cell.set_text_props(color='white', fontweight='bold')
        cell.set_edgecolor('white')
    else:
        cell.set_facecolor(row_colors[(row_idx - 1) % len(row_colors)])
        cell.set_edgecolor('#CCCCCC')
    # Left-align the Model column. Table positions cell text from the
    # cell's private _loc anchor, so set_text_props(ha=...) alone is not
    # enough; overriding _loc is the standard workaround.
    if col_idx == 0:
        cell.set_text_props(ha='left')
        cell._loc = 'left'

# --- Highlight Delta Hard column to draw attention to the key result ---
for row_idx in range(1, len(cell_text) + 1):
    cell = tbl[row_idx, 6]
    delta_val = table8.iloc[row_idx - 1]['_delta_hard']
    if delta_val > 0:
        cell.set_text_props(fontweight='bold', color='#2E7D32')
    elif delta_val < 0:
        cell.set_text_props(fontweight='bold', color='#C62828')

fig.tight_layout()

fig_path = FIGURES_DIR / 'results_summary_table.png'
fig.savefig(fig_path, dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
print(f'Saved figure to {fig_path}')
plt.show()

Interpreting the Summary Results¶

Table 8 presents the core experimental results. The interpretation follows the design doc Section 8.4:

Easy dataset (Exp 1A vs. 1B) — sanity check:

  • All three models achieve high accuracy on the easy dataset, as expected when distractors are random and semantically unrelated to the definition.
  • Δ Easy ≈ 0 for all models. Removing the 6 context-informed features has negligible impact because random distractors are already trivially distinguishable using context-free meaning and relationship features alone. This confirms that context-informed features add little value when the task is easy.

Harder dataset (Exp 2A vs. 2B) — misdirection test:

  • Accuracy drops substantially from the easy to the harder dataset, confirming that cosine-similarity distractors (Decision 6) make the task genuinely difficult.
  • Δ Hard > 0 for all models: the context-informed features help classification on the harder task. This means the classifier can extract useful signal from the definition's contextual shift within the clue — even though the retrieval analysis (NB 04) shows that clue context degrades retrieval of the true answer.

Tradeoffs (design doc Section 10.5):

  1. Easy vs. harder dataset (accuracy vs. informativeness): The easy dataset gives high accuracy but is uninformative about misdirection. The harder dataset sacrifices accuracy for a genuinely diagnostic experiment. The Δ comparison only makes sense on the harder task.

  2. Interpretability vs. performance: Logistic Regression offers interpretable coefficients (useful for understanding which features matter and in which direction), while Random Forest typically achieves the highest accuracy. Both perspectives are valuable — Step 10 examines feature importance from the best model.

  3. Feature richness vs. misdirection detection: The 32-feature set (Exp 2A) includes context-informed features that capture the clue's misdirecting surface reading. Including these features helps the classifier overall, but the direction of their contribution (examined via feature importance in Step 10) reveals whether the model learns to exploit or compensate for the misdirection signal.


3. Step 10a — Retrain Best Model on Fold 0¶

Before computing feature importance, ablation, and failure analysis, we need a trained model and its predictions. This section:

  1. Identifies the best-performing model from the harder dataset results (we expect Random Forest based on the Exp 2A accuracy).
  2. Loads the harder dataset and applies the same GroupKFold split used in NB 07.
  3. Retrains the best model on fold 0's training set using the best hyperparameters from the per-fold results.
  4. Stores the trained model, test predictions, and predicted probabilities for use in Steps 10b–12.

We use the Exp 2A feature set (32 features: 6 context-informed + 22 relationship + 4 surface) since that is the "with context" condition — the one where we want to inspect which features the model relies on and how it handles the misdirection signal.

In [ ]:
# ============================================================
# Identify the best model from the harder dataset
# ============================================================
# We select the model with the highest mean accuracy on Exp 2A
# (the "with context" condition on the harder dataset), because
# that is the full-feature condition whose internals we want to
# examine in Steps 10–12.

exp2a_results = results_harder[results_harder['Experiment'] == 'Exp_2A']
best_row = exp2a_results.loc[exp2a_results['accuracy_mean'].idxmax()]
best_model_name = best_row['Model']
best_accuracy = best_row['accuracy_mean']

print(f'Best model on harder dataset (Exp 2A): {best_model_name}')
print(f'Mean accuracy: {best_accuracy:.4f} +/- {best_row["accuracy_std"]:.4f}')

# ============================================================
# Load per-fold results to get best hyperparameters for fold 0
# ============================================================
per_fold_path = OUTPUT_DIR / 'results_harder_per_fold.csv'
assert per_fold_path.exists(), (
    f'Missing {per_fold_path}\n'
    f'Run 07_experiments_harder.ipynb first.'
)

per_fold_df = pd.read_csv(per_fold_path)

# Get fold 0 best hyperparameters for the best model under Exp 2A.
# The best_params column is a stringified dict — parse it with
# ast.literal_eval.
fold0_row = per_fold_df[
    (per_fold_df['experiment'] == 'Exp_2A') &
    (per_fold_df['model'] == best_model_name) &
    (per_fold_df['fold'] == 0)
].iloc[0]

fold0_best_params = ast.literal_eval(fold0_row['best_params'])
fold0_expected_acc = fold0_row['accuracy']

print(f'\nFold 0 best hyperparameters: {fold0_best_params}')
print(f'Fold 0 expected accuracy: {fold0_expected_acc:.4f}')
In [ ]:
# ============================================================
# Load the harder dataset
# ============================================================
dataset_path = DATA_DIR / 'dataset_harder.parquet'
assert dataset_path.exists(), (
    f'Missing input file: {dataset_path}\n'
    f'Run 05_dataset_construction.ipynb first to produce this file.'
)

df = pd.read_parquet(dataset_path)
print(f'Loaded dataset_harder.parquet: {len(df):,} rows x {len(df.columns)} columns')

# --- Validate expected columns ---
missing_feat = [c for c in HARDER_FEATURE_COLS if c not in df.columns]
assert not missing_feat, f'Missing feature columns: {missing_feat}'
assert 'label' in df.columns, 'Missing label column'
assert 'definition_wn' in df.columns, 'Missing definition_wn column'
assert 'answer_wn' in df.columns, 'Missing answer_wn column'

# --- Sample mode ---
# Same stratified subsampling pattern as NB 07 to ensure the
# GroupKFold split reproduces the same fold assignments.
if SAMPLE_MODE:
    sampled_parts = []
    for label_val in df['label'].unique():
        group = df[df['label'] == label_val]
        sampled_parts.append(
            group.sample(n=min(SAMPLE_SIZE // 2, len(group)),
                         random_state=RANDOM_SEED)
        )
    df = pd.concat(sampled_parts, ignore_index=True)
    print(f'\n\u26a0 SAMPLE MODE: subsampled to {len(df):,} rows '
          f'(set SAMPLE_MODE = False for final runs)')

print(f'\nShape: {df.shape}')
print(f'Label distribution:')
print(df['label'].value_counts().to_string())

# --- Validate no NaNs in feature columns ---
assert not df[HARDER_FEATURE_COLS].isnull().any().any(), \
    'NaN values found in feature columns'
In [ ]:
# ============================================================
# Assign GroupKFold splits (same logic as NB 07)
# ============================================================
# We replicate the exact same GroupKFold assignment so that fold 0
# here matches fold 0 in NB 07. This requires using the same group
# key construction, same n_splits, and same data (or same sample).

groups = df['definition_wn'].astype(str) + '|||' + df['answer_wn'].astype(str)

gkf = GroupKFold(n_splits=N_FOLDS)

df['fold'] = -1
for fold_idx, (_, test_idx) in enumerate(gkf.split(df, y=df['label'], groups=groups)):
    df.loc[df.index[test_idx], 'fold'] = fold_idx

assert (df['fold'] >= 0).all(), 'Some rows were not assigned to any fold'

# Verify no group leakage
folds_per_group = df.groupby(['definition_wn', 'answer_wn'])['fold'].nunique()
leaked_groups = folds_per_group[folds_per_group > 1]
assert len(leaked_groups) == 0, (
    f'{len(leaked_groups)} groups span multiple folds'
)
print(f'GroupKFold verification passed \u2713')

# --- Fold 0 train/test split ---
test_mask = (df['fold'] == 0).values
train_mask = ~test_mask

X_train = df.loc[train_mask, HARDER_FEATURE_COLS].values
X_test = df.loc[test_mask, HARDER_FEATURE_COLS].values
y_train = df.loc[train_mask, 'label'].values
y_test = df.loc[test_mask, 'label'].values

# Keep the test DataFrame for failure analysis in later cells
df_test = df[test_mask].copy()

print(f'\nFold 0 split: train={train_mask.sum():,}  test={test_mask.sum():,}')
print(f'Train label balance: {(y_train == 1).sum():,} pos / {(y_train == 0).sum():,} neg')
print(f'Test label balance:  {(y_test == 1).sum():,} pos / {(y_test == 0).sum():,} neg')
In [ ]:
# ============================================================
# Retrain the best model on fold 0 with best hyperparameters
# ============================================================
# We train directly with the best hyperparameters found during
# NB 07's inner CV for this fold — no additional search needed.
# This reproduces the exact model that produced the fold 0 result.
#
# Random Forest is scale-invariant (Decision in CLAUDE.md coding
# standards), so no StandardScaler is applied.

print(f'Retraining: {best_model_name}')
print(f'Hyperparameters: {fold0_best_params}')
print(f'Feature set: Exp 2A ({len(HARDER_FEATURE_COLS)} features)')

best_model = RandomForestClassifier(
    random_state=RANDOM_SEED,
    **fold0_best_params,
)
best_model.fit(X_train, y_train)

# --- Predictions and probabilities ---
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

# Store predictions in the test DataFrame for failure analysis later
df_test['y_pred'] = y_pred
df_test['y_prob'] = y_prob

# --- Verify accuracy matches NB 07 fold 0 ---
retrained_acc = accuracy_score(y_test, y_pred)

print(f'\nRetrained fold 0 accuracy: {retrained_acc:.4f}')
print(f'Expected fold 0 accuracy:  {fold0_expected_acc:.4f}')

# The accuracy should match exactly (same data, same splits, same
# hyperparameters, same random seed). A small tolerance accounts for
# floating-point differences across platforms.
if abs(retrained_acc - fold0_expected_acc) < 1e-6:
    print('\nAccuracy matches NB 07 fold 0 result exactly \u2713')
else:
    diff = retrained_acc - fold0_expected_acc
    print(f'\nWARNING: Accuracy differs by {diff:+.6f} from NB 07 fold 0.')
    print('This may indicate a difference in sampling, fold assignment, '
          'or hyperparameters. Investigate before proceeding.')

print(f'\nModel, predictions, and probabilities stored for Steps 10b\u201312.')

4. Step 10b — Feature Importance¶

We compute feature importance using two complementary methods (design doc Section 10.3):

  1. Gini importance (feature_importances_): the total decrease in Gini impurity contributed by each feature across all trees in the Random Forest. Fast to compute (already stored in the trained model) but biased toward high-cardinality continuous features.

  2. Permutation importance: the decrease in accuracy when each feature's values are randomly shuffled in the test set. More reliable than Gini importance because it measures the actual impact on held-out predictions, but slower to compute.

Comparing the two reveals whether the model's internal reliance on a feature (Gini) matches its actual predictive contribution (permutation). For the misdirection question, we are especially interested in where the 6 context-informed features rank — high importance would confirm they carry useful signal despite the retrieval analysis showing that clue context degrades answer retrieval.
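The cardinality bias mentioned above is easy to demonstrate on synthetic data (a toy sketch, unrelated to the crossword features): a pure-noise continuous feature picks up visible Gini importance simply because it offers many split points for the trees to fit label noise, while permutation importance on a held-out split correctly scores it near zero.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
n = 2000
x_signal = rng.integers(0, 2, n)    # binary feature that drives the label
x_noise = rng.normal(size=n)        # continuous pure noise, many split points
y = np.where(rng.random(n) < 0.9, x_signal, 1 - x_signal)  # signal + 10% label noise

X = np.column_stack([x_signal, x_noise])
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
rf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_tr, y_tr)

# MDI credits the noise feature for the impurity it removes while
# overfitting the 10% label noise in training.
print('Gini (MDI)  [signal, noise]:', rf.feature_importances_)
# Permutation importance on held-out data shows its true contribution.
perm = permutation_importance(rf, X_te, y_te, n_repeats=10, random_state=0)
print('Permutation [signal, noise]:', perm.importances_mean)
```

The same logic motivates reporting both methods below: agreement between them is evidence that a feature's high Gini rank reflects real predictive signal rather than split-point abundance.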

In [ ]:
# ============================================================
# Gini Importance (from the trained RF)
# ============================================================
# Gini importance is the mean decrease in impurity (MDI) across all
# trees. It is fast (already computed during training) but can be
# biased toward continuous features with many unique values.

gini_imp = pd.Series(
    best_model.feature_importances_,
    index=HARDER_FEATURE_COLS,
    name='gini_importance',
).sort_values(ascending=False)

print('Top 15 features by Gini importance:')
print(gini_imp.head(15).to_string())

# --- Tag each feature with its group for color-coding ---
def _feature_group(feat_name):
    """Return the group name for a feature column."""
    if feat_name in CONTEXT_INFORMED_COLS:
        return 'Context-Informed'
    elif feat_name in RELATIONSHIP_COLS:
        return 'Relationship'
    elif feat_name in SURFACE_COLS:
        return 'Surface'
    return 'Unknown'

GROUP_COLORS = {
    'Context-Informed': '#E57373',  # red
    'Relationship': '#4FC3F7',      # blue
    'Surface': '#81C784',           # green
}

# ============================================================
# Plot: Top 15 by Gini importance
# ============================================================
top15_gini = gini_imp.head(15).iloc[::-1]  # reverse for horizontal bar

fig, ax = plt.subplots(figsize=(9, 6))
colors = [GROUP_COLORS[_feature_group(f)] for f in top15_gini.index]
ax.barh(range(len(top15_gini)), top15_gini.values, color=colors, edgecolor='white')
ax.set_yticks(range(len(top15_gini)))
ax.set_yticklabels(top15_gini.index, fontsize=9)
ax.set_xlabel('Gini Importance (Mean Decrease in Impurity)', fontsize=11)
ax.set_title(f'Top 15 Features by Gini Importance \u2014 {best_model_name} (Exp 2A, Fold 0)',
             fontsize=12, fontweight='bold')

# Legend for feature groups
from matplotlib.patches import Patch
legend_handles = [Patch(facecolor=c, label=g) for g, c in GROUP_COLORS.items()]
ax.legend(handles=legend_handles, loc='lower right', fontsize=9,
          title='Feature Group', title_fontsize=9)

ax.grid(axis='x', alpha=0.3)
fig.tight_layout()

fig_path = FIGURES_DIR / 'feature_importance_gini.png'
fig.savefig(fig_path, dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
print(f'Saved to {fig_path}')
plt.show()
In [ ]:
# ============================================================
# Permutation Importance (on the test set)
# ============================================================
# Permutation importance measures how much accuracy drops when a
# single feature's values are randomly shuffled in the test set,
# breaking its relationship with the target. Unlike Gini importance,
# this is computed on held-out data and is not biased by feature
# cardinality. We use n_repeats=10 for stable estimates.

print('Computing permutation importance (n_repeats=10)...')
perm_result = permutation_importance(
    best_model, X_test, y_test,
    n_repeats=10,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    scoring='accuracy',
)

perm_imp = pd.DataFrame({
    'feature': HARDER_FEATURE_COLS,
    'importance_mean': perm_result.importances_mean,
    'importance_std': perm_result.importances_std,
}).sort_values('importance_mean', ascending=False).reset_index(drop=True)

print('\nTop 15 features by permutation importance:')
print(perm_imp.head(15).to_string(index=False))

# ============================================================
# Plot: Top 15 by permutation importance
# ============================================================
top15_perm = perm_imp.head(15).iloc[::-1]  # reverse for horizontal bar

fig, ax = plt.subplots(figsize=(9, 6))
colors = [GROUP_COLORS[_feature_group(f)] for f in top15_perm['feature']]
ax.barh(
    range(len(top15_perm)),
    top15_perm['importance_mean'].values,
    xerr=top15_perm['importance_std'].values,
    color=colors, edgecolor='white', capsize=3,
)
ax.set_yticks(range(len(top15_perm)))
ax.set_yticklabels(top15_perm['feature'].values, fontsize=9)
ax.set_xlabel('Permutation Importance (Decrease in Accuracy)', fontsize=11)
ax.set_title(f'Top 15 Features by Permutation Importance \u2014 {best_model_name} (Exp 2A, Fold 0)',
             fontsize=12, fontweight='bold')

# Legend for feature groups
legend_handles = [Patch(facecolor=c, label=g) for g, c in GROUP_COLORS.items()]
ax.legend(handles=legend_handles, loc='lower right', fontsize=9,
          title='Feature Group', title_fontsize=9)

ax.grid(axis='x', alpha=0.3)
fig.tight_layout()

fig_path = FIGURES_DIR / 'feature_importance_permutation.png'
fig.savefig(fig_path, dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
print(f'Saved to {fig_path}')
plt.show()

Interpreting Feature Importance¶

What the rankings tell us about misdirection:

The feature importance rankings reveal which features the Random Forest relies on most to distinguish real definition–answer pairs from cosine-similarity distractors.

  • Context-informed features (red bars): If these rank highly, the clue-context embedding shift carries genuine discriminative signal. This does not contradict the retrieval analysis — clue context can simultaneously degrade retrieval (by shifting the definition embedding away from the true answer on average) and help classification (by providing a distinctive pattern that the model learns to exploit). The retrieval task asks "which answer is closest?" while the classifier asks "is this pair real or fake?" — different questions can yield different answers.

  • Relationship features (blue bars): WordNet structural features like synonym, hypernym, and path similarity capture taxonomic relationships that embedding-based cosine similarity may miss. High importance here suggests that real definition–answer pairs share WordNet connections that even semantically similar distractors lack.

  • Surface features (green bars): Edit distance, length ratio, and character overlap measure shallow orthographic similarity. High importance would suggest the model exploits superficial patterns — potentially an artifact of how crossword answers tend to relate to their definitions (e.g., shared word roots).

Gini vs. permutation importance:

If the two rankings largely agree, the model's internal structure (Gini) matches its actual predictive behavior (permutation). Disagreements can arise because Gini importance is biased toward continuous features with many split points, while permutation importance is unbiased but noisier. Features that rank high on permutation but low on Gini may have nonlinear interactions that individual tree splits don't capture well.


5. Step 10c — Group-Level Ablation¶

The experimental design already provides one ablation: Exp 2A vs. 2B removes the 6 context-informed features. Here we extend this with finer-grained group-level ablation, removing one feature group at a time from the full Exp 2A feature set and measuring the accuracy change on fold 0's test set.

This reveals which group of features contributes most to the model's performance — complementing the individual feature importance analysis above.

| Group removed    | Features removed | Features remaining |
|------------------|------------------|--------------------|
| Context-Informed | 6                | 26 (= Exp 2B)      |
| Relationship     | 22               | 10                 |
| Surface          | 4                | 28                 |
In [ ]:
# ============================================================
# Group-Level Ablation: remove one group at a time
# ============================================================
# For each feature group, we retrain the RF on fold 0's training
# set *without* that group's features, then evaluate on fold 0's
# test set. The accuracy drop relative to the full model tells us
# how much that group contributes.

# Baseline: full model accuracy on fold 0 test set
baseline_acc = retrained_acc
print(f'Baseline accuracy (all {len(HARDER_FEATURE_COLS)} features): '
      f'{baseline_acc:.4f}\n')

ablation_rows = []

for group_name, group_cols in FEATURE_GROUPS.items():
    # Features remaining after removing this group
    remaining_cols = [c for c in HARDER_FEATURE_COLS if c not in group_cols]
    n_removed = len(group_cols)
    n_remaining = len(remaining_cols)

    # Get column indices for the remaining features
    remaining_idx = [HARDER_FEATURE_COLS.index(c) for c in remaining_cols]
    X_train_ablated = X_train[:, remaining_idx]
    X_test_ablated = X_test[:, remaining_idx]

    # Retrain RF with the same hyperparameters
    rf_ablated = RandomForestClassifier(
        random_state=RANDOM_SEED,
        **fold0_best_params,
    )
    rf_ablated.fit(X_train_ablated, y_train)
    ablated_acc = accuracy_score(y_test, rf_ablated.predict(X_test_ablated))
    delta = ablated_acc - baseline_acc

    ablation_rows.append({
        'group_removed': group_name,
        'n_removed': n_removed,
        'n_remaining': n_remaining,
        'accuracy': ablated_acc,
        'delta_accuracy': delta,
    })

    print(f'Remove {group_name:25s}  '
          f'{n_removed:>2d} removed, {n_remaining:>2d} remain  '
          f'Acc={ablated_acc:.4f}  \u0394={delta:+.4f}')

ablation_df = pd.DataFrame(ablation_rows)

# --- Save to CSV ---
ablation_path = OUTPUT_DIR / 'ablation_results.csv'
ablation_df.to_csv(ablation_path, index=False)
print(f'\nSaved to {ablation_path}')

# --- Display as formatted table ---
# Literal 'Δ' and '—' below: backslash escapes (e.g. \u0394) are not
# allowed inside f-string expression braces before Python 3.12.
print(f'\n{"Group Removed":<28s} {"Removed":>8s} {"Remain":>8s} '
      f'{"Accuracy":>10s} {"Δ Accuracy":>12s}')
print('-' * 72)
print(f'{"(none — baseline)":<28s} {"—":>8s} '
      f'{len(HARDER_FEATURE_COLS):>8d} '
      f'{baseline_acc:>10.4f} {"—":>12s}')
for _, r in ablation_df.iterrows():
    print(f'{r["group_removed"]:<28s} {r["n_removed"]:>8d} '
          f'{r["n_remaining"]:>8d} '
          f'{r["accuracy"]:>10.4f} {r["delta_accuracy"]:>+12.4f}')

Interpreting the Ablation Results¶

Group contributions:

  • Removing context-informed features: This is equivalent to the Exp 2A → 2B comparison from the main experiments. The accuracy drop here should be consistent with the Δ Hard values in Table 8 (though not identical, since Table 8 reports 5-fold CV means and this is fold 0 only).

  • Removing relationship features: The 22 WordNet relationship features (20 booleans + path similarity + shared synset count) capture taxonomic connections between definition and answer. A large accuracy drop when removing them would confirm that structural WordNet relationships provide information that embedding-based cosine similarity alone cannot capture.

  • Removing surface features: The 4 orthographic features (edit distance, length ratio, shared first letter, character overlap) capture shallow string-level patterns. If removing them has minimal impact, the model primarily relies on semantic and structural features — which is desirable for a study of semantic misdirection.

Connection to the misdirection hypothesis:

The ablation quantifies the marginal contribution of each feature group. The relationship between the context-informed group's contribution and the relationship group's contribution reveals whether the model's success on the harder task comes primarily from exploiting the contextual shift (misdirection-related) or from leveraging WordNet structure (taxonomy-related).
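Because the groups differ in size (6, 22, and 4 features), the raw accuracy drops are not directly comparable; dividing each drop by the number of removed features gives a rough per-feature contribution. A minimal sketch of that normalization, using a stand-in DataFrame with the same columns as `ablation_df` above (the delta values here are made-up placeholders, not actual results):

```python
import pandas as pd

# Stand-in for the ablation_df produced above; delta_accuracy values
# are hypothetical, for demonstration only.
ablation_df = pd.DataFrame({
    'group_removed': ['Context-Informed', 'Relationship', 'Surface'],
    'n_removed': [6, 22, 4],
    'delta_accuracy': [-0.030, -0.045, -0.008],
})

# Accuracy lost per removed feature (negate delta so larger = more important).
ablation_df['delta_per_feature'] = (
    -ablation_df['delta_accuracy'] / ablation_df['n_removed']
)
print(ablation_df[['group_removed', 'n_removed', 'delta_per_feature']]
      .sort_values('delta_per_feature', ascending=False)
      .to_string(index=False))
```

Under these placeholder numbers the context-informed group contributes most per feature even though the relationship group's total drop is larger — the kind of distinction the raw table can obscure.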


6. Step 11 — Sensitivity Analysis: Learning Curve¶

We evaluate how the model's performance scales with training set size (design doc Section 10.4). The learning curve plots test accuracy against the fraction of fold 0's training data used, answering:

  • Has the model saturated? If the curve plateaus, more data would not help. If it is still rising, the full dataset run (SAMPLE_MODE = False) may yield meaningfully higher accuracy.
  • Does the model overfit at small sizes? A large gap between train and test accuracy at small sizes that closes at larger sizes indicates classic variance-dominated behavior.
  • Is the task fundamentally hard? If test accuracy plateaus well below 1.0 even with all data, the cosine-similarity distractors pose a genuine challenge that more data alone cannot solve.
In [ ]:
# ============================================================
# Learning Curve: vary training set size, evaluate on fold 0 test
# ============================================================
# We subsample the fold 0 training set at 8 fractions from 10%
# to 100%, retrain RF with the same hyperparameters at each size,
# and record both train and test accuracy. The test set is always
# the full fold 0 test set (held constant).

train_fractions = np.array([0.10, 0.20, 0.30, 0.40, 0.50,
                            0.65, 0.80, 1.00])
n_train_full = len(y_train)

lc_rows = []

print(f'Training set sizes (fold 0 train = {n_train_full:,} rows):')
print(f'{"Fraction":>10s} {"N_train":>10s} {"Train Acc":>12s} {"Test Acc":>12s}')
print('-' * 48)

for frac in train_fractions:
    n_subset = int(n_train_full * frac)

    if frac < 1.0:
        # Stratified subsample of the training set to preserve
        # label balance at each size.
        rng = np.random.RandomState(RANDOM_SEED)
        pos_idx = np.where(y_train == 1)[0]
        neg_idx = np.where(y_train == 0)[0]
        n_pos = n_subset // 2
        n_neg = n_subset - n_pos
        chosen_pos = rng.choice(pos_idx, size=min(n_pos, len(pos_idx)),
                                replace=False)
        chosen_neg = rng.choice(neg_idx, size=min(n_neg, len(neg_idx)),
                                replace=False)
        subset_idx = np.concatenate([chosen_pos, chosen_neg])
        X_tr_sub = X_train[subset_idx]
        y_tr_sub = y_train[subset_idx]
    else:
        X_tr_sub = X_train
        y_tr_sub = y_train

    rf_lc = RandomForestClassifier(
        random_state=RANDOM_SEED,
        n_jobs=-1,
        **fold0_best_params,
    )
    rf_lc.fit(X_tr_sub, y_tr_sub)

    train_acc = accuracy_score(y_tr_sub, rf_lc.predict(X_tr_sub))
    test_acc = accuracy_score(y_test, rf_lc.predict(X_test))

    lc_rows.append({
        'fraction': frac,
        'n_train': len(y_tr_sub),
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
    })

    print(f'{frac:>10.0%} {len(y_tr_sub):>10,d} '
          f'{train_acc:>12.4f} {test_acc:>12.4f}')

lc_df = pd.DataFrame(lc_rows)

# ============================================================
# Plot: Learning Curve
# ============================================================
fig, ax = plt.subplots(figsize=(8, 5))

ax.plot(lc_df['n_train'], lc_df['train_accuracy'],
        'o-', color='#4472C4', linewidth=2, markersize=6,
        label='Train accuracy')
ax.plot(lc_df['n_train'], lc_df['test_accuracy'],
        's-', color='#C0504D', linewidth=2, markersize=6,
        label='Test accuracy (fold 0)')

ax.set_xlabel('Training Set Size', fontsize=11)
ax.set_ylabel('Accuracy', fontsize=11)
ax.set_title(f'Learning Curve \u2014 {best_model_name} (Exp 2A, Fold 0)',
             fontsize=12, fontweight='bold')
ax.legend(fontsize=10, loc='lower right')
ax.grid(alpha=0.3)

# Add percentage labels on the x-axis as secondary ticks
ax2 = ax.twiny()
ax2.set_xlim(ax.get_xlim())
pct_ticks = lc_df['n_train'].values
pct_labels = [f'{f:.0%}' for f in lc_df['fraction']]
ax2.set_xticks(pct_ticks)
ax2.set_xticklabels(pct_labels, fontsize=8)
ax2.set_xlabel('Fraction of Training Data', fontsize=10)

fig.tight_layout()

fig_path = FIGURES_DIR / 'sensitivity_learning_curve.png'
fig.savefig(fig_path, dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
print(f'\nSaved to {fig_path}')
plt.show()

Interpreting the Learning Curve¶

Key observations:

  • Train–test gap: A large gap between train and test accuracy (especially at small training sizes) indicates overfitting — the model memorizes training examples rather than learning generalizable patterns. As training size increases, the gap should narrow as the model is forced to learn broader patterns.

  • Plateau behavior: If test accuracy flattens well before 100% of the data, the model has effectively saturated — adding more training data provides diminishing returns. This suggests the limiting factor is the features (or the intrinsic difficulty of the task), not the amount of data.

  • Continued improvement: If test accuracy is still rising at 100%, the full-dataset run (SAMPLE_MODE = False) is likely to yield better results. This would also suggest that the model benefits from seeing more diverse definition–answer pairs.

Implications for the misdirection study:

The learning curve helps contextualize the SAMPLE_MODE results. If test accuracy has not yet plateaued, the sample-mode results underestimate the model's true capability, and the Δ Hard comparison may change with more data. If the curve has plateaued, the sample-mode results are representative of what the full dataset would produce.
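As a cross-check on the manual subsampling loop, scikit-learn's `learning_curve` utility computes the same kind of curve with internal cross-validation. A minimal sketch on synthetic data (the estimator settings and sizes are illustrative, not the notebook's tuned fold-0 model):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)

# With cv=5, each training split has 1600 rows; train_sizes are fractions
# of that split, so the absolute sizes come out to 160, 400, 800, 1600.
sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(n_estimators=100, random_state=0),
    X, y,
    train_sizes=np.array([0.10, 0.25, 0.50, 1.00]),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

for n, tr, te in zip(sizes, train_scores.mean(axis=1), test_scores.mean(axis=1)):
    print(f'{n:>5d}  train={tr:.3f}  test={te:.3f}')
```

Unlike the manual loop, `learning_curve` averages over CV folds rather than holding a single test set constant, so it trades the fixed-test-set comparison for lower-variance estimates.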


Summary (Steps 9–11)¶

Step 9 — Summary Results Table:

  • Compiled Table 8 (design doc Section 10.2) comparing all three model families across both datasets and both feature-set conditions.
  • All models achieve high accuracy on the easy dataset (random distractors), with negligible Δ Easy — confirming that context-informed features add little value when the task is trivial.
  • On the harder dataset (cosine-similarity distractors), accuracy drops substantially and Δ Hard is positive for all models — context-informed features help classification, meaning the classifier can extract useful signal from the definition's contextual shift.
  • Saved to outputs/results_summary.csv and outputs/figures/results_summary_table.png.

Step 10a — Retrain Best Model:

  • Identified the best model from the harder dataset Exp 2A results.
  • Retrained on fold 0 with the best hyperparameters from NB 07.
  • Verified that the retrained accuracy matches the NB 07 fold 0 result.

Step 10b — Feature Importance:

  • Computed both Gini importance (from the trained RF's internal structure) and permutation importance (from test-set accuracy degradation when features are shuffled).
  • Produced horizontal bar charts of the top 15 features by each method, color-coded by feature group (context-informed, relationship, surface).
  • Saved to outputs/figures/feature_importance_gini.png and outputs/figures/feature_importance_permutation.png.

Step 10c — Group-Level Ablation:

  • Retrained the RF on fold 0 after removing each feature group (context-informed, relationship, surface) one at a time.
  • Reported accuracy change for each ablation relative to the full-model baseline.
  • The context-informed ablation is consistent with the Exp 2A vs. 2B comparison from the main experiments.
  • Saved to outputs/ablation_results.csv.

Step 11 — Sensitivity Analysis (Learning Curve):

  • Varied training set size from 10% to 100% of fold 0's training data, retraining the RF at each size.
  • Plotted train and test accuracy vs. training set size.
  • Saved to outputs/figures/sensitivity_learning_curve.png.

Next step: Failure analysis (Step 12) — examining specific misclassified examples to understand where and why the model fails.


7. Step 12 — Failure Analysis¶

This section examines the misclassified examples from the retrained Random Forest on fold 0's test set (design doc Section 10.6). The goals are:

  1. Quantify the error rate and breakdown (false positives vs. false negatives).
  2. Inspect specific misclassified examples to understand what features confused the model.
  3. Categorize failure patterns into interpretable groups.
  4. Suggest targeted improvements for each failure category.

Understanding where the model fails is more informative than the overall accuracy number — it reveals which aspects of the definition–answer relationship are hardest to capture with the current feature set.

In [ ]:
# ============================================================
# Identify all misclassified examples from fold 0 test set
# ============================================================
# The test predictions and probabilities were stored in df_test
# during the retraining step (Section 3). Here we extract the
# misclassified subset and build a DataFrame for analysis.

misclassified_mask = (df_test['y_pred'] != df_test['label']).values
df_misclassified = df_test[misclassified_mask].copy()

n_misclassified = len(df_misclassified)
n_total = len(df_test)
error_rate = n_misclassified / n_total

print(f'Fold 0 test set: {n_total:,} examples')
print(f'Misclassified:   {n_misclassified:,} ({error_rate:.2%} error rate)')
print(f'Correct:         {n_total - n_misclassified:,} '
      f'({1 - error_rate:.2%})')

# --- Error type breakdown ---
fp_mask = ((df_misclassified['label'] == 0) &
           (df_misclassified['y_pred'] == 1))
fn_mask = ((df_misclassified['label'] == 1) &
           (df_misclassified['y_pred'] == 0))
n_fp = fp_mask.sum()
n_fn = fn_mask.sum()

print(f'\nFalse positives (distractor → predicted real): {n_fp:,} '
      f'({n_fp / max(n_misclassified, 1):.1%} of errors)')
print(f'False negatives (real pair → predicted distractor): {n_fn:,} '
      f'({n_fn / max(n_misclassified, 1):.1%} of errors)')

# --- Predicted probability distribution ---
print(f'\nPredicted P(real) for misclassified examples:')
print(df_misclassified['y_prob'].describe().round(4).to_string())

# --- Build export DataFrame ---
# Columns: definition_wn, answer_wn, label, predicted_label,
# predicted_probability, metadata, then all 32 features.
export_cols = ['definition_wn', 'answer_wn', 'label', 'y_pred', 'y_prob']
meta_extras = ['definition', 'answer', 'surface',
               'def_num_usable_synsets', 'ans_num_usable_synsets',
               'distractor_source']
for col in meta_extras:
    if col in df_misclassified.columns:
        export_cols.append(col)
export_cols += HARDER_FEATURE_COLS

df_misc_export = df_misclassified[export_cols].copy()
df_misc_export = df_misc_export.rename(columns={
    'y_pred': 'predicted_label',
    'y_prob': 'predicted_probability',
})

print(f'\nMisclassified DataFrame: {df_misc_export.shape[0]:,} rows × '
      f'{df_misc_export.shape[1]} columns')
In [ ]:
# ============================================================
# Detailed examination of specific misclassified examples
# ============================================================
# We select three representative examples spanning different error
# types and confidence levels. For each, we show the definition–
# answer pair, the true and predicted labels, the predicted
# probability, and the values of the top 10 most important features
# (by Gini importance). Features are flagged "←" if the example's
# value is closer to the *wrong* class's training mean than its own
# class's mean — suggesting this feature contributed to the error.

# --- Training-set class means for comparison ---
train_df = df[train_mask]
class_means = {
    0: train_df[train_df['label'] == 0][HARDER_FEATURE_COLS].mean(),
    1: train_df[train_df['label'] == 1][HARDER_FEATURE_COLS].mean(),
}

# Top 10 features by Gini importance
top10_feats = gini_imp.head(10).index.tolist()


def display_misclassified_example(row, title):
    """Pretty-print a single misclassified example with feature analysis."""
    true_lbl = int(row['label'])
    pred_lbl = int(row['y_pred'])
    lbl_name = {0: 'distractor', 1: 'real pair'}

    print(f'\n{"=" * 70}')
    print(f'  {title}')
    print(f'{"=" * 70}')
    print(f'  Definition (WN): {row["definition_wn"]}')
    print(f'  Answer (WN):     {row["answer_wn"]}')
    if 'surface' in row.index and pd.notna(row.get('surface')):
        srf = str(row['surface'])
        print(f'  Clue surface:    {srf[:80]}{"..." if len(srf) > 80 else ""}')
    print(f'  True label:      {true_lbl} ({lbl_name[true_lbl]})')
    print(f'  Predicted:       {pred_lbl} ({lbl_name[pred_lbl]})')
    print(f'  P(real):         {row["y_prob"]:.4f}')
    if 'def_num_usable_synsets' in row.index:
        print(f'  Def #synsets:    {int(row["def_num_usable_synsets"])}')
    if 'ans_num_usable_synsets' in row.index:
        print(f'  Ans #synsets:    {int(row["ans_num_usable_synsets"])}')

    print(f'\n  Top 10 features by importance:')
    print(f'  {"Feature":<35s} {"Value":>8s} {"Mean(real)":>10s} '
          f'{"Mean(dist)":>10s} {"Flag":>6s}')
    print(f'  {"-" * 73}')

    unusual_count = 0
    for feat in top10_feats:
        val = row[feat]
        m_real = class_means[1][feat]
        m_dist = class_means[0][feat]

        # Flag: value is closer to the wrong class's mean than
        # its own class's mean.
        dist_own = abs(val - class_means[true_lbl][feat])
        dist_opp = abs(val - class_means[1 - true_lbl][feat])
        flag = '  ←' if dist_opp < dist_own else ''
        if flag:
            unusual_count += 1

        print(f'  {feat:<35s} {val:>8.4f} {m_real:>10.4f} '
              f'{m_dist:>10.4f} {flag:>6s}')

    print(f'\n  ({unusual_count}/10 top features closer to wrong class mean)')


# --- Select 3 representative examples ---
fps = df_misclassified[df_misclassified['label'] == 0]
fns = df_misclassified[df_misclassified['label'] == 1]

# Example 1: Most confident false positive — the model was most
# certain this distractor was a real pair.
if len(fps) > 0:
    ex1_idx = fps['y_prob'].idxmax()
    display_misclassified_example(
        df_misclassified.loc[ex1_idx],
        'EXAMPLE 1: Most Confident False Positive'
    )

# Example 2: Most confident false negative — the model was most
# certain this real pair was a distractor.
if len(fns) > 0:
    ex2_idx = fns['y_prob'].idxmin()
    display_misclassified_example(
        df_misclassified.loc[ex2_idx],
        'EXAMPLE 2: Most Confident False Negative'
    )

# Example 3: Near decision boundary — the model was least certain
# about its (wrong) prediction. (Guarded like Examples 1 and 2, in
# case there are no misclassified examples at all.)
if n_misclassified > 0:
    ex3_idx = (df_misclassified['y_prob'] - 0.5).abs().idxmin()
    display_misclassified_example(
        df_misclassified.loc[ex3_idx],
        'EXAMPLE 3: Near Decision Boundary'
    )
In [ ]:
# ============================================================
# Categorize misclassifications into failure patterns
# ============================================================
# We assign each misclassified example to one or more categories
# based on feature-level diagnostics. Categories are not mutually
# exclusive — an example may exhibit multiple failure patterns.
# The three categories are guided by the design doc (Section 10.6)
# but confirmed against the actual data distributions.

category_assignments = pd.DataFrame(index=df_misclassified.index)

# ------------------------------------------------------------------
# Category 1: Polysemy Confusion
# ------------------------------------------------------------------
# High polysemy makes embedding averaging unreliable: the allsense
# embedding blends too many unrelated meanings, and common/obscure
# may not correspond to the sense relevant to this clue.
# Criterion: def_num_usable_synsets above the 75th percentile of
# the test set (definition has unusually many senses).

if 'def_num_usable_synsets' in df_misclassified.columns:
    synset_p75 = df_test['def_num_usable_synsets'].quantile(0.75)
    category_assignments['polysemy_confusion'] = (
        df_misclassified['def_num_usable_synsets'] > synset_p75
    )
    correct_subset = df_test[~misclassified_mask]
    print('CATEGORY 1: Polysemy Confusion')
    print(f'  Criterion: def_num_usable_synsets > {synset_p75:.0f} '
          f'(75th percentile of test set)')
    print(f'  Median synsets (misclassified): '
          f'{df_misclassified["def_num_usable_synsets"].median():.0f}')
    print(f'  Median synsets (correct):       '
          f'{correct_subset["def_num_usable_synsets"].median():.0f}')
else:
    category_assignments['polysemy_confusion'] = False
    print('CATEGORY 1: Polysemy Confusion')
    print('  (def_num_usable_synsets not available — skipping)')

cat1 = df_misclassified[category_assignments['polysemy_confusion']]
print(f'  Count: {len(cat1):,} / {n_misclassified:,} '
      f'({len(cat1) / max(n_misclassified, 1):.1%})')
print(f'  FP: {(cat1["label"] == 0).sum():,}  |  '
      f'FN: {(cat1["label"] == 1).sum():,}')

# ------------------------------------------------------------------
# Category 2: Semantic Near-Miss
# ------------------------------------------------------------------
# The context-informed cosine similarity between definition and
# answer is atypical for the true class — the pair is semantically
# borderline. We average the 3 context-informed def→answer cosines
# (cos_w1clue_w2all, cos_w1clue_w2common, cos_w1clue_w2obscure).
# FPs: distractor has unusually high similarity (looks real).
# FNs: real pair has unusually low similarity (looks fake).

ci_answer_cols = [c for c in CONTEXT_INFORMED_COLS if 'w2' in c]
avg_ci_cos_misc = df_misclassified[ci_answer_cols].mean(axis=1)

correct_df = df_test[~misclassified_mask]
real_ci_median = (correct_df[correct_df['label'] == 1][ci_answer_cols]
                  .mean(axis=1).median())
dist_ci_median = (correct_df[correct_df['label'] == 0][ci_answer_cols]
                  .mean(axis=1).median())

category_assignments['semantic_near_miss'] = (
    ((df_misclassified['label'] == 0) & (avg_ci_cos_misc > real_ci_median)) |
    ((df_misclassified['label'] == 1) & (avg_ci_cos_misc < dist_ci_median))
)

cat2 = df_misclassified[category_assignments['semantic_near_miss']]
fp_near = ((cat2['label'] == 0)).sum()
fn_near = ((cat2['label'] == 1)).sum()

print(f'\nCATEGORY 2: Semantic Near-Miss')
print(f'  Criterion: avg context-informed def→answer cosine')
print(f'    FP threshold: > {real_ci_median:.4f} (real-pair median)')
print(f'    FN threshold: < {dist_ci_median:.4f} (distractor median)')
print(f'  Count: {len(cat2):,} / {n_misclassified:,} '
      f'({len(cat2) / max(n_misclassified, 1):.1%})')
print(f'  FP: {fp_near:,}  |  FN: {fn_near:,}')

# ------------------------------------------------------------------
# Category 3: Surface Feature Artifact
# ------------------------------------------------------------------
# Surface features push the prediction in the wrong direction.
# FPs: distractor has high orthographic similarity to definition
#      (low edit distance or high character overlap).
# FNs: real pair has low orthographic similarity
#      (high edit distance or low character overlap).

ed_p25 = df_test['surface_edit_distance'].quantile(0.25)
ed_p75 = df_test['surface_edit_distance'].quantile(0.75)
co_p25 = df_test['surface_char_overlap_ratio'].quantile(0.25)
co_p75 = df_test['surface_char_overlap_ratio'].quantile(0.75)

category_assignments['surface_artifact'] = (
    ((df_misclassified['label'] == 0) &
     ((df_misclassified['surface_edit_distance'] <= ed_p25) |
      (df_misclassified['surface_char_overlap_ratio'] >= co_p75))) |
    ((df_misclassified['label'] == 1) &
     ((df_misclassified['surface_edit_distance'] >= ed_p75) |
      (df_misclassified['surface_char_overlap_ratio'] <= co_p25)))
)

cat3 = df_misclassified[category_assignments['surface_artifact']]
print(f'\nCATEGORY 3: Surface Feature Artifact')
print(f'  Criterion: surface features in wrong direction for true class')
print(f'    FP: edit_dist ≤ {ed_p25:.0f} (p25) or '
      f'char_overlap ≥ {co_p75:.3f} (p75)')
print(f'    FN: edit_dist ≥ {ed_p75:.0f} (p75) or '
      f'char_overlap ≤ {co_p25:.3f} (p25)')
print(f'  Count: {len(cat3):,} / {n_misclassified:,} '
      f'({len(cat3) / max(n_misclassified, 1):.1%})')
print(f'  FP: {(cat3["label"] == 0).sum():,}  |  '
      f'FN: {(cat3["label"] == 1).sum():,}')

# ------------------------------------------------------------------
# Overlap and coverage
# ------------------------------------------------------------------
n_cat1 = category_assignments['polysemy_confusion'].sum()
n_cat2 = category_assignments['semantic_near_miss'].sum()
n_cat3 = category_assignments['surface_artifact'].sum()

any_cat = category_assignments.any(axis=1)
n_any = any_cat.sum()
n_none = n_misclassified - n_any
n_multi = (category_assignments.sum(axis=1) > 1).sum()

print(f'\n{"—" * 60}')
print(f'Category overlap:')
print(f'  Cat 1 ∩ Cat 2: '
      f'{(category_assignments["polysemy_confusion"] & category_assignments["semantic_near_miss"]).sum()}')
print(f'  Cat 1 ∩ Cat 3: '
      f'{(category_assignments["polysemy_confusion"] & category_assignments["surface_artifact"]).sum()}')
print(f'  Cat 2 ∩ Cat 3: '
      f'{(category_assignments["semantic_near_miss"] & category_assignments["surface_artifact"]).sum()}')
print(f'\nCategorized (≥1): {n_any:,} '
      f'({n_any / max(n_misclassified, 1):.1%})')
print(f'Multiple categories: {n_multi:,}')
print(f'Uncategorized: {n_none:,} '
      f'({n_none / max(n_misclassified, 1):.1%})')

# ------------------------------------------------------------------
# Suggested future improvements
# ------------------------------------------------------------------
print(f'\n\n{"=" * 60}')
print('Suggested Future Improvements')
print(f'{"=" * 60}')
print("""
1. POLYSEMY CONFUSION → Word Sense Disambiguation (WSD)
   Use a WSD model (e.g., EWISER or BEM) to identify the
   contextually appropriate WordNet synset before computing the
   definition embedding. This replaces the allsense average with
   a targeted single-sense embedding, reducing noise from
   irrelevant senses.

2. SEMANTIC NEAR-MISS → Cross-Encoder Reranking
   Add a cross-encoder feature that jointly encodes the definition
   and answer in a single input sequence, producing a more nuanced
   similarity score than cosine similarity between independently
   computed embeddings.

3. SURFACE FEATURE ARTIFACT → Feature Regularization
   Apply L1 regularization or recursive feature elimination to
   reduce the model's reliance on orthographic features when
   semantic features provide sufficient signal. Alternatively,
   train without surface features on subsets where they dominate
   the prediction.
""")
In [ ]:
# ============================================================
# Save failure analysis outputs
# ============================================================

# --- Save misclassified examples CSV ---
csv_path = OUTPUT_DIR / 'misclassified_examples.csv'
df_misc_export.to_csv(csv_path, index=False)
print(f'Saved {len(df_misc_export):,} misclassified examples to {csv_path}')

# --- Generate and save failure_analysis.md ---
# A standalone markdown summary for readers who want the key findings
# without opening the notebook.
md_lines = [
    '# Failure Analysis — Step 12',
    '',
    f'**Model:** {best_model_name} (Exp 2A, {len(HARDER_FEATURE_COLS)} features)',
    f'**Evaluation set:** Fold 0 test set ({n_total:,} examples)',
    f'**Run mode:** SAMPLE_MODE = {SAMPLE_MODE}',
    '',
    '## Error Summary',
    '',
    f'- Total misclassified: **{n_misclassified:,}** '
    f'({error_rate:.2%} error rate)',
    f'- False positives (distractor predicted as real): {n_fp:,}',
    f'- False negatives (real pair predicted as distractor): {n_fn:,}',
    '',
    '## Failure Categories',
    '',
]

cat_names_for_md = {
    'polysemy_confusion': 'Polysemy Confusion',
    'semantic_near_miss': 'Semantic Near-Miss',
    'surface_artifact': 'Surface Feature Artifact',
}

for col, name in cat_names_for_md.items():
    count = int(category_assignments[col].sum())
    pct = count / max(n_misclassified, 1)
    subset = df_misclassified[category_assignments[col]]
    n_fp_cat = int((subset['label'] == 0).sum())
    n_fn_cat = int((subset['label'] == 1).sum())
    md_lines.append(f'### {name}')
    md_lines.append(
        f'- Count: {count:,} / {n_misclassified:,} ({pct:.1%} of errors)')
    md_lines.append(f'- False positives: {n_fp_cat:,} | '
                    f'False negatives: {n_fn_cat:,}')
    md_lines.append('')

md_lines += [
    '## Suggested Improvements',
    '',
    '1. **Polysemy Confusion → Word Sense Disambiguation (WSD):**'
    ' Use a WSD model to select the contextually appropriate synset'
    ' before computing embeddings, replacing the allsense average.',
    '',
    '2. **Semantic Near-Miss → Cross-Encoder Reranking:**'
    ' Add a cross-encoder feature that jointly encodes definition'
    ' and answer, capturing nuanced differences between near-synonyms.',
    '',
    '3. **Surface Feature Artifact → Feature Regularization:**'
    ' Apply L1 regularization or feature elimination to reduce'
    ' reliance on orthographic features when semantic features suffice.',
    '',
    '---',
    '',
    '*See `misclassified_examples.csv` for the full misclassified'
    ' DataFrame.*',
    '*See `08_results_and_evaluation.ipynb` for detailed analysis'
    ' and examples.*',
]

md_path = OUTPUT_DIR / 'failure_analysis.md'
md_path.write_text('\n'.join(md_lines))
print(f'Saved failure analysis summary to {md_path}')

Interpreting the Failure Analysis¶

What the failure patterns reveal:

The three failure categories provide complementary insights into the model's limitations:

  • Polysemy confusion is an inherent limitation of the allsense embedding approach. When a definition has many WordNet senses (e.g., "plant" can be a factory, a living organism, or a deceptive action), the averaged embedding dilutes the relevant meaning. This is exactly the kind of semantic ambiguity that cryptic crossword setters exploit — the definition's dictionary meaning is hidden among its other senses.

  • Semantic near-misses are cases where the embedding-based cosine similarity genuinely cannot distinguish the true answer from the distractor. This is by design: the harder dataset uses top-100 cosine-similarity distractors (Decision 6), so some distractors are nearly as good as the true answer by embedding measures alone. The model must rely on WordNet relationship and surface features to break these ties.

  • Surface feature artifacts reveal cases where the model makes predictions based on orthographic patterns rather than semantic content. While surface features like shared first letters and character overlap can be genuinely informative (crossword answers often share roots with their definitions), over-reliance on these features means the model is not learning about semantic misdirection.

Connection to the misdirection hypothesis:

The relative prevalence of these categories tells us where the current approach struggles most. If polysemy confusion dominates, better sense disambiguation would help. If semantic near-misses dominate, the embedding model's representational capacity is the bottleneck. If surface artifacts dominate, the model may need more semantic features or regularization to focus on meaning rather than form.


Notebook Summary¶

What this notebook accomplished (Steps 9–12):

This notebook completed the evaluation pipeline for the supervised learning component:

  1. Step 9 — Summary Results Table: Compiled Table 8 comparing all three model families (KNN, Logistic Regression, Random Forest) across both datasets (easy, harder) and both feature conditions (with/without context-informed features). Δ Easy ≈ 0 confirms context features are redundant on easy tasks; Δ Hard > 0 confirms they help on the harder task. Saved to outputs/results_summary.csv.

  2. Step 10 — Feature Importance & Ablation: Computed Gini and permutation importance for the best model (Random Forest) on Exp 2A, and performed group-level ablation by removing each feature group (context-informed, relationship, surface) one at a time. Saved to outputs/figures/feature_importance_*.png and outputs/ablation_results.csv.

  3. Step 11 — Sensitivity Analysis: Generated a learning curve showing how test accuracy scales with training set size. Saved to outputs/figures/sensitivity_learning_curve.png.

  4. Step 12 — Failure Analysis: Examined misclassified examples from the retrained RF on fold 0. Categorized failures into polysemy confusion, semantic near-miss, and surface feature artifact patterns. Suggested targeted improvements for each. Saved to outputs/failure_analysis.md and outputs/misclassified_examples.csv.

Key takeaways:

  • Best model: Random Forest achieves the highest accuracy on the harder dataset (Exp 2A), consistent with its ability to capture nonlinear interactions among the 32 features.
  • Δ Hard > 0 for all models: context-informed features improve classification on the harder task, meaning the embedding shift caused by clue context carries discriminative signal — even though the retrieval analysis (NB 04) shows that same shift degrades answer retrieval. The classifier learns to exploit the misdirection signal rather than being fooled by it.
  • Feature importance: The ranking of features by Gini and permutation importance reveals which aspects of the definition–answer relationship the model relies on most.
  • Failure patterns: Polysemy confusion, semantic near-misses, and surface artifacts represent distinct failure modes, each suggesting a different avenue for improvement.

Important caveat — sample mode results:

These results were generated with SAMPLE_MODE = True (20,000 rows) for fast iteration. The final results will be produced with the full dataset on Great Lakes (SAMPLE_MODE = False). Learning curve analysis in Step 11 indicates whether sample-mode results are representative.

What comes next:

  • Step 13: Report writing. The results, figures, and analysis from this notebook feed directly into the final report (Sections 10.1–10.6 of the design document).