07 — Experiments: Harder Dataset¶

Primary author: Victoria

Builds on:

  • 05_dataset_construction.ipynb (Victoria — harder dataset with cosine-similarity-based distractors, balanced 1:1, 32 features after removing the 15 context-free cosine features)
  • 06_experiments_easy.ipynb (Victoria — experiment scaffolding: run_experiment(), hyperparameter grids, GroupKFold setup, results summary formatting)
  • 03_feature_engineering.ipynb (Victoria — 47-feature computation logic, feature group column lists extracted to scripts/feature_utils.py per Decision 18)
  • Hans's Hans_Supervised_Learning_Models.ipynb (KNN, LogReg, RF scaffolding with 5-fold stratified CV — adapted to GroupKFold per Decision 7, expanded features, and restructured for the A/B ablation design)

Prompt engineering: Victoria
AI assistance: Claude Code (Anthropic)
Environment: Local (CPU only)

This notebook implements PLAN.md Step 8 — running Experiments 2A and 2B on the harder (cosine-similarity distractor) dataset (design doc Section 8.3).

This is where the misdirection hypothesis is tested via classification. Unlike the easy dataset (NB 06), where random distractors are trivially distinguishable, the harder dataset's distractors are chosen by cosine similarity to the definition (Decision 6). This forces the model to rely on subtler features to distinguish real from distractor pairs. The A/B ablation — removing the 6 context-informed features — directly tests whether the clue's surface reading helps or hurts classification.

Experiment 2A — All 32 features¶

Three model families (KNN, Logistic Regression, Random Forest) are trained on 32 features: 6 context-informed + 22 relationship + 4 surface. The 15 context-free cosine features have been removed because they are artifacts of the cosine-similarity-based distractor construction (Decision 6).

Experiment 2B — 26 features (context-informed removed)¶

The 6 context-informed cosine features (involving word1_clue_context) are removed, leaving 26 features (22 relationship + 4 surface). Comparing 2A vs. 2B measures whether clue context helps or hurts classification when the task is genuinely difficult:

  • If Exp 2A < Exp 2B: context-informed features hurt classification — the clue's surface reading shifts the definition embedding away from the true answer, supporting the misdirection hypothesis through the classifier lens.
  • If Exp 2A > Exp 2B: context-informed features help — the classifier extracts useful signal from the contextual shift that outweighs any misdirection effect.

Input: data/dataset_harder.parquet (Step 7)
Output: outputs/results_harder.csv, saved hyperparameters


1. Configuration¶

SAMPLE_MODE: Set to True (default) for fast iteration with 20,000 rows. Set to False for final runs on the full dataset. The sample is stratified by label to preserve the 1:1 balance.

In [1]:
import sys
import warnings

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold

warnings.filterwarnings('ignore', category=FutureWarning)

# --- Environment Auto-Detection ---
# Same pattern as prior notebooks: detect Colab vs. local / Great Lakes.
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/'
                        'Milestone II - NLP Cryptic Crossword Clues/'
                        'clue_misdirection')
else:
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
SCRIPTS_DIR = PROJECT_ROOT / 'scripts'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Add scripts/ to sys.path so feature_utils is importable ---
# Decision 18: feature column lists are defined in feature_utils.py
# (extracted from NB 03) so all downstream notebooks use the same names.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

from feature_utils import (
    CONTEXT_INFORMED_COLS,
    RELATIONSHIP_COLS,
    SURFACE_COLS,
)

# --- Experiment parameters ---
RANDOM_SEED = 42
N_FOLDS = 5
SAMPLE_MODE = True   # <-- Set to False for final runs
SAMPLE_SIZE = 20_000

# --- Feature sets ---
# The harder dataset has already had the 15 context-free cosine features
# removed (Decision 6: they are artifacts of the cosine-similarity-based
# distractor construction). The remaining 32 features are:
#   6 context-informed + 22 relationship + 4 surface
HARDER_FEATURE_COLS = (
    CONTEXT_INFORMED_COLS + list(RELATIONSHIP_COLS) + SURFACE_COLS
)

# Exp 2A: all 32 harder features
# Exp 2B: remove the 6 context-informed cosine features (those involving
# word1_clue_context) → 26 features. These features capture how the
# definition's meaning shifts when embedded within the clue sentence.
# Removing them tests whether clue context helps or hurts classification
# on the harder task — the central misdirection question.
EXP_2B_COLS = [c for c in HARDER_FEATURE_COLS if c not in CONTEXT_INFORMED_COLS]

print(f'Environment: {"Google Colab" if IS_COLAB else "Local / Great Lakes"}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Output directory: {OUTPUT_DIR}')
print(f'\nRandom seed: {RANDOM_SEED}')
print(f'CV folds: {N_FOLDS}')
print(f'Sample mode: {SAMPLE_MODE}'
      f'{f" ({SAMPLE_SIZE:,} rows)" if SAMPLE_MODE else " (full dataset)"}')
print(f'\nFeature sets:')
print(f'  Exp 2A (all harder features):        {len(HARDER_FEATURE_COLS)}')
print(f'  Context-informed (to remove for 2B):  {len(CONTEXT_INFORMED_COLS)}')
print(f'  Exp 2B (relationship + surface only): {len(EXP_2B_COLS)}')
Environment: Local / Great Lakes
Project root: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection
Data directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data
Output directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs

Random seed: 42
CV folds: 5
Sample mode: True (20,000 rows)

Feature sets:
  Exp 2A (all harder features):        32
  Context-informed (to remove for 2B):  6
  Exp 2B (relationship + surface only): 26

2. Load the Harder Dataset¶

In [2]:
# ============================================================
# Load dataset_harder.parquet (Step 7 output)
# ============================================================
# The harder dataset uses cosine-similarity-based distractors
# (Decision 6): for each real definition, distractor answers are
# sampled from the top-k most similar answer words by cosine
# similarity between word1_average and word2_average. This makes
# the classification task genuinely difficult — distractors are
# semantically plausible, not random. The 15 context-free cosine
# features have already been removed in NB 05 because they are
# artifacts of the cosine-based construction.
dataset_path = DATA_DIR / 'dataset_harder.parquet'
assert dataset_path.exists(), (
    f'Missing input file: {dataset_path}\n'
    f'Run 05_dataset_construction.ipynb first to produce this file.'
)

df = pd.read_parquet(dataset_path)
print(f'Loaded dataset_harder.parquet: {len(df):,} rows × {len(df.columns)} columns')

# --- Validate expected columns ---
# The harder dataset should contain the 32 features (6 context-informed
# + 22 relationship + 4 surface) plus metadata. It should NOT contain
# the 15 context-free cosine features.
missing_feat = [c for c in HARDER_FEATURE_COLS if c not in df.columns]
assert not missing_feat, f'Missing feature columns: {missing_feat}'
assert 'label' in df.columns, 'Missing label column'
assert 'definition_wn' in df.columns, 'Missing definition_wn column'
assert 'answer_wn' in df.columns, 'Missing answer_wn column'

# --- Sample mode ---
# When iterating quickly, take a stratified subsample to speed up
# cross-validation. Stratification preserves the 1:1 label balance.
# We sample each label group separately and concatenate, rather than
# using groupby().apply(), which can drop the grouping column.
if SAMPLE_MODE:
    sampled_parts = []
    for label_val in df['label'].unique():
        group = df[df['label'] == label_val]
        sampled_parts.append(
            group.sample(n=min(SAMPLE_SIZE // 2, len(group)),
                         random_state=RANDOM_SEED)
        )
    df = pd.concat(sampled_parts, ignore_index=True)
    print(f'\n⚠ SAMPLE MODE: subsampled to {len(df):,} rows '
          f'(set SAMPLE_MODE = False for final runs)')

# --- Summary ---
print(f'\nShape: {df.shape}')
print(f'\nLabel distribution:')
print(df['label'].value_counts().to_string())
print(f'\nUnique definition_wn values: {df["definition_wn"].nunique():,}')
print(f'Unique answer_wn values:     {df["answer_wn"].nunique():,}')

# Number of unique (definition_wn, answer_wn) pairs — this is the
# grouping unit for GroupKFold. Each pair may appear in multiple clue
# rows (different clue surfaces for the same definition–answer pair),
# and each real row has a corresponding distractor row with a different
# answer_wn. GroupKFold ensures all rows sharing the same pair stay in
# the same fold, preventing near-duplicate feature vectors from leaking
# across train/test splits.
n_unique_pairs = df.groupby(['definition_wn', 'answer_wn']).ngroups
print(f'Unique (definition_wn, answer_wn) pairs: {n_unique_pairs:,}')

# --- Validate no NaNs in feature columns ---
feat_nulls = df[HARDER_FEATURE_COLS].isnull().any()
if feat_nulls.any():
    print(f'\nWARNING: NaN values found in features:')
    print(feat_nulls[feat_nulls].to_string())
else:
    print(f'\nNo NaN values in any of the {len(HARDER_FEATURE_COLS)} feature columns ✓')
Loaded dataset_harder.parquet: 480,422 rows × 47 columns

⚠ SAMPLE MODE: subsampled to 20,000 rows (set SAMPLE_MODE = False for final runs)

Shape: (20000, 47)

Label distribution:
label
1    10000
0    10000

Unique definition_wn values: 8,151
Unique answer_wn values:     14,156
Unique (definition_wn, answer_wn) pairs: 19,112

No NaN values in any of the 32 feature columns ✓

3. GroupKFold Assignment¶

We use GroupKFold (Decision 7) rather than StratifiedKFold because multiple clue rows can share the same (definition_wn, answer_wn) pair. These rows have near-identical feature vectors (differing only in the 6 context-informed features, which depend on the specific clue surface). If they were split across train and test folds, the model would effectively see the test example during training — leaking information and inflating accuracy.

GroupKFold guarantees that all rows belonging to the same (definition_wn, answer_wn) group are assigned to the same fold. The same fold assignments are reused for both Exp 2A and Exp 2B to ensure a fair comparison.

In [3]:
# ============================================================
# Create group key and assign folds
# ============================================================
# Build a composite group key from (definition_wn, answer_wn). All rows
# sharing this pair — whether real or distractor, and across multiple
# clue surfaces — land in the same fold.
groups = df['definition_wn'].astype(str) + '|||' + df['answer_wn'].astype(str)

gkf = GroupKFold(n_splits=N_FOLDS)

# GroupKFold.split() yields (train_idx, test_idx) tuples. We only need
# the fold assignment for each row, so we iterate and record which fold
# each row's test set falls into.
df['fold'] = -1
for fold_idx, (_, test_idx) in enumerate(gkf.split(df, y=df['label'], groups=groups)):
    df.loc[df.index[test_idx], 'fold'] = fold_idx

assert (df['fold'] >= 0).all(), 'Some rows were not assigned to any fold'

# ============================================================
# Verify: no (definition_wn, answer_wn) pair spans multiple folds
# ============================================================
folds_per_group = (
    df.groupby(['definition_wn', 'answer_wn'])['fold']
      .nunique()
)
leaked_groups = folds_per_group[folds_per_group > 1]
assert len(leaked_groups) == 0, (
    f'{len(leaked_groups)} groups span multiple folds — GroupKFold failed!\n'
    f'Examples: {leaked_groups.head(5).to_dict()}'
)
print(f'GroupKFold verification passed: no (definition_wn, answer_wn) '
      f'pair spans multiple folds ✓')

# ============================================================
# Print fold sizes and label balance
# ============================================================
print(f'\n{"Fold":<6s} {"Size":>8s} {"Label=1":>10s} {"Label=0":>10s} {"% Positive":>12s}')
print('-' * 50)
for fold_idx in range(N_FOLDS):
    fold_mask = df['fold'] == fold_idx
    fold_size = fold_mask.sum()
    n_pos = (df.loc[fold_mask, 'label'] == 1).sum()
    n_neg = (df.loc[fold_mask, 'label'] == 0).sum()
    pct_pos = n_pos / fold_size * 100
    print(f'{fold_idx:<6d} {fold_size:>8,d} {n_pos:>10,d} {n_neg:>10,d} {pct_pos:>11.1f}%')

print(f'\nTotal rows: {len(df):,}')
print(f'Unique groups: {groups.nunique():,}')
GroupKFold verification passed: no (definition_wn, answer_wn) pair spans multiple folds ✓

Fold       Size    Label=1    Label=0   % Positive
--------------------------------------------------
0         4,000      2,043      1,957        51.1%
1         4,000      2,023      1,977        50.6%
2         4,000      1,975      2,025        49.4%
3         4,000      1,991      2,009        49.8%
4         4,000      1,968      2,032        49.2%

Total rows: 20,000
Unique groups: 19,112

4. Experiment Design¶

We run two experiments on the harder (cosine-similarity distractor) dataset, following the design in Section 8.3 of the design document (Table 7):

Experiment   Features                   Count   Description
----------   ------------------------   -----   ------------------------------------------------
Exp 2A       All harder features        32      6 context-informed + 22 relationship + 4 surface
Exp 2B       Context-informed removed   26      22 relationship + 4 surface

Note: the 15 context-free cosine features are excluded from both experiments (Decision 6) because they are artifacts of the cosine-similarity-based distractor construction — distractors were selected to be close to the definition in context-free embedding space, so those features would encode the construction method rather than genuine signal.

Three model families are trained under each condition:

  1. K-Nearest Neighbors (KNN) — instance-based, non-parametric. Features are scaled with StandardScaler fitted on the training fold.
  2. Logistic Regression — probabilistic, linear. Scaled. Elastic-net penalty with l1_ratio spanning pure L2 (0.0) to pure L1 (1.0); solver='saga' supports elastic-net.
  3. Random Forest — tree-based, non-linear. Scale-invariant, so no scaling applied. Uses RandomizedSearchCV due to larger grid.

Hyperparameters are tuned via inner 3-fold stratified CV within each training fold. The outer 5-fold GroupKFold (assigned in Section 3) provides the train/test split. The same folds are used for both experiments to ensure a fair A vs. B comparison (Decision 7).

Expected outcome: Accuracy will be lower than the easy dataset because cosine-similarity distractors are semantically plausible. The key question is the sign of Δ Hard (2A − 2B):

  • Δ Hard < 0: context-informed features hurt — the clue's surface reading shifts the definition embedding away from the true answer, consistent with the misdirection hypothesis.
  • Δ Hard > 0: context-informed features help — the classifier extracts useful signal from contextual shift despite misdirection.
In [4]:
import time
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)

# ============================================================
# Hyperparameter grids
# ============================================================
# Same grids as NB 06 — identical model families and search spaces
# ensure results are comparable across easy and harder datasets.
# Full grids for final runs; reduced grids when SAMPLE_MODE is True
# to keep iteration time under a few minutes.

if SAMPLE_MODE:
    knn_grid = {
        'n_neighbors': [3, 7, 15],
        'weights': ['uniform', 'distance'],
    }
    logreg_grid = {
        'C': [0.1, 1.0, 10.0],
        'l1_ratio': [0.0, 0.5, 1.0],
    }
    rf_grid = {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }
    RF_N_ITER = 10
else:
    knn_grid = {
        'n_neighbors': [3, 5, 7, 11, 15, 21],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }
    logreg_grid = {
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],
        'l1_ratio': [0.0, 0.5, 1.0],
    }
    rf_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
    }
    RF_N_ITER = 20

# --- Model configurations ---
# Each entry defines the base estimator, its search grid, search
# strategy (grid vs. randomized), and whether StandardScaler should
# be applied before fitting.  Random Forest is scale-invariant, so
# it receives unscaled features directly.
model_configs = {
    'KNN': {
        'estimator': KNeighborsClassifier(),
        'param_grid': knn_grid,
        'search': 'grid',
        'scale': True,
    },
    'Logistic Regression': {
        'estimator': LogisticRegression(
            solver='saga', penalty='elasticnet',
            max_iter=5000, random_state=RANDOM_SEED),
        'param_grid': logreg_grid,
        'search': 'grid',
        'scale': True,
    },
    'Random Forest': {
        'estimator': RandomForestClassifier(random_state=RANDOM_SEED),
        'param_grid': rf_grid,
        'search': 'random',
        'scale': False,
        'n_iter': RF_N_ITER,
    },
}

# Print grid sizes so we know how long tuning will take
for name, cfg in model_configs.items():
    total = 1
    for v in cfg['param_grid'].values():
        total *= len(v)
    if cfg['search'] == 'grid':
        print(f'{name}: GridSearchCV — {total} combinations × 3 inner folds')
    else:
        n_it = cfg.get('n_iter', 20)
        print(f'{name}: RandomizedSearchCV — {n_it} of {total} '
              f'combinations × 3 inner folds')
KNN: GridSearchCV — 6 combinations × 3 inner folds
Logistic Regression: GridSearchCV — 9 combinations × 3 inner folds
Random Forest: RandomizedSearchCV — 10 of 24 combinations × 3 inner folds
In [5]:
def run_experiment(df, feature_cols, experiment_name, model_configs,
                   n_folds=5):
    """Run a classification experiment using pre-assigned GroupKFold splits.

    For each outer fold, hyperparameters are tuned via inner 3-fold
    StratifiedKFold CV on the training portion, then the best model is
    evaluated on the held-out test fold.  StandardScaler is fitted on
    the training fold only for scale-sensitive models (KNN, LogReg).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``feature_cols``, ``'label'``, and ``'fold'``.
    feature_cols : list of str
        Feature columns to use as model input.
    experiment_name : str
        Label for this experiment (e.g., ``"Exp_2A"``).
    model_configs : dict
        Model definitions — see hyperparameter grids cell above.
    n_folds : int
        Number of outer CV folds (must match ``'fold'`` column values).

    Returns
    -------
    results_df : pd.DataFrame
        One row per (model, fold) with accuracy, F1, precision, recall,
        ROC AUC, and best hyperparameters.
    best_params : dict
        ``{model_name: {fold_idx: best_params_dict}}``.
    """
    X = df[feature_cols].values
    y = df['label'].values

    all_results = []
    best_params = {name: {} for name in model_configs}

    # Inner CV for hyperparameter tuning.  We use StratifiedKFold (not
    # GroupKFold) for the inner loop because the outer GroupKFold has
    # already separated definition-answer groups across folds — further
    # group separation within a single training fold is unnecessary and
    # would complicate the search with minimal benefit.
    inner_cv = StratifiedKFold(
        n_splits=3, shuffle=True, random_state=RANDOM_SEED)

    t0_exp = time.time()
    print(f'\n{"="*65}')
    print(f'{experiment_name}: {len(feature_cols)} features, {len(df):,} rows')
    print(f'{"="*65}')

    for fold_idx in range(n_folds):
        test_mask = (df['fold'] == fold_idx).values
        train_mask = ~test_mask

        X_train, X_test = X[train_mask], X[test_mask]
        y_train, y_test = y[train_mask], y[test_mask]

        print(f'\nFold {fold_idx}: '
              f'train={train_mask.sum():,}  test={test_mask.sum():,}')

        for model_name, config in model_configs.items():
            t0 = time.time()
            estimator = clone(config['estimator'])

            # --- Feature scaling ---
            # StandardScaler is fitted on the training fold only, then
            # applied to the test fold.  This prevents information about
            # test-set feature distributions from leaking into training.
            # Random Forest is scale-invariant and skips this step.
            if config['scale']:
                scaler = StandardScaler()
                X_tr = scaler.fit_transform(X_train)
                X_te = scaler.transform(X_test)
            else:
                X_tr = X_train
                X_te = X_test

            # --- Inner CV hyperparameter search ---
            if config['search'] == 'random':
                search = RandomizedSearchCV(
                    estimator, config['param_grid'],
                    n_iter=config.get('n_iter', 20),
                    cv=inner_cv, scoring='accuracy',
                    n_jobs=-1, random_state=RANDOM_SEED,
                )
            else:
                search = GridSearchCV(
                    estimator, config['param_grid'],
                    cv=inner_cv, scoring='accuracy',
                    n_jobs=-1,
                )

            search.fit(X_tr, y_train)
            best_params[model_name][fold_idx] = search.best_params_

            # --- Evaluate on held-out test fold ---
            y_pred = search.predict(X_te)
            y_prob = search.predict_proba(X_te)[:, 1]

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            roc = roc_auc_score(y_test, y_prob)
            elapsed = time.time() - t0

            all_results.append({
                'experiment': experiment_name,
                'model': model_name,
                'fold': fold_idx,
                'accuracy': acc,
                'f1': f1,
                'precision': prec,
                'recall': rec,
                'roc_auc': roc,
                'best_params': str(search.best_params_),
            })

            print(f'  {model_name:<22s} '
                  f'Acc={acc:.4f}  F1={f1:.4f}  AUC={roc:.4f}  '
                  f'[{elapsed:.1f}s]')

    elapsed_total = time.time() - t0_exp
    print(f'\n{experiment_name} complete in {elapsed_total:.0f}s')

    return pd.DataFrame(all_results), best_params
In [6]:
# ============================================================
# Run Experiment 2A: all 32 harder features
# ============================================================
results_2a, params_2a = run_experiment(
    df, HARDER_FEATURE_COLS, "Exp_2A", model_configs, n_folds=N_FOLDS)

# ============================================================
# Run Experiment 2B: 26 features (context-informed removed)
# ============================================================
results_2b, params_2b = run_experiment(
    df, EXP_2B_COLS, "Exp_2B", model_configs, n_folds=N_FOLDS)
=================================================================
Exp_2A: 32 features, 20,000 rows
=================================================================

Fold 0: train=16,000  test=4,000
  KNN                    Acc=0.7288  F1=0.7153  AUC=0.7885  [2.9s]
  Logistic Regression    Acc=0.7075  F1=0.6952  AUC=0.7696  [5.1s]
  Random Forest          Acc=0.7392  F1=0.7267  AUC=0.8122  [8.4s]

Fold 1: train=16,000  test=4,000
  KNN                    Acc=0.7272  F1=0.7148  AUC=0.7904  [0.8s]
  Logistic Regression    Acc=0.7222  F1=0.7079  AUC=0.7769  [3.5s]
  Random Forest          Acc=0.7498  F1=0.7333  AUC=0.8202  [9.8s]

Fold 2: train=16,000  test=4,000
  KNN                    Acc=0.7322  F1=0.7157  AUC=0.7853  [0.8s]
  Logistic Regression    Acc=0.7205  F1=0.7028  AUC=0.7816  [2.6s]
  Random Forest          Acc=0.7410  F1=0.7197  AUC=0.8073  [8.6s]

Fold 3: train=16,000  test=4,000
  KNN                    Acc=0.7242  F1=0.7107  AUC=0.7892  [0.8s]
  Logistic Regression    Acc=0.7288  F1=0.7139  AUC=0.7833  [3.4s]
  Random Forest          Acc=0.7575  F1=0.7431  AUC=0.8259  [8.8s]

Fold 4: train=16,000  test=4,000
  KNN                    Acc=0.7232  F1=0.7006  AUC=0.7743  [0.8s]
  Logistic Regression    Acc=0.7225  F1=0.7030  AUC=0.7718  [3.9s]
  Random Forest          Acc=0.7358  F1=0.7173  AUC=0.8081  [8.7s]

Exp_2A complete in 69s

=================================================================
Exp_2B: 26 features, 20,000 rows
=================================================================

Fold 0: train=16,000  test=4,000
  KNN                    Acc=0.6412  F1=0.6219  AUC=0.6983  [0.7s]
  Logistic Regression    Acc=0.6560  F1=0.5967  AUC=0.7056  [4.7s]
  Random Forest          Acc=0.6647  F1=0.5834  AUC=0.7311  [3.6s]

Fold 1: train=16,000  test=4,000
  KNN                    Acc=0.6452  F1=0.6237  AUC=0.6990  [0.8s]
  Logistic Regression    Acc=0.6645  F1=0.6130  AUC=0.7130  [3.0s]
  Random Forest          Acc=0.6750  F1=0.5930  AUC=0.7372  [3.9s]

Fold 2: train=16,000  test=4,000
  KNN                    Acc=0.6470  F1=0.6278  AUC=0.6977  [0.8s]
  Logistic Regression    Acc=0.6675  F1=0.6100  AUC=0.7110  [2.5s]
  Random Forest          Acc=0.6660  F1=0.6119  AUC=0.7124  [4.2s]

Fold 3: train=16,000  test=4,000
  KNN                    Acc=0.6505  F1=0.6264  AUC=0.7041  [0.7s]
  Logistic Regression    Acc=0.6655  F1=0.6110  AUC=0.7193  [3.3s]
  Random Forest          Acc=0.6737  F1=0.6262  AUC=0.7232  [4.3s]

Fold 4: train=16,000  test=4,000
  KNN                    Acc=0.6418  F1=0.6149  AUC=0.6936  [0.8s]
  Logistic Regression    Acc=0.6707  F1=0.6109  AUC=0.7061  [3.4s]
  Random Forest          Acc=0.6680  F1=0.5741  AUC=0.7320  [4.1s]

Exp_2B complete in 41s
In [7]:
# ============================================================
# Results Summary: mean +/- SD across folds
# ============================================================
results_all = pd.concat([results_2a, results_2b], ignore_index=True)

metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
summary_rows = []

for exp_name in ['Exp_2A', 'Exp_2B']:
    for model_name in model_configs:
        mask = ((results_all['experiment'] == exp_name) &
                (results_all['model'] == model_name))
        subset = results_all[mask]
        row = {'Experiment': exp_name, 'Model': model_name}
        for m in metrics:
            mean_val = subset[m].mean()
            std_val = subset[m].std()
            row[f'{m}_mean'] = mean_val
            row[f'{m}_std'] = std_val
            row[m] = f'{mean_val:.4f} +/- {std_val:.4f}'
        summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)

# --- Display formatted table ---
display_cols = ['Experiment', 'Model'] + metrics
print('RESULTS SUMMARY — Harder Dataset (mean +/- SD across 5 folds)')
print('=' * 115)
print(summary_df[display_cols].to_string(index=False))

# --- Delta Hard = Exp 2A - Exp 2B per model ---
# This is the central comparison for the misdirection hypothesis
# (design doc Section 8.4). A negative delta means context-informed
# features hurt classification — consistent with misdirection shifting
# the definition embedding away from the true answer.
print(f'\n{"="*65}')
print('Delta Hard (Exp 2A - Exp 2B)')
print(f'{"="*65}')
delta_rows = []
for model_name in model_configs:
    row_2a = summary_df[(summary_df['Experiment'] == 'Exp_2A') &
                        (summary_df['Model'] == model_name)].iloc[0]
    row_2b = summary_df[(summary_df['Experiment'] == 'Exp_2B') &
                        (summary_df['Model'] == model_name)].iloc[0]
    delta_row = {'Model': model_name}
    for m in metrics:
        delta_row[f'delta_{m}'] = row_2a[f'{m}_mean'] - row_2b[f'{m}_mean']
    delta_rows.append(delta_row)
    print(f'  {model_name:<22s}  '
          f'dAcc={delta_row["delta_accuracy"]:+.4f}  '
          f'dF1={delta_row["delta_f1"]:+.4f}  '
          f'dAUC={delta_row["delta_roc_auc"]:+.4f}')

delta_df = pd.DataFrame(delta_rows)

# --- Best hyperparameters (fold 0 as representative) ---
print(f'\n{"="*65}')
print('Best Hyperparameters (fold 0, representative)')
print(f'{"="*65}')
for model_name in model_configs:
    print(f'\n  {model_name}:')
    print(f'    Exp 2A: {params_2a[model_name][0]}')
    print(f'    Exp 2B: {params_2b[model_name][0]}')

# --- Save to CSV ---
save_df = summary_df[['Experiment', 'Model'] +
                      [f'{m}_mean' for m in metrics] +
                      [f'{m}_std' for m in metrics]]
save_path = OUTPUT_DIR / 'results_harder.csv'
save_df.to_csv(save_path, index=False)
print(f'\nSaved summary to {save_path}')

# Per-fold results (including best_params) for reproducibility
fold_path = OUTPUT_DIR / 'results_harder_per_fold.csv'
results_all.to_csv(fold_path, index=False)
print(f'Per-fold results saved to {fold_path}')
RESULTS SUMMARY — Harder Dataset (mean +/- SD across 5 folds)
===================================================================================================================
Experiment               Model          accuracy                f1         precision            recall           roc_auc
    Exp_2A                 KNN 0.7271 +/- 0.0036 0.7114 +/- 0.0064 0.7549 +/- 0.0105 0.6728 +/- 0.0102 0.7855 +/- 0.0066
    Exp_2A Logistic Regression 0.7203 +/- 0.0078 0.7046 +/- 0.0069 0.7466 +/- 0.0069 0.6671 +/- 0.0097 0.7767 +/- 0.0060
    Exp_2A       Random Forest 0.7446 +/- 0.0088 0.7280 +/- 0.0105 0.7786 +/- 0.0145 0.6837 +/- 0.0121 0.8147 +/- 0.0081
    Exp_2B                 KNN 0.6452 +/- 0.0038 0.6229 +/- 0.0051 0.6646 +/- 0.0101 0.5864 +/- 0.0101 0.6985 +/- 0.0038
    Exp_2B Logistic Regression 0.6649 +/- 0.0055 0.6083 +/- 0.0066 0.7318 +/- 0.0079 0.5207 +/- 0.0126 0.7110 +/- 0.0056
    Exp_2B       Random Forest 0.6695 +/- 0.0046 0.5977 +/- 0.0212 0.7664 +/- 0.0411 0.4929 +/- 0.0446 0.7272 +/- 0.0097

=================================================================
Delta Hard (Exp 2A - Exp 2B)
=================================================================
  KNN                     dAcc=+0.0820  dF1=+0.0885  dAUC=+0.0870
  Logistic Regression     dAcc=+0.0554  dF1=+0.0962  dAUC=+0.0657
  Random Forest           dAcc=+0.0751  dF1=+0.1303  dAUC=+0.0875

=================================================================
Best Hyperparameters (fold 0, representative)
=================================================================

  KNN:
    Exp 2A: {'n_neighbors': 15, 'weights': 'distance'}
    Exp 2B: {'n_neighbors': 15, 'weights': 'uniform'}

  Logistic Regression:
    Exp 2A: {'C': 0.1, 'l1_ratio': 1.0}
    Exp 2B: {'C': 0.1, 'l1_ratio': 1.0}

  Random Forest:
    Exp 2A: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
    Exp 2B: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10}

Saved summary to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/results_harder.csv
Per-fold results saved to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/results_harder_per_fold.csv

5. Discussion¶

The harder dataset uses cosine-similarity distractors — answer words sampled from the most semantically similar candidates to the definition (Decision 6). Unlike the easy dataset where random distractors are trivially distinguishable, these distractors share genuine semantic overlap with the definition. This makes the classification task substantially harder and is where the misdirection hypothesis is tested through the classifier lens (design doc Section 8.4).

Interpreting Δ Hard (Exp 2A − Exp 2B)¶

The 6 context-informed features measure how the definition's embedding shifts when it is read within the full clue sentence (the "surface reading"). By comparing Exp 2A (with these features) to Exp 2B (without them), we directly test whether clue context helps or hurts the classifier's ability to distinguish real definition–answer pairs from distractor pairs:

  • If Δ Hard < 0 (2A worse than 2B): The context-informed features hurt classification. This means the clue's surface reading shifts the definition embedding in a direction that makes it harder to identify the true answer — direct evidence that cryptic clues misdirect embedding-based models, consistent with the misdirection hypothesis.

  • If Δ Hard > 0 (2A better than 2B): The context-informed features help classification. The classifier is able to extract useful signal from the contextual shift that outweighs any misdirection effect. This would suggest that while misdirection exists (as shown by the retrieval analysis), the classifier can learn to exploit the contextual information rather than being fooled by it.

  • If Δ Hard ≈ 0: The context-informed features are uninformative on the harder task — neither helping nor hurting. This would suggest the contextual shift is essentially noise relative to the relationship and surface features.
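Beyond the sign of the mean, the per-fold deltas indicate how consistent the gap is. As a rough sketch, using the Random Forest per-fold accuracies printed in cell [6] (a paired t-statistic across CV folds is only indicative, since folds share training data and are not independent; scipy.stats.ttest_rel would give the same statistic):

```python
import numpy as np

# Random Forest per-fold test accuracies from cell [6] above.
acc_2a = np.array([0.7392, 0.7498, 0.7410, 0.7575, 0.7358])
acc_2b = np.array([0.6647, 0.6750, 0.6660, 0.6737, 0.6680])

# Per-fold delta and a paired t-statistic computed by hand:
# t = mean(delta) / (sd(delta) / sqrt(n)).
delta = acc_2a - acc_2b
n = len(delta)
t_stat = delta.mean() / (delta.std(ddof=1) / np.sqrt(n))

print(f'Delta Hard (RF accuracy): {delta.mean():+.4f} +/- {delta.std(ddof=1):.4f}')
print(f'paired t across folds:    t = {t_stat:.1f}')
```

Here the delta is positive in every fold, so the conclusion does not hinge on any single split.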

Comparison with the easy dataset (NB 06)¶

The easy dataset (Exp 1A/1B) serves as a control. Because random distractors are semantically unrelated, the context-informed features add little signal regardless — Δ Easy should be near zero. The interesting comparison is whether Δ Hard differs meaningfully from Δ Easy:

  • Δ Hard < Δ Easy: Context is more harmful (or less helpful) on the harder task, suggesting that misdirection specifically targets the semantic similarity channel that the harder distractors exploit.
  • Δ Hard > Δ Easy: Context is more helpful on the harder task, suggesting the contextual shift provides discriminative signal precisely when the task is difficult.
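Once both summary files exist, the Δ Hard vs. Δ Easy comparison reduces to a per-model subtraction over the two CSVs. The sketch below assumes the easy-dataset summary shares the schema of results_harder.csv (Experiment, Model, accuracy_mean); the Exp 1A/1B numbers are illustrative placeholders, not actual NB 06 results:

```python
import pandas as pd

# Stand-ins for the two saved summaries. The hard-dataset number is the
# Random Forest mean from the summary above; the easy numbers are
# placeholders for illustration only.
hard = pd.DataFrame({
    'Experiment': ['Exp_2A', 'Exp_2B'],
    'Model': ['Random Forest', 'Random Forest'],
    'accuracy_mean': [0.7446, 0.6695],
})
easy = pd.DataFrame({
    'Experiment': ['Exp_1A', 'Exp_1B'],
    'Model': ['Random Forest', 'Random Forest'],
    'accuracy_mean': [0.98, 0.97],   # placeholder values
})

def delta(summary, exp_a, exp_b):
    """Per-model accuracy gap between two experiments."""
    a = summary.loc[summary['Experiment'] == exp_a].set_index('Model')['accuracy_mean']
    b = summary.loc[summary['Experiment'] == exp_b].set_index('Model')['accuracy_mean']
    return a - b

d_hard = delta(hard, 'Exp_2A', 'Exp_2B')
d_easy = delta(easy, 'Exp_1A', 'Exp_1B')
# Positive values → context is more helpful on the harder task.
print((d_hard - d_easy).round(4))
```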

Caveats¶

  • Results under SAMPLE_MODE = True use a 20,000-row subsample and reduced hyperparameter grids. Final conclusions should be drawn from the full-dataset run (SAMPLE_MODE = False).
  • The classifier analysis complements but does not replace the retrieval analysis (PLAN.md Step 9). The retrieval analysis directly measures whether clue context degrades the rank of the true answer among all candidate words — a more direct test of misdirection. The classifier tests whether the effect is strong enough to degrade a supervised model trained to exploit all available features.
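The rank metric referenced in the last caveat can be sketched as follows. This is an illustrative toy, not Step 9's actual implementation: the embeddings are random, and the helper simply ranks candidates by cosine similarity to a query vector.

```python
import numpy as np

# Toy setup: 100 random candidate "answer embeddings"; the query is a
# noisy view of the true answer's embedding (standing in for the
# context-shifted definition embedding).
rng = np.random.default_rng(42)
candidates = rng.normal(size=(100, 8))
true_idx = 7
query = candidates[true_idx] + rng.normal(scale=0.5, size=8)

def rank_of_true(query, candidates, true_idx):
    """Rank of the true answer among all candidates by cosine similarity
    (1 = best). Rank is 1 + the number of strictly higher-scoring candidates."""
    q = query / np.linalg.norm(query)
    c = candidates / np.linalg.norm(candidates, axis=1, keepdims=True)
    sims = c @ q
    return 1 + int((sims > sims[true_idx]).sum())

print(rank_of_true(query, candidates, true_idx))
```

A rank that worsens as the query moves from the bare definition to its in-context reading is the retrieval-side signature of misdirection.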