06 — Experiments: Easy Dataset¶

Primary author: Victoria

Builds on:

  • 05_dataset_construction.ipynb (Victoria — easy dataset with random distractors, balanced 1:1 real/distractor, all 47 features)
  • 03_feature_engineering.ipynb (Victoria — 47-feature computation logic, feature group column lists extracted to scripts/feature_utils.py per Decision 18)
  • Hans's Hans_Supervised_Learning_Models.ipynb (KNN, LogReg, RF scaffolding with 5-fold stratified CV — adapted to GroupKFold per Decision 7, expanded from 10 to 47 features, and restructured for the A/B ablation design)

Prompt engineering: Victoria
AI assistance: Claude Code (Anthropic)
Environment: Local (CPU only)

This notebook implements PLAN.md Step 6 — running Experiments 1A and 1B on the easy (random-distractor) dataset (design doc Section 8.3).

Experiment 1A — All 47 features¶

Three model families (KNN, Logistic Regression, Random Forest) are trained on all 47 features: 15 context-free meaning + 6 context-informed meaning + 22 relationship + 4 surface. Because random distractors are semantically unrelated to the definition, we expect high accuracy across all models — this is a baseline sanity check.

Experiment 1B — 41 features (context-informed removed)¶

The 6 context-informed cosine features (involving word1_clue_context) are removed, leaving 41 features. Comparing 1A vs. 1B measures whether clue context helps or hurts classification on the easy task. Because random distractors are trivially distinguishable, we expect the Δ (1A − 1B) to be negligible — the interesting comparison is on the harder dataset (NB 07).

Input: data/dataset_easy.parquet (Step 5)
Output: outputs/results_easy.csv, saved hyperparameters


1. Configuration¶

SAMPLE_MODE: Set to True (default) for fast iteration with 20,000 rows. Set to False for final runs on the full dataset. The sample is stratified by label to preserve the 1:1 balance.

In [1]:
import sys
import warnings

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold

warnings.filterwarnings('ignore', category=FutureWarning)

# --- Environment Auto-Detection ---
# Same pattern as prior notebooks: detect Colab vs. local / Great Lakes.
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/'
                        'Milestone II - NLP Cryptic Crossword Clues/'
                        'clue_misdirection')
else:
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
SCRIPTS_DIR = PROJECT_ROOT / 'scripts'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Add scripts/ to sys.path so feature_utils is importable ---
# Decision 18: feature column lists are defined in feature_utils.py
# (extracted from NB 03) so all downstream notebooks use the same names.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

from feature_utils import (
    ALL_FEATURE_COLS,
    CONTEXT_INFORMED_COLS,
)

# --- Experiment parameters ---
RANDOM_SEED = 42
N_FOLDS = 5
SAMPLE_MODE = True   # <-- Set to False for final runs
SAMPLE_SIZE = 20_000

# --- Feature sets ---
# Exp 1A: all 47 features
# Exp 1B: remove the 6 context-informed cosine features (those involving
# word1_clue_context) → 41 features. These features capture how the
# definition's meaning shifts when embedded within the clue sentence.
# Removing them tests whether clue context helps or hurts classification.
EXP_1B_COLS = [c for c in ALL_FEATURE_COLS if c not in CONTEXT_INFORMED_COLS]

print(f'Environment: {"Google Colab" if IS_COLAB else "Local / Great Lakes"}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Output directory: {OUTPUT_DIR}')
print(f'\nRandom seed: {RANDOM_SEED}')
print(f'CV folds: {N_FOLDS}')
print(f'Sample mode: {SAMPLE_MODE}'
      f'{f" ({SAMPLE_SIZE:,} rows)" if SAMPLE_MODE else " (full dataset)"}')
print(f'\nFeature sets:')
print(f'  Exp 1A (all features):              {len(ALL_FEATURE_COLS)}')
print(f'  Context-informed (to remove for 1B): {len(CONTEXT_INFORMED_COLS)}')
print(f'  Exp 1B (context-free only):          {len(EXP_1B_COLS)}')
Environment: Local / Great Lakes
Project root: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection
Data directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data
Output directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs

Random seed: 42
CV folds: 5
Sample mode: True (20,000 rows)

Feature sets:
  Exp 1A (all features):              47
  Context-informed (to remove for 1B): 6
  Exp 1B (context-free only):          41

2. Load the Easy Dataset¶

In [2]:
# ============================================================
# Load dataset_easy.parquet (Step 5 output)
# ============================================================
dataset_path = DATA_DIR / 'dataset_easy.parquet'
assert dataset_path.exists(), (
    f'Missing input file: {dataset_path}\n'
    f'Run 05_dataset_construction.ipynb first to produce this file.'
)

df = pd.read_parquet(dataset_path)
print(f'Loaded dataset_easy.parquet: {len(df):,} rows × {len(df.columns)} columns')

# --- Validate expected columns ---
missing_feat = [c for c in ALL_FEATURE_COLS if c not in df.columns]
assert not missing_feat, f'Missing feature columns: {missing_feat}'
assert 'label' in df.columns, 'Missing label column'
assert 'definition_wn' in df.columns, 'Missing definition_wn column'
assert 'answer_wn' in df.columns, 'Missing answer_wn column'

# --- Sample mode ---
# When iterating quickly, take a stratified subsample to speed up
# cross-validation. Stratification preserves the 1:1 label balance.
# We sample each label group separately and concatenate; this keeps
# index handling simple compared with groupby().apply(), which
# prepends the grouping key to the index.
if SAMPLE_MODE:
    sampled_parts = []
    for label_val in df['label'].unique():
        group = df[df['label'] == label_val]
        sampled_parts.append(
            group.sample(n=min(SAMPLE_SIZE // 2, len(group)),
                         random_state=RANDOM_SEED)
        )
    df = pd.concat(sampled_parts, ignore_index=True)
    print(f'\n⚠ SAMPLE MODE: subsampled to {len(df):,} rows '
          f'(set SAMPLE_MODE = False for final runs)')

# --- Summary ---
print(f'\nShape: {df.shape}')
print(f'\nLabel distribution:')
print(df['label'].value_counts().to_string())
print(f'\nUnique definition_wn values: {df["definition_wn"].nunique():,}')
print(f'Unique answer_wn values:     {df["answer_wn"].nunique():,}')

# Number of unique (definition_wn, answer_wn) pairs — this is the
# grouping unit for GroupKFold. Each pair may appear in multiple clue
# rows (different clue surfaces for the same definition–answer pair),
# and each real row has a corresponding distractor row with a different
# answer_wn. GroupKFold ensures all rows sharing the same pair stay in
# the same fold, preventing near-duplicate feature vectors from leaking
# across train/test splits.
n_unique_pairs = df.groupby(['definition_wn', 'answer_wn']).ngroups
print(f'Unique (definition_wn, answer_wn) pairs: {n_unique_pairs:,}')

# --- Validate no NaNs in feature columns ---
feat_nulls = df[ALL_FEATURE_COLS].isnull().any()
if feat_nulls.any():
    print(f'\nWARNING: NaN values found in features:')
    print(feat_nulls[feat_nulls].to_string())
else:
    print(f'\nNo NaN values in any of the {len(ALL_FEATURE_COLS)} feature columns ✓')
Loaded dataset_easy.parquet: 480,422 rows × 62 columns

⚠ SAMPLE MODE: subsampled to 20,000 rows (set SAMPLE_MODE = False for final runs)

Shape: (20000, 62)

Label distribution:
label
1    10000
0    10000

Unique definition_wn values: 8,151
Unique answer_wn values:     15,114
Unique (definition_wn, answer_wn) pairs: 19,339

No NaN values in any of the 47 feature columns ✓

3. GroupKFold Assignment¶

We use GroupKFold (Decision 7) rather than StratifiedKFold because multiple clue rows can share the same (definition_wn, answer_wn) pair. These rows have near-identical feature vectors (differing only in the 6 context-informed features, which depend on the specific clue surface). If they were split across train and test folds, the model would effectively see the test example during training — leaking information and inflating accuracy.

GroupKFold guarantees that all rows belonging to the same (definition_wn, answer_wn) group are assigned to the same fold. The same fold assignments are reused for both Exp 1A and Exp 1B to ensure a fair comparison.
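The guarantee can be seen on a toy example (synthetic data, not project data): with GroupKFold, every test fold contains only whole groups, so rows sharing a group id never straddle a train/test boundary.

```python
import numpy as np
from sklearn.model_selection import GroupKFold

# Six rows, three groups of two near-identical rows each.
X = np.arange(12).reshape(6, 2)
y = np.array([1, 0, 1, 0, 1, 0])
groups = np.array(['a', 'a', 'b', 'b', 'c', 'c'])

gkf = GroupKFold(n_splits=3)
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups)):
    # Each test fold holds exactly one whole group; its rows never
    # appear in that fold's training set.
    print(fold, sorted(set(groups[test_idx])))
```

StratifiedKFold, by contrast, would happily place one row of a pair in train and its twin in test.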

In [3]:
# ============================================================
# Create group key and assign folds
# ============================================================
# Build a composite group key from (definition_wn, answer_wn). All rows
# sharing this pair — whether real or distractor, and across multiple
# clue surfaces — land in the same fold.
groups = df['definition_wn'].astype(str) + '|||' + df['answer_wn'].astype(str)

gkf = GroupKFold(n_splits=N_FOLDS)

# GroupKFold.split() yields (train_idx, test_idx) tuples. We only need
# the fold assignment for each row, so we iterate and record which fold
# each row's test set falls into.
df['fold'] = -1
for fold_idx, (_, test_idx) in enumerate(gkf.split(df, y=df['label'], groups=groups)):
    df.loc[df.index[test_idx], 'fold'] = fold_idx

assert (df['fold'] >= 0).all(), 'Some rows were not assigned to any fold'

# ============================================================
# Verify: no (definition_wn, answer_wn) pair spans multiple folds
# ============================================================
folds_per_group = (
    df.groupby(['definition_wn', 'answer_wn'])['fold']
      .nunique()
)
leaked_groups = folds_per_group[folds_per_group > 1]
assert len(leaked_groups) == 0, (
    f'{len(leaked_groups)} groups span multiple folds — GroupKFold failed!\n'
    f'Examples: {leaked_groups.head(5).to_dict()}'
)
print(f'GroupKFold verification passed: no (definition_wn, answer_wn) '
      f'pair spans multiple folds ✓')

# ============================================================
# Print fold sizes and label balance
# ============================================================
print(f'\n{"Fold":<6s} {"Size":>8s} {"Label=1":>10s} {"Label=0":>10s} {"% Positive":>12s}')
print('-' * 50)
for fold_idx in range(N_FOLDS):
    fold_mask = df['fold'] == fold_idx
    fold_size = fold_mask.sum()
    n_pos = (df.loc[fold_mask, 'label'] == 1).sum()
    n_neg = (df.loc[fold_mask, 'label'] == 0).sum()
    pct_pos = n_pos / fold_size * 100
    print(f'{fold_idx:<6d} {fold_size:>8,d} {n_pos:>10,d} {n_neg:>10,d} {pct_pos:>11.1f}%')

print(f'\nTotal rows: {len(df):,}')
print(f'Unique groups: {groups.nunique():,}')
GroupKFold verification passed: no (definition_wn, answer_wn) pair spans multiple folds ✓

Fold       Size    Label=1    Label=0   % Positive
--------------------------------------------------
0         4,000      2,024      1,976        50.6%
1         4,000      1,965      2,035        49.1%
2         4,000      1,978      2,022        49.5%
3         4,000      1,980      2,020        49.5%
4         4,000      2,053      1,947        51.3%

Total rows: 20,000
Unique groups: 19,339

4. Experiment Design¶

We run two experiments on the easy (random-distractor) dataset, following the design in Section 8.3 of the design document (Table 7):

| Experiment | Features | Count | Description |
|---|---|---|---|
| Exp 1A | All | 47 | 15 context-free + 6 context-informed + 22 relationship + 4 surface |
| Exp 1B | Context-informed removed | 41 | 15 context-free + 22 relationship + 4 surface |

Three model families are trained under each condition:

  1. K-Nearest Neighbors (KNN) — instance-based, non-parametric. Features are scaled with StandardScaler fitted on the training fold.
  2. Logistic Regression — probabilistic, linear. Scaled. The penalty is tuned via l1_ratio over L2 (0.0), elastic-net (0.5), and L1 (1.0), all of which solver='saga' with penalty='elasticnet' supports.
  3. Random Forest — tree-based, non-linear. Scale-invariant, so no scaling applied. Uses RandomizedSearchCV due to larger grid.

Hyperparameters are tuned via inner 3-fold stratified CV within each training fold. The outer 5-fold GroupKFold (assigned in Section 3) provides the train/test split. The same folds are used for both experiments to ensure a fair A vs. B comparison (Decision 7).

Expected outcome: High accuracy in both experiments. Random distractors are semantically unrelated to the definition, so even simple features should discriminate well. The Δ (1A − 1B) should be small — the real test of misdirection is on the harder dataset (NB 07, Exp 2A vs. 2B).
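One design note on scaling: in the implementation below, StandardScaler is fitted once on the whole outer training fold before the inner search, so inner-CV validation rows influence the scaling statistics. This leakage stays confined to the training fold and is a common simplification, but a stricter alternative (sketched here for KNN under the same grid; not what this notebook runs) wraps scaler and estimator in a Pipeline so the scaler is refit inside every inner split:

```python
# Sketch of a leak-free alternative to scale-then-search: a Pipeline
# refits StandardScaler on each inner-CV training split automatically.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])
# Tuned parameters gain the step-name prefix ('knn__') inside a Pipeline.
param_grid = {
    'knn__n_neighbors': [3, 7, 15],
    'knn__weights': ['uniform', 'distance'],
}
search = GridSearchCV(
    pipe, param_grid,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring='accuracy', n_jobs=-1,
)
# search.fit(X_train, y_train) would then scale and tune without any
# inner-validation rows touching the scaler.
```

With distractors this easy, the practical difference is almost certainly negligible, which is why the simpler scale-then-search structure is retained.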

In [4]:
import time
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)

# ============================================================
# Hyperparameter grids
# ============================================================
# Full grids for final runs; reduced grids when SAMPLE_MODE is True
# to keep iteration time under a few minutes.

if SAMPLE_MODE:
    knn_grid = {
        'n_neighbors': [3, 7, 15],
        'weights': ['uniform', 'distance'],
    }
    logreg_grid = {
        'C': [0.1, 1.0, 10.0],
        'l1_ratio': [0.0, 0.5, 1.0],
    }
    rf_grid = {
        'n_estimators': [100, 200],
        'max_depth': [5, 10, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }
    RF_N_ITER = 10
else:
    knn_grid = {
        'n_neighbors': [3, 5, 7, 11, 15, 21],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    }
    logreg_grid = {
        'C': [0.01, 0.1, 1.0, 10.0, 100.0],
        'l1_ratio': [0.0, 0.5, 1.0],
    }
    rf_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
    }
    RF_N_ITER = 20

# --- Model configurations ---
# Each entry defines the base estimator, its search grid, search
# strategy (grid vs. randomized), and whether StandardScaler should
# be applied before fitting.  Random Forest is scale-invariant, so
# it receives unscaled features directly.
model_configs = {
    'KNN': {
        'estimator': KNeighborsClassifier(),
        'param_grid': knn_grid,
        'search': 'grid',
        'scale': True,
    },
    'Logistic Regression': {
        'estimator': LogisticRegression(
            solver='saga', penalty='elasticnet',
            max_iter=5000, random_state=RANDOM_SEED),
        'param_grid': logreg_grid,
        'search': 'grid',
        'scale': True,
    },
    'Random Forest': {
        'estimator': RandomForestClassifier(random_state=RANDOM_SEED),
        'param_grid': rf_grid,
        'search': 'random',
        'scale': False,
        'n_iter': RF_N_ITER,
    },
}

# Print grid sizes so we know how long tuning will take
for name, cfg in model_configs.items():
    total = 1
    for v in cfg['param_grid'].values():
        total *= len(v)
    if cfg['search'] == 'grid':
        print(f'{name}: GridSearchCV \u2014 {total} combinations \u00d7 3 inner folds')
    else:
        n_it = cfg.get('n_iter', 20)
        print(f'{name}: RandomizedSearchCV \u2014 {n_it} of {total} '
              f'combinations \u00d7 3 inner folds')
KNN: GridSearchCV — 6 combinations × 3 inner folds
Logistic Regression: GridSearchCV — 9 combinations × 3 inner folds
Random Forest: RandomizedSearchCV — 10 of 24 combinations × 3 inner folds
In [5]:
def run_experiment(df, feature_cols, experiment_name, model_configs,
                   n_folds=5):
    """Run a classification experiment using pre-assigned GroupKFold splits.

    For each outer fold, hyperparameters are tuned via inner 3-fold
    StratifiedKFold CV on the training portion, then the best model is
    evaluated on the held-out test fold.  StandardScaler is fitted on
    the training fold only for scale-sensitive models (KNN, LogReg).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``feature_cols``, ``'label'``, and ``'fold'``.
    feature_cols : list of str
        Feature columns to use as model input.
    experiment_name : str
        Label for this experiment (e.g., ``"Exp_1A"``).
    model_configs : dict
        Model definitions — see hyperparameter grids cell above.
    n_folds : int
        Number of outer CV folds (must match ``'fold'`` column values).

    Returns
    -------
    results_df : pd.DataFrame
        One row per (model, fold) with accuracy, F1, precision, recall,
        ROC AUC, and best hyperparameters.
    best_params : dict
        ``{model_name: {fold_idx: best_params_dict}}``.
    """
    X = df[feature_cols].values
    y = df['label'].values

    all_results = []
    best_params = {name: {} for name in model_configs}

    # Inner CV for hyperparameter tuning.  We use StratifiedKFold (not
    # GroupKFold) for the inner loop because the outer GroupKFold has
    # already separated definition-answer groups across folds — further
    # group separation within a single training fold is unnecessary and
    # would complicate the search with minimal benefit.
    inner_cv = StratifiedKFold(
        n_splits=3, shuffle=True, random_state=RANDOM_SEED)

    t0_exp = time.time()
    print(f'\n{"="*65}')
    print(f'{experiment_name}: {len(feature_cols)} features, {len(df):,} rows')
    print(f'{"="*65}')

    for fold_idx in range(n_folds):
        test_mask = (df['fold'] == fold_idx).values
        train_mask = ~test_mask

        X_train, X_test = X[train_mask], X[test_mask]
        y_train, y_test = y[train_mask], y[test_mask]

        print(f'\nFold {fold_idx}: '
              f'train={train_mask.sum():,}  test={test_mask.sum():,}')

        for model_name, config in model_configs.items():
            t0 = time.time()
            estimator = clone(config['estimator'])

            # --- Feature scaling ---
            # StandardScaler is fitted on the training fold only, then
            # applied to the test fold.  This prevents information about
            # test-set feature distributions from leaking into training.
            # Random Forest is scale-invariant and skips this step.
            if config['scale']:
                scaler = StandardScaler()
                X_tr = scaler.fit_transform(X_train)
                X_te = scaler.transform(X_test)
            else:
                X_tr = X_train
                X_te = X_test

            # --- Inner CV hyperparameter search ---
            if config['search'] == 'random':
                search = RandomizedSearchCV(
                    estimator, config['param_grid'],
                    n_iter=config.get('n_iter', 20),
                    cv=inner_cv, scoring='accuracy',
                    n_jobs=-1, random_state=RANDOM_SEED,
                )
            else:
                search = GridSearchCV(
                    estimator, config['param_grid'],
                    cv=inner_cv, scoring='accuracy',
                    n_jobs=-1,
                )

            search.fit(X_tr, y_train)
            best_params[model_name][fold_idx] = search.best_params_

            # --- Evaluate on held-out test fold ---
            y_pred = search.predict(X_te)
            y_prob = search.predict_proba(X_te)[:, 1]

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            roc = roc_auc_score(y_test, y_prob)
            elapsed = time.time() - t0

            all_results.append({
                'experiment': experiment_name,
                'model': model_name,
                'fold': fold_idx,
                'accuracy': acc,
                'f1': f1,
                'precision': prec,
                'recall': rec,
                'roc_auc': roc,
                'best_params': str(search.best_params_),
            })

            print(f'  {model_name:<22s} '
                  f'Acc={acc:.4f}  F1={f1:.4f}  AUC={roc:.4f}  '
                  f'[{elapsed:.1f}s]')

    elapsed_total = time.time() - t0_exp
    print(f'\n{experiment_name} complete in {elapsed_total:.0f}s')

    return pd.DataFrame(all_results), best_params
In [6]:
# ============================================================
# Run Experiment 1A: all 47 features
# ============================================================
results_1a, params_1a = run_experiment(
    df, ALL_FEATURE_COLS, "Exp_1A", model_configs, n_folds=N_FOLDS)

# ============================================================
# Run Experiment 1B: 41 features (context-informed removed)
# ============================================================
results_1b, params_1b = run_experiment(
    df, EXP_1B_COLS, "Exp_1B", model_configs, n_folds=N_FOLDS)
=================================================================
Exp_1A: 47 features, 20,000 rows
=================================================================

Fold 0: train=16,000  test=4,000
  KNN                    Acc=0.8620  F1=0.8542  AUC=0.9265  [3.1s]
  Logistic Regression    Acc=0.8678  F1=0.8657  AUC=0.9393  [25.2s]
  Random Forest          Acc=0.8678  F1=0.8669  AUC=0.9406  [14.6s]

Fold 1: train=16,000  test=4,000
  KNN                    Acc=0.8552  F1=0.8420  AUC=0.9255  [0.9s]
  Logistic Regression    Acc=0.8615  F1=0.8540  AUC=0.9358  [34.9s]
  Random Forest          Acc=0.8695  F1=0.8636  AUC=0.9401  [17.3s]

Fold 2: train=16,000  test=4,000
  KNN                    Acc=0.8588  F1=0.8480  AUC=0.9265  [0.9s]
  Logistic Regression    Acc=0.8705  F1=0.8662  AUC=0.9367  [28.2s]
  Random Forest          Acc=0.8712  F1=0.8685  AUC=0.9398  [15.1s]

Fold 3: train=16,000  test=4,000
  KNN                    Acc=0.8538  F1=0.8402  AUC=0.9249  [0.9s]
  Logistic Regression    Acc=0.8688  F1=0.8630  AUC=0.9410  [20.7s]
  Random Forest          Acc=0.8738  F1=0.8691  AUC=0.9396  [14.9s]

Fold 4: train=16,000  test=4,000
  KNN                    Acc=0.8540  F1=0.8467  AUC=0.9256  [0.9s]
  Logistic Regression    Acc=0.8700  F1=0.8686  AUC=0.9416  [19.2s]
  Random Forest          Acc=0.8788  F1=0.8788  AUC=0.9418  [17.7s]

Exp_1A complete in 214s

=================================================================
Exp_1B: 41 features, 20,000 rows
=================================================================

Fold 0: train=16,000  test=4,000
  KNN                    Acc=0.8580  F1=0.8508  AUC=0.9231  [0.8s]
  Logistic Regression    Acc=0.8668  F1=0.8644  AUC=0.9364  [27.7s]
  Random Forest          Acc=0.8652  F1=0.8632  AUC=0.9355  [13.0s]

Fold 1: train=16,000  test=4,000
  KNN                    Acc=0.8522  F1=0.8401  AUC=0.9215  [0.9s]
  Logistic Regression    Acc=0.8618  F1=0.8543  AUC=0.9346  [28.2s]
  Random Forest          Acc=0.8678  F1=0.8611  AUC=0.9385  [14.9s]

Fold 2: train=16,000  test=4,000
  KNN                    Acc=0.8550  F1=0.8462  AUC=0.9219  [0.8s]
  Logistic Regression    Acc=0.8665  F1=0.8617  AUC=0.9348  [28.8s]
  Random Forest          Acc=0.8650  F1=0.8609  AUC=0.9339  [12.5s]

Fold 3: train=16,000  test=4,000
  KNN                    Acc=0.8528  F1=0.8417  AUC=0.9249  [0.8s]
  Logistic Regression    Acc=0.8670  F1=0.8609  AUC=0.9392  [21.3s]
  Random Forest          Acc=0.8708  F1=0.8658  AUC=0.9371  [12.6s]

Fold 4: train=16,000  test=4,000
  KNN                    Acc=0.8565  F1=0.8510  AUC=0.9239  [0.8s]
  Logistic Regression    Acc=0.8690  F1=0.8673  AUC=0.9401  [18.1s]
  Random Forest          Acc=0.8732  F1=0.8729  AUC=0.9404  [12.8s]

Exp_1B complete in 194s
In [7]:
# ============================================================
# Results Summary: mean +/- SD across folds
# ============================================================
results_all = pd.concat([results_1a, results_1b], ignore_index=True)

metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
summary_rows = []

for exp_name in ['Exp_1A', 'Exp_1B']:
    for model_name in model_configs:
        mask = ((results_all['experiment'] == exp_name) &
                (results_all['model'] == model_name))
        subset = results_all[mask]
        row = {'Experiment': exp_name, 'Model': model_name}
        for m in metrics:
            mean_val = subset[m].mean()
            std_val = subset[m].std()
            row[f'{m}_mean'] = mean_val
            row[f'{m}_std'] = std_val
            row[m] = f'{mean_val:.4f} +/- {std_val:.4f}'
        summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)

# --- Display formatted table ---
display_cols = ['Experiment', 'Model'] + metrics
print('RESULTS SUMMARY — Easy Dataset (mean +/- SD across 5 folds)')
print('=' * 115)
print(summary_df[display_cols].to_string(index=False))

# --- Delta Easy = Exp 1A - Exp 1B per model ---
print(f'\n{"="*65}')
print('Delta Easy (Exp 1A - Exp 1B)')
print(f'{"="*65}')
delta_rows = []
for model_name in model_configs:
    row_1a = summary_df[(summary_df['Experiment'] == 'Exp_1A') &
                        (summary_df['Model'] == model_name)].iloc[0]
    row_1b = summary_df[(summary_df['Experiment'] == 'Exp_1B') &
                        (summary_df['Model'] == model_name)].iloc[0]
    delta_row = {'Model': model_name}
    for m in metrics:
        delta_row[f'delta_{m}'] = row_1a[f'{m}_mean'] - row_1b[f'{m}_mean']
    delta_rows.append(delta_row)
    print(f'  {model_name:<22s}  '
          f'dAcc={delta_row["delta_accuracy"]:+.4f}  '
          f'dF1={delta_row["delta_f1"]:+.4f}  '
          f'dAUC={delta_row["delta_roc_auc"]:+.4f}')

delta_df = pd.DataFrame(delta_rows)

# --- Best hyperparameters (fold 0 as representative) ---
print(f'\n{"="*65}')
print('Best Hyperparameters (fold 0, representative)')
print(f'{"="*65}')
for model_name in model_configs:
    print(f'\n  {model_name}:')
    print(f'    Exp 1A: {params_1a[model_name][0]}')
    print(f'    Exp 1B: {params_1b[model_name][0]}')

# --- Save to CSV ---
save_df = summary_df[['Experiment', 'Model'] +
                      [f'{m}_mean' for m in metrics] +
                      [f'{m}_std' for m in metrics]]
save_path = OUTPUT_DIR / 'results_easy.csv'
save_df.to_csv(save_path, index=False)
print(f'\nSaved summary to {save_path}')

# Per-fold results (including best_params) for reproducibility
fold_path = OUTPUT_DIR / 'results_easy_per_fold.csv'
results_all.to_csv(fold_path, index=False)
print(f'Per-fold results saved to {fold_path}')
RESULTS SUMMARY — Easy Dataset (mean +/- SD across 5 folds)
===================================================================================================================
Experiment               Model          accuracy                f1         precision            recall           roc_auc
    Exp_1A                 KNN 0.8568 +/- 0.0035 0.8462 +/- 0.0055 0.9129 +/- 0.0056 0.7887 +/- 0.0091 0.9258 +/- 0.0007
    Exp_1A Logistic Regression 0.8677 +/- 0.0036 0.8635 +/- 0.0057 0.8913 +/- 0.0069 0.8375 +/- 0.0087 0.9389 +/- 0.0026
    Exp_1A       Random Forest 0.8722 +/- 0.0043 0.8694 +/- 0.0057 0.8888 +/- 0.0094 0.8508 +/- 0.0074 0.9404 +/- 0.0009
    Exp_1B                 KNN 0.8549 +/- 0.0024 0.8459 +/- 0.0051 0.9012 +/- 0.0085 0.7972 +/- 0.0069 0.9230 +/- 0.0014
    Exp_1B Logistic Regression 0.8662 +/- 0.0027 0.8617 +/- 0.0049 0.8912 +/- 0.0075 0.8342 +/- 0.0064 0.9370 +/- 0.0025
    Exp_1B       Random Forest 0.8684 +/- 0.0036 0.8648 +/- 0.0050 0.8889 +/- 0.0077 0.8420 +/- 0.0051 0.9371 +/- 0.0025

=================================================================
Delta Easy (Exp 1A - Exp 1B)
=================================================================
  KNN                     dAcc=+0.0019  dF1=+0.0003  dAUC=+0.0028
  Logistic Regression     dAcc=+0.0015  dF1=+0.0018  dAUC=+0.0019
  Random Forest           dAcc=+0.0038  dF1=+0.0046  dAUC=+0.0033

=================================================================
Best Hyperparameters (fold 0, representative)
=================================================================

  KNN:
    Exp 1A: {'n_neighbors': 15, 'weights': 'distance'}
    Exp 1B: {'n_neighbors': 15, 'weights': 'distance'}

  Logistic Regression:
    Exp 1A: {'C': 0.1, 'l1_ratio': 0.0}
    Exp 1B: {'C': 0.1, 'l1_ratio': 0.0}

  Random Forest:
    Exp 1A: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
    Exp 1B: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}

Saved summary to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/results_easy.csv
Per-fold results saved to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/results_easy_per_fold.csv

5. Discussion¶

The easy dataset uses random distractors — answer words sampled uniformly at random from the pool of known answers. Because random pairings almost never share meaningful semantic overlap with the definition, classifiers achieve high accuracy by learning a simple rule: "does this definition–answer pair share any semantic relationship at all?" (design doc Section 8.3).

What the results tell us¶

  • High accuracy across all models confirms the pipeline is working correctly and the features capture genuine signal. This is a sanity check, not the main experiment.

  • Small Δ Easy (1A − 1B) is expected. On random distractors, the 6 context-informed features (which measure how the definition’s embedding shifts when read in the clue sentence) add little discriminative power. The context-free meaning and relationship features already distinguish real from random pairs with high confidence.

  • Consistent ROC AUC across models reflects that all three families can separate the classes well regardless of their underlying mechanism. Model choice matters less than feature quality when the task is easy.

Why this matters for the harder dataset¶

The real test of the misdirection hypothesis comes in NB 07 (Exp 2A vs. 2B on the harder dataset), where distractors are chosen by cosine similarity to the definition (Decision 6). In that setting:

  • The 15 context-free cosine features are removed (they are artifacts of the cosine-similarity-based construction).
  • The task is much harder: distractors are semantically plausible.
  • If Exp 2A (with context) < Exp 2B (without context), clue context actively hurts classification — direct evidence for misdirection through the classifier lens.
  • If Exp 2A > Exp 2B, context helps despite misdirection, suggesting the classifier extracts useful signal that the retrieval analysis missed.

The easy dataset results in this notebook establish the baseline against which the harder dataset results (NB 07) should be interpreted.
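Because 1A and 1B reuse the same fold assignments, the per-fold accuracies form paired samples, so a paired t-test is a natural (if underpowered, with only 5 folds) check on whether a Δ exceeds fold noise. This is not run in the notebook; the sketch below illustrates it with the KNN per-fold accuracies printed in Section 4's output:

```python
# Illustrative check, not part of the pipeline: paired t-test on
# per-fold accuracies, valid because Exp 1A and 1B share folds.
from scipy import stats

# KNN per-fold accuracies copied from the Exp 1A / Exp 1B output above.
acc_1a = [0.8620, 0.8552, 0.8588, 0.8538, 0.8540]
acc_1b = [0.8580, 0.8522, 0.8550, 0.8528, 0.8565]

t_stat, p_value = stats.ttest_rel(acc_1a, acc_1b)
print(f'paired t = {t_stat:.3f}, p = {p_value:.3f}')
# With n = 5 folds the test has little power; a non-significant p here
# is consistent with, but does not prove, a negligible Δ.
```

The same comparison on the harder dataset (NB 07) is where a significant Δ would carry real interpretive weight.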