05 — Dataset Construction¶

Primary author: Victoria

Builds on:

  • 03_feature_engineering.ipynb (Victoria — 47-feature computation logic, extracted to scripts/feature_utils.py per Decision 18)
  • Hans's Hans_Supervised_Learning_Models.ipynb (random distractor generation pattern for the easy dataset — adapted for our expanded feature set and embedding-based feature computation)
  • Hans's Hans_Negative_Strategies_Experiment.ipynb (reference for distractor strategies — our harder dataset uses cosine-similarity-based selection per Decision 6, rather than matching Hans's three-strategy comparison)

Prompt engineering: Victoria
AI assistance: Claude Code (Anthropic)
Environment: Local (CPU only)

This notebook implements PLAN.md Steps 5 and 7 — constructing the two balanced binary classification datasets used in the supervised learning experiments.

Step 5: Easy Dataset¶

For each real (clue, definition, answer) row, we generate one distractor by keeping the same (clue, definition) and substituting a randomly sampled answer word from the pool of known answers. Because random distractors are almost always semantically unrelated to the definition, classifiers should easily distinguish real from distractor pairs — this serves as a baseline to verify the pipeline works before moving to harder distractors.

Step 7: Harder Dataset (second half of notebook)¶

Distractors are drawn from the top-k most cosine-similar answer words to each definition (Decision 6). This forces the classifier to rely on subtler features rather than raw cosine similarity. The 15 context-free meaning features are removed because they are artifacts of this construction method.

Inputs:

  • data/features_all.parquet (Step 3 — all 47 features + metadata)
  • data/embeddings/ — embedding .npy arrays + index CSVs (Step 2)
  • data/embeddings/clue_context_phrases.csv (Step 2 — composite key for clue-context embedding lookups)
  • scripts/feature_utils.py (extracted from NB 03 per Decision 18)

Outputs:

  • data/dataset_easy.parquet — balanced 1:1, all 47 features, label column
  • data/dataset_harder.parquet — balanced 1:1, 32 features (no context-free meaning), label column

Imports and Path Setup¶

In [1]:
import warnings
import sys

import numpy as np
import pandas as pd
from pathlib import Path

import nltk
from nltk.corpus import wordnet as wn

from tqdm.auto import tqdm

warnings.filterwarnings('ignore', category=FutureWarning)

# --- Environment Auto-Detection ---
# Same pattern as 02_embedding_generation.ipynb, 03_feature_engineering.ipynb,
# and 04_retrieval_analysis.ipynb: detect Colab, Great Lakes, or local and
# set paths accordingly.
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/'
                        'Milestone II - NLP Cryptic Crossword Clues/'
                        'clue_misdirection')
else:
    # Local or Great Lakes: notebook is in clue_misdirection/notebooks/,
    # so parent is the clue_misdirection project root.
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
EMBEDDINGS_DIR = DATA_DIR / 'embeddings'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
SCRIPTS_DIR = PROJECT_ROOT / 'scripts'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Add scripts/ to sys.path so feature_utils is importable ---
# Decision 18: feature computation logic is extracted from NB 03 into
# scripts/feature_utils.py to avoid duplicating code across NB 05 and NB 07.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

from feature_utils import (
    CONTEXT_FREE_COLS, CONTEXT_INFORMED_COLS, RELATIONSHIP_COLS,
    SURFACE_COLS, ALL_FEATURE_COLS, METADATA_COLS,
    rowwise_cosine,
    compute_surface_features,
    get_wordnet_synsets,
    compute_relationship_features,
    compute_cosine_features_for_pair,
)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
rng = np.random.RandomState(RANDOM_SEED)

# Download WordNet data — needed for relationship features on distractor pairs.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Embedding dimension for CALE-MBERT-en
EMBED_DIM = 1024

print(f'Environment: {"Google Colab" if IS_COLAB else "Local / Great Lakes"}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Embeddings directory: {EMBEDDINGS_DIR}')
print(f'\nFeature group sizes (from feature_utils):')
print(f'  Context-free:     {len(CONTEXT_FREE_COLS)}')
print(f'  Context-informed: {len(CONTEXT_INFORMED_COLS)}')
print(f'  Relationship:     {len(RELATIONSHIP_COLS)}')
print(f'  Surface:          {len(SURFACE_COLS)}')
print(f'  Total:            {len(ALL_FEATURE_COLS)}')
Environment: Local / Great Lakes
Project root: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection
Data directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data
Embeddings directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/embeddings

Feature group sizes (from feature_utils):
  Context-free:     15
  Context-informed: 6
  Relationship:     22
  Surface:          4
  Total:            47

Load Data¶

We load two types of data:

  1. features_all.parquet (Step 3) — the 47 features + metadata for all 240,211 real (clue, definition, answer) rows. These become the label=1 rows in both datasets.

  2. Embedding files (Step 2) — needed to compute cosine features for distractor pairs. When we swap in a new answer word, we need its pre-computed embeddings (allsense, common, obscure) to compute the 15 context-free and 6 context-informed cosine similarities. Definition embeddings and clue-context embeddings stay the same (same definition, same clue), only the answer side changes.

We build word-to-row-index lookup dicts so we can quickly retrieve any word's embedding from the .npy arrays.

In [2]:
# ============================================================
# Load features_all.parquet (Step 3 output)
# ============================================================
features_path = DATA_DIR / 'features_all.parquet'
assert features_path.exists(), (
    f'Missing input file: {features_path}\n'
    f'Run 03_feature_engineering.ipynb first to produce this file.'
)
df_real = pd.read_parquet(features_path)
print(f'features_all.parquet: {len(df_real):,} rows × {len(df_real.columns)} columns')
print(f'  Unique (definition, answer) pairs: '
      f'{df_real["def_answer_pair_id"].nunique():,}')
print(f'  Unique definition_wn values:       '
      f'{df_real["definition_wn"].nunique():,}')
print(f'  Unique answer_wn values:           '
      f'{df_real["answer_wn"].nunique():,}')

# Validate expected columns exist
missing_feat = [c for c in ALL_FEATURE_COLS if c not in df_real.columns]
assert not missing_feat, f'Missing feature columns in parquet: {missing_feat}'
missing_meta = [c for c in METADATA_COLS if c not in df_real.columns]
assert not missing_meta, f'Missing metadata columns in parquet: {missing_meta}'

# ============================================================
# Load embedding arrays and index files
# ============================================================
# CRITICAL: keep_default_na=False on all index CSVs — the word "nan"
# (grandmother) is a valid crossword definition/answer.

definition_embeddings = np.load(EMBEDDINGS_DIR / 'definition_embeddings.npy')
definition_index = pd.read_csv(
    EMBEDDINGS_DIR / 'definition_index.csv', index_col=0,
    keep_default_na=False)

answer_embeddings = np.load(EMBEDDINGS_DIR / 'answer_embeddings.npy')
answer_index = pd.read_csv(
    EMBEDDINGS_DIR / 'answer_index.csv', index_col=0,
    keep_default_na=False)

clue_context_embeddings = np.load(
    EMBEDDINGS_DIR / 'clue_context_embeddings.npy')

# --- Load clue_context_phrases.csv for composite-key lookups ---
# We use clue_context_phrases.csv (not clue_context_index.csv) because
# clue_id is non-unique for double-definition clues. The composite key
# (clue_id, definition) disambiguates rows. This is the same approach
# used in NB 03 and NB 04.
cc_phrases = pd.read_csv(
    EMBEDDINGS_DIR / 'clue_context_phrases.csv', keep_default_na=False)
cc_phrases['cc_row_position'] = np.arange(len(cc_phrases))

# --- Print shapes ---
print(f'\n{"File":<35s} {"Shape":<25s} {"Memory":>8s}')
print(f'{"-"*35} {"-"*25} {"-"*8}')
for name, arr in [
    ('definition_embeddings.npy', definition_embeddings),
    ('answer_embeddings.npy', answer_embeddings),
    ('clue_context_embeddings.npy', clue_context_embeddings),
]:
    mb = arr.nbytes / 1024**2
    print(f'{name:<35s} {str(arr.shape):<25s} {mb:>6.1f} MB')

print(f'\nIndex sizes:')
print(f'  definition_index:        {len(definition_index):,} rows')
print(f'  answer_index:            {len(answer_index):,} rows')
print(f'  clue_context_phrases:    {len(cc_phrases):,} rows')

# --- Shape assertions ---
n_def = len(definition_index)
n_ans = len(answer_index)
n_cc = len(cc_phrases)
assert definition_embeddings.shape == (n_def, 3, EMBED_DIM)
assert answer_embeddings.shape == (n_ans, 3, EMBED_DIM)
assert clue_context_embeddings.shape == (n_cc, EMBED_DIM)

# ============================================================
# Build word → row-index lookup dicts for embedding retrieval
# ============================================================
# definition_index and answer_index have integer row indices (0, 1, 2, ...)
# and a 'word' column. We create dicts mapping word → row position in the
# corresponding .npy array.
#
# The embedding .npy arrays have shape (N, 3, 1024) where the 3 slots are:
#   [0] = allsense_avg, [1] = common synset, [2] = obscure synset
# So to get e.g. answer "rose"'s common embedding:
#   answer_embeddings[ans_word_to_idx['rose'], 1, :]

def_word_to_idx = pd.Series(
    definition_index.index, index=definition_index['word']).to_dict()
ans_word_to_idx = pd.Series(
    answer_index.index, index=answer_index['word']).to_dict()

# Build a mapping from (clue_id, definition) → cc_row_position for
# clue-context embedding lookups. This composite key handles double-
# definition clues where multiple rows share the same clue_id.
cc_lookup = cc_phrases.set_index(
    ['clue_id', 'definition'])['cc_row_position'].to_dict()

print(f'\nLookup dicts built:')
print(f'  def_word_to_idx:  {len(def_word_to_idx):,} entries')
print(f'  ans_word_to_idx:  {len(ans_word_to_idx):,} entries')
print(f'  cc_lookup:        {len(cc_lookup):,} entries')
features_all.parquet: 240,211 rows × 60 columns
  Unique (definition, answer) pairs: 128,961
  Unique definition_wn values:       27,356
  Unique answer_wn values:           45,183

File                                Shape                       Memory
----------------------------------- ------------------------- --------
definition_embeddings.npy           (27385, 3, 1024)           320.9 MB
answer_embeddings.npy               (45254, 3, 1024)           530.3 MB
clue_context_embeddings.npy         (240211, 1024)             938.3 MB

Index sizes:
  definition_index:        27,385 rows
  answer_index:            45,254 rows
  clue_context_phrases:    240,211 rows

Lookup dicts built:
  def_word_to_idx:  27,385 entries
  ans_word_to_idx:  45,254 entries
  cc_lookup:        240,211 entries
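The `keep_default_na=False` requirement flagged in the cell above can be demonstrated in isolation. A minimal sketch with synthetic CSV data (not the project's actual index files):

```python
import io
import pandas as pd

csv_text = 'row,word\n0,nan\n1,rose\n'

# Default behavior: pandas parses the literal string "nan" as a missing value.
df_default = pd.read_csv(io.StringIO(csv_text))
print(df_default['word'].isna().tolist())   # [True, False] — "nan" was lost

# With keep_default_na=False, "nan" survives as an ordinary string —
# essential here, since "nan" (grandmother) is a valid crossword word.
df_safe = pd.read_csv(io.StringIO(csv_text), keep_default_na=False)
print(df_safe['word'].tolist())             # ['nan', 'rose']
```

Forgetting the flag on any one of the index files would silently turn the word "nan" into a missing value and break the word-to-index lookups built below.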

Step 5: Easy Dataset Construction¶

The easy dataset tests whether a classifier can distinguish real definition–answer pairs from random ones. For each of the 240,211 real (clue, definition, answer) rows, we generate one distractor by keeping the same (clue, definition) and substituting a randomly sampled answer word from the pool of all known answer words (excluding the true answer).

Because random distractors are overwhelmingly unrelated to the definition (e.g., pairing the definition "flower" with the answer "parliament"), we expect classifiers to achieve very high accuracy on this dataset. The cosine similarity between definition and distractor answer embeddings should be much lower than for real pairs, making classification trivial.

This is intentional — the easy dataset serves as a sanity check baseline. It verifies that the feature engineering pipeline and modeling code work correctly before we move on to the harder dataset (Step 7), where distractors are chosen to be semantically similar and the classification task becomes genuinely informative about misdirection.

In [3]:
# ============================================================
# Generate Easy Distractors
# ============================================================
# For each real row, sample one random answer_wn from the pool of all
# known answer words, excluding the true answer for that row.

all_answer_words = df_real['answer_wn'].unique()
print(f'Pool of candidate answer words: {len(all_answer_words):,}')

# Pre-build a set for fast membership checks
all_answer_set = set(all_answer_words)

# Sample one distractor answer per row. We iterate rather than vectorize
# because each row must exclude its own true answer from the pool.
distractor_answers = []
true_answers = df_real['answer_wn'].values

for i in tqdm(range(len(df_real)), desc='Sampling distractors'):
    true_ans = true_answers[i]
    # Sample from the full pool; resample if we hit the true answer.
    # With ~45K candidates, collision probability is ~0.002%, so this
    # almost never loops more than once.
    while True:
        sampled = all_answer_words[rng.randint(len(all_answer_words))]
        if sampled != true_ans:
            break
    distractor_answers.append(sampled)

df_real_copy = df_real.copy()
df_real_copy['distractor_answer_wn'] = distractor_answers

print(f'\nGenerated {len(distractor_answers):,} distractor assignments')
print(f'  Unique distractor answers used: '
      f'{len(set(distractor_answers)):,}')

# Sanity check: no row's distractor is the same as its true answer
assert (df_real_copy['answer_wn'] != df_real_copy['distractor_answer_wn']).all(), \
    'Some distractors match the true answer!'
Pool of candidate answer words: 45,183
Sampling distractors:   0%|          | 0/240211 [00:00<?, ?it/s]
Generated 240,211 distractor assignments
  Unique distractor answers used: 44,974
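The per-row loop above is simple and correct, but slow in pure Python. For reference, a vectorized alternative (a sketch, not what this notebook uses — it draws in a different order, so it would not reproduce the seeded output above) samples all distractors at once and redraws only the rows that collide with their own true answer:

```python
import numpy as np

def sample_distractors(true_answers, pool, rng):
    """Sample one distractor per row from pool, vectorized.

    Rows whose draw collides with their own true answer are redrawn in
    bulk until none remain; with a ~45K-word pool this almost never
    needs more than one extra pass.
    """
    true_answers = np.asarray(true_answers)
    pool = np.asarray(pool)
    draws = pool[rng.randint(len(pool), size=len(true_answers))]
    collisions = draws == true_answers
    while collisions.any():
        draws[collisions] = pool[rng.randint(len(pool), size=collisions.sum())]
        collisions = draws == true_answers
    return draws

# Toy demonstration with hypothetical words
rng_demo = np.random.RandomState(0)
pool_demo = np.array(['alpha', 'beta', 'gamma', 'delta'])
truth = np.array(['alpha', 'alpha', 'beta'])
out = sample_distractors(truth, pool_demo, rng_demo)
```

No draw ever equals its row's true answer, matching the loop's exclusion guarantee.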

Compute Features for Easy Distractors¶

Each distractor row needs all 47 features recomputed for the new (definition, distractor_answer) pair:

  • Cosine features (21): We look up the definition's 3 pre-computed embeddings (allsense, common, obscure) and the distractor answer's 3 embeddings from the .npy arrays, plus the clue-context embedding for that row. The definition and clue-context embeddings are unchanged from the real row (same clue, same definition) — only the answer side changes.

  • Relationship features (22): Computed from WordNet using compute_relationship_features(definition_wn, distractor_answer_wn). These depend only on the word strings, not embeddings.

  • Surface features (4): Computed from the word strings using compute_surface_features(definition_wn, distractor_answer_wn).

All feature functions are imported from scripts/feature_utils.py (Decision 18), ensuring consistency with the NB 03 feature computation.

Efficiency optimization: Relationship and surface features depend only on the (definition_wn, distractor_answer_wn) pair — not on the clue. Many rows may share the same pair (e.g., if the same definition appears in multiple clues and gets the same distractor). We compute these features once per unique pair and merge back, following the same deduplication pattern used in NB 03.

In [4]:
# ============================================================
# Step 1: Compute relationship + surface features per unique pair
# ============================================================
# Deduplicate to unique (definition_wn, distractor_answer_wn) pairs first,
# then compute WordNet relationship and surface features once per pair.
# This avoids redundant WordNet lookups — the same optimization NB 03 uses.

unique_pairs = df_real_copy[['definition_wn', 'distractor_answer_wn']].drop_duplicates()
print(f'Unique (definition_wn, distractor_answer_wn) pairs: '
      f'{len(unique_pairs):,}')
print(f'  (vs {len(df_real_copy):,} total rows — '
      f'{len(df_real_copy) / len(unique_pairs):.1f}x dedup ratio)')

# Compute relationship features (22) and surface features (4) per pair
pair_features = []
for _, row in tqdm(unique_pairs.iterrows(), total=len(unique_pairs),
                   desc='Relationship + surface features'):
    def_w = row['definition_wn']
    ans_w = row['distractor_answer_wn']
    feats = compute_relationship_features(def_w, ans_w)
    feats.update(compute_surface_features(def_w, ans_w))
    feats['definition_wn'] = def_w
    feats['distractor_answer_wn'] = ans_w
    pair_features.append(feats)

pair_feat_df = pd.DataFrame(pair_features)
print(f'\nPair feature table: {pair_feat_df.shape}')

# ============================================================
# Step 2: Compute cosine features per row
# ============================================================
# Cosine features involve the clue-context embedding, which is row-specific
# (different clues have different context embeddings even for the same
# definition). So we compute these per row.

cosine_features_list = []

for idx in tqdm(range(len(df_real_copy)), desc='Cosine features'):
    row = df_real_copy.iloc[idx]
    def_wn = row['definition_wn']
    dist_ans_wn = row['distractor_answer_wn']

    # Look up definition embeddings (unchanged from real row)
    def_row_idx = def_word_to_idx[def_wn]
    def_embs = {
        'allsense': definition_embeddings[def_row_idx, 0, :],
        'common':   definition_embeddings[def_row_idx, 1, :],
        'obscure':  definition_embeddings[def_row_idx, 2, :],
    }

    # Look up distractor answer embeddings
    ans_row_idx = ans_word_to_idx[dist_ans_wn]
    ans_embs = {
        'allsense': answer_embeddings[ans_row_idx, 0, :],
        'common':   answer_embeddings[ans_row_idx, 1, :],
        'obscure':  answer_embeddings[ans_row_idx, 2, :],
    }

    # Look up clue-context embedding (unchanged — same clue, same definition)
    cc_key = (row['clue_id'], row['definition'])
    cc_pos = cc_lookup[cc_key]
    clue_emb = clue_context_embeddings[cc_pos, :]

    # Compute all 21 cosine features (15 context-free + 6 context-informed)
    cos_feats = compute_cosine_features_for_pair(
        def_embs, ans_embs, clue_emb=clue_emb)
    cosine_features_list.append(cos_feats)

cosine_feat_df = pd.DataFrame(cosine_features_list, index=df_real_copy.index)
print(f'\nCosine feature table: {cosine_feat_df.shape}')

# ============================================================
# Step 3: Merge all distractor features together
# ============================================================
# Drop the original 47 feature columns inherited from features_all.parquet.
# These are the real-pair features — we're replacing them with distractor-pair
# features. Without this, the merge below would produce _x/_y suffixed
# duplicates for the 26 relationship + surface columns that appear in both
# df_real_copy and pair_feat_df.
df_real_copy = df_real_copy.drop(
    columns=[c for c in ALL_FEATURE_COLS if c in df_real_copy.columns])

# Merge the per-pair relationship/surface features back to all rows
distractor_df = df_real_copy.merge(
    pair_feat_df,
    on=['definition_wn', 'distractor_answer_wn'],
    how='left'
)

# Add cosine features (already aligned by index)
for col in cosine_feat_df.columns:
    distractor_df[col] = cosine_feat_df[col].values

# ============================================================
# Validation: no NaN in any feature column
# ============================================================
nan_counts = distractor_df[ALL_FEATURE_COLS].isnull().sum()
nan_cols = nan_counts[nan_counts > 0]
assert len(nan_cols) == 0, (
    f'NaN values found in distractor feature columns — violates Decision 3:\n'
    f'{nan_cols}')

print(f'\nDistractor feature computation complete.')
print(f'  {len(distractor_df):,} distractor rows')
print(f'  {len(ALL_FEATURE_COLS)} features validated (no NaN)')
Unique (definition_wn, distractor_answer_wn) pairs: 239,972
  (vs 240,211 total rows — 1.0x dedup ratio)
Relationship + surface features:   0%|          | 0/239972 [00:00<?, ?it/s]
Pair feature table: (239972, 28)
Cosine features:   0%|          | 0/240211 [00:00<?, ?it/s]
Cosine feature table: (240211, 21)

Distractor feature computation complete.
  240,211 distractor rows
  47 features validated (no NaN)

Assemble Easy Dataset¶

We combine the real rows (label=1) and distractor rows (label=0) into a single balanced binary classification dataset. The dataset includes all 47 features, metadata columns for downstream analysis, and a label column for the classification target.

Distractor rows also carry a distractor_source column recording which answer word was substituted (real rows hold NaN there, which makes filtering straightforward). The answer_wn column in distractor rows is set to the distractor answer (not the original true answer), because this is the word whose features were actually computed.

In [5]:
# ============================================================
# Build real-pair rows (label = 1)
# ============================================================
real_rows = df_real[METADATA_COLS + ALL_FEATURE_COLS].copy()
real_rows['label'] = 1
real_rows['distractor_source'] = np.nan  # not a distractor

# ============================================================
# Build distractor rows (label = 0)
# ============================================================
# For distractor rows, answer_wn is the distractor answer (the word whose
# features were computed), and distractor_source records which word it is.
dist_rows = distractor_df[METADATA_COLS + ALL_FEATURE_COLS].copy()

# Overwrite answer_wn with the distractor answer and record the source
dist_rows['answer_wn'] = distractor_df['distractor_answer_wn'].values
dist_rows['label'] = 0
dist_rows['distractor_source'] = distractor_df['distractor_answer_wn'].values

# ============================================================
# Concatenate and shuffle
# ============================================================
dataset_easy = pd.concat([real_rows, dist_rows], ignore_index=True)
dataset_easy = dataset_easy.sample(
    frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# ============================================================
# Validation
# ============================================================
n_real = (dataset_easy['label'] == 1).sum()
n_dist = (dataset_easy['label'] == 0).sum()
assert n_real == n_dist, (
    f'Dataset is not balanced: {n_real:,} real vs {n_dist:,} distractor')
assert n_real == len(df_real), (
    f'Real row count mismatch: {n_real:,} vs {len(df_real):,}')

# Verify all 47 feature columns present and NaN-free
missing_cols = [c for c in ALL_FEATURE_COLS if c not in dataset_easy.columns]
assert not missing_cols, f'Missing feature columns: {missing_cols}'
assert not dataset_easy[ALL_FEATURE_COLS].isnull().any().any(), (
    'NaN values found in easy dataset features — violates Decision 3')

print(f'Easy dataset assembled:')
print(f'  Total rows:       {len(dataset_easy):,}')
print(f'  Real (label=1):   {n_real:,}')
print(f'  Distractor (0):   {n_dist:,}')
print(f'  Feature columns:  {len(ALL_FEATURE_COLS)}')
print(f'  Total columns:    {len(dataset_easy.columns)}')

# ============================================================
# Feature statistics summary (real vs distractor)
# ============================================================
print(f'\n--- Feature Statistics by Label ---')
print(f'{"Feature":<35s} {"Real mean":>10s} {"Real std":>10s}'
      f' {"Dist mean":>10s} {"Dist std":>10s}')
print(f'{"-"*35} {"-"*10} {"-"*10} {"-"*10} {"-"*10}')
for col in ALL_FEATURE_COLS[:10]:  # show first 10 for brevity
    r = dataset_easy.loc[dataset_easy['label'] == 1, col]
    d = dataset_easy.loc[dataset_easy['label'] == 0, col]
    print(f'{col:<35s} {r.mean():>10.4f} {r.std():>10.4f}'
          f' {d.mean():>10.4f} {d.std():>10.4f}')
print(f'  ... ({len(ALL_FEATURE_COLS) - 10} more features omitted)')

# ============================================================
# Save to parquet
# ============================================================
easy_path = DATA_DIR / 'dataset_easy.parquet'
dataset_easy.to_parquet(easy_path, index=False)
file_size_mb = easy_path.stat().st_size / (1024 * 1024)
print(f'\nSaved to {easy_path}')
print(f'File size: {file_size_mb:.1f} MB')

# Round-trip verification
reloaded = pd.read_parquet(easy_path)
assert reloaded.shape == dataset_easy.shape, (
    f'Shape mismatch after reload: {reloaded.shape} vs {dataset_easy.shape}')
print(f'Round-trip verification passed: '
      f'{reloaded.shape[0]:,} rows × {reloaded.shape[1]} columns')
Easy dataset assembled:
  Total rows:       480,422
  Real (label=1):   240,211
  Distractor (0):   240,211
  Feature columns:  47
  Total columns:    62

--- Feature Statistics by Label ---
Feature                              Real mean   Real std  Dist mean   Dist std
----------------------------------- ---------- ---------- ---------- ----------
cos_w1all_w1common                      0.8638     0.1257     0.8638     0.1257
cos_w1all_w1obscure                     0.8636     0.1168     0.8636     0.1168
cos_w1all_w2all                         0.6482     0.1366     0.4304     0.1189
cos_w1all_w2common                      0.5954     0.1575     0.4045     0.1175
cos_w1all_w2obscure                     0.5940     0.1545     0.4061     0.1179
cos_w1common_w1obscure                  0.6891     0.2331     0.6891     0.2331
cos_w1common_w2all                      0.5686     0.1710     0.3640     0.1194
cos_w1common_w2common                   0.5311     0.1871     0.3435     0.1184
cos_w1common_w2obscure                  0.5171     0.1867     0.3426     0.1192
cos_w1obscure_w2all                     0.5593     0.1640     0.3755     0.1207
  ... (37 more features omitted)

Saved to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/dataset_easy.parquet
File size: 103.1 MB
Round-trip verification passed: 480,422 rows × 62 columns

Easy Dataset Validation¶

Before moving on to the harder dataset, we verify that the easy dataset has the expected properties: balanced labels, no missing features, and — most importantly — a clear separation between real and distractor pairs on key features. If random distractors don't look obviously different from real pairs, something is wrong with the feature computation.

In [6]:
# ============================================================
# Basic Statistics
# ============================================================
print('=== Easy Dataset Summary ===')
print(f'Total rows:                    {len(dataset_easy):,}')
print(f'Label=1 (real):                {(dataset_easy["label"] == 1).sum():,}')
print(f'Label=0 (distractor):          {(dataset_easy["label"] == 0).sum():,}')
print(f'Unique definition_wn values:   '
      f'{dataset_easy["definition_wn"].nunique():,}')
print(f'Unique answer_wn values:       '
      f'{dataset_easy["answer_wn"].nunique():,}')
print(f'Unique def_answer_pair_id:     '
      f'{dataset_easy["def_answer_pair_id"].nunique():,}')

# ============================================================
# Key Feature Comparison: Real vs Distractor
# ============================================================
# We expect real pairs to have significantly higher cosine similarity
# and WordNet path similarity than random distractors. This confirms
# the "easy" nature of the dataset.

validation_features = [
    'cos_w1all_w2all',       # primary cross-word cosine similarity
    'cos_w1common_w2common', # common-sense cosine
    'wn_max_path_sim',       # WordNet structural similarity
    'wn_shared_synset_count',# direct synonym overlap
    'surface_edit_distance', # orthographic similarity
]

real_mask = dataset_easy['label'] == 1
dist_mask = dataset_easy['label'] == 0

print(f'\n{"Feature":<30s} {"Real":>18s}  {"Distractor":>18s}  {"Gap":>8s}')
print(f'{"-"*30} {"-"*18}  {"-"*18}  {"-"*8}')

for feat in validation_features:
    r_mean = dataset_easy.loc[real_mask, feat].mean()
    r_std  = dataset_easy.loc[real_mask, feat].std()
    d_mean = dataset_easy.loc[dist_mask, feat].mean()
    d_std  = dataset_easy.loc[dist_mask, feat].std()
    gap = r_mean - d_mean
    print(f'{feat:<30s} {r_mean:>7.4f} ± {r_std:<7.4f}  '
          f'{d_mean:>7.4f} ± {d_std:<7.4f}  {gap:>+7.4f}')

# ============================================================
# Confirm distractors are substantially different
# ============================================================
# Real pairs should have notably higher cos_w1all_w2all than distractors.
# If the gap is near zero, feature computation may be incorrect.
cos_gap = (dataset_easy.loc[real_mask, 'cos_w1all_w2all'].mean()
           - dataset_easy.loc[dist_mask, 'cos_w1all_w2all'].mean())
print(f'\ncos_w1all_w2all gap (real - distractor): {cos_gap:+.4f}')
assert cos_gap > 0.05, (
    f'Cosine gap is suspiciously small ({cos_gap:.4f}). '
    f'Random distractors should be much less similar than real pairs.')
print('Validation passed: distractors are substantially less similar '
      'than real pairs, as expected for the easy dataset.')
=== Easy Dataset Summary ===
Total rows:                    480,422
Label=1 (real):                240,211
Label=0 (distractor):          240,211
Unique definition_wn values:   27,356
Unique answer_wn values:       45,183
Unique def_answer_pair_id:     128,961

Feature                                      Real          Distractor       Gap
------------------------------ ------------------  ------------------  --------
cos_w1all_w2all                 0.6482 ± 0.1366    0.4304 ± 0.1189   +0.2178
cos_w1common_w2common           0.5311 ± 0.1871    0.3435 ± 0.1184   +0.1876
wn_max_path_sim                 0.4352 ± 0.2902    0.1477 ± 0.0836   +0.2876
wn_shared_synset_count          0.2081 ± 0.5066    0.0008 ± 0.0617   +0.2073
surface_edit_distance           6.7299 ± 2.2153    7.6067 ± 2.2683   -0.8768

cos_w1all_w2all gap (real - distractor): +0.2178
Validation passed: distractors are substantially less similar than real pairs, as expected for the easy dataset.

Step 7: Harder Dataset Construction¶

The easy dataset uses random distractors, which are trivially separable from real pairs (as confirmed above). The harder dataset tests whether a classifier can still distinguish real from distractor pairs when distractors are selected to be semantically similar to the definition.

Construction (Design Doc Section 7.2, Decision 6): For each real row's definition, we rank all candidate answer words by cosine similarity between the definition's allsense embedding (Word1_allsense, no clue context) and each answer's allsense embedding (Word2_allsense). Distractors are sampled from the top-k most similar answer words, excluding the true answer.

Since real definition–answer pairs average ~0.65 cosine similarity (cos_w1all_w2all), choosing distractors from the top-k ensures they fall in a similar range — making the task genuinely harder than random selection.
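The ranking step described above can be sketched with synthetic embeddings. This is illustrative only — all names here are made up, the real arrays are ~27K × 1024 and ~45K × 1024, and the notebook's actual chunked implementation appears later in this section:

```python
import numpy as np

rng_demo = np.random.RandomState(0)

# Synthetic stand-ins for the definition/answer allsense embedding matrices.
n_defs, n_answers, dim, k = 5, 100, 16, 10
def_vecs = rng_demo.randn(n_defs, dim).astype(np.float32)
ans_vecs = rng_demo.randn(n_answers, dim).astype(np.float32)

# L2-normalize so a plain dot product equals cosine similarity.
def_vecs /= np.linalg.norm(def_vecs, axis=1, keepdims=True)
ans_vecs /= np.linalg.norm(ans_vecs, axis=1, keepdims=True)

sims = def_vecs @ ans_vecs.T                      # (n_defs, n_answers)

# argpartition isolates the k largest columns per row in O(n); sorting
# just those k columns then orders them by descending similarity.
part = np.argpartition(-sims, k, axis=1)[:, :k]
order = np.argsort(-np.take_along_axis(sims, part, axis=1), axis=1)
top_k_idx = np.take_along_axis(part, order, axis=1)
top_k_sim = np.take_along_axis(sims, top_k_idx, axis=1)
```

In the real pipeline, each row's true answer must then be excluded from its candidate list before a distractor is sampled, and k controls how hard the resulting dataset is.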

Critical consequence: Because distractors are selected by cosine similarity between context-free allsense embeddings, the 15 context-free meaning features are artifacts of dataset construction — they directly encode the selection criterion. Including them would let the classifier exploit the construction method rather than learning meaningful patterns. Per Decision 6, these 15 features are removed from the harder dataset.

Remaining features (32):

  • 6 context-informed meaning features (cos_w1clue_*) — these involve the clue-context embedding, which was NOT used in distractor selection
  • 22 WordNet relationship features — structural, not embedding-based
  • 4 surface features — orthographic similarity

This is the feature set for Experiment 2A. Experiment 2B further removes the 6 context-informed features, leaving 26 features (relationship + surface only).

Compute Definition–Answer Allsense Similarity¶

To select harder distractors, we need the cosine similarity between each unique definition's allsense embedding and every unique answer's allsense embedding. This lets us rank all candidate answer words for each definition by their semantic similarity.

Rather than materializing the full ~27K × ~45K similarity matrix (~5 GB in float32), we process definitions in batches and store only the top-K most similar answers per definition. This keeps memory usage manageable while giving us enough candidates to tune the difficulty level.
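The memory trade-off can be checked with quick arithmetic, and the per-row top-K trick demonstrated on a random vector (the counts here are the run's own n_def/n_ans/MAX_K values; the similarity vector is synthetic):

```python
import numpy as np

# Full-matrix footprint: n_def x n_ans float32 values, 4 bytes each
n_def, n_ans, max_k = 27_385, 45_254, 500
full_matrix_gb = n_def * n_ans * 4 / 1024**3
top_k_mb = n_def * max_k * 4 / 1024**2      # one float32 (or int32) array
print(f'Full matrix: {full_matrix_gb:.1f} GB; top-K store: {top_k_mb:.1f} MB per array')

# Per-row top-K: argpartition is O(n); only the K survivors get fully sorted
sims_row = np.random.default_rng(1).random(n_ans).astype(np.float32)
top = np.argpartition(sims_row, -max_k)[-max_k:]   # unordered top-K indices
top = top[np.argsort(sims_row[top])[::-1]]         # now descending by similarity
assert sims_row[top[0]] == sims_row.max()
```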

In [7]:
# ============================================================
# Compute top-K most similar answers for each unique definition
# ============================================================
# We use allsense embeddings (slot [0]) for both definitions and answers,
# matching the "Word1_allsense vs Word2_allsense" specification in Step 7.
# Cosine similarity is computed as a dot product of L2-normalized vectors.

# --- Extract and normalize allsense embeddings ---
def_allsense = definition_embeddings[:, 0, :].astype(np.float32)
ans_allsense = answer_embeddings[:, 0, :].astype(np.float32)

def_norms = np.linalg.norm(def_allsense, axis=1, keepdims=True)
ans_norms = np.linalg.norm(ans_allsense, axis=1, keepdims=True)
def_normed = def_allsense / def_norms
ans_normed = ans_allsense / ans_norms

# --- Map from answer_index row positions to answer_wn strings ---
ans_words_arr = answer_index['word'].values
def_words_arr = definition_index['word'].values

# --- Compute top-K candidates per definition in chunks ---
# We store a generous upper bound (MAX_K=500) so we can tune k downward.
MAX_K = 500

top_k_indices_arr = np.zeros((n_def, MAX_K), dtype=np.int32)
top_k_sims_arr = np.zeros((n_def, MAX_K), dtype=np.float32)

CHUNK_SIZE = 500
n_chunks = (n_def + CHUNK_SIZE - 1) // CHUNK_SIZE

for chunk_start in tqdm(range(0, n_def, CHUNK_SIZE), total=n_chunks,
                        desc='Computing similarity (chunked)'):
    chunk_end = min(chunk_start + CHUNK_SIZE, n_def)
    # Cosine similarity between this chunk of definitions and all answers
    chunk_sims = def_normed[chunk_start:chunk_end] @ ans_normed.T

    for i in range(chunk_end - chunk_start):
        row_sims = chunk_sims[i]
        # argpartition is O(n), much faster than full sort for 45K candidates
        top_idx = np.argpartition(row_sims, -MAX_K)[-MAX_K:]
        sorted_order = np.argsort(row_sims[top_idx])[::-1]
        top_k_indices_arr[chunk_start + i] = top_idx[sorted_order]
        top_k_sims_arr[chunk_start + i] = row_sims[top_idx[sorted_order]]

# --- Build lookup: definition_wn -> row position in top_k arrays ---
def_word_to_topk_row = {w: i for i, w in enumerate(def_words_arr)}

print(f'Similarity computation complete:')
print(f'  Definitions processed: {n_def:,}')
print(f'  Answer candidates:     {n_ans:,}')
print(f'  Top-K stored per def:  {MAX_K}')
print(f'  Equivalent full matrix shape: ({n_def:,}, {n_ans:,})')
print(f'  Storage: {top_k_indices_arr.nbytes / 1024**2:.1f} MB (indices) + '
      f'{top_k_sims_arr.nbytes / 1024**2:.1f} MB (similarities)')

# --- Summary of similarity ranges ---
real_cos_mean = df_real['cos_w1all_w2all'].mean()
print(f'\nSimilarity ranges across definitions (mean of each rank):')
for rank in [1, 10, 50, 100, 200, 500]:
    idx = min(rank - 1, MAX_K - 1)
    mean_val = top_k_sims_arr[:, idx].mean()
    med_val = np.median(top_k_sims_arr[:, idx])
    print(f'  Rank {rank:>4d}: mean={mean_val:.4f}, median={med_val:.4f}')
print(f'\nReal pairs mean cos_w1all_w2all: {real_cos_mean:.4f}')
Similarity computation complete:
  Definitions processed: 27,385
  Answer candidates:     45,254
  Top-K stored per def:  500
  Equivalent full matrix shape: (27,385, 45,254)
  Storage: 52.2 MB (indices) + 52.2 MB (similarities)

Similarity ranges across definitions (mean of each rank):
  Rank    1: mean=0.9613, median=1.0000
  Rank   10: mean=0.8178, median=0.8251
  Rank   50: mean=0.7735, median=0.7825
  Rank  100: mean=0.7523, median=0.7618
  Rank  200: mean=0.7291, median=0.7393
  Rank  500: mean=0.6943, median=0.7049

Real pairs mean cos_w1all_w2all: 0.6482

Select Harder Distractors¶

For each real row's definition, we sample one distractor from the top-k most similar answer words (excluding the true answer). The choice of k controls the difficulty: smaller k means more similar distractors (harder task), larger k allows more diversity.

We first explore a few k values to estimate the resulting distractor similarity, then finalize with the chosen k.

In [8]:
# ============================================================
# Explore different k values
# ============================================================
# For each k, compute the expected mean similarity of a uniformly-sampled
# distractor from the top-k pool. This helps us choose k so that harder
# distractors are meaningfully closer to the definition than random ones.

k_candidates = [20, 50, 100, 200, 500]

print(f'Expected mean distractor similarity by k:')
print(f'  {"k":>6s}  {"Mean(top-k)":>12s}  {"Median(top-k)":>14s}  '
      f'{"Floor (rank k)":>14s}')
print(f'  {"-"*6}  {"-"*12}  {"-"*14}  {"-"*14}')

for k in k_candidates:
    # Mean similarity across all definitions, averaged over the top-k ranks
    mean_topk = top_k_sims_arr[:, :k].mean()
    median_topk = np.median(top_k_sims_arr[:, :k].mean(axis=1))
    floor_val = top_k_sims_arr[:, k-1].mean()
    print(f'  {k:>6d}  {mean_topk:>12.4f}  {median_topk:>14.4f}  '
          f'{floor_val:>14.4f}')

real_cos_mean = df_real['cos_w1all_w2all'].mean()
easy_dist_cos_mean = dataset_easy.loc[
    dataset_easy['label'] == 0, 'cos_w1all_w2all'].mean()
print(f'\nReference points:')
print(f'  Real pairs mean cos_w1all_w2all:          {real_cos_mean:.4f}')
print(f'  Easy (random) distractor mean:            {easy_dist_cos_mean:.4f}')

# ============================================================
# Choose k and sample distractors
# ============================================================
# k=100 gives distractors from the top ~0.2% of the answer pool for each
# definition. These are semantically plausible alternatives that should
# challenge the classifier without being indistinguishable from real pairs.

TOP_K_FINAL = 100
print(f'\n--- Selected TOP_K = {TOP_K_FINAL} ---')

harder_distractor_answers = []
harder_distractor_sims = []
true_answers_arr = df_real['answer_wn'].values
def_wns_arr = df_real['definition_wn'].values

for i in tqdm(range(len(df_real)), desc='Sampling harder distractors'):
    def_wn = def_wns_arr[i]
    true_ans = true_answers_arr[i]

    topk_row = def_word_to_topk_row[def_wn]
    candidate_indices = top_k_indices_arr[topk_row, :TOP_K_FINAL]
    candidate_sims = top_k_sims_arr[topk_row, :TOP_K_FINAL]
    candidate_words = ans_words_arr[candidate_indices]

    # Exclude the true answer from candidates
    valid_mask = candidate_words != true_ans
    valid_words = candidate_words[valid_mask]
    valid_sims = candidate_sims[valid_mask]

    if len(valid_words) == 0:
        # Extremely unlikely fallback: true answer is the only top-K candidate.
        # Fall back to random selection.
        while True:
            sampled = all_answer_words[rng.randint(len(all_answer_words))]
            if sampled != true_ans:
                break
        harder_distractor_answers.append(sampled)
        harder_distractor_sims.append(np.nan)
    else:
        idx = rng.randint(len(valid_words))
        harder_distractor_answers.append(valid_words[idx])
        harder_distractor_sims.append(float(valid_sims[idx]))

harder_distractor_answers = np.array(harder_distractor_answers)
harder_distractor_sims = np.array(harder_distractor_sims)

# ============================================================
# Report statistics
# ============================================================
print(f'\nHarder distractor statistics (k={TOP_K_FINAL}):')
print(f'  Count:              {len(harder_distractor_answers):,}')
print(f'  Unique distractors: {len(set(harder_distractor_answers)):,}')
print(f'  Mean cos sim:       {np.nanmean(harder_distractor_sims):.4f}')
print(f'  Median cos sim:     {np.nanmedian(harder_distractor_sims):.4f}')
print(f'  Std cos sim:        {np.nanstd(harder_distractor_sims):.4f}')
print(f'  NaN (fallback):     {np.isnan(harder_distractor_sims).sum()}')

print(f'\nComparison:')
print(f'  Real pairs mean:         {real_cos_mean:.4f}')
print(f'  Harder distractor mean:  {np.nanmean(harder_distractor_sims):.4f}')
print(f'  Easy distractor mean:    {easy_dist_cos_mean:.4f}')
print(f'  Gap (real - harder):     '
      f'{real_cos_mean - np.nanmean(harder_distractor_sims):+.4f}')
print(f'  Gap (real - easy):       '
      f'{real_cos_mean - easy_dist_cos_mean:+.4f}')

# Sanity: no distractor matches the true answer
assert (harder_distractor_answers != true_answers_arr).all(), \
    'Some harder distractors match the true answer!'
Expected mean distractor similarity by k:
       k   Mean(top-k)   Median(top-k)  Floor (rank k)
  ------  ------------  --------------  --------------
      20        0.8281          0.8348          0.7991
      50        0.8017          0.8094          0.7735
     100        0.7817          0.7902          0.7523
     200        0.7606          0.7694          0.7291
     500        0.7299          0.7399          0.6943

Reference points:
  Real pairs mean cos_w1all_w2all:          0.6482
  Easy (random) distractor mean:            0.4304

--- Selected TOP_K = 100 ---
Harder distractor statistics (k=100):
  Count:              240,211
  Unique distractors: 37,829
  Mean cos sim:       0.7703
  Median cos sim:     0.7749
  Std cos sim:        0.0682
  NaN (fallback):     0

Comparison:
  Real pairs mean:         0.6482
  Harder distractor mean:  0.7703
  Easy distractor mean:    0.4304
  Gap (real - harder):     -0.1221
  Gap (real - easy):       +0.2178

Compute Features for Harder Distractors¶

The harder distractors need the same 47 features as the easy distractors — we compute all of them now and defer the removal of the 15 context-free features to the assembly step. This keeps the computation logic identical between the two datasets and makes it easy to verify feature consistency.

The computation follows the same two-stage approach used for easy distractors (see "Compute Features for Easy Distractors" above):

  1. Relationship + surface features are computed per unique (definition, distractor_answer) pair, then broadcast to all rows sharing that pair. This deduplication avoids redundant WordNet lookups.
  2. Cosine features are computed per row, because the clue-context embedding varies across clues even when the definition word is the same.
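The dedup-then-broadcast pattern in stage 1 can be sketched with a toy frame (the column names match the real pipeline; the pair values are stand-ins, not real WordNet lookups):

```python
import pandas as pd

# Toy rows: two rows share the same (definition, distractor) pair
rows = pd.DataFrame({
    'definition_wn': ['nimble', 'nimble', 'talent'],
    'distractor_answer_wn': ['grains', 'grains', 'bravery'],
})

# Stage 1: compute the expensive pair-level feature once per unique pair...
pairs = rows[['definition_wn', 'distractor_answer_wn']].drop_duplicates()
pairs['wn_max_path_sim'] = [0.2, 0.091]   # stand-in values

# ...then broadcast back to every row sharing that pair via a left merge
out = rows.merge(pairs, on=['definition_wn', 'distractor_answer_wn'], how='left')
print(out)
```

A left merge preserves the row order of the left frame, which is why the per-row cosine features can later be attached positionally.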
In [9]:
# ============================================================
# Compute Features for Harder Distractors
# ============================================================
# Same approach as the easy distractor feature computation above:
# relationship + surface features per unique pair (deduplicated),
# cosine features per row (because clue-context embedding is row-specific).
# We compute all 47 features first, then drop the 15 context-free columns
# in the assembly step.

df_harder_copy = df_real.copy()
df_harder_copy['distractor_answer_wn'] = harder_distractor_answers
df_harder_copy['distractor_sim'] = harder_distractor_sims

# ============================================================
# Step 1: Relationship + surface features per unique pair
# ============================================================
unique_harder_pairs = df_harder_copy[
    ['definition_wn', 'distractor_answer_wn']].drop_duplicates()
print(f'Unique (definition_wn, distractor_answer_wn) pairs: '
      f'{len(unique_harder_pairs):,}')
print(f'  (vs {len(df_harder_copy):,} total rows — '
      f'{len(df_harder_copy) / len(unique_harder_pairs):.1f}x dedup ratio)')

harder_pair_features = []
for _, row in tqdm(unique_harder_pairs.iterrows(),
                   total=len(unique_harder_pairs),
                   desc='Harder: relationship + surface features'):
    def_w = row['definition_wn']
    ans_w = row['distractor_answer_wn']
    feats = compute_relationship_features(def_w, ans_w)
    feats.update(compute_surface_features(def_w, ans_w))
    feats['definition_wn'] = def_w
    feats['distractor_answer_wn'] = ans_w
    harder_pair_features.append(feats)

harder_pair_feat_df = pd.DataFrame(harder_pair_features)
print(f'\nPair feature table: {harder_pair_feat_df.shape}')

# ============================================================
# Step 2: Cosine features per row
# ============================================================
harder_cosine_features_list = []

for idx in tqdm(range(len(df_harder_copy)),
                desc='Harder: cosine features'):
    row = df_harder_copy.iloc[idx]
    def_wn = row['definition_wn']
    dist_ans_wn = row['distractor_answer_wn']

    # Definition embeddings (unchanged from real row)
    def_row_idx = def_word_to_idx[def_wn]
    def_embs = {
        'allsense': definition_embeddings[def_row_idx, 0, :],
        'common':   definition_embeddings[def_row_idx, 1, :],
        'obscure':  definition_embeddings[def_row_idx, 2, :],
    }

    # Distractor answer embeddings
    ans_row_idx = ans_word_to_idx[dist_ans_wn]
    ans_embs = {
        'allsense': answer_embeddings[ans_row_idx, 0, :],
        'common':   answer_embeddings[ans_row_idx, 1, :],
        'obscure':  answer_embeddings[ans_row_idx, 2, :],
    }

    # Clue-context embedding (unchanged — same clue, same definition)
    cc_key = (row['clue_id'], row['definition'])
    cc_pos = cc_lookup[cc_key]
    clue_emb = clue_context_embeddings[cc_pos, :]

    # Compute all 21 cosine features (15 context-free + 6 context-informed)
    cos_feats = compute_cosine_features_for_pair(
        def_embs, ans_embs, clue_emb=clue_emb)
    harder_cosine_features_list.append(cos_feats)

harder_cosine_feat_df = pd.DataFrame(
    harder_cosine_features_list, index=df_harder_copy.index)
print(f'\nCosine feature table: {harder_cosine_feat_df.shape}')

# ============================================================
# Step 3: Merge all harder distractor features
# ============================================================
# Drop original feature columns before merging distractor features
df_harder_copy = df_harder_copy.drop(
    columns=[c for c in ALL_FEATURE_COLS if c in df_harder_copy.columns])

harder_distractor_df = df_harder_copy.merge(
    harder_pair_feat_df,
    on=['definition_wn', 'distractor_answer_wn'],
    how='left'
)

# Add cosine features (aligned by index)
for col in harder_cosine_feat_df.columns:
    harder_distractor_df[col] = harder_cosine_feat_df[col].values

# ============================================================
# Validate: no NaN in any of the 47 feature columns
# ============================================================
nan_counts = harder_distractor_df[ALL_FEATURE_COLS].isnull().sum()
nan_cols = nan_counts[nan_counts > 0]
assert len(nan_cols) == 0, (
    f'NaN found in harder distractor features:\n{nan_cols}')

print(f'\nHarder distractor feature computation complete.')
print(f'  {len(harder_distractor_df):,} distractor rows')
print(f'  {len(ALL_FEATURE_COLS)} features computed (no NaN)')
print(f'  15 context-free features will be dropped in assembly step')
Unique (definition_wn, distractor_answer_wn) pairs: 186,068
  (vs 240,211 total rows — 1.3x dedup ratio)
Pair feature table: (186068, 28)
Cosine feature table: (240211, 21)

Harder distractor feature computation complete.
  240,211 distractor rows
  47 features computed (no NaN)
  15 context-free features will be dropped in assembly step

Assemble Harder Dataset¶

Per Decision 6, the 15 context-free meaning features must be removed from the harder dataset. These features are direct cosine similarities between context-free definition and answer embeddings — the same metric used to select the harder distractors. Including them would let the classifier trivially exploit the construction criterion.

Features removed (15 context-free meaning): All pairwise cosine similarities among w1all, w1common, w1obscure, w2all, w2common, w2obscure — i.e., every cos_* column that does NOT involve w1clue.

Features retained (32):

  • 6 context-informed: cos_w1clue_w1all, cos_w1clue_w1common, cos_w1clue_w1obscure, cos_w1clue_w2all, cos_w1clue_w2common, cos_w1clue_w2obscure
  • 22 relationship: 20 boolean WordNet types + wn_max_path_sim + wn_shared_synset_count
  • 4 surface: surface_edit_distance, surface_length_ratio, surface_shared_first_letter, surface_char_overlap_ratio
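The 15/6 split follows directly from the embedding slots: 15 is the number of unordered pairs among the six context-free embeddings, and 6 is w1clue paired with each of them (21 cosine features in total). A quick check, assuming the column-naming convention used above:

```python
from itertools import combinations

context_free_slots = ['w1all', 'w1common', 'w1obscure',
                      'w2all', 'w2common', 'w2obscure']

# All pairwise similarities among context-free embeddings: C(6, 2) = 15
context_free = [f'cos_{a}_{b}' for a, b in combinations(context_free_slots, 2)]
# w1clue against each of the six: 6 context-informed features
context_informed = [f'cos_w1clue_{s}' for s in context_free_slots]

print(len(context_free), len(context_informed))   # 15 6
assert len(context_free) + len(context_informed) == 21
```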
In [10]:
# ============================================================
# Define the 32 retained feature columns
# ============================================================
HARDER_FEATURE_COLS = (
    CONTEXT_INFORMED_COLS + RELATIONSHIP_COLS + SURFACE_COLS
)
assert len(HARDER_FEATURE_COLS) == 32, (
    f'Expected 32 harder features, got {len(HARDER_FEATURE_COLS)}')

print(f'Feature groups in harder dataset:')
print(f'  Context-informed: {len(CONTEXT_INFORMED_COLS)}')
print(f'  Relationship:     {len(RELATIONSHIP_COLS)}')
print(f'  Surface:          {len(SURFACE_COLS)}')
print(f'  TOTAL:            {len(HARDER_FEATURE_COLS)}')
print(f'  Removed:          {len(CONTEXT_FREE_COLS)} context-free')

# ============================================================
# Build real-pair rows (label = 1) — drop context-free features
# ============================================================
real_harder = df_real[METADATA_COLS + HARDER_FEATURE_COLS].copy()
real_harder['label'] = 1
real_harder['distractor_source'] = np.nan

# ============================================================
# Build distractor rows (label = 0) — drop context-free features
# ============================================================
dist_harder = harder_distractor_df[
    METADATA_COLS + HARDER_FEATURE_COLS].copy()
# Overwrite answer_wn with the distractor answer
dist_harder['answer_wn'] = harder_distractor_df['distractor_answer_wn'].values
dist_harder['label'] = 0
dist_harder['distractor_source'] = (
    harder_distractor_df['distractor_answer_wn'].values)

# ============================================================
# Concatenate and shuffle
# ============================================================
dataset_harder = pd.concat([real_harder, dist_harder], ignore_index=True)
dataset_harder = dataset_harder.sample(
    frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# ============================================================
# Validation
# ============================================================
n_real_h = (dataset_harder['label'] == 1).sum()
n_dist_h = (dataset_harder['label'] == 0).sum()
assert n_real_h == n_dist_h, (
    f'Dataset not balanced: {n_real_h:,} real vs {n_dist_h:,} distractor')
assert n_real_h == len(df_real), (
    f'Real row count mismatch: {n_real_h:,} vs {len(df_real):,}')

# Verify exactly 32 feature columns, no context-free columns present
feature_cols_present = [c for c in dataset_harder.columns
                        if c in ALL_FEATURE_COLS]
assert len(feature_cols_present) == 32, (
    f'Expected 32 features, found {len(feature_cols_present)}')
context_free_present = [c for c in CONTEXT_FREE_COLS
                        if c in dataset_harder.columns]
assert not context_free_present, (
    f'Context-free features should be removed: {context_free_present}')

# No NaN in retained features
assert not dataset_harder[HARDER_FEATURE_COLS].isnull().any().any(), (
    'NaN values found in harder dataset features')

print(f'\nHarder dataset assembled:')
print(f'  Total rows:       {len(dataset_harder):,}')
print(f'  Real (label=1):   {n_real_h:,}')
print(f'  Distractor (0):   {n_dist_h:,}')
print(f'  Feature columns:  {len(HARDER_FEATURE_COLS)}')
print(f'  Total columns:    {len(dataset_harder.columns)}')

# ============================================================
# Save to parquet
# ============================================================
harder_path = DATA_DIR / 'dataset_harder.parquet'
dataset_harder.to_parquet(harder_path, index=False)
file_size_mb = harder_path.stat().st_size / (1024 * 1024)
print(f'\nSaved to {harder_path}')
print(f'File size: {file_size_mb:.1f} MB')

# Round-trip verification
reloaded_h = pd.read_parquet(harder_path)
assert reloaded_h.shape == dataset_harder.shape, (
    f'Shape mismatch: {reloaded_h.shape} vs {dataset_harder.shape}')
print(f'Round-trip verification passed: '
      f'{reloaded_h.shape[0]:,} rows x {reloaded_h.shape[1]} columns')
Feature groups in harder dataset:
  Context-informed: 6
  Relationship:     22
  Surface:          4
  TOTAL:            32
  Removed:          15 context-free

Harder dataset assembled:
  Total rows:       480,422
  Real (label=1):   240,211
  Distractor (0):   240,211
  Feature columns:  32
  Total columns:    47

Saved to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/dataset_harder.parquet
File size: 71.2 MB
Round-trip verification passed: 480,422 rows x 47 columns

Harder Dataset Validation¶

We now compare the easy and harder datasets to verify that the cosine-similarity-based distractor selection genuinely makes the classification task harder. Two key diagnostic features help us assess this:

  1. cos_w1clue_w2all (context-informed cosine similarity) — this feature was not used to select distractors, so it provides an independent signal. If harder distractors are truly more confusing, the gap between real and distractor values on this feature should shrink or even flip compared to the easy dataset.
  2. wn_max_path_sim (WordNet max path similarity) — measures how closely the definition and answer are related in WordNet's taxonomy. The gap should narrow for harder distractors because cosine-similar words tend to share WordNet neighborhoods.

We also show example pairs for qualitative spot-checking and a side-by-side summary table comparing the two datasets.

In [11]:
# ============================================================
# Harder Dataset Validation
# ============================================================
# Verify that the harder distractors are genuinely harder to distinguish
# from real pairs compared to the easy dataset's random distractors.

print('=== Harder Dataset Validation ===\n')

real_h_mask = dataset_harder['label'] == 1
dist_h_mask = dataset_harder['label'] == 0

# ============================================================
# 1. Context-informed cosine similarity (retained feature)
# ============================================================
# cos_w1clue_w2all measures how similar the definition (in clue context)
# is to the answer's allsense embedding. This is a context-informed
# feature — NOT used in distractor selection — so it provides an
# independent signal. The gap between real and distractor rows should
# be *smaller* than in the easy dataset, confirming the harder dataset
# is genuinely more challenging.

feat = 'cos_w1clue_w2all'
real_vals = dataset_harder.loc[real_h_mask, feat]
dist_vals = dataset_harder.loc[dist_h_mask, feat]
easy_real_vals = dataset_easy.loc[dataset_easy['label'] == 1, feat]
easy_dist_vals = dataset_easy.loc[dataset_easy['label'] == 0, feat]

print(f'cos_w1clue_w2all (context-informed, retained):')
print(f'  Easy dataset:    real={easy_real_vals.mean():.4f} +/- {easy_real_vals.std():.4f}  '
      f'dist={easy_dist_vals.mean():.4f} +/- {easy_dist_vals.std():.4f}  '
      f'gap={easy_real_vals.mean() - easy_dist_vals.mean():+.4f}')
print(f'  Harder dataset:  real={real_vals.mean():.4f} +/- {real_vals.std():.4f}  '
      f'dist={dist_vals.mean():.4f} +/- {dist_vals.std():.4f}  '
      f'gap={real_vals.mean() - dist_vals.mean():+.4f}')

# ============================================================
# 2. WordNet max path similarity (retained feature)
# ============================================================
feat2 = 'wn_max_path_sim'
real_path = dataset_harder.loc[real_h_mask, feat2]
dist_path = dataset_harder.loc[dist_h_mask, feat2]
easy_real_path = dataset_easy.loc[dataset_easy['label'] == 1, feat2]
easy_dist_path = dataset_easy.loc[dataset_easy['label'] == 0, feat2]

print(f'\nwn_max_path_sim (relationship, retained):')
print(f'  Easy dataset:    real={easy_real_path.mean():.4f} +/- {easy_real_path.std():.4f}  '
      f'dist={easy_dist_path.mean():.4f} +/- {easy_dist_path.std():.4f}  '
      f'gap={easy_real_path.mean() - easy_dist_path.mean():+.4f}')
print(f'  Harder dataset:  real={real_path.mean():.4f} +/- {real_path.std():.4f}  '
      f'dist={dist_path.mean():.4f} +/- {dist_path.std():.4f}  '
      f'gap={real_path.mean() - dist_path.mean():+.4f}')

# ============================================================
# 3. Example distractor pairs
# ============================================================
# Show a few examples to spot-check plausibility. Good harder distractors
# should be semantically related to the definition (not random words).

print(f'\n--- Example Harder Distractor Pairs ---')
print(f'{"Definition":<20s} {"True Answer":<15s} {"Distractor":<15s} '
      f'{"Cos Sim":>7s}  {"PathSim(real)":>13s}  {"PathSim(dist)":>13s}')
print(f'{"-"*20} {"-"*15} {"-"*15} {"-"*7}  {"-"*13}  {"-"*13}')

example_indices = rng.choice(len(df_real), size=10, replace=False)
for ex_idx in example_indices:
    row = df_real.iloc[ex_idx]
    def_wn = row['definition_wn']
    true_ans = row['answer_wn']
    dist_ans = harder_distractor_answers[ex_idx]
    dist_sim = harder_distractor_sims[ex_idx]

    real_path_val = row['wn_max_path_sim']
    dist_feats = compute_relationship_features(def_wn, dist_ans)
    dist_path_val = dist_feats['wn_max_path_sim']

    print(f'{def_wn:<20s} {true_ans:<15s} {dist_ans:<15s} '
          f'{dist_sim:>7.3f}  {real_path_val:>13.3f}  {dist_path_val:>13.3f}')

# ============================================================
# 4. Summary comparison: easy vs harder
# ============================================================
print(f'\n--- Easy vs Harder Dataset Summary ---')
print(f'{"Metric":<40s} {"Easy":>12s} {"Harder":>12s}')
print(f'{"-"*40} {"-"*12} {"-"*12}')
print(f'{"Total rows":<40s} {len(dataset_easy):>12,} {len(dataset_harder):>12,}')
print(f'{"Feature columns":<40s} {len(ALL_FEATURE_COLS):>12} '
      f'{len(HARDER_FEATURE_COLS):>12}')
print(f'{"cos_w1clue_w2all gap (real-dist)":<40s} '
      f'{easy_real_vals.mean() - easy_dist_vals.mean():>+12.4f} '
      f'{real_vals.mean() - dist_vals.mean():>+12.4f}')
print(f'{"wn_max_path_sim gap (real-dist)":<40s} '
      f'{easy_real_path.mean() - easy_dist_path.mean():>+12.4f} '
      f'{real_path.mean() - dist_path.mean():>+12.4f}')
=== Harder Dataset Validation ===

cos_w1clue_w2all (context-informed, retained):
  Easy dataset:    real=0.5446 +/- 0.1544  dist=0.3607 +/- 0.1156  gap=+0.1839
  Harder dataset:  real=0.5446 +/- 0.1544  dist=0.6148 +/- 0.1225  gap=-0.0702

wn_max_path_sim (relationship, retained):
  Easy dataset:    real=0.4352 +/- 0.2902  dist=0.1477 +/- 0.0836  gap=+0.2876
  Harder dataset:  real=0.4352 +/- 0.2902  dist=0.2798 +/- 0.1985  gap=+0.1554

--- Example Harder Distractor Pairs ---
Definition           True Answer     Distractor      Cos Sim  PathSim(real)  PathSim(dist)
-------------------- --------------- --------------- -------  -------------  -------------
culmination          apogee          tailspin          0.761          1.000          0.100
promises             assurances      promise           0.927          0.333          1.000
nimble               agile           grains            0.814          1.000          0.200
chews                champs          grinds            0.737          0.500          0.500
drier                tea_towel       abrasive          0.853          0.100          0.333
titan                prometheus      callisto          0.777          0.500          0.250
weak                 anaemic         weedy             0.793          0.333          0.333
hide                 leather         disgorge          0.769          0.333          0.250
talent               flair           bravery           0.721          0.500          0.091
prevent              block           proscribed        0.682          0.500          0.333

--- Easy vs Harder Dataset Summary ---
Metric                                           Easy       Harder
---------------------------------------- ------------ ------------
Total rows                                    480,422      480,422
Feature columns                                    47           32
cos_w1clue_w2all gap (real-dist)              +0.1839      -0.0702
wn_max_path_sim gap (real-dist)               +0.2876      +0.1554

Summary¶

This notebook implements PLAN.md Steps 5 and 7 — constructing the two balanced binary classification datasets that will be used in the supervised learning experiments.

What was done¶

  • Easy dataset (Step 5): For each of the 240,211 real (clue, definition, answer) rows, generated one distractor by substituting a randomly sampled answer word. Recomputed all 47 features for each distractor pair. Combined real (label=1) and distractor (label=0) rows into a balanced dataset.
  • Harder dataset (Step 7): For each real row, selected a distractor from the top-k most cosine-similar answer words to the definition (Decision 6). Recomputed all 47 features, then removed the 15 context-free meaning features (which would leak the construction criterion), leaving 32 features.

Dataset statistics¶

                                             Easy        Harder
Total rows                                480,422       480,422
Label balance                               50/50         50/50
Feature columns                                47            32 (15 context-free removed)
cos_w1all_w2all gap (real − distractor)    +0.218     (removed)
cos_w1clue_w2all gap (real − distractor)   +0.184        −0.070
wn_max_path_sim gap (real − distractor)    +0.288        +0.155

The negative cos_w1clue_w2all gap in the harder dataset is expected and validates the harder distractor design. It means that distractors are now more similar to definitions than real answers on this context-informed metric — raw cosine similarity alone can no longer distinguish real from distractor. The classifier must rely on subtler signals (WordNet relationships, surface features, and interactions among context-informed features) to succeed.
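A rough Gaussian sketch supports this claim (means and standard deviations taken from the validation output above; synthetic draws, not the real rows): the best single-threshold classifier on cos_w1clue_w2all alone achieves only modest accuracy on the harder dataset.

```python
import numpy as np

sim_rng = np.random.default_rng(42)
n = 100_000
# Gaussian stand-ins for the reported cos_w1clue_w2all distributions (harder set)
real = sim_rng.normal(0.5446, 0.1544, n)   # label 1
dist = sim_rng.normal(0.6148, 0.1225, n)   # label 0

# Distractors are now the MORE similar class, so the rule becomes
# "predict real when similarity is below the threshold"
thresholds = np.linspace(0.2, 1.0, 400)
acc = np.array([((real < t).mean() + (dist >= t).mean()) / 2 for t in thresholds])
best = acc.max()
print(f'best single-threshold accuracy: {best:.3f}')
```

Under this approximation the best threshold lands near 0.60 accuracy, far below what the classifiers achieve on the easy dataset, which is exactly the gap the subtler features must close.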

Output files¶

File                           Description
data/dataset_easy.parquet      480,422 rows × 47 features + metadata + label
data/dataset_harder.parquet    480,422 rows × 32 features + metadata + label

What comes next¶

  • Step 6 (NB 06): Runs Experiments 1A and 1B on the easy dataset — three model families (Logistic Regression, Random Forest, KNN) with full features vs. ablation conditions. This establishes the baseline classification performance with random distractors.
  • Step 8 (NB 07): Runs Experiments 2A and 2B on the harder dataset — same three model families, testing whether classifiers can still distinguish real from distractor when raw similarity no longer works.
In [ ]:
# ============================================================
# Cross-check: Reload and verify both datasets from disk
# ============================================================
# Independent verification that the saved parquet files have the expected
# properties. This catches any corruption during the save/load round-trip.

easy_check = pd.read_parquet(DATA_DIR / 'dataset_easy.parquet')
harder_check = pd.read_parquet(DATA_DIR / 'dataset_harder.parquet')

# --- Easy dataset ---
assert easy_check.shape[0] == 480_422, (
    f'Easy dataset row count: expected 480,422, got {easy_check.shape[0]:,}')
easy_feat_cols = [c for c in easy_check.columns if c in ALL_FEATURE_COLS]
assert len(easy_feat_cols) == 47, (
    f'Easy dataset feature columns: expected 47, got {len(easy_feat_cols)}')
easy_label_counts = easy_check['label'].value_counts()
assert easy_label_counts[1] == easy_label_counts[0], (
    f'Easy dataset not balanced: {easy_label_counts.to_dict()}')

# --- Harder dataset ---
assert harder_check.shape[0] == 480_422, (
    f'Harder dataset row count: expected 480,422, got {harder_check.shape[0]:,}')
# Only the 32 retained features should be present
harder_feat_only = [c for c in harder_check.columns if c in HARDER_FEATURE_COLS]
assert len(harder_feat_only) == 32, (
    f'Harder dataset feature columns: expected 32, got {len(harder_feat_only)}')
harder_label_counts = harder_check['label'].value_counts()
assert harder_label_counts[1] == harder_label_counts[0], (
    f'Harder dataset not balanced: {harder_label_counts.to_dict()}')

# --- No context-free features in harder dataset ---
context_free_in_harder = [c for c in CONTEXT_FREE_COLS
                          if c in harder_check.columns]
assert not context_free_in_harder, (
    f'Context-free features found in harder dataset: {context_free_in_harder}')

print('All cross-checks passed.')
print(f'  Easy:   {easy_check.shape[0]:,} rows, {len(easy_feat_cols)} features, '
      f'balanced ({easy_label_counts[1]:,} per class)')
print(f'  Harder: {harder_check.shape[0]:,} rows, {len(harder_feat_only)} features, '
      f'balanced ({harder_label_counts[1]:,} per class)')