import warnings

import numpy as np
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore', category=FutureWarning)

# --- Environment Auto-Detection ---
# Same pattern as 02_embedding_generation.ipynb and 03_feature_engineering.ipynb:
# detect Colab, Great Lakes, or local and set paths accordingly.
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/'
                        'Milestone II - NLP Cryptic Crossword Clues/'
                        'clue_misdirection')
else:
    # Local or Great Lakes: notebook is in clue_misdirection/notebooks/,
    # so parent is the clue_misdirection project root.
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
EMBEDDINGS_DIR = DATA_DIR / 'embeddings'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
FIGURES_DIR = OUTPUT_DIR / 'figures'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Embedding dimension for CALE-MBERT-en
EMBED_DIM = 1024

print(f'Environment: {"Google Colab" if IS_COLAB else "Local / Great Lakes"}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Embeddings directory: {EMBEDDINGS_DIR}')
print(f'Output directory: {OUTPUT_DIR}')

Environment: Local / Great Lakes
Project root: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection
Data directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data
Embeddings directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/embeddings
Output directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs

# --- Load clues_filtered.csv (Step 1 output) ---
clues_path = DATA_DIR / 'clues_filtered.csv'
assert clues_path.exists(), (
    f'Missing input file: {clues_path}\n'
    f'Run 01_data_cleaning.ipynb first to produce this file.'
)
clues_df = pd.read_csv(clues_path)
print(f'clues_filtered.csv: {len(clues_df):,} rows')

# --- Load clue_context_phrases.csv (Step 2 intermediate) ---
# This file provides definition_wn and answer_wn — the WordNet-ready lookup
# keys that map to the embedding index files. It also identifies which rows
# survived Step 2's cleanup (240,211 of the original 241,397).
# CRITICAL: keep_default_na=False prevents pandas from interpreting the word
# "nan" (grandmother) as NaN — see DATA.md and CLAUDE.md.
cc_phrases_path = EMBEDDINGS_DIR / 'clue_context_phrases.csv'
assert cc_phrases_path.exists(), (
    f'Missing input file: {cc_phrases_path}\n'
    f'Run 02_embedding_generation.ipynb first to produce this file.'
)
cc_phrases = pd.read_csv(cc_phrases_path, keep_default_na=False)
print(f'clue_context_phrases.csv: {len(cc_phrases):,} rows')

# --- Record each row's position in cc_phrases / clue_context_embeddings ---
# clue_context_phrases.csv and clue_context_embeddings.npy are in identical
# row order (verified in Step 2). By recording the row position here, we get
# a direct index into the clue-context embedding array after the merge —
# avoiding the ambiguity of mapping through clue_context_index.csv, which
# only has clue_id and can't disambiguate double-definition clues.
cc_phrases['cc_row_position'] = np.arange(len(cc_phrases))

# --- Merge to get definition_wn and answer_wn onto the clue rows ---
# Inner merge restricts to the 240,211 rows that have embeddings.
# We merge on (clue_id, definition) — NOT clue_id alone — because
# double-definition clues have multiple rows per clue_id. A clue_id-only
# merge would produce a many-to-many cross product for those clues.
df = clues_df.merge(
    cc_phrases[['clue_id', 'definition', 'definition_wn', 'answer_wn',
                'def_num_usable_synsets', 'ans_num_usable_synsets',
                'cc_row_position']],
    on=['clue_id', 'definition'],
    how='inner'
)

# Verify the merge produced exactly the expected number of rows.
assert len(df) == len(cc_phrases), (
    f'Merge produced {len(df):,} rows, expected {len(cc_phrases):,}. '
    f'This likely means a double-definition clue was not disambiguated '
    f'correctly by the (clue_id, definition) key.')

print(f'\nWorking set after merge: {len(df):,} rows')
print(f'  (dropped {len(clues_df) - len(df):,} rows without embeddings)')
n_unique_pairs = df.drop_duplicates(subset=['definition_wn', 'answer_wn']).shape[0]
print(f'  Unique (definition_wn, answer_wn) pairs: {n_unique_pairs:,}')

clues_filtered.csv: 241,397 rows
clue_context_phrases.csv: 240,211 rows

Working set after merge: 240,211 rows
  (dropped 1,186 rows without embeddings)
  Unique (definition_wn, answer_wn) pairs: 127,608

# --- Load embedding arrays and index files ---
# CRITICAL: keep_default_na=False on all index CSVs — the word "nan"
# (grandmother) is a valid crossword definition/answer.

definition_embeddings = np.load(EMBEDDINGS_DIR / 'definition_embeddings.npy')
definition_index = pd.read_csv(
    EMBEDDINGS_DIR / 'definition_index.csv', index_col=0,
    keep_default_na=False)

answer_embeddings = np.load(EMBEDDINGS_DIR / 'answer_embeddings.npy')
answer_index = pd.read_csv(
    EMBEDDINGS_DIR / 'answer_index.csv', index_col=0,
    keep_default_na=False)

clue_context_embeddings = np.load(
    EMBEDDINGS_DIR / 'clue_context_embeddings.npy')
clue_context_index = pd.read_csv(
    EMBEDDINGS_DIR / 'clue_context_index.csv', index_col=0,
    keep_default_na=False)

# --- Print shapes and sizes ---
print(f'{"File":<35s} {"Shape":<25s} {"Memory":>8s}')
print(f'{"-"*35} {"-"*25} {"-"*8}')
for name, arr in [
    ('definition_embeddings.npy', definition_embeddings),
    ('answer_embeddings.npy', answer_embeddings),
    ('clue_context_embeddings.npy', clue_context_embeddings),
]:
    mb = arr.nbytes / 1024**2
    print(f'{name:<35s} {str(arr.shape):<25s} {mb:>6.1f} MB')

total_mb = (definition_embeddings.nbytes + answer_embeddings.nbytes
            + clue_context_embeddings.nbytes) / 1024**2
print(f'\nTotal embedding memory: {total_mb:.1f} MB')
print(f'\nIndex sizes:')
print(f'  definition_index:     {len(definition_index):,} rows')
print(f'  answer_index:         {len(answer_index):,} rows')
print(f'  clue_context_index:   {len(clue_context_index):,} rows')

# --- Shape and consistency assertions ---
n_def = len(definition_index)
n_ans = len(answer_index)
n_cc = len(clue_context_index)

assert definition_embeddings.shape == (n_def, 3, EMBED_DIM), (
    f'definition_embeddings shape mismatch: expected ({n_def}, 3, {EMBED_DIM}), '
    f'got {definition_embeddings.shape}')
assert answer_embeddings.shape == (n_ans, 3, EMBED_DIM), (
    f'answer_embeddings shape mismatch: expected ({n_ans}, 3, {EMBED_DIM}), '
    f'got {answer_embeddings.shape}')
assert clue_context_embeddings.shape == (n_cc, EMBED_DIM), (
    f'clue_context_embeddings shape mismatch: expected ({n_cc}, {EMBED_DIM}), '
    f'got {clue_context_embeddings.shape}')

print(f'\nAll shape assertions passed. Embedding dimension: {EMBED_DIM}.')

File                                Shape                       Memory
----------------------------------- ------------------------- --------
definition_embeddings.npy           (27385, 3, 1024)           320.9 MB
answer_embeddings.npy               (45254, 3, 1024)           530.3 MB
clue_context_embeddings.npy         (240211, 1024)             938.3 MB

Total embedding memory: 1789.6 MB

Index sizes:
  definition_index:     27,385 rows
  answer_index:         45,254 rows
  clue_context_index:   240,211 rows

All shape assertions passed. Embedding dimension: 1024.

# --- Build word → row-position mappings for O(1) lookup ---
# definition_index and answer_index have integer row indices (0, 1, 2, ...)
# and a 'word' column. We create a Series mapping word string → row position.
def_word_to_idx = pd.Series(
    definition_index.index, index=definition_index['word'])
ans_word_to_idx = pd.Series(
    answer_index.index, index=answer_index['word'])

# --- Build the candidate answer pool ---
# answer_vocab is the sorted list of all unique answer_wn strings from
# answer_index. This is the retrieval candidate pool: for each query, we
# compute cosine similarity against every word in this pool and rank them.
answer_vocab = sorted(answer_index['word'].tolist())
answer_vocab_size = len(answer_vocab)
print(f'Answer vocabulary (candidate pool): {answer_vocab_size:,} unique answers')

# --- answer_word_to_pos: maps answer_wn string → position in answer_vocab ---
# This is needed to find the rank of the true answer after argsort.
answer_word_to_pos = {word: pos for pos, word in enumerate(answer_vocab)}

# --- Build (V, 1024) candidate answer matrices for each answer condition ---
# For each answer condition (Allsense=slot 0, Common=slot 1, Obscure=slot 2),
# we index into answer_embeddings.npy to build a matrix where row i corresponds
# to answer_vocab[i]. The row order must match answer_vocab so that
# answer_word_to_pos gives correct rank lookups.
answer_vocab_indices = np.array([ans_word_to_idx[w] for w in answer_vocab])

answer_matrices = {
    'Allsense': answer_embeddings[answer_vocab_indices, 0, :],  # (V, 1024)
    'Common':   answer_embeddings[answer_vocab_indices, 1, :],  # (V, 1024)
    'Obscure':  answer_embeddings[answer_vocab_indices, 2, :],  # (V, 1024)
}

for name, mat in answer_matrices.items():
    print(f'  answer_matrices["{name}"]: shape {mat.shape}, '
          f'{mat.nbytes / 1024**2:.1f} MB')

Answer vocabulary (candidate pool): 45,254 unique answers
  answer_matrices["Allsense"]: shape (45254, 1024), 176.8 MB
  answer_matrices["Common"]: shape (45254, 1024), 176.8 MB
  answer_matrices["Obscure"]: shape (45254, 1024), 176.8 MB

# --- Deduplicate to unique (definition_wn, answer_wn) pairs ---
# Each unique pair gets one row. We keep the first occurrence's metadata
# (def_answer_pair_id, def_num_usable_synsets, ans_num_usable_synsets).
unique_pairs = df.drop_duplicates(
    subset=['definition_wn', 'answer_wn'], keep='first'
).copy()

print(f'Unique (definition_wn, answer_wn) pairs: {len(unique_pairs):,}')
print(f'  (from {len(df):,} total clue rows)')

# --- Count how many clue rows each pair has ---
# This is relevant for the Clue Context condition, where we aggregate
# multiple clue-level ranks per pair.
clues_per_pair = (
    df.groupby(['definition_wn', 'answer_wn'])
    .size()
    .reset_index(name='clues_per_pair')
)
unique_pairs = unique_pairs.merge(
    clues_per_pair, on=['definition_wn', 'answer_wn'], how='left'
)

print(f'\nClues per unique pair:')
print(f'  Mean:   {unique_pairs["clues_per_pair"].mean():.2f}')
print(f'  Median: {unique_pairs["clues_per_pair"].median():.1f}')
print(f'  Max:    {unique_pairs["clues_per_pair"].max()}')
print(f'  Pairs with 1 clue:  '
      f'{(unique_pairs["clues_per_pair"] == 1).sum():,} '
      f'({(unique_pairs["clues_per_pair"] == 1).mean():.1%})')
print(f'  Pairs with 5+ clues: '
      f'{(unique_pairs["clues_per_pair"] >= 5).sum():,} '
      f'({(unique_pairs["clues_per_pair"] >= 5).mean():.1%})')

# --- Report single-synset pair percentages ---
# ~35% of definitions and ~46% of answers have only 1 usable WordNet synset.
# For these, Common = Obscure = Allsense, so the Common vs. Obscure
# retrieval comparison is uninformative. We report these percentages here
# as noted in FINDINGS.md pitfalls for Step 4.
single_def = (unique_pairs['def_num_usable_synsets'] == 1).mean()
single_ans = (unique_pairs['ans_num_usable_synsets'] == 1).mean()
both_single = (
    (unique_pairs['def_num_usable_synsets'] == 1) &
    (unique_pairs['ans_num_usable_synsets'] == 1)
).mean()
print(f'\nSingle-synset words (Common = Obscure = Allsense):')
print(f'  Definitions: {single_def:.1%}')
print(f'  Answers:     {single_ans:.1%}')
print(f'  Both:        {both_single:.1%}')

Unique (definition_wn, answer_wn) pairs: 127,608
  (from 240,211 total clue rows)

Clues per unique pair:
  Mean:   1.88
  Median: 1.0
  Max:    59
  Pairs with 1 clue:  86,330 (67.7%)
  Pairs with 5+ clues: 8,740 (6.8%)

Single-synset words (Common = Obscure = Allsense):
  Definitions: 17.8%
  Answers:     32.9%
  Both:        8.0%

def compute_retrieval_metrics(query_embeddings, answer_matrix,
                              true_answer_positions,
                              ks=[1, 5, 10, 50, 100],
                              batch_size=1000):
    """
    Compute retrieval metrics for a set of definition queries against a
    candidate answer matrix.

    Parameters
    ----------
    query_embeddings : np.ndarray, shape (N, 1024)
        Definition embeddings (one per query).
    answer_matrix : np.ndarray, shape (V, 1024)
        Candidate answer embeddings. V = answer vocabulary size.
    true_answer_positions : np.ndarray, shape (N,)
        Position of the true answer in answer_matrix for each query.
    ks : list of int
        Top-k thresholds for hit rate computation.
    batch_size : int
        Number of queries per batch to avoid memory issues with the full
        (N, V) similarity matrix.

    Returns
    -------
    metrics : dict
        Top-k hit rates, mean rank, median rank, mean cosine similarity.
    ranks : np.ndarray, shape (N,)
        Per-query rank of the true answer (1-indexed).
    cosine_sims : np.ndarray, shape (N,)
        Per-query cosine similarity between the query and the true answer.
    """
    N = query_embeddings.shape[0]
    V = answer_matrix.shape[0]
    ranks = np.empty(N, dtype=np.int64)
    cosine_sims = np.empty(N, dtype=np.float64)

    for start in range(0, N, batch_size):
        end = min(start + batch_size, N)
        batch_queries = query_embeddings[start:end]          # (B, 1024)
        batch_true_pos = true_answer_positions[start:end]    # (B,)

        # Compute cosine similarity for this batch: (B, V)
        sims = cosine_similarity(batch_queries, answer_matrix)

        for j in range(end - start):
            true_pos = batch_true_pos[j]
            cosine_sims[start + j] = sims[j, true_pos]

            # Rank = number of candidates with similarity >= true answer's
            # similarity. This is equivalent to argsort but faster for a
            # single position lookup: count how many are >= rather than
            # sorting the full array.
            rank = int((sims[j] >= sims[j, true_pos]).sum())
            ranks[start + j] = rank  # 1-indexed (true answer counts itself)

    # --- Aggregate metrics ---
    metrics = {}
    for k in ks:
        metrics[f'top_{k}'] = float((ranks <= k).mean())
    metrics['mean_rank'] = float(ranks.mean())
    metrics['median_rank'] = float(np.median(ranks))
    metrics['mean_cosine_sim'] = float(cosine_sims.mean())

    return metrics, ranks, cosine_sims

import time

# =====================================================================
# Context-Free Definition Conditions: Allsense, Common, Obscure
# =====================================================================
# For these 3 definition conditions, each unique pair has exactly one
# definition embedding (determined by definition_wn). We look up the
# correct row in definition_embeddings.npy at the appropriate slot
# (0=Allsense, 1=Common, 2=Obscure).

# Map each unique pair's definition_wn and answer_wn to array positions.
up_def_indices = unique_pairs['definition_wn'].map(def_word_to_idx).astype(int).values
up_ans_positions = np.array([
    answer_word_to_pos[w] for w in unique_pairs['answer_wn']
])

n_pairs = len(unique_pairs)
print(f'Running context-free retrieval over {n_pairs:,} unique pairs')
print(f'Candidate pool: {answer_vocab_size:,} answers')
print(f'Each run computes {n_pairs:,} × {answer_vocab_size:,} cosine similarities '
      f'(in batches of 1,000)\n')

def_conditions_cf = {
    'Allsense': 0,   # slot 0 in definition_embeddings
    'Common':   1,   # slot 1
    'Obscure':  2,   # slot 2
}

ans_conditions = ['Allsense', 'Common', 'Obscure']

# Store all results: key = (def_condition, ans_condition)
all_results = {}
all_ranks = {}
all_cosines = {}

for def_name, def_slot in def_conditions_cf.items():
    # Build query matrix: one row per unique pair, using the appropriate
    # definition embedding slot.
    query_embs = definition_embeddings[up_def_indices, def_slot, :]  # (N_pairs, 1024)

    for ans_name in ans_conditions:
        label = f'Def:{def_name} × Ans:{ans_name}'
        print(f'  {label} ...', end=' ', flush=True)
        t0 = time.time()

        metrics, ranks, cosines = compute_retrieval_metrics(
            query_embs, answer_matrices[ans_name], up_ans_positions
        )

        elapsed = time.time() - t0
        print(f'done ({elapsed:.1f}s) — '
              f'median rank {metrics["median_rank"]:.0f}, '
              f'top-1 {metrics["top_1"]:.2%}')

        all_results[(def_name, ans_name)] = metrics
        all_ranks[(def_name, ans_name)] = ranks
        all_cosines[(def_name, ans_name)] = cosines

print(f'\nCompleted 9 context-free retrieval runs.')

Running context-free retrieval over 127,608 unique pairs
Candidate pool: 45,254 answers
Each run computes 127,608 × 45,254 cosine similarities (in batches of 1,000)

  Def:Allsense × Ans:Allsense ... done (23.0s) — median rank 1015, top-1 0.30%
  Def:Allsense × Ans:Common ... done (22.7s) — median rank 1741, top-1 0.50%
  Def:Allsense × Ans:Obscure ... done (21.6s) — median rank 1872, top-1 0.53%
  Def:Common × Ans:Allsense ... done (21.5s) — median rank 1389, top-1 0.57%
  Def:Common × Ans:Common ... done (21.8s) — median rank 2208, top-1 0.32%
  Def:Common × Ans:Obscure ... done (22.0s) — median rank 2674, top-1 0.87%
  Def:Obscure × Ans:Allsense ... done (21.3s) — median rank 1964, top-1 0.41%
  Def:Obscure × Ans:Common ... done (21.3s) — median rank 3375, top-1 0.70%
  Def:Obscure × Ans:Obscure ... done (21.5s) — median rank 3194, top-1 0.27%

Completed 9 context-free retrieval runs.

# =====================================================================
# Clue Context Definition Condition
# =====================================================================
# Unlike the context-free conditions, clue-context embeddings are per-row
# (each clue sentence produces a different embedding for the definition
# word). We run retrieval over ALL 240K rows, then aggregate to unique
# pairs by taking the median rank per (definition_wn, answer_wn) group.
#
# Embedding lookup uses cc_row_position — a direct positional index into
# clue_context_embeddings.npy, set during the merge in cell 3. This
# correctly handles double-definition clues (where clue_id is non-unique).

cc_indices = df['cc_row_position'].values
cc_query_embs = clue_context_embeddings[cc_indices, :]  # (240211, 1024)

# True answer positions in answer_vocab for each row of df.
all_row_ans_positions = np.array([
    answer_word_to_pos[w] for w in df['answer_wn']
])

n_rows = len(df)
print(f'Running Clue Context retrieval over {n_rows:,} clue rows')
print(f'(will aggregate to {n_pairs:,} unique pairs via median rank)\n')

for ans_name in ans_conditions:
    label = f'Def:Clue Context × Ans:{ans_name}'
    print(f'  {label} ...', end=' ', flush=True)
    t0 = time.time()

    metrics_allrows, ranks_allrows, cosines_allrows = compute_retrieval_metrics(
        cc_query_embs, answer_matrices[ans_name], all_row_ans_positions
    )
    elapsed = time.time() - t0
    print(f'done ({elapsed:.1f}s)')

    # --- Aggregate per-row ranks to per-pair median ranks ---
    # Attach the per-row ranks back to df, group by unique pair, take median.
    row_results = pd.DataFrame({
        'definition_wn': df['definition_wn'].values,
        'answer_wn': df['answer_wn'].values,
        'rank': ranks_allrows,
        'cosine_sim': cosines_allrows,
    })
    pair_agg = (
        row_results
        .groupby(['definition_wn', 'answer_wn'])
        .agg(median_rank=('rank', 'median'),
             median_cosine_sim=('cosine_sim', 'median'))
        .reset_index()
    )

    # Verify we got one row per unique pair.
    assert len(pair_agg) == n_pairs, (
        f'Expected {n_pairs:,} unique pairs, got {len(pair_agg):,}')

    # Compute summary metrics from the per-pair median ranks, matching the
    # same metric names as compute_retrieval_metrics for consistency.
    median_ranks = pair_agg['median_rank'].values
    median_cosines = pair_agg['median_cosine_sim'].values

    metrics = {}
    for k in [1, 5, 10, 50, 100]:
        metrics[f'top_{k}'] = float((median_ranks <= k).mean())
    metrics['mean_rank'] = float(median_ranks.mean())
    metrics['median_rank'] = float(np.median(median_ranks))
    metrics['mean_cosine_sim'] = float(median_cosines.mean())

    print(f'    → after median aggregation: '
          f'median rank {metrics["median_rank"]:.0f}, '
          f'top-1 {metrics["top_1"]:.2%}')

    all_results[('Clue Context', ans_name)] = metrics
    all_ranks[('Clue Context', ans_name)] = median_ranks
    all_cosines[('Clue Context', ans_name)] = median_cosines

print(f'\nCompleted 3 Clue Context retrieval runs.')
print(f'Total: 12 retrieval conditions evaluated.')

Running Clue Context retrieval over 240,211 clue rows
(will aggregate to 127,608 unique pairs via median rank)

  Def:Clue Context × Ans:Allsense ... done (42.2s)
    → after median aggregation: median rank 2160, top-1 0.41%
  Def:Clue Context × Ans:Common ... done (45.2s)
    → after median aggregation: median rank 3350, top-1 0.42%
  Def:Clue Context × Ans:Obscure ... done (42.4s)
    → after median aggregation: median rank 3564, top-1 0.42%

Completed 3 Clue Context retrieval runs.
Total: 12 retrieval conditions evaluated.

# =====================================================================
# Assemble Results Table
# =====================================================================
# Collect all 12 cells into a single DataFrame for display and export.

def_order = ['Allsense', 'Common', 'Obscure', 'Clue Context']
metric_cols = ['top_1', 'top_5', 'top_10', 'top_50', 'top_100',
               'mean_rank', 'median_rank', 'mean_cosine_sim']

rows = []
for def_name in def_order:
    for ans_name in ans_conditions:
        m = all_results[(def_name, ans_name)]
        row = {
            'def_condition': def_name,
            'ans_condition': ans_name,
        }
        for col in metric_cols:
            row[col] = m[col]
        rows.append(row)

results_df = pd.DataFrame(rows)

# Save to CSV.
results_path = OUTPUT_DIR / 'retrieval_results_unique_pairs.csv'
results_df.to_csv(results_path, index=False)
print(f'Saved: {results_path}')
print(f'  {len(results_df)} rows (4 def conditions × 3 ans conditions)')

# --- Display formatted table ---
# Format percentages and numbers for readability.
display_df = results_df.copy()
for col in ['top_1', 'top_5', 'top_10', 'top_50', 'top_100']:
    display_df[col] = display_df[col].apply(lambda x: f'{x:.2%}')
display_df['mean_rank'] = display_df['mean_rank'].apply(lambda x: f'{x:,.0f}')
display_df['median_rank'] = display_df['median_rank'].apply(lambda x: f'{x:,.0f}')
display_df['mean_cosine_sim'] = display_df['mean_cosine_sim'].apply(lambda x: f'{x:.4f}')

display_df.columns = ['Def Condition', 'Ans Condition',
                       'Top-1', 'Top-5', 'Top-10', 'Top-50', 'Top-100',
                       'Mean Rank', 'Median Rank', 'Mean Cos Sim']

print(f'\n{"=" * 120}')
print('RETRIEVAL RESULTS — Unique (definition, answer) pairs')
print(f'N = {n_pairs:,} pairs | Candidate pool = {answer_vocab_size:,} answers')
print(f'{"=" * 120}')
print(display_df.to_string(index=False))
print(f'{"=" * 120}')

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/retrieval_results_unique_pairs.csv
  12 rows (4 def conditions × 3 ans conditions)

========================================================================================================================
RETRIEVAL RESULTS — Unique (definition, answer) pairs
N = 127,608 pairs | Candidate pool = 45,254 answers
========================================================================================================================
Def Condition Ans Condition Top-1 Top-5 Top-10 Top-50 Top-100 Mean Rank Median Rank Mean Cos Sim
     Allsense      Allsense 0.30% 4.48%  7.69% 17.87%  23.77%     5,173       1,015       0.6434
     Allsense        Common 0.50% 4.22%  6.78% 15.59%  20.53%     7,698       1,741       0.5910
     Allsense       Obscure 0.53% 3.98%  6.45% 14.75%  19.71%     7,687       1,872       0.5902
       Common      Allsense 0.57% 4.38%  7.12% 16.36%  21.75%     6,761       1,389       0.5686
       Common        Common 0.32% 4.14%  6.63% 14.82%  19.48%     8,838       2,208       0.5298
       Common       Obscure 0.87% 3.89%  5.96% 13.45%  17.94%     9,485       2,674       0.5183
      Obscure      Allsense 0.41% 3.69%  6.03% 14.02%  18.80%     7,267       1,964       0.5645
      Obscure        Common 0.70% 3.42%  5.31% 12.08%  16.09%     9,852       3,375       0.5151
      Obscure       Obscure 0.27% 3.23%  5.20% 12.00%  16.11%     9,410       3,194       0.5221
 Clue Context      Allsense 0.41% 2.56%  4.40% 11.67%  16.25%     7,401       2,160       0.5421
 Clue Context        Common 0.42% 2.28%  3.86% 10.15%  14.28%     9,672       3,350       0.4992
 Clue Context       Obscure 0.42% 2.17%  3.66%  9.65%  13.69%     9,811       3,564       0.4967
========================================================================================================================

# =====================================================================
# Supplementary: All-Rows Retrieval (Allsense × Allsense only)
# =====================================================================
# Run over all 240,211 rows using the Allsense definition embedding
# (looked up per definition_wn) and the Allsense answer matrix.

# Map each row's definition_wn to its position in definition_embeddings.
allrow_def_indices = df['definition_wn'].map(def_word_to_idx).astype(int).values
allrow_query_embs = definition_embeddings[allrow_def_indices, 0, :]  # slot 0 = Allsense

print(f'Running all-rows retrieval (Allsense × Allsense)')
print(f'  N = {n_rows:,} rows | Candidate pool = {answer_vocab_size:,} answers')
t0 = time.time()

allrow_metrics, allrow_ranks, allrow_cosines = compute_retrieval_metrics(
    allrow_query_embs, answer_matrices['Allsense'], all_row_ans_positions
)

elapsed = time.time() - t0
print(f'  Done ({elapsed:.1f}s)')
print(f'  Median rank: {allrow_metrics["median_rank"]:.0f}')
print(f'  Top-1: {allrow_metrics["top_1"]:.2%}')

# --- Build and save supplementary results ---
allrow_results_df = pd.DataFrame([{
    'def_condition': 'Allsense',
    'ans_condition': 'Allsense',
    'n_queries': n_rows,
    'reporting_unit': 'all_rows',
    **allrow_metrics,
}])

allrow_results_path = OUTPUT_DIR / 'retrieval_results_all_rows.csv'
allrow_results_df.to_csv(allrow_results_path, index=False)
print(f'\nSaved: {allrow_results_path}')

# --- Compare with unique-pairs result ---
up_allsense = all_results[('Allsense', 'Allsense')]
print(f'\nComparison — Allsense × Allsense:')
print(f'  {"Metric":<20s} {"Unique Pairs":>15s} {"All Rows":>15s}')
print(f'  {"-"*20} {"-"*15} {"-"*15}')
for metric in ['top_1', 'top_5', 'top_10', 'top_50', 'top_100']:
    up_val = up_allsense[metric]
    ar_val = allrow_metrics[metric]
    print(f'  {metric:<20s} {up_val:>14.2%} {ar_val:>14.2%}')
for metric in ['mean_rank', 'median_rank']:
    up_val = up_allsense[metric]
    ar_val = allrow_metrics[metric]
    print(f'  {metric:<20s} {up_val:>14,.0f} {ar_val:>14,.0f}')
print(f'  {"mean_cosine_sim":<20s} {up_allsense["mean_cosine_sim"]:>14.4f} '
      f'{allrow_metrics["mean_cosine_sim"]:>14.4f}')

Running all-rows retrieval (Allsense × Allsense)
  N = 240,211 rows | Candidate pool = 45,254 answers
  Done (45.6s)
  Median rank: 831
  Top-1: 0.24%

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/retrieval_results_all_rows.csv

Comparison — Allsense × Allsense:
  Metric                  Unique Pairs        All Rows
  -------------------- --------------- ---------------
  top_1                         0.30%          0.24%
  top_5                         4.48%          5.07%
  top_10                        7.69%          8.77%
  top_50                       17.87%         19.66%
  top_100                      23.77%         25.91%
  mean_rank                     5,173          4,830
  median_rank                   1,015            831
  mean_cosine_sim              0.6434         0.6482

# =====================================================================
# Figure 1: Grouped Bar Chart — Median Rank by Condition (Log Scale)
# =====================================================================
# This is the primary visualization. The key visual signal is the gap
# between the Clue Context group and the three context-free groups.

fig, ax = plt.subplots(figsize=(10, 6))

def_labels = ['Allsense', 'Common', 'Obscure', 'Clue Context']
ans_labels = ['Allsense', 'Common', 'Obscure']
colors = ['#4878CF', '#6ACC65', '#D65F5F']  # blue, green, red

x = np.arange(len(def_labels))
width = 0.22  # bar width

for j, ans_name in enumerate(ans_labels):
    median_ranks = [
        all_results[(def_name, ans_name)]['median_rank']
        for def_name in def_labels
    ]
    offset = (j - 1) * width
    bars = ax.bar(x + offset, median_ranks, width,
                  label=f'Answer: {ans_name}', color=colors[j],
                  edgecolor='white', linewidth=0.5)

    # Annotate each bar with its value.
    for bar, val in zip(bars, median_ranks):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() * 1.08,
                f'{val:,.0f}', ha='center', va='bottom', fontsize=7.5,
                fontweight='bold')

ax.set_yscale('log')
ax.set_xlabel('Definition Embedding Condition', fontsize=12)
ax.set_ylabel('Median Rank of True Answer (log scale)', fontsize=12)
ax.set_title('Retrieval Performance: Median Rank by Definition and Answer Condition\n'
             f'(N = {n_pairs:,} unique pairs, candidate pool = {answer_vocab_size:,} answers)',
             fontsize=13)
ax.set_xticks(x)
ax.set_xticklabels(def_labels, fontsize=11)
ax.legend(title='Answer Embedding', fontsize=10, title_fontsize=10,
          loc='upper left')

# Add a horizontal reference line at the middle of the candidate pool
# to give a sense of scale (random baseline = V/2).
ax.axhline(y=answer_vocab_size / 2, color='gray', linestyle='--',
           linewidth=0.8, alpha=0.6)
ax.text(3.45, answer_vocab_size / 2 * 1.15, f'random baseline ({answer_vocab_size//2:,})',
        fontsize=8, color='gray', ha='right')

ax.set_ylim(bottom=None, top=answer_vocab_size * 1.5)
ax.tick_params(axis='both', labelsize=10)
sns.despine()

plt.tight_layout()
bar_chart_path = FIGURES_DIR / 'retrieval_bar_chart.png'
fig.savefig(bar_chart_path, dpi=300, bbox_inches='tight')
print(f'Saved: {bar_chart_path}')
plt.show()

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/figures/retrieval_bar_chart.png

# =====================================================================
# Figure 2: Heatmap — Mean Cosine Similarity (4×3 Grid)
# =====================================================================
# Each cell shows the mean cosine similarity between the definition
# embedding (under that row's condition) and the true answer embedding
# (under that column's condition), averaged over all unique pairs.
# Lower similarity under Clue Context corroborates the rank findings.

def_labels = ['Allsense', 'Common', 'Obscure', 'Clue Context']
ans_labels = ['Allsense', 'Common', 'Obscure']

heatmap_data = np.zeros((len(def_labels), len(ans_labels)))
for i, def_name in enumerate(def_labels):
    for j, ans_name in enumerate(ans_labels):
        heatmap_data[i, j] = all_results[(def_name, ans_name)]['mean_cosine_sim']

fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(heatmap_data, cmap='YlOrRd', aspect='auto')

# Annotate each cell with the cosine similarity value.
for i in range(len(def_labels)):
    for j in range(len(ans_labels)):
        val = heatmap_data[i, j]
        # Choose text color for readability against the background.
        text_color = 'white' if val > 0.55 else 'black'
        ax.text(j, i, f'{val:.4f}', ha='center', va='center',
                fontsize=12, fontweight='bold', color=text_color)

ax.set_xticks(np.arange(len(ans_labels)))
ax.set_xticklabels(ans_labels, fontsize=11)
ax.set_yticks(np.arange(len(def_labels)))
ax.set_yticklabels(def_labels, fontsize=11)
ax.set_xlabel('Answer Embedding Condition', fontsize=12)
ax.set_ylabel('Definition Embedding Condition', fontsize=12)
ax.set_title('Mean Cosine Similarity: Definition vs. True Answer\n'
             f'(N = {n_pairs:,} unique pairs)',
             fontsize=13)

cbar = fig.colorbar(im, ax=ax, shrink=0.8)
cbar.set_label('Mean Cosine Similarity', fontsize=10)

plt.tight_layout()
heatmap_path = FIGURES_DIR / 'retrieval_heatmap.png'
fig.savefig(heatmap_path, dpi=300, bbox_inches='tight')
print(f'Saved: {heatmap_path}')
plt.show()

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/figures/retrieval_heatmap.png

# ---------------------------------------------------------------------------
# WordNet Reachability Analysis
# ---------------------------------------------------------------------------
# We load features_all.parquet, deduplicate to unique (definition_wn, answer_wn)
# pairs, and classify the 20 wn_rel_* boolean columns into three tiers:
#   - Synonym (1 column): direct WordNet synonymy
#   - 1-hop non-synonym (6 columns): direct hypernym, hyponym, meronym, holonym
#   - 2-hop (13 columns): compound relationships like hyponym_of_hypernym
#
# "WordNet Reachability by Hop Depth" shows cumulative reachability at each tier.
# "WordNet Relationship Type Prevalence" shows individual relationship type prevalence.
# ---------------------------------------------------------------------------

features = pd.read_parquet(DATA_DIR / "features_all.parquet")

# Deduplicate to unique (definition_wn, answer_wn) pairs.
# Each pair may appear in multiple clue rows — we only need one copy because
# the wn_rel_* columns depend only on the pair, not the specific clue.
wn_rel_cols = [c for c in features.columns if c.startswith("wn_rel_")]
pairs = features[["definition_wn", "answer_wn"] + wn_rel_cols].drop_duplicates(
    subset=["definition_wn", "answer_wn"]
)
n_pairs = len(pairs)
print(f"Unique (definition_wn, answer_wn) pairs: {n_pairs:,}")

# --- Classify columns into tiers ---

synonym_cols = ["wn_rel_synonym"]

onehop_cols = [
    "wn_rel_hyponym",
    "wn_rel_hypernym",
    "wn_rel_part_holonym",
    "wn_rel_part_meronym",
    "wn_rel_substance_meronym",
    "wn_rel_member_meronym",
]

# Everything else is 2-hop (compound relationship names with underscored components)
twohop_cols = [c for c in wn_rel_cols if c not in synonym_cols + onehop_cols]

print(f"\nColumn classification:")
print(f"  Synonym:          {len(synonym_cols)} column(s)")
print(f"  1-hop non-synonym: {len(onehop_cols)} column(s)")
print(f"  2-hop compound:    {len(twohop_cols)} column(s)")
print(f"  Total:             {len(synonym_cols) + len(onehop_cols) + len(twohop_cols)}")

# Sanity check
assert len(synonym_cols) + len(onehop_cols) + len(twohop_cols) == len(wn_rel_cols)

# --- WordNet Reachability by Hop Depth (≤2-hop) ---

# A pair is "connected" at a tier if ANY column in that tier (or below) is True.
synonym_mask = pairs[synonym_cols].any(axis=1)
onehop_mask = pairs[synonym_cols + onehop_cols].any(axis=1)
twohop_mask = pairs[wn_rel_cols].any(axis=1)

synonym_count = synonym_mask.sum()
onehop_count = onehop_mask.sum()
twohop_count = twohop_mask.sum()

print("\n" + "=" * 85)
print("WordNet Reachability by Hop Depth (Unique Pairs, ≤2-hop)")
print("=" * 85)
print(
    f"{'Relationship Scope':<25} {'Description':<45} "
    f"{'Pairs':>7} {'%':>7} {'Cum %':>7}"
)
print("-" * 85)

rows_t2 = [
    (
        "Synonym only",
        "Answer is a WordNet synonym of definition",
        synonym_count,
        synonym_count / n_pairs * 100,
        synonym_count / n_pairs * 100,
    ),
    (
        "Any ≤1-hop",
        "Synonym + direct hyper/hyponym/meronym/holonym",
        onehop_count,
        onehop_count / n_pairs * 100,
        onehop_count / n_pairs * 100,
    ),
    (
        "Any ≤2-hop",
        "Above + all two-hop compounds",
        twohop_count,
        twohop_count / n_pairs * 100,
        twohop_count / n_pairs * 100,
    ),
]

for scope, desc, count, pct, cum_pct in rows_t2:
    print(f"{scope:<25} {desc:<45} {count:>7,} {pct:>6.1f}% {cum_pct:>6.1f}%")

# Note: since tiers are cumulative, "% of Pairs" and "Cumulative %" are the same
# for each row (each tier includes all lower tiers).
print("-" * 85)
print(f"{'Total unique pairs':<25} {'':45} {n_pairs:>7,}")
print(
    f"\nNote: Cumulative % equals % of Pairs because each tier includes all\n"
    f"lower tiers. The incremental gain from synonym → ≤1-hop is "
    f"{(onehop_count - synonym_count):,} pairs "
    f"({(onehop_count - synonym_count) / n_pairs * 100:.1f}pp), "
    f"and from ≤1-hop → ≤2-hop is "
    f"{(twohop_count - onehop_count):,} pairs "
    f"({(twohop_count - onehop_count) / n_pairs * 100:.1f}pp)."
)


# --- WordNet Relationship Type Prevalence (≤2-hop) ---

# Assign hop counts for display
hop_map = {c: 1 for c in synonym_cols + onehop_cols}
hop_map.update({c: 2 for c in twohop_cols})

# Count pairs with each relationship type
rel_counts = []
for col in wn_rel_cols:
    count = pairs[col].sum()
    rel_counts.append(
        {
            "Relationship Type": col.replace("wn_rel_", ""),
            "Hop Count": hop_map[col],
            "Pairs with Relationship": int(count),
            "% of Unique Pairs": count / n_pairs * 100,
        }
    )

rel_df = pd.DataFrame(rel_counts).sort_values(
    "Pairs with Relationship", ascending=False
).reset_index(drop=True)

# --- Primary table: ≤2-hop types at ≥1% prevalence ---
# These correspond to the boolean features used by the classifier in later
# notebooks. 3-hop types are computed below and shown in the appendix table.
rel_df_filtered = rel_df[
    (rel_df["Hop Count"] <= 2) & (rel_df["% of Unique Pairs"] >= 1.0)
].reset_index(drop=True)

print("\n\n" + "=" * 80)
print("WordNet Relationship Type Prevalence (≤2-hop, ≥1% of pairs)")
print("=" * 80)
print(
    f"{'Relationship Type':<35} {'Hops':>4} "
    f"{'Pairs':>8} {'% of Pairs':>11}"
)
print("-" * 80)

for _, row in rel_df_filtered.iterrows():
    print(
        f"{row['Relationship Type']:<35} {row['Hop Count']:>4} "
        f"{row['Pairs with Relationship']:>8,} "
        f"{row['% of Unique Pairs']:>10.1f}%"
    )

print("-" * 80)
print(
    f"\nShowing {len(rel_df_filtered)} of {len(rel_df)} ≤2-hop relationship types "
    f"(filtered to ≥1% prevalence)."
)
print(
    f"This table is limited to ≤2-hop types at ≥1% prevalence because these\n"
    f"correspond to the boolean features used by the classifier in later notebooks.\n"
    f"The full list of all relationship types (including 3-hop) is available in\n"
    f"the appendix version printed below."
)
print(
    f"\nNote: A single pair can have multiple relationship types (e.g., a pair\n"
    f"may be both a synonym and a hyponym via different synsets), so the counts\n"
    f"above do not sum to the cumulative totals in the reachability table."
)

Unique (definition_wn, answer_wn) pairs: 127,608

Column classification:
  Synonym:          1 column(s)
  1-hop non-synonym: 6 column(s)
  2-hop compound:    13 column(s)
  Total:             20

=====================================================================================
WordNet Reachability by Hop Depth (Unique Pairs, ≤2-hop)
=====================================================================================
Relationship Scope        Description                                     Pairs       %   Cum %
-------------------------------------------------------------------------------------
Synonym only              Answer is a WordNet synonym of definition      14,675   11.5%   11.5%
Any ≤1-hop                Synonym + direct hyper/hyponym/meronym/holonym  38,716   30.3%   30.3%
Any ≤2-hop                Above + all two-hop compounds                  56,220   44.1%   44.1%
-------------------------------------------------------------------------------------
Total unique pairs                                                      127,608

Note: Cumulative % equals % of Pairs because each tier includes all
lower tiers. The incremental gain from synonym → ≤1-hop is 24,041 pairs (18.8pp), and from ≤1-hop → ≤2-hop is 17,504 pairs (13.7pp).


================================================================================
WordNet Relationship Type Prevalence (≤2-hop, ≥1% of pairs)
================================================================================
Relationship Type                   Hops    Pairs  % of Pairs
--------------------------------------------------------------------------------
hyponym_of_hypernym                    2   25,268       19.8%
hyponym                                1   19,129       15.0%
synonym                                1   14,675       11.5%
hypernym_of_hyponym                    2   10,686        8.4%
hypernym                               1    7,273        5.7%
hyponym_of_hyponym                     2    6,759        5.3%
hypernym_of_hypernym                   2    1,626        1.3%
--------------------------------------------------------------------------------

Showing 7 of 20 ≤2-hop relationship types (filtered to ≥1% prevalence).
This table is limited to ≤2-hop types at ≥1% prevalence because these
correspond to the boolean features used by the classifier in later notebooks.
The full list of all relationship types (including 3-hop) is available in
the appendix version printed below.

Note: A single pair can have multiple relationship types (e.g., a pair
may be both a synonym and a hyponym via different synsets), so the counts
above do not sum to the cumulative totals in the reachability table.

# ---------------------------------------------------------------------------
# 3-Hop WordNet Reachability Computation
# ---------------------------------------------------------------------------
# For each pair not already connected at ≤2 hops, check all 512 possible
# three-hop paths through WordNet. This reuses the same traversal pattern as
# _check_synset_reachable in scripts/feature_utils.py.
# ---------------------------------------------------------------------------

import itertools
import time
from nltk.corpus import wordnet as wn
from tqdm.auto import tqdm

# Step A: Define all valid 3-hop relationship combinations.
# The 8 base WordNet relationship methods we traverse. These match the methods
# available on NLTK Synset objects.
BASE_METHODS = [
    "hypernyms", "hyponyms",
    "part_holonyms", "part_meronyms",
    "substance_meronyms", "member_meronyms",
    "substance_holonyms", "member_holonyms",
]

# Generate all 8^3 = 512 three-hop triples.
threehop_triples = list(itertools.product(BASE_METHODS, repeat=3))
print(f"Total 3-hop combinations: {len(threehop_triples)}")

# Name each triple using the existing right-to-left convention:
# hops = [A, B, C] → name = "C_of_B_of_A" where each method drops trailing 's'.
# Example: ['hypernyms', 'hyponyms', 'hypernyms'] → "hypernym_of_hyponym_of_hypernym"
def method_to_rel_name(method):
    """Convert a WordNet method name to a relationship name (drop trailing 's')."""
    return method.rstrip("s")

def triple_to_name(triple):
    """Convert a (hop0, hop1, hop2) triple to the canonical relationship name."""
    a, b, c = triple
    return f"{method_to_rel_name(c)}_of_{method_to_rel_name(b)}_of_{method_to_rel_name(a)}"

threehop_names = [triple_to_name(t) for t in threehop_triples]

# Quick sanity: show a few examples
for i in [0, 1, 100, 511]:
    print(f"  {threehop_triples[i]} → {threehop_names[i]}")

# Step B: Identify pairs that need 3-hop checking — those with NO ≤2-hop connection.
unconnected_mask = ~twohop_mask
unconnected_pairs = pairs.loc[unconnected_mask, ["definition_wn", "answer_wn"]].reset_index(drop=True)
n_unconnected = len(unconnected_pairs)
print(f"\nPairs unconnected at ≤2-hop: {n_unconnected:,} ({n_unconnected / n_pairs * 100:.1f}%)")

# Step C: Check all 512 three-hop paths for each unconnected pair.
# We reuse the same traversal logic as _check_synset_reachable: start from
# def_synsets, follow each method in the hop list sequentially, and check
# whether any answer synset is reached at the end.

def get_wordnet_synsets(word):
    """Look up all WordNet synsets for a word, handling multi-word entries."""
    synsets = wn.synsets(word)
    if not synsets and " " in word:
        synsets = wn.synsets(word.replace(" ", "_"))
    return synsets

def check_reachable(def_synsets, ans_synsets_set, hops):
    """Check if any answer synset is reachable via a sequence of hops."""
    current = set(def_synsets)
    for method_name in hops:
        next_level = set()
        for synset in current:
            next_level.update(getattr(synset, method_name)())
        current = next_level
        if not current:
            return False
    return bool(current & ans_synsets_set)

# Track which 3-hop types connected at least one pair (to avoid storing 512
# columns for types that never fire). We store results as a list of dicts.
results = []
types_that_fired = set()

start_time = time.time()

for idx, row in tqdm(unconnected_pairs.iterrows(), total=n_unconnected, desc="3-hop check"):
    def_word = row["definition_wn"]
    ans_word = row["answer_wn"]

    def_synsets = get_wordnet_synsets(def_word)
    ans_synsets = get_wordnet_synsets(ans_word)

    # Early skip if either word has no synsets (shouldn't happen given our
    # WordNet filter, but be safe).
    if not def_synsets or not ans_synsets:
        results.append({"definition_wn": def_word, "answer_wn": ans_word})
        continue

    ans_synsets_set = set(ans_synsets)
    pair_result = {"definition_wn": def_word, "answer_wn": ans_word}

    for triple, name in zip(threehop_triples, threehop_names):
        if check_reachable(def_synsets, ans_synsets_set, list(triple)):
            pair_result[name] = True
            types_that_fired.add(name)

    results.append(pair_result)

    # Print estimated time after first 1000 pairs
    if idx == 999:
        elapsed = time.time() - start_time
        est_total = elapsed / 1000 * n_unconnected
        print(
            f"\n  After 1,000 pairs: {elapsed:.1f}s elapsed, "
            f"estimated total: {est_total / 60:.1f} min"
        )

total_time = time.time() - start_time

# Build DataFrame of 3-hop results. Only include columns for types that
# actually connected at least one pair to keep the DataFrame compact.
threehop_df = pd.DataFrame(results)
threehop_bool_cols = sorted(types_that_fired)
# Fill NaN (types not checked or not found) with False
for col in threehop_bool_cols:
    if col not in threehop_df.columns:
        threehop_df[col] = False
    threehop_df[col] = threehop_df[col].fillna(False).astype(bool)

# Step D: Summary stats.
n_newly_connected = threehop_df[threehop_bool_cols].any(axis=1).sum()
n_types_fired = len(types_that_fired)

print(f"\n{'=' * 70}")
print(f"3-Hop Computation Summary")
print(f"{'=' * 70}")
print(f"Pairs checked:                 {n_unconnected:,}")
print(f"3-hop types that fired:        {n_types_fired} / 512")
print(f"Pairs newly connected (3-hop): {n_newly_connected:,} "
      f"({n_newly_connected / n_unconnected * 100:.1f}% of unconnected, "
      f"{n_newly_connected / n_pairs * 100:.1f}% of all pairs)")
print(f"Still unconnected at ≤3-hop:   {n_unconnected - n_newly_connected:,} "
      f"({(n_unconnected - n_newly_connected) / n_pairs * 100:.1f}% of all pairs)")
print(f"Total time:                    {total_time / 60:.1f} min")

Total 3-hop combinations: 512
  ('hypernyms', 'hypernyms', 'hypernyms') → hypernym_of_hypernym_of_hypernym
  ('hypernyms', 'hypernyms', 'hyponyms') → hyponym_of_hypernym_of_hypernym
  ('hyponyms', 'substance_meronyms', 'substance_meronyms') → substance_meronym_of_substance_meronym_of_hyponym
  ('member_holonyms', 'member_holonyms', 'member_holonyms') → member_holonym_of_member_holonym_of_member_holonym

Pairs unconnected at ≤2-hop: 71,388 (55.9%)

3-hop check:   0%|          | 0/71388 [00:00<?, ?it/s]

  After 1,000 pairs: 4.9s elapsed, estimated total: 5.8 min

======================================================================
3-Hop Computation Summary
======================================================================
Pairs checked:                 71,388
3-hop types that fired:        169 / 512
Pairs newly connected (3-hop): 9,701 (13.6% of unconnected, 7.6% of all pairs)
Still unconnected at ≤3-hop:   61,687 (48.3% of all pairs)
Total time:                    2.5 min

# ---------------------------------------------------------------------------
# Updated tables with 3-hop data
# ---------------------------------------------------------------------------

# Cumulative count at ≤3-hop = (pairs connected at ≤2-hop) + (pairs newly
# connected at 3-hop only). This is correct because we only ran 3-hop checks
# on pairs that were unconnected at ≤2-hop.
threehop_total = twohop_count + n_newly_connected

print("=" * 85)
print("WordNet Reachability by Hop Depth (Unique Pairs, ≤3-hop)")
print("=" * 85)
print(
    f"{'Relationship Scope':<25} {'Description':<45} "
    f"{'Pairs':>7} {'%':>7} {'Cum %':>7}"
)
print("-" * 85)

rows_updated = [
    (
        "Synonym only",
        "Answer is a WordNet synonym of definition",
        synonym_count,
        synonym_count / n_pairs * 100,
        synonym_count / n_pairs * 100,
    ),
    (
        "Any ≤1-hop",
        "Synonym + direct hyper/hyponym/meronym/holonym",
        onehop_count,
        onehop_count / n_pairs * 100,
        onehop_count / n_pairs * 100,
    ),
    (
        "Any ≤2-hop",
        "Above + all two-hop compounds",
        twohop_count,
        twohop_count / n_pairs * 100,
        twohop_count / n_pairs * 100,
    ),
    (
        "Any ≤3-hop",
        "Above + all three-hop compounds",
        threehop_total,
        threehop_total / n_pairs * 100,
        threehop_total / n_pairs * 100,
    ),
]

for scope, desc, count, pct, cum_pct in rows_updated:
    print(f"{scope:<25} {desc:<45} {count:>7,} {pct:>6.1f}% {cum_pct:>6.1f}%")

print("-" * 85)
print(f"{'Total unique pairs':<25} {'':45} {n_pairs:>7,}")
print(
    f"\nIncremental gains:  synonym → ≤1-hop: "
    f"+{onehop_count - synonym_count:,} ({(onehop_count - synonym_count) / n_pairs * 100:.1f}pp)"
    f"  |  ≤1-hop → ≤2-hop: "
    f"+{twohop_count - onehop_count:,} ({(twohop_count - onehop_count) / n_pairs * 100:.1f}pp)"
    f"  |  ≤2-hop → ≤3-hop: "
    f"+{n_newly_connected:,} ({n_newly_connected / n_pairs * 100:.1f}pp)"
)

# --- Updated prevalence table: append 3-hop types to the existing ≤2-hop types ---

# Build 3-hop prevalence rows. Counts are over the n_unconnected subset, but
# percentages are reported over all n_pairs for consistency with the ≤2-hop table.
threehop_rel_counts = []
for col in threehop_bool_cols:
    count = int(threehop_df[col].sum())
    threehop_rel_counts.append(
        {
            "Relationship Type": col,
            "Hop Count": 3,
            "Pairs with Relationship": count,
            "% of Unique Pairs": count / n_pairs * 100,
        }
    )

threehop_rel_df = pd.DataFrame(threehop_rel_counts)

# Combine with ≤2-hop prevalence (rel_df from the earlier cell)
combined_rel_df = pd.concat([rel_df, threehop_rel_df], ignore_index=True).sort_values(
    "Pairs with Relationship", ascending=False
).reset_index(drop=True)

print("\n\n" + "=" * 80)
print("APPENDIX: WordNet Relationship Type Prevalence (Unique Pairs, All Types)")
print("=" * 80)
print(
    f"{'Relationship Type':<45} {'Hops':>4} "
    f"{'Pairs':>8} {'% of Pairs':>11}"
)
print("-" * 80)

# Show all types with >0% prevalence (including 3-hop)
for _, row in combined_rel_df.iterrows():
    if row["Pairs with Relationship"] > 0:
        print(
            f"{row['Relationship Type']:<45} {row['Hop Count']:>4} "
            f"{row['Pairs with Relationship']:>8,} "
            f"{row['% of Unique Pairs']:>10.2f}%"
        )

print("-" * 80)
print(
    f"\nAppendix: showing all {len(combined_rel_df[combined_rel_df['Pairs with Relationship'] > 0])} "
    f"relationship types with >0% prevalence "
    f"({len(rel_df)} at ≤2-hop + {len(threehop_rel_df)} at 3-hop)."
)
print(
    f"The primary table above (≤2-hop, ≥1%) shows the subset used as classifier features."
)
print(
    f"\nNote: A single pair can have multiple relationship types, so counts\n"
    f"do not sum to the cumulative totals in the reachability table."
)

=====================================================================================
WordNet Reachability by Hop Depth (Unique Pairs, ≤3-hop)
=====================================================================================
Relationship Scope        Description                                     Pairs       %   Cum %
-------------------------------------------------------------------------------------
Synonym only              Answer is a WordNet synonym of definition      14,675   11.5%   11.5%
Any ≤1-hop                Synonym + direct hyper/hyponym/meronym/holonym  38,716   30.3%   30.3%
Any ≤2-hop                Above + all two-hop compounds                  56,220   44.1%   44.1%
Any ≤3-hop                Above + all three-hop compounds                65,921   51.7%   51.7%
-------------------------------------------------------------------------------------
Total unique pairs                                                      127,608

Incremental gains:  synonym → ≤1-hop: +24,041 (18.8pp)  |  ≤1-hop → ≤2-hop: +17,504 (13.7pp)  |  ≤2-hop → ≤3-hop: +9,701 (7.6pp)


================================================================================
APPENDIX: WordNet Relationship Type Prevalence (Unique Pairs, All Types)
================================================================================
Relationship Type                             Hops    Pairs  % of Pairs
--------------------------------------------------------------------------------
hyponym_of_hypernym                              2   25,268      19.80%
hyponym                                          1   19,129      14.99%
synonym                                          1   14,675      11.50%
hypernym_of_hyponym                              2   10,686       8.37%
hypernym                                         1    7,273       5.70%
hyponym_of_hyponym                               2    6,759       5.30%
hyponym_of_hyponym_of_hypernym                   3    3,836       3.01%
hyponym_of_hypernym_of_hypernym                  3    2,418       1.89%
hyponym_of_hyponym_of_hyponym                    3    2,168       1.70%
hypernym_of_hypernym                             2    1,626       1.27%
part_holonym_of_part_meronym                     2      802       0.63%
member_meronym_of_member_holonym                 2      408       0.32%
part_holonym_of_hyponym                          2      345       0.27%
part_meronym_of_part_holonym_of_hyponym          3      307       0.24%
hyponym_of_part_holonym                          2      269       0.21%
part_meronym                                     1      228       0.18%
part_meronym_of_hyponym                          2      224       0.18%
part_holonym                                     1      206       0.16%
hyponym_of_part_meronym                          2      145       0.11%
hypernym_of_hypernym_of_hypernym                 3      138       0.11%
hyponym_of_hypernym_of_hyponym                   3      133       0.10%
substance_meronym_of_hyponym                     2      114       0.09%
part_meronym_of_hypernym                         2      100       0.08%
member_holonym_of_hyponym_of_hypernym            3       86       0.07%
hyponym_of_hypernym_of_part_meronym              3       81       0.06%
part_holonym_of_hyponym_of_hypernym              3       79       0.06%
hyponym_of_hypernym_of_member_holonym            3       76       0.06%
part_meronym_of_hyponym_of_hypernym              3       63       0.05%
hyponym_of_hyponym_of_part_holonym               3       61       0.05%
hyponym_of_part_meronym_of_part_holonym          3       61       0.05%
member_holonym_of_member_meronym_of_member_holonym    3       61       0.05%
part_holonym_of_hypernym                         2       60       0.05%
hyponym_of_part_holonym_of_hyponym               3       58       0.05%
hypernym_of_hyponym_of_hyponym                   3       54       0.04%
hyponym_of_part_holonym_of_part_meronym          3       51       0.04%
member_holonym_of_hypernym_of_hyponym            3       50       0.04%
hyponym_of_part_meronym_of_hyponym               3       49       0.04%
member_meronym_of_member_holonym_of_hyponym      3       49       0.04%
part_meronym_of_part_meronym_of_part_holonym     3       48       0.04%
hyponym_of_part_meronym_of_hypernym              3       47       0.04%
member_meronym                                   1       41       0.03%
hyponym_of_hypernym_of_part_holonym              3       34       0.03%
part_holonym_of_hyponym_of_hyponym               3       34       0.03%
member_meronym_of_member_holonym_of_member_holonym    3       32       0.03%
hyponym_of_member_meronym_of_member_holonym      3       32       0.03%
member_holonym_of_hyponym_of_hyponym             3       31       0.02%
part_holonym_of_part_meronym_of_hyponym          3       30       0.02%
hyponym_of_part_holonym_of_hypernym              3       27       0.02%
hypernym_of_hyponym_of_hypernym                  3       27       0.02%
part_meronym_of_hyponym_of_hyponym               3       25       0.02%
hypernym_of_part_holonym_of_hyponym              3       24       0.02%
part_meronym_of_part_holonym_of_hypernym         3       23       0.02%
hypernym_of_hyponym_of_member_holonym            3       23       0.02%
hyponym_of_hypernym_of_substance_meronym         3       23       0.02%
hypernym_of_member_holonym_of_hyponym            3       20       0.02%
hyponym_of_substance_holonym_of_hyponym          3       19       0.01%
hypernym_of_part_meronym_of_hyponym              3       18       0.01%
member_meronym_of_member_meronym_of_member_holonym    3       18       0.01%
hypernym_of_hypernym_of_hyponym                  3       16       0.01%
hyponym_of_hyponym_of_member_meronym             3       16       0.01%
part_holonym_of_hyponym_of_part_meronym          3       15       0.01%
substance_meronym                                1       15       0.01%
hyponym_of_member_holonym_of_hypernym            3       14       0.01%
member_holonym_of_part_holonym_of_part_meronym    3       14       0.01%
part_meronym_of_hypernym_of_hypernym             3       14       0.01%
part_holonym_of_part_meronym_of_part_holonym     3       13       0.01%
hyponym_of_hypernym_of_member_meronym            3       13       0.01%
part_meronym_of_part_meronym_of_hyponym          3       13       0.01%
hyponym_of_hyponym_of_substance_meronym          3       13       0.01%
hyponym_of_member_holonym_of_hyponym             3       12       0.01%
hyponym_of_hyponym_of_part_meronym               3       12       0.01%
hyponym_of_substance_meronym_of_hypernym         3       12       0.01%
hypernym_of_part_meronym_of_part_holonym         3       12       0.01%
member_holonym_of_member_holonym_of_hyponym      3       12       0.01%
substance_holonym_of_hyponym_of_substance_meronym    3       12       0.01%
substance_meronym_of_hyponym_of_hypernym         3       11       0.01%
substance_holonym_of_hyponym_of_hyponym          3       11       0.01%
hyponym_of_substance_meronym_of_hyponym          3       11       0.01%
hyponym_of_hyponym_of_member_holonym             3       10       0.01%
member_meronym_of_member_holonym_of_hypernym     3       10       0.01%
part_holonym_of_part_meronym_of_hypernym         3        9       0.01%
member_meronym_of_hyponym_of_member_holonym      3        9       0.01%
part_meronym_of_part_holonym_of_part_holonym     3        9       0.01%
hypernym_of_part_meronym_of_hypernym             3        8       0.01%
member_meronym_of_hyponym_of_hypernym            3        8       0.01%
hyponym_of_member_meronym_of_hypernym            3        8       0.01%
substance_holonym_of_substance_meronym_of_hyponym    3        8       0.01%
hyponym_of_part_meronym_of_part_meronym          3        8       0.01%
hyponym_of_hypernym_of_substance_holonym         3        8       0.01%
part_meronym_of_part_meronym_of_part_meronym     3        7       0.01%
hyponym_of_part_holonym_of_part_holonym          3        7       0.01%
part_meronym_of_hypernym_of_part_holonym         3        7       0.01%
hypernym_of_hypernym_of_part_meronym             3        6       0.00%
substance_holonym_of_hyponym_of_hypernym         3        6       0.00%
part_meronym_of_hyponym_of_part_holonym          3        6       0.00%
hypernym_of_member_meronym_of_hyponym            3        6       0.00%
substance_meronym_of_hyponym_of_hyponym          3        6       0.00%
member_meronym_of_hyponym_of_hyponym             3        5       0.00%
hyponym_of_member_meronym_of_hyponym             3        5       0.00%
member_meronym_of_part_holonym_of_hyponym        3        5       0.00%
hypernym_of_part_holonym_of_hypernym             3        5       0.00%
substance_meronym_of_substance_holonym_of_hyponym    3        5       0.00%
hypernym_of_substance_holonym_of_hyponym         3        5       0.00%
part_holonym_of_hyponym_of_member_holonym        3        5       0.00%
part_holonym_of_hyponym_of_member_meronym        3        5       0.00%
part_holonym_of_part_holonym_of_hyponym          3        4       0.00%
part_holonym_of_hypernym_of_part_meronym         3        4       0.00%
part_holonym_of_hypernym_of_hypernym             3        4       0.00%
hypernym_of_member_meronym_of_member_holonym     3        4       0.00%
member_holonym_of_substance_holonym_of_substance_meronym    3        4       0.00%
hypernym_of_substance_meronym_of_hyponym         3        4       0.00%
part_meronym_of_hyponym_of_part_meronym          3        4       0.00%
member_holonym_of_member_holonym_of_member_meronym    3        4       0.00%
part_meronym_of_hypernym_of_part_meronym         3        3       0.00%
part_meronym_of_hypernym_of_hyponym              3        3       0.00%
substance_holonym_of_substance_meronym_of_substance_holonym    3        3       0.00%
member_holonym_of_hypernym_of_hypernym           3        3       0.00%
hyponym_of_hyponym_of_substance_holonym          3        3       0.00%
member_meronym_of_hypernym_of_member_holonym     3        3       0.00%
hypernym_of_hypernym_of_member_holonym           3        3       0.00%
hypernym_of_hyponym_of_substance_holonym         3        3       0.00%
hyponym_of_substance_holonym_of_substance_meronym    3        3       0.00%
hyponym_of_member_holonym_of_member_holonym      3        3       0.00%
member_holonym_of_member_meronym_of_hyponym      3        3       0.00%
hypernym_of_substance_meronym_of_hypernym        3        2       0.00%
hyponym_of_part_holonym_of_member_holonym        3        2       0.00%
part_holonym_of_part_meronym_of_part_meronym     3        2       0.00%
hyponym_of_member_meronym_of_part_meronym        3        2       0.00%
hyponym_of_member_meronym_of_part_holonym        3        2       0.00%
hypernym_of_substance_meronym_of_substance_meronym    3        2       0.00%
part_meronym_of_hyponym_of_member_meronym        3        2       0.00%
part_meronym_of_member_holonym_of_hypernym       3        2       0.00%
part_meronym_of_part_holonym_of_member_holonym    3        2       0.00%
hyponym_of_part_meronym_of_member_holonym        3        2       0.00%
part_meronym_of_part_holonym_of_part_meronym     3        2       0.00%
substance_holonym_of_hypernym_of_hyponym         3        2       0.00%
hypernym_of_part_holonym_of_part_meronym         3        2       0.00%
hypernym_of_member_meronym_of_hypernym           3        2       0.00%
substance_holonym_of_substance_holonym_of_substance_meronym    3        2       0.00%
hypernym_of_member_holonym_of_hypernym           3        2       0.00%
substance_holonym_of_substance_meronym_of_member_holonym    3        2       0.00%
hypernym_of_hypernym_of_part_holonym             3        2       0.00%
part_holonym_of_part_meronym_of_member_holonym    3        2       0.00%
member_meronym_of_hyponym_of_member_meronym      3        2       0.00%
hyponym_of_substance_holonym_of_hypernym         3        2       0.00%
part_holonym_of_hyponym_of_part_holonym          3        2       0.00%
part_holonym_of_part_holonym_of_part_meronym     3        2       0.00%
member_meronym_of_part_meronym_of_part_holonym    3        2       0.00%
member_holonym_of_hyponym_of_member_meronym      3        2       0.00%
member_holonym_of_hyponym_of_member_holonym      3        2       0.00%
member_meronym_of_part_holonym_of_hypernym       3        2       0.00%
member_holonym_of_part_meronym_of_part_holonym    3        2       0.00%
hyponym_of_substance_meronym_of_part_meronym     3        2       0.00%
member_meronym_of_hypernym_of_hypernym           3        2       0.00%
substance_meronym_of_hypernym_of_part_meronym    3        1       0.00%
substance_holonym_of_substance_holonym_of_hypernym    3        1       0.00%
substance_holonym_of_hypernym_of_substance_meronym    3        1       0.00%
member_meronym_of_member_holonym_of_member_meronym    3        1       0.00%
member_meronym_of_part_meronym_of_part_meronym    3        1       0.00%
hypernym_of_member_meronym_of_part_meronym       3        1       0.00%
substance_meronym_of_hyponym_of_part_holonym     3        1       0.00%
hypernym_of_hypernym_of_member_meronym           3        1       0.00%
member_meronym_of_part_meronym_of_member_holonym    3        1       0.00%
substance_meronym_of_hypernym_of_hyponym         3        1       0.00%
substance_holonym_of_substance_meronym_of_hypernym    3        1       0.00%
member_meronym_of_part_holonym_of_part_meronym    3        1       0.00%
hypernym_of_part_holonym_of_substance_holonym    3        1       0.00%
substance_meronym_of_hypernym_of_hypernym        3        1       0.00%
member_holonym_of_part_holonym_of_hypernym       3        1       0.00%
part_meronym_of_part_meronym_of_hypernym         3        1       0.00%
hypernym_of_part_meronym_of_member_meronym       3        1       0.00%
member_holonym_of_hypernym_of_part_holonym       3        1       0.00%
part_holonym_of_part_holonym_of_part_holonym     3        1       0.00%
part_holonym_of_part_holonym_of_hypernym         3        1       0.00%
part_holonym_of_substance_holonym_of_hypernym    3        1       0.00%
part_holonym_of_member_holonym_of_hyponym        3        1       0.00%
hyponym_of_member_holonym_of_part_holonym        3        1       0.00%
member_meronym_of_hypernym_of_part_meronym       3        1       0.00%
member_holonym_of_hypernym_of_member_meronym     3        1       0.00%
hypernym_of_substance_meronym_of_substance_holonym    3        1       0.00%
hypernym_of_part_meronym_of_part_meronym         3        1       0.00%
hypernym_of_substance_meronym_of_part_meronym    3        1       0.00%
part_meronym_of_member_holonym_of_member_meronym    3        1       0.00%
hypernym_of_substance_holonym_of_substance_meronym    3        1       0.00%
part_holonym_of_hypernym_of_part_holonym         3        1       0.00%
part_meronym_of_part_holonym_of_member_meronym    3        1       0.00%
hypernym_of_substance_holonym_of_hypernym        3        1       0.00%
part_holonym_of_hypernym_of_member_meronym       3        1       0.00%
substance_meronym_of_substance_meronym_of_hyponym    3        1       0.00%
--------------------------------------------------------------------------------

Appendix: showing all 189 relationship types with >0% prevalence (20 at ≤2-hop + 169 at 3-hop).
The primary table above (≤2-hop, ≥1%) shows the subset used as classifier features.

Note: A single pair can have multiple relationship types, so counts
do not sum to the cumulative totals in the reachability table.

# ---------------------------------------------------------------------------
# WordNet Relationship Glossary — 10 types at ≥1% of unique pairs
# ---------------------------------------------------------------------------
# Each entry includes a plain-English explanation (from definition to answer)
# and a candidate example pair. Every example is programmatically verified
# against WordNet before display, and the full traversal path (with real
# intermediate words found by walking the graph) is shown. We prefer examples
# where the two words are NOT also WordNet synonyms, so the example highlights
# what is distinctive about that relationship type.
# ---------------------------------------------------------------------------

# --- Step A: Define glossary entries ---
# 'hops' follows the right-to-left convention: "X_of_Y" → [Y_method, X_method].
# 'alternates' provides fallback pairs if the primary example fails verification.

GLOSSARY = {
    "hyponym_of_hypernym": {
        "explanation": (
            "The definition and answer are co-hyponyms — 'siblings' that are "
            "both specific types of the same broader category. For example, "
            "'oak' and 'elm' are both types of 'tree'."
        ),
        "example": ("oak", "elm"),
        "hops": ["hypernyms", "hyponyms"],
        "alternates": [("trout", "salmon"), ("apple", "banana"), ("piano", "guitar")],
    },
    "hyponym": {
        "explanation": (
            "The answer is a more specific type of the definition — one level "
            "down in the taxonomy. For example, 'oak' is a specific kind of 'tree'."
        ),
        "example": ("tree", "oak"),
        "hops": ["hyponyms"],
        "alternates": [("fish", "trout"), ("drink", "tea"), ("toy", "doll")],
    },
    "synonym": {
        "explanation": (
            "The definition and answer share a WordNet synset — they can express "
            "the same concept. For example, 'bucket' and 'pail' mean the same thing."
        ),
        "example": ("bucket", "pail"),
        "hops": None,  # uses synonym check
        "alternates": [("happy", "glad"), ("couch", "sofa")],
    },
    "hypernym_of_hyponym": {
        "explanation": (
            "A 'cousin' relationship — the answer is reached by going down to a "
            "more specific type of the definition, then back up to a different "
            "broader category. The definition and answer share a child concept "
            "but are in different branches of the tree."
        ),
        "example": ("game", "sport"),
        "hops": ["hyponyms", "hypernyms"],
        "alternates": [("duty", "tariff"), ("tool", "machine")],
    },
    "hypernym": {
        "explanation": (
            "The answer is a more general category that includes the definition — "
            "one level up in the taxonomy. For example, 'toy' is the broader "
            "category that includes 'doll'."
        ),
        "example": ("doll", "toy"),
        "hops": ["hypernyms"],
        "alternates": [("tea", "drink"), ("silk", "cloth"), ("salmon", "fish")],
    },
    "hyponym_of_hyponym": {
        "explanation": (
            "The answer is two levels more specific than the definition — a "
            "'grandchild' in the taxonomy, reached by going down twice."
        ),
        "example": ("food", "bread"),
        "hops": ["hyponyms", "hyponyms"],
        "alternates": [],
    },
    "hyponym_of_hyponym_of_hypernym": {
        "explanation": (
            "The answer is reached by going up one level to a broader category, "
            "then down two levels to a specific instance — like a distant cousin, "
            "connected through a shared ancestor but two steps removed."
        ),
        "example": ("bread", "cookie"),
        "hops": ["hypernyms", "hyponyms", "hyponyms"],
        "alternates": [],
    },
    "hyponym_of_hypernym_of_hypernym": {
        "explanation": (
            "The answer is reached by going up two levels to a broad category, "
            "then down one level — connecting through a high-level ancestor that "
            "groups the definition and answer under the same general umbrella."
        ),
        "example": ("chair", "table"),
        "hops": ["hypernyms", "hypernyms", "hyponyms"],
        "alternates": [("bread", "cheese"), ("canoe", "sled")],
    },
    "hyponym_of_hyponym_of_hyponym": {
        "explanation": (
            "The answer is three levels more specific than the definition — a "
            "'great-grandchild' in the taxonomy, reached by going down three "
            "successive levels."
        ),
        "example": ("animal", "poodle"),
        "hops": ["hyponyms", "hyponyms", "hyponyms"],
        "alternates": [("food", "cookie"), ("fruit", "lemon")],
    },
    "hypernym_of_hypernym": {
        "explanation": (
            "The answer is two levels more general than the definition — a "
            "'grandparent' in the taxonomy, reached by going up twice."
        ),
        "example": ("cabin", "building"),
        "hops": ["hypernyms", "hypernyms"],
        "alternates": [("cottage", "building"), ("bread", "food")],
    },
}

# --- Step B: Path-tracing functions ---
# These find real intermediate words by walking the WordNet graph, so the
# glossary can show the full traversal path (not just endpoints).

def check_synonym_pair(def_word, ans_word):
    """Check if two words share at least one WordNet synset (synonym check)."""
    def_synsets = get_wordnet_synsets(def_word)
    for syn in def_synsets:
        lemma_names = {lemma.name().lower().replace("_", " ") for lemma in syn.lemmas()}
        if ans_word.replace("_", " ") in lemma_names:
            return True
    return False

def _collect_all_paths(start, ans_set, hops, current_path, results):
    """DFS to collect ALL valid paths through WordNet, recording intermediate
    synset lemmas at each non-final hop."""
    if len(hops) == 0:
        if start in ans_set:
            results.append(list(current_path))
        return
    if len(hops) == 1:
        # Final hop: check if any reachable synset is in the answer set.
        # No intermediate to record — the answer word itself is the endpoint.
        method = hops[0]
        for syn in getattr(start, method)():
            if syn in ans_set:
                results.append(list(current_path))
        return
    # Non-final hop: record the intermediate word (first lemma of the synset).
    method = hops[0]
    hop_label = method.rstrip("s")
    for syn in getattr(start, method)():
        lemma = syn.lemmas()[0].name().lower().replace("_", " ")
        current_path.append((lemma, hop_label))
        _collect_all_paths(syn, ans_set, hops[1:], current_path, results)
        current_path.pop()

def find_best_path(def_word, ans_word, hops):
    """Find the most readable path from def_word to ans_word through WordNet.

    When multiple valid paths exist (e.g., oak→tree→elm vs. oak→wood→elm),
    prefer paths with shorter, single-word intermediates to maximize clarity.
    """
    def_synsets = get_wordnet_synsets(def_word)
    ans_set = set(get_wordnet_synsets(ans_word))
    all_paths = []
    for ds in def_synsets:
        _collect_all_paths(ds, ans_set, list(hops), [], all_paths)
    if not all_paths:
        return None
    # Score: penalize multi-word intermediates, then prefer shorter words,
    # then break ties alphabetically for determinism.
    def score(path):
        multi_word_penalty = sum(10 for w, _ in path if " " in w)
        total_len = sum(len(w) for w, _ in path)
        alpha_key = tuple(w for w, _ in path)
        return (multi_word_penalty, total_len, alpha_key)
    return min(all_paths, key=score)

def format_path(def_word, ans_word, hops, intermediates):
    """Format a path as a human-readable string showing each hop.

    Single-hop:  tree → oak (hyponym)
    Two-hop:     oak → tree (hypernym) → elm (hyponym)
    Three-hop:   chair → seat (hypernym) → furniture (hypernym) → table (hyponym)
    Synonym:     bucket ↔ pail (shared synset)
    """
    if hops is None:
        return f"{def_word} \u2194 {ans_word} (shared synset)"
    if intermediates is None:
        return "(no path found)"
    parts = [def_word]
    for word, label in intermediates:
        parts.append(f"{word} ({label})")
    last_label = hops[-1].rstrip("s")
    parts.append(f"{ans_word} ({last_label})")
    return " \u2192 ".join(parts)

# --- Step C: Verify examples and trace paths ---

verified_glossary = []

for rel_type, entry in GLOSSARY.items():
    explanation = entry["explanation"]
    hops = entry["hops"]

    # Try primary example, then alternates
    all_candidates = [entry["example"]] + entry.get("alternates", [])
    verified_pair = None
    verified_path_str = None

    for def_word, ans_word in all_candidates:
        if hops is None:
            ok = check_synonym_pair(def_word, ans_word)
            path_str = format_path(def_word, ans_word, None, None) if ok else None
        else:
            def_synsets = get_wordnet_synsets(def_word)
            ans_synsets = set(get_wordnet_synsets(ans_word))
            ok = check_reachable(def_synsets, ans_synsets, hops)
            if ok:
                intermediates = find_best_path(def_word, ans_word, hops)
                path_str = format_path(def_word, ans_word, hops, intermediates)
            else:
                path_str = None

        is_syn = check_synonym_pair(def_word, ans_word)
        syn_note = " [also synonym]" if is_syn and hops is not None else ""
        status = "\u2713" if ok else "\u2717"
        print(f"  {status} {rel_type}: {def_word} \u2192 {ans_word}{syn_note}")

        if ok and verified_pair is None:
            verified_pair = (def_word, ans_word)
            verified_path_str = path_str
            break  # Use the first verified example

    if verified_pair is None:
        print(f"  WARNING: No verified example for {rel_type}!")

    verified_glossary.append({
        "rel_type": rel_type,
        "hops_count": len(hops) if hops else 1,
        "explanation": explanation,
        "example": verified_pair,
        "path_str": verified_path_str,
    })

# --- Step D: Print the glossary as a formatted table ---
# Look up prevalence from combined_rel_df for each type.
prevalence_map = dict(
    zip(combined_rel_df["Relationship Type"], combined_rel_df["% of Unique Pairs"])
)

# Sort by prevalence descending
verified_glossary.sort(key=lambda x: prevalence_map.get(x["rel_type"], 0), reverse=True)

print("\n" + "=" * 140)
print("WordNet Relationship Glossary (10 Types at \u22651% of Unique Pairs)")
print("=" * 140)
print(
    f"{'Relationship Type':<35} {'Hops':>4} {'% Pairs':>8}  "
    f"{'Explanation':<55} {'Verified Example (full path)'}"
)
print("-" * 140)

for entry in verified_glossary:
    rel = entry["rel_type"]
    hops = entry["hops_count"]
    pct = prevalence_map.get(rel, 0)
    expl = entry["explanation"]
    path_str = entry["path_str"] if entry["path_str"] else "(no verified example)"

    # Truncate explanation for table display
    expl_short = expl[:52] + "..." if len(expl) > 55 else expl
    print(f"{rel:<35} {hops:>4} {pct:>7.2f}%  {expl_short:<55} {path_str}")

print("-" * 140)

# Print full explanations and paths below the table for readability
print("\nFull explanations and paths:")
for entry in verified_glossary:
    rel = entry["rel_type"]
    pct = prevalence_map.get(rel, 0)
    path_str = entry["path_str"] if entry["path_str"] else "(none)"
    print(f"\n  {rel} ({pct:.2f}%)")
    print(f"    Path:  {path_str}")
    print(f"    {entry['explanation']}")

  ✓ hyponym_of_hypernym: oak → elm
  ✓ hyponym: tree → oak
  ✓ synonym: bucket → pail
  ✓ hypernym_of_hyponym: game → sport
  ✓ hypernym: doll → toy
  ✓ hyponym_of_hyponym: food → bread
  ✓ hyponym_of_hyponym_of_hypernym: bread → cookie
  ✓ hyponym_of_hypernym_of_hypernym: chair → table
  ✓ hyponym_of_hyponym_of_hyponym: animal → poodle
  ✓ hypernym_of_hypernym: cabin → building

============================================================================================================================================
WordNet Relationship Glossary (10 Types at ≥1% of Unique Pairs)
============================================================================================================================================
Relationship Type                   Hops  % Pairs  Explanation                                             Verified Example (full path)
--------------------------------------------------------------------------------------------------------------------------------------------
hyponym_of_hypernym                    2   19.80%  The definition and answer are co-hyponyms — 'sibling... oak → tree (hypernym) → elm (hyponym)
hyponym                                1   14.99%  The answer is a more specific type of the definition... tree → oak (hyponym)
synonym                                1   11.50%  The definition and answer share a WordNet synset — t... bucket ↔ pail (shared synset)
hypernym_of_hyponym                    2    8.37%  A 'cousin' relationship — the answer is reached by g... game → athletic game (hyponym) → sport (hypernym)
hypernym                               1    5.70%  The answer is a more general category that includes ... doll → toy (hypernym)
hyponym_of_hyponym                     2    5.30%  The answer is two levels more specific than the defi... food → baked goods (hyponym) → bread (hyponym)
hyponym_of_hyponym_of_hypernym         3    3.01%  The answer is reached by going up one level to a bro... bread → baked goods (hypernym) → cake (hyponym) → cookie (hyponym)
hyponym_of_hypernym_of_hypernym        3    1.89%  The answer is reached by going up two levels to a br... chair → seat (hypernym) → furniture (hypernym) → table (hyponym)
hyponym_of_hyponym_of_hyponym          3    1.70%  The answer is three levels more specific than the de... animal → domestic animal (hyponym) → dog (hyponym) → poodle (hyponym)
hypernym_of_hypernym                   2    1.27%  The answer is two levels more general than the defin... cabin → house (hypernym) → building (hypernym)
--------------------------------------------------------------------------------------------------------------------------------------------

Full explanations and paths:

  hyponym_of_hypernym (19.80%)
    Path:  oak → tree (hypernym) → elm (hyponym)
    The definition and answer are co-hyponyms — 'siblings' that are both specific types of the same broader category. For example, 'oak' and 'elm' are both types of 'tree'.

  hyponym (14.99%)
    Path:  tree → oak (hyponym)
    The answer is a more specific type of the definition — one level down in the taxonomy. For example, 'oak' is a specific kind of 'tree'.

  synonym (11.50%)
    Path:  bucket ↔ pail (shared synset)
    The definition and answer share a WordNet synset — they can express the same concept. For example, 'bucket' and 'pail' mean the same thing.

  hypernym_of_hyponym (8.37%)
    Path:  game → athletic game (hyponym) → sport (hypernym)
    A 'cousin' relationship — the answer is reached by going down to a more specific type of the definition, then back up to a different broader category. The definition and answer share a child concept but are in different branches of the tree.

  hypernym (5.70%)
    Path:  doll → toy (hypernym)
    The answer is a more general category that includes the definition — one level up in the taxonomy. For example, 'toy' is the broader category that includes 'doll'.

  hyponym_of_hyponym (5.30%)
    Path:  food → baked goods (hyponym) → bread (hyponym)
    The answer is two levels more specific than the definition — a 'grandchild' in the taxonomy, reached by going down twice.

  hyponym_of_hyponym_of_hypernym (3.01%)
    Path:  bread → baked goods (hypernym) → cake (hyponym) → cookie (hyponym)
    The answer is reached by going up one level to a broader category, then down two levels to a specific instance — like a distant cousin, connected through a shared ancestor but two steps removed.

  hyponym_of_hypernym_of_hypernym (1.89%)
    Path:  chair → seat (hypernym) → furniture (hypernym) → table (hyponym)
    The answer is reached by going up two levels to a broad category, then down one level — connecting through a high-level ancestor that groups the definition and answer under the same general umbrella.

  hyponym_of_hyponym_of_hyponym (1.70%)
    Path:  animal → domestic animal (hyponym) → dog (hyponym) → poodle (hyponym)
    The answer is three levels more specific than the definition — a 'great-grandchild' in the taxonomy, reached by going down three successive levels.

  hypernym_of_hypernym (1.27%)
    Path:  cabin → house (hypernym) → building (hypernym)
    The answer is two levels more general than the definition — a 'grandparent' in the taxonomy, reached by going up twice.

# =====================================================================
# Build Merged Analysis DataFrame: Retrieval Ranks + WordNet Connectivity
# =====================================================================
# Combine the per-pair retrieval ranks (from the 4x3 retrieval matrix) with
# the WordNet structural connectivity data (from the reachability analysis)
# into a single DataFrame for joint analysis.
# =====================================================================

# --- Step A: Attach retrieval ranks to unique pairs ---
# all_ranks arrays are positionally aligned with unique_pairs (context-free
# conditions iterate over unique_pairs directly; Clue Context aggregates
# to median ranks per unique pair).
analysis_df = unique_pairs[['definition_wn', 'answer_wn']].copy().reset_index(drop=True)

# Map condition names to column-friendly names.
def condition_to_colname(def_cond, ans_cond):
    """Convert (def_condition, ans_condition) tuple to a rank column name."""
    d = def_cond.lower().replace(' ', '_')
    a = ans_cond.lower().replace(' ', '_')
    return f'rank_{d}_{a}'

for key, ranks_arr in all_ranks.items():
    col = condition_to_colname(*key)
    analysis_df[col] = ranks_arr

print(f'Step A: analysis_df has {len(analysis_df):,} rows, '
      f'{len(analysis_df.columns)} columns')
assert len(analysis_df) == 127_608, (
    f'Expected 127,608 rows, got {len(analysis_df):,}')

# --- Step B: Merge in WordNet connectivity booleans ---
# The `pairs` DataFrame (from features_all.parquet) has the same 127,608
# unique pairs with 20 wn_rel_* boolean columns. Merge on the pair key.
analysis_df = analysis_df.merge(
    pairs[['definition_wn', 'answer_wn'] + wn_rel_cols],
    on=['definition_wn', 'answer_wn'],
    how='left'
)

assert len(analysis_df) == 127_608, (
    f'Row count changed after merge: {len(analysis_df):,} (expected 127,608)')
print(f'Step B: merged {len(wn_rel_cols)} wn_rel_* columns — '
      f'still {len(analysis_df):,} rows')

# --- Step C: Assign minimum hop depth tier ---
# Each pair gets exactly one tier label based on the shallowest level at
# which it connects in WordNet. Tiers are mutually exclusive: a pair that
# is both a synonym and a hyponym is classified as "Synonym" (the simplest
# relationship). np.select evaluates conditions in order — the first match wins.

# Recompute masks on analysis_df (aligned with this DataFrame's index).
syn_mask = analysis_df[synonym_cols].any(axis=1)
onehop_any_mask = analysis_df[onehop_cols].any(axis=1)
twohop_any_mask = analysis_df[twohop_cols].any(axis=1)

# 3-hop: merge the threehop_df to identify pairs newly connected at 3 hops.
# threehop_df only contains pairs that were unconnected at <=2-hop, so the
# merge will produce NaN for already-connected pairs (filled to False).
threehop_connected = threehop_df[['definition_wn', 'answer_wn']].copy()
threehop_connected['_threehop_any'] = (
    threehop_df[threehop_bool_cols].any(axis=1)
)

analysis_df = analysis_df.merge(
    threehop_connected,
    on=['definition_wn', 'answer_wn'],
    how='left'
)
analysis_df['_threehop_any'] = analysis_df['_threehop_any'].fillna(False)

assert len(analysis_df) == 127_608, (
    f'Row count changed after 3-hop merge: {len(analysis_df):,}')

# Assign tiers using np.select (first matching condition wins).
conditions = [
    syn_mask,                       # Synonym (shallowest)
    onehop_any_mask,                # 1-hop (not synonym — already matched)
    twohop_any_mask,                # 2-hop (not synonym/1-hop)
    analysis_df['_threehop_any'],   # 3-hop (not <=2-hop)
]
conditions = [c.astype(bool).values if hasattr(c, 'astype') else c for c in conditions]
choices = ['Synonym', '1-hop', '2-hop', '3-hop']
analysis_df['min_hop_tier'] = np.select(conditions, choices,
                                        default='Unconnected')

# Binary connected flag (any connection <=3-hop).
analysis_df['wn_connected'] = analysis_df['min_hop_tier'] != 'Unconnected'

# Clean up temporary column.
analysis_df.drop(columns=['_threehop_any'], inplace=True)

# --- Verify tier counts ---
# These should match the incremental gains from the reachability table:
#   Synonym: 14,675 | 1-hop: 24,041 | 2-hop: 17,504 | 3-hop: 9,701
#   Unconnected: 61,687
tier_counts = analysis_df['min_hop_tier'].value_counts()
print(f'\nStep C: min_hop_tier value counts:')
for tier in ['Synonym', '1-hop', '2-hop', '3-hop', 'Unconnected']:
    count = tier_counts.get(tier, 0)
    print(f'  {tier:<12s}: {count:>7,}  ({count / len(analysis_df):.1%})')

print(f'\n  Connected (<=3-hop): {analysis_df["wn_connected"].sum():,}')
print(f'  Unconnected:         {(~analysis_df["wn_connected"]).sum():,}')
print(f'\nFinal analysis_df: {len(analysis_df):,} rows, '
      f'{len(analysis_df.columns)} columns')

Step A: analysis_df has 127,608 rows, 14 columns
Step B: merged 20 wn_rel_* columns — still 127,608 rows

Step C: min_hop_tier value counts:
  Synonym     :  14,675  (11.5%)
  1-hop       :  24,041  (18.8%)
  2-hop       :  17,504  (13.7%)
  3-hop       :   9,701  (7.6%)
  Unconnected :  61,687  (48.3%)

  Connected (<=3-hop): 65,921
  Unconnected:         61,687

Final analysis_df: 127,608 rows, 36 columns

# =====================================================================
# Figure: WordNet Reachability by Hop Depth (Cumulative)
# =====================================================================

tier_labels_cum = ['Synonym', '\u22641-hop', '\u22642-hop', '\u22643-hop']
tier_cum_counts = [
    (analysis_df['min_hop_tier'] == 'Synonym').sum(),
    (analysis_df['min_hop_tier'].isin(['Synonym', '1-hop'])).sum(),
    (analysis_df['min_hop_tier'].isin(['Synonym', '1-hop', '2-hop'])).sum(),
    analysis_df['wn_connected'].sum(),
]
n_total = len(analysis_df)
tier_cum_pcts = [count / n_total * 100 for count in tier_cum_counts]

# Color gradient: dark (synonym, strongest connection) to light (3-hop, weakest).
colors_cum = ['#1a3a5c', '#2e6299', '#4a8cc7', '#85c1e9']

fig, ax = plt.subplots(figsize=(10, 5))
y_pos = np.arange(len(tier_labels_cum))
bars = ax.barh(y_pos, tier_cum_pcts, color=colors_cum,
               edgecolor='white', linewidth=0.5)

# Annotate each bar with its cumulative percentage.
for bar, pct in zip(bars, tier_cum_pcts):
    ax.text(bar.get_width() + 1.0, bar.get_y() + bar.get_height() / 2,
            f'{pct:.1f}%', ha='left', va='center', fontsize=11,
            fontweight='bold')

# Reference line at 100% to show the gap to full coverage.
ax.axvline(x=100, color='gray', linestyle='--', linewidth=0.8, alpha=0.6)
ax.text(100, len(tier_labels_cum) - 0.5, '100%', fontsize=9, color='gray',
        ha='center', va='bottom')

ax.set_yticks(y_pos)
ax.set_yticklabels(tier_labels_cum, fontsize=11)
ax.set_xlabel('Cumulative % of Unique Pairs Connected', fontsize=12)
ax.set_title('WordNet Reachability by Hop Depth '
             f'(N = {n_total:,} unique pairs)', fontsize=13)
ax.set_xlim(0, 115)
ax.tick_params(axis='x', labelsize=10)
sns.despine()

plt.tight_layout()
fig.savefig(FIGURES_DIR / 'wordnet_reachability_bar.png', dpi=150,
            bbox_inches='tight')
print(f'Saved: {FIGURES_DIR / "wordnet_reachability_bar.png"}')
plt.show()

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/figures/wordnet_reachability_bar.png

# =====================================================================
# Figure: Distribution of Minimum WordNet Connection Depth
# =====================================================================
# Single stacked horizontal bar showing how all 127,608 pairs break down
# by minimum WordNet connection depth. Each pair appears exactly once.

tier_order = ['Synonym', '1-hop', '2-hop', '3-hop', 'Unconnected']
tier_colors = ['#1a3a5c', '#2e6299', '#4a8cc7', '#85c1e9', '#bdbdbd']

tier_counts_incr = [
    (analysis_df['min_hop_tier'] == tier).sum() for tier in tier_order
]
n_total = len(analysis_df)
tier_pcts = [count / n_total * 100 for count in tier_counts_incr]

fig, ax = plt.subplots(figsize=(12, 2.5))

# Build stacked horizontal bar from left to right.
left = 0.0
for tier, pct, count, color in zip(tier_order, tier_pcts,
                                    tier_counts_incr, tier_colors):
    ax.barh(0, pct, left=left, color=color, edgecolor='white',
            linewidth=0.8, height=0.6)

    # Label each segment with count and percentage.
    cx = left + pct / 2
    # Use white text on dark segments, black on light (Unconnected).
    text_color = 'white' if color != '#bdbdbd' else 'black'
    # Adjust font size for narrower segments.
    fs = 8.5 if pct < 10 else 9.5
    ax.text(cx, 0, f'{tier}\n{count:,} ({pct:.1f}%)',
            ha='center', va='center', fontsize=fs,
            fontweight='bold', color=text_color, linespacing=1.4)
    left += pct

ax.set_xlim(0, 100)
ax.set_yticks([])
ax.set_xlabel('% of Unique Pairs', fontsize=12)
ax.set_title('Distribution of Minimum WordNet Connection Depth '
             f'(N = {n_total:,} unique pairs)', fontsize=13)
ax.tick_params(axis='x', labelsize=10)
sns.despine(left=True)

plt.tight_layout()
fig.savefig(FIGURES_DIR / 'wordnet_hop_distribution.png', dpi=150,
            bbox_inches='tight')
print(f'Saved: {FIGURES_DIR / "wordnet_hop_distribution.png"}')
plt.show()

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/figures/wordnet_hop_distribution.png

# =====================================================================
# Figure: Paired Median Rank Heatmaps (Connected vs. Unconnected)
# =====================================================================

def_cond_labels = ['Allsense', 'Common', 'Obscure', 'Clue Context']
ans_cond_labels = ['Allsense', 'Common', 'Obscure']

# Map display labels to column name fragments.
def_cond_keys = ['allsense', 'common', 'obscure', 'clue_context']
ans_cond_keys = ['allsense', 'common', 'obscure']

# Build 4x3 median rank matrices for each group.
connected_mask = analysis_df['wn_connected']
n_conn = connected_mask.sum()
n_unconn = (~connected_mask).sum()

mat_conn = np.zeros((4, 3))
mat_unconn = np.zeros((4, 3))

for i, dk in enumerate(def_cond_keys):
    for j, ak in enumerate(ans_cond_keys):
        col = f'rank_{dk}_{ak}'
        mat_conn[i, j] = analysis_df.loc[connected_mask, col].median()
        mat_unconn[i, j] = analysis_df.loc[~connected_mask, col].median()

# Shared color scale across both heatmaps.
vmin = min(mat_conn.min(), mat_unconn.min())
vmax = max(mat_conn.max(), mat_unconn.max())

# Use GridSpec to place two heatmaps and a dedicated colorbar axes on the
# far right, with enough padding so the label is never clipped.
fig = plt.figure(figsize=(15, 5))
gs = fig.add_gridspec(1, 2, wspace=0.25)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])

# Place colorbar right next to ax2 using inset_axes or manual positioning
from mpl_toolkits.axes_grid1 import make_axes_locatable
divider = make_axes_locatable(ax2)
cax = divider.append_axes("right", size="5%", pad=0.1)

for ax, mat, title in [
    (ax1, mat_conn, f'WordNet-Connected Pairs (N={n_conn:,})'),
    (ax2, mat_unconn, f'WordNet-Unconnected Pairs (N={n_unconn:,})'),
]:
    im = ax.imshow(mat, cmap='viridis', aspect='auto',
                   vmin=vmin, vmax=vmax)

    # Annotate each cell with the median rank.
    for i in range(4):
        for j in range(3):
            val = mat[i, j]
            # White text on dark cells, black on light.
            norm_val = (val - vmin) / (vmax - vmin)
            text_color = 'white' if norm_val < 0.6 else 'black'
            ax.text(j, i, f'{val:,.0f}', ha='center', va='center',
                    fontsize=11, fontweight='bold', color=text_color)

    ax.set_xticks(np.arange(3))
    ax.set_xticklabels(ans_cond_labels, fontsize=10)
    ax.set_yticks(np.arange(4))
    ax.set_yticklabels(def_cond_labels, fontsize=10)
    ax.set_xlabel('Answer Condition', fontsize=11)
    ax.set_ylabel('Definition Condition', fontsize=11)
    ax.set_title(title, fontsize=12)

fig.suptitle('Median Retrieval Rank by Definition \u00d7 Answer Condition',
             fontsize=14, y=1.02)

# Colorbar in its own axes on the far right.
cbar = fig.colorbar(im, cax=cax)
cbar.set_label('Median Rank (lower = better)', fontsize=10)

fig.savefig(FIGURES_DIR / 'retrieval_by_wordnet_connectivity.png',
            dpi=150, bbox_inches='tight')
print(f'Saved: {FIGURES_DIR / "retrieval_by_wordnet_connectivity.png"}')
plt.show()

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/figures/retrieval_by_wordnet_connectivity.png

# =====================================================================
# Figure: Misdirection Delta (Clue Context Rank − Allsense Rank)
# =====================================================================

# Print column names to confirm exact rank column names.
rank_cols = [c for c in analysis_df.columns if c.startswith('rank_')]
print('Rank columns:', rank_cols)

# Misdirection delta: how much worse does clue context perform vs. allsense?
analysis_df['misdirection_delta'] = (
    analysis_df['rank_clue_context_allsense']
    - analysis_df['rank_allsense_allsense']
)

connected_mask = analysis_df['wn_connected']
delta_conn = analysis_df.loc[connected_mask, 'misdirection_delta']
delta_unconn = analysis_df.loc[~connected_mask, 'misdirection_delta']

# --- Summary statistics ---
print('\nMisdirection Delta Summary:')
print(f'{"":20s} {"Connected":>12s} {"Unconnected":>12s}')
print(f'{"-"*44}')
print(f'{"N pairs":20s} {len(delta_conn):>12,} {len(delta_unconn):>12,}')
print(f'{"Median delta":20s} {delta_conn.median():>12,.0f} '
      f'{delta_unconn.median():>12,.0f}')
print(f'{"Mean delta":20s} {delta_conn.mean():>12,.0f} '
      f'{delta_unconn.mean():>12,.0f}')
print(f'{"% delta > 0":20s} {(delta_conn > 0).mean():>11.1%} '
      f'{(delta_unconn > 0).mean():>11.1%}')
print(f'{"% delta < 0":20s} {(delta_conn < 0).mean():>11.1%} '
      f'{(delta_unconn < 0).mean():>11.1%}')

# --- KDE plot ---
# Clip to a reasonable range based on quantiles to avoid extreme outliers.
q_low = analysis_df['misdirection_delta'].quantile(0.01)
q_high = analysis_df['misdirection_delta'].quantile(0.99)
clip_low = max(q_low, -15000)
clip_high = min(q_high, 35000)

fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(delta_conn.clip(clip_low, clip_high), bins=120,
        density=True, alpha=0.45, color='#4878CF',
        label=f'Connected (N={len(delta_conn):,})', edgecolor='none')
ax.hist(delta_unconn.clip(clip_low, clip_high), bins=120,
        density=True, alpha=0.45, color='#D65F5F',
        label=f'Unconnected (N={len(delta_unconn):,})', edgecolor='none')

# Median lines.
med_conn = delta_conn.median()
med_unconn = delta_unconn.median()
ymax = ax.get_ylim()[1]
ax.axvline(med_conn, color='#2e4a7a', linestyle='--', linewidth=1.5,
           alpha=0.9)
ax.axvline(med_unconn, color='#a03030', linestyle='--', linewidth=1.5,
           alpha=0.9)
ax.text(med_conn, ymax * 0.92,
        f'  Connected median: {med_conn:+,.0f}',
        fontsize=9, color='#2e4a7a', fontweight='bold', va='top')
ax.text(med_unconn, ymax * 0.82,
        f'  Unconnected median: {med_unconn:+,.0f}',
        fontsize=9, color='#a03030', fontweight='bold', va='top')

# Zero line for reference.
ax.axvline(0, color='gray', linestyle='-', linewidth=0.5, alpha=0.5)

ax.set_xlim(clip_low, clip_high)
ax.set_xlabel('Misdirection Delta (Clue Context Rank \u2212 Allsense Rank)',
              fontsize=12)
ax.set_ylabel('Density', fontsize=12)
ax.set_title('Distribution of Misdirection Delta '
             '(Clue Context Rank \u2212 Allsense Rank)', fontsize=13)
ax.legend(fontsize=10, loc='upper right')
ax.tick_params(axis='both', labelsize=10)
sns.despine()

plt.tight_layout()
fig.savefig(FIGURES_DIR / 'misdirection_delta_by_connectivity.png',
            dpi=150, bbox_inches='tight')
print(f'\nSaved: {FIGURES_DIR / "misdirection_delta_by_connectivity.png"}')
plt.show()

Rank columns: ['rank_allsense_allsense', 'rank_allsense_common', 'rank_allsense_obscure', 'rank_common_allsense', 'rank_common_common', 'rank_common_obscure', 'rank_obscure_allsense', 'rank_obscure_common', 'rank_obscure_obscure', 'rank_clue_context_allsense', 'rank_clue_context_common', 'rank_clue_context_obscure']

Misdirection Delta Summary:
                        Connected  Unconnected
--------------------------------------------
N pairs                    65,921       61,687
Median delta                  763            4
Mean delta                  3,881          462
% delta > 0                64.4%       50.2%
% delta < 0                35.5%       49.8%

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/figures/misdirection_delta_by_connectivity.png

# =====================================================================
# Figure: Retrieval Rank by Minimum WordNet Connection Depth
# =====================================================================

tier_order = ['Synonym', '1-hop', '2-hop', '3-hop', 'Unconnected']
tier_colors = ['#1a3a5c', '#2e6299', '#4a8cc7', '#85c1e9', '#bdbdbd']
rank_col = 'rank_allsense_allsense'

fig, ax = plt.subplots(figsize=(10, 6))

# Prepare data for each tier.
box_data = []
for tier in tier_order:
    vals = analysis_df.loc[
        analysis_df['min_hop_tier'] == tier, rank_col
    ].values
    box_data.append(vals)

bp = ax.boxplot(box_data, positions=range(len(tier_order)),
                widths=0.55, patch_artist=True,
                showfliers=False,  # suppress outlier dots for clarity
                medianprops=dict(color='black', linewidth=1.5))

# Color each box.
for patch, color in zip(bp['boxes'], tier_colors):
    patch.set_facecolor(color)
    patch.set_edgecolor('white')
    patch.set_linewidth(0.5)

# Annotate each box with its median rank.
for i, (tier, vals) in enumerate(zip(tier_order, box_data)):
    med = np.median(vals)
    ax.text(i, med * 0.72, f'{med:,.0f}',
            ha='center', va='top', fontsize=9,
            fontweight='bold', color='black')

# Random baseline: half the candidate pool (45,254 / 2 = 22,627).
ax.axhline(y=22_627, color='gray', linestyle='--', linewidth=0.8,
           alpha=0.6)
ax.text(4.45, 22_627 * 1.12, 'random baseline (22,627)',
        fontsize=8, color='gray', ha='right')

ax.set_yscale('log')
ax.set_ylim(top=45_254 * 1.5)
ax.set_xticks(range(len(tier_order)))
ax.set_xticklabels(tier_order, fontsize=11)
ax.set_xlabel('Minimum WordNet Connection Depth', fontsize=12)
ax.set_ylabel('Retrieval Rank (log scale)', fontsize=12)
ax.set_title('Retrieval Rank (Allsense \u00d7 Allsense) '
             'by Minimum WordNet Connection Depth', fontsize=13)
ax.tick_params(axis='both', labelsize=10)
sns.despine()

plt.tight_layout()
fig.savefig(FIGURES_DIR / 'retrieval_rank_by_hop_depth.png',
            dpi=150, bbox_inches='tight')
print(f'Saved: {FIGURES_DIR / "retrieval_rank_by_hop_depth.png"}')
plt.show()

# --- Summary table ---
print(f'\n{"Tier":<14s} {"N":>8s} {"Median Rank":>12s} '
      f'{"Mean Rank":>11s} {"Top-10 Hit Rate":>16s}')
print('-' * 63)
for tier, vals in zip(tier_order, box_data):
    n = len(vals)
    med = np.median(vals)
    mean = np.mean(vals)
    top10 = (vals <= 10).mean()
    print(f'{tier:<14s} {n:>8,} {med:>12,.0f} '
          f'{mean:>11,.0f} {top10:>15.2%}')

Saved: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/outputs/figures/retrieval_rank_by_hop_depth.png

Tier                  N  Median Rank   Mean Rank  Top-10 Hit Rate
---------------------------------------------------------------
Synonym          14,675          258       3,092          19.01%
1-hop            24,041          387       3,290          11.26%
2-hop            17,504          570       3,738           9.26%
3-hop             9,701          928       4,329           4.41%
Unconnected      61,687        2,107       6,941           3.68%

	Answer: Allsense	Answer: Common	Answer: Obscure
Def: Allsense	baseline
Def: Common
Def: Obscure
Def: Clue Context	misdirection

File	Description
`outputs/retrieval_results_unique_pairs.csv`	12 rows: metrics for each of the 4×3 conditions
`outputs/retrieval_results_all_rows.csv`	1 row: Allsense × Allsense over all 240K rows
`outputs/figures/retrieval_bar_chart.png`	Grouped bar chart: median rank (log scale)
`outputs/figures/retrieval_heatmap.png`	Heatmap: mean cosine similarity (4×3)
`outputs/figures/wordnet_reachability_bar.png`	Cumulative WordNet reachability by hop depth
`outputs/figures/wordnet_hop_distribution.png`	Stacked bar: pair distribution by connection depth
`outputs/figures/retrieval_by_wordnet_connectivity.png`	Paired heatmaps: connected vs. unconnected median rank
`outputs/figures/misdirection_delta_by_connectivity.png`	Misdirection delta by WordNet connectivity
`outputs/figures/retrieval_rank_by_hop_depth.png`	Box plot: retrieval rank by hop depth

Def Condition	Ans Condition	Top-1	Top-10	Top-100	Mean Rank	Median Rank	Mean Cos Sim
Allsense	Allsense	1.11%	5.41%	18.08%	4,827	1,015	0.6461
Allsense	Common	0.97%	4.80%	16.25%	5,226	1,207	0.6093
Allsense	Obscure	0.93%	4.58%	15.67%	5,363	1,289	0.5982
Common	Allsense	0.94%	4.69%	15.64%	5,433	1,360	0.5983
Common	Common	0.91%	4.36%	14.57%	5,693	1,516	0.5792
Common	Obscure	0.83%	4.02%	13.84%	5,822	1,617	0.5622
Obscure	Allsense	0.89%	4.48%	14.87%	5,595	1,466	0.5873
Obscure	Common	0.82%	4.08%	13.84%	5,847	1,644	0.5666
Obscure	Obscure	0.82%	3.91%	13.37%	5,945	1,717	0.5569
Clue Context	Allsense	0.69%	3.09%	11.59%	6,441	2,160	0.5364
Clue Context	Common	0.57%	2.68%	10.25%	6,789	2,499	0.5019
Clue Context	Obscure	0.55%	2.54%	9.84%	6,891	2,581	0.4928

Step 4: Retrieval Analysis (Descriptive, Pre-Modeling)¶

What we do¶

Reporting unit (Decision 5)¶

Inputs¶

Outputs¶

Data Loading¶

Embedding Alignment¶

Unique-Pair Deduplication¶

Retrieval Evaluation Function¶

Primary Analysis: 4×3 Retrieval Matrix (Unique Pairs)¶

Supplementary Analysis: All-Rows View¶

Interpretation Guide¶

1. Misdirection effect: Allsense vs. Clue Context (definition side)¶

2. Sense exploitation: Common vs. Obscure (definition side)¶

3. Answer-side sense variation¶

4. Unique pairs vs. all rows¶

Visualizations¶

Discussion¶

1. Misdirection effect confirmed¶

2. Allsense outperforms both Common and Obscure on the definition side¶

3. Answer-side pattern: Allsense consistently best¶

4. Single-synset caveat¶

5. All-rows vs. unique-pairs comparison¶

6. Scale difference from preliminary results¶

WordNet Reachability Analysis¶

Extending to 3-hop relationships¶

WordNet Relationship Glossary¶

Interpretation¶

Linking Retrieval Ranks to WordNet Connectivity¶

WordNet Reachability by Hop Depth¶

Distribution of Minimum Connection Depth¶

Retrieval Performance by WordNet Connectivity¶

Misdirection Magnitude by WordNet Connectivity¶

Retrieval Rank Stratified by WordNet Connection Depth¶

Crossover Interpretation¶

Summary¶

What was done¶

Key findings¶

Output files¶

What comes next¶

Findings for FINDINGS.md¶

Step 4: Retrieval Analysis¶