03 — Feature Engineering¶

Primary author: Victoria

Builds on:

  • 02_embedding_generation.ipynb (Victoria — embedding files and index contract)
  • Hans's Hans_Supervised_Learning.ipynb and Hans_Supervised_Learning_Models.ipynb (cosine similarity computation pattern, WordNet relationship features, surface features — expanding from 10 to 47)

Prompt engineering: Victoria
AI assistance: Claude Code (Anthropic)
Environment: Local (CPU only)

Computes all 47 features for each (clue, definition, answer) row. Loads embeddings from Step 2, then computes 15 context-free + 6 context-informed cosine similarities, 22 WordNet relationship features, and 4 surface features. Outputs data/features_all.parquet.

Implements PLAN.md Step 3.

Inputs:

  • data/clues_filtered.csv (Step 1)
  • data/embeddings/ — 3 .npy arrays + 3 index CSVs (Step 2)
  • data/embeddings/clue_context_phrases.csv (Step 2 — provides definition_wn and answer_wn lookup keys)
  • WordNet (via NLTK)

Output:

  • data/features_all.parquet — one row per (clue, definition, answer) with all 47 features plus metadata columns

Imports¶

In [1]:
import warnings

import numpy as np
import pandas as pd
from pathlib import Path

import nltk
from nltk.corpus import wordnet as wn
from sklearn.metrics.pairwise import cosine_similarity

from tqdm.auto import tqdm

warnings.filterwarnings('ignore', category=FutureWarning)

Environment and Paths¶

In [2]:
# --- Environment Auto-Detection ---
# Same pattern as 02_embedding_generation.ipynb: detect Colab, Great Lakes,
# or local and set paths accordingly.
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/'
                        'Milestone II - NLP Cryptic Crossword Clues/'
                        'clue_misdirection')
else:
    # Local or Great Lakes: notebook is in clue_misdirection/notebooks/,
    # so parent is the clue_misdirection project root.
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
EMBEDDINGS_DIR = DATA_DIR / 'embeddings'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Download WordNet data — needed for the 22 relationship features computed
# later in this notebook.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

print(f'Environment: {"Google Colab" if IS_COLAB else "Local / Great Lakes"}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Embeddings directory: {EMBEDDINGS_DIR}')
Environment: Local / Great Lakes
Project root: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection
Data directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data
Embeddings directory: /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/embeddings

Load Data¶

We load two CSV files:

  1. clues_filtered.csv (Step 1) — 241,397 rows with all metadata columns (clue_id, clue, surface, definition, answer, def_answer_pair_id, etc.).

  2. clue_context_phrases.csv (Step 2) — 240,211 rows that survived the Step 2 cleanup (dropping 1,186 rows for duplicate definitions in the surface text and fully unresolvable words). This file provides the definition_wn and answer_wn columns — the WordNet-ready lookup keys needed to find the correct row in the embedding index files.

We inner-merge on (clue_id, definition) to produce a 240,211-row working set that has both the original metadata and the embedding lookup keys. Merging on clue_id alone would be wrong: double-definition clues have multiple rows per clue_id (one per valid definition), so a clue_id-only join would produce a many-to-many cross product. Adding definition as a second key disambiguates these rows.

We also record each row's position in clue_context_phrases.csv as cc_row_position. Since clue_context_phrases.csv and clue_context_embeddings.npy are in identical row order (verified in Step 2), this gives us a direct index into the clue-context embedding array — no need to map through clue_context_index.csv, which only has clue_id and would suffer the same double-definition ambiguity.
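The many-to-many hazard can be seen on a toy example (hypothetical values; clue_id 2 stands in for a double-definition clue):

```python
import pandas as pd

# Toy stand-ins for clues_filtered and clue_context_phrases.
# clue_id 2 is a double-definition clue: two rows, one per definition.
clues = pd.DataFrame({
    'clue_id':    [1, 2, 2],
    'definition': ['acquisitive', 'sack', 'fire'],
    'answer':     ['COVETOUS', 'DISMISS', 'DISMISS'],
})
phrases = pd.DataFrame({
    'clue_id':       [1, 2, 2],
    'definition':    ['acquisitive', 'sack', 'fire'],
    'definition_wn': ['acquisitive', 'sack', 'fire'],
})

# clue_id-only join: the two clue_id-2 rows on each side cross-multiply
# (2 x 2 = 4 rows for that clue), inflating the total from 3 to 5.
bad = clues.merge(phrases[['clue_id', 'definition_wn']], on='clue_id')

# (clue_id, definition) join: each row matches exactly one partner.
good = clues.merge(phrases, on=['clue_id', 'definition'])

print(len(bad), len(good))   # 5 3
```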

In [3]:
# --- Load clues_filtered.csv (Step 1 output) ---
clues_path = DATA_DIR / 'clues_filtered.csv'
assert clues_path.exists(), (
    f'Missing input file: {clues_path}\n'
    f'Run 01_data_cleaning.ipynb first to produce this file.'
)
clues_df = pd.read_csv(clues_path)
print(f'clues_filtered.csv: {len(clues_df):,} rows')

# --- Load clue_context_phrases.csv (Step 2 intermediate) ---
# This file provides definition_wn and answer_wn — the WordNet-ready lookup
# keys that map to the embedding index files. It also identifies which rows
# survived Step 2's cleanup (240,211 of the original 241,397).
# CRITICAL: keep_default_na=False prevents pandas from interpreting the word
# "nan" (grandmother) as NaN — see DATA.md.
cc_phrases_path = EMBEDDINGS_DIR / 'clue_context_phrases.csv'
assert cc_phrases_path.exists(), (
    f'Missing input file: {cc_phrases_path}\n'
    f'Run 02_embedding_generation.ipynb first to produce this file.'
)
cc_phrases = pd.read_csv(cc_phrases_path, keep_default_na=False)
print(f'clue_context_phrases.csv: {len(cc_phrases):,} rows')

# --- Record each row's position in cc_phrases / clue_context_embeddings ---
# clue_context_phrases.csv and clue_context_embeddings.npy are in identical
# row order (verified in Step 2). By recording the row position here, we get
# a direct index into the clue-context embedding array after the merge —
# avoiding the ambiguity of mapping through clue_context_index.csv, which
# only has clue_id and can't disambiguate double-definition clues.
cc_phrases['cc_row_position'] = np.arange(len(cc_phrases))

# --- Merge to get definition_wn and answer_wn onto the clue rows ---
# Inner merge restricts to the 240,211 rows that have embeddings.
# We merge on (clue_id, definition) — NOT clue_id alone — because
# double-definition clues have multiple rows per clue_id. A clue_id-only
# merge would produce a many-to-many cross product for those clues,
# inflating the row count. The 'definition' column appears in both files
# (original-case definition text) and disambiguates which definition each
# row corresponds to.
df = clues_df.merge(
    cc_phrases[['clue_id', 'definition', 'definition_wn', 'answer_wn',
                'def_num_usable_synsets', 'ans_num_usable_synsets',
                'cc_row_position']],
    on=['clue_id', 'definition'],
    how='inner'
)

# Verify the merge produced exactly the expected number of rows — no
# inflation from many-to-many joins and no unexpected drops.
assert len(df) == len(cc_phrases), (
    f'Merge produced {len(df):,} rows, expected {len(cc_phrases):,}. '
    f'This likely means a double-definition clue was not disambiguated '
    f'correctly by the (clue_id, definition) key.')

print(f'\nWorking set after merge: {len(df):,} rows')
print(f'  (dropped {len(clues_df) - len(df):,} rows without embeddings)')
print(f'  Unique (definition, answer) pairs: '
      f'{df["def_answer_pair_id"].nunique():,}')
print(f'\nColumns: {list(df.columns)}')
df.head(3)
clues_filtered.csv: 241,397 rows
clue_context_phrases.csv: 240,211 rows

Working set after merge: 240,211 rows
  (dropped 1,186 rows without embeddings)
  Unique (definition, answer) pairs: 128,961

Columns: ['clue_id', 'clue', 'surface', 'surface_normalized', 'definition', 'answer', 'answer_format', 'num_definitions', 'def_answer_pair_id', 'definition_wn', 'answer_wn', 'def_num_usable_synsets', 'ans_num_usable_synsets', 'cc_row_position']
Out[3]:
clue_id clue surface surface_normalized definition answer answer_format num_definitions def_answer_pair_id definition_wn answer_wn def_num_usable_synsets ans_num_usable_synsets cc_row_position
0 1 Acquisitive chap, as we see it (8) Acquisitive chap, as we see it acquisitive chap as we see it Acquisitive COVETOUS 8 1 0 acquisitive covetous 1 2 0
1 2 Back yard fencing weak and sagging (6) Back yard fencing weak and sagging back yard fencing weak and sagging sagging DROOPY 6 1 1 sagging droopy 3 1 1
2 3 Stripping off uniform, love holding colonel's ... Stripping off uniform, love holding colonel's ... stripping off uniform love holding colonels coat Stripping UNCLOTHING 10 1 2 stripping unclothing 14 3 2

Load Embeddings¶

Load all 6 embedding files from Step 2 (3 .npy arrays + 3 index CSVs). The index CSVs map row positions in the .npy arrays to word strings (definition_wn / answer_wn) or clue_id.
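The keep_default_na=False flag used when reading these index CSVs matters because pandas otherwise converts the literal string "nan" to a missing value. A minimal reproduction of the failure mode, on a hypothetical two-row CSV:

```python
import io
import pandas as pd

# "nan" (grandmother) is a legitimate crossword word, not a missing value.
csv = 'word\nnan\nrose\n'

# Default behavior: the string "nan" is parsed as float NaN — the lookup
# key is silently destroyed.
default = pd.read_csv(io.StringIO(csv))

# With keep_default_na=False, every cell stays a string.
safe = pd.read_csv(io.StringIO(csv), keep_default_na=False)

print(default['word'].isna().sum(), safe['word'].tolist())
```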

In [4]:
# Load embedding arrays and index files.
# CRITICAL: keep_default_na=False on all index CSVs — the word "nan"
# (grandmother) is a valid crossword definition/answer.

definition_embeddings = np.load(EMBEDDINGS_DIR / 'definition_embeddings.npy')
definition_index = pd.read_csv(
    EMBEDDINGS_DIR / 'definition_index.csv', index_col=0,
    keep_default_na=False)

answer_embeddings = np.load(EMBEDDINGS_DIR / 'answer_embeddings.npy')
answer_index = pd.read_csv(
    EMBEDDINGS_DIR / 'answer_index.csv', index_col=0,
    keep_default_na=False)

clue_context_embeddings = np.load(
    EMBEDDINGS_DIR / 'clue_context_embeddings.npy')
clue_context_index = pd.read_csv(
    EMBEDDINGS_DIR / 'clue_context_index.csv', index_col=0,
    keep_default_na=False)

# --- Print shapes and sizes ---
EMBED_DIM = 1024
print(f'{"File":<35s} {"Shape":<25s} {"Memory":>8s}')
print(f'{"-"*35} {"-"*25} {"-"*8}')
for name, arr in [
    ('definition_embeddings.npy', definition_embeddings),
    ('answer_embeddings.npy', answer_embeddings),
    ('clue_context_embeddings.npy', clue_context_embeddings),
]:
    mb = arr.nbytes / 1024**2
    print(f'{name:<35s} {str(arr.shape):<25s} {mb:>6.1f} MB')

total_mb = (definition_embeddings.nbytes + answer_embeddings.nbytes
            + clue_context_embeddings.nbytes) / 1024**2
print(f'\nTotal embedding memory: {total_mb:.1f} MB')
print(f'\nIndex sizes:')
print(f'  definition_index:     {len(definition_index):,} rows')
print(f'  answer_index:         {len(answer_index):,} rows')
print(f'  clue_context_index:   {len(clue_context_index):,} rows')

# --- Shape and consistency assertions ---
n_def = len(definition_index)
n_ans = len(answer_index)
n_cc = len(clue_context_index)

assert definition_embeddings.shape == (n_def, 3, EMBED_DIM), (
    f'definition_embeddings shape mismatch: expected ({n_def}, 3, {EMBED_DIM}), '
    f'got {definition_embeddings.shape}')
assert answer_embeddings.shape == (n_ans, 3, EMBED_DIM), (
    f'answer_embeddings shape mismatch: expected ({n_ans}, 3, {EMBED_DIM}), '
    f'got {answer_embeddings.shape}')
assert clue_context_embeddings.shape == (n_cc, EMBED_DIM), (
    f'clue_context_embeddings shape mismatch: expected ({n_cc}, {EMBED_DIM}), '
    f'got {clue_context_embeddings.shape}')

print(f'\nAll shape assertions passed.')
File                                Shape                       Memory
----------------------------------- ------------------------- --------
definition_embeddings.npy           (27385, 3, 1024)           320.9 MB
answer_embeddings.npy               (45254, 3, 1024)           530.3 MB
clue_context_embeddings.npy         (240211, 1024)             938.3 MB

Total embedding memory: 1789.6 MB

Index sizes:
  definition_index:     27,385 rows
  answer_index:         45,254 rows
  clue_context_index:   240,211 rows

All shape assertions passed.

Build Embedding Matrices¶

For each of the 240,211 rows in our working set, we need 7 embedding vectors (each 1024-dim):

Abbrev      Source       Sense                Slot in .npy array
w1_all      definition   all-sense average    definition_embeddings[idx, 0, :]
w1_common   definition   common synset        definition_embeddings[idx, 1, :]
w1_obscure  definition   obscure synset       definition_embeddings[idx, 2, :]
w2_all      answer       all-sense average    answer_embeddings[idx, 0, :]
w2_common   answer       common synset        answer_embeddings[idx, 1, :]
w2_obscure  answer       obscure synset       answer_embeddings[idx, 2, :]
w1_clue     definition   in clue context      clue_context_embeddings[idx, :]

Rather than looping row-by-row, we build full (N, 1024) matrices for each embedding type using bulk index lookups, then fancy-index into the .npy arrays. For definition and answer embeddings, we map definition_wn / answer_wn strings to their row positions in the index files. For clue-context embeddings, we use the cc_row_position column recorded during the merge — this is a direct positional index into the .npy array and correctly handles double-definition clues (where multiple rows share the same clue_id but have different clue-context embeddings).
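The bulk-lookup pattern can be sketched on toy arrays (names, words, and shapes below are illustrative, not the real files):

```python
import numpy as np
import pandas as pd

# Toy embedding store: 3 words x 3 sense slots x 4 dims.
emb = np.arange(3 * 3 * 4, dtype=float).reshape(3, 3, 4)
index = pd.DataFrame({'word': ['cat', 'dog', 'eel']})   # row i -> word

# word -> row position, as in def_word_to_idx below.
word_to_idx = pd.Series(index.index, index=index['word'])

# The working set references words in arbitrary order, with repeats.
keys = pd.Series(['dog', 'cat', 'dog'])
rows = keys.map(word_to_idx).astype(int).values          # [1, 0, 1]

# One fancy-index pulls the slot-0 vector for every row at once.
w_all = emb[rows, 0, :]                                  # shape (3, 4)
print(rows, w_all.shape)
```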

In [5]:
# --- Build word → row-position mappings for O(1) lookup ---
# definition_index and answer_index have integer row indices (0, 1, 2, ...)
# and a 'word' column. We create a Series mapping word string → row position.
def_word_to_idx = pd.Series(
    definition_index.index, index=definition_index['word'])
ans_word_to_idx = pd.Series(
    answer_index.index, index=answer_index['word'])

# --- Map each row's lookup key to its position in the embedding arrays ---
# .map() returns NaN for any key not found — we assert none are missing.
def_indices = df['definition_wn'].map(def_word_to_idx)
assert def_indices.notna().all(), (
    f'{def_indices.isna().sum()} definition_wn values not found in '
    f'definition_index. Examples: '
    f'{df.loc[def_indices.isna(), "definition_wn"].head().tolist()}')
def_indices = def_indices.astype(int).values

ans_indices = df['answer_wn'].map(ans_word_to_idx)
assert ans_indices.notna().all(), (
    f'{ans_indices.isna().sum()} answer_wn values not found in '
    f'answer_index. Examples: '
    f'{df.loc[ans_indices.isna(), "answer_wn"].head().tolist()}')
ans_indices = ans_indices.astype(int).values

# For clue-context embeddings, we use cc_row_position directly. This column
# was set to np.arange(len(cc_phrases)) before the merge, giving each row
# its position in clue_context_embeddings.npy. Unlike clue_context_index.csv
# (which only has clue_id), cc_row_position correctly handles double-definition
# clues where multiple rows share the same clue_id but have different
# clue-context embeddings.
cc_indices = df['cc_row_position'].values

# --- Fancy-index into embedding arrays to build (N, 1024) matrices ---
# Each matrix has one row per clue in our working set.
N = len(df)
print(f'Building 7 embedding matrices of shape ({N:,}, {EMBED_DIM})...')

w1_all     = definition_embeddings[def_indices, 0, :]   # (N, 1024)
w1_common  = definition_embeddings[def_indices, 1, :]   # (N, 1024)
w1_obscure = definition_embeddings[def_indices, 2, :]   # (N, 1024)
w2_all     = answer_embeddings[ans_indices, 0, :]       # (N, 1024)
w2_common  = answer_embeddings[ans_indices, 1, :]       # (N, 1024)
w2_obscure = answer_embeddings[ans_indices, 2, :]       # (N, 1024)
w1_clue    = clue_context_embeddings[cc_indices, :]     # (N, 1024)

print(f'Done. Each matrix: {w1_all.shape}')
print(f'Total memory for 7 matrices: '
      f'{7 * w1_all.nbytes / 1024**2:.1f} MB')
Building 7 embedding matrices of shape (240,211, 1024)...
Done. Each matrix: (240211, 1024)
Total memory for 7 matrices: 6568.3 MB

Cosine Similarity Features (21 total)¶

We compute 21 pairwise cosine similarities organized into two groups:

Context-Free Meaning (15): All ${6 \choose 2} = 15$ pairwise cosine similarities among the 6 context-free embeddings (3 definition × 3 answer cross-word pairs = 9, plus 3 within-definition and 3 within-answer pairs). These capture the semantic relationship between definition and answer senses without any influence from the clue's surface reading.

Context-Informed Meaning (6): Cosine similarity between word1_clue_context (the definition embedded within the clue surface using CALE's <t></t> delimiters) and each of the 6 context-free embeddings. These capture how the clue's surface reading shifts the definition's embedding — the core signal for misdirection.

We use row-wise dot products on L2-normalized vectors rather than sklearn.metrics.pairwise.cosine_similarity on individual pairs. This computes all N similarities for a given pair of embedding types in one vectorized operation, which is orders of magnitude faster than looping.
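The equivalence behind this shortcut can be sanity-checked on random data: the row-wise dot product of L2-normalized rows matches a naive per-pair cosine loop (illustrative shapes only):

```python
import numpy as np

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 8))
B = rng.normal(size=(5, 8))

# Vectorized: normalize each row, then take row-wise dot products.
fast = np.sum(
    (A / np.linalg.norm(A, axis=1, keepdims=True))
    * (B / np.linalg.norm(B, axis=1, keepdims=True)), axis=1)

# Naive per-pair loop for comparison.
slow = np.array([
    a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
    for a, b in zip(A, B)])

assert np.allclose(fast, slow)
```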

In [6]:
def rowwise_cosine(A, B):
    """Compute row-wise cosine similarity between corresponding rows of A and B.

    Parameters
    ----------
    A, B : np.ndarray, shape (N, D)
        Two matrices with the same shape.

    Returns
    -------
    np.ndarray, shape (N,)
        Cosine similarity for each row pair: cos(A[i], B[i]).
    """
    # L2-normalize each row, then take the row-wise dot product.
    A_norm = np.linalg.norm(A, axis=1, keepdims=True)
    B_norm = np.linalg.norm(B, axis=1, keepdims=True)
    return np.sum((A / A_norm) * (B / B_norm), axis=1)


# --- Context-Free Meaning: 15 features ---
# Cross-word pairs (definition × answer): 3 × 3 = 9
df['cos_w1all_w2all']       = rowwise_cosine(w1_all, w2_all)
df['cos_w1all_w2common']    = rowwise_cosine(w1_all, w2_common)
df['cos_w1all_w2obscure']   = rowwise_cosine(w1_all, w2_obscure)
df['cos_w1common_w2all']    = rowwise_cosine(w1_common, w2_all)
df['cos_w1common_w2common'] = rowwise_cosine(w1_common, w2_common)
df['cos_w1common_w2obscure']= rowwise_cosine(w1_common, w2_obscure)
df['cos_w1obscure_w2all']   = rowwise_cosine(w1_obscure, w2_all)
df['cos_w1obscure_w2common']= rowwise_cosine(w1_obscure, w2_common)
df['cos_w1obscure_w2obscure'] = rowwise_cosine(w1_obscure, w2_obscure)

# Within-definition pairs: C(3,2) = 3
df['cos_w1all_w1common']    = rowwise_cosine(w1_all, w1_common)
df['cos_w1all_w1obscure']   = rowwise_cosine(w1_all, w1_obscure)
df['cos_w1common_w1obscure']= rowwise_cosine(w1_common, w1_obscure)

# Within-answer pairs: C(3,2) = 3
df['cos_w2all_w2common']    = rowwise_cosine(w2_all, w2_common)
df['cos_w2all_w2obscure']   = rowwise_cosine(w2_all, w2_obscure)
df['cos_w2common_w2obscure']= rowwise_cosine(w2_common, w2_obscure)

# --- Context-Informed Meaning: 6 features ---
# word1_clue_context paired with each of the 6 context-free embeddings.
df['cos_w1clue_w1all']      = rowwise_cosine(w1_clue, w1_all)
df['cos_w1clue_w1common']   = rowwise_cosine(w1_clue, w1_common)
df['cos_w1clue_w1obscure']  = rowwise_cosine(w1_clue, w1_obscure)
df['cos_w1clue_w2all']      = rowwise_cosine(w1_clue, w2_all)
df['cos_w1clue_w2common']   = rowwise_cosine(w1_clue, w2_common)
df['cos_w1clue_w2obscure']  = rowwise_cosine(w1_clue, w2_obscure)

print('Computed 21 cosine similarity features.')
Computed 21 cosine similarity features.

Verification: Cosine Similarity Features¶

In [7]:
# --- Identify the 21 cosine columns ---
cos_cols = [c for c in df.columns if c.startswith('cos_')]

# Assert exactly 21 cosine similarity columns
assert len(cos_cols) == 21, (
    f'Expected 21 cosine similarity columns, found {len(cos_cols)}: {cos_cols}')

# Assert no NaN values in any cosine column (Decision 3: no NaN features).
# Cosine similarities are guaranteed non-NaN as long as embeddings are non-zero,
# which we verified in Step 2.
nan_counts = df[cos_cols].isnull().sum()
assert (nan_counts == 0).all(), (
    f'NaN values found in cosine columns:\n'
    f'{nan_counts[nan_counts > 0]}')

print(f'Verification passed: 21 cosine columns, 0 NaN values.')
print(f'Rows: {len(df):,}\n')

# --- Descriptive statistics ---
# Context-free features (15): cross-word similarities tend to be moderate
# (definition and answer are semantically related but not identical), while
# within-word similarities tend to be high (different senses of the same word
# still share much of the embedding space).
# Context-informed features (6): the clue context shifts the definition
# embedding — the degree of shift is the misdirection signal.
print('--- Descriptive Statistics (21 Cosine Similarity Features) ---\n')
stats = df[cos_cols].describe().T[['mean', 'std', 'min', 'max']]
# Use wider display so columns don't wrap
with pd.option_context('display.float_format', '{:.4f}'.format,
                       'display.max_colwidth', 30):
    print(stats.to_string())

print(f'\n--- First 5 Rows (Cosine Similarity Features) ---\n')
with pd.option_context('display.float_format', '{:.4f}'.format,
                       'display.max_columns', None):
    display(df[cos_cols].head())
Verification passed: 21 cosine columns, 0 NaN values.
Rows: 240,211

--- Descriptive Statistics (21 Cosine Similarity Features) ---

                          mean    std     min    max
cos_w1all_w2all         0.6482 0.1366  0.0909 1.0000
cos_w1all_w2common      0.5954 0.1575  0.0180 0.9689
cos_w1all_w2obscure     0.5940 0.1545  0.0148 0.9689
cos_w1common_w2all      0.5686 0.1710  0.0085 0.9689
cos_w1common_w2common   0.5311 0.1871 -0.0402 1.0000
cos_w1common_w2obscure  0.5171 0.1867 -0.0420 0.9768
cos_w1obscure_w2all     0.5593 0.1640  0.0128 0.9689
cos_w1obscure_w2common  0.5096 0.1801 -0.0444 0.9768
cos_w1obscure_w2obscure 0.5171 0.1754 -0.0407 1.0000
cos_w1all_w1common      0.8638 0.1257  0.2143 1.0000
cos_w1all_w1obscure     0.8636 0.1168  0.3153 1.0000
cos_w1common_w1obscure  0.6891 0.2331 -0.0262 1.0000
cos_w2all_w2common      0.9150 0.1021  0.2143 1.0000
cos_w2all_w2obscure     0.9145 0.0982  0.3153 1.0000
cos_w2common_w2obscure  0.7798 0.2194 -0.0262 1.0000
cos_w1clue_w1all        0.7453 0.1171  0.1011 0.9772
cos_w1clue_w1common     0.6721 0.1780 -0.0199 0.9748
cos_w1clue_w1obscure    0.6374 0.1794 -0.0490 0.9763
cos_w1clue_w2all        0.5446 0.1544 -0.0028 0.9404
cos_w1clue_w2common     0.5019 0.1695 -0.0382 0.9427
cos_w1clue_w2obscure    0.4987 0.1676 -0.0545 0.9441

--- First 5 Rows (Cosine Similarity Features) ---

cos_w1all_w2all cos_w1all_w2common cos_w1all_w2obscure cos_w1common_w2all cos_w1common_w2common cos_w1common_w2obscure cos_w1obscure_w2all cos_w1obscure_w2common cos_w1obscure_w2obscure cos_w1all_w1common cos_w1all_w1obscure cos_w1common_w1obscure cos_w2all_w2common cos_w2all_w2obscure cos_w2common_w2obscure cos_w1clue_w1all cos_w1clue_w1common cos_w1clue_w1obscure cos_w1clue_w2all cos_w1clue_w2common cos_w1clue_w2obscure
0 0.5553 0.5385 0.5349 0.5553 0.5385 0.5349 0.5553 0.5385 0.5349 1.0000 1.0000 1.0000 0.9660 0.9669 0.8680 0.8058 0.8058 0.8058 0.5530 0.5572 0.5120
1 0.7398 0.7398 0.7398 0.7110 0.7110 0.7110 0.7100 0.7100 0.7100 0.9444 0.9177 0.8282 1.0000 1.0000 1.0000 0.6649 0.6840 0.6812 0.5610 0.5610 0.5610
2 0.8343 0.6033 0.7967 0.7282 0.5366 0.6508 0.7749 0.5748 0.7374 0.9136 0.8823 0.7993 0.8615 0.9469 0.7181 0.6194 0.5856 0.6176 0.4217 0.2459 0.3888
3 0.3062 0.2764 0.3207 0.3062 0.2764 0.3207 0.3062 0.2764 0.3207 1.0000 1.0000 1.0000 0.9747 0.9738 0.8985 0.8256 0.8256 0.8256 0.3337 0.2933 0.3575
4 0.7116 0.7116 0.7116 0.6720 0.6720 0.6720 0.5386 0.5386 0.5386 0.9113 0.7305 0.4987 1.0000 1.0000 1.0000 0.8402 0.8220 0.5556 0.6772 0.6772 0.6772

WordNet Relationship Features (22 total)¶

WordNet relationship features capture how the definition and answer words are connected in the WordNet semantic hierarchy. Unlike cosine similarity features (which measure embedding-space proximity), these features capture discrete structural relationships — e.g., whether the answer is a hypernym (more general category) or hyponym (more specific instance) of the definition.

Cryptic crossword setters often exploit these relationships: the definition might be a hypernym of the answer (e.g., "flower" → ROSE), or the two might share a two-hop path (e.g., "plant" and "aster" are both hyponyms of "organism" — a co-hyponymy pattern captured by hyponym_of_hypernym). By encoding which relationship paths exist, we give the classifier structural information about the type of semantic connection — not just whether one exists.
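The two-hop traversal idea can be sketched without WordNet at all, using plain dicts as a toy relationship graph (hypothetical words; the real implementation below operates on NLTK synsets):

```python
# Toy relationship graph — NOT real WordNet data.
hypernyms = {'rose': ['flower'], 'tulip': ['flower'], 'flower': ['plant']}
hyponyms  = {'flower': ['rose', 'tulip'], 'plant': ['flower']}

def reachable(start, target, hops):
    """Follow a sequence of relationship maps from start; check for target."""
    current = {start}
    for rel in hops:
        current = {n for s in current for n in rel.get(s, [])}
        if not current:
            return False
    return target in current

# Co-hyponymy ("hyponym_of_hypernym"): rose -> hypernyms -> {flower}
# -> hyponyms -> {rose, tulip}, which contains tulip.
print(reachable('rose', 'tulip', [hypernyms, hyponyms]))   # True
```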

Feature breakdown:

  • 20 boolean features (wn_rel_*): whether the answer is reachable from the definition via specific one-hop or two-hop WordNet relationships. Covers all relationship types from Table 3 of the design doc: synonym, hypernym, hyponym, meronyms, holonyms, and their two-hop compounds. The original plan estimated 19 types; we implement 20 to include hypernym_of_hyponym, which also appears in the data.
  • wn_max_path_sim (float): maximum path similarity across all definition–answer synset pairs. Uses path_similarity() for consistency with Hans's preliminary findings, where wn_path_sim was the single most predictive feature (removing it alone dropped accuracy by 3.4%). Default 0.0 if no synsets connect.
  • wn_shared_synset_count (int): number of synsets shared by both definition and answer words (both appear as lemmas in the same synset). Default 0.

Missing value handling (Decision 3): Pairs with no WordNet connection get False for all 20 booleans, 0.0 for path similarity, 0 for shared synset count. No NaN values are ever produced.

Optimization: Many rows share the same (definition_wn, answer_wn) pair — different clues for the same definition–answer combination. Since relationship features depend only on the word pair (not the clue text), we compute them once per unique pair and merge back, avoiding redundant WordNet traversals.

Implementation note (Decision 18): All functions are standalone so they can later be extracted into scripts/feature_utils.py for reuse when computing features for distractor pairs in Steps 5 and 7.
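The once-per-unique-pair optimization follows a standard deduplicate-then-merge pattern. A minimal sketch with a hypothetical stand-in feature function (any function that depends only on the word pair works the same way):

```python
import pandas as pd

df = pd.DataFrame({
    'clue_id':       [1, 2, 3, 4],
    'definition_wn': ['flower', 'flower', 'dog', 'flower'],
    'answer_wn':     ['rose', 'rose', 'dismiss', 'rose'],
})

# Hypothetical stand-in for compute_wordnet_relationship_features.
def pair_features(def_word, ans_word):
    return {'same_first_letter': def_word[0] == ans_word[0]}

# Compute once per unique (definition_wn, answer_wn) pair...
pairs = (df[['definition_wn', 'answer_wn']]
         .drop_duplicates().reset_index(drop=True))
feats = pd.DataFrame([pair_features(d, a) for d, a in
                      zip(pairs['definition_wn'], pairs['answer_wn'])])
pair_feats = pd.concat([pairs, feats], axis=1)

# ...then merge back: every row sharing a pair gets the same values.
out = df.merge(pair_feats, on=['definition_wn', 'answer_wn'], how='left')
print(len(pair_feats), len(out))   # 2 4
```

Here the feature function runs twice (once per unique pair) instead of four times; on 240,211 rows with ~129k unique pairs the saving is substantial.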

In [8]:
# ============================================================
# WordNet Relationship Feature Functions
# ============================================================
# These functions are standalone and can be extracted into
# scripts/feature_utils.py for reuse in distractor feature
# computation (Steps 5 and 7). See Decision 18.
# ============================================================


def get_wordnet_synsets(word):
    """Look up all WordNet synsets for a word, handling multi-word entries.

    Tries the word as-is first (which works for both single-word and
    underscore-separated multi-word entries like "ice_cream"). If no
    synsets are found and the word contains spaces, retries with spaces
    converted to underscores — WordNet's convention for multi-word entries.

    Parameters
    ----------
    word : str
        The word to look up (from the definition_wn or answer_wn column).

    Returns
    -------
    list of nltk.corpus.reader.wordnet.Synset
        All synsets found for the word. Empty list if no synsets exist.
    """
    synsets = wn.synsets(word)
    if not synsets and ' ' in word:
        synsets = wn.synsets(word.replace(' ', '_'))
    return synsets


def check_synonym(def_synsets, ans_word):
    """Check if the answer word is a synonym of the definition in WordNet.

    Two words are WordNet synonyms if they share at least one synset — i.e.,
    the answer word appears as a lemma in some synset of the definition word.
    This is a lemma-level check, not a synset-graph traversal: we look at
    the set of lemma names within each definition synset and check whether
    the answer word is among them.

    Parameters
    ----------
    def_synsets : list of Synset
        All synsets for the definition word.
    ans_word : str
        The answer word (lowercase, underscored for multi-word; from the
        answer_wn column).

    Returns
    -------
    bool
        True if the answer appears as a lemma in any definition synset.
    """
    for syn in def_synsets:
        lemma_names = {lemma.name().lower() for lemma in syn.lemmas()}
        if ans_word in lemma_names:
            return True
    return False


def check_synset_reachable(def_synsets, ans_synsets_set, hops):
    """Check if any answer synset is reachable from definition synsets via
    a sequence of WordNet relationship hops.

    For single-hop relationships (e.g., "hyponym"), ``hops`` contains one
    method name — we follow that relationship from each definition synset and
    check if any answer synset appears among the targets. For compound
    two-hop relationships (e.g., "hyponym_of_hypernym"), ``hops`` contains
    two method names: the first hop follows the second word of the compound
    name (hypernyms), then the second hop follows the first word (hyponyms).
    This right-to-left reading matches the English semantics: "hyponym OF
    hypernym" = take hypernyms first, then take hyponyms of those.

    Parameters
    ----------
    def_synsets : list of Synset
        Starting synsets (from the definition word).
    ans_synsets_set : set of Synset
        Target synsets (from the answer word).
    hops : list of str
        Sequence of WordNet synset method names to follow. Each must be
        a valid method on nltk.corpus.reader.wordnet.Synset (e.g.,
        'hypernyms', 'hyponyms', 'part_meronyms').

    Returns
    -------
    bool
        True if any answer synset is reachable via the specified path.
    """
    current = set(def_synsets)
    for method_name in hops:
        next_level = set()
        for synset in current:
            next_level.update(getattr(synset, method_name)())
        current = next_level
        if not current:
            return False
    return bool(current & ans_synsets_set)


def compute_max_path_similarity(def_synsets, ans_synsets):
    """Compute the maximum path similarity across all definition-answer
    synset pairs.

    Path similarity (Rada et al., 1989) measures the inverse of the shortest
    path length between two synsets in the WordNet hypernym/hyponym hierarchy,
    normalized to [0, 1]. We use path_similarity (not wup_similarity) for
    consistency with Hans's preliminary classification experiments, where
    wn_path_sim was the single most predictive feature.

    Parameters
    ----------
    def_synsets : list of Synset
        All synsets for the definition word.
    ans_synsets : list of Synset
        All synsets for the answer word.

    Returns
    -------
    float
        Maximum path similarity across all synset pairs. Returns 0.0 if
        no synset pair connects or if either word has no synsets.
    """
    max_sim = 0.0
    for ds in def_synsets:
        for as_ in ans_synsets:
            sim = ds.path_similarity(as_)
            if sim is not None and sim > max_sim:
                max_sim = sim
    return max_sim


def compute_shared_synset_count(def_synsets, ans_synsets):
    """Count synsets that contain both the definition and answer as lemmas.

    A shared synset means both words can express the same concept — they are
    direct synonyms for that particular meaning. This is related to but
    distinct from the synonym boolean: check_synonym looks at lemma names
    within synsets, while this counts the actual synset overlap. In practice
    these are equivalent, but shared_synset_count captures *how many* senses
    overlap, not just whether any do.

    Parameters
    ----------
    def_synsets : list of Synset
        All synsets for the definition word.
    ans_synsets : list of Synset
        All synsets for the answer word.

    Returns
    -------
    int
        Number of synsets appearing in both lists. Returns 0 if no overlap.
    """
    return len(set(def_synsets) & set(ans_synsets))


# --- Relationship type definitions ---
# Maps each relationship type name to the sequence of WordNet synset methods
# needed to check reachability. For compound types "X_of_Y", the first hop
# follows Y and the second hop follows X (read right-to-left). For example,
# "hyponym_of_hypernym" means: from definition synsets, follow hypernyms()
# (first hop), then follow hyponyms() of those results (second hop), and
# check if any answer synset appears in the final set.
#
# These 19 types (plus synonym handled separately = 20 total) cover all
# relationship types from Table 3 of the design doc.
RELATIONSHIP_HOPS = {
    # --- Single-hop relationships (6) ---
    # Direct taxonomic and part-whole relationships.
    'hyponym':           ['hyponyms'],
    'hypernym':          ['hypernyms'],
    'part_holonym':      ['part_holonyms'],
    'part_meronym':      ['part_meronyms'],
    'substance_meronym': ['substance_meronyms'],
    'member_meronym':    ['member_meronyms'],

    # --- Two-hop relationships (13) ---
    # Taxonomic two-hop: moving up/down the is-a hierarchy twice.
    'hyponym_of_hypernym':  ['hypernyms', 'hyponyms'],   # co-hyponymy
    'hypernym_of_hyponym':  ['hyponyms', 'hypernyms'],   # co-hypernymy
    'hyponym_of_hyponym':   ['hyponyms', 'hyponyms'],    # grandchild
    'hypernym_of_hypernym': ['hypernyms', 'hypernyms'],  # grandparent

    # Mixed taxonomic + part-whole: one taxonomic hop + one part-whole hop.
    'part_holonym_of_hyponym':      ['hyponyms', 'part_holonyms'],
    'hyponym_of_part_holonym':      ['part_holonyms', 'hyponyms'],
    'substance_meronym_of_hyponym': ['hyponyms', 'substance_meronyms'],
    'part_meronym_of_hyponym':      ['hyponyms', 'part_meronyms'],
    'hyponym_of_part_meronym':      ['part_meronyms', 'hyponyms'],
    'part_meronym_of_hypernym':     ['hypernyms', 'part_meronyms'],
    'part_holonym_of_hypernym':     ['hypernyms', 'part_holonyms'],

    # Part-whole two-hop: two part-whole hops.
    'part_holonym_of_part_meronym':     ['part_meronyms', 'part_holonyms'],
    'member_meronym_of_member_holonym': ['member_holonyms', 'member_meronyms'],
}


def compute_wordnet_relationship_features(def_word, ans_word):
    """Compute all WordNet relationship features for a (definition, answer) pair.

    This is the master function that combines all relationship checks into
    a single feature dictionary. It is designed to be called once per unique
    (definition_wn, answer_wn) pair; the results are then merged back to all
    rows sharing that pair.

    Parameters
    ----------
    def_word : str
        The definition word (lowercase, underscored for multi-word; from the
        definition_wn column).
    ans_word : str
        The answer word (lowercase, underscored for multi-word; from the
        answer_wn column).

    Returns
    -------
    dict
        Keys are feature column names, values are feature values:
        - 20 boolean features (int 0/1): wn_rel_synonym, wn_rel_hyponym, ...
        - wn_max_path_sim (float): max path similarity, default 0.0
        - wn_shared_synset_count (int): shared synset count, default 0
    """
    def_synsets = get_wordnet_synsets(def_word)
    ans_synsets = get_wordnet_synsets(ans_word)
    ans_synsets_set = set(ans_synsets)

    features = {}

    # --- Synonym (lemma-based, one hop) ---
    features['wn_rel_synonym'] = int(check_synonym(def_synsets, ans_word))

    # --- Synset-based relationship checks (one-hop and two-hop) ---
    for rel_name, hops in RELATIONSHIP_HOPS.items():
        features[f'wn_rel_{rel_name}'] = int(
            check_synset_reachable(def_synsets, ans_synsets_set, hops)
        )

    # --- Max path similarity ---
    features['wn_max_path_sim'] = compute_max_path_similarity(
        def_synsets, ans_synsets)

    # --- Shared synset count ---
    features['wn_shared_synset_count'] = compute_shared_synset_count(
        def_synsets, ans_synsets)

    return features


# Quick sanity check: verify the function produces the expected keys
# and sensible values for a known word pair.
_test = compute_wordnet_relationship_features('flower', 'rose')
_expected_keys = (
    ['wn_rel_synonym']
    + [f'wn_rel_{r}' for r in RELATIONSHIP_HOPS]
    + ['wn_max_path_sim', 'wn_shared_synset_count']
)
assert set(_test.keys()) == set(_expected_keys), (
    f'Key mismatch. Expected {len(_expected_keys)} keys, got {len(_test)}')
print(f'Function sanity check passed (flower → rose):')
print(f'  wn_rel_synonym={_test["wn_rel_synonym"]}, '
      f'wn_rel_hypernym={_test["wn_rel_hypernym"]}, '
      f'wn_rel_hyponym={_test["wn_rel_hyponym"]}, '
      f'wn_max_path_sim={_test["wn_max_path_sim"]:.4f}, '
      f'wn_shared_synset_count={_test["wn_shared_synset_count"]}')
print(f'  Total features per pair: {len(_test)}')
Function sanity check passed (flower → rose):
  wn_rel_synonym=0, wn_rel_hypernym=0, wn_rel_hyponym=0, wn_max_path_sim=0.2500, wn_shared_synset_count=0
  Total features per pair: 22
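To make the right-to-left hop convention concrete, the reachability semantics can be sketched without WordNet. MockSynset and the toy motor-vehicle taxonomy below are hypothetical illustrations only; the real check_synset_reachable (defined earlier in this notebook) applies the same hop logic to actual WordNet Synset objects.

```python
# Minimal mock of the hop-based reachability check. MockSynset is a
# hypothetical stand-in for nltk's Synset, exposing only the two hop
# methods this sketch needs.
class MockSynset:
    def __init__(self, name):
        self.name = name
        self._hypernyms, self._hyponyms = [], []

    def hypernyms(self):
        return self._hypernyms

    def hyponyms(self):
        return self._hyponyms


def reachable(start_synsets, target_set, hops):
    """Follow each hop method in order; True if any target is reached."""
    frontier = list(start_synsets)
    for hop in hops:
        frontier = [s for syn in frontier for s in getattr(syn, hop)()]
    return any(s in target_set for s in frontier)


# Toy taxonomy: car and truck are both hyponyms of motor_vehicle.
motor_vehicle = MockSynset('motor_vehicle')
car, truck = MockSynset('car'), MockSynset('truck')
motor_vehicle._hyponyms = [car, truck]
car._hypernyms = [motor_vehicle]
truck._hypernyms = [motor_vehicle]

# 'hyponym_of_hypernym' (co-hyponymy): one hop up, then one hop back down.
print(reachable([car], {truck}, ['hypernyms', 'hyponyms']))  # True: siblings
print(reachable([car], {truck}, ['hypernyms']))              # False: not parent/child
```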
In [9]:
# --- Deduplicate to unique (definition_wn, answer_wn) pairs ---
# Many clue rows share the same definition–answer pair (different clue
# sentences for the same word pair). Relationship features depend only on
# the word pair, not the clue text, so we compute them once per unique pair
# and merge back. This avoids redundant WordNet traversals and dramatically
# reduces computation time.

unique_pairs = df[['definition_wn', 'answer_wn']].drop_duplicates()
print(f'Total rows:                              {len(df):>10,}')
print(f'Unique (definition_wn, answer_wn) pairs: {len(unique_pairs):>10,}')
print(f'Deduplication ratio:                     {len(df) / len(unique_pairs):>10.2f}x')
print(f'\nComputing WordNet relationship features for {len(unique_pairs):,} '
      f'unique pairs...')
print(f'(This may take 10–30 minutes on CPU.)\n')

# --- Compute features for each unique pair ---
# Use itertuples (faster than iterrows) to iterate over the unique pairs.
pair_list = list(unique_pairs.itertuples(index=False, name=None))
results = []
for def_word, ans_word in tqdm(pair_list, desc='WordNet relationships'):
    feats = compute_wordnet_relationship_features(def_word, ans_word)
    feats['definition_wn'] = def_word
    feats['answer_wn'] = ans_word
    results.append(feats)

rel_features_df = pd.DataFrame(results)
print(f'\nComputed {len(rel_features_df):,} unique-pair feature rows.')

# --- Merge relationship features back to all rows ---
# Left merge preserves all 240,211 rows in df. Each row gets the
# relationship features for its (definition_wn, answer_wn) pair.
n_before = len(df)
df = df.merge(rel_features_df, on=['definition_wn', 'answer_wn'], how='left')
assert len(df) == n_before, (
    f'Merge changed row count: {n_before:,} → {len(df):,}. '
    f'This should not happen with a left merge on unique keys.')

print(f'Merged relationship features back to {len(df):,} rows.')
Total rows:                                 240,211
Unique (definition_wn, answer_wn) pairs:    127,608
Deduplication ratio:                           1.88x

Computing WordNet relationship features for 127,608 unique pairs...
(This may take 10–30 minutes on CPU.)

WordNet relationships:   0%|          | 0/127608 [00:00<?, ?it/s]
Computed 127,608 unique-pair feature rows.
Merged relationship features back to 240,211 rows.
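As a side note, the row-count invariant asserted after the merge can also be enforced by pandas itself: passing validate='many_to_one' makes merge raise MergeError if the right-hand keys are not unique. A minimal sketch with toy data (the values are illustrative, not from the real dataset):

```python
import pandas as pd

# Toy version of the dedup-and-merge pattern above. validate='many_to_one'
# asks pandas to verify the right side has unique (definition_wn, answer_wn)
# keys -- the same invariant the row-count assertion checks.
toy = pd.DataFrame({'definition_wn': ['flower', 'flower', 'hide'],
                    'answer_wn':     ['rose',   'rose',   'conceal']})
pairs = toy[['definition_wn', 'answer_wn']].drop_duplicates()
feats = pairs.assign(wn_rel_synonym=[0, 1])  # illustrative feature values
merged = toy.merge(feats, on=['definition_wn', 'answer_wn'],
                   how='left', validate='many_to_one')
assert len(merged) == len(toy)  # left merge on unique keys keeps every row
print(merged['wn_rel_synonym'].tolist())  # [0, 0, 1]
```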

Verification: WordNet Relationship Features¶

In [10]:
# --- Identify relationship columns ---
wn_rel_bool_cols = sorted([c for c in df.columns if c.startswith('wn_rel_')])
wn_continuous_cols = ['wn_max_path_sim', 'wn_shared_synset_count']
wn_all_cols = wn_rel_bool_cols + wn_continuous_cols

print(f'Boolean relationship columns ({len(wn_rel_bool_cols)}):')
for col in wn_rel_bool_cols:
    print(f'  {col}')
print(f'\nContinuous relationship columns ({len(wn_continuous_cols)}):')
for col in wn_continuous_cols:
    print(f'  {col}')
print(f'\nTotal relationship columns: {len(wn_all_cols)}')

# --- Assert expected counts ---
# The design doc Table 3 lists 20 relationship types (19 from the original
# plan + hypernym_of_hyponym). Together with path similarity and shared
# synset count, that gives 22 total relationship columns — one more than
# the original PLAN.md estimate of 21 (which assumed 19 boolean types).
assert len(wn_rel_bool_cols) == 20, (
    f'Expected 20 boolean relationship columns, found {len(wn_rel_bool_cols)}')
assert len(wn_all_cols) == 22, (
    f'Expected 22 total relationship columns, found {len(wn_all_cols)}')

# --- Assert no NaN values (Decision 3) ---
nan_counts = df[wn_all_cols].isnull().sum()
assert (nan_counts == 0).all(), (
    f'NaN values found in relationship columns:\n'
    f'{nan_counts[nan_counts > 0]}')
print(f'\nVerification passed: {len(wn_rel_bool_cols)} boolean + '
      f'{len(wn_continuous_cols)} continuous = {len(wn_all_cols)} '
      f'relationship columns, 0 NaN values.')
print(f'Rows: {len(df):,}\n')

# --- Boolean feature prevalence ---
# Which relationship types are most common? We expect hyponym and synonym
# to dominate, since cryptic crossword definitions typically use "is-a"
# relationships (hypernym→answer) or direct synonymy.
print(f'--- Boolean Feature Prevalence ---')
print(f'{"Feature":<45s} {"True":>8s} {"False":>8s} {"% True":>8s}')
print(f'{"-"*45} {"-"*8} {"-"*8} {"-"*8}')
for col in wn_rel_bool_cols:
    n_true = int(df[col].sum())
    n_false = len(df) - n_true
    pct = 100 * n_true / len(df)
    print(f'{col:<45s} {n_true:>8,} {n_false:>8,} {pct:>7.2f}%')

# --- Rows with at least one boolean True ---
any_true = df[wn_rel_bool_cols].any(axis=1).sum()
pct_any = 100 * any_true / len(df)
print(f'\nRows with at least one boolean True: '
      f'{any_true:,} / {len(df):,} ({pct_any:.1f}%)')
print(f'(Design doc preliminary finding: ~31–34%)')

# --- Continuous feature statistics ---
print(f'\n--- Continuous Feature Statistics ---\n')
with pd.option_context('display.float_format', '{:.4f}'.format):
    print(df[wn_continuous_cols].describe().to_string())

# --- Distribution of wn_shared_synset_count ---
print(f'\n--- wn_shared_synset_count value distribution ---')
ssc_counts = df['wn_shared_synset_count'].value_counts().sort_index()
for val, cnt in ssc_counts.items():
    print(f'  {val}: {cnt:,} rows ({100 * cnt / len(df):.2f}%)')
Boolean relationship columns (20):
  wn_rel_hypernym
  wn_rel_hypernym_of_hypernym
  wn_rel_hypernym_of_hyponym
  wn_rel_hyponym
  wn_rel_hyponym_of_hypernym
  wn_rel_hyponym_of_hyponym
  wn_rel_hyponym_of_part_holonym
  wn_rel_hyponym_of_part_meronym
  wn_rel_member_meronym
  wn_rel_member_meronym_of_member_holonym
  wn_rel_part_holonym
  wn_rel_part_holonym_of_hypernym
  wn_rel_part_holonym_of_hyponym
  wn_rel_part_holonym_of_part_meronym
  wn_rel_part_meronym
  wn_rel_part_meronym_of_hypernym
  wn_rel_part_meronym_of_hyponym
  wn_rel_substance_meronym
  wn_rel_substance_meronym_of_hyponym
  wn_rel_synonym

Continuous relationship columns (2):
  wn_max_path_sim
  wn_shared_synset_count

Total relationship columns: 22

Verification passed: 20 boolean + 2 continuous = 22 relationship columns, 0 NaN values.
Rows: 240,211

--- Boolean Feature Prevalence ---
Feature                                           True    False   % True
--------------------------------------------- -------- -------- --------
wn_rel_hypernym                                 14,158  226,053    5.89%
wn_rel_hypernym_of_hypernym                      3,008  237,203    1.25%
wn_rel_hypernym_of_hyponym                      25,010  215,201   10.41%
wn_rel_hyponym                                  45,084  195,127   18.77%
wn_rel_hyponym_of_hypernym                      51,674  188,537   21.51%
wn_rel_hyponym_of_hyponym                       16,322  223,889    6.79%
wn_rel_hyponym_of_part_holonym                     651  239,560    0.27%
wn_rel_hyponym_of_part_meronym                     318  239,893    0.13%
wn_rel_member_meronym                               60  240,151    0.02%
wn_rel_member_meronym_of_member_holonym            856  239,355    0.36%
wn_rel_part_holonym                                443  239,768    0.18%
wn_rel_part_holonym_of_hypernym                    132  240,079    0.05%
wn_rel_part_holonym_of_hyponym                     836  239,375    0.35%
wn_rel_part_holonym_of_part_meronym              2,021  238,190    0.84%
wn_rel_part_meronym                                471  239,740    0.20%
wn_rel_part_meronym_of_hypernym                    211  240,000    0.09%
wn_rel_part_meronym_of_hyponym                     416  239,795    0.17%
wn_rel_substance_meronym                            35  240,176    0.01%
wn_rel_substance_meronym_of_hyponym                392  239,819    0.16%
wn_rel_synonym                                  36,121  204,090   15.04%

Rows with at least one boolean True: 120,854 / 240,211 (50.3%)
(Design doc preliminary finding: ~31–34%)

--- Continuous Feature Statistics ---

       wn_max_path_sim  wn_shared_synset_count
count      240211.0000             240211.0000
mean            0.4352                  0.2081
std             0.2902                  0.5066
min             0.0385                  0.0000
25%             0.2500                  0.0000
50%             0.3333                  0.0000
75%             0.5000                  0.0000
max             1.0000                 16.0000

--- wn_shared_synset_count value distribution ---
  0: 197,886 rows (82.38%)
  1: 36,660 rows (15.26%)
  2: 4,336 rows (1.81%)
  3: 934 rows (0.39%)
  4: 244 rows (0.10%)
  5: 87 rows (0.04%)
  6: 43 rows (0.02%)
  7: 16 rows (0.01%)
  14: 1 rows (0.00%)
  16: 4 rows (0.00%)
In [11]:
# What's the percentage at the unique-pair level?
pair_df = df.drop_duplicates(subset=['definition_wn', 'answer_wn'])
pair_any_true = pair_df[wn_rel_bool_cols].any(axis=1).sum()
print(f"Unique pairs with at least one boolean True: "
      f"{pair_any_true:,} / {len(pair_df):,} "
      f"({100 * pair_any_true / len(pair_df):.1f}%)")
Unique pairs with at least one boolean True: 56,220 / 127,608 (44.1%)

Interpretation¶

WordNet connectivity: The percentage of rows with at least one boolean relationship True should be compared to the ~31–34% reported in Hans's preliminary analysis (design doc Table 3). Any difference likely reflects the broader dataset (we include multi-word entries and double-definition clues that the preliminary analysis excluded) and the improved WordNet lookup (underscore conversion for multi-word entries — see Decision 17).

Relationship type distribution: Cryptic crossword definitions typically use hypernym relationships ("flower" for ROSE, "creature" for CAT) or direct synonymy ("hide" for CONCEAL). hyponym_of_hypernym (co-hyponymy) captures pairs like "plant"/"aster" — both children of a shared parent concept — which is common when the definition is a sibling category rather than a direct parent. Two-hop types involving meronyms and holonyms are rarer since part–whole relationships are less common in crossword definitions.

Path similarity: wn_max_path_sim was the single most predictive feature in Hans's preliminary experiments (removing it alone dropped accuracy by 3.4%). Its distribution, including the proportion of zero values where no WordNet path exists, informs whether this feature will remain dominant within the expanded 47-feature set. A value of 1.0 means the definition and answer share a synset; 0.5 indicates a direct parent–child link; values in the 0.05–0.1 range indicate distant connections.
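For reference, NLTK defines path similarity as 1 / (1 + d), where d is the length of the shortest hypernym/hyponym path between the two synsets (d = 0 for a shared synset). A quick sketch of how the score decays with path length:

```python
# Path similarity as a function of shortest-path length d, following
# NLTK's definition: path_similarity = 1 / (1 + d).
def path_sim_from_distance(d):
    return 1.0 / (1.0 + d)

for d in [0, 1, 2, 3, 9]:
    print(f'd = {d}: path_similarity = {path_sim_from_distance(d):.4f}')
# d = 0 (shared synset) gives 1.0; d = 1 (parent-child) gives 0.5;
# d = 3 gives 0.25, the flower -> rose value in the sanity check above.
```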

Shared synsets: Pairs with wn_shared_synset_count > 0 are definitional synonyms — the definition and answer literally share a WordNet sense — making classification trivially easy for those rows. The proportion of such pairs contributes to the baseline difficulty of the classification task.

Surface Features (4 total)¶

Surface features capture shallow orthographic similarity between the definition and answer words. These are the simplest features in the pipeline — they require no embeddings or WordNet lookups, just string comparison.

Unlike cosine similarity features (which measure semantic similarity in embedding space) and relationship features (which measure structural connections in WordNet), surface features measure how the words look — whether they share characters, start with the same letter, or have similar lengths. In cryptic crosswords, surface similarity can be a red herring (e.g., "flower" for ROSE exploits the "something that flows" reading, not orthographic overlap) or occasionally a genuine signal in compound clues where the answer overlaps with definition characters.

We compute these from definition_wn and answer_wn (the WordNet-ready lowercase strings), consistent with how the relationship features use these columns.

Features:

  1. surface_edit_distance — Levenshtein edit distance: minimum number of single-character insertions, deletions, or substitutions to transform one string into the other. Higher values = less similar.
  2. surface_length_ratio — len(shorter) / len(longer), always in [0, 1]. Values near 1.0 mean the words are similar length.
  3. surface_shared_first_letter — 1 if the words start with the same character, 0 otherwise.
  4. surface_char_overlap_ratio — Jaccard similarity of character sets: |chars(def) ∩ chars(ans)| / |chars(def) ∪ chars(ans)|. Measures what fraction of distinct letters appear in both words.
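The four definitions above can be sketched in plain Python (a reference sketch only; the notebook's actual implementation, below, uses nltk.edit_distance, which runs the same dynamic program):

```python
# Pure-Python reference for the four surface features. levenshtein uses
# the standard two-row dynamic program (insert/delete/substitute, cost 1).
def levenshtein(a, b):
    prev = list(range(len(b) + 1))  # prev[j] = distance(a[:i-1], b[:j])
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]


def surface_sketch(def_word, ans_word):
    chars_d, chars_a = set(def_word), set(ans_word)
    return {
        'surface_edit_distance': levenshtein(def_word, ans_word),
        'surface_length_ratio': (min(len(def_word), len(ans_word))
                                 / max(len(def_word), len(ans_word))),
        'surface_shared_first_letter': int(def_word[0] == ans_word[0]),
        'surface_char_overlap_ratio': (len(chars_d & chars_a)
                                       / len(chars_d | chars_a)),
    }


print(surface_sketch('flower', 'rose'))
# edit distance 4, length ratio 4/6, shared first letter 0, char overlap 3/7
```

(Unlike the full implementation below, this sketch does not guard against empty strings.)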
In [12]:
# ============================================================
# Surface Feature Function
# ============================================================
# This function is standalone and can be extracted into
# scripts/feature_utils.py for reuse in distractor feature
# computation (Steps 5 and 7). See Decision 18.
# ============================================================


def compute_surface_features(def_word, ans_word):
    """Compute all 4 surface features for a (definition, answer) pair.

    Surface features measure shallow orthographic similarity between words.
    This function is standalone and can be extracted into
    scripts/feature_utils.py for reuse in distractor feature computation
    (Steps 5 and 7). See Decision 18.

    Parameters
    ----------
    def_word : str
        The definition word (from the definition_wn column).
    ans_word : str
        The answer word (from the answer_wn column).

    Returns
    -------
    dict
        Keys are feature column names, values are feature values:
        - surface_edit_distance (int): Levenshtein edit distance
        - surface_length_ratio (float): len(shorter) / len(longer), in [0, 1]
        - surface_shared_first_letter (int): 1 if same first character, else 0
        - surface_char_overlap_ratio (float): Jaccard similarity of character sets
    """
    features = {}

    # --- Levenshtein edit distance ---
    # Uses NLTK's edit_distance which computes the standard dynamic-programming
    # Levenshtein distance (insertions, deletions, substitutions each cost 1).
    features['surface_edit_distance'] = nltk.edit_distance(def_word, ans_word)

    # --- Length ratio: len(shorter) / len(longer) ---
    # Always in [0, 1]. A value of 1.0 means identical lengths.
    # Default to 0.0 if both strings are empty (should not occur given
    # WordNet filter, but be safe).
    len_def = len(def_word)
    len_ans = len(ans_word)
    if len_def == 0 and len_ans == 0:
        features['surface_length_ratio'] = 0.0
    else:
        shorter = min(len_def, len_ans)
        longer = max(len_def, len_ans)
        features['surface_length_ratio'] = shorter / longer

    # --- Shared first letter ---
    # Binary indicator: do the words start with the same character?
    if def_word and ans_word:
        features['surface_shared_first_letter'] = int(
            def_word[0] == ans_word[0])
    else:
        features['surface_shared_first_letter'] = 0

    # --- Character overlap ratio (Jaccard similarity of character sets) ---
    # Measures what fraction of distinct characters appear in both words.
    # For example, "plant" {p,l,a,n,t} and "aster" {a,s,t,e,r} share
    # {a, t} out of {p,l,a,n,t,s,e,r} → 2/8 = 0.25.
    def_chars = set(def_word)
    ans_chars = set(ans_word)
    union = def_chars | ans_chars
    if len(union) == 0:
        features['surface_char_overlap_ratio'] = 0.0
    else:
        features['surface_char_overlap_ratio'] = (
            len(def_chars & ans_chars) / len(union))

    return features


# Quick sanity check with a known pair
_test_surf = compute_surface_features('flower', 'rose')
print('Sanity check (flower → rose):')
for k, v in _test_surf.items():
    print(f'  {k} = {v}')
assert len(_test_surf) == 4

print(f'\nComputing surface features for {len(df):,} rows...')

# --- Apply to all rows ---
# Surface features are cheap to compute (no WordNet or embedding lookups),
# so we compute them directly on all rows without deduplication.
# Using a list comprehension over zip is faster than df.apply(axis=1).
surf_results = [
    compute_surface_features(d, a)
    for d, a in zip(df['definition_wn'], df['answer_wn'])
]
surf_df = pd.DataFrame(surf_results, index=df.index)
for col in surf_df.columns:
    df[col] = surf_df[col]

print(f'Added {len(surf_df.columns)} surface feature columns to df.')
Sanity check (flower → rose):
  surface_edit_distance = 4
  surface_length_ratio = 0.6666666666666666
  surface_shared_first_letter = 0
  surface_char_overlap_ratio = 0.42857142857142855

Computing surface features for 240,211 rows...
Added 4 surface feature columns to df.

Verification: Surface Features¶

In [13]:
# --- Identify surface columns ---
SURFACE_FEATURE_COLS = ['surface_edit_distance', 'surface_length_ratio',
                        'surface_shared_first_letter', 'surface_char_overlap_ratio']
surface_cols = SURFACE_FEATURE_COLS

# --- Assert exactly 4 surface columns ---
assert len(surface_cols) == 4, (
    f'Expected 4 surface columns, found {len(surface_cols)}: {surface_cols}')

# --- Assert no NaN values (Decision 3) ---
nan_counts = df[surface_cols].isnull().sum()
assert (nan_counts == 0).all(), (
    f'NaN values found in surface columns:\n'
    f'{nan_counts[nan_counts > 0]}')

print(f'Verification passed: 4 surface columns, 0 NaN values.')
print(f'Rows: {len(df):,}\n')

# --- Descriptive statistics ---
print('--- Descriptive Statistics (Surface Features) ---\n')
with pd.option_context('display.float_format', '{:.4f}'.format):
    print(df[surface_cols].describe().to_string())
Verification passed: 4 surface columns, 0 NaN values.
Rows: 240,211

--- Descriptive Statistics (Surface Features) ---

       surface_edit_distance  surface_length_ratio  surface_shared_first_letter  surface_char_overlap_ratio
count            240211.0000           240211.0000                  240211.0000                 240211.0000
mean                  6.7299                0.7259                       0.0738                      0.2304
std                   2.2153                0.1836                       0.2615                      0.1451
min                   0.0000                0.0667                       0.0000                      0.0000
25%                   5.0000                0.6000                       0.0000                      0.1250
50%                   6.0000                0.7500                       0.0000                      0.2222
75%                   8.0000                0.8571                       0.0000                      0.3333
max                  26.0000                1.0000                       1.0000                      1.0000

Quick sanity checks:

Edit distance mean 6.7 — definitions and answers are typically quite different strings, which makes sense.

Shared first letter 7.4% — only about 1 in 13 pairs start with the same letter. This is about double the uniform-chance rate (1/26 ≈ 3.8%), though English first letters are not uniformly distributed (which raises the chance baseline), and some definitions are closely related word forms of the answer.

Character overlap 23% — modest Jaccard overlap of character sets, reasonable.

Length ratio 0.73 mean — definitions and answers tend to be somewhat similar in length.

Final Assembly, Validation, and Save¶

All 47 features have been computed:

  • 15 context-free cosine similarities (no clue context involved)
  • 6 context-informed cosine similarities (definition in clue context vs. each embedding)
  • 22 WordNet relationship features (20 boolean types + max path similarity + shared synset count)
  • 4 surface features (edit distance, length ratio, shared first letter, character overlap)

This section assembles the feature columns and metadata into the final output DataFrame, validates that all features are present and NaN-free (Decision 3), and saves to data/features_all.parquet.

The feature count is 47 rather than the 46 originally planned in PLAN.md because hypernym_of_hyponym was added as a 20th boolean relationship type (the original plan assumed 19 boolean types).

In [14]:
# ============================================================
# Feature Group Definitions
# ============================================================

# --- Context-Free Meaning (15): cosine similarities not involving clue context ---
CONTEXT_FREE_COLS = sorted([
    c for c in df.columns if c.startswith('cos_') and 'w1clue' not in c
])

# --- Context-Informed Meaning (6): cosine similarities involving w1clue ---
CONTEXT_INFORMED_COLS = sorted([
    c for c in df.columns if c.startswith('cos_') and 'w1clue' in c
])

# --- Relationship (22): 20 boolean + path similarity + shared synset count ---
RELATIONSHIP_COLS = (
    sorted([c for c in df.columns if c.startswith('wn_rel_')])
    + ['wn_max_path_sim', 'wn_shared_synset_count']
)

# --- Surface (4): orthographic similarity ---
SURFACE_COLS = ['surface_edit_distance', 'surface_length_ratio',
                'surface_shared_first_letter', 'surface_char_overlap_ratio']

# --- All features ---
ALL_FEATURE_COLS = (
    CONTEXT_FREE_COLS + CONTEXT_INFORMED_COLS
    + RELATIONSHIP_COLS + SURFACE_COLS
)

print('Feature group counts:')
print(f'  Context-free cosine:     {len(CONTEXT_FREE_COLS):>3d}')
print(f'  Context-informed cosine: {len(CONTEXT_INFORMED_COLS):>3d}')
print(f'  WordNet relationship:    {len(RELATIONSHIP_COLS):>3d}')
print(f'  Surface:                 {len(SURFACE_COLS):>3d}')
print(f'  {"─" * 35}')
print(f'  Total features:          {len(ALL_FEATURE_COLS):>3d}')

# --- Assert expected group sizes ---
assert len(CONTEXT_FREE_COLS) == 15, (
    f'Expected 15 context-free, got {len(CONTEXT_FREE_COLS)}')
assert len(CONTEXT_INFORMED_COLS) == 6, (
    f'Expected 6 context-informed, got {len(CONTEXT_INFORMED_COLS)}')
assert len(RELATIONSHIP_COLS) == 22, (
    f'Expected 22 relationship, got {len(RELATIONSHIP_COLS)}')
assert len(SURFACE_COLS) == 4, (
    f'Expected 4 surface, got {len(SURFACE_COLS)}')
assert len(ALL_FEATURE_COLS) == 47, (
    f'Expected 47 total features, got {len(ALL_FEATURE_COLS)}')

# ============================================================
# Metadata Columns
# ============================================================
# Carry through identifiers and lookup keys needed by downstream notebooks.
# Includes def_num_usable_synsets and ans_num_usable_synsets for downstream
# stratification by polysemy level (Decision 19).
# Does NOT include raw embedding matrices or intermediate columns (e.g.,
# cc_row_position used for embedding array indexing).

METADATA_COLS = [
    'clue_id', 'clue', 'surface', 'surface_normalized',
    'definition', 'answer', 'definition_wn', 'answer_wn',
    'def_answer_pair_id', 'answer_format', 'num_definitions',
    'def_num_usable_synsets', 'ans_num_usable_synsets',
]

# Verify all metadata columns exist in df
missing_meta = [c for c in METADATA_COLS if c not in df.columns]
assert not missing_meta, f'Missing metadata columns: {missing_meta}'

# ============================================================
# Build Output DataFrame
# ============================================================
output_df = df[METADATA_COLS + ALL_FEATURE_COLS].copy()

# ============================================================
# Validation
# ============================================================
expected_rows = len(df)
assert len(output_df) == expected_rows, (
    f'Row count mismatch: {len(output_df):,} vs {expected_rows:,}')
assert not output_df[ALL_FEATURE_COLS].isnull().any().any(), (
    'NaN values found in feature columns — violates Decision 3')
assert len(ALL_FEATURE_COLS) == 47, (
    f'Expected 47 features, got {len(ALL_FEATURE_COLS)}')

print(f'\nOutput DataFrame: {len(output_df):,} rows × '
      f'{len(output_df.columns)} columns')
print(f'  Metadata columns: {len(METADATA_COLS)}')
print(f'  Feature columns:  {len(ALL_FEATURE_COLS)}')
print(f'  Total columns:    {len(METADATA_COLS) + len(ALL_FEATURE_COLS)}')

# ============================================================
# Feature Statistics (all 47 features, transposed)
# ============================================================
print(f'\n--- Feature Statistics (all 47 features) ---\n')
with pd.option_context('display.float_format', '{:.4f}'.format,
                       'display.max_rows', 50):
    print(output_df[ALL_FEATURE_COLS].describe().T[
        ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
    ].to_string())

# ============================================================
# Correlation Highlights
# ============================================================
# Flag the top 5 most correlated feature pairs. Some multicollinearity is
# expected: within-word cosine pairs (e.g., cos_w1all_w1common and
# cos_w1all_w1obscure) share most of their variance because allsense is an
# average that includes the common and obscure senses. Certain relationship
# booleans (synonym and hyponym_of_hypernym) may also co-occur frequently.
print(f'\n--- Top 5 Most Correlated Feature Pairs ---\n')
corr_matrix = output_df[ALL_FEATURE_COLS].corr()
# Extract upper triangle (exclude diagonal and lower triangle)
upper_tri = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
# Stack and sort by absolute correlation
corr_pairs = upper_tri.stack().reset_index()
corr_pairs.columns = ['feature_1', 'feature_2', 'correlation']
corr_pairs['abs_corr'] = corr_pairs['correlation'].abs()
top_5 = corr_pairs.nlargest(5, 'abs_corr')
for _, row in top_5.iterrows():
    print(f'  {row["feature_1"]:<40s} × {row["feature_2"]:<40s}'
          f'  r = {row["correlation"]:+.4f}')

# ============================================================
# Save to Parquet
# ============================================================
output_path = DATA_DIR / 'features_all.parquet'
output_df.to_parquet(output_path, index=False)
file_size_mb = output_path.stat().st_size / (1024 * 1024)
print(f'\nSaved to {output_path}')
print(f'File size: {file_size_mb:.1f} MB')

# --- Round-trip verification ---
reloaded = pd.read_parquet(output_path)
assert reloaded.shape == output_df.shape, (
    f'Shape mismatch after reload: {reloaded.shape} vs {output_df.shape}')
print(f'Round-trip verification passed: '
      f'{reloaded.shape[0]:,} rows × {reloaded.shape[1]} columns')
Feature group counts:
  Context-free cosine:      15
  Context-informed cosine:   6
  WordNet relationship:     22
  Surface:                   4
  ───────────────────────────────────
  Total features:           47

Output DataFrame: 240,211 rows × 60 columns
  Metadata columns: 13
  Feature columns:  47
  Total columns:    60

--- Feature Statistics (all 47 features) ---

                                          mean    std     min    25%    50%    75%     max
cos_w1all_w1common                      0.8638 0.1257  0.2143 0.8146 0.8939 0.9521  1.0000
cos_w1all_w1obscure                     0.8636 0.1168  0.3153 0.8004 0.8887 0.9503  1.0000
cos_w1all_w2all                         0.6482 0.1366  0.0909 0.5597 0.6622 0.7512  1.0000
cos_w1all_w2common                      0.5954 0.1575  0.0180 0.4850 0.6090 0.7172  0.9689
cos_w1all_w2obscure                     0.5940 0.1545  0.0148 0.4862 0.6071 0.7112  0.9689
cos_w1common_w1obscure                  0.6891 0.2331 -0.0262 0.5217 0.7244 0.8780  1.0000
cos_w1common_w2all                      0.5686 0.1710  0.0085 0.4431 0.5815 0.7019  0.9689
cos_w1common_w2common                   0.5311 0.1871 -0.0402 0.3819 0.5337 0.6800  1.0000
cos_w1common_w2obscure                  0.5171 0.1867 -0.0420 0.3694 0.5192 0.6640  0.9768
cos_w1obscure_w2all                     0.5593 0.1640  0.0128 0.4390 0.5676 0.6856  0.9689
cos_w1obscure_w2common                  0.5096 0.1801 -0.0444 0.3693 0.5072 0.6487  0.9768
cos_w1obscure_w2obscure                 0.5171 0.1754 -0.0407 0.3818 0.5153 0.6509  1.0000
cos_w2all_w2common                      0.9150 0.1021  0.2143 0.8729 0.9430 1.0000  1.0000
cos_w2all_w2obscure                     0.9145 0.0982  0.3153 0.8696 0.9409 1.0000  1.0000
cos_w2common_w2obscure                  0.7798 0.2194 -0.0262 0.6318 0.8236 1.0000  1.0000
cos_w1clue_w1all                        0.7453 0.1171  0.1011 0.6814 0.7681 0.8318  0.9772
cos_w1clue_w1common                     0.6721 0.1780 -0.0199 0.5655 0.7170 0.8105  0.9748
cos_w1clue_w1obscure                    0.6374 0.1794 -0.0490 0.5127 0.6725 0.7809  0.9763
cos_w1clue_w2all                        0.5446 0.1544 -0.0028 0.4318 0.5546 0.6642  0.9404
cos_w1clue_w2common                     0.5019 0.1695 -0.0382 0.3701 0.5033 0.6360  0.9427
cos_w1clue_w2obscure                    0.4987 0.1676 -0.0545 0.3683 0.4994 0.6303  0.9441
wn_rel_hypernym                         0.0589 0.2355  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_hypernym_of_hypernym             0.0125 0.1112  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_hypernym_of_hyponym              0.1041 0.3054  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_hyponym                          0.1877 0.3905  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_hyponym_of_hypernym              0.2151 0.4109  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_hyponym_of_hyponym               0.0679 0.2517  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_hyponym_of_part_holonym          0.0027 0.0520  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_hyponym_of_part_meronym          0.0013 0.0364  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_member_meronym                   0.0002 0.0158  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_member_meronym_of_member_holonym 0.0036 0.0596  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_part_holonym                     0.0018 0.0429  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_part_holonym_of_hypernym         0.0005 0.0234  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_part_holonym_of_hyponym          0.0035 0.0589  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_part_holonym_of_part_meronym     0.0084 0.0913  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_part_meronym                     0.0020 0.0442  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_part_meronym_of_hypernym         0.0009 0.0296  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_part_meronym_of_hyponym          0.0017 0.0416  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_substance_meronym                0.0001 0.0121  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_substance_meronym_of_hyponym     0.0016 0.0404  0.0000 0.0000 0.0000 0.0000  1.0000
wn_rel_synonym                          0.1504 0.3574  0.0000 0.0000 0.0000 0.0000  1.0000
wn_max_path_sim                         0.4352 0.2902  0.0385 0.2500 0.3333 0.5000  1.0000
wn_shared_synset_count                  0.2081 0.5066  0.0000 0.0000 0.0000 0.0000 16.0000
surface_edit_distance                   6.7299 2.2153  0.0000 5.0000 6.0000 8.0000 26.0000
surface_length_ratio                    0.7259 0.1836  0.0667 0.6000 0.7500 0.8571  1.0000
surface_shared_first_letter             0.0738 0.2615  0.0000 0.0000 0.0000 0.0000  1.0000
surface_char_overlap_ratio              0.2304 0.1451  0.0000 0.1250 0.2222 0.3333  1.0000

--- Top 5 Most Correlated Feature Pairs ---

  cos_w1common_w2all                       × cos_w1common_w2common                     r = +0.8850
  cos_w1obscure_w2all                      × cos_w1obscure_w2obscure                   r = +0.8731
  cos_w1common_w2all                       × cos_w1common_w2obscure                    r = +0.8710
  cos_w1obscure_w2all                      × cos_w1obscure_w2common                    r = +0.8653
  cos_w2all_w2obscure                      × cos_w2common_w2obscure                    r = +0.8624

Saved to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/features_all.parquet
File size: 46.8 MB
Round-trip verification passed: 240,211 rows × 60 columns

Summary¶

This notebook computed 47 features across 4 groups for all 240,211 (clue, definition, answer) rows. The feature count is 47 rather than the 46 originally planned in PLAN.md because hypernym_of_hyponym was added as a 20th boolean WordNet relationship type during implementation (the original plan assumed 19 boolean types).

Feature groups¶

  Group                     Count  Description
  ─────────────────────────────────────────────
  Context-free cosine          15  Pairwise cosine similarities among the 6 sense-specific embeddings (definition × answer cross-word, within-definition, within-answer)
  Context-informed cosine       6  Cosine similarities between the clue-contextualized definition embedding and each of the 6 sense-specific embeddings
  WordNet relationship         22  20 boolean two-hop relationship types + max path similarity + shared synset count
  Surface                       4  Edit distance, length ratio, shared first letter, character overlap ratio
  ─────────────────────────────────────────────
  Total                        47

Key statistics by group¶

Cosine similarities (21 features):

  • Cross-word cosine similarities (definition vs. answer) average 0.50–0.65, reflecting moderate semantic relatedness between definition–answer pairs.
  • Within-word cosine similarities (e.g., allsense vs. common for the same word) average 0.69–0.92, as expected — different senses of the same word share much of the embedding space.
  • Context-informed features show the misdirection signal: cos_w1clue_w2all (mean ~0.54) is lower than cos_w1all_w2all (mean ~0.65), consistent with the hypothesis that clue context pulls the definition embedding away from the answer.
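Both cosine groups reduce to the same row-wise operation; a minimal sketch, with small random arrays standing in for the real row-aligned sense-specific embedding arrays (the variable names here are illustrative):

```python
import numpy as np

def row_cosine(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Row-wise cosine similarity between two row-aligned (n, d) arrays."""
    num = np.einsum('ij,ij->i', a, b)
    return num / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))

rng = np.random.default_rng(0)
w1_all = rng.normal(size=(4, 8))  # stand-in: definition allsense embeddings
w2_all = rng.normal(size=(4, 8))  # stand-in: answer allsense embeddings

cos_w1all_w2all = row_cosine(w1_all, w2_all)  # one similarity per row
```

Note that `sklearn.metrics.pairwise.cosine_similarity` returns a full n × n matrix, so a row-wise reduction like the `einsum` above (or taking the matrix diagonal) is what yields one value per (definition, answer) row.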

WordNet relationship features (22 features):

  • Synonym, hyponym, and hyponym_of_hypernym (co-hyponymy) are the most prevalent boolean types, consistent with how cryptic crossword definitions typically use "is-a" or sibling-category relationships.
  • wn_max_path_sim was the single most predictive feature in Hans's preliminary experiments; its distribution over the full dataset will show whether it remains dominant once all 47 features are available.

Surface features (4 features):

  • Edit distance captures the raw character-level difference between definition and answer strings. Since cryptic definitions are rarely orthographically similar to their answers, most values are moderate to high (mean 6.73 in this run, with a maximum of 26).
  • Length ratio, shared first letter, and character overlap ratio provide complementary views of surface-level similarity.
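The four surface features can be sketched in a few lines; the dict keys follow the column names in the statistics table, while the Levenshtein implementation here is a generic dynamic-programming version, not necessarily the one the notebook uses:

```python
def surface_features(definition: str, answer: str) -> dict:
    """Four surface features computed on lowercase strings."""
    d, a = definition.lower(), answer.lower()

    # Levenshtein edit distance via the classic two-row DP
    prev = list(range(len(a) + 1))
    for i, cd in enumerate(d, 1):
        cur = [i]
        for j, ca in enumerate(a, 1):
            cur.append(min(prev[j] + 1,            # deletion
                           cur[j - 1] + 1,         # insertion
                           prev[j - 1] + (cd != ca)))  # substitution
        prev = cur

    return {
        'surface_edit_distance': prev[-1],
        'surface_length_ratio': min(len(d), len(a)) / max(len(d), len(a)),
        'surface_shared_first_letter': float(d[:1] == a[:1]),
        'surface_char_overlap_ratio':
            len(set(d) & set(a)) / len(set(d) | set(a)),
    }
```

For example, `surface_features('dog', 'god')` gives edit distance 2, length ratio 1.0, no shared first letter, and full character overlap.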

Notable observations¶

  • Single-synset words: Words with only one usable WordNet synset (35% of definitions, 46% of answers) have identical allsense/common/obscure embeddings, causing within-word cosine features to be exactly 1.0 for those rows. def_num_usable_synsets and ans_num_usable_synsets are carried as metadata columns for downstream stratification (Decision 19).
  • No NaN values: All 47 features are valid numbers for all rows, satisfying Decision 3.
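Both observations are mechanical invariants that can be asserted directly on the feature table; a sketch against a synthetic two-row stand-in (the column names come from this notebook, the values are made up):

```python
import pandas as pd

# Synthetic stand-in for two rows of features_all
df = pd.DataFrame({
    'def_num_usable_synsets': [1, 3],
    'cos_w1all_w1common': [1.0, 0.81],
    'wn_max_path_sim': [0.25, 0.5],
})
feature_cols = ['cos_w1all_w1common', 'wn_max_path_sim']

# Decision 3: no NaN values in any feature column
assert not df[feature_cols].isna().any().any()

# Single-synset definitions collapse allsense/common/obscure to one
# embedding, so within-word cosines are exactly 1.0 for those rows
single = df['def_num_usable_synsets'] == 1
assert (df.loc[single, 'cos_w1all_w1common'] == 1.0).all()
```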

Output¶

  • File: data/features_all.parquet
  • Shape: 240,211 rows × 60 columns (13 metadata + 47 features)

FINDINGS.md update¶

Step 3 findings to record:

  • 47 features computed (15 + 6 + 22 + 4), one more than the originally planned 46 due to the addition of hypernym_of_hyponym as a 20th boolean relationship type.
  • Surface features are computed from the lowercase definition_wn/answer_wn strings; all 4 are NaN-free across 240,211 rows.
  • The complete feature table is saved as data/features_all.parquet with 13 metadata columns for downstream use.