Step 2: Embedding Generation with CALE¶

Primary author: Victoria

Builds on:

  • 02_embedding_generation.ipynb (Victoria, indicator_clustering — deduplicated embedding pattern, index file contract, verification approach)
  • Hans_Supervised_Learning.ipynb (Hans — embedding generation approach, updated from all-mpnet-base-v2 to CALE)
  • 00_model_comparison.ipynb (Victoria — CALE model validation, <t></t> delimiter mechanism, allsense-average approach)
  • CALE paper (Liétard & Loiseau, 2025 — Concept-Aligned Embeddings)

Prompt engineering: Victoria
AI assistance: Claude Code (Anthropic)
Environment: Great Lakes (GPU required) or Google Colab (GPU enabled)


This notebook generates 7 embedding types for all rows in clues_filtered.csv using CALE (Concept-Aligned Embeddings, ModernBERT-based, 1024-dim). Implements PLAN.md Step 2.

Input: data/clues_filtered.csv (241,397 rows from Step 1)
Output: 6 files in data/embeddings/:

File                         Shape             Description
definition_embeddings.npy    (N_def, 3, 1024)  Per unique definition: [allsense_avg, common, obscure]
definition_index.csv         N_def rows        Maps row position → definition string
answer_embeddings.npy        (N_ans, 3, 1024)  Per unique answer: [allsense_avg, common, obscure]
answer_index.csv             N_ans rows        Maps row position → answer string
clue_context_embeddings.npy  (N_rows, 1024)    Per clue row: word1_clue_context
clue_context_index.csv       N_rows rows       Maps row position → clue_id
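
These files obey a simple positional contract: row i of each .npy array corresponds to row i of its index CSV. A minimal sketch with synthetic stand-in data (not the real outputs) shows how a downstream consumer would look up one definition's embeddings:

```python
import numpy as np
import pandas as pd

# Synthetic stand-ins: a (N_def, 3, 1024) array and its index CSV, where
# row i of the array corresponds to row i of the index DataFrame.
words = ['aster', 'plant', 'shade']
emb = np.random.default_rng(42).normal(size=(len(words), 3, 1024))
index = pd.DataFrame({'definition': words})

# Look up the [allsense_avg, common, obscure] stack for one definition.
pos = index[index['definition'] == 'plant'].index[0]
allsense_avg, common, obscure = emb[pos]

assert emb.shape[0] == len(index)      # array and index file stay aligned
assert allsense_avg.shape == (1024,)
```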

Embedding Types¶

#  Name                Source              Deduplication               How constructed
1  word1_allsense      Definition          Per unique definition       Embed definition in each WordNet synset context with <t></t>, average all
2  word1_clue_context  Definition in clue  Per row (unique clue text)  Definition embedded within the surface sentence using <t></t> delimiters
3  word1_common        Definition          Per unique definition       Definition in most-common WordNet synset context with <t></t>
4  word1_obscure       Definition          Per unique definition       Definition in least-common WordNet synset context with <t></t>
5  word2_allsense      Answer              Per unique answer           Embed answer in each WordNet synset context with <t></t>, average all
6  word2_common        Answer              Per unique answer           Answer in most-common WordNet synset context with <t></t>
7  word2_obscure       Answer              Per unique answer           Answer in least-common WordNet synset context with <t></t>

Deduplication Strategy¶

Puzzle creators reuse (definition, answer) pairs across different clue sentences. Embeddings 1, 3–7 depend only on the definition or answer string, not the clue, so they are computed once per unique string and looked up via index files. Only embedding 2 (word1_clue_context) depends on the specific clue sentence and must be computed per row. This avoids redundant GPU computation and follows the pattern established in the indicator_clustering 02_embedding_generation.ipynb.
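
The lookup pattern can be sketched with fake embeddings (the real vectors come from CALE later): encode each unique string once, then map every clue row back to its vector by position.

```python
import numpy as np
import pandas as pd

# Toy sketch of the deduplication pattern. The emb array stands in for
# per-unique-definition CALE output; rows of df map to it by position.
df = pd.DataFrame({'definition': ['plant', 'shade', 'plant', 'plant']})
unique_defs = sorted(df['definition'].unique())          # ['plant', 'shade']
emb = np.arange(len(unique_defs) * 4, dtype=float).reshape(len(unique_defs), 4)

pos = {w: i for i, w in enumerate(unique_defs)}          # the index-file contract
row_emb = emb[df['definition'].map(pos).to_numpy()]      # one vector per clue row

assert row_emb.shape == (len(df), 4)
assert np.array_equal(row_emb[0], row_emb[2])            # duplicates share one vector
```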

Great Lakes Session Settings¶

  • Partition: gpu
  • GPUs: 1 (V100 or A40)
  • CPUs: 4
  • Memory: 32 GB
  • Wall time: 1 hour (embedding takes ~10–20 min; model download may take additional time on first run)

Running on Google Colab¶

If you are running this notebook on Google Colab after the course ends:

  1. Go to Runtime > Change runtime type
  2. Select a GPU accelerator:
    • T4 is available on the free tier and is sufficient for this notebook
    • A100 is available with Colab Pro and will be faster
  3. Click Save, then run all cells

This notebook embeds one synset-context phrase per (unique word, synset) pair for the deduplicated definition and answer strings (types 1, 3–7), plus one clue-context sentence for each of the ~241K rows (type 2). Estimated runtime is 10–20 minutes on a T4 GPU, faster on A40/V100.

Imports¶

In [ ]:
import os
import re
import time
import warnings

import numpy as np
import pandas as pd
import torch
from pathlib import Path
from sentence_transformers import SentenceTransformer

import nltk
from nltk.corpus import wordnet as wn

Environment Auto-Detection and Paths¶

In [ ]:
# --- Environment Auto-Detection ---
# Detect whether we are running on Google Colab, Great Lakes, or a local machine.
# This determines how we set up paths and batch sizes.
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues/clue_misdirection')
else:
    # Local or Great Lakes: notebook is in clue_misdirection/notebooks/,
    # so parent is the clue_misdirection project root.
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
EMBEDDINGS_DIR = DATA_DIR / 'embeddings'
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

# Batch size for embedding generation.
# Colab free-tier T4 GPUs have 16 GB VRAM — use a smaller batch to avoid OOM.
# Great Lakes V100/A40 and local GPUs with more VRAM can handle larger batches.
BATCH_SIZE = 32 if IS_COLAB else 64

np.random.seed(42)

# Download WordNet data — needed for synset lookups that drive all sense-based
# embeddings. omw-1.4 (Open Multilingual Wordnet) provides lemma names.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Print environment info for reproducibility
print(f'Environment: {"Google Colab" if IS_COLAB else "Local / Great Lakes"}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Embeddings directory: {EMBEDDINGS_DIR}')
print(f'Batch size: {BATCH_SIZE}')
print(f'GPU available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU device: {torch.cuda.get_device_name(0)}')

Load Filtered Clues¶

We load clues_filtered.csv from Step 1 (01_data_cleaning.ipynb). This dataset has 241,397 rows, each representing a (clue, definition, answer) triple that passed all quality filters: non-null fields, definition appears in the surface text, both definition and answer have at least one WordNet synset, and answer matches the clue's format specification.

We will extract unique definition and answer strings for deduplicated embedding (types 1, 3–7), while clue-context embeddings (type 2) are computed per row because the same definition word takes on different meanings in different clue sentences.

In [ ]:
input_file = DATA_DIR / 'clues_filtered.csv'
assert input_file.exists(), (
    f'Missing input file: {input_file}\n'
    f'Run 01_data_cleaning.ipynb first to produce this file.'
)

df = pd.read_csv(input_file)
print(f'Loaded {len(df):,} rows from clues_filtered.csv')
print(f'Columns: {list(df.columns)}')
print()
df.head()

Extract Unique Strings for Deduplication¶

Per Decision 11 (DECISIONS.md), we deduplicate before embedding to avoid redundant GPU computation:

  • Definition embeddings (allsense, common, obscure) are computed once per unique definition string. Many clue rows share the same definition word (e.g., "plant" appears in hundreds of clues), so deduplication dramatically reduces the number of strings to embed.

  • Answer embeddings (allsense, common, obscure) are computed once per unique answer string, for the same reason.

  • Clue-context embeddings are computed per row, because the same definition word means different things in different clue sentences. For example, "plant" in "Plant in a garden party" activates the botanical sense, while "plant" in "Plant evidence in the factory" activates the espionage/industrial senses.

We lowercase definitions and answers for WordNet lookup (WordNet's synsets() function expects lowercase input), but the original casing is preserved in the surface column for clue-context embeddings where we need to match the definition as it appears in the clue text.

In [ ]:
# Extract unique definition and answer strings, lowercased for WordNet lookup.
# Answers in clues_filtered.csv are ALL-CAPS (e.g., "ASTER"), so .lower() is
# essential for WordNet synset lookup (wn.synsets('aster') works, wn.synsets('ASTER')
# does not).
unique_definitions = sorted(df['definition'].str.lower().unique())
unique_answers = sorted(df['answer'].str.lower().unique())

print(f'Total rows: {len(df):,}')
print(f'Unique definitions: {len(unique_definitions):,}')
print(f'Unique answers: {len(unique_answers):,}')
print()

print('Example definitions (first 10):')
for d in unique_definitions[:10]:
    print(f'  "{d}"')
print()

print('Example answers (first 10):')
for a in unique_answers[:10]:
    print(f'  "{a}"')

Derive WordNet-Ready Strings¶

Step 1's data cleaning used article stripping during WordNet lookup: definitions like "a shade" were retried as "shade" when wn.synsets("a shade") returned no results but wn.synsets("shade") did. This heuristic recovered additional matches (see FINDINGS.md, Step 1 section). However, the definition column in clues_filtered.csv stores the original string "a shade" — because that's the substring that appears in the surface text and we need the original form for clue-context embedding (to locate the definition within the surface using insert_cale_delimiters).

This creates a mismatch: WordNet lookup needs the stripped version, but surface matching needs the original. We solve this by creating two new columns:

  • definition_wn — the article-stripped, lowercased definition string used for all WordNet synset lookups and as the key in phrase/index files.
  • answer_wn — same treatment for answers. Answers are ALL-CAPS in clues_filtered.csv (e.g., "ASTER"), so answer_wn is also lowercased for WordNet compatibility (wn.synsets('aster') works but wn.synsets('ASTER') does not).

The original definition and answer columns are preserved unchanged for surface matching in the clue-context embedding step.

Why not just lowercase? Simple lowercasing isn't enough. A definition like "a shade" has zero WordNet synsets — the leading article "a " must be stripped to find the synsets for "shade". The stripping is conditional: we strip the article only if the original string has no synsets but the stripped version does. This exactly mirrors what Step 1 did during filtering.

In [ ]:
def strip_leading_article(word):
    """Prepare a word for WordNet lookup, handling multi-word expressions and
    leading articles.

    WordNet uses underscores for multi-word entries (e.g., 'a_little', 'ice_cream').
    Step 1's data cleaning stored definitions/answers with spaces, so we need to
    try underscore variants. Step 1 also used article stripping ('a shade' -> 'shade')
    during WordNet filtering, so we mirror that heuristic here.

    Priority order (return the first that has synsets):
    1. word as-is (lowercased)
    2. word with spaces replaced by underscores (multi-word WordNet entries)
    3. If word starts with 'a ': stripped version (without 'a ')
    4. If word starts with 'a ': stripped version with underscores
    5. If nothing works: return lowercased original (will get zero synsets)
    """
    word_lower = word.lower()

    # Priority 1: word as-is (handles single words and multi-word phrases that
    # WordNet happens to accept with spaces — rare, but possible)
    if wn.synsets(word_lower):
        return word_lower

    # Priority 2: multi-word with underscores (e.g., "ice cream" -> "ice_cream",
    # "a little" -> "a_little"). WordNet stores multi-word entries this way.
    underscore = word_lower.replace(' ', '_')
    if underscore != word_lower and wn.synsets(underscore):
        return underscore

    # Priority 3: article-stripped single word (e.g., "a shade" -> "shade")
    if word_lower.startswith('a '):
        stripped = word_lower[2:]
        if wn.synsets(stripped):
            return stripped

        # Priority 4: article-stripped multi-word with underscores
        # (e.g., "a far cry" -> "far_cry" if that were a WordNet entry)
        stripped_underscore = stripped.replace(' ', '_')
        if stripped_underscore != stripped and wn.synsets(stripped_underscore):
            return stripped_underscore

    # Priority 5: fallback — return lowercased original. This will produce zero
    # synsets downstream, which build_all_phrases handles with a warning and a
    # bare-word fallback embedding.
    return word_lower


# Apply to create WordNet-ready columns. The original definition/answer columns
# are preserved for surface matching in the clue-context embedding step.
df['definition_wn'] = df['definition'].apply(strip_leading_article)
df['answer_wn'] = df['answer'].apply(strip_leading_article)

# --- Stats ---

# Classify each transformation type for definitions
def_lower = df['definition'].str.lower()
def_changed_mask = df['definition_wn'] != def_lower  # any change beyond lowercasing
def_has_underscore = df['definition_wn'].str.contains('_', regex=False)

# Article stripping: the wn form changed, the lowercased original started
# with 'a ', and the wn form is not simply the underscore variant of the
# original (which would mean priority 2 matched, e.g. "a little" ->
# "a_little" — that is a multi-word lookup, not article stripping).
def_underscored = def_lower.str.replace(' ', '_', regex=False)
def_article_mask = (def_changed_mask
                    & def_lower.str.startswith('a ')
                    & (df['definition_wn'] != def_underscored))
# Of those, which ended up as underscore multi-word vs. single word?
def_stripped_to_underscore = def_article_mask & def_has_underscore
def_stripped_to_single = def_article_mask & ~def_has_underscore

# Underscore multi-word without article stripping (priority 2): the wn form
# gained an underscore but was not produced by article stripping.
def_underscore_no_strip = def_changed_mask & def_has_underscore & ~def_article_mask
# Kept multi-word form via underscore: original had a space, was not
# article-stripped, and the result has an underscore.
def_kept_multiword = ~def_article_mask & def_has_underscore

print(f'=== Definition transformations ({len(df):,} rows) ===')
print(f'  Unchanged (lowercase only):         {(~def_changed_mask).sum():,}')
print(f'  Underscore multi-word (no strip):   {def_underscore_no_strip.sum():,}')
print(f'  Article stripped -> single word:     {def_stripped_to_single.sum():,}')
print(f'  Article stripped -> underscore:      {def_stripped_to_underscore.sum():,}')
print(f'  Total with underscore in wn form:   {def_has_underscore.sum():,}')

# Same classification for answers
ans_lower = df['answer'].str.lower()
ans_changed_mask = df['answer_wn'] != ans_lower
ans_has_underscore = df['answer_wn'].str.contains('_', regex=False)

# Article stripping for answers: exclude cases where the wn form is just the
# underscore variant of the original (priority 2), which is a multi-word
# lookup rather than article stripping.
ans_underscored = ans_lower.str.replace(' ', '_', regex=False)
ans_article_mask = (ans_changed_mask
                    & ans_lower.str.startswith('a ')
                    & (df['answer_wn'] != ans_underscored))
ans_stripped_to_underscore = ans_article_mask & ans_has_underscore
ans_stripped_to_single = ans_article_mask & ~ans_has_underscore

ans_underscore_no_strip = ans_changed_mask & ans_has_underscore & ~ans_article_mask
ans_kept_multiword = ~ans_article_mask & ans_has_underscore

print(f'\n=== Answer transformations ({len(df):,} rows) ===')
print(f'  Unchanged (lowercase only):         {(~ans_changed_mask).sum():,}')
print(f'  Underscore multi-word (no strip):   {ans_underscore_no_strip.sum():,}')
print(f'  Article stripped -> single word:     {ans_stripped_to_single.sum():,}')
print(f'  Article stripped -> underscore:      {ans_stripped_to_underscore.sum():,}')
print(f'  Total with underscore in wn form:   {ans_has_underscore.sum():,}')

# Update unique string lists to use the WordNet-ready versions.
# Some formerly-distinct definitions (e.g., "a shade" and "shade") may now
# merge, reducing the unique count.
unique_definitions_wn = sorted(df['definition_wn'].unique())
unique_answers_wn = sorted(df['answer_wn'].unique())

print(f'\n=== Unique count changes ===')
print(f'Unique definitions (original lowercase): {len(unique_definitions):,}')
print(f'Unique definitions (after WN prep):      {len(unique_definitions_wn):,}')
print(f'  -> {len(unique_definitions) - len(unique_definitions_wn):,} merged')

print(f'\nUnique answers (original lowercase): {len(unique_answers):,}')
print(f'Unique answers (after WN prep):      {len(unique_answers_wn):,}')
print(f'  -> {len(unique_answers) - len(unique_answers_wn):,} merged')

# Show examples of definitions that kept multi-word form (underscore, no strip)
mw_defs = df.loc[def_kept_multiword, ['definition', 'definition_wn']].drop_duplicates()
if len(mw_defs) > 0:
    print(f'\nExample definitions kept as multi-word (underscore) ({len(mw_defs):,} unique):')
    for _, row in mw_defs.head(10).iterrows():
        print(f'  "{row["definition"]}" -> "{row["definition_wn"]}"')

mw_ans = df.loc[ans_kept_multiword, ['answer', 'answer_wn']].drop_duplicates()
if len(mw_ans) > 0:
    print(f'\nExample answers kept as multi-word (underscore) ({len(mw_ans):,} unique):')
    for _, row in mw_ans.head(10).iterrows():
        print(f'  "{row["answer"]}" -> "{row["answer_wn"]}"')

# Show examples of article-stripped definitions
stripped_defs = df.loc[def_article_mask, ['definition', 'definition_wn']].drop_duplicates()
if len(stripped_defs) > 0:
    print(f'\nExample article-stripped definitions ({len(stripped_defs):,} unique):')
    for _, row in stripped_defs.head(10).iterrows():
        print(f'  "{row["definition"]}" -> "{row["definition_wn"]}"')

stripped_ans = df.loc[ans_article_mask, ['answer', 'answer_wn']].drop_duplicates()
if len(stripped_ans) > 0:
    print(f'\nExample article-stripped answers ({len(stripped_ans):,} unique):')
    for _, row in stripped_ans.head(10).iterrows():
        print(f'  "{row["answer"]}" -> "{row["answer_wn"]}"')

WordNet Phrase Construction¶

CALE (Concept-Aligned Embeddings) was trained on sentences where one target word is wrapped in <t></t> delimiters — the model learned to produce an embedding focused on that specific word's meaning in that specific context. Feeding CALE a bare word with no context (e.g., model.encode("plant")) is outside its training distribution and produces unreliable embeddings that fail to discriminate between related and unrelated words (see 00_model_comparison.ipynb Section 2 for quantitative evidence).

The allsense-average approach (Decision 14, DECISIONS.md) solves this by grounding every embedding in a WordNet synset context:

  1. For each unique definition_wn or answer_wn string, look up all WordNet synsets (senses). WordNet orders synsets by estimated frequency of use — synsets()[0] is the most common sense and synsets()[-1] is the least common.
  2. For each synset, construct a short context sentence from the synset's usage examples and/or definition text, with <t></t> delimiters around the target word.
  3. Later (in the GPU section), CALE embeds each context sentence, producing one 1024-dim embedding per synset per word.
  4. From these per-synset embeddings we derive three representations:
    • allsense — average across all synset embeddings, giving a rich, sense-grounded "average" representation
    • common — embedding from the first synset only (most frequent sense)
    • obscure — embedding from the last synset only (least frequent sense)
    • For words with a single synset, all three are identical.

IMPORTANT — single-occurrence matching: CALE expects exactly one <t></t> pair and performs best when the target word appears only once in the input. If the word appears multiple times in the context text (e.g., "plant" appears twice in a definition), CALE may not know which instance to focus on. Our phrase construction enforces single-occurrence matching: we only use a context text if the target word appears in it exactly once. If it appears 0 or 2+ times, we try alternative sources or fall back to a safe format. This is stricter than the approach in 00_model_comparison.ipynb, which used re.escape without occurrence counting — here we add explicit occurrence validation to ensure clean inputs at scale.

This section constructs the phrases only — no model loading or GPU work happens here. The output is a DataFrame of (word, synset, phrase) rows that the GPU embedding step will consume. Separating the CPU-intensive WordNet lookup from GPU-intensive encoding keeps the workflow modular and restartable.
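
Step 4's aggregation can be sketched with tiny synthetic vectors (standing in for CALE's 1024-dim per-synset embeddings; the real computation happens in the GPU section):

```python
import numpy as np

# Rows are per-synset embeddings for one word, ordered as WordNet orders
# synsets: index 0 = most common sense, index -1 = least common.
per_synset = np.array([[1.0, 0.0],
                       [0.0, 1.0],
                       [1.0, 1.0]])

allsense = per_synset.mean(axis=0)   # average over all senses
common = per_synset[0]               # most frequent sense only
obscure = per_synset[-1]             # least frequent sense only

# Single-synset word: all three representations coincide.
single = np.array([[0.5, 0.5]])
assert np.array_equal(single.mean(axis=0), single[0])
assert np.array_equal(single[0], single[-1])
```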

In [ ]:
def build_synset_context(word, synset):
    """Construct a context sentence with <t></t> delimiters for CALE.

    Uses a priority cascade that ensures the target word appears exactly once
    in the output, which is what CALE's delimiter mechanism requires.

    Priority order:
    1. Usage example — if the word appears exactly once, wrap it with <t></t>.
       Preferred because examples are natural sentences, closer to CALE's
       training distribution than definition fragments.
    2. Definition text — if the word appears exactly once, wrap it with <t></t>.
    3. Fallback A — if the word does NOT appear in the definition text at all,
       use "<t>word</t>: definition_text". Safe because the word only appears
       once (inside delimiters).
    4. Unresolvable — if no usage example contains the word exactly once and
       the word appears 2+ times in the definition text (making fallback A
       unsafe), return None. These cases will be assessed and handled
       separately.

    Args:
        word: The target word (lowercased, article-stripped via definition_wn/answer_wn)
        synset: A WordNet Synset object

    Returns:
        tuple: (phrase_string_or_None, path_label)
        where path_label is one of: 'example', 'definition', 'fallback_a', 'unresolvable'
    """
    definition_text = synset.definition()
    examples = synset.examples()

    # Word boundary pattern prevents partial matches: \bplant\b matches "plant"
    # but not "planted", "transplant", or "implant". This is critical at scale
    # where partial matches would silently produce incorrect <t></t> placements.
    pattern = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)

    # Priority 1: Usage example — natural sentences are closest to CALE's
    # training distribution. Check each example for exactly 1 occurrence.
    for example in examples:
        if len(pattern.findall(example)) == 1:
            phrase = pattern.sub(lambda m: f'<t>{m.group()}</t>', example, count=1)
            return (phrase, 'example')

    # Priority 2: Definition text — if the word appears exactly once.
    def_count = len(pattern.findall(definition_text))
    if def_count == 1:
        phrase = pattern.sub(lambda m: f'<t>{m.group()}</t>', definition_text, count=1)
        return (phrase, 'definition')

    # Priority 3: Fallback A — word does NOT appear in definition at all.
    # Safe to prepend "<t>word</t>:" because the word appears only inside delimiters.
    if def_count == 0:
        return (f'<t>{word}</t>: {definition_text}', 'fallback_a')

    # Priority 4: Unresolvable — word appears 2+ times in definition, and no
    # example had exactly 1 occurrence. We cannot safely place <t></t> without
    # ambiguity.
    return (None, 'unresolvable')


# --- Test cases ---
# These tests verify the priority cascade using real WordNet entries.

# Test 1: word found in example (priority 1)
# plant.n.01 has the example "they built a large plant to manufacture automobiles"
# which contains "plant" exactly once. The definition "buildings for carrying on
# industrial labor" does NOT contain "plant", so the example takes priority.
ss_example = wn.synsets('plant')[0]
result_example, path_example = build_synset_context('plant', ss_example)
print('Test 1 — word found in example (priority 1):')
print(f'  Synset:     {ss_example.name()}')
print(f'  Definition: "{ss_example.definition()}"')
print(f'  Examples:   {ss_example.examples()}')
print(f'  Result:     "{result_example}"')
print(f'  Path:       {path_example}')
assert path_example == 'example'
assert '<t>' in result_example and '</t>' in result_example
print()

# Test 2: word found in definition but not example
# aster.n.01 definition is "any of various chiefly fall-blooming herbs of the
# genus Aster with showy daisylike flowers" — contains "Aster" exactly once.
# Its examples (if any) do not contain "aster".
ss_def = wn.synsets('aster')[0]
result_def, path_def = build_synset_context('aster', ss_def)
print('Test 2 — word found in definition (priority 2):')
print(f'  Synset:     {ss_def.name()}')
print(f'  Definition: "{ss_def.definition()}"')
print(f'  Examples:   {ss_def.examples()}')
print(f'  Result:     "{result_def}"')
print(f'  Path:       {path_def}')
assert '<t>' in result_def and '</t>' in result_def
print()

# Test 3: fallback A — word not in definition at all
# plant.n.02 definition is "(botany) a living organism lacking the power of
# locomotion" — "plant" does not appear. No examples contain "plant" either.
# Fallback A prepends "<t>plant</t>:" before the definition.
ss_fallback = wn.synsets('plant')[1]
result_fallback, path_fallback = build_synset_context('plant', ss_fallback)
print('Test 3 — fallback A (word not in definition):')
print(f'  Synset:     {ss_fallback.name()}')
print(f'  Definition: "{ss_fallback.definition()}"')
print(f'  Examples:   {ss_fallback.examples()}')
print(f'  Result:     "{result_fallback}"')
print(f'  Path:       {path_fallback}')
assert path_fallback == 'fallback_a'
assert result_fallback.startswith('<t>plant</t>:')
print()

# Test 4: unresolvable — word appears 2+ times in definition, no clean example.
# These are rare. We search for a real case to demonstrate.
print('Test 4 — searching for an unresolvable case...')
unresolvable_found = False
for test_word in ['set', 'run', 'go', 'line', 'point', 'right', 'well', 'back']:
    for ss in wn.synsets(test_word):
        _, path = build_synset_context(test_word, ss)
        if path == 'unresolvable':
            print(f'  Found unresolvable: word="{test_word}", synset={ss.name()}')
            print(f'  Definition: "{ss.definition()}"')
            print(f'  Examples: {ss.examples()}')
            unresolvable_found = True
            break
    if unresolvable_found:
        break

if not unresolvable_found:
    print('  No unresolvable cases found among tested words.')
    print('  (This is expected — unresolvable cases require the word to appear')
    print('   2+ times in the definition with no clean example, which is rare.)')
print()

print('All build_synset_context tests passed.')
In [ ]:
from tqdm.auto import tqdm


def build_all_phrases(unique_words, label="words"):
    """Build CALE context phrases for all synsets of each unique word.

    For each word:
    1. Look up all WordNet synsets
    2. For each synset, call build_synset_context to get a (phrase, path) tuple
    3. Record the word, synset info, phrase, and which path was used

    Args:
        unique_words: list of unique lowercased, article-stripped strings
        label: string for progress bar description (e.g., "definitions", "answers")

    Returns:
        DataFrame with columns: [word, synset_name, synset_idx, num_synsets,
        is_common, is_obscure, phrase, path]
    """
    rows = []
    no_synset_words = []

    for word in tqdm(unique_words, desc=f'Building {label} phrases'):
        # WordNet uses underscores for multi-word entries (e.g., 'ice cream'
        # -> 'ice_cream'). strip_leading_article already returned the variant
        # that had synsets, so try the word as-is first, then fall back to the
        # underscore form for any multi-word strings that slipped through.
        synsets = wn.synsets(word) or wn.synsets(word.replace(' ', '_'))

        if not synsets:
            # This should not happen — Step 1 filtered to words with >= 1
            # WordNet synset, and we applied article stripping. If it does
            # occur, record a warning and use the bare-word fallback so the
            # pipeline doesn't crash.
            no_synset_words.append(word)
            rows.append({
                'word': word,
                'synset_name': 'NONE',
                'synset_idx': 0,
                'num_synsets': 0,
                'is_common': True,
                'is_obscure': True,
                'phrase': f'<t>{word}</t>',
                'path': 'no_synsets',
            })
            continue

        num_synsets = len(synsets)
        for idx, ss in enumerate(synsets):
            phrase, path = build_synset_context(word, ss)
            rows.append({
                'word': word,
                'synset_name': ss.name(),
                'synset_idx': idx,
                'num_synsets': num_synsets,
                # WordNet orders synsets by estimated frequency: index 0 is
                # the most common sense, and the last index is the least
                # common (most obscure). For single-synset words, both
                # is_common and is_obscure are True.
                'is_common': idx == 0,
                'is_obscure': idx == num_synsets - 1,
                'phrase': phrase,
                'path': path,
            })

    if no_synset_words:
        warnings.warn(
            f'{len(no_synset_words)} words had zero WordNet synsets '
            f'(unexpected given Step 1 filter + article stripping). '
            f'First 5 examples: {no_synset_words[:5]}'
        )

    return pd.DataFrame(rows)


# Build phrases for all unique definitions
print('Building phrases for unique definitions...\n')
def_phrases_df = build_all_phrases(unique_definitions_wn, label="definitions")

# Check for the "nan" word issue: "nan" is a valid crossword definition
# meaning grandmother, but pandas interprets the string "nan" as NaN when
# reading/writing CSVs. Detect this early so downstream CSV round-trips
# don't silently corrupt the data.
nan_word_count = def_phrases_df['word'].isna().sum()
if nan_word_count > 0:
    print(f'\n⚠ WARNING: {nan_word_count} rows have NaN in the "word" column.')
    print(f'  This is likely the word "nan" (grandmother) being interpreted as')
    print(f'  NaN by pandas. Use keep_default_na=False when reading CSVs that')
    print(f'  contain this column.')

# --- Comprehensive Definition Phrase Stats ---
print(f'\n{"=" * 60}')
print(f'DEFINITION PHRASE SUMMARY')
print(f'{"=" * 60}')
print(f'Unique definitions processed: {len(unique_definitions_wn):,}')
print(f'Total phrases generated:      {len(def_phrases_df):,}')

avg_synsets = def_phrases_df.groupby('word')['synset_name'].count().mean()
print(f'Average synsets per definition: {avg_synsets:.1f}')

single_synset = (def_phrases_df.groupby('word')['num_synsets'].first() == 1).sum()
print(f'Definitions with only 1 synset: {single_synset:,} '
      f'({single_synset / len(unique_definitions_wn):.1%})')

# Path distribution — how each phrase was constructed
print(f'\nPath distribution:')
path_counts = def_phrases_df['path'].value_counts()
for path_name, count in path_counts.items():
    pct = count / len(def_phrases_df)
    print(f'  {path_name:15s}: {count:>7,} ({pct:.1%})')

# Zero-synset words
n_zero = (def_phrases_df['synset_name'] == 'NONE').sum()
print(f'\nZero-synset definitions: {n_zero} (should be 0)')

# Show a few examples from each path
print(f'\nExample phrases by path:')
for path_name in ['example', 'definition', 'fallback_a', 'unresolvable']:
    subset = def_phrases_df[def_phrases_df['path'] == path_name]
    if len(subset) > 0:
        print(f'\n  --- {path_name} ({len(subset):,} phrases) ---')
        for _, row in subset.head(3).iterrows():
            print(f'  word="{row["word"]}", synset={row["synset_name"]}')
            if pd.isna(row['phrase']):
                # Unresolvable: phrase is None/NaN — show the synset details
                # so the reader can see why it failed (word appears 2+ times).
                ss = wn.synset(row['synset_name'])
                print(f'    phrase: None (unresolvable)')
                print(f'    definition: "{ss.definition()[:80]}"')
                if ss.examples():
                    print(f'    examples: {[e[:60] for e in ss.examples()[:2]]}')
            else:
                phrase_str = str(row['phrase'])
                print(f'    phrase: "{phrase_str[:80]}..."' if len(phrase_str) > 80
                      else f'    phrase: "{phrase_str}"')
In [ ]:
# Build phrases for all unique answers
print('Building phrases for unique answers...\n')
ans_phrases_df = build_all_phrases(unique_answers_wn, label="answers")

# Check for the "nan" word issue: "nan" is a valid crossword answer
# meaning grandmother, but pandas interprets the string "nan" as NaN when
# reading/writing CSVs. Detect this early so downstream CSV round-trips
# don't silently corrupt the data.
nan_word_count = ans_phrases_df['word'].isna().sum()
if nan_word_count > 0:
    print(f'\n⚠ WARNING: {nan_word_count} rows have NaN in the "word" column.')
    print(f'  This is likely the word "nan" (grandmother) being interpreted as')
    print(f'  NaN by pandas. Use keep_default_na=False when reading CSVs that')
    print(f'  contain this column.')

# --- Comprehensive Answer Phrase Stats ---
print(f'\n{"=" * 60}')
print(f'ANSWER PHRASE SUMMARY')
print(f'{"=" * 60}')
print(f'Unique answers processed: {len(unique_answers_wn):,}')
print(f'Total phrases generated:  {len(ans_phrases_df):,}')

avg_synsets = ans_phrases_df.groupby('word')['synset_name'].count().mean()
print(f'Average synsets per answer: {avg_synsets:.1f}')

single_synset = (ans_phrases_df.groupby('word')['num_synsets'].first() == 1).sum()
print(f'Answers with only 1 synset: {single_synset:,} '
      f'({single_synset / len(unique_answers_wn):.1%})')

# Path distribution
print(f'\nPath distribution:')
path_counts = ans_phrases_df['path'].value_counts()
for path_name, count in path_counts.items():
    pct = count / len(ans_phrases_df)
    print(f'  {path_name:15s}: {count:>7,} ({pct:.1%})')

# Zero-synset words
n_zero = (ans_phrases_df['synset_name'] == 'NONE').sum()
print(f'\nZero-synset answers: {n_zero} (should be 0)')

# Show a few examples from each path
print(f'\nExample phrases by path:')
for path_name in ['example', 'definition', 'fallback_a', 'unresolvable']:
    subset = ans_phrases_df[ans_phrases_df['path'] == path_name]
    if len(subset) > 0:
        print(f'\n  --- {path_name} ({len(subset):,} phrases) ---')
        for _, row in subset.head(3).iterrows():
            print(f'  word="{row["word"]}", synset={row["synset_name"]}')
            if pd.isna(row['phrase']):
                # Unresolvable: phrase is None/NaN — show the synset details
                # so the reader can see why it failed (word appears 2+ times).
                ss = wn.synset(row['synset_name'])
                print(f'    phrase: None (unresolvable)')
                print(f'    definition: "{ss.definition()[:80]}"')
                if ss.examples():
                    print(f'    examples: {[e[:60] for e in ss.examples()[:2]]}')
            else:
                phrase_str = str(row['phrase'])
                suffix = '...' if len(phrase_str) > 80 else ''
                print(f'    phrase: "{phrase_str[:80]}{suffix}"')
In [ ]:
# --- Assess Unresolvable Phrases ---
# Unresolvable phrases are cases where the target word appears 2+ times in the
# synset definition, no example has exactly 1 occurrence, and we cannot safely
# place <t></t> delimiters. This diagnostic cell quantifies the impact.

unresolvable_defs = def_phrases_df[def_phrases_df['path'] == 'unresolvable']
unresolvable_ans = ans_phrases_df[ans_phrases_df['path'] == 'unresolvable']

n_unresolvable_phrases = len(unresolvable_defs) + len(unresolvable_ans)
n_unresolvable_def_words = unresolvable_defs['word'].nunique()
n_unresolvable_ans_words = unresolvable_ans['word'].nunique()
n_unresolvable_words = n_unresolvable_def_words + n_unresolvable_ans_words

print(f'{"=" * 60}')
print(f'UNRESOLVABLE PHRASE ASSESSMENT')
print(f'{"=" * 60}')
print(f'Total unresolvable phrases:    {n_unresolvable_phrases}')
print(f'  From definitions:            {len(unresolvable_defs)}')
print(f'  From answers:                {len(unresolvable_ans)}')
print(f'Unique words affected:         {n_unresolvable_words}')
print(f'  Definition words:            {n_unresolvable_def_words}')
print(f'  Answer words:                {n_unresolvable_ans_words}')

# For each unresolvable word, check: how many of its synsets are unresolvable
# vs. resolvable? A word with 10 synsets where 1 is unresolvable still has 9
# usable synsets for allsense averaging — we just skip the unresolvable one.
print(f'\n--- Impact per word ---')

for label, phrases_df, side in [('definition', def_phrases_df, 'definition_wn'),
                                 ('answer', ans_phrases_df, 'answer_wn')]:
    unresolvable_subset = phrases_df[phrases_df['path'] == 'unresolvable']
    if len(unresolvable_subset) == 0:
        print(f'\n  {label.title()}s: No unresolvable phrases.')
        continue

    affected_words = unresolvable_subset['word'].unique()

    print(f'\n  {label.title()}s with unresolvable synsets:')
    for word in affected_words[:15]:  # Show up to 15 examples
        word_rows = phrases_df[phrases_df['word'] == word]
        n_total_synsets = len(word_rows)
        n_unresolvable = (word_rows['path'] == 'unresolvable').sum()
        n_resolvable = n_total_synsets - n_unresolvable

        print(f'    "{word}": {n_unresolvable}/{n_total_synsets} synsets unresolvable '
              f'({n_resolvable} usable)')

        # Show details for each unresolvable synset
        for _, row in word_rows[word_rows['path'] == 'unresolvable'].head(2).iterrows():
            ss = wn.synset(row['synset_name'])
            print(f'      synset: {row["synset_name"]}')
            print(f'      definition: "{ss.definition()[:80]}"')
            if ss.examples():
                print(f'      examples: {[e[:60] for e in ss.examples()[:2]]}')

    if len(affected_words) > 15:
        print(f'    ... and {len(affected_words) - 15} more words')

    # Count words where ALL synsets are unresolvable
    all_unresolvable = []
    for word in affected_words:
        word_rows = phrases_df[phrases_df['word'] == word]
        if (word_rows['path'] == 'unresolvable').all():
            all_unresolvable.append(word)

    if all_unresolvable:
        # Count how many rows in clues_filtered.csv would be affected
        affected_rows = df[df[side].isin(all_unresolvable)]
        print(f'\n  WARNING: {len(all_unresolvable)} {label}(s) have ALL synsets '
              f'unresolvable: {all_unresolvable[:10]}')
        print(f'  These affect {len(affected_rows):,} rows in clues_filtered.csv')
    else:
        print(f'\n  No {label}s have ALL synsets unresolvable (all words have at '
              f'least 1 usable synset).')

# --- Summary ---
all_unresolvable_def_words = [
    w for w in unresolvable_defs['word'].unique()
    if (def_phrases_df[def_phrases_df['word'] == w]['path'] == 'unresolvable').all()
]
all_unresolvable_ans_words = [
    w for w in unresolvable_ans['word'].unique()
    if (ans_phrases_df[ans_phrases_df['word'] == w]['path'] == 'unresolvable').all()
]
n_all_unresolvable = len(all_unresolvable_def_words) + len(all_unresolvable_ans_words)

affected_def_rows = (df['definition_wn'].isin(all_unresolvable_def_words).sum()
                     if all_unresolvable_def_words else 0)
affected_ans_rows = (df['answer_wn'].isin(all_unresolvable_ans_words).sum()
                     if all_unresolvable_ans_words else 0)

print(f'\n{"=" * 60}')
print(f'SUMMARY')
print(f'{"=" * 60}')
print(f'{n_unresolvable_words} words have at least one unresolvable synset '
      f'({n_unresolvable_phrases} total unresolvable phrases).')
print(f'{n_all_unresolvable} words have ALL synsets unresolvable, '
      f'affecting {affected_def_rows + affected_ans_rows:,} rows in clues_filtered.csv.')
if n_all_unresolvable == 0:
    print('No data loss from unresolvable phrases — all words have at least '
          'one usable synset for embedding.')
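The resolvability rule described above (target word appearing 2+ times means delimiters cannot be placed safely) can be sketched with plain regex counting. The helper name is illustrative; the notebook's own phrase-building code does the real work:

```python
import re


def count_whole_word(word, text):
    """Whole-word, case-insensitive occurrence count — the criterion used
    to decide whether <t></t> delimiters can be placed unambiguously."""
    return len(re.findall(r'\b' + re.escape(word) + r'\b', text, re.IGNORECASE))


# Target word appears twice in the definition -> cannot delimit safely.
definition = 'a dog that is a dog through and through'
print(count_whole_word('dog', definition))   # 2

# An example sentence with exactly one occurrence is a usable fallback.
example = 'She walked her dog every morning'
print(count_whole_word('dog', example))      # 1

# Word boundaries prevent partial matches inside longer words.
print(count_whole_word('dog', 'hotdog stand'))   # 0
```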

Prepare Clue-Context Phrases¶

The 7th embedding type — word1_clue_context — captures how the clue's surface reading shifts the perceived meaning of the definition word. Unlike the synset-based embeddings above (which are computed once per unique word), clue-context embeddings are computed per row because the same definition word appears in different clue sentences that activate different senses.

For example, the definition "plant" embedded in the surface "Plant in a garden party" will produce a different CALE embedding than "plant" in "Plant evidence in the factory", because the surrounding words bias the model toward different senses. This contextual shift is exactly what our misdirection analysis aims to measure — the clue's surface reading pulls the definition embedding away from the answer's meaning.

Construction: We insert <t></t> delimiters around the definition substring within the surface text, then (in the GPU section) encode the delimited surface with CALE. This is the same mechanism used for synset phrases above — CALE focuses its embedding on the delimited word while incorporating the sentence context. Here the context is the actual clue sentence rather than a WordNet definition.

Important distinction from build_synset_context: We use the ORIGINAL definition column (not definition_wn) here, because we need to match the definition as it appears in the surface text. For example, if the surface is "A shade of colour from Tuscany", the definition is "a shade" — and we need to find "a shade" (or "A shade") in the surface, not the article-stripped "shade". The insert_cale_delimiters function uses re.escape + re.IGNORECASE (no word boundaries) because we are matching the full definition substring, not a single word within a longer text.
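The matching behavior described above can be illustrated with a self-contained sketch that mirrors the logic of insert_cale_delimiters (defined in the next cell); the helper name here is illustrative:

```python
import re


def delimit(surface, definition):
    # Full-substring, case-insensitive match — no \b word boundaries, so
    # multi-word definitions like "a shade" are matched as a whole phrase.
    m = re.search(re.escape(definition), surface, re.IGNORECASE)
    if m:
        return (surface[:m.start()] + '<t>' + surface[m.start():m.end()]
                + '</t>' + surface[m.end():])
    # Fallback: prepend the delimited definition.
    return f'<t>{definition}</t> {surface}'


print(delimit('A shade of colour from Tuscany', 'a shade'))
# -> <t>A shade</t> of colour from Tuscany
```

Note that the original casing of the surface is preserved inside the delimiters; only the search is case-insensitive.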

In [ ]:
# Adapted from 00_model_comparison.ipynb (Victoria).
# Uses re.escape + re.IGNORECASE without word boundaries because we are
# matching the full definition substring within the surface text — the
# definition was already verified to appear in the surface during Step 1
# cleaning, so partial match risk is minimal. This is different from
# build_synset_context, which uses \b word boundaries to match a single
# word inside longer text.

def insert_cale_delimiters(surface, definition):
    """Insert <t></t> delimiters around the definition in the surface text.
    Adapted from 00_model_comparison.ipynb (Victoria).
    Uses case-insensitive matching. No word boundaries needed since we match
    the full definition phrase within the surface.
    """
    pattern = re.compile(re.escape(definition), re.IGNORECASE)
    match = pattern.search(surface)
    if match:
        start, end = match.start(), match.end()
        return surface[:start] + '<t>' + surface[start:end] + '</t>' + surface[end:]
    # Fallback: prepend the delimited definition. This should be rare — Step 1
    # verified that the definition appears in the surface text.
    return f'<t>{definition}</t> {surface}'


# =======================================================================
# Cleanup and Quality Tracking
# =======================================================================
# Before saving, we clean up two categories of problematic data:
#
# 1. UNRESOLVABLE SYNSET PHRASES — where the target word appears 2+ times
#    in the definition text and no example has exactly 1 occurrence, so we
#    cannot safely place <t></t> delimiters. These are removed from the
#    phrase DataFrames. Words that lose ALL synsets this way ("fully
#    unresolvable") cause their clue rows to be dropped entirely.
#
# 2. MULTI-OCCURRENCE CLUE CONTEXT — where the definition substring appears
#    2+ times in the surface text (e.g., definition="right" in surface
#    "Right to right a wrong"). insert_cale_delimiters only wraps the first
#    occurrence, but CALE may attend to the second unwrapped occurrence too,
#    producing an ambiguous embedding. Safer to drop these rows.

# --- 1. Filter unresolvable phrases from synset DataFrames ---
n_unresolvable_def = (def_phrases_df['path'] == 'unresolvable').sum()
n_unresolvable_ans = (ans_phrases_df['path'] == 'unresolvable').sum()

def_phrases_df = def_phrases_df[def_phrases_df['path'] != 'unresolvable'].copy()
ans_phrases_df = ans_phrases_df[ans_phrases_df['path'] != 'unresolvable'].copy()

# --- 1b. Recalculate is_common and is_obscure after removing unresolvable phrases ---
# The original is_common/is_obscure flags were assigned based on synset_idx
# relative to the full set of WordNet synsets (is_common = synset_idx == 0,
# is_obscure = synset_idx == num_synsets - 1). If the first or last synset
# was unresolvable, the word now has no row with is_common=True or
# is_obscure=True. Fix: within each word's remaining synsets, the one with
# the lowest synset_idx is the new "common" and the one with the highest
# synset_idx is the new "obscure".
for phrases_df in [def_phrases_df, ans_phrases_df]:
    phrases_df['is_common'] = False
    phrases_df['is_obscure'] = False
    # synset_idx is unique within each word, so idxmin/idxmax select exactly
    # one row per word: lowest remaining index -> common, highest -> obscure.
    phrases_df.loc[phrases_df.groupby('word')['synset_idx'].idxmin(), 'is_common'] = True
    phrases_df.loc[phrases_df.groupby('word')['synset_idx'].idxmax(), 'is_obscure'] = True

# Verify: every word must have exactly one is_common=True and one
# is_obscure=True row after recalculation.
for phrases_df, label in [(def_phrases_df, 'definition'), (ans_phrases_df, 'answer')]:
    common_counts = phrases_df.groupby('word')['is_common'].sum()
    obscure_counts = phrases_df.groupby('word')['is_obscure'].sum()
    assert (common_counts == 1).all(), (
        f'{label}: some words have != 1 is_common row: '
        f'{common_counts[common_counts != 1].head().to_dict()}'
    )
    assert (obscure_counts == 1).all(), (
        f'{label}: some words have != 1 is_obscure row: '
        f'{obscure_counts[obscure_counts != 1].head().to_dict()}'
    )
print('is_common/is_obscure recalculation verified')

# --- 2. Compute num_usable_synsets per word (after filtering) ---
# This tells downstream notebooks how many synsets survived for each word.
# Words with num_usable_synsets == 1 have identical common and obscure
# embeddings, which matters for stratified analysis.
def_usable = def_phrases_df.groupby('word').size().rename('num_usable_synsets')
def_phrases_df = def_phrases_df.merge(def_usable, on='word', how='left')

ans_usable = ans_phrases_df.groupby('word').size().rename('num_usable_synsets')
ans_phrases_df = ans_phrases_df.merge(ans_usable, on='word', how='left')

# --- 3. Flag fully unresolvable words ---
# These are words that had synsets in WordNet but ALL of those synsets were
# unresolvable (word appeared 2+ times in every definition, no clean example).
# Clue rows referencing these words must be dropped because we cannot produce
# any synset-based embedding for them.
all_def_words = set(unique_definitions_wn)
usable_def_words = set(def_phrases_df['word'].unique())
fully_unresolvable_defs = all_def_words - usable_def_words

all_ans_words = set(unique_answers_wn)
usable_ans_words = set(ans_phrases_df['word'].unique())
fully_unresolvable_ans = all_ans_words - usable_ans_words

# --- 4. Drop clue-context rows where definition appears 2+ times in surface ---
# insert_cale_delimiters wraps the first match, but if the definition appears
# again later in the surface, CALE sees the target word both inside and outside
# delimiters, which is ambiguous. We count occurrences and flag multi-match rows.
def count_def_occurrences(row):
    pattern = re.compile(re.escape(row['definition']), re.IGNORECASE)
    return len(pattern.findall(row['surface']))

df['def_surface_count'] = df.apply(count_def_occurrences, axis=1)
multi_occurrence_mask = df['def_surface_count'] > 1

# --- 5. Drop rows where definition_wn or answer_wn is fully unresolvable ---
unresolvable_mask = (df['definition_wn'].isin(fully_unresolvable_defs) |
                     df['answer_wn'].isin(fully_unresolvable_ans))

drop_mask = multi_occurrence_mask | unresolvable_mask
df_clean = df[~drop_mask].copy()

# --- 6. Add usable synset counts to df_clean for downstream use ---
# These columns let downstream notebooks know when common == obscure
# (num_usable_synsets == 1) without re-running WordNet lookups.
def_synset_map = def_phrases_df.groupby('word')['num_usable_synsets'].first()
ans_synset_map = ans_phrases_df.groupby('word')['num_usable_synsets'].first()
df_clean['def_num_usable_synsets'] = df_clean['definition_wn'].map(def_synset_map).astype(int)
df_clean['ans_num_usable_synsets'] = df_clean['answer_wn'].map(ans_synset_map).astype(int)

# --- 7. Rebuild clue_context_phrase on df_clean ---
# Recompute rather than carry forward, so the column is guaranteed consistent
# with the cleaned row set.
df_clean['clue_context_phrase'] = df_clean.apply(
    lambda row: insert_cale_delimiters(row['surface'], row['definition']), axis=1)

# --- Print summary ---
print(f'{"=" * 60}')
print(f'CLEANUP SUMMARY')
print(f'{"=" * 60}')
print(f'Unresolvable phrases removed: {n_unresolvable_def + n_unresolvable_ans} '
      f'(definition: {n_unresolvable_def}, answer: {n_unresolvable_ans})')
print(f'Fully unresolvable definitions: {sorted(fully_unresolvable_defs) if fully_unresolvable_defs else "none"}')
print(f'Fully unresolvable answers:     {sorted(fully_unresolvable_ans) if fully_unresolvable_ans else "none"}')
print()
print(f'Rows dropped — definition appears 2+ times in surface: {multi_occurrence_mask.sum():,}')
print(f'Rows dropped — fully unresolvable definition or answer: {unresolvable_mask.sum():,}')
print(f'Overlap (both conditions): {(multi_occurrence_mask & unresolvable_mask).sum():,}')
print(f'Total rows dropped: {drop_mask.sum():,}')
print(f'Rows remaining: {len(df_clean):,} of {len(df):,} ({len(df_clean) / len(df):.1%})')
print()
print(f'Sense variation (definitions): '
      f'{(def_synset_map >= 2).sum():,} words with 2+ usable synsets, '
      f'{(def_synset_map == 1).sum():,} with exactly 1')
print(f'Sense variation (answers):     '
      f'{(ans_synset_map >= 2).sum():,} words with 2+ usable synsets, '
      f'{(ans_synset_map == 1).sum():,} with exactly 1')
print()
print(f'Definition phrases remaining: {len(def_phrases_df):,}')
print(f'Answer phrases remaining:     {len(ans_phrases_df):,}')
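The multi-occurrence check in step 4 can be seen on the "right" example from the comments above. This sketch uses the same counting rule as count_def_occurrences (full substring, case-insensitive, non-overlapping); the helper name is illustrative:

```python
import re


def count_occurrences(definition, surface):
    """Count how many times the definition substring appears in the surface."""
    return len(re.findall(re.escape(definition), surface, re.IGNORECASE))


print(count_occurrences('right', 'Right to right a wrong'))   # 2 -> row dropped
print(count_occurrences('plant', 'Plant in a garden party'))  # 1 -> row kept
```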

Save Phrase Files¶

We save the constructed phrases to data/embeddings/ so the GPU embedding step (next section) can load them directly. This separation has two benefits:

  1. Modularity — the CPU-intensive WordNet lookup is decoupled from the GPU-intensive embedding. If the GPU step needs to be restarted (e.g., due to a Colab timeout or a Great Lakes job failure), the phrases don't need to be regenerated.

  2. Transparency — the phrase CSVs are human-readable, making it easy to inspect and debug the context sentences being fed to CALE. Reviewers can verify that <t></t> delimiters are correctly placed without running the full notebook.

Three files are saved:

  • definition_phrases.csv — one row per (definition_wn, synset) pair, with num_usable_synsets for downstream stratification. Unresolvable phrases have already been filtered out.
  • answer_phrases.csv — same structure for answers.
  • clue_context_phrases.csv — one row per cleaned clue row, with both the original definition (for traceability) and the <t></t>-delimited surface text. Includes def_num_usable_synsets and ans_num_usable_synsets so downstream notebooks can identify single-synset words without re-running WordNet lookups.
In [ ]:
# --- Save definition phrases (filtered, with num_usable_synsets) ---
def_phrases_path = EMBEDDINGS_DIR / 'definition_phrases.csv'
def_phrases_df.to_csv(def_phrases_path, index=False)
def_size_kb = def_phrases_path.stat().st_size / 1024
print(f'Saved {len(def_phrases_df):,} definition phrases to {def_phrases_path.name}')
print(f'  Columns: {list(def_phrases_df.columns)}')
print(f'  File size: {def_size_kb:.0f} KB')

# --- Save answer phrases (filtered, with num_usable_synsets) ---
ans_phrases_path = EMBEDDINGS_DIR / 'answer_phrases.csv'
ans_phrases_df.to_csv(ans_phrases_path, index=False)
ans_size_kb = ans_phrases_path.stat().st_size / 1024
print(f'\nSaved {len(ans_phrases_df):,} answer phrases to {ans_phrases_path.name}')
print(f'  Columns: {list(ans_phrases_df.columns)}')
print(f'  File size: {ans_size_kb:.0f} KB')

# --- Save clue-context phrases (cleaned rows with synset counts) ---
clue_ctx_path = EMBEDDINGS_DIR / 'clue_context_phrases.csv'
clue_ctx_cols = [
    'clue_id', 'definition', 'definition_wn', 'answer', 'answer_wn',
    'surface', 'clue_context_phrase',
    'def_num_usable_synsets', 'ans_num_usable_synsets',
]
clue_ctx_df = df_clean[clue_ctx_cols]
clue_ctx_df.to_csv(clue_ctx_path, index=False)
clue_size_kb = clue_ctx_path.stat().st_size / 1024
print(f'\nSaved {len(clue_ctx_df):,} clue-context phrases to {clue_ctx_path.name}')
print(f'  Columns: {list(clue_ctx_cols)}')
print(f'  File size: {clue_size_kb:.0f} KB')

# --- Verify all files ---
print(f'\n{"=" * 60}')
print(f'VERIFICATION')
print(f'{"=" * 60}')
for path, name, expected_rows in [
    (def_phrases_path, 'definition_phrases', len(def_phrases_df)),
    (ans_phrases_path, 'answer_phrases', len(ans_phrases_df)),
    (clue_ctx_path, 'clue_context_phrases', len(clue_ctx_df)),
]:
    assert path.exists(), f'{name} was not written: {path}'
    assert path.stat().st_size > 0, f'{name} is empty: {path}'
    # The word "nan" (grandmother) is a valid definition — prevent pandas
    # from interpreting it as NaN when reading the CSV back.
    check_df = pd.read_csv(path, keep_default_na=False)
    assert len(check_df) == expected_rows, (
        f'{name} row count mismatch: expected {expected_rows}, got {len(check_df)}'
    )
    print(f'  {name}: {len(check_df):,} rows verified')

print('\nAll phrase files saved and verified.')

Verification: Load and Validate Embeddings¶

This section runs on CPU after the GPU embedding script (scripts/embed_phrases.py) has completed. It loads all 6 output files from data/embeddings/, validates shapes and consistency, and spot-checks a few known embeddings to confirm the pipeline produced sensible results.

In [ ]:
# Load all 6 embedding output files produced by scripts/embed_phrases.py.
# keep_default_na=False prevents pandas from interpreting the word "nan"
# (grandmother) as NaN — see Decision 15 and the build_all_phrases cells.

definition_embeddings = np.load(EMBEDDINGS_DIR / 'definition_embeddings.npy')
definition_index = pd.read_csv(
    EMBEDDINGS_DIR / 'definition_index.csv', index_col=0, keep_default_na=False)

answer_embeddings = np.load(EMBEDDINGS_DIR / 'answer_embeddings.npy')
answer_index = pd.read_csv(
    EMBEDDINGS_DIR / 'answer_index.csv', index_col=0, keep_default_na=False)

clue_context_embeddings = np.load(EMBEDDINGS_DIR / 'clue_context_embeddings.npy')
clue_context_index = pd.read_csv(
    EMBEDDINGS_DIR / 'clue_context_index.csv', index_col=0, keep_default_na=False)

# Print shapes and memory usage
print(f'{"File":<35s} {"Shape":<25s} {"Memory":>8s}')
print(f'{"-"*35} {"-"*25} {"-"*8}')
for name, arr in [
    ('definition_embeddings.npy', definition_embeddings),
    ('answer_embeddings.npy', answer_embeddings),
    ('clue_context_embeddings.npy', clue_context_embeddings),
]:
    mb = arr.nbytes / 1024**2
    print(f'{name:<35s} {str(arr.shape):<25s} {mb:>6.1f} MB')

total_mb = (definition_embeddings.nbytes + answer_embeddings.nbytes
            + clue_context_embeddings.nbytes) / 1024**2
print(f'\nTotal embedding memory: {total_mb:.1f} MB')
print(f'\nIndex sizes:')
print(f'  definition_index: {len(definition_index):,} rows')
print(f'  answer_index:     {len(answer_index):,} rows')
print(f'  clue_context_index: {len(clue_context_index):,} rows')
In [ ]:
# --- Shape and consistency assertions ---

EMBED_DIM = 1024
n_def = len(definition_index)
n_ans = len(answer_index)
n_rows = len(clue_context_index)

# Shape checks
assert definition_embeddings.shape == (n_def, 3, EMBED_DIM), (
    f'definition_embeddings shape: expected ({n_def}, 3, {EMBED_DIM}), '
    f'got {definition_embeddings.shape}')
assert answer_embeddings.shape == (n_ans, 3, EMBED_DIM), (
    f'answer_embeddings shape: expected ({n_ans}, 3, {EMBED_DIM}), '
    f'got {answer_embeddings.shape}')
assert clue_context_embeddings.shape == (n_rows, EMBED_DIM), (
    f'clue_context_embeddings shape: expected ({n_rows}, {EMBED_DIM}), '
    f'got {clue_context_embeddings.shape}')

# Embedding dimension
assert definition_embeddings.shape[2] == EMBED_DIM
assert answer_embeddings.shape[2] == EMBED_DIM
assert clue_context_embeddings.shape[1] == EMBED_DIM

# No NaN values
assert not np.isnan(definition_embeddings).any(), 'NaN found in definition_embeddings'
assert not np.isnan(answer_embeddings).any(), 'NaN found in answer_embeddings'
assert not np.isnan(clue_context_embeddings).any(), 'NaN found in clue_context_embeddings'

# No all-zero rows — every embedding should have a nonzero L2 norm.
# An all-zero embedding would mean the model produced no representation,
# which would silently corrupt all downstream cosine similarities.
def_norms = np.linalg.norm(definition_embeddings.reshape(-1, EMBED_DIM), axis=1)
assert (def_norms > 0).all(), (
    f'{(def_norms == 0).sum()} all-zero rows in definition_embeddings')
ans_norms = np.linalg.norm(answer_embeddings.reshape(-1, EMBED_DIM), axis=1)
assert (ans_norms > 0).all(), (
    f'{(ans_norms == 0).sum()} all-zero rows in answer_embeddings')
cc_norms = np.linalg.norm(clue_context_embeddings, axis=1)
assert (cc_norms > 0).all(), (
    f'{(cc_norms == 0).sum()} all-zero rows in clue_context_embeddings')

# Index uniqueness — each word should appear exactly once in the index
assert definition_index['word'].is_unique, (
    f'Duplicate words in definition_index: '
    f'{definition_index["word"][definition_index["word"].duplicated()].tolist()[:5]}')
assert answer_index['word'].is_unique, (
    f'Duplicate words in answer_index: '
    f'{answer_index["word"][answer_index["word"].duplicated()].tolist()[:5]}')

# Cross-check: every definition_wn and answer_wn referenced by clue rows
# must have a corresponding embedding in the index files. If any are missing,
# downstream feature computation would fail silently with KeyErrors.
cc_phrases = pd.read_csv(
    EMBEDDINGS_DIR / 'clue_context_phrases.csv', keep_default_na=False)
def_words_in_clues = set(cc_phrases['definition_wn'].unique())
def_words_in_index = set(definition_index['word'].values)
missing_defs = def_words_in_clues - def_words_in_index
assert len(missing_defs) == 0, (
    f'{len(missing_defs)} definition_wn values in clue_context_phrases.csv '
    f'are missing from definition_index: {list(missing_defs)[:5]}')

ans_words_in_clues = set(cc_phrases['answer_wn'].unique())
ans_words_in_index = set(answer_index['word'].values)
missing_ans = ans_words_in_clues - ans_words_in_index
assert len(missing_ans) == 0, (
    f'{len(missing_ans)} answer_wn values in clue_context_phrases.csv '
    f'are missing from answer_index: {list(missing_ans)[:5]}')

print('All shape and consistency checks passed.')
print(f'  definition_embeddings: {definition_embeddings.shape}')
print(f'  answer_embeddings:     {answer_embeddings.shape}')
print(f'  clue_context_embeddings: {clue_context_embeddings.shape}')
print(f'  All {n_def:,} definition words and {n_ans:,} answer words '
      f'have embeddings for all {n_rows:,} clue rows.')
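The index-file contract relied on here (row position in the index CSV maps directly to the first axis of the `.npy` array, with slot 0 = allsense, 1 = common, 2 = obscure) can be illustrated on toy data; all names and values in this sketch are purely illustrative:

```python
import numpy as np
import pandas as pd

# Toy stand-ins for definition_embeddings / definition_index, with a
# 4-dim embedding instead of 1024 for readability.
emb = np.arange(3 * 3 * 4, dtype=float).reshape(3, 3, 4)   # (N, slots, dim)
index = pd.DataFrame({'word': ['plant', 'flower', 'letter']})

# Row position in the index maps directly to the first array axis.
pos = index.index[index['word'] == 'flower'][0]
common_slot = emb[pos, 1, :]     # slot 1 = common sense
print(pos, common_slot)
```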
In [ ]:
# --- Spot-check known embeddings ---
# Pick a few well-known words and verify their embeddings make semantic sense
# using cosine similarity. "plant" is a classic polysemous word in crosswords
# (botanical, factory, verb meaning to place), so it's a good test case.

from sklearn.metrics.pairwise import cosine_similarity


def lookup_embedding(index_df, embeddings_arr, word, slot=0):
    """Look up a word's embedding by name.

    Args:
        index_df: DataFrame with 'word' column (definition_index or answer_index)
        embeddings_arr: numpy array of shape (N, 3, 1024)
        word: string to look up
        slot: 0=allsense_avg, 1=common, 2=obscure

    Returns:
        1D numpy array of shape (1024,), or None if word not found
    """
    matches = index_df[index_df['word'] == word]
    if len(matches) == 0:
        return None
    row_pos = matches.index[0]
    return embeddings_arr[row_pos, slot, :]


def cos_sim(a, b):
    """Cosine similarity between two 1D vectors."""
    return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0, 0]


# Test words — chosen because they are common in cryptic crosswords and
# span a range of semantic relatedness. We check definition_index first,
# then fall back to answer_index.
test_words = ['plant', 'flower', 'banana', 'letter']
slot_names = {0: 'allsense', 1: 'common', 2: 'obscure'}

# Look up all test words, preferring definition_index
word_embs = {}
for w in test_words:
    emb = lookup_embedding(definition_index, definition_embeddings, w)
    source = 'definition_index'
    if emb is None:
        emb = lookup_embedding(answer_index, answer_embeddings, w)
        source = 'answer_index'
    if emb is not None:
        # Store all 3 slots
        idx_df = definition_index if source == 'definition_index' else answer_index
        arr = definition_embeddings if source == 'definition_index' else answer_embeddings
        word_embs[w] = {
            slot_names[s]: lookup_embedding(idx_df, arr, w, slot=s)
            for s in range(3)
        }
        print(f'Found "{w}" in {source}')
    else:
        print(f'"{w}" not found in either index — skipping')

print()

# 1. Polysemy check: "plant" common vs obscure
if 'plant' in word_embs:
    p = word_embs['plant']
    sim_co = cos_sim(p['common'], p['obscure'])
    sim_ac = cos_sim(p['allsense'], p['common'])
    sim_ao = cos_sim(p['allsense'], p['obscure'])
    print(f'=== Polysemy check: "plant" ===')
    print(f'  cos(common, obscure)  = {sim_co:.4f}  (< 1.0 if multiple senses)')
    print(f'  cos(allsense, common) = {sim_ac:.4f}')
    print(f'  cos(allsense, obscure) = {sim_ao:.4f}')
    print()

# 2–4. Pairwise similarity checks
pairs = [
    ('plant', 'flower', 'related words — should be moderately high'),
    ('plant', 'banana', 'somewhat related — should be lower than plant/flower'),
    ('plant', 'letter', 'unrelated — should be low'),
]
print(f'=== Pairwise allsense similarity ===')
for w1, w2, interpretation in pairs:
    if w1 in word_embs and w2 in word_embs:
        sim = cos_sim(word_embs[w1]['allsense'], word_embs[w2]['allsense'])
        print(f'  cos("{w1}", "{w2}") = {sim:.4f}  ({interpretation})')
    else:
        missing = [w for w in [w1, w2] if w not in word_embs]
        print(f'  cos("{w1}", "{w2}") — skipped, missing: {missing}')
In [ ]:
# --- Embedding distribution summary ---
# Overall statistics about the embedding space. These help us assess whether
# the CALE model is producing reasonable, well-distributed representations.

# L2 norms by embedding type — should be roughly consistent across types.
# Large outliers would indicate degenerate embeddings.
print(f'{"Embedding Type":<30s} {"Mean L2 Norm":>12s} {"Std":>8s}')
print(f'{"-"*30} {"-"*12} {"-"*8}')

norm_stats = []
for name, arr in [
    ('definition allsense',  definition_embeddings[:, 0, :]),
    ('definition common',    definition_embeddings[:, 1, :]),
    ('definition obscure',   definition_embeddings[:, 2, :]),
    ('answer allsense',      answer_embeddings[:, 0, :]),
    ('answer common',        answer_embeddings[:, 1, :]),
    ('answer obscure',       answer_embeddings[:, 2, :]),
    ('clue-context',         clue_context_embeddings),
]:
    norms = np.linalg.norm(arr, axis=1)
    norm_stats.append((name, norms.mean(), norms.std()))
    print(f'{name:<30s} {norms.mean():>12.4f} {norms.std():>8.4f}')

# Mean cosine similarity between common and obscure senses — tells us how
# different the two sense extremes are on average. If close to 1.0, the
# embeddings are not discriminating between senses (which could happen for
# single-synset words or if the model treats all senses similarly).
print(f'\n{"="*60}')
print('Common vs. Obscure Sense Similarity')
print(f'{"="*60}')

def pairwise_cos_sim(arr_a, arr_b):
    """Row-wise cosine similarity between two (N, D) arrays."""
    # Normalize rows, then take dot product
    a_norm = arr_a / np.linalg.norm(arr_a, axis=1, keepdims=True)
    b_norm = arr_b / np.linalg.norm(arr_b, axis=1, keepdims=True)
    return np.sum(a_norm * b_norm, axis=1)

def_cos_co = pairwise_cos_sim(
    definition_embeddings[:, 1, :],  # common
    definition_embeddings[:, 2, :],  # obscure
)
ans_cos_co = pairwise_cos_sim(
    answer_embeddings[:, 1, :],
    answer_embeddings[:, 2, :],
)

print(f'Definitions — cos(common, obscure):')
print(f'  Mean: {def_cos_co.mean():.4f}  Std: {def_cos_co.std():.4f}  '
      f'Min: {def_cos_co.min():.4f}  Max: {def_cos_co.max():.4f}')
print(f'Answers — cos(common, obscure):')
print(f'  Mean: {ans_cos_co.mean():.4f}  Std: {ans_cos_co.std():.4f}  '
      f'Min: {ans_cos_co.min():.4f}  Max: {ans_cos_co.max():.4f}')

# Count words where common == obscure (cos ≈ 1.0), which should correspond
# to single-usable-synset words. We use a tight threshold (> 0.9999) because
# floating-point averaging may produce tiny differences even for identical
# source phrases.
def_identical = (def_cos_co > 0.9999).sum()
ans_identical = (ans_cos_co > 0.9999).sum()

# Compare to num_usable_synsets counts from the phrase DataFrames
def_phrases_check = pd.read_csv(
    EMBEDDINGS_DIR / 'definition_phrases.csv', keep_default_na=False)
ans_phrases_check = pd.read_csv(
    EMBEDDINGS_DIR / 'answer_phrases.csv', keep_default_na=False)
def_single = (def_phrases_check.groupby('word')['num_usable_synsets'].first() == 1).sum()
ans_single = (ans_phrases_check.groupby('word')['num_usable_synsets'].first() == 1).sum()

print(f'\nSingle-synset words (common == obscure):')
print(f'  Definitions: {def_identical:,} with cos > 0.9999  '
      f'(vs {def_single:,} with num_usable_synsets == 1)')
print(f'  Answers:     {ans_identical:,} with cos > 0.9999  '
      f'(vs {ans_single:,} with num_usable_synsets == 1)')

Summary¶

This notebook implements PLAN.md Step 2: generating all 7 embedding types for the clue misdirection pipeline using CALE (gabrielloiseau/CALE-MBERT-en, 1024-dim). The work is split into a CPU portion (phrase construction, cells 0–20) and a GPU portion (embedding generation via scripts/embed_phrases.py, verified in cells 21–25).

CPU: Phrase Construction¶

  1. WordNet-ready strings — Created definition_wn and answer_wn columns by lowercasing and applying a 5-priority lookup cascade (as-is, underscore multi-word, article-stripped, article-stripped + underscore, fallback). This handles multi-word WordNet entries like ice_cream and a_little.

  2. Synset phrase construction — For each unique definition/answer string, looked up all WordNet synsets and built a <t></t>-delimited context phrase for each using a 4-level priority cascade:

    • Usage example (preferred — natural sentences closest to CALE's training)
    • Definition text (if target word appears exactly once)
    • Fallback: <t>word</t>: definition_text
    • Unresolvable (removed — word appears 2+ times with no clean alternative)
  3. Clue-context phrases — For each clue row, inserted <t></t> delimiters around the definition substring within the surface text.

  4. Cleanup — Removed unresolvable synset phrases, recalculated is_common/is_obscure flags for words that lost their first or last synset, and dropped clue rows where the definition appears 2+ times in the surface (ambiguous delimiter placement) or references fully unresolvable words. Total rows dropped: 1,186 (0.49%), leaving 240,211 rows.
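The 5-priority lookup cascade in step 1 can be sketched as follows. This is a minimal illustration, not the notebook's actual code: `has_entry` and `KNOWN_LEMMAS` are stand-ins for a real WordNet query (e.g. nltk's `wordnet.synsets`), and the article list is assumed.

```python
# Minimal sketch of the 5-priority WordNet lookup cascade (step 1).
# `has_entry` stands in for a real WordNet query; a tiny set of
# illustrative lemmas is used here instead of the full lexicon.

KNOWN_LEMMAS = {"ice_cream", "a_little", "plant"}  # illustrative only

def has_entry(key):
    return key in KNOWN_LEMMAS

ARTICLES = ("a ", "an ", "the ")

def strip_article(phrase):
    for art in ARTICLES:
        if phrase.startswith(art):
            return phrase[len(art):]
    return phrase

def wordnet_key(phrase):
    """Return the first cascade variant with a WordNet entry, else None."""
    s = phrase.lower()
    candidates = [
        s,                                    # 1. as-is
        s.replace(" ", "_"),                  # 2. underscore multi-word
        strip_article(s),                     # 3. article-stripped
        strip_article(s).replace(" ", "_"),   # 4. stripped + underscore
    ]
    for cand in candidates:
        if has_entry(cand):
            return cand
    return None                               # 5. fallback (no entry found)

print(wordnet_key("Ice cream"))   # -> "ice_cream"
print(wordnet_key("a little"))    # -> "a_little"
print(wordnet_key("xyzzy"))       # -> None
```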
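The 4-level phrase cascade (step 2) and the clue-context delimiter insertion (step 3) can be sketched together. Synsets are mocked as plain dicts here; the real notebook pulls examples and definitions from WordNet, so function names and the dict shape are assumptions.

```python
# Sketch of the 4-level <t></t> phrase cascade (step 2) and the
# clue-context delimiter insertion (step 3). Synset objects are mocked
# as dicts with "examples" and "definition" keys.

def tag(text, word):
    """Wrap the first occurrence of `word` in <t></t> delimiters."""
    return text.replace(word, f"<t>{word}</t>", 1)

def synset_phrase(word, synset):
    """Build a <t></t>-delimited context phrase for one synset, or None."""
    # Level 1: usage example containing the word exactly once (preferred)
    for ex in synset.get("examples", []):
        if ex.count(word) == 1:
            return tag(ex, word)
    # Level 2: definition text, if the target word appears exactly once
    defn = synset.get("definition", "")
    if defn.count(word) == 1:
        return tag(defn, word)
    # Level 3: fallback "<t>word</t>: definition_text" template
    if defn.count(word) == 0:
        return f"<t>{word}</t>: {defn}"
    return None  # Level 4: unresolvable (word appears 2+ times)

def clue_context_phrase(surface, definition):
    """Delimit the definition inside the clue surface; None if ambiguous."""
    if surface.count(definition) != 1:
        return None  # dropped in cleanup (0 or 2+ occurrences)
    return tag(surface, definition)

syn = {"examples": ["the plant grew tall"], "definition": "a living organism"}
print(synset_phrase("plant", syn))   # -> "the <t>plant</t> grew tall"
```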

GPU: Embedding Generation¶

Performed by scripts/embed_phrases.py (submitted to Great Lakes via scripts/embed_phrases.sh). The CALE model encodes all phrases and produces 6 output files in data/embeddings/:

| File | Shape | Description |
|------|-------|-------------|
| definition_embeddings.npy | (N_def, 3, 1024) | Per unique definition: [allsense_avg, common, obscure] |
| definition_index.csv | N_def rows | Maps row position to definition_wn string |
| answer_embeddings.npy | (N_ans, 3, 1024) | Per unique answer: [allsense_avg, common, obscure] |
| answer_index.csv | N_ans rows | Maps row position to answer_wn string |
| clue_context_embeddings.npy | (N_rows, 1024) | Per clue row: word1_clue_context |
| clue_context_index.csv | N_rows rows | Maps row position to clue_id |

Verification Results¶

  • All shapes match their index files; embedding dimension is 1024 throughout
  • No NaN values or all-zero rows in any embedding array
  • Every definition_wn and answer_wn referenced by clue rows has a corresponding embedding — no missing entries for downstream feature computation
  • Spot-check: polysemous words (e.g., "plant") show distinct common vs. obscure embeddings (cos < 1.0), related words have higher similarity than unrelated words, and allsense averages sit between the sense extremes — all consistent with CALE discriminating between senses as expected
  • Single-synset word count (cos(common, obscure) > 0.9999) matches the num_usable_synsets == 1 count from phrase construction

Sense Variation¶

  • Words with only 1 usable synset have identical common and obscure embeddings. For these words, the common-vs-obscure comparison in downstream analyses is not meaningful — downstream notebooks should use num_usable_synsets (or def_num_usable_synsets / ans_num_usable_synsets from clue_context_phrases.csv) to stratify.
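The stratification described above might look like the following. This is a sketch with a stand-in DataFrame; only the column name `def_num_usable_synsets` comes from the notebook's description of clue_context_phrases.csv.

```python
import pandas as pd

# Sketch: stratify clue rows by usable-synset count before running any
# common-vs-obscure comparison. The DataFrame is a stand-in for
# clue_context_phrases.csv; only the column name is taken from the text.
clues = pd.DataFrame({
    "clue_id": [1, 2, 3],
    "def_num_usable_synsets": [1, 4, 2],
})

# Common-vs-obscure comparisons are only meaningful with 2+ usable synsets
polysemous = clues[clues["def_num_usable_synsets"] > 1]
monosemous = clues[clues["def_num_usable_synsets"] == 1]
print(len(polysemous), len(monosemous))  # -> 2 1
```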

Downstream Usage¶

Downstream notebooks (Step 3 onwards) should:

  1. Load the .npy arrays and index CSVs with keep_default_na=False (the word "nan" meaning grandmother is a valid entry)
  2. For each row in clues_filtered.csv, look up its definition_wn in definition_index.csv to get the row position in definition_embeddings.npy, and similarly for answer_wn
  3. Clue-context embeddings are aligned with clue_context_index.csv row order
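The lookup pattern in steps 1-2 can be sketched with tiny in-memory stand-ins for the real artifacts. The file-loading lines in the comment use the paths from the table above; the random array and example strings are placeholders.

```python
import numpy as np
import pandas as pd

# Sketch of the downstream lookup pattern. With the real files:
#   emb = np.load(EMBEDDINGS_DIR / 'definition_embeddings.npy')
#   idx = pd.read_csv(EMBEDDINGS_DIR / 'definition_index.csv',
#                     keep_default_na=False)   # "nan" is a real entry!
# Here a small random array and a toy index stand in for them.
definition_embeddings = np.random.rand(3, 3, 1024)  # (N_def, 3, 1024)
definition_index = pd.DataFrame({"definition_wn": ["plant", "nan", "oak"]})

# Build a string -> row-position map once, then index with it
def_pos = {w: i for i, w in enumerate(definition_index["definition_wn"])}

row_defs = ["oak", "plant"]                 # definition_wn per clue row
positions = [def_pos[w] for w in row_defs]
allsense = definition_embeddings[positions, 0, :]  # slot 0 = allsense_avg
print(allsense.shape)  # -> (2, 1024)
```

The same map-then-index pattern applies to answer_wn against answer_index.csv; clue-context embeddings need no map because they are already in clue_context_index.csv row order.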

Findings for FINDINGS.md¶

  • CALE's <t></t> mechanism produces genuinely distinct embeddings for different senses of polysemous words, confirming Decision 1 (model choice)
  • The allsense-average approach (Decision 14) produces embeddings that sit between the sense extremes in cosine space, as intended
  • 0.49% of rows were dropped due to unresolvable phrases or multi-occurrence definitions — negligible data loss (Decision 16)