Model Comparison: Choosing an Embedding Model for Clue Misdirection¶

Primary author: Victoria

Builds on:

  • Hans_Supervised_Learning.ipynb (Hans — embedding generation approach with all-mpnet-base-v2)
  • 02_embedding_generation.ipynb (Victoria, indicator_clustering — BGE-M3 model reference)
  • CALE paper (Liétard & Loiseau, 2025 — Concept-Aligned Embeddings)

Prompt engineering: Victoria
AI assistance: Claude (Anthropic)
Environment: Great Lakes or a local machine; the small test set runs acceptably even without a GPU


This notebook compares three candidate embedding models for the clue_misdirection pipeline. Our pipeline requires 7 distinct embedding types per clue row (see PLAN.md Step 2), including word-in-context embeddings that isolate the definition word's meaning within the clue sentence. Not all models can produce genuinely distinct embeddings for these types.

We evaluate each model's ability to:

  1. Differentiate a target word's embedding from the full sentence embedding
  2. Discriminate between semantically related vs. unrelated bare words
  3. Support sense-specific embeddings via WordNet synset contexts

Models compared:

  • gabrielloiseau/CALE-MBERT-en (CALE — Concept-Aligned Embeddings, 1024-dim)
  • BAAI/bge-base-en-v1.5 (BGE — general-purpose sentence transformer, 768-dim)
  • all-mpnet-base-v2 (MPNet — Hans's original model, 768-dim)

Input: Hardcoded test data (this notebook does NOT read clues_filtered.csv).
Output: None — this is a reference/evidence notebook documenting the rationale for Decision 1 (DECISIONS.md) and Decisions 13–14.

Imports¶

In [1]:
import time
import numpy as np
import pandas as pd
import torch
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

import nltk
from nltk.corpus import wordnet as wn

try:
    wn.synsets("test")
except LookupError:
    nltk.download("wordnet", quiet=True)

Environment Detection and GPU Check¶

In [2]:
# --- Environment Auto-Detection ---
import os

try:
    IS_COLAB = "google.colab" in str(get_ipython())
except NameError:
    IS_COLAB = False

IS_GREAT_LAKES = "SLURM_JOB_ID" in os.environ

if IS_COLAB:
    env_name = "Google Colab"
elif IS_GREAT_LAKES:
    env_name = "Great Lakes"
else:
    env_name = "Local"

print(f"Environment: {env_name}")

# GPU check
GPU_AVAILABLE = torch.cuda.is_available()
if GPU_AVAILABLE:
    DEVICE_NAME = torch.cuda.get_device_name(0)
    print(f"GPU available: {DEVICE_NAME}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    DEVICE_NAME = "CPU"
    print("WARNING: No GPU detected. Continuing on CPU (will be slower).")
Environment: Local
WARNING: No GPU detected. Continuing on CPU (will be slower).

Test Data¶

We use a small set of hardcoded (definition, answer, surface) tuples chosen to exercise different properties relevant to our pipeline:

  • plant / aster: "plant" is highly polysemous (botanical, industrial, espionage). "aster" is a specific flower — a good model should connect the botanical sense of "plant" to "aster" more strongly than other senses.
  • letter / epistle: "letter" appears twice in the clue surface (a common cryptic device), and has meanings in both communication and alphabet domains.
  • bank / shore: "bank" is a classic polysemy example (financial vs. riverbank). The clue deliberately misleads toward the financial sense.
  • key / note: Both words are polysemous (music, importance, typing). The clue surface frames "key" in a way that could suggest any sense.
In [3]:
TEST_PAIRS = [
    {"definition": "plant",  "answer": "aster",   "surface": "Plant in a garden party"},
    {"definition": "letter", "answer": "epistle", "surface": "Foreign letter coming in is the French letter"},
    {"definition": "bank",   "answer": "shore",   "surface": "Bank offering a current account"},
    {"definition": "key",    "answer": "note",    "surface": "Key figure in music"},
]

test_df = pd.DataFrame(TEST_PAIRS)
test_df
Out[3]:
definition answer surface
0 plant aster Plant in a garden party
1 letter epistle Foreign letter coming in is the French letter
2 bank shore Bank offering a current account
3 key note Key figure in music

Note on BGE-M3¶

BAAI/bge-m3 (1024-dim) was also considered as a candidate but was excluded from the full comparison. Extracting the definition word's embedding within the sentence proved technically difficult with this model, and three further concerns ruled it out:

  1. Dimension confusion: Its 1024-dim output is coincidentally the same as CALE's, but from a completely different model family. Including both would create potential confusion about which 1024-dim model produced which embeddings.
  2. Weak standalone discrimination: In preliminary testing, BGE-M3 showed poor differentiation between semantically related and unrelated words (e.g., cos(plant, aster) ≈ 0.50 vs. cos(plant, banana) ≈ 0.54 — nearly indistinguishable).
  3. Component separation: BGE-M3 is already used by the indicator_clustering component for a different purpose. Keeping the two components on separate models avoids coupling and makes it clear which model produced which results.
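
The bare-word discrimination check behind point 2 can be sketched generically. Everything below is illustrative: `discrimination_gap` is a hypothetical helper (not part of the pipeline), and the toy vectors stand in for real model outputs — with an actual model you would pass `model.encode` as the encoder.

```python
import numpy as np


def discrimination_gap(encode, target, related, unrelated):
    """Return cos(target, related) - cos(target, unrelated) under a given
    encoder. A model with usable bare-word embeddings should yield a
    clearly positive gap (e.g. plant/aster vs. plant/banana)."""
    def cos(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    t, r, u = encode(target), encode(related), encode(unrelated)
    return cos(t, r) - cos(t, u)


# Toy vectors standing in for a real model: "plant" and "aster" are
# constructed to be closer than "plant" and "banana".
toy_vectors = {
    "plant":  np.array([1.0, 0.9, 0.1]),
    "aster":  np.array([0.9, 1.0, 0.2]),
    "banana": np.array([0.1, 0.2, 1.0]),
}
gap = discrimination_gap(toy_vectors.get, "plant", "aster", "banana")
print(f"toy gap: {gap:.3f}")  # clearly positive for this toy data
```

In the preliminary BGE-M3 run, the corresponding gap was roughly 0.50 − 0.54 ≈ −0.04, i.e. no usable separation.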

Section 1: Basic Model Properties¶

Load each of the three candidate models and report their key properties: embedding dimension, maximum sequence length, and approximate parameter count. This establishes the baseline technical characteristics before we compare embedding quality.

In [4]:
MODEL_NAMES = [
    "gabrielloiseau/CALE-MBERT-en",
    "BAAI/bge-base-en-v1.5",
    "all-mpnet-base-v2",
]

models = {}
model_info = []

for name in MODEL_NAMES:
    print(f"Loading {name}...")
    t0 = time.time()
    m = SentenceTransformer(name)
    load_time = time.time() - t0

    dim = m.get_sentence_embedding_dimension()
    max_seq = m.max_seq_length

    # Count parameters in the underlying transformer
    n_params = sum(p.numel() for p in m.parameters())

    models[name] = m
    model_info.append({
        "Model": name.split("/")[-1],
        "Full Name": name,
        "Dimension": dim,
        "Max Seq Length": max_seq,
        "Parameters (M)": round(n_params / 1e6, 1),
        "Load Time (s)": round(load_time, 1),
    })
    print(f"  dim={dim}, max_seq={max_seq}, params={n_params/1e6:.1f}M, "
          f"loaded in {load_time:.1f}s")
    print()

info_df = pd.DataFrame(model_info).set_index("Model")
print("=" * 70)
print("Model Summary")
print("=" * 70)
info_df
Loading gabrielloiseau/CALE-MBERT-en...
  dim=1024, max_seq=8192, params=394.8M, loaded in 4.4s

Loading BAAI/bge-base-en-v1.5...
Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
BertModel LOAD REPORT from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
  dim=768, max_seq=512, params=109.5M, loaded in 2.1s

Loading all-mpnet-base-v2...
MPNetModel LOAD REPORT from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
  dim=768, max_seq=384, params=109.5M, loaded in 2.1s

======================================================================
Model Summary
======================================================================
Out[4]:
Full Name Dimension Max Seq Length Parameters (M) Load Time (s)
Model
CALE-MBERT-en gabrielloiseau/CALE-MBERT-en 1024 8192 394.8 4.4
bge-base-en-v1.5 BAAI/bge-base-en-v1.5 768 512 109.5 2.1
all-mpnet-base-v2 all-mpnet-base-v2 768 384 109.5 2.1

Section 2: Four-Type Similarity Matrix¶

For each model and each test pair, we compute four embedding types:

  • W1_avg: model.encode(definition). The bare definition word, with sentence-level pooling.
  • W1_clue_ctx: the target word extracted from the clue context (see below). The definition's meaning as shifted by the clue.
  • Sentence1: model.encode(surface). The full clue sentence embedding.
  • W2_avg: model.encode(answer). The bare answer word.

How W1_clue_ctx is computed differs by model:

  • CALE: Uses its native <t></t> delimiter mechanism — wrap the definition word in the surface text with <t> and </t> tags, then call model.encode(). CALE was trained to focus its embedding on the delimited word.
  • BGE-base and MPNet: These models have no delimiter mechanism. Instead, we use model.encode(sentence, output_value='token_embeddings') to get per-token embeddings, find the tokens corresponding to the definition word via the tokenizer's offset_mapping, and average those token embeddings. This is the standard approach for extracting word-level representations from sentence transformers.

Key metric to watch: cos(W1_clue_ctx, Sentence1). If this is close to 1.0, the model does not meaningfully differentiate between the target word embedding and the full sentence embedding — the two embedding types would be redundant in our pipeline. CALE should show genuine separation here; the other models likely will not.
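
The redundancy criterion can be made concrete with a small check. This is a sketch: clue_ctx_redundant is a hypothetical helper, and the 0.85 cutoff is an illustrative threshold, not a project decision.

```python
import numpy as np


def clue_ctx_redundant(w1_clue_ctx, sentence1, threshold=0.85):
    """Return (cosine, redundant_flag): the word-in-context embedding is
    treated as effectively redundant with the full sentence embedding when
    their cosine similarity exceeds the (illustrative) threshold."""
    a = np.asarray(w1_clue_ctx, dtype=float)
    b = np.asarray(sentence1, dtype=float)
    cos = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
    return cos, cos >= threshold


# Toy vectors mimicking the two regimes we expect to see below:
# a near-duplicate pair vs. a moderately related pair.
near_dup = clue_ctx_redundant([1.0, 0.2, 0.1], [0.98, 0.25, 0.12])
distinct = clue_ctx_redundant([1.0, 0.2, 0.1], [0.3, 1.0, 0.4])
print("near-duplicate:", near_dup)
print("distinct:      ", distinct)
```

Applied to the matrices below, the token-extraction models land in the first regime and CALE in the second.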

In [5]:
import re


def insert_cale_delimiters(surface, definition):
    """Insert <t></t> delimiters around the definition word in the surface text.

    Uses case-insensitive matching so that 'Plant' in the surface matches
    'plant' as the definition. Returns the modified surface string.
    """
    # Case-insensitive search for the definition within the surface
    pattern = re.compile(re.escape(definition), re.IGNORECASE)
    match = pattern.search(surface)
    if match:
        start, end = match.start(), match.end()
        return surface[:start] + "<t>" + surface[start:end] + "</t>" + surface[end:]
    # Fallback: prepend the delimited word (should not happen with our test data)
    return f"<t>{definition}</t> {surface}"


def extract_token_embedding(model, surface, definition):
    """Extract the embedding for a specific word from a sentence using
    token-level embeddings and offset_mapping.

    For non-CALE models that lack a delimiter mechanism, this is the standard
    approach: encode the full sentence to get per-token embeddings, identify
    which tokens correspond to the target word using character offsets, and
    average those token embeddings.
    """
    # Find the character-level position of the definition in the surface
    pattern = re.compile(re.escape(definition), re.IGNORECASE)
    match = pattern.search(surface)
    if not match:
        # Fallback: return sentence-level embedding (should not happen)
        return model.encode(surface)

    def_start, def_end = match.start(), match.end()

    # Get per-token embeddings from the model
    token_embeddings = model.encode(surface, output_value="token_embeddings")

    # Get the tokenizer's offset_mapping to find which tokens correspond
    # to our target word's character span
    tokenizer = model.tokenizer
    encoded = tokenizer(
        surface,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
    )
    offsets = encoded["offset_mapping"][0].tolist()  # list of (start, end) tuples

    # Find token indices that overlap with the definition's character span.
    # A token overlaps if its character range intersects with [def_start, def_end).
    target_indices = []
    for idx, (tok_start, tok_end) in enumerate(offsets):
        # Skip special tokens (offset 0,0 for [CLS], [SEP], etc.)
        if tok_start == 0 and tok_end == 0 and idx > 0:
            continue
        # Check overlap with definition span
        if tok_start < def_end and tok_end > def_start:
            target_indices.append(idx)

    if not target_indices:
        # Fallback: return sentence-level embedding
        return model.encode(surface)

    # Average the token embeddings for the target word.
    # token_embeddings shape: (n_tokens, dim)
    # Use .cpu().numpy() rather than np.array(...) to avoid NumPy 2.x
    # __array__ copy-keyword deprecation warnings.
    target_embs = token_embeddings[target_indices].cpu().numpy()
    return np.mean(target_embs, axis=0)


# Quick test of the delimiter insertion
test_surface = "Plant in a garden party"
test_def = "plant"
print(f"Delimiter insertion test:")
print(f"  Input:  '{test_surface}' + definition='{test_def}'")
print(f"  Output: '{insert_cale_delimiters(test_surface, test_def)}'")
Delimiter insertion test:
  Input:  'Plant in a garden party' + definition='plant'
  Output: '<t>Plant</t> in a garden party'
In [6]:
CALE_NAME = "gabrielloiseau/CALE-MBERT-en"
EMB_LABELS = ["W1_avg", "W1_clue_ctx", "Sentence1", "W2_avg"]


def compute_four_embeddings(model_name, model, definition, answer, surface):
    """Compute the four embedding types for a single (definition, answer, surface) tuple.

    Returns a dict mapping embedding label to a 1D numpy array.
    """
    # W1_avg: bare definition word
    w1_avg = model.encode(definition)

    # Sentence1: full surface sentence
    sentence1 = model.encode(surface)

    # W2_avg: bare answer word
    w2_avg = model.encode(answer)

    # W1_clue_ctx: definition word in clue context
    if model_name == CALE_NAME:
        # CALE: use <t></t> delimiters
        delimited_surface = insert_cale_delimiters(surface, definition)
        w1_clue_ctx = model.encode(delimited_surface)
    else:
        # Non-CALE: extract token embeddings for the definition word
        w1_clue_ctx = extract_token_embedding(model, surface, definition)

    return {
        "W1_avg": w1_avg,
        "W1_clue_ctx": w1_clue_ctx,
        "Sentence1": sentence1,
        "W2_avg": w2_avg,
    }
In [7]:
def print_similarity_matrix(embeddings, labels, title):
    """Print a 4x4 cosine similarity matrix for a set of embeddings."""
    vecs = np.array([embeddings[label] for label in labels])
    sim_matrix = cosine_similarity(vecs)

    print(f"\n{title}")
    print("-" * len(title))

    # Header
    header = f"{'':>15s}" + "".join(f"{l:>15s}" for l in labels)
    print(header)

    for i, label in enumerate(labels):
        row = f"{label:>15s}"
        for j in range(len(labels)):
            row += f"{sim_matrix[i, j]:>15.4f}"
        print(row)

    return sim_matrix
In [8]:
# Compute and display 4x4 similarity matrices for all models and test pairs.
# We also collect the key metric cos(W1_clue_ctx, Sentence1) for the
# summary comparison at the end of this section.

key_metric_rows = []  # for the summary table

for model_name in MODEL_NAMES:
    model = models[model_name]
    short_name = model_name.split("/")[-1]
    print("=" * 70)
    print(f"MODEL: {short_name}")
    print("=" * 70)

    for _, row in test_df.iterrows():
        defn = row["definition"]
        ans = row["answer"]
        surf = row["surface"]

        embs = compute_four_embeddings(model_name, model, defn, ans, surf)
        title = f"{short_name} | {defn} / {ans}"
        sim_mat = print_similarity_matrix(embs, EMB_LABELS, title)

        # Extract cos(W1_clue_ctx, Sentence1) — indices 1 and 2 in EMB_LABELS
        ctx_sent_sim = sim_mat[1, 2]
        # Also extract cos(W1_avg, W2_avg) for reference
        w1w2_sim = sim_mat[0, 3]

        key_metric_rows.append({
            "Model": short_name,
            "Definition": defn,
            "Answer": ans,
            "cos(W1_avg, W2_avg)": round(w1w2_sim, 4),
            "cos(W1_clue_ctx, Sentence1)": round(ctx_sent_sim, 4),
        })

    print()
======================================================================
MODEL: CALE-MBERT-en
======================================================================

CALE-MBERT-en | plant / aster
-----------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.5446         0.7083         0.7715
    W1_clue_ctx         0.5446         1.0000         0.6589         0.4015
      Sentence1         0.7083         0.6589         1.0000         0.6316
         W2_avg         0.7715         0.4015         0.6316         1.0000

CALE-MBERT-en | letter / epistle
--------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.1197         0.6826         0.8977
    W1_clue_ctx         0.1197         1.0000         0.2778         0.1684
      Sentence1         0.6826         0.2778         1.0000         0.7319
         W2_avg         0.8977         0.1684         0.7319         1.0000

CALE-MBERT-en | bank / shore
----------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.4240         0.6895         0.7750
    W1_clue_ctx         0.4240         1.0000         0.6774         0.2975
      Sentence1         0.6895         0.6774         1.0000         0.5259
         W2_avg         0.7750         0.2975         0.5259         1.0000

CALE-MBERT-en | key / note
--------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.4095         0.6313         0.8063
    W1_clue_ctx         0.4095         1.0000         0.4848         0.3305
      Sentence1         0.6313         0.4848         1.0000         0.6289
         W2_avg         0.8063         0.3305         0.6289         1.0000

======================================================================
MODEL: bge-base-en-v1.5
======================================================================
bge-base-en-v1.5 | plant / aster
--------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.7091         0.7218         0.5504
    W1_clue_ctx         0.7091         1.0000         0.9292         0.4421
      Sentence1         0.7218         0.9292         1.0000         0.4313
         W2_avg         0.5504         0.4421         0.4313         1.0000
bge-base-en-v1.5 | letter / epistle
-----------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.7281         0.7133         0.7170
    W1_clue_ctx         0.7281         1.0000         0.9381         0.5900
      Sentence1         0.7133         0.9381         1.0000         0.5706
         W2_avg         0.7170         0.5900         0.5706         1.0000

bge-base-en-v1.5 | bank / shore
-------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.7284         0.7089         0.5384
    W1_clue_ctx         0.7284         1.0000         0.9305         0.4638
      Sentence1         0.7089         0.9305         1.0000         0.4279
         W2_avg         0.5384         0.4638         0.4279         1.0000
bge-base-en-v1.5 | key / note
-----------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.6638         0.6286         0.7195
    W1_clue_ctx         0.6638         1.0000         0.9447         0.6224
      Sentence1         0.6286         0.9447         1.0000         0.6292
         W2_avg         0.7195         0.6224         0.6292         1.0000

======================================================================
MODEL: all-mpnet-base-v2
======================================================================
all-mpnet-base-v2 | plant / aster
---------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.5211         0.4669         0.2096
    W1_clue_ctx         0.5211         1.0000         0.9039         0.1133
      Sentence1         0.4669         0.9039         1.0000         0.1118
         W2_avg         0.2096         0.1133         0.1118         1.0000
all-mpnet-base-v2 | letter / epistle
------------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.5327         0.4627         0.3575
    W1_clue_ctx         0.5327         1.0000         0.9103         0.0821
      Sentence1         0.4627         0.9103         1.0000         0.0990
         W2_avg         0.3575         0.0821         0.0990         1.0000

all-mpnet-base-v2 | bank / shore
--------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.6340         0.5653         0.2583
    W1_clue_ctx         0.6340         1.0000         0.8804         0.0845
      Sentence1         0.5653         0.8804         1.0000         0.0736
         W2_avg         0.2583         0.0845         0.0736         1.0000

all-mpnet-base-v2 | key / note
------------------------------
                        W1_avg    W1_clue_ctx      Sentence1         W2_avg
         W1_avg         1.0000         0.4892         0.4375         0.3725
    W1_clue_ctx         0.4892         1.0000         0.9153         0.1897
      Sentence1         0.4375         0.9153         1.0000         0.2108
         W2_avg         0.3725         0.1897         0.2108         1.0000

In [9]:
# Summary: the key metric across all models and test pairs
key_df = pd.DataFrame(key_metric_rows)

print("=" * 70)
print("KEY METRIC: cos(W1_clue_ctx, Sentence1)")
print("=" * 70)
print()
print("This measures how much the model differentiates between the target")
print("word embedding and the full sentence embedding. Values close to 1.0")
print("mean the model treats them as nearly identical (poor differentiation).")
print("Lower values mean the model produces genuinely distinct representations.")
print()

# Pivot to show models as columns
pivot = key_df.pivot_table(
    index=["Definition", "Answer"],
    columns="Model",
    values="cos(W1_clue_ctx, Sentence1)",
)
print(pivot.to_string())
print()

# Mean per model
print("Mean cos(W1_clue_ctx, Sentence1) per model:")
means = key_df.groupby("Model")["cos(W1_clue_ctx, Sentence1)"].mean()
for model_short, mean_val in means.items():
    print(f"  {model_short:>25s}: {mean_val:.4f}")
======================================================================
KEY METRIC: cos(W1_clue_ctx, Sentence1)
======================================================================

This measures how much the model differentiates between the target
word embedding and the full sentence embedding. Values close to 1.0
mean the model treats them as nearly identical (poor differentiation).
Lower values mean the model produces genuinely distinct representations.

Model               CALE-MBERT-en  all-mpnet-base-v2  bge-base-en-v1.5
Definition Answer                                                     
bank       shore           0.6774             0.8804            0.9305
key        note            0.4848             0.9153            0.9447
letter     epistle         0.2778             0.9103            0.9381
plant      aster           0.6589             0.9039            0.9292

Mean cos(W1_clue_ctx, Sentence1) per model:
              CALE-MBERT-en: 0.5247
          all-mpnet-base-v2: 0.9025
           bge-base-en-v1.5: 0.9356

Section 2 Interpretation¶

The key finding is the cos(W1_clue_ctx, Sentence1) metric:

  • CALE produces a W1_clue_ctx embedding that is well separated from the full sentence embedding (cos ≈ 0.28–0.68 across the test pairs, mean ≈ 0.52). This means CALE's <t></t> delimiter mechanism successfully focuses the embedding on the target word's meaning within the sentence, producing a genuinely distinct representation from the full sentence embedding.

  • BGE-base and MPNet produce W1_clue_ctx embeddings (via token extraction) that are very close to the full sentence embedding (cos ≈ 0.90–0.93). This means the token-extracted word embedding is dominated by sentence-level information — the two embedding types are nearly redundant.

For our pipeline, this is critical: we need W1_clue_ctx (how the definition word's meaning shifts in clue context) and the full sentence embedding to carry different information. Only CALE achieves this differentiation.


Section 3: Allsense-Average Approach (CALE Only)¶

While CALE excels at word-in-context embeddings via <t></t>, its bare-word embeddings (encoding a word with no context) are unreliable for discrimination. This is expected: CALE was trained on words in context, so feeding it a lone word is outside its training distribution.

Our pipeline needs an "average" embedding for each definition and answer — a representation that captures the word's meaning across all its senses, without being biased toward any particular clue context. The solution:

Allsense-average approach:

  1. Look up all WordNet synsets for the word
  2. For each synset, construct a short context sentence from its definition text and first usage example (if available)
  3. Place <t></t> delimiters around the target word in each context
  4. Encode each context sentence with CALE
  5. Average the resulting embeddings

This keeps every input within CALE's training distribution (short context with one <t></t> pair) while producing a rich, sense-grounded "average" representation. It also gives us a natural way to get the most-common synset embedding (first synset) and least-common synset embedding (last synset) for sense-specific comparisons.

In [10]:
def build_synset_context(word, synset):
    """Build a context sentence for a word from a WordNet synset's definition
    and usage example, with <t></t> delimiters around the target word.

    Strategy:
    - Start with the synset's definition text.
    - If there's a usage example, append it after a semicolon.
    - If the target word appears in the combined text, wrap it with <t></t>.
    - If it doesn't appear (common for multi-word synset definitions), prepend
      the word as "<t>word</t>: context".

    This ensures every output has exactly one <t></t> pair around the target
    word, matching CALE's expected input format.
    """
    definition_text = synset.definition()
    examples = synset.examples()

    # Combine definition and first example
    if examples:
        context = f"{definition_text}; {examples[0]}"
    else:
        context = definition_text

    # Try to find the word in the context (case-insensitive)
    pattern = re.compile(re.escape(word), re.IGNORECASE)
    match = pattern.search(context)

    if match:
        start, end = match.start(), match.end()
        return context[:start] + "<t>" + context[start:end] + "</t>" + context[end:]
    else:
        # Word doesn't appear in context text — prepend it
        return f"<t>{word}</t>: {context}"


def get_allsense_embedding(model, word):
    """Compute the allsense-average embedding for a word using all its
    WordNet synsets.

    Also returns the per-synset context sentences and embeddings for
    inspection. Returns (allsense_avg, common_emb, obscure_emb, synset_info)
    where synset_info is a list of dicts with synset details.
    """
    lookup = word.lower().replace(" ", "_")
    synsets = wn.synsets(lookup)

    if not synsets:
        # Fallback: single delimited word (should not happen given our
        # Step 1 WordNet filter, but included for completeness)
        emb = model.encode(f"<t>{word}</t>")
        return emb, emb, emb, []

    synset_info = []
    embeddings = []

    for ss in synsets:
        ctx = build_synset_context(word, ss)
        emb = model.encode(ctx)
        embeddings.append(emb)
        synset_info.append({
            "synset": ss.name(),
            "pos": ss.pos(),
            "definition": ss.definition(),
            "context_sentence": ctx,
        })

    embeddings = np.array(embeddings)
    allsense_avg = embeddings.mean(axis=0)
    common_emb = embeddings[0]     # first synset = most common sense
    obscure_emb = embeddings[-1]   # last synset = least common sense

    return allsense_avg, common_emb, obscure_emb, synset_info
In [11]:
# Demonstrate the allsense approach on the "plant" / "aster" pair.
# This pair is ideal because "plant" has many senses (botanical,
# industrial, espionage) and "aster" is a specific flower.

cale_model = models[CALE_NAME]

# Get allsense embeddings and synset details for "plant"
w1_allsense, w1_common, w1_obscure, w1_synsets = get_allsense_embedding(
    cale_model, "plant"
)

print("Word: 'plant'")
print(f"Number of WordNet synsets: {len(w1_synsets)}")
print()
print("Synset context sentences used for allsense-average:")
print("=" * 70)
for i, info in enumerate(w1_synsets):
    tag = ""
    if i == 0:
        tag = "  [COMMON]"
    elif i == len(w1_synsets) - 1:
        tag = "  [OBSCURE]"
    print(f"  {info['synset']:30s} ({info['pos']}) {info['definition'][:50]}...{tag}")
    print(f"    Context: {info['context_sentence'][:80]}...")
    print()
Word: 'plant'
Number of WordNet synsets: 10

Synset context sentences used for allsense-average:
======================================================================
  plant.n.01                     (n) buildings for carrying on industrial labor...  [COMMON]
    Context: buildings for carrying on industrial labor; they built a large <t>plant</t> to m...

  plant.n.02                     (n) (botany) a living organism lacking the power of lo...
    Context: <t>plant</t>: (botany) a living organism lacking the power of locomotion...

  plant.n.03                     (n) an actor situated in the audience whose acting is ...
    Context: <t>plant</t>: an actor situated in the audience whose acting is rehearsed but se...

  plant.n.04                     (n) something planted secretly for discovery by anothe...
    Context: something <t>plant</t>ed secretly for discovery by another; the police used a pl...

  plant.v.01                     (v) put or set (seeds, seedlings, or plants) into the ...
    Context: put or set (seeds, seedlings, or <t>plant</t>s) into the ground; Let's plant flo...

  implant.v.01                   (v) fix or set securely or deeply...
    Context: fix or set securely or deeply; He <t>plant</t>ed a knee in the back of his oppon...

  establish.v.02                 (v) set up or lay the groundwork for...
    Context: <t>plant</t>: set up or lay the groundwork for; establish a new department...

  plant.v.04                     (v) place into a river...
    Context: place into a river; <t>plant</t> fish...

  plant.v.05                     (v) place something or someone in a certain position i...
    Context: place something or someone in a certain position in order to secretly observe or...

  plant.v.06                     (v) put firmly in the mind...  [OBSCURE]
    Context: put firmly in the mind; <t>Plant</t> a thought in the students' minds...

In [12]:
# Get allsense embeddings for "aster"
w2_allsense, w2_common, w2_obscure, w2_synsets = get_allsense_embedding(
    cale_model, "aster"
)

print("Word: 'aster'")
print(f"Number of WordNet synsets: {len(w2_synsets)}")
print()
print("Synset context sentences used for allsense-average:")
print("=" * 70)
for i, info in enumerate(w2_synsets):
    tag = ""
    if i == 0:
        tag = "  [COMMON]"
    elif i == len(w2_synsets) - 1:
        tag = "  [OBSCURE]"
    print(f"  {info['synset']:30s} ({info['pos']}) {info['definition'][:50]}...{tag}")
    print(f"    Context: {info['context_sentence'][:80]}...")
    print()
Word: 'aster'
Number of WordNet synsets: 2

Synset context sentences used for allsense-average:
======================================================================
  aster.n.01                     (n) any of various chiefly fall-blooming herbs of the ...  [COMMON]
    Context: any of various chiefly fall-blooming herbs of the genus <t>Aster</t> with showy ...

  aster.n.02                     (n) star-shaped structure formed in the cytoplasm of a...  [OBSCURE]
    Context: <t>aster</t>: star-shaped structure formed in the cytoplasm of a cell having fib...

In [13]:
# Build the full similarity matrix for the plant/aster pair.
# This includes allsense, common, obscure, clue-context, and bare-word
# embeddings — showing how the allsense approach compares to bare words.

# Get CALE clue-context embedding for "plant" in its clue
surface_plant = "Plant in a garden party"
delimited_plant = insert_cale_delimiters(surface_plant, "plant")
w1_clue_ctx = cale_model.encode(delimited_plant)

# Get bare-word embeddings (no context, no delimiters) for comparison
w1_bare = cale_model.encode("plant")
w2_bare = cale_model.encode("aster")

# Assemble all embedding types into a labeled dict
allsense_labels = [
    "W1_allsense", "W1_common", "W1_obscure",
    "W1_clue_ctx", "W1_bare",
    "W2_allsense", "W2_bare",
]
allsense_vecs = np.array([
    w1_allsense, w1_common, w1_obscure,
    w1_clue_ctx, w1_bare,
    w2_allsense, w2_bare,
])

sim_matrix = cosine_similarity(allsense_vecs)

print("Allsense Similarity Matrix: plant / aster (CALE only)")
print("=" * 120)

# Print header
header = f"{'':>15s}" + "".join(f"{l:>15s}" for l in allsense_labels)
print(header)

for i, label in enumerate(allsense_labels):
    row_str = f"{label:>15s}"
    for j in range(len(allsense_labels)):
        row_str += f"{sim_matrix[i, j]:>15.4f}"
    print(row_str)
Allsense Similarity Matrix: plant / aster (CALE only)
========================================================================================================================
                   W1_allsense      W1_common     W1_obscure    W1_clue_ctx        W1_bare    W2_allsense        W2_bare
    W1_allsense         1.0000         0.5401         0.7344         0.7159         0.4393         0.5821         0.3263
      W1_common         0.5401         1.0000         0.2291         0.4082         0.3011         0.2675         0.2546
     W1_obscure         0.7344         0.2291         1.0000         0.3929         0.2148         0.2391         0.1997
    W1_clue_ctx         0.7159         0.4082         0.3929         1.0000         0.5446         0.6593         0.4015
        W1_bare         0.4393         0.3011         0.2148         0.5446         1.0000         0.4570         0.7715
    W2_allsense         0.5821         0.2675         0.2391         0.6593         0.4570         1.0000         0.3419
        W2_bare         0.3263         0.2546         0.1997         0.4015         0.7715         0.3419         1.0000

Section 3 Interpretation: Allsense-Average for plant / aster¶

Key comparisons from the matrix above:

  1. W1_allsense vs W2_allsense — This is the new "average" comparison (replacing bare-word averages). It reflects the overall semantic overlap between all senses of "plant" and all senses of "aster". Because "plant" has a botanical sense and "aster" is a flower, there should be meaningful (but not overwhelming) similarity.

  2. W1_common vs W1_obscure — Sense differentiation. If these embeddings differ substantially, CALE is successfully capturing distinct meanings of "plant" (e.g., the common botanical sense vs. an obscure industrial or espionage sense). This is essential for our common-vs-obscure analysis.

  3. W1_clue_ctx vs W1_allsense — How much the clue context shifts the definition's embedding away from the sense-averaged baseline. A large shift suggests the clue is pushing "plant" toward a specific sense — the misdirection signal we're studying.

  4. W1_bare vs W2_bare — The old approach (bare-word discrimination). With CALE, this should show poor discrimination (similar cosine for related and unrelated words), confirming that bare-word embeddings are unreliable and the allsense-average is needed.
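The clue-context shift in point 3 can be probed one step further by asking which individual sense the clue embedding lands nearest to. A minimal sketch with synthetic vectors (the `nearest_sense` helper and the synset names below are illustrative, not pipeline code):

```python
import numpy as np


def nearest_sense(clue_ctx, sense_embs, sense_names):
    """Return the sense whose embedding is most similar to the clue-context
    embedding — a rough proxy for which sense the clue activates."""
    def cos(u, v):
        return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v)))
    sims = [cos(clue_ctx, e) for e in sense_embs]
    best = int(np.argmax(sims))
    return sense_names[best], sims[best]


# Synthetic stand-ins; real usage would pass the per-synset CALE embeddings
# collected by get_allsense_embedding.
rng = np.random.default_rng(0)
senses = rng.normal(size=(3, 16))
names = ["plant.n.01", "plant.n.02", "plant.n.03"]
ctx = senses[1] + 0.1 * rng.normal(size=16)  # a context close to sense 2
print(nearest_sense(ctx, senses, names))     # expected to pick "plant.n.02"
```

A misdirecting clue would tend to pull `nearest_sense` away from the first (most common) synset.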

In [14]:
# Quantify the key comparisons for easy reference

def cos(a, b):
    """Cosine similarity between two 1D vectors."""
    return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0, 0]


print("Key Comparisons for plant / aster (CALE):")
print("=" * 55)
print(f"  W1_allsense vs W2_allsense:  {cos(w1_allsense, w2_allsense):.4f}")
print(f"  W1_common   vs W1_obscure:   {cos(w1_common, w1_obscure):.4f}")
print(f"  W1_clue_ctx vs W1_allsense:  {cos(w1_clue_ctx, w1_allsense):.4f}")
print(f"  W1_bare     vs W2_bare:      {cos(w1_bare, w2_bare):.4f}")
print()

# Also show bare-word discrimination problem with an unrelated word
w_banana_bare = cale_model.encode("banana")
print("Bare-word discrimination problem (CALE without context):")
print(f"  cos(plant_bare, aster_bare):  {cos(w1_bare, w2_bare):.4f}  (related)")
print(f"  cos(plant_bare, banana_bare): {cos(w1_bare, w_banana_bare):.4f}  (unrelated)")
print("  -> These should differ substantially, but CALE's bare embeddings")
print("     cannot discriminate. The allsense approach solves this.")
print()

# Compare with allsense discrimination
w_banana_allsense, _, _, _ = get_allsense_embedding(cale_model, "banana")
print("Allsense discrimination (CALE with WordNet contexts):")
print(f"  cos(plant_allsense, aster_allsense):  {cos(w1_allsense, w2_allsense):.4f}  (related)")
print(f"  cos(plant_allsense, banana_allsense): {cos(w1_allsense, w_banana_allsense):.4f}  (unrelated)")
print("  -> The allsense approach should show better separation.")
Key Comparisons for plant / aster (CALE):
=======================================================
  W1_allsense vs W2_allsense:  0.5821
  W1_common   vs W1_obscure:   0.2291
  W1_clue_ctx vs W1_allsense:  0.7159
  W1_bare     vs W2_bare:      0.7715

Bare-word discrimination problem (CALE without context):
  cos(plant_bare, aster_bare):  0.7715  (related)
  cos(plant_bare, banana_bare): 0.7859  (unrelated)
  -> These should differ substantially, but CALE's bare embeddings
     cannot discriminate. The allsense approach solves this.

Allsense discrimination (CALE with WordNet contexts):
  cos(plant_allsense, aster_allsense):  0.5821  (related)
  cos(plant_allsense, banana_allsense): 0.4996  (unrelated)
  -> The allsense approach should show better separation.

Section 4: CALE Delimiter Validation¶

CALE's <t></t> mechanism is central to our pipeline. This section validates that it behaves as expected and tests edge cases:

  1. With vs. without delimiters: Does adding <t></t> produce a different embedding than the same sentence without delimiters? It should.
  2. Double delimiters: What happens if we accidentally insert two <t></t> pairs? This is outside CALE's training distribution and should degrade the embedding quality. We need to ensure our pipeline never does this.
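Because the pipeline must guarantee exactly one pair per input, a small upstream guard along these lines could enforce the invariant (a hypothetical helper, not part of this notebook's code):

```python
import re


def assert_single_target(text: str) -> str:
    """Raise unless `text` contains exactly one well-formed <t></t> pair."""
    opens = len(re.findall(r"<t>", text))
    closes = len(re.findall(r"</t>", text))
    if not (opens == closes == 1):
        raise ValueError(
            f"Expected exactly one <t></t> pair, got {opens} opening and "
            f"{closes} closing tags: {text!r}"
        )
    return text


assert_single_target("<t>Plant</t> in a garden party")            # passes
# assert_single_target("<t>Plant</t> in a <t>garden</t> party")   # would raise
```

Calling such a guard immediately before every `model.encode(...)` would catch double-delimiter bugs before they silently degrade embeddings.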
In [15]:
# Test 1: With vs. without delimiters
with_delim = "<t>Plant</t> in a garden party"
without_delim = "Plant in a garden party"

emb_with = cale_model.encode(with_delim)
emb_without = cale_model.encode(without_delim)

sim_with_vs_without = cos(emb_with, emb_without)

print("Test 1: Effect of <t></t> delimiters")
print("=" * 50)
print(f"  With delimiters:    '{with_delim}'")
print(f"  Without delimiters: '{without_delim}'")
print(f"  cos(with, without): {sim_with_vs_without:.4f}")
print()
if sim_with_vs_without < 0.95:
    print("  -> Delimiters produce a meaningfully different embedding.")
    print("     The model is responding to the <t></t> tags as intended.")
else:
    print("  -> WARNING: Delimiters have little effect. Check model loading.")
print()
Test 1: Effect of <t></t> delimiters
==================================================
  With delimiters:    '<t>Plant</t> in a garden party'
  Without delimiters: 'Plant in a garden party'
  cos(with, without): 0.6589

  -> Delimiters produce a meaningfully different embedding.
     The model is responding to the <t></t> tags as intended.

In [16]:
# Test 2: Double delimiters (should degrade)
# Wrapping two different words in <t></t> is outside CALE's training
# distribution (it was trained with exactly one <t></t> pair per input).
single_delim = "<t>Plant</t> in a garden party"
double_delim = "<t>Plant</t> in a <t>garden</t> party"

emb_single = cale_model.encode(single_delim)
emb_double = cale_model.encode(double_delim)

# Compare: the single-delim embedding should be a clean "plant-in-context"
# representation. The double-delim embedding is ambiguous (which word is
# the target?) and should differ.
sim_single_vs_double = cos(emb_single, emb_double)

# Also compare both to the bare definition word
emb_plant_bare = cale_model.encode("plant")
sim_single_vs_bare = cos(emb_single, emb_plant_bare)
sim_double_vs_bare = cos(emb_double, emb_plant_bare)

print("Test 2: Single vs. Double <t></t> delimiters")
print("=" * 55)
print(f"  Single: '{single_delim}'")
print(f"  Double: '{double_delim}'")
print()
print(f"  cos(single, double):          {sim_single_vs_double:.4f}")
print(f"  cos(single_delim, bare_word): {sim_single_vs_bare:.4f}")
print(f"  cos(double_delim, bare_word): {sim_double_vs_bare:.4f}")
print()
if sim_single_vs_double < 0.95:
    print("  -> Double delimiters produce a different (degraded) embedding.")
    print("     Our pipeline must ensure exactly one <t></t> pair per input.")
else:
    print("  -> Double delimiters had minimal effect. The model may be robust")
    print("     to this, but we should still avoid it for safety.")
Test 2: Single vs. Double <t></t> delimiters
=======================================================
  Single: '<t>Plant</t> in a garden party'
  Double: '<t>Plant</t> in a <t>garden</t> party'

  cos(single, double):          0.5434
  cos(single_delim, bare_word): 0.5446
  cos(double_delim, bare_word): 0.3416

  -> Double delimiters produce a different (degraded) embedding.
     Our pipeline must ensure exactly one <t></t> pair per input.

Summary¶

This notebook compared three embedding models for the clue_misdirection pipeline and documented the evidence supporting the team's choice of CALE (gabrielloiseau/CALE-MBERT-en).

Key Findings¶

  1. CALE produces genuinely distinct word-in-context embeddings via <t></t> delimiters. The cos(W1_clue_ctx, Sentence1) metric is well below 1.0 for CALE, showing that the target-word embedding and the full-sentence embedding carry different information. Standard models (BGE-base, MPNet) show cos ≈ 0.90–0.93 for the same metric, meaning their token-extracted word embeddings are dominated by sentence-level context and are nearly redundant with the sentence embedding.

  2. CALE's bare-word embeddings are unreliable, showing poor discrimination between related and unrelated words (e.g., cos(plant, aster) ≈ cos(plant, banana)). The allsense-average approach solves this by embedding each word in its WordNet synset contexts with <t></t> delimiters and averaging, keeping every input within CALE's training distribution.

  3. The correct model identifier is gabrielloiseau/CALE-MBERT-en (1024-dim), not oskar-h/cale-modernbert-base as originally planned in the design document. The actual models are published by the CALE paper's authors under the gabrielloiseau/ namespace.

  4. Sentence1 (full clue embedding) is dropped from the pipeline. For CALE, the sentence embedding without delimiters behaves like an ungrounded sentence embedding rather than a word-sense embedding. The contextualized definition embedding (W1_clue_ctx) directly captures what we need for the misdirection analysis. Removing Sentence1 reduces embedding types from 8 to 7 and pairwise cosine similarities from 28 to 21 (15 context-free + 6 context-informed).

  5. CALE's <t></t> delimiter mechanism is validated: single delimiters produce meaningfully different embeddings from undelimited input, and double delimiters (outside training distribution) produce degraded results, confirming the pipeline must use exactly one <t></t> pair per input.
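The 8-to-7 arithmetic in finding 4 is easy to verify with a quick combinatorial sketch (the seven type names below are placeholders; the actual set is defined in PLAN.md Step 2):

```python
from itertools import combinations

# Placeholder embedding-type names — the real list lives in PLAN.md Step 2.
types = ["W1_clue_ctx", "W1_allsense", "W1_common", "W1_obscure",
         "W2_allsense", "W2_common", "W2_obscure"]

pairs = list(combinations(types, 2))                        # C(7, 2) = 21 total
ctx_pairs = [p for p in pairs if "W1_clue_ctx" in p]        # context-informed
free_pairs = [p for p in pairs if "W1_clue_ctx" not in p]   # context-free

print(len(pairs), len(free_pairs), len(ctx_pairs))  # 21 15 6
```

With Sentence1 retained there would have been C(8, 2) = 28 pairs, matching the counts quoted above.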

Decisions Documented¶

These findings are recorded in:

  • DECISIONS.md — Decision 1 (embedding model), Decision 13 (drop Sentence1), Decision 14 (allsense-average)
  • FINDINGS.md — Step 2 model investigation findings