import os
import re
import unicodedata
import numpy as np
import pandas as pd
from pathlib import Path

from sentence_transformers import SentenceTransformer
import umap

import hdbscan
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

matplotlib.rcParams['figure.dpi'] = 120


try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

IS_GREATLAKES = 'SLURM_JOB_ID' in os.environ

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues')
elif IS_GREATLAKES:
    # Update YOUR_UNIQNAME to your actual UMich uniqname
    PROJECT_ROOT = Path('/home/nycantwe/ccc_project')
else:
    # Local: notebooks/ is one level below project root
    PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
FIGURES_DIR = OUTPUT_DIR / 'figures'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Larger batches are faster; reduce to 64 if CUDA OOM on Colab T4
BATCH_SIZE = 32 if IS_COLAB else 256

env_name = 'Colab' if IS_COLAB else ('Great Lakes' if IS_GREATLAKES else 'Local')
print(f'Environment : {env_name}')
print(f'Project root: {PROJECT_ROOT}')
print(f'Data dir    : {DATA_DIR}')
print(f'Batch size  : {BATCH_SIZE}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Environment : Colab
Project root: /content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues
Data dir    : /content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues/data
Batch size  : 32


# Set the global random seed for reproducibility.
# Every stochastic step (UMAP, HDBSCAN soft assignments) must use this seed
# so results match across runs — a hard requirement from CLAUDE.md.
np.random.seed(42)
print('Random seed set to 42.')

Random seed set to 42.


required = {
    'clues_raw.csv': DATA_DIR / 'clues_raw.csv',
}

all_present = True
for name, path in required.items():
    status = 'OK     ' if path.exists() else 'MISSING'
    print(f'  [{status}] {name}')
    if not path.exists():
        all_present = False

if not all_present:
    raise FileNotFoundError(
        'Required input file missing.\n'
        'Run 00_data_extraction.ipynb first to produce clues_raw.csv.'
    )
print('\nRequired file present. Proceeding.')

  [OK     ] clues_raw.csv

Required file present. Proceeding.


df_clues = pd.read_csv(DATA_DIR / 'clues_raw.csv')
print(f'Loaded {len(df_clues):,} rows from clues_raw.csv')
print(f'Columns: {df_clues.columns.tolist()}')

def normalize(s: str) -> str:
    """Remove accents and punctuation, convert to lowercase.
    Keeps letters, digits, and spaces only — matches 01b_data_cleaning logic."""
    s_normalized = ''.join(
        ch for ch in unicodedata.normalize('NFD', s)
        if unicodedata.category(ch).startswith(('L', 'N', 'Zs'))
    ).lower()
    return s_normalized

Loaded 660,613 rows from clues_raw.csv
Columns: ['clue_id', 'clue', 'answer', 'definition', 'clue_number', 'puzzle_date', 'puzzle_name', 'source_url', 'source']


# Drop rows where clue, answer, or definition is missing; null definitions are kept (01b behavior)
df_clues.dropna(subset=['clue', 'answer', 'definition'], inplace=True)

# Strip trailing letter-count parentheses, e.g. "(5,2,3)" -> preserve surface only
df_clues['surface'] = df_clues['clue'].astype(str).apply(
    lambda x: re.sub(r'\s*\(\d+(?:[,\s-]+\d+)*\)$', '', x)
)

df_clues['surface_normalized']    = df_clues['surface'].astype(str).apply(normalize)
df_clues['answer_normalized']     = df_clues['answer'].astype(str).apply(normalize)
df_clues['definition_normalized'] = df_clues['definition'].astype(str).apply(normalize)

print(f'Rows after dropping null clue/answer: {len(df_clues):,}')

Rows after dropping null clue/answer: 510,886


# Extract the required answer format from the parenthetical at the end of the clue
df_clues['req_ans_format'] = df_clues['clue'].astype(str).str.extract(
    r'\((\d+(?:[,\s-]+\d+)*)\)$'
)
df_clues['req_ans_letter_count'] = df_clues['req_ans_format'].apply(
    lambda x: sum(int(n) for n in re.findall(r'\d+', str(x))) if pd.notnull(x) else 0
)

def check_format_match(row):
    answer     = str(row['answer'])
    req_format = str(row['req_ans_format'])
    required_lengths = [int(n) for n in re.findall(r'\d+', req_format)]
    answer_segments  = re.findall(r'[a-zA-Z0-9]+', answer)
    answer_lengths   = [len(seg) for seg in answer_segments]
    return required_lengths == answer_lengths

df_clues['ans_format_valid'] = df_clues.apply(check_format_match, axis=1)
print(f'ans_format_valid: {df_clues["ans_format_valid"].sum():,} / {len(df_clues):,} rows')

ans_format_valid: 473,397 / 510,886 rows


# Build a fast lookup dictionary: clue_id -> normalized surface
clue_lookup = df_clues.set_index('clue_id')['surface_normalized'].to_dict()

def verify_clues(definition, clue_ids):
    """Return subset of clue_ids where `definition` appears word-boundary-intact."""
    if not isinstance(clue_ids, list):
        clue_ids = []
    if not clue_ids:
        return []
    pattern = rf'\b{re.escape(str(definition))}\b'
    verified = []
    for cid in clue_ids:
        surface = clue_lookup.get(cid)
        if surface and re.search(pattern, surface):
            verified.append(cid)
    return verified


# 1. Build definition -> clue_ids mapping
definition_to_clues = (
    df_clues.groupby('definition_normalized')['clue_id']
    .apply(list)
    .to_dict()
)

# 2. Verify once per unique definition
verified_map = {}

for definition, clue_ids in definition_to_clues.items():
    verified_map[definition] = verify_clues(definition, clue_ids)

# 3. Map results back
df_clues['clue_ids'] = df_clues['definition_normalized'].map(definition_to_clues)
df_clues['clue_ids_verified'] = df_clues['definition_normalized'].map(verified_map)
df_clues['num_clues_verified'] = df_clues['clue_ids_verified'].map(len)

# 4. Print summary
print(f'Unique definitions total   : {df_clues["definition_normalized"].nunique():,}')
print(f'Unique definitions verified: {df_clues[df_clues["num_clues_verified"] > 0]["definition_normalized"].nunique():,}')

Unique definitions total   : 229,604
Unique definitions verified: 196,636


# Keep only rows where at least one clue verified the definition
df_export = df_clues[df_clues['num_clues_verified'] > 0].copy()
df_export['clue_id'] = df_export['clue_id'].astype(int)

# Drop rows where 'definition' is erroneously the entire clue.
df_export = df_export[df_export['surface_normalized']!=df_export['definition_normalized']]

# Drop rows with string value 'nan' as the definition value.
df_export['definition'] = df_export['definition'].replace('nan', np.nan)
df_export = df_export.dropna(subset=['definition'])

# Select and rename to the final two-column schema
df_export = (
    df_export[['clue_id', 'definition_normalized']]
    .rename(columns={'definition_normalized': 'definition'})
    .replace('nan', np.nan)
    .replace('', np.nan)
    .dropna(subset=['clue_id', 'definition'])
)

print(f'Unique clue_ids    : {df_export["clue_id"].nunique():,}')
print(f'Unique definitions : {df_export["definition"].nunique():,}')

df_export.to_csv(DATA_DIR / 'verified_definition_clues.csv', index=False)
print(f'\nSaved {len(df_export):,} rows to verified_definition_clues.csv')

Unique clue_ids    : 464,054
Unique definitions : 185,160

Saved 464,054 rows to verified_definition_clues.csv


# Some definitions of exceptional length.
df_export.sort_values(by='definition', key=lambda col: col.str.len(), ascending=False).head(10)


# Example long-form definition that is correctly represented.
print(df_clues[df_clues['clue_id']==504161].surface_normalized.values)
print(df_clues[df_clues['clue_id']==504161].definition_normalized.values)
print(df_clues[df_clues['clue_id']==504161].answer.values)

['moving article about posh woman instrumental in romes transition from a monarchy to a republic']
['woman instrumental in romes transition from a monarchy to a republic']
['LUCRETIA']


df_verified_defs = pd.read_csv(DATA_DIR / 'verified_definition_clues.csv')
print(f'verified_definition_clues.csv: {len(df_verified_defs):,} rows')
print(f'Unique clue_ids   : {df_verified_defs["clue_id"].nunique():,}')
print(f'Unique definitions: {df_verified_defs["definition"].nunique():,}')
print(f'\nSample rows:')
print(df_verified_defs.sample(5, random_state=42).to_string(index=False))

# Unique definition strings — preserving original case (matches indicator handling)
unique_def_strings = df_verified_defs['definition'].str.strip().unique()
df_definitions = pd.DataFrame({'definition': sorted(unique_def_strings)})
print(f'\nUnique definition strings for embedding: {len(df_definitions):,}')

verified_definition_clues.csv: 464,054 rows
Unique clue_ids   : 464,054
Unique definitions: 185,160

Sample rows:
 clue_id                definition
  131756                    finger
  614135                      fish
  442849                    floors
  129419 something to fall back on
  445486  accommodating of callers

Unique definition strings for embedding: 185,061


defs_csv_path = DATA_DIR / 'definitions_unique.csv'
df_definitions.to_csv(defs_csv_path, index=False)

print(f'Saved {len(df_definitions):,} unique definitions to:')
print(f'  {defs_csv_path}')
print(f'\nLength distribution:')
lengths = df_definitions['definition'].str.split().str.len()
print(lengths.describe().to_string())

Saved 185,061 unique definitions to:
  /content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues/data/definitions_unique.csv

Length distribution:
count    185061.000000
mean          2.707632
std           1.285482
min           1.000000
25%           2.000000
50%           2.000000
75%           3.000000
max          16.000000


# Load the same BGE-M3 model used in NB 02 for indicators.
# First run: ~2.3 GB download. Subsequent runs: loads from local cache in seconds.
model = SentenceTransformer('BAAI/bge-m3')

print(f'Model loaded : BAAI/bge-m3')
print(f'Embedding dim: {model.get_sentence_embedding_dimension()}')


# Load definitions_unique.csv (works even if kernel was restarted after Section 1)
df_definitions = pd.read_csv(DATA_DIR / 'definitions_unique.csv')
definitions_list = df_definitions['definition'].tolist()
print(f'Encoding {len(definitions_list):,} definitions  (batch_size={BATCH_SIZE})...')

embeddings_defs = model.encode(
    definitions_list,
    batch_size=BATCH_SIZE,
    show_progress_bar=True
)

print(f'\nEmbeddings shape: {embeddings_defs.shape}')
print(f'Dtype           : {embeddings_defs.dtype}')
print(f'Memory          : {embeddings_defs.nbytes / 1024**2:.1f} MB')


emb_path   = DATA_DIR / 'embeddings_bge_m3_definitions.npy'
index_path = DATA_DIR / 'definition_index.csv'

np.save(emb_path, embeddings_defs)
df_definitions.to_csv(index_path, index=True)  # integer row index is the key

print(f'Saved embeddings : {emb_path.name}  {embeddings_defs.shape}')
print(f'Saved index      : {index_path.name}  ({len(df_definitions):,} rows)')

Saved embeddings : embeddings_bge_m3_definitions.npy  (185061, 1024)
Saved index      : definition_index.csv  (185,061 rows)


# Reload and verify: shapes must match, and embeddings must be non-zero
emb_check   = np.load(DATA_DIR / 'embeddings_bge_m3_definitions.npy')
index_check = pd.read_csv(DATA_DIR / 'definition_index.csv', index_col=0)

assert emb_check.shape[0] == len(index_check), \
    f'Row mismatch: embeddings={emb_check.shape[0]}, index={len(index_check)}'
assert emb_check.shape[1] == 1024, \
    f'Expected 1024 dimensions, got {emb_check.shape[1]}'

sample_norm = np.linalg.norm(emb_check[0])
print(f'Verification passed.')
print(f'Embeddings : {emb_check.shape}  (N definitions x 1024 dims)')
print(f'Index rows : {len(index_check):,}')
print(f'Row-0 L2 norm: {sample_norm:.4f}  (should be ~1 for normalised BGE-M3 output)')

Verification passed.
Embeddings : (185061, 1024)  (N definitions x 1024 dims)
Index rows : 185,061
Row-0 L2 norm: 1.0000  (should be ~1 for normalised BGE-M3 output)


# Load embeddings from file so this section can run independently
# (e.g., if the kernel was restarted after completing Section 2)
emb_path = DATA_DIR / 'embeddings_bge_m3_definitions.npy'
assert emb_path.exists(), (
    f'Missing: {emb_path}\nRun Section 2 first to generate definition embeddings.'
)
embeddings_defs = np.load(emb_path)
print(f'Loaded embeddings for UMAP: {embeddings_defs.shape}')

Loaded embeddings for UMAP: (185061, 1024)


# UMAP parameters are fixed to match NB 03 exactly
UMAP_PARAMS = dict(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)

print('Fitting 10D UMAP (for clustering input)...')
print(f'Parameters: n_neighbors={UMAP_PARAMS["n_neighbors"]}, '
      f'min_dist={UMAP_PARAMS["min_dist"]}, metric={UMAP_PARAMS["metric"]}')

reducer_10d = umap.UMAP(n_components=10, **UMAP_PARAMS)
embeddings_umap_10d = reducer_10d.fit_transform(embeddings_defs)

print(f'\n10D UMAP complete. Shape: {embeddings_umap_10d.shape}')
print(f'Value range: [{embeddings_umap_10d.min():.3f}, {embeddings_umap_10d.max():.3f}]')

Fitting 10D UMAP (for clustering input)...
Parameters: n_neighbors=15, min_dist=0.1, metric=cosine

/usr/local/lib/python3.12/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(

10D UMAP complete. Shape: (185061, 10)
Value range: [-1.695, 11.346]


print('Fitting 2D UMAP (for visualisation only)...')

reducer_2d = umap.UMAP(n_components=2, **UMAP_PARAMS)
embeddings_umap_2d = reducer_2d.fit_transform(embeddings_defs)

print(f'2D UMAP complete. Shape: {embeddings_umap_2d.shape}')

Fitting 2D UMAP (for visualisation only)...

/usr/local/lib/python3.12/dist-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(

2D UMAP complete. Shape: (185061, 2)


# Save both reductions. File names are prefixed with "definitions_" to distinguish
# from the indicator UMAP files (embeddings_umap_10d.npy, embeddings_umap_2d.npy).
np.save(DATA_DIR / 'embeddings_umap_10d_definitions.npy', embeddings_umap_10d)
np.save(DATA_DIR / 'embeddings_umap_2d_definitions.npy',  embeddings_umap_2d)

print('Saved UMAP reductions:')
print(f'  embeddings_umap_10d_definitions.npy  {embeddings_umap_10d.shape}')
print(f'  embeddings_umap_2d_definitions.npy   {embeddings_umap_2d.shape}')

Saved UMAP reductions:
  embeddings_umap_10d_definitions.npy  (185061, 10)
  embeddings_umap_2d_definitions.npy   (185061, 2)


# Load UMAP outputs and definition names from file.
# This makes Section 4 independently runnable after Sections 2–3 have completed.
embeddings_umap_10d = np.load(DATA_DIR / 'embeddings_umap_10d_definitions.npy')
embeddings_umap_2d  = np.load(DATA_DIR / 'embeddings_umap_2d_definitions.npy')
df_def_index = pd.read_csv(DATA_DIR / 'definition_index.csv', index_col=0)
definition_names = df_def_index['definition'].values  # aligned with UMAP rows

print(f'10D UMAP  : {embeddings_umap_10d.shape}')
print(f'2D UMAP   : {embeddings_umap_2d.shape}')
print(f'Definitions: {len(definition_names):,}')

10D UMAP  : (185061, 10)
2D UMAP   : (185061, 2)
Definitions: 185,061


print('Running HDBSCAN  eps=0.0  min_cluster_size=10 ...')

clusterer_hdbscan = hdbscan.HDBSCAN(
    min_cluster_size=10,
    cluster_selection_epsilon=0.0,  # no merging — finds finest structure
    allow_single_cluster=False
)
labels_hdbscan = clusterer_hdbscan.fit_predict(embeddings_umap_10d)

n_clusters_h = len(set(labels_hdbscan)) - (1 if -1 in labels_hdbscan else 0)
n_noise_h    = int(np.sum(labels_hdbscan == -1))
noise_pct_h  = 100 * n_noise_h / len(labels_hdbscan)
print(f'Clusters    : {n_clusters_h}')
print(f'Noise points: {n_noise_h:,}  ({noise_pct_h:.1f}%)')

Running HDBSCAN  eps=0.0  min_cluster_size=10 ...
Clusters    : 2726
Noise points: 81,382  (44.0%)


# Silhouette and Davies-Bouldin are computed on non-noise points only.
# Including noise points (label=-1) would unfairly penalise the score.
mask_h = labels_hdbscan != -1
sil_h, db_h = np.nan, np.nan

if len(set(labels_hdbscan[mask_h])) >= 2:
    sil_h = silhouette_score(embeddings_umap_10d[mask_h], labels_hdbscan[mask_h])
    db_h  = davies_bouldin_score(embeddings_umap_10d[mask_h], labels_hdbscan[mask_h])
else:
    print('Warning: fewer than 2 clusters found — metrics undefined.')

print(f'Metrics (on {mask_h.sum():,} non-noise points):')
print(f'  Silhouette     : {sil_h:.4f}')
print(f'  Davies-Bouldin : {db_h:.4f}')

# Save cluster label assignments
hdbscan_out = DATA_DIR / 'definitions_cluster_labels_hdbscan_eps_0p0.csv'
pd.DataFrame({'definition': definition_names, 'cluster': labels_hdbscan}).to_csv(
    hdbscan_out, index=False
)
print(f'\nSaved: {hdbscan_out.name}')

Metrics (on 103,679 non-noise points):
  Silhouette     : 0.5943
  Davies-Bouldin : 0.5202

Saved: definitions_cluster_labels_hdbscan_eps_0p0.csv


# ============================================================
# Downsampling Definition Embeddings
# ============================================================
#
# Rationale:
# Ward's agglomerative clustering requires O(n^2) memory.
# With 185,061 embeddings, this is computationally infeasible.
#
# To maintain symmetry with the ~12k indicator embeddings used
# in prior stages while preserving representativeness, we
# randomly downsample a fixed percentage of the full dataset.
#
# The sampling is:
# - Random
# - Reproducible (seed=42)
# - Without replacement
#
# This preserves the global semantic distribution while making
# Ward clustering tractable.
# ============================================================

n_total = embeddings_umap_10d.shape[0]
print(f"Total definitions: {n_total:,}")

target_percent = 0.55   # max size possible for running on Great Lakes cluster
target_size = int(n_total * target_percent)
print(f"Target percent: {target_percent*100:.1f}%")
print(f"Target sample size: {target_size:,}")

indices = np.random.choice(
    n_total,
    size=target_size,
    replace=False
)

embeddings_umap_10d_sample = embeddings_umap_10d[indices]
embeddings_umap_2d_sample = embeddings_umap_2d[indices]
definition_names_sample = definition_names[indices]
print(f"Downsampled shape: {embeddings_umap_10d_sample.shape}")

# Save indices for reproducibility
np.save(DATA_DIR / 'definition_downsample_indices.npy', indices)
print("Downsampling complete.")

Total definitions: 185,061
Target percent: 55.0%
Target sample size: 101,783
Downsampled shape: (101783, 10)
Downsampling complete.


K_VALUES = [8, 10, 34]
agglo_metrics = []

for k in K_VALUES:
    clusterer = AgglomerativeClustering(n_clusters=k, linkage='ward')
    labels_k  = clusterer.fit_predict(embeddings_umap_10d_sample)

    sil = silhouette_score(embeddings_umap_10d_sample, labels_k)
    db  = davies_bouldin_score(embeddings_umap_10d_sample, labels_k)
    agglo_metrics.append({'method': 'Agglomerative', 'k': k,
                          'n_clusters': k, 'silhouette': round(sil, 4),
                          'davies_bouldin': round(db, 4)})

    # Save label assignments to CSV
    out_path = DATA_DIR / f'definitions_cluster_labels_agglo_k{k}.csv'
    pd.DataFrame({'definition': definition_names_sample, 'cluster': labels_k}).to_csv(
        out_path, index=False
    )
    print(f'  k={k:>3}: silhouette={sil:.4f}  DB={db:.4f}  → saved {out_path.name}')

  k=  8: silhouette=0.1640  DB=1.5789  → saved definitions_cluster_labels_agglo_k8.csv
  k= 10: silhouette=0.1718  DB=1.4916  → saved definitions_cluster_labels_agglo_k10.csv
  k= 34: silhouette=0.1853  DB=1.5083  → saved definitions_cluster_labels_agglo_k34.csv


hdbscan_row = {
    'method': 'HDBSCAN', 'k': float('nan'),
    'n_clusters': n_clusters_h, 'n_noise': n_noise_h,
    'noise_pct': round(noise_pct_h, 2),
    'silhouette': round(sil_h, 4) if not np.isnan(sil_h) else float('nan'),
    'davies_bouldin': round(db_h, 4) if not np.isnan(db_h) else float('nan'),
}

df_def_metrics = pd.DataFrame([hdbscan_row] + agglo_metrics)
df_def_metrics['n_noise']   = df_def_metrics.get('n_noise', pd.NA)
df_def_metrics['noise_pct'] = df_def_metrics.get('noise_pct', pd.NA)

metrics_out = OUTPUT_DIR / 'definitions_clustering_metrics.csv'
df_def_metrics.to_csv(metrics_out, index=False)

print('Definitions clustering metrics:')
print(df_def_metrics.to_string(index=False))
print(f'\nSaved to: {metrics_out.name}')

Definitions clustering metrics:
       method    k  n_clusters  n_noise  noise_pct  silhouette  davies_bouldin
      HDBSCAN  NaN        2726  81382.0      43.98      0.5943          0.5202
Agglomerative  8.0           8      NaN        NaN      0.1640          1.5789
Agglomerative 10.0          10      NaN        NaN      0.1718          1.4916
Agglomerative 34.0          34      NaN        NaN      0.1853          1.5083

Saved to: definitions_clustering_metrics.csv


ind_metrics_path = DATA_DIR / 'clustering_metrics_summary.csv'
df_ind_metrics   = None

if ind_metrics_path.exists():
    df_ind_metrics = pd.read_csv(ind_metrics_path)
    print(f'Loaded indicator metrics: {df_ind_metrics.shape}')
    print(f'Columns: {df_ind_metrics.columns.tolist()}')
    print()
    print(df_ind_metrics.to_string(index=False))
else:
    print('WARNING: clustering_metrics_summary.csv not found.')
    print(f'Expected at: {ind_metrics_path}')
    print('Run 04_clustering.ipynb first to generate indicator metrics.')
    print('Proceeding with definitions-only analysis.')

Loaded indicator metrics: (30, 8)
Columns: ['method', 'parameters', 'n_clusters', 'n_noise', 'noise_pct', 'silhouette', 'davies_bouldin', 'calinski_harabasz']

              method                      parameters  n_clusters  n_noise  noise_pct  silhouette  davies_bouldin  calinski_harabasz
             HDBSCAN    min_cluster_size=10, eps=0.0         282     4212  33.370306    0.630992        0.470028                NaN
             HDBSCAN  min_cluster_size=10, eps=0.214         244     3783  29.971478    0.584483        0.508532                NaN
             HDBSCAN  min_cluster_size=10, eps=0.428          62     1306  10.347013   -0.117756        1.047558                NaN
             HDBSCAN  min_cluster_size=10, eps=0.642          17      313   2.479797   -0.296464        0.976518                NaN
             HDBSCAN min_cluster_size=10, eps=0.7788          11      114   0.903185   -0.185984        0.775157                NaN
             HDBSCAN  min_cluster_size=10, eps=0.856          10       28   0.221835   -0.167519        0.797766                NaN
             HDBSCAN   min_cluster_size=10, eps=1.07           6        0   0.000000   -0.120162        0.782393                NaN
             HDBSCAN  min_cluster_size=10, eps=1.284           6        0   0.000000   -0.120162        0.782393                NaN
             HDBSCAN  min_cluster_size=10, eps=1.498           4        0   0.000000    0.230049        0.549086                NaN
             HDBSCAN min_cluster_size=10, eps=1.9327           4        0   0.000000    0.230049        0.549086                NaN
             HDBSCAN min_cluster_size=10, eps=2.2334           3        0   0.000000    0.385651        0.461465                NaN
             HDBSCAN min_cluster_size=10, eps=2.4729           3        0   0.000000    0.385651        0.461465                NaN
             HDBSCAN min_cluster_size=10, eps=2.6847           3        0   0.000000    0.385651        0.461465                NaN
Agglomerative (Ward)                             k=4           4        0   0.000000    0.246021        1.456096        3909.220873
Agglomerative (Ward)                             k=6           6        0   0.000000    0.259034        1.322581        3920.874006
Agglomerative (Ward)                             k=8           8        0   0.000000    0.272435        1.267374        3897.154308
Agglomerative (Ward)                             k=9           9        0   0.000000    0.288711        1.184741        3844.271002
Agglomerative (Ward)                            k=10          10        0   0.000000    0.298516        1.163678        3788.683596
Agglomerative (Ward)                            k=11          11        0   0.000000    0.281040        1.184305        3654.531355
Agglomerative (Ward)                            k=12          12        0   0.000000    0.281408        1.225761        3552.264089
Agglomerative (Ward)                            k=16          16        0   0.000000    0.278622        1.274775        3319.894167
Agglomerative (Ward)                            k=20          20        0   0.000000    0.295055        1.193753        3203.470119
Agglomerative (Ward)                            k=26          26        0   0.000000    0.304119        1.172194        3072.139975
Agglomerative (Ward)                            k=34          34        0   0.000000    0.321995        1.067513        2992.948844
Agglomerative (Ward)                            k=50          50        0   0.000000    0.343506        1.010616        2944.236720
Agglomerative (Ward)                            k=75          75        0   0.000000    0.370112        1.002773        2838.968311
Agglomerative (Ward)                           k=100         100        0   0.000000    0.377600        0.978082        2786.891253
Agglomerative (Ward)                           k=150         150        0   0.000000    0.388251        0.959118        2734.306732
Agglomerative (Ward)                           k=200         200        0   0.000000    0.418647        0.890381        2744.780282
Agglomerative (Ward)                           k=250         250        0   0.000000    0.431290        0.883640        2801.724667


if df_ind_metrics is not None:
    # Locate the agglomerative rows — handle varying column naming conventions
    method_col = 'method' if 'method' in df_ind_metrics.columns else None
    k_col      = 'k' if 'k' in df_ind_metrics.columns else 'n_clusters'

    mask_agglo = (
        df_ind_metrics[method_col].str.lower().str.contains('agglo', na=False)
        if method_col else pd.Series([True] * len(df_ind_metrics))
    )
    ind_agglo = (
        df_ind_metrics[mask_agglo & df_ind_metrics[k_col].isin(K_VALUES)]
        [[k_col, 'silhouette', 'davies_bouldin']]
        .rename(columns={'silhouette': 'ind_sil', 'davies_bouldin': 'ind_db', k_col: 'k'})
    )
    def_agglo = (
        df_def_metrics[df_def_metrics['method'] == 'Agglomerative']
        [['k', 'silhouette', 'davies_bouldin']]
        .rename(columns={'silhouette': 'def_sil', 'davies_bouldin': 'def_db'})
    )
    cmp = pd.merge(ind_agglo.astype({'k': int}), def_agglo.astype({'k': int}), on='k')
    cmp['sil_diff'] = (cmp['ind_sil'] - cmp['def_sil']).round(4)
    cmp['db_diff']  = (cmp['def_db'] - cmp['ind_db']).round(4)
    print('=== Indicator vs. Definition Clustering (Agglomerative Ward\'s) ===')
    print('(sil_diff > 0 → indicators cluster better; db_diff > 0 → definitions worse)')
    print(cmp.to_string(index=False))
else:
    print('Indicator metrics unavailable — run 04_clustering.ipynb first.')

=== Indicator vs. Definition Clustering (Agglomerative Ward's) ===
(sil_diff > 0 → indicators cluster better; db_diff > 0 → definitions worse)
 k  ind_sil   ind_db  def_sil  def_db  sil_diff  db_diff
 8 0.272435 1.267374   0.1640  1.5789    0.1084   0.3115
10 0.298516 1.163678   0.1718  1.4916    0.1267   0.3279
34 0.321995 1.067513   0.1853  1.5083    0.1367   0.4408


labels_k10 = pd.read_csv(
    DATA_DIR / 'definitions_cluster_labels_agglo_k10.csv'
)['cluster'].values

fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(
    embeddings_umap_2d_sample[:, 0], embeddings_umap_2d_sample[:, 1],
    c=labels_k10, cmap='tab10', s=4, alpha=0.55
)
plt.colorbar(scatter, ax=ax, label='Cluster')
ax.set_title('Definition Embeddings — UMAP 2D, Agglomerative Ward k=10', fontsize=13)
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.tight_layout()

fig_path = FIGURES_DIR / 'definitions_umap_agglo_k10.png'
plt.savefig(fig_path, dpi=150)
plt.show()
print(f'Saved: {fig_path}')

Saved: /home/nycantwe/ccc_project/outputs/figures/definitions_umap_agglo_k10.png


K_INSPECT = 10
labels_inspect = pd.read_csv(
    DATA_DIR / f'definitions_cluster_labels_agglo_k{K_INSPECT}.csv'
)['cluster'].values

print(f'=== Centroid-Nearest Definitions per Cluster (k={K_INSPECT}) ===\n')
for cluster_id in range(K_INSPECT):
    mask_c    = labels_inspect == cluster_id
    centroid  = embeddings_umap_10d_sample[mask_c].mean(axis=0)
    dists     = np.linalg.norm(embeddings_umap_10d_sample[mask_c] - centroid, axis=1)
    top5_idx  = np.argsort(dists)[:5]
    top5_defs = definition_names_sample[mask_c][top5_idx]
    n_in_clust = mask_c.sum()
    print(f'Cluster {cluster_id}  (n={n_in_clust:,}):')
    for d in top5_defs:
        print(f'    · {d}')
    print()

=== Centroid-Nearest Definitions per Cluster (k=10) ===

Cluster 0  (n=18,799):
    · they ruminate
    · they may be high
    · rays
    · this bloke
    · get cracking with this

Cluster 1  (n=9,570):
    · with measles
    · inyourface
    · it could be critical
    · what faces
    · get bent

Cluster 2  (n=16,473):
    · aloof
    · to pry
    · ones highmaintenance
    · the most bent
    · faced

Cluster 3  (n=13,133):
    · entanglements
    · confab
    · commie
    · ray
    · ones peaked

Cluster 4  (n=13,347):
    · be
    · comb
    · hammer this
    · giddyup
    · potential share

Cluster 5  (n=4,666):
    · that might get trump high
    · a ray
    · they may be wearing rings
    · as shown by face
    · they are balmy

Cluster 6  (n=9,299):
    · compere
    · on the dot
    · entrenched peak
    · enticed
    · enfeebled

Cluster 7  (n=5,121):
    · have an inkling of
    · facet
    · one may generate
    · one on top in something vulgar
    · it could be dramatic

Cluster 8  (n=5,785):
    · tendency to pry
    · compel
    · encumbered
    · they might be tall
    · it may create space

Cluster 9  (n=5,590):
    · be in cahoots
    · another bloke
    · crowning point
    · a tyrants thing
    · he could give you a ring

	clue_id	definition
422689	422690	song message why not be our ruler one day ver...
139087	139088	what leading characters from harlequins and hu...
250678	250679	start to swiftly forget when heated romance br...
389901	389902	for him and four others here a fantastic route...
549309	549310	these decisions conveyed by sherlock holmes to...
504160	504161	woman instrumental in romes transition from a ...
512141	512142	a description of clocks having no similarities...
292202	292203	to recap with it roman somehow encapsulated h...
161880	161881	migrants originally one saw in fringes of camr...
240926	240927	thunderous and grey in conclusion cumulonimbus...

Reduction	Dimensions	Purpose
10D	10	Input to clustering algorithms (preserves structure; reduces noise)
2D	2	Scatter plots for visualisation only

Metric	Indicators (NB 04)	Definitions (this notebook)	Interpretation
Silhouette	[value from NB 04]	[value from Section 4]	Higher = more compact clusters
Davies-Bouldin	[value from NB 04]	[value from Section 4]	Lower = more separated clusters

File	Location	Description
`verified_definition_clues.csv`	`data/`	Verified (clue_id, definition) pairs
`definitions_unique.csv`	`data/`	Unique definition strings for embedding
`embeddings_bge_m3_definitions.npy`	`data/`	BGE-M3 embeddings (N × 1024)
`definition_index.csv`	`data/`	Row number → definition string
`embeddings_umap_10d_definitions.npy`	`data/`	10D UMAP for clustering
`embeddings_umap_2d_definitions.npy`	`data/`	2D UMAP for visualisation
`definitions_cluster_labels_hdbscan_eps_0p0.csv`	`data/`	HDBSCAN labels
`definition_downsample_indices.npy`	`data/`	Definition sample indices for agglomerative clustering
`definitions_cluster_labels_agglo_k8.csv`	`data/`	Agglomerative k=8 labels
`definitions_cluster_labels_agglo_k10.csv`	`data/`	Agglomerative k=10 labels
`definitions_cluster_labels_agglo_k34.csv`	`data/`	Agglomerative k=34 labels
`definitions_clustering_metrics.csv`	`outputs/`	Metrics summary (all runs)
`definitions_umap_agglo_k10.png`	`outputs/figures/`	Cluster scatter plot

Stage 7: Definitions as a Control Condition¶

Research Question¶

Why This Notebook Exists¶

Running on Google Colab¶

Section 0: Setup¶

Imports¶

Environment Auto-Detection and Path Configuration¶

Section 1: Extract Definitions¶

What is a definition in a cryptic crossword clue?¶

How Definitions Are Extracted¶

Required Input File¶

Step 1: Generate `verified_definition_clues.csv`¶

Normalize Clue Text, Answers, and Definitions¶

Verify That Each Definition Appears in Its Clue Surface¶

Step 2: Load `verified_definition_clues.csv` and Extract Unique Definitions¶

Save `definitions_unique.csv`¶

Section 2: Embed Definitions with BGE-M3¶

Why the Same Model Matters¶

Section 2 outputs¶

Load the BGE-M3 Model¶

Encode All Unique Definitions¶

Save Embeddings and Definition Index¶

Section 3: UMAP Dimensionality Reduction¶

10-Dimensional UMAP (for Clustering)¶

2-Dimensional UMAP (for Visualisation)¶

Section 4: Clustering¶

HDBSCAN at `eps=0.0` (Baseline Run)¶

Agglomerative Clustering with Ward's Linkage¶

Compile and Save the Definitions Metrics Summary¶

Section 5: Comparison to Indicator Results¶

Load Indicator Metrics from NB 04¶

Side-by-Side Metrics Comparison¶

UMAP Scatter Plot of Definition Clusters (k=10)¶

Qualitative Inspection: Centroid-Nearest Definitions¶

Section 6: Interpretation¶

6.1 Cluster Quality Comparison¶

6.2 Cluster Organisation Principle¶

6.3 What This Tells Us About Indicator Clustering¶

6.4 For the Report¶

Output File Summary¶

Stage 7: Definitions as a Control Condition¶

Research Question¶

Why This Notebook Exists¶

Running on Google Colab¶

Section 0: Setup¶

Imports¶

Environment Auto-Detection and Path Configuration¶

Section 1: Extract Definitions¶

What is a definition in a cryptic crossword clue?¶

How Definitions Are Extracted¶

Required Input File¶

Step 1: Generate verified_definition_clues.csv¶

Normalize Clue Text, Answers, and Definitions¶

Verify That Each Definition Appears in Its Clue Surface¶

Step 2: Load verified_definition_clues.csv and Extract Unique Definitions¶

Save definitions_unique.csv¶

Section 2: Embed Definitions with BGE-M3¶

Why the Same Model Matters¶

Section 2 outputs¶

Load the BGE-M3 Model¶

Encode All Unique Definitions¶

Save Embeddings and Definition Index¶

Section 3: UMAP Dimensionality Reduction¶

10-Dimensional UMAP (for Clustering)¶

2-Dimensional UMAP (for Visualisation)¶

Section 4: Clustering¶

HDBSCAN at eps=0.0 (Baseline Run)¶

Agglomerative Clustering with Ward's Linkage¶

Compile and Save the Definitions Metrics Summary¶

Section 5: Comparison to Indicator Results¶

Load Indicator Metrics from NB 04¶

Side-by-Side Metrics Comparison¶

UMAP Scatter Plot of Definition Clusters (k=10)¶

Qualitative Inspection: Centroid-Nearest Definitions¶

Section 6: Interpretation¶

6.1 Cluster Quality Comparison¶

6.2 Cluster Organisation Principle¶

6.3 What This Tells Us About Indicator Clustering¶

6.4 For the Report¶

Output File Summary¶

Step 1: Generate `verified_definition_clues.csv`¶

Step 2: Load `verified_definition_clues.csv` and Extract Unique Definitions¶

Save `definitions_unique.csv`¶

HDBSCAN at `eps=0.0` (Baseline Run)¶