import os
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
)
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

import hdbscan  # install via: pip install hdbscan

import matplotlib.pyplot as plt
import matplotlib

# Default resolution (dots per inch) for figures created in this session.
matplotlib.rcParams['figure.dpi'] = 120

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

# --- Environment Auto-Detection ---
# `get_ipython` only exists inside an IPython/Jupyter kernel. A NameError
# means this is a plain Python process, which cannot be Colab.
try:
    _ipython_shell = get_ipython()
except NameError:
    IS_COLAB = False
else:
    IS_COLAB = 'google.colab' in str(_ipython_shell)

if not IS_COLAB:
    # Local run: the project root is two levels above this file. `__file__`
    # is undefined in a notebook kernel, so fall back to the parent of the
    # current working directory.
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent
else:
    # Colab run: mount Google Drive and use the shared project folder.
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues')

DATA_DIR = PROJECT_ROOT / 'data'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
FIGURES_DIR = OUTPUT_DIR / 'figures'

# Create the output locations up front; existing directories are left alone.
for _out_dir in (OUTPUT_DIR, FIGURES_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

print(f'Project root: {PROJECT_ROOT}')
print(f'Data directory: {DATA_DIR}')
print(f'Output directory: {OUTPUT_DIR}')
print(f'Figures directory: {FIGURES_DIR}')

Project root: /home/vwinters/ccc-project/indicator_clustering
Data directory: /home/vwinters/ccc-project/indicator_clustering/data
Output directory: /home/vwinters/ccc-project/indicator_clustering/outputs
Figures directory: /home/vwinters/ccc-project/indicator_clustering/outputs/figures

# Inputs produced by earlier pipeline stages, mapped to the notebook that
# regenerates each one.
required_files = {
    'embeddings_umap_10d.npy': 'Run 03_dimensionality_reduction.ipynb (Stage 3)',
    'embeddings_umap_2d.npy': 'Run 03_dimensionality_reduction.ipynb (Stage 3)',
    'indicator_index_all.csv': 'Run 02_embedding_generation.ipynb (Stage 2)',
}

# Collect every missing input first so a single run reports all of them.
# An explicit exception is raised instead of using `assert`, because
# assertions are removed when Python runs with -O and the check would then
# be skipped silently.
missing_inputs = [
    (DATA_DIR / fname, fix_msg)
    for fname, fix_msg in required_files.items()
    if not (DATA_DIR / fname).exists()
]
if missing_inputs:
    raise FileNotFoundError(
        '\n'.join(
            f'Missing required file: {fpath}\nFix: {fix_msg}'
            for fpath, fix_msg in missing_inputs
        )
    )

print('All input files found:')
for fname in required_files:
    print(f'  {fname}')

All input files found:
  embeddings_umap_10d.npy
  embeddings_umap_2d.npy
  indicator_index_all.csv

# Load the UMAP-reduced embeddings: the 10D array feeds the clustering
# steps, the 2D array is used for scatter plots.
embeddings_10d = np.load(DATA_DIR / 'embeddings_umap_10d.npy')
embeddings_2d = np.load(DATA_DIR / 'embeddings_umap_2d.npy')

# Indicator index: row i of the embeddings corresponds to row i of this table.
df_index = pd.read_csv(DATA_DIR / 'indicator_index_all.csv', index_col=0)
indicator_col = df_index.columns[0]  # should be 'indicator'
indicator_names = df_index[indicator_col].values  # array form for boolean-mask indexing
n_indicators = len(df_index)

print(f'10D embeddings shape: {embeddings_10d.shape}')
print(f'2D embeddings shape:  {embeddings_2d.shape}')
print(f'Indicator index rows: {n_indicators}')

# Sanity checks: both embedding arrays must have one row per indicator and
# the expected number of columns.
for _tag, _array, _n_dims in (('10D', embeddings_10d, 10), ('2D', embeddings_2d, 2)):
    assert _array.shape == (n_indicators, _n_dims), (
        f'Expected {_tag} shape ({n_indicators}, {_n_dims}), got {_array.shape}'
    )
print(f'\nShape checks passed: {n_indicators:,} indicators')

10D embeddings shape: (12622, 10)
2D embeddings shape:  (12622, 2)
Indicator index rows: 12622

Shape checks passed: 12,622 indicators

# Draw a fixed-seed sample of 2,000 rows; the pairwise distances are
# computed on this sample rather than on the full embedding matrix.
sample_size = 2000
rng = np.random.RandomState(42)
sample_idx = rng.choice(len(embeddings_10d), size=sample_size, replace=False)
sample_embeddings = embeddings_10d[sample_idx]

# Condensed Euclidean distance vector: one entry per unique pair of points.
pairwise_dists = pdist(sample_embeddings, metric='euclidean')

print(f'Sample size: {sample_size}')
print(f'Number of pairwise distances: {len(pairwise_dists):,}')

# Summary statistics, printed with labels padded to a common width.
print('\nDistance statistics:')
_summary = (
    ('Min:    ', pairwise_dists.min()),
    ('Max:    ', pairwise_dists.max()),
    ('Mean:   ', pairwise_dists.mean()),
    ('Median: ', np.median(pairwise_dists)),
    ('Std:    ', pairwise_dists.std()),
)
for _label, _value in _summary:
    print(f'  {_label}{_value:.4f}')

# Key percentiles — these will guide epsilon selection
percentiles = [5, 10, 25, 50, 75, 90, 95]
percentile_values = np.percentile(pairwise_dists, percentiles)

print('\nKey percentiles:')
for pctl, dist_at_pctl in zip(percentiles, percentile_values):
    print(f'  {pctl:3d}th percentile: {dist_at_pctl:.4f}')

Sample size: 2000
Number of pairwise distances: 1,999,000

Distance statistics:
  Min:    0.0025
  Max:    8.5080
  Mean:   3.5617
  Median: 3.5518
  Std:    1.2452

Key percentiles:
    5th percentile: 1.4980
   10th percentile: 1.9327
   25th percentile: 2.6847
   50th percentile: 3.5518
   75th percentile: 4.4582
   90th percentile: 5.2181
   95th percentile: 5.5993

# Density histogram of the sampled pairwise distances, with the key
# percentiles overlaid as dashed vertical lines.
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(
    pairwise_dists,
    bins=200,
    density=True,
    color='steelblue',
    alpha=0.7,
    edgecolor='none',
)

# Colours are mirrored around the median so symmetric percentiles
# (5/95, 10/90, 25/75) share a colour.
colors_pctl = ['#e41a1c', '#ff7f00', '#4daf4a', '#377eb8', '#4daf4a', '#ff7f00', '#e41a1c']
for pctl, dist_at_pctl, line_color in zip(percentiles, percentile_values, colors_pctl):
    ax.axvline(
        dist_at_pctl,
        color=line_color,
        linestyle='--',
        alpha=0.7,
        linewidth=1.2,
        label=f'{pctl}th pctl = {dist_at_pctl:.3f}',
    )

ax.set_xlabel('Euclidean Distance (10D UMAP space)')
ax.set_ylabel('Density')
ax.set_title('Pairwise Distance Distribution (2,000-point sample)')
ax.legend(fontsize=8, loc='upper right')

plt.tight_layout()
_hist_path = FIGURES_DIR / 'pairwise_distance_distribution.png'
fig.savefig(_hist_path, dpi=150, bbox_inches='tight')
plt.show()

print(f'Saved: {_hist_path}')

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/pairwise_distance_distribution.png

# The 5th percentile marks the upper bound of the critical transition zone
p5 = float(np.percentile(pairwise_dists, 5))

# Fine-grained candidates: 6 evenly spaced values strictly between 0 and the
# 5th percentile. np.linspace(0, p5, 8) gives 8 points including both
# endpoints; the endpoints are dropped because 0 is the baseline and p5 is
# already one of the percentile candidates.
fine_eps = list(np.linspace(0, p5, 8)[1:-1])

# Coarse candidates from percentiles (1st through 25th)
coarse_percentiles = [1, 5, 10, 15, 20, 25]
coarse_eps = [float(v) for v in np.percentile(pairwise_dists, coarse_percentiles)]

# Combine: 0.0 + fine-grained + coarse, then deduplicate and sort
all_eps = [0.0] + fine_eps + coarse_eps
epsilon_candidates = sorted(set(float(round(e, 4)) for e in all_eps))

# Record where each (rounded) candidate came from. Inferring the zone from
# the value alone mislabels percentile candidates that fall below p5 (the
# 1st percentile) and depends on comparing a rounded value with the
# unrounded p5. Later assignments win, so a value present in several lists
# is reported as percentile over fine, and 0.0 is always the baseline.
eps_zone = {float(round(e, 4)): '(fine)' for e in fine_eps}
eps_zone.update({float(round(e, 4)): '(percentile)' for e in coarse_eps})
eps_zone[0.0] = '(baseline)'

print(f'5th percentile distance: {p5:.4f}')
print(f'\nEpsilon candidates for HDBSCAN sweep ({len(epsilon_candidates)} values):')
for i, eps in enumerate(epsilon_candidates, start=1):
    print(f'  {i:2d}. epsilon = {eps:.4f}  {eps_zone[eps]}')

5th percentile distance: 1.4980

Epsilon candidates for HDBSCAN sweep (13 values):
   1. epsilon = 0.0000  (baseline)
   2. epsilon = 0.2140  (fine)
   3. epsilon = 0.4280  (fine)
   4. epsilon = 0.6420  (fine)
   5. epsilon = 0.7788  (fine)
   6. epsilon = 0.8560  (fine)
   7. epsilon = 1.0700  (fine)
   8. epsilon = 1.2840  (fine)
   9. epsilon = 1.4980  (percentile)
  10. epsilon = 1.9327  (percentile)
  11. epsilon = 2.2334  (percentile)
  12. epsilon = 2.4729  (percentile)
  13. epsilon = 2.6847  (percentile)

# Sweep HDBSCAN over every epsilon candidate, keeping the label vector and a
# row of summary metrics for each run.
hdbscan_results = []
hdbscan_labels_dict = {}  # epsilon -> label array from that run

for eps in epsilon_candidates:
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=10,
        cluster_selection_epsilon=eps,
        metric='euclidean',
        core_dist_n_jobs=-1,
    )
    labels = clusterer.fit_predict(embeddings_10d)
    hdbscan_labels_dict[eps] = labels

    # HDBSCAN marks noise with label -1; everything else is a cluster member.
    non_noise_mask = labels != -1
    n_clustered = non_noise_mask.sum()
    n_noise = (~non_noise_mask).sum()
    noise_pct = n_noise / len(labels) * 100
    n_clusters = np.unique(labels[non_noise_mask]).size

    # Quality metrics are computed on non-noise points only, and need at
    # least two clusters to be defined; otherwise they stay NaN.
    sil = db = float('nan')
    if n_clusters >= 2 and n_clustered >= 2:
        clustered_points = embeddings_10d[non_noise_mask]
        clustered_labels = labels[non_noise_mask]
        sil = silhouette_score(clustered_points, clustered_labels)
        db = davies_bouldin_score(clustered_points, clustered_labels)

    hdbscan_results.append(dict(
        epsilon=eps,
        n_clusters=n_clusters,
        n_noise=n_noise,
        noise_pct=noise_pct,
        n_clustered=n_clustered,
        silhouette=sil,
        davies_bouldin=db,
    ))

    print(f'eps={eps:.4f}: {n_clusters:4d} clusters, '
          f'{n_noise:5d} noise ({noise_pct:5.1f}%), '
          f'silhouette={sil:.3f}, DB={db:.3f}')

df_hdbscan = pd.DataFrame(hdbscan_results)
print('\n--- HDBSCAN Sweep Summary ---')
print(df_hdbscan.to_string(index=False))

eps=0.0000:  281 clusters,  4193 noise ( 33.2%), silhouette=0.630, DB=0.473

eps=0.2140:  243 clusters,  3764 noise ( 29.8%), silhouette=0.584, DB=0.512

eps=0.4280:   61 clusters,  1280 noise ( 10.1%), silhouette=-0.119, DB=1.046

eps=0.6420:   17 clusters,   311 noise (  2.5%), silhouette=-0.296, DB=0.978

eps=0.7788:   11 clusters,   114 noise (  0.9%), silhouette=-0.186, DB=0.775

eps=0.8560:   10 clusters,    28 noise (  0.2%), silhouette=-0.168, DB=0.798

eps=1.0700:    6 clusters,     0 noise (  0.0%), silhouette=-0.120, DB=0.782

eps=1.2840:    6 clusters,     0 noise (  0.0%), silhouette=-0.120, DB=0.782

eps=1.4980:    4 clusters,     0 noise (  0.0%), silhouette=0.230, DB=0.549

eps=1.9327:    4 clusters,     0 noise (  0.0%), silhouette=0.230, DB=0.549

# Three side-by-side panels, each plotting one sweep column against epsilon.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# (column, line colour, y-axis label, panel title) for each panel, left to right.
_hdbscan_panels = (
    ('n_clusters', 'steelblue', 'Number of Clusters', 'Clusters Found'),
    ('noise_pct', '#e41a1c', 'Noise Points (%)', 'Noise Percentage'),
    ('silhouette', '#4daf4a', 'Silhouette Score', 'Silhouette Score (non-noise only)'),
)
for _panel_ax, (_column, _color, _ylabel, _title) in zip(axes, _hdbscan_panels):
    _panel_ax.plot(df_hdbscan['epsilon'], df_hdbscan[_column],
                   'o-', color=_color, linewidth=2, markersize=6)
    _panel_ax.set_xlabel('Epsilon')
    _panel_ax.set_ylabel(_ylabel)
    _panel_ax.set_title(_title)
    _panel_ax.grid(True, alpha=0.3)

plt.suptitle('HDBSCAN Epsilon Sensitivity Analysis', fontsize=14, y=1.02)
plt.tight_layout()
_sensitivity_path = FIGURES_DIR / 'hdbscan_epsilon_sensitivity.png'
fig.savefig(_sensitivity_path, dpi=150, bbox_inches='tight')
plt.show()

print(f'Saved: {_sensitivity_path}')

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/hdbscan_epsilon_sensitivity.png

# Pick the sweep row with the highest silhouette score and recover the label
# vector stored for that epsilon.
best_hdbscan_idx = df_hdbscan['silhouette'].idxmax()
best_hdbscan_row = df_hdbscan.loc[best_hdbscan_idx]
best_eps = best_hdbscan_row['epsilon']
best_hdbscan_labels = hdbscan_labels_dict[best_eps]

# Counts are cast to int because the extracted row holds them as floats.
_best_summary = [
    'Best HDBSCAN run by silhouette score:',
    f'  Epsilon:    {best_eps}',
    f'  Clusters:   {int(best_hdbscan_row["n_clusters"])}',
    f'  Noise:      {int(best_hdbscan_row["n_noise"])} ({best_hdbscan_row["noise_pct"]:.1f}%)',
    f'  Silhouette: {best_hdbscan_row["silhouette"]:.3f}',
    f'  Davies-Bouldin: {best_hdbscan_row["davies_bouldin"]:.3f}',
]
print('\n'.join(_best_summary))

Best HDBSCAN run by silhouette score:
  Epsilon:    0.0
  Clusters:   281
  Noise:      4193 (33.2%)
  Silhouette: 0.630
  Davies-Bouldin: 0.473

# Cluster counts to evaluate with Ward-linkage agglomerative clustering.
k_values = [4, 6, 8, 9, 10, 11, 12, 16, 20, 26, 34, 50, 75, 100, 150, 200, 250]

agglo_results = []
agglo_labels_dict = {}  # k -> label array from that run

for k in k_values:
    clusterer = AgglomerativeClustering(n_clusters=k, linkage='ward')
    labels = clusterer.fit_predict(embeddings_10d)
    agglo_labels_dict[k] = labels

    # Every point gets a cluster here, so the metrics use the full dataset.
    sil, db, ch = (
        score_fn(embeddings_10d, labels)
        for score_fn in (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
    )

    agglo_results.append(dict(
        k=k,
        silhouette=sil,
        davies_bouldin=db,
        calinski_harabasz=ch,
    ))

    print(f'k={k:>3d}: silhouette={sil:.3f}, DB={db:.3f}, CH={ch:.0f}')

df_agglo = pd.DataFrame(agglo_results)
print('\n--- Agglomerative Clustering Metric Sweep ---')
print(df_agglo.to_string(index=False))

k=  4: silhouette=0.246, DB=1.456, CH=3909

k=  6: silhouette=0.259, DB=1.323, CH=3921

k=  8: silhouette=0.272, DB=1.267, CH=3897

k=  9: silhouette=0.289, DB=1.185, CH=3844

k= 10: silhouette=0.299, DB=1.164, CH=3789

k= 11: silhouette=0.281, DB=1.184, CH=3655

k= 12: silhouette=0.281, DB=1.226, CH=3552

k= 16: silhouette=0.279, DB=1.275, CH=3320

k= 20: silhouette=0.295, DB=1.194, CH=3203

k= 26: silhouette=0.304, DB=1.172, CH=3072

# --- Metrics-vs-k plot (agglomerative equivalent of HDBSCAN sensitivity analysis) ---
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# (column, line colour, y-axis label, panel title) for each panel, left to right.
_agglo_panels = (
    ('silhouette', '#4daf4a', 'Silhouette Score',
     'Silhouette Score (higher is better)'),
    ('davies_bouldin', '#e41a1c', 'Davies-Bouldin Index',
     'Davies-Bouldin Index (lower is better)'),
    ('calinski_harabasz', 'steelblue', 'Calinski-Harabasz Index',
     'Calinski-Harabasz Index (higher is better)'),
)
for _panel_ax, (_column, _color, _ylabel, _title) in zip(axes, _agglo_panels):
    _panel_ax.plot(df_agglo['k'], df_agglo[_column],
                   'o-', color=_color, linewidth=2, markersize=5)
    _panel_ax.set_xlabel('Number of Clusters (k)')
    _panel_ax.set_ylabel(_ylabel)
    _panel_ax.set_title(_title)
    _panel_ax.grid(True, alpha=0.3)

plt.suptitle('Agglomerative Clustering: Metrics vs. Number of Clusters',
             fontsize=14, y=1.02)
plt.tight_layout()
_agglo_metrics_path = FIGURES_DIR / 'agglo_metrics_vs_k.png'
fig.savefig(_agglo_metrics_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {_agglo_metrics_path}')

# --- Part B: Fixed selection of k values for detailed inspection ---
# k=8  : coarse reference point
# k=10 : local silhouette optimum
# k=34 : mid-range, cross-references with centroid dendrogram
# k=250: fine-grained endpoint
selected_k_values = [8, 10, 34, 250]

print('\nSelected k values for detailed inspection:')
for k in selected_k_values:
    # Each selected k is expected to be present in the sweep table.
    row = df_agglo[df_agglo['k'] == k].iloc[0]
    sil_k, db_k, ch_k = row['silhouette'], row['davies_bouldin'], row['calinski_harabasz']
    print(f'  k={k:>3d}: silhouette={sil_k:.3f}, '
          f'DB={db_k:.3f}, CH={ch_k:.0f}')

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/agglo_metrics_vs_k.png

Selected k values for detailed inspection:
  k=  8: silhouette=0.272, DB=1.267, CH=3897
  k= 10: silhouette=0.299, DB=1.164, CH=3789
  k= 34: silhouette=0.322, DB=1.068, CH=2993
  k=250: silhouette=0.431, DB=0.884, CH=2802

# --- Approach A: Truncated Dendrogram (Full Data) ---
# Ward linkage over all 10D UMAP embeddings gives the full merge tree for
# every indicator (12,622 rows in the recorded run).
print('Computing Ward linkage on all indicators (this may take a minute)...')
Z = linkage(embeddings_10d, method='ward')
print(f'Linkage matrix shape: {Z.shape}')  # (n-1, 4)
print(f'Final merge distance: {Z[-1, 2]:.2f}')

# Truncated view (truncate_mode='lastp', p=50): only the top of the tree is
# drawn. Each leaf stands for an already-merged group of points, whose size
# is shown in parentheses.
_truncated_opts = dict(
    truncate_mode='lastp',
    p=50,
    leaf_rotation=90,
    leaf_font_size=8,
    color_threshold=0,  # uniform color for clarity
)
fig, ax = plt.subplots(figsize=(16, 8))
dendrogram(Z, ax=ax, **_truncated_opts)
ax.set_xlabel('Cluster (size shown in parentheses)')
ax.set_ylabel('Ward Merge Distance')
ax.set_title('Truncated Dendrogram — Last 50 Merges (All 12,622 Indicators)')

plt.tight_layout()
_truncated_path = FIGURES_DIR / 'dendrogram_truncated_top50.png'
fig.savefig(_truncated_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {_truncated_path}')

# --- Approach B: Dendrogram of k=34 Cluster Centroids ---
# Summarise each of the 34 agglomerative clusters by its mean 10D embedding,
# then build a Ward hierarchy over those 34 centroid vectors.
k34_labels = agglo_labels_dict[34]
_k34_masks = [k34_labels == cl for cl in range(34)]
centroids_34 = np.vstack([embeddings_10d[member].mean(axis=0) for member in _k34_masks])
cluster_sizes_34 = [int(np.count_nonzero(member)) for member in _k34_masks]
centroid_labels = [
    f'C{cl} (n={sz})' for cl, sz in zip(range(34), cluster_sizes_34)
]

print(f'\nCentroid matrix shape: {centroids_34.shape}')  # (34, 10)

# Hierarchical clustering on the 34 centroids
Z_centroids = linkage(centroids_34, method='ward')

fig, ax = plt.subplots(figsize=(16, 8))
dendrogram(
    Z_centroids,
    ax=ax,
    labels=centroid_labels,
    color_threshold=0,
    leaf_font_size=8,
    leaf_rotation=90,
)
ax.set_xlabel('Cluster (k=34 agglomerative)')
ax.set_ylabel('Ward Merge Distance')
ax.set_title('Dendrogram of k=34 Cluster Centroids')

plt.tight_layout()
_k34_path = FIGURES_DIR / 'dendrogram_k34_centroids.png'
fig.savefig(_k34_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved: {_k34_path}')

# --- Approach C: Centroid dendrograms for other selected k values ---
# Same recipe as the k=34 centroid dendrogram, repeated for every other
# selected k: average each cluster's embeddings, then link the centroids.
for sel_k in selected_k_values:
    if sel_k == 34:
        continue  # already produced in Approach B

    sel_labels = agglo_labels_dict[sel_k]
    n_sel_clusters = len(set(sel_labels))
    cluster_ids = range(n_sel_clusters)

    centroids_sel = np.vstack([
        embeddings_10d[sel_labels == cl].mean(axis=0) for cl in cluster_ids
    ])
    sizes_sel = [int(np.count_nonzero(sel_labels == cl)) for cl in cluster_ids]
    labels_sel = [f'C{cl} (n={sz})' for cl, sz in zip(cluster_ids, sizes_sel)]

    Z_sel = linkage(centroids_sel, method='ward')

    fig, ax = plt.subplots(figsize=(16, 8))
    dendrogram(
        Z_sel,
        ax=ax,
        labels=labels_sel,
        color_threshold=0,
        leaf_font_size=8,
        leaf_rotation=90,
    )
    ax.set_xlabel(f'Cluster (k={sel_k} agglomerative)')
    ax.set_ylabel('Ward Merge Distance')
    ax.set_title(f'Dendrogram of k={sel_k} Cluster Centroids')

    plt.tight_layout()
    sel_path = FIGURES_DIR / f'dendrogram_k{sel_k}_centroids.png'
    fig.savefig(sel_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f'Saved: {sel_path}')

Computing Ward linkage on all indicators (this may take a minute)...

Linkage matrix shape: (12621, 4)
Final merge distance: 229.76

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/dendrogram_truncated_top50.png

Centroid matrix shape: (34, 10)

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/dendrogram_k34_centroids.png

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/dendrogram_k8_centroids.png

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/dendrogram_k10_centroids.png

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/dendrogram_k250_centroids.png

def plot_clusters(embeddings_2d, labels, title, filename, noise_label=-1):
    """Scatter plot of 2D UMAP colored by cluster assignment.

    Noise points are drawn first in light gray, then each cluster is drawn
    on top in a color from the 20-color ``tab20`` palette. Colors repeat
    when there are more than 20 clusters. The figure is saved under
    ``FIGURES_DIR / filename`` and then shown.

    Parameters
    ----------
    embeddings_2d : array-like of shape (n_points, 2)
        2D coordinates, one row per point.
    labels : array-like of length n_points
        Cluster label for each point.
    title : str
        Title prefix; the cluster count is appended.
    filename : str
        File name of the saved figure, relative to ``FIGURES_DIR``.
    noise_label : int, default -1
        Label value that marks noise points.
    """
    # Accept plain sequences as well as arrays: the boolean masks below
    # rely on element-wise comparison and array indexing.
    embeddings_2d = np.asarray(embeddings_2d)
    labels = np.asarray(labels)

    fig, ax = plt.subplots(figsize=(12, 9))

    # Plot noise points first (gray, small)
    noise_mask = labels == noise_label
    has_noise = bool(noise_mask.any())
    if has_noise:
        ax.scatter(
            embeddings_2d[noise_mask, 0],
            embeddings_2d[noise_mask, 1],
            s=1, alpha=0.15, color='lightgray', label='noise', zorder=1
        )

    # Plot clustered points
    unique_labels = sorted(set(labels[~noise_mask]))
    n_clusters = len(unique_labels)

    # Use tab20 at its native 20 entries and cycle through them. Requesting
    # a resampled palette with more than 20 entries stretches the 20 colors
    # over the larger table, so indexing only its first 20 entries yields a
    # handful of near-identical colors (for k=250, one or two). The
    # pyplot-level getter also avoids the deprecated `plt.cm.get_cmap`.
    cmap = plt.get_cmap('tab20')
    n_colors = cmap.N
    for i, cl in enumerate(unique_labels):
        mask = labels == cl
        ax.scatter(
            embeddings_2d[mask, 0],
            embeddings_2d[mask, 1],
            s=2, alpha=0.4, color=cmap(i % n_colors), zorder=2
        )

    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title(f'{title} ({n_clusters} clusters)')

    if has_noise:
        n_noise = noise_mask.sum()
        ax.annotate(f'Noise: {n_noise} ({n_noise/len(labels)*100:.1f}%)',
                    xy=(0.02, 0.98), xycoords='axes fraction',
                    fontsize=9, va='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    fig.savefig(FIGURES_DIR / filename, dpi=150, bbox_inches='tight')
    plt.show()
    print(f'Saved: {FIGURES_DIR / filename}')

# One 2D scatter per selected k, colored by the Ward cluster assignment.
for k in selected_k_values:
    plot_clusters(
        embeddings_2d=embeddings_2d,
        labels=agglo_labels_dict[k],
        title=f'Agglomerative Clustering (Ward, k={k})',
        filename=f'agglo_k{k}_clusters.png',
    )

/tmp/ipykernel_964393/4156939200.py:20: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.
  cmap = plt.cm.get_cmap('tab20', max(n_clusters, 20))

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/agglo_k8_clusters.png

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/agglo_k10_clusters.png

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/agglo_k34_clusters.png

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/agglo_k250_clusters.png

# 2D scatter of the HDBSCAN run that scored the highest silhouette.
plot_clusters(
    embeddings_2d=embeddings_2d,
    labels=best_hdbscan_labels,
    title=f'HDBSCAN (eps={best_eps})',
    filename='hdbscan_best_clusters.png',
)

/tmp/ipykernel_964393/4156939200.py:20: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.
  cmap = plt.cm.get_cmap('tab20', max(n_clusters, 20))

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/figures/hdbscan_best_clusters.png

def inspect_clusters(labels, embeddings_10d, indicator_names, method_name,
                     n_examples=10):
    """Print centroid-nearest indicators and cluster size per cluster.

    This is a label-free inspection: we look at which indicator strings are
    closest to each cluster's centroid to judge semantic coherence.

    Parameters
    ----------
    labels : array-like of length n_points
        Cluster label per point; ``-1`` is treated as noise and only
        summarized by its count.
    embeddings_10d : array-like of shape (n_points, n_dims)
        Embedding vectors used to compute centroids and distances.
    indicator_names : array-like of length n_points
        Display name for each point, aligned with ``labels``.
    method_name : str
        Prefix printed in every cluster header.
    n_examples : int, default 10
        Maximum number of centroid-nearest names printed per cluster.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Coerce to arrays so plain lists work too: with a list, `labels == cl`
    # is a single bool rather than an element-wise mask.
    labels = np.asarray(labels)
    embeddings_10d = np.asarray(embeddings_10d)
    indicator_names = np.asarray(indicator_names)

    heavy_rule = '=' * 60
    light_rule = '-' * 60

    # np.unique returns the distinct labels in ascending order, so the
    # noise label (-1), when present, is reported first.
    for cl in np.unique(labels):
        mask = labels == cl
        cluster_size = mask.sum()

        if cl == -1:
            # Summarize noise points briefly
            print(f'\n{heavy_rule}')
            print(f'{method_name} — Noise points: {cluster_size}')
            print(f'{heavy_rule}')
            continue

        cluster_embeddings = embeddings_10d[mask]
        cluster_names = indicator_names[mask]

        # Compute centroid and find nearest points
        centroid = cluster_embeddings.mean(axis=0)
        dists_to_centroid = np.linalg.norm(cluster_embeddings - centroid, axis=1)
        nearest_idx = np.argsort(dists_to_centroid)[:n_examples]
        # str() keeps the join from failing on non-string entries.
        nearest = [str(cluster_names[i]) for i in nearest_idx]

        print(f'\n{heavy_rule}')
        print(f'{method_name} — Cluster {cl} (n={cluster_size})')
        print(f'{light_rule}')
        print(f'Nearest to centroid: {", ".join(nearest)}')

# Print a banner for each selected k, followed by the per-cluster inspection.
for k in selected_k_values:
    _banner = '#' * 60
    print(f'\n{_banner}')
    print(f'  k = {k}')
    print(f'{_banner}')
    inspect_clusters(
        labels=agglo_labels_dict[k],
        embeddings_10d=embeddings_10d,
        indicator_names=indicator_names,
        method_name=f'Agglomerative k={k}',
    )

############################################################
  k = 8
############################################################

============================================================
Agglomerative k=8 — Cluster 0 (n=1196)
------------------------------------------------------------
Nearest to centroid: in rehab, cured, recovered from, after resettlement, following rehab, rediscovered, retrieving, recovered, retrieved, recovery

============================================================
Agglomerative k=8 — Cluster 1 (n=1921)
------------------------------------------------------------
Nearest to centroid: profligate, perverse, perversely, travesty, preposterous, promiscuously, promiscuous, pervertedly, precariously, errant

============================================================
Agglomerative k=8 — Cluster 2 (n=1615)
------------------------------------------------------------
Nearest to centroid: as an anagram, anagram, anagrams, as a, as herbal, cast of, as a show of, cast in, featured, cast

============================================================
Agglomerative k=8 — Cluster 3 (n=1853)
------------------------------------------------------------
Nearest to centroid: must encase, must be caught, into care of, must be gripped by, must plug, must catch, must be put in, it takes, must be beaten, must be found in

============================================================
Agglomerative k=8 — Cluster 4 (n=1488)
------------------------------------------------------------
Nearest to centroid: spared, dumping, discards, flushed, trash, rubbishy, rubbish, rubbished, wiped, putting away

============================================================
Agglomerative k=8 — Cluster 5 (n=1717)
------------------------------------------------------------
Nearest to centroid: to go, goes about, put around, parked in, garaging, on a bender, garaged, on the blink, to go outside, going about

============================================================
Agglomerative k=8 — Cluster 6 (n=1634)
------------------------------------------------------------
Nearest to centroid: sozzled, when sozzled, pickled, snarled, to scoff, collared, to snaffle, snarfing, snarling, to swell

============================================================
Agglomerative k=8 — Cluster 7 (n=1198)
------------------------------------------------------------
Nearest to centroid: to make cladding for, modelled, for modelling, model, modelling, models, to model, decorative, engineered, used for modelling

############################################################
  k = 10
############################################################

============================================================
Agglomerative k=10 — Cluster 0 (n=1921)
------------------------------------------------------------
Nearest to centroid: profligate, perverse, perversely, travesty, preposterous, promiscuously, promiscuous, pervertedly, precariously, errant

============================================================
Agglomerative k=10 — Cluster 1 (n=1488)
------------------------------------------------------------
Nearest to centroid: spared, dumping, discards, flushed, trash, rubbishy, rubbish, rubbished, wiped, putting away

============================================================
Agglomerative k=10 — Cluster 2 (n=1717)
------------------------------------------------------------
Nearest to centroid: to go, goes about, put around, parked in, garaging, on a bender, garaged, on the blink, to go outside, going about

============================================================
Agglomerative k=10 — Cluster 3 (n=1853)
------------------------------------------------------------
Nearest to centroid: must encase, must be caught, into care of, must be gripped by, must plug, must catch, must be put in, it takes, must be beaten, must be found in

============================================================
Agglomerative k=10 — Cluster 4 (n=1030)
------------------------------------------------------------
Nearest to centroid: this, like this, such, how, this way, origin, source of, source, one in a, form of this

============================================================
Agglomerative k=10 — Cluster 5 (n=822)
------------------------------------------------------------
Nearest to centroid: after reform, after reforming, after reformation, after adjustment, having reformed, after revolution, after modifying, after liberation, must be refined, must be revised

============================================================
Agglomerative k=10 — Cluster 6 (n=1634)
------------------------------------------------------------
Nearest to centroid: sozzled, when sozzled, pickled, snarled, to scoff, collared, to snaffle, snarfing, snarling, to swell

============================================================
Agglomerative k=10 — Cluster 7 (n=1198)
------------------------------------------------------------
Nearest to centroid: to make cladding for, modelled, for modelling, model, modelling, models, to model, decorative, engineered, used for modelling

============================================================
Agglomerative k=10 — Cluster 8 (n=374)
------------------------------------------------------------
Nearest to centroid: playing back, is retro, going back, heading back, piece back, gone back, setting back, set back, going retro, back

============================================================
Agglomerative k=10 — Cluster 9 (n=585)
------------------------------------------------------------
Nearest to centroid: pronouncing, pronounce as, enunciating, in conference, speech of, pronounced for some, enunciated, in pronouncing, pronounced, in talk

############################################################
  k = 34
############################################################

============================================================
Agglomerative k=34 — Cluster 0 (n=457)
------------------------------------------------------------
Nearest to centroid: must keep hold of, must keep, has to keep, must hold, having to hold, should keep, held by, should be kept in, should hold, to be held by

============================================================
Agglomerative k=34 — Cluster 1 (n=320)
------------------------------------------------------------
Nearest to centroid: for pulping, cod, wrought, code, coded, moderated, for grinding, coding, tempered, core

============================================================
Agglomerative k=34 — Cluster 2 (n=762)
------------------------------------------------------------
Nearest to centroid: connected by, accessing, getting to, gaining access, in which you get, in which to get, allowing in, must enter, accessed by, should enter

============================================================
Agglomerative k=34 — Cluster 3 (n=633)
------------------------------------------------------------
Nearest to centroid: to buzz around, cartwheels, went around clinging to, driven around, rooting about, wheeled, wheels, orbiting, going around, on orbital

============================================================
Agglomerative k=34 — Cluster 4 (n=523)
------------------------------------------------------------
Nearest to centroid: directed, engineers, engineering, engineer, with engineering, engineered, artificial, synthetic, into shape, in shape

============================================================
Agglomerative k=34 — Cluster 5 (n=653)
------------------------------------------------------------
Nearest to centroid: defective, defect, getting defective, invalid, is defective, found defective, imperfect, inadequately, imperfectly, inadequate

============================================================
Agglomerative k=34 — Cluster 6 (n=128)
------------------------------------------------------------
Nearest to centroid: looked after by, nursed by, nurtures, will nurture, nurturing, careering, looking after, being nursed, taking care of, to nurse

============================================================
Agglomerative k=34 — Cluster 7 (n=159)
------------------------------------------------------------
Nearest to centroid: upside down, found upside down, displayed upside down, appearing upside down, turned upside down, kept upside down, from right to left, from the right, passed to the left, from down under

============================================================
Agglomerative k=34 — Cluster 8 (n=390)
------------------------------------------------------------
Nearest to centroid: swelled by, to snare, snare, ditches, to spear, snakes, snaking, larking, snags, smitten

============================================================
Agglomerative k=34 — Cluster 9 (n=585)
------------------------------------------------------------
Nearest to centroid: pronouncing, pronounce as, enunciating, in conference, speech of, pronounced for some, enunciated, in pronouncing, pronounced, in talk

============================================================
Agglomerative k=34 — Cluster 10 (n=542)
------------------------------------------------------------
Nearest to centroid: impossible, irrational, may be awkward, may be ridiculous, irregularity, unbelievably, awkwardly, uncannily, unfortunate, sitting awkwardly

============================================================
Agglomerative k=34 — Cluster 11 (n=615)
------------------------------------------------------------
Nearest to centroid: to be blasted, to be blotto, will get smashed, when blotto, and is smashed, blasted, to be cracked by, to be battered, to be injured, gets smashed

============================================================
Agglomerative k=34 — Cluster 12 (n=420)
------------------------------------------------------------
Nearest to centroid: on its head, on the hoof, to go by, put across, pasted across, extended over, put over, placed over, going head over heels, across

============================================================
Agglomerative k=34 — Cluster 13 (n=260)
------------------------------------------------------------
Nearest to centroid: blanketed with, blanketing, shrouded in, shaded by, given a coating of, coated with, eclipsed by, has enveloped, overshadowed by, sheathed by

============================================================
Agglomerative k=34 — Cluster 14 (n=540)
------------------------------------------------------------
Nearest to centroid: to be left out, oddly removed, regular expulsions, left out, with head dismissed, having forfeited, oddly dropped, head removed, has been removed, removed

============================================================
Agglomerative k=34 — Cluster 15 (n=355)
------------------------------------------------------------
Nearest to centroid: new organisation, layout of, properly allocated, with arrangement, in arrangement, in a special arrangement, layout, an arrangement, arrangement, the arrangement

============================================================
Agglomerative k=34 — Cluster 16 (n=487)
------------------------------------------------------------
Nearest to centroid: to be edited, amended, with alteration, after not much revision, after some editing, modify, is amended, after editing, edited, edited out

============================================================
Agglomerative k=34 — Cluster 17 (n=26)
------------------------------------------------------------
Nearest to centroid: retirement, to retire, for retirement, retired chap, to be retired, upon retirement, after retirement, following retirement, retired peer, being retired

============================================================
Agglomerative k=34 — Cluster 18 (n=374)
------------------------------------------------------------
Nearest to centroid: playing back, is retro, going back, heading back, piece back, gone back, setting back, set back, going retro, back

============================================================
Agglomerative k=34 — Cluster 19 (n=269)
------------------------------------------------------------
Nearest to centroid: having gone up, to get up, having to go up, going up, awakening, goes belly up, getting elevated, to rise, getting up, flag raised

============================================================
Agglomerative k=34 — Cluster 20 (n=184)
------------------------------------------------------------
Nearest to centroid: sexy, erotica, merry, get aroused, merrymaking, happy, enjoying, happily, aroused, to make merry

============================================================
Agglomerative k=34 — Cluster 21 (n=405)
------------------------------------------------------------
Nearest to centroid: is in, present in, its, set during, pub about, his, is it, when in, about it, about

============================================================
Agglomerative k=34 — Cluster 22 (n=176)
------------------------------------------------------------
Nearest to centroid: by subtracting, subtraction, dispensing, cut back, to divulge, dissecting, oddly cut, cutting off, for distilling, cutting taken from

============================================================
Agglomerative k=34 — Cluster 23 (n=542)
------------------------------------------------------------
Nearest to centroid: in convulsions, in seizure, boisterous, convulsive, in disorder, convulsed, turbulent, disorder, disordered, is convulsed

============================================================
Agglomerative k=34 — Cluster 24 (n=174)
------------------------------------------------------------
Nearest to centroid: could bring about, may get together, may come to, may turn out, could be forming, that could make, could conjure up, could come from, may turn out to be, may be in

============================================================
Agglomerative k=34 — Cluster 25 (n=943)
------------------------------------------------------------
Nearest to centroid: curled, quaint, bumpy, wrinkled, furrowed, curls, squirm, flog, bumpily, bucking

============================================================
Agglomerative k=34 — Cluster 26 (n=246)
------------------------------------------------------------
Nearest to centroid: being caught in, is caught, that catches, caught up, contrary characters caught, caught in, caught, getting caught by, caught out, caught up in

============================================================
Agglomerative k=34 — Cluster 27 (n=335)
------------------------------------------------------------
Nearest to centroid: retrofit, getting reinvented, being reselected, retrained, retuned, to be reappointed to, overhauled, reinvention, to be recast, redesign

============================================================
Agglomerative k=34 — Cluster 28 (n=197)
------------------------------------------------------------
Nearest to centroid: appearing alternately, appearing intermittently, now and then, again, alternating, alternately, every second, occurring intermittently, seen intermittently, every other

============================================================
Agglomerative k=34 — Cluster 29 (n=236)
------------------------------------------------------------
Nearest to centroid: coming loose, loose, becomes loose, breaking loose, get loose, made loose, on the loose, to be let loose, liberal, being let loose

============================================================
Agglomerative k=34 — Cluster 30 (n=131)
------------------------------------------------------------
Nearest to centroid: to thwart, thwarting, restraining, constraining, restrained by, restrains, to restrain, to suppress, when restraining, restrained

============================================================
Agglomerative k=34 — Cluster 31 (n=16)
------------------------------------------------------------
Nearest to centroid: interred in, will be buried in, interred by, buried, buried in, buries, buried into, buried by, has buried, and buried in

============================================================
Agglomerative k=34 — Cluster 32 (n=285)
------------------------------------------------------------
Nearest to centroid: to buffet, in buffet, buffets, buffet, being fed, feeding, having eaten, sandwiches, sandwich for, when eating

============================================================
Agglomerative k=34 — Cluster 33 (n=254)
------------------------------------------------------------
Nearest to centroid: some items in, something of, some of it, some of a, a tissue of, a spot of, a piece of, assorted bits of, created partly by, a piece to

############################################################
  k = 250
############################################################

============================================================
Agglomerative k=250 — Cluster 0 (n=118)
------------------------------------------------------------
Nearest to centroid: to evict a, scrapping, when discarding, getting rid of, being disposed of, disposing of, ousting, wiping off, disposal of, to be got rid of

============================================================
Agglomerative k=250 — Cluster 1 (n=62)
------------------------------------------------------------
Nearest to centroid: after torturing, tortured, torment, tortuous, tormented, when tortured, torrential, torturous, in chaos, topsy turvy

============================================================
Agglomerative k=250 — Cluster 2 (n=48)
------------------------------------------------------------
Nearest to centroid: must perform, must be found in, must have, must contribute, must be given, must be spread, must be put in, must work, must plug, must wear

============================================================
Agglomerative k=250 — Cluster 3 (n=42)
------------------------------------------------------------
Nearest to centroid: oddly deficient, by mistake, inadvertently, accidentally, unintentionally, by accident, fortuitously, nonconformist, randomly, at random

============================================================
Agglomerative k=250 — Cluster 4 (n=107)
------------------------------------------------------------
Nearest to centroid: getting outside, decked out, when outside, outside, going outside, going to the dogs, exterior, outskirts, on outside of, on the outside

============================================================
Agglomerative k=250 — Cluster 5 (n=101)
------------------------------------------------------------
Nearest to centroid: zippily, foggy, foully, idly, kinkily, frilly, frizzy, get creaky, eerily, get kinky

============================================================
Agglomerative k=250 — Cluster 6 (n=108)
------------------------------------------------------------
Nearest to centroid: to hold in, to hold onto, holds on to, to be kept by, holding in, kept up, stays around, that keeps, kept in, to be kept within

============================================================
Agglomerative k=250 — Cluster 7 (n=60)
------------------------------------------------------------
Nearest to centroid: custom, bespoke, is customised, designed for, designed to, specially designed for, specially designed, made specially, specialised, designed

============================================================
Agglomerative k=250 — Cluster 8 (n=76)
------------------------------------------------------------
Nearest to centroid: mines, mining, being mined, pins, for mincing, coins, mince, pepper, lima, grinding

============================================================
Agglomerative k=250 — Cluster 9 (n=71)
------------------------------------------------------------
Nearest to centroid: rubbishing, stampeding, stampeded, rubbing, rubbed, clobbered, clobber, cribbing, rubbed into, schemed

============================================================
Agglomerative k=250 — Cluster 10 (n=189)
------------------------------------------------------------
Nearest to centroid: to the top, elevating, to rise, record elevated, getting elevated, being risen, uplifted, flag raised, after an upswing, after rise

============================================================
Agglomerative k=250 — Cluster 11 (n=40)
------------------------------------------------------------
Nearest to centroid: appropriate to, suitable, fits with, fitfully, appropriately, to fit, fittingly, fit for, fitting, correctly

============================================================
Agglomerative k=250 — Cluster 12 (n=35)
------------------------------------------------------------
Nearest to centroid: to lay out, newly laid out, laid out, bedded, laid back, laying in, to lie in, lying across, lying in, after being laid back

============================================================
Agglomerative k=250 — Cluster 13 (n=84)
------------------------------------------------------------
Nearest to centroid: bumbling, bobbing, ruffling, bristling, rattling, bubbling, wrinkled, in garbled fashion, bungling, burbled

============================================================
Agglomerative k=250 — Cluster 14 (n=32)
------------------------------------------------------------
Nearest to centroid: laps, lark, is lax, lop, larks, lax, lame, laces, lad, laced

============================================================
Agglomerative k=250 — Cluster 15 (n=93)
------------------------------------------------------------
Nearest to centroid: nails, nipping, nicking, nailing, snares, necking, snags, nicked, nipped by, snaking

============================================================
Agglomerative k=250 — Cluster 16 (n=66)
------------------------------------------------------------
Nearest to centroid: going through revision, for revision, having revised, revision, to revise, being revised, subject to revision, is revised, revision of, with revision

============================================================
Agglomerative k=250 — Cluster 17 (n=114)
------------------------------------------------------------
Nearest to centroid: broken, when broke, broken by, broken down, broke down, broke, after breaking, having broken, beginning to break, is broken

============================================================
Agglomerative k=250 — Cluster 18 (n=98)
------------------------------------------------------------
Nearest to centroid: in error, being laid out wrongly, the wrong, that went wrong, error in, must be wrong, to go wrong, after mistake, wrongly positioned, error

============================================================
Agglomerative k=250 — Cluster 19 (n=89)
------------------------------------------------------------
Nearest to centroid: beyond, comes over, looms over, going over, to go over, over shoulder, go over, pore over, can go over, anorak put over

============================================================
Agglomerative k=250 — Cluster 20 (n=64)
------------------------------------------------------------
Nearest to centroid: runny, running amok, running rampant, runaway, runs, ran amok, run amok, runs amok, run around, run

============================================================
Agglomerative k=250 — Cluster 21 (n=116)
------------------------------------------------------------
Nearest to centroid: go around, going round, lug around, on orbital, doing the rounds, to go around, round the bend, getting around, gets around, round about

============================================================
Agglomerative k=250 — Cluster 22 (n=69)
------------------------------------------------------------
Nearest to centroid: to be chopped, slicing, cutting, after cutting, slices, cutting up rough, slice, when cutting, to cut, sliced by

============================================================
Agglomerative k=250 — Cluster 23 (n=30)
------------------------------------------------------------
Nearest to centroid: parade, islands, italics, parading, garaged, garaging, byzantine, paradise, israel, ireland

============================================================
Agglomerative k=250 — Cluster 24 (n=42)
------------------------------------------------------------
Nearest to centroid: having got hold of, grabbing some of, posed, taking hold of excise, possessing, possesses, possess, to pick up, to possess, having picked up

============================================================
Agglomerative k=250 — Cluster 25 (n=93)
------------------------------------------------------------
Nearest to centroid: showered, splashed, being smeared, shower, smeared, drizzled with, dripped, shower of, sprayed, smearing

============================================================
Agglomerative k=250 — Cluster 26 (n=108)
------------------------------------------------------------
Nearest to centroid: oddly enough, oddly enough but developed, becoming odd, peculiar, strangely enough, appears unusually, unusual spelling, most odd, unusually, oddly

============================================================
Agglomerative k=250 — Cluster 27 (n=44)
------------------------------------------------------------
Nearest to centroid: is about, anagrams, given special display, illustrated by, about, about it, for it, about that, in special display, anagram

============================================================
Agglomerative k=250 — Cluster 28 (n=85)
------------------------------------------------------------
Nearest to centroid: will eat, to eat, maybe to eat, eating flower, eat, is eating, eating, stomach, to be eaten, fed with

============================================================
Agglomerative k=250 — Cluster 29 (n=103)
------------------------------------------------------------
Nearest to centroid: regularly avoided, forgoing, forgoes, ignore the odds, has avoided, avoids, avoid, having dismissed, avoiding, originally dismissed

============================================================
Agglomerative k=250 — Cluster 30 (n=94)
------------------------------------------------------------
Nearest to centroid: anxious, anxiously, becoming queasy, nervously, getting jittery, nervous, when nervous, infuriated, gets jittery, annoying

============================================================
Agglomerative k=250 — Cluster 31 (n=58)
------------------------------------------------------------
Nearest to centroid: dispersing, disseminating, diffuse, to disseminate, dissemination, disseminated, to distribute, for distributing, for dispersal, distribute

============================================================
Agglomerative k=250 — Cluster 32 (n=101)
------------------------------------------------------------
Nearest to centroid: head back, taking back, moving back, putting back, setting back, combed back, turn back, printed back to front, falling back, falls back

============================================================
Agglomerative k=250 — Cluster 33 (n=37)
------------------------------------------------------------
Nearest to centroid: pirouette, pirouetting, acrobatic, acrobatically, ballet, seen dancing, dancing around, dance about, dances, disco

============================================================
Agglomerative k=250 — Cluster 34 (n=77)
------------------------------------------------------------
Nearest to centroid: to be stuck in, stuck inside, get stuck in, locks, stuck up, gets stuck in, stuck into, stuck between, gets stuck, jammed in

============================================================
Agglomerative k=250 — Cluster 35 (n=131)
------------------------------------------------------------
Nearest to centroid: renovated, to regenerate, renewing, refurbished, refurbishment of, for restoring, refurbishment, for restoration, for refurbishment, redress

============================================================
Agglomerative k=250 — Cluster 36 (n=176)
------------------------------------------------------------
Nearest to centroid: reprogramming, redesign of, is redesigned, redesign, redesigned, redesigning, reconfigure, to redesign, retuning, reconfigured

============================================================
Agglomerative k=250 — Cluster 37 (n=27)
------------------------------------------------------------
Nearest to centroid: overthrowing, overthrown, overthrow, overthrown in, overturn, overwritten, overturning, after overturning, to overturn, all overturned

============================================================
Agglomerative k=250 — Cluster 38 (n=65)
------------------------------------------------------------
Nearest to centroid: make violent, offensively, vile, violently, violent, vulgar, spelling savagely, vicious, savagely, crudely

============================================================
Agglomerative k=250 — Cluster 39 (n=69)
------------------------------------------------------------
Nearest to centroid: vulgarly mentioned, quoted, priest about, mentioned, citing, speaks of, mentioning, cited, is being mentioned, quotes

============================================================
Agglomerative k=250 — Cluster 40 (n=63)
------------------------------------------------------------
Nearest to centroid: to bank, to bag, packing, having packed, packaging, pack, is packing, to package, to pack, bagged

============================================================
Agglomerative k=250 — Cluster 41 (n=38)
------------------------------------------------------------
Nearest to centroid: construed, construe, construct, being constructed, construction, constructed, build, to be constructed, built, got built

============================================================
Agglomerative k=250 — Cluster 42 (n=66)
------------------------------------------------------------
Nearest to centroid: to obtain, to achieve, earn, obtains, obtained by, procured by, obtained, to receive, obtain, achieving

============================================================
Agglomerative k=250 — Cluster 43 (n=72)
------------------------------------------------------------
Nearest to centroid: rolling around, rolling about, rotary, rotated, rotation, roll over, rotating, for rolling, wheeled over, to roll

============================================================
Agglomerative k=250 — Cluster 44 (n=71)
------------------------------------------------------------
Nearest to centroid: devastated, blighted, damaging, mendelssohn damaging, become ruined, ruined, vandalising, vandalised, ruined by, in ruins

============================================================
Agglomerative k=250 — Cluster 45 (n=74)
------------------------------------------------------------
Nearest to centroid: clothed, pants, shirt, clad in, suit, wears, clothed in, clothing, wear, clothes

============================================================
Agglomerative k=250 — Cluster 46 (n=112)
------------------------------------------------------------
Nearest to centroid: to hear, be listened to, being heard, for us to hear, brought into hearing, listened, at hearing, listen, after hearing, to be heard

============================================================
Agglomerative k=250 — Cluster 47 (n=90)
------------------------------------------------------------
Nearest to centroid: but unruly, unsafe, unseemly, unsteady, unfathomable, untoward, unwisely, unconvincingly, uneven, unstable

============================================================
Agglomerative k=250 — Cluster 48 (n=24)
------------------------------------------------------------
Nearest to centroid: less, discounted, reduction, reduced by, reduced, reducing, much reduced, discount, subtraction, discounting lead

============================================================
Agglomerative k=250 — Cluster 49 (n=29)
------------------------------------------------------------
Nearest to centroid: after manoeuvres, manoeuvres, during manoeuvres, on manoeuvres, manoeuvres to, manoeuvred, manoeuvring, to manoeuvre, manoeuvre, for manoeuvring

============================================================
Agglomerative k=250 — Cluster 50 (n=96)
------------------------------------------------------------
Nearest to centroid: to get busted, gets caught, arresting, to be arrested by, to be arrested, to arrest, to be caught by, getting caught, arrests, got arrested

============================================================
Agglomerative k=250 — Cluster 51 (n=63)
------------------------------------------------------------
Nearest to centroid: agglomeration, to gather, to assemble, gathers, i gather, gathered by, held together by, gather, assembly, assembles

============================================================
Agglomerative k=250 — Cluster 52 (n=49)
------------------------------------------------------------
Nearest to centroid: the reverse, in the opposite direction, in reverse, inverted, inverse, operated in reverse, reversed, just the opposite, inversion, suffering reverse

============================================================
Agglomerative k=250 — Cluster 53 (n=51)
------------------------------------------------------------
Nearest to centroid: frayed, strewn, slithering, strain, strewn about, stray, straying, swilled, strays, stricken

============================================================
Agglomerative k=250 — Cluster 54 (n=38)
------------------------------------------------------------
Nearest to centroid: surrealist, sculpture of, scrawling, sculpture, sculpting, sculpted, scrawled, material, materialising, drawn up

============================================================
Agglomerative k=250 — Cluster 55 (n=129)
------------------------------------------------------------
Nearest to centroid: is brought back, in return, on return, coming back, bloomer in return, having returned, returning wise, american spies returned, returning, made a return

============================================================
Agglomerative k=250 — Cluster 56 (n=48)
------------------------------------------------------------
Nearest to centroid: twisty, twist, after twisting, twists, in twist, to twist, is twisted, twisting, to do the twist, twist of

============================================================
Agglomerative k=250 — Cluster 57 (n=55)
------------------------------------------------------------
Nearest to centroid: is found out, found, found out, find, to be discovered, discovered, being discovered in, when found out, discovered in, found in

============================================================
Agglomerative k=250 — Cluster 58 (n=51)
------------------------------------------------------------
Nearest to centroid: moving to get, to move, on moving, to be moving, moving, move, make a move, is moved, when moving, is moving

============================================================
Agglomerative k=250 — Cluster 59 (n=64)
------------------------------------------------------------
Nearest to centroid: astonishingly, a staggering, eccentricity, eccentrically, staggering, surprised, expression of surprise about, surprising, eccentric, exceptionally

============================================================
Agglomerative k=250 — Cluster 60 (n=61)
------------------------------------------------------------
Nearest to centroid: being set up, set up, setting, to set, to be set up, having to set, setting up, to set up, setting for, set right

============================================================
Agglomerative k=250 — Cluster 61 (n=34)
------------------------------------------------------------
Nearest to centroid: disciplined, trained, discipline, being trained, with training, in training, trained for, must be trained, when trained, for training

============================================================
Agglomerative k=250 — Cluster 62 (n=50)
------------------------------------------------------------
Nearest to centroid: threshed, getting toppled, sloshed, roughed up, thrashed, to be roughed up, roughing up, lashed, ripped up, remoulded

============================================================
Agglomerative k=250 — Cluster 63 (n=25)
------------------------------------------------------------
Nearest to centroid: decrypted, contorted, convoluted, deciphered, contrived, deciphering, contrive, decipher, conflation, conjured up

============================================================
Agglomerative k=250 — Cluster 64 (n=135)
------------------------------------------------------------
Nearest to centroid: abominable, dreadfully, appalling, appallingly, diabolically, awful, ominous, disgracefully, outrageous, deplorably

============================================================
Agglomerative k=250 — Cluster 65 (n=69)
------------------------------------------------------------
Nearest to centroid: put another way, a different way, other, elsewhere, in another way, in a different way, to put it another way, another way, put differently, in a different format

============================================================
Agglomerative k=250 — Cluster 66 (n=92)
------------------------------------------------------------
Nearest to centroid: it can be seen travelling round, travelling around, tripping, itinerant, after travelling, gone travelling, travelling about, roamed, travels around, roaming around

============================================================
Agglomerative k=250 — Cluster 67 (n=98)
------------------------------------------------------------
Nearest to centroid: being troubled, challenged, to be troubled, becoming troublesome, when in distress, made troublesome, when in difficulties, being troubled by, when troubled, when in trouble

============================================================
Agglomerative k=250 — Cluster 68 (n=54)
------------------------------------------------------------
Nearest to centroid: to embody, embodying, that embodies, embodies, encasing, to encapsulate, to enclose, encapsulating, enclosure, enclosing

============================================================
Agglomerative k=250 — Cluster 69 (n=86)
------------------------------------------------------------
Nearest to centroid: discombobulated, decayed, defunct, dilapidated, desecrated, disfigurement, decay, discomposed, impairment, degenerated

============================================================
Agglomerative k=250 — Cluster 70 (n=82)
------------------------------------------------------------
Nearest to centroid: could do for, could make one, could be made, could make you, it can make, may make you, may make, could make one build, might make it, could also make

============================================================
Agglomerative k=250 — Cluster 71 (n=39)
------------------------------------------------------------
Nearest to centroid: impaled by, getting snagged by, quaffed by, clutched by, having impaled, shafted by, tethered by, impounded in, shackled by, ambushed by

============================================================
Agglomerative k=250 — Cluster 72 (n=30)
------------------------------------------------------------
Nearest to centroid: recollecting, in recollection, recollected, being recollected, recollection, recollection of, to recollect, recollect, in hindsight, in retrospect

============================================================
Agglomerative k=250 — Cluster 73 (n=61)
------------------------------------------------------------
Nearest to centroid: intrinsic to, integral to, insider, intrinsic, insiders, inherent, inset, inherent to, inherently, intrinsically

============================================================
Agglomerative k=250 — Cluster 74 (n=30)
------------------------------------------------------------
Nearest to centroid: bursting, burst, bursts, made to burst, bursts into, to burst, to blow up, burst out, bust, blowout

============================================================
Agglomerative k=250 — Cluster 75 (n=91)
------------------------------------------------------------
Nearest to centroid: a piece of, a portion of, one part so, a piece to, portion of, parts of, share of, a tissue of, piece of, a part

============================================================
Agglomerative k=250 — Cluster 76 (n=32)
------------------------------------------------------------
Nearest to centroid: being thrown around, throwing, throws, toss of, thrown, tossing, tossing and turning, tossed, thrown on, when thrown

============================================================
Agglomerative k=250 — Cluster 77 (n=37)
------------------------------------------------------------
Nearest to centroid: unravelled, unloaded, unleashed, can be unravelled, unfolded, unwrapped, unravels, unloaded from, unhooked, uncoiled

============================================================
Agglomerative k=250 — Cluster 78 (n=87)
------------------------------------------------------------
Nearest to centroid: needs adapting, adjustment needed, tweaked, needing correction, needing adjustment, for an adjustment, adjustments made, needs adjusting, with adjustments made, must be adjusted

============================================================
Agglomerative k=250 — Cluster 79 (n=108)
------------------------------------------------------------
Nearest to centroid: being entertained by, fired up, to attract, aroused, that would excite, quite excited, entertained by, enjoys, to be impressed, miffed

============================================================
Agglomerative k=250 — Cluster 80 (n=99)
------------------------------------------------------------
Nearest to centroid: furled, dappled, coiled, jolted, jellied, daubed, gnarled, tailed, soiled, doodled

============================================================
Agglomerative k=250 — Cluster 81 (n=48)
------------------------------------------------------------
Nearest to centroid: restless, reckless, being reckless, lawless, restlessly, nameless, headless, shapeless, gutless, leaderless

============================================================
Agglomerative k=250 — Cluster 82 (n=43)
------------------------------------------------------------
Nearest to centroid: outspoken, being read aloud, uttered, offered out loud, utter, read out loud, read aloud, spoken out loud, being read out, loudly demanding

============================================================
Agglomerative k=250 — Cluster 83 (n=89)
------------------------------------------------------------
Nearest to centroid: misapplying, misapplied, mislaid, misguidedly, mislaying the odds, misplacing, misfiring, misfit, oddly mislaid, being miscast

============================================================
Agglomerative k=250 — Cluster 84 (n=36)
------------------------------------------------------------
Nearest to centroid: melting, melt, melts, melted, meltingly, matted, to mush, mould, juiced, moulded

============================================================
Agglomerative k=250 — Cluster 85 (n=33)
------------------------------------------------------------
Nearest to centroid: goes off retreating, retraction, refracted, to retract, retracted, retreat, retreating, retreats, retreated, to retreat

============================================================
Agglomerative k=250 — Cluster 86 (n=69)
------------------------------------------------------------
Nearest to centroid: rears, for speech, used in speech, in speaking, in pronouncing, pronouncement, in speech, speech, of speech, pronounced for some

============================================================
Agglomerative k=250 — Cluster 87 (n=45)
------------------------------------------------------------
Nearest to centroid: palming, pliant, plying, plugging, plaits, pelting, plaited, plied, plastered, clogging

============================================================
Agglomerative k=250 — Cluster 88 (n=41)
------------------------------------------------------------
Nearest to centroid: delusional, perversion, pervert, perverted, found to be perverted, devilish, delirious, pervertedly, deliriously, devious

============================================================
Agglomerative k=250 — Cluster 89 (n=60)
------------------------------------------------------------
Nearest to centroid: based in, in station, based around, based, attendant in, established within, lands in, got about, established around, takes place in

============================================================
Agglomerative k=250 — Cluster 90 (n=21)
------------------------------------------------------------
Nearest to centroid: deftly, teasingly, derisively, teased, being teased, when teased, loosely, teaser, loony, erring

============================================================
Agglomerative k=250 — Cluster 91 (n=48)
------------------------------------------------------------
Nearest to centroid: with exercising, for exercising, for exercise, exercise, after cycling, exercising, exercised, exercises, after exercising, working out

============================================================
Agglomerative k=250 — Cluster 92 (n=15)
------------------------------------------------------------
Nearest to centroid: busy, active, hyperactive, action, activity, actively, being busy, lively, toy, factor

============================================================
Agglomerative k=250 — Cluster 93 (n=32)
------------------------------------------------------------
Nearest to centroid: finishing prematurely, held back by, stopped short, coming to a premature stop, early closing, to finish early, held back in, held back, when held back, to hold back

============================================================
Agglomerative k=250 — Cluster 94 (n=66)
------------------------------------------------------------
Nearest to centroid: originally ordered, ordering, as ordered, in order, after ordering, orderly, ordered off, when ordered, order of, being ordered otherwise

============================================================
Agglomerative k=250 — Cluster 95 (n=33)
------------------------------------------------------------
Nearest to centroid: to appear around, being seen in, to appear, seen during, to be spotted amidst, in sight amid, seen among, look in, seen around in, observed around

============================================================
Agglomerative k=250 — Cluster 96 (n=72)
------------------------------------------------------------
Nearest to centroid: in, in its, in opening of, in which, in company of, on the case of, in the course of, in the, in presence of, in affray

============================================================
Agglomerative k=250 — Cluster 97 (n=23)
------------------------------------------------------------
Nearest to centroid: after a tidy up, to be cleaned up, tidied up, gets tidied, tidied, clean bra, tidy up, cleaned, tidying up, wash up

============================================================
Agglomerative k=250 — Cluster 98 (n=22)
------------------------------------------------------------
Nearest to centroid: rancid, rages, rhapsody, rave, rogue, rash, ragtag, renegade, raging, rabid

============================================================
Agglomerative k=250 — Cluster 99 (n=65)
------------------------------------------------------------
Nearest to centroid: hysterically, hysterical, fevered, feverishly, feverish, freaks out, nutty, fever, frenziedly, crazed

============================================================
Agglomerative k=250 — Cluster 100 (n=74)
------------------------------------------------------------
Nearest to centroid: showing content, displaying, showing, displayed by, shown by, displayed in, displaying in, to show, displays in, shown in

============================================================
Agglomerative k=250 — Cluster 101 (n=95)
------------------------------------------------------------
Nearest to centroid: taken every so often, regularly substituting, regularly citing, regularly ascending, score regularly, took regularly, regularly throughout, consistently, regularly, regularly taken

============================================================
Agglomerative k=250 — Cluster 102 (n=70)
------------------------------------------------------------
Nearest to centroid: gets played, to be played, portrayed in play, play of, plays, play, played, is playable, having played, for play

============================================================
Agglomerative k=250 — Cluster 103 (n=72)
------------------------------------------------------------
Nearest to centroid: encircled by, surrounded, surrounded by, framed by, wrapped by, enveloped by, shrouded in, sheathed by, bordered by, enveloped in

============================================================
Agglomerative k=250 — Cluster 104 (n=54)
------------------------------------------------------------
Nearest to centroid: clinching, clinches, clenched, clenches, clenching, clutches, pricked by, clutching, as inch is to chin, has squeezed

============================================================
Agglomerative k=250 — Cluster 105 (n=71)
------------------------------------------------------------
Nearest to centroid: turn to, turned to, turned, gets turned, stomach turning, in turn, turning, turning turtle, having turned, after turning

============================================================
Agglomerative k=250 — Cluster 106 (n=36)
------------------------------------------------------------
Nearest to centroid: fought, battles, in a fight, battle, characters fighting, fighting, in battle, at war, when warring, in the wars

============================================================
Agglomerative k=250 — Cluster 107 (n=66)
------------------------------------------------------------
Nearest to centroid: some, some in, portrayed in some, view some, some of, some from, one in, a form of, has some, some characters for

============================================================
Agglomerative k=250 — Cluster 108 (n=48)
------------------------------------------------------------
Nearest to centroid: when fixing, for fixing, caused by fixing, fix, fixing, being fixed, fixup, to fix, for repair, should be fixed

============================================================
Agglomerative k=250 — Cluster 109 (n=24)
------------------------------------------------------------
Nearest to centroid: flees, adrift, fleeing, to drift off, duck escaping, drifting away, escapes, driven away, escaped, escaping

============================================================
Agglomerative k=250 — Cluster 110 (n=62)
------------------------------------------------------------
Nearest to centroid: taken, take, take on, take in, being taken in, being taken up, takes, taking up, taken up, taken in

============================================================
Agglomerative k=250 — Cluster 111 (n=92)
------------------------------------------------------------
Nearest to centroid: could turn out to be, may turn out to be, may be in, can be a source of, may be, may lead to, may turn out, might be, may result in, can be

============================================================
Agglomerative k=250 — Cluster 112 (n=53)
------------------------------------------------------------
Nearest to centroid: it sounds, recording of, sounds as if, the sound, does that sound like, in audio, sounds here, sounds like, what sounds like vegetables, they sound like

============================================================
Agglomerative k=250 — Cluster 113 (n=59)
------------------------------------------------------------
Nearest to centroid: getting doctored, doctor, gets treated, doctoring, to doctor, doctors, being treated, healing, treating, to receive treatment

============================================================
Agglomerative k=250 — Cluster 114 (n=56)
------------------------------------------------------------
Nearest to centroid: provided by, provided, providing, provided in, donated by, provides, provide, given in, donned by, to provide

============================================================
Agglomerative k=250 — Cluster 115 (n=25)
------------------------------------------------------------
Nearest to centroid: fenced by, fencing, fences off, to fence, to fence in, to insulate, seal, sealed by, to foil, sealing

============================================================
Agglomerative k=250 — Cluster 116 (n=18)
------------------------------------------------------------
Nearest to centroid: coming across, comes across, appearance through, seething, encountered in, seen through, seen across, bumped into, bumping into, seen over

============================================================
Agglomerative k=250 — Cluster 117 (n=22)
------------------------------------------------------------
Nearest to centroid: backer, that backs, backed, to support, backing, given backing, having backed, backed in, backing in, supports

============================================================
Agglomerative k=250 — Cluster 118 (n=51)
------------------------------------------------------------
Nearest to centroid: slammed, to crush, crushing, blasted, being crushed, crush, crushed, having crushed, crushes, to smash

============================================================
Agglomerative k=250 — Cluster 119 (n=61)
------------------------------------------------------------
Nearest to centroid: flapping, flicks, flash, flicked, flickering, flitting, flailing, flares, flaps, flashing

============================================================
Agglomerative k=250 — Cluster 120 (n=39)
------------------------------------------------------------
Nearest to centroid: presses, hammers, hemmed, to tackle, hamper, hemming in, hemming, hellish, to press, introduce pressure to

============================================================
Agglomerative k=250 — Cluster 121 (n=69)
------------------------------------------------------------
Nearest to centroid: a new, new, get new, anew, the new, a new formula for, new collection, nouveau, in new compilation, in new ensemble

============================================================
Agglomerative k=250 — Cluster 122 (n=55)
------------------------------------------------------------
Nearest to centroid: rum, rims, roams, rack, robes, rink, rends, rock, reels, to rock

============================================================
Agglomerative k=250 — Cluster 123 (n=28)
------------------------------------------------------------
Nearest to centroid: to be redeveloped, being recycled, redeveloped, for redevelopment, redevelopment, redevelop, what redevelopment could make, redeveloping, to replant, for recycling

============================================================
Agglomerative k=250 — Cluster 124 (n=24)
------------------------------------------------------------
Nearest to centroid: call, calls, calling, to call up, amplifying call, called, phoney, by phone, called in by, called for

============================================================
Agglomerative k=250 — Cluster 125 (n=90)
------------------------------------------------------------
Nearest to centroid: transmutational, transmutation, transmuted, transplanted, to be converted, converted, transforming, transfixing, transform, convertible

============================================================
Agglomerative k=250 — Cluster 126 (n=65)
------------------------------------------------------------
Nearest to centroid: as reported, reported by, according to report, claimed, allegedly, claims, is reported, according to announcement, claim, claiming

============================================================
Agglomerative k=250 — Cluster 127 (n=16)
------------------------------------------------------------
Nearest to centroid: interred in, will be buried in, interred by, buried, buried in, buries, buried into, buried by, has buried, and buried in

============================================================
Agglomerative k=250 — Cluster 128 (n=31)
------------------------------------------------------------
Nearest to centroid: having abused, exploited, for abusing, exploitation of, exploiting, being abused, for abuse, abused, abuse, abusing

============================================================
Agglomerative k=250 — Cluster 129 (n=99)
------------------------------------------------------------
Nearest to centroid: suppress, represses, to suppress, repressing, impeded by, suppressed, suppressing, initially suppressed, inhibited by, suppresses

============================================================
Agglomerative k=250 — Cluster 130 (n=65)
------------------------------------------------------------
Nearest to centroid: getting out of, out of it, away with the fairies, out of joint, away from, offset, go off, getting off, off, coming off

============================================================
Agglomerative k=250 — Cluster 131 (n=31)
------------------------------------------------------------
Nearest to centroid: whipping up, stirring, stirred, stirred up, whisked up, whipped, stirring it up, when stirred up, spurning, whipped up

============================================================
Agglomerative k=250 — Cluster 132 (n=15)
------------------------------------------------------------
Nearest to centroid: to absorb, absorbed by, having absorbed, absorbed in, absorbed, absorb, is absorbed in, are absorbing, absorbing time, absorbs

============================================================
Agglomerative k=250 — Cluster 133 (n=58)
------------------------------------------------------------
Nearest to centroid: northerly, northern, leaning towards the west, to the north, northwards, headed north, coming north, northbound, heading north, to north

============================================================
Agglomerative k=250 — Cluster 134 (n=37)
------------------------------------------------------------
Nearest to centroid: crumble, crumbles, cramps, scrabble, scrambled, is scrambled, scramble, scrambled up, scrambling, crumbs

============================================================
Agglomerative k=250 — Cluster 135 (n=58)
------------------------------------------------------------
Nearest to centroid: so the saying goes, i say, as they say, as the saying goes, it says, as satnav says, as one says, saying, say, said

============================================================
Agglomerative k=250 — Cluster 136 (n=56)
------------------------------------------------------------
Nearest to centroid: after accident, following smash, after smash, after crash, in accident, after crashing, accident, after injury, having accident, crashed

============================================================
Agglomerative k=250 — Cluster 137 (n=41)
------------------------------------------------------------
Nearest to centroid: contains, accounts for, comprising, included by, included in, comprising some, contain, inclusion of, but including, that includes

============================================================
Agglomerative k=250 — Cluster 138 (n=37)
------------------------------------------------------------
Nearest to centroid: controlled by, control, under control, to control, controls, for control, governing, controlling, managed, administration

============================================================
Agglomerative k=250 — Cluster 139 (n=29)
------------------------------------------------------------
Nearest to centroid: at the heart of, in the heart of, at the centre of, in the centre of, central role, heart of, centrally, central part of, central, focus of

============================================================
Agglomerative k=250 — Cluster 140 (n=35)
------------------------------------------------------------
Nearest to centroid: mess up, messed with, mess about, mess with, being messed about, messed, making a mess of, i mess about when lapsed, making a mess, messing with

============================================================
Agglomerative k=250 — Cluster 141 (n=64)
------------------------------------------------------------
Nearest to centroid: somewhat, a bit, just a bit, partially, to some extent, approximately, roughly, just over half, almost recognise, nearly

============================================================
Agglomerative k=250 — Cluster 142 (n=80)
------------------------------------------------------------
Nearest to centroid: comes up, come up, coming up beneath it, to come up, finally turning up, came up, he turns up, having turned up, will come up, when up

============================================================
Agglomerative k=250 — Cluster 143 (n=26)
------------------------------------------------------------
Nearest to centroid: retirement, to retire, for retirement, retired chap, to be retired, upon retirement, after retirement, following retirement, retired peer, being retired

============================================================
Agglomerative k=250 — Cluster 144 (n=40)
------------------------------------------------------------
Nearest to centroid: to separate, to splinter, to split up, separating, separation, to split, splitting, as a result of splitting up, separates, must separate

============================================================
Agglomerative k=250 — Cluster 145 (n=23)
------------------------------------------------------------
Nearest to centroid: switched around, to shift about, switches, switch, switched, shifted from, to shift, switched over, shifted, shifting

============================================================
Agglomerative k=250 — Cluster 146 (n=77)
------------------------------------------------------------
Nearest to centroid: missing in odd places, needing to miss a, disappears, oddly going missing, to get lost, oddly disappears, to be losing heart, missing the queen, gets lost, having got lost

============================================================
Agglomerative k=250 — Cluster 147 (n=52)
------------------------------------------------------------
Nearest to centroid: having grabbed, grabbing, grabbed with, grabbed, being grabbed, getting grabbed by, having grasped, grab, when grabbed by, grasped

============================================================
Agglomerative k=250 — Cluster 148 (n=13)
------------------------------------------------------------
Nearest to centroid: tucked into, tucking into, is to tuck into, tucking in, tail tucked in, to tuck in, tucked in, tucking, to tuck into, tuck into

============================================================
Agglomerative k=250 — Cluster 149 (n=36)
------------------------------------------------------------
Nearest to centroid: reject, being rejected, snubbed, declined, for rejection, oddly rejected, snubbing, and rejected, rejected, rebuffed

============================================================
Agglomerative k=250 — Cluster 150 (n=29)
------------------------------------------------------------
Nearest to centroid: for knitting, after knitting, after embroidery, to embroider, interweaving, quilt, to stitch, weaved, weave, weaving

============================================================
Agglomerative k=250 — Cluster 151 (n=29)
------------------------------------------------------------
Nearest to centroid: to be served up, to serve rum, getting served up, serving, serving up, served up as, will be served up, served up some, being served, serve up

============================================================
Agglomerative k=250 — Cluster 152 (n=30)
------------------------------------------------------------
Nearest to centroid: coming out, having exported, for coming out, first to come out, having released, first to go, to stand outside, standing outside, one coming out, coming out around

============================================================
Agglomerative k=250 — Cluster 153 (n=16)
------------------------------------------------------------
Nearest to centroid: to be invested, getting invested in, invested, invested in, investing in, invest in, to invest in, invests in, to be invested in, invested with

============================================================
Agglomerative k=250 — Cluster 154 (n=18)
------------------------------------------------------------
Nearest to centroid: spouting, shouting, crying, cry of, shouts, shouted, moaning, spouted, spat out, stifled by shout of disapproval

============================================================
Agglomerative k=250 — Cluster 155 (n=32)
------------------------------------------------------------
Nearest to centroid: is embraced by, being embraced by, having embraced, embraced by, embraces, caressed by, embraced, embracing one, to be hugged by, to embrace

============================================================
Agglomerative k=250 — Cluster 156 (n=15)
------------------------------------------------------------
Nearest to centroid: getting settled, is settled, wanting settlement, settled, somehow settled, settlement, settled for, settling, settle in, to settle

============================================================
Agglomerative k=250 — Cluster 157 (n=80)
------------------------------------------------------------
Nearest to centroid: to alter, in altered state, alter, alteration, for alteration, for a change, affected by change, changes, to change, change of

============================================================
Agglomerative k=250 — Cluster 158 (n=30)
------------------------------------------------------------
Nearest to centroid: mangled, manacles, to get in a tangle, tangle, tangles, bungled, in a tangle, to tangle, tangle of, tangling

============================================================
Agglomerative k=250 — Cluster 159 (n=10)
------------------------------------------------------------
Nearest to centroid: is represented, representing, representation, letters represented, represented as, representation of, represent, being represented, novel representation, represented

============================================================
Agglomerative k=250 — Cluster 160 (n=31)
------------------------------------------------------------
Nearest to centroid: planning of, prepare, plan, plot, planning, being prepared, in preparation, being planned, is planned, readied

============================================================
Agglomerative k=250 — Cluster 161 (n=29)
------------------------------------------------------------
Nearest to centroid: eradicated, liquidated, to eradicate, quashed, for demolition, erased, demolished, in liquidation, being demolished, squandering

============================================================
Agglomerative k=250 — Cluster 162 (n=94)
------------------------------------------------------------
Nearest to centroid: being involved in, got involved, being involved, to be involved in, involvement in, engaged by, involved, involving church, involvement, implicated in

============================================================
Agglomerative k=250 — Cluster 163 (n=41)
------------------------------------------------------------
Nearest to centroid: being let loose, to be let loose, breaking loose, get loose, becomes loose, loosening, freed from, made loose, loosened, loosening up

============================================================
Agglomerative k=250 — Cluster 164 (n=9)
------------------------------------------------------------
Nearest to centroid: to comprehend, comprehended, comprehending, comprehends that, comprehended by, comprehend, comprehends, discerned, determined

============================================================
Agglomerative k=250 — Cluster 165 (n=39)
------------------------------------------------------------
Nearest to centroid: foolishly, foolish, stupidly, fools, stupidly put, is stupid, being stupid, looking silly, stupid, looks silly

============================================================
Agglomerative k=250 — Cluster 166 (n=38)
------------------------------------------------------------
Nearest to centroid: knocked over, knocked to the left, knocked for six, knocked about, when knocked over, being knocked over, getting knocked over, knocked around, gets knocked over, knocked out

============================================================
Agglomerative k=250 — Cluster 167 (n=33)
------------------------------------------------------------
Nearest to centroid: when boarding, going aboard, aboard, on board, when at sea, to get aboard, landing aboard, afloat, boarding, at sea

============================================================
Agglomerative k=250 — Cluster 168 (n=35)
------------------------------------------------------------
Nearest to centroid: residents of, inhabiting, housed by, is home to, resident, occupant of, housed in, when inhabiting, some occupants of, inhabits

============================================================
Agglomerative k=250 — Cluster 169 (n=37)
------------------------------------------------------------
Nearest to centroid: not last, not the first, not new, not first, not originally, not for the most part, not initially, not initially romantic, not all, not altogether

============================================================
Agglomerative k=250 — Cluster 170 (n=23)
------------------------------------------------------------
Nearest to centroid: out of sequence, out of character, out of tune, out of sorts, is out of place, out of place, being out of tune, when out of sorts, out of shape, out of sync

============================================================
Agglomerative k=250 — Cluster 171 (n=19)
------------------------------------------------------------
Nearest to centroid: inviting, must welcome, welcomed by, to welcome, being welcomed by, welcomed, welcome, being welcomed in, welcomes, welcomed in

============================================================
Agglomerative k=250 — Cluster 172 (n=26)
------------------------------------------------------------
Nearest to centroid: to bring in, brought in by, brought in, brought in to, gets brought in, brings in, bringing in, which one must bring in, brought, brought into

============================================================
Agglomerative k=250 — Cluster 173 (n=24)
------------------------------------------------------------
Nearest to centroid: filled in, filled, filled out by, to be filled with, fills, full of, filling of, for filling, fill, filled by

============================================================
Agglomerative k=250 — Cluster 174 (n=17)
------------------------------------------------------------
Nearest to centroid: emission, emitting, apart, eclipsing, exuding, emitted, excreting, ejecting, eclipses, exuded

============================================================
Agglomerative k=250 — Cluster 175 (n=35)
------------------------------------------------------------
Nearest to centroid: when developing, to develop, to be developed, being developed, development, for development, for developing, evolved, develop, on developing

============================================================
Agglomerative k=250 — Cluster 176 (n=33)
------------------------------------------------------------
Nearest to centroid: for styling, fashioned, styling of, styled, pattern, fashionably, patterned, fashioning, tailoring, fashionable

============================================================
Agglomerative k=250 — Cluster 177 (n=29)
------------------------------------------------------------
Nearest to centroid: to check, i study a, checking, probing, checked by, test, probe, probed by, having to check, probes

============================================================
Agglomerative k=250 — Cluster 178 (n=28)
------------------------------------------------------------
Nearest to centroid: scribbled, having penned, written in the, writhing, penned, scribbling, attack written up, written in, to write up, written

============================================================
Agglomerative k=250 — Cluster 179 (n=28)
------------------------------------------------------------
Nearest to centroid: mock, funnily, mockery of, funnily enough, ludicrously, laughable, hilariously, funny, comically, found funny

============================================================
Agglomerative k=250 — Cluster 180 (n=14)
------------------------------------------------------------
Nearest to centroid: doing an about face, faced, getting framed in, to face, framed, frames, framing, photoshopped, facing, surfaces

============================================================
Agglomerative k=250 — Cluster 181 (n=21)
------------------------------------------------------------
Nearest to centroid: contributing to, making contribution, makes a contribution to, contribute to, contributing in, making contribution to, added to, contributed to, contribution to, contribution from

============================================================
Agglomerative k=250 — Cluster 182 (n=46)
------------------------------------------------------------
Nearest to centroid: after alteration, after modifying, after changes, after adjustment, after an exchange, before and after, after replacement, after changes made, after metamorphosis, after changeover

============================================================
Agglomerative k=250 — Cluster 183 (n=74)
------------------------------------------------------------
Nearest to centroid: drink up, drink, to drink, drinking, would drink, bottled up, drinking in, drinks, bottled in, to bottle

============================================================
Agglomerative k=250 — Cluster 184 (n=26)
------------------------------------------------------------
Nearest to centroid: killing, assassinated, slaughtered, murdered, beheaded, assassination of, crucified, being massacred, massacre, beheading

============================================================
Agglomerative k=250 — Cluster 185 (n=34)
------------------------------------------------------------
Nearest to centroid: with detonation, detonating, detonated, in explosion, exploding, explode, explosion in, exploded, to explode, explodes

============================================================
Agglomerative k=250 — Cluster 186 (n=44)
------------------------------------------------------------
Nearest to centroid: to protect, safeguards, in protection of, protected in, protective, provide protection, protect, protecting, protected, protects

============================================================
Agglomerative k=250 — Cluster 187 (n=43)
------------------------------------------------------------
Nearest to centroid: suffering disruption, disrupts, being disrupted, disruption, disrupted, to disrupt, disrupting, disruptive, being disruptive, is disrupted

============================================================
Agglomerative k=250 — Cluster 188 (n=35)
------------------------------------------------------------
Nearest to centroid: crossing borders, crossing, crosses, crossed, to cross, that crosses, after crossing, when crossing, crossing over, traversed by

============================================================
Agglomerative k=250 — Cluster 189 (n=33)
------------------------------------------------------------
Nearest to centroid: varying, with variety, variety, variegated, a variety, varies, at variance, in variety, with variations, appearing in variety

============================================================
Agglomerative k=250 — Cluster 190 (n=21)
------------------------------------------------------------
Nearest to centroid: planted in, cultivated, in which to plant, cultivated in, cultivating, for cultivating, cultivate, agricultural, for cultivation, cultivation

============================================================
Agglomerative k=250 — Cluster 191 (n=30)
------------------------------------------------------------
Nearest to centroid: from the bottom, from below, viewed from the south, seen from below, drawn from centre, viewed from behind, pulled from the rear, view from rear, as seen from behind, seen from behind

============================================================
Agglomerative k=250 — Cluster 192 (n=61)
------------------------------------------------------------
Nearest to centroid: banged up, beat, getting beaten up, being beaten up, beating, beaten up, beat it, get beaten, has beaten, beaten

============================================================
Agglomerative k=250 — Cluster 193 (n=37)
------------------------------------------------------------
Nearest to centroid: being upset, upset some, to upset, get upset, to be upset, getting all upset, should be upset, newspaper upset, upset girl, however upset

============================================================
Agglomerative k=250 — Cluster 194 (n=57)
------------------------------------------------------------
Nearest to centroid: blend of, hybrid, mix, in a mixture, mixed together, mixes, mixes with, mixture, hybridised, to mix

============================================================
Agglomerative k=250 — Cluster 195 (n=72)
------------------------------------------------------------
Nearest to centroid: concealing, concealed in, being hidden, camouflaged, to conceal it, hidden, hid, concealed presence of, concealed, conceals

============================================================
Agglomerative k=250 — Cluster 196 (n=43)
------------------------------------------------------------
Nearest to centroid: to revolt, revolting, revolts, revolt, in revolt, rebels, rioted, revolted, rioting by, totally revolting

============================================================
Agglomerative k=250 — Cluster 197 (n=53)
------------------------------------------------------------
Nearest to centroid: to work in, needing to work, wanting work, for work, at work, with working, in work, having worked, work, worked on

============================================================
Agglomerative k=250 — Cluster 198 (n=12)
------------------------------------------------------------
Nearest to centroid: fallen over, after falling over, fall of, having fallen, fall over, fallen, to fall over, falling over, falls, falls in

============================================================
Agglomerative k=250 — Cluster 199 (n=14)
------------------------------------------------------------
Nearest to centroid: open to, having opened, opens, open, has opened, opened by, opening, to open, open out, when opening

============================================================
Agglomerative k=250 — Cluster 200 (n=22)
------------------------------------------------------------
Nearest to centroid: a form, form, forms, formation, constituting, formulation, in irregular formation, formulated, does somersault, do a somersault

============================================================
Agglomerative k=250 — Cluster 201 (n=20)
------------------------------------------------------------
Nearest to centroid: to resolve, being solved, resolution of, to be resolved, having been resolved, solve, resolution, may be resolved, for resolution, resolving

============================================================
Agglomerative k=250 — Cluster 202 (n=37)
------------------------------------------------------------
Nearest to centroid: going badly, for poor service, is poor, poor, goes bad, going bad, poorly, poorly expressed, for the worse, coming out badly

============================================================
Agglomerative k=250 — Cluster 203 (n=36)
------------------------------------------------------------
Nearest to centroid: riddled, fiddled, after fiddling, fidgeted, raddled, fiddled with, fidget, addled, fiddling, fidgeting

============================================================
Agglomerative k=250 — Cluster 204 (n=64)
------------------------------------------------------------
Nearest to centroid: having no, brooking no, showing no, has no, no parking, getting nothing even, not seen, no energy, no time for, zero

============================================================
Agglomerative k=250 — Cluster 205 (n=17)
------------------------------------------------------------
Nearest to centroid: sent, get sent out, sent out, sent in to, sent up, sent skyward, being sent up, sent flying, getting sent up, sent north

============================================================
Agglomerative k=250 — Cluster 206 (n=25)
------------------------------------------------------------
Nearest to centroid: pounds, pound, pulp, poles, politico, pockets, policed, pocketed, police, pocketing

============================================================
Agglomerative k=250 — Cluster 207 (n=16)
------------------------------------------------------------
Nearest to centroid: with reflection, in reflection, reflected, reflects, on reflection, to reflect, reflection, mirror, reflective, reflection to

============================================================
Agglomerative k=250 — Cluster 208 (n=15)
------------------------------------------------------------
Nearest to centroid: can withdraw, undergoing withdrawal, to withdraw, withdraw, suffers withdrawal, withdrawal, withdrawn, withdrawing, withdrawn for example, withdraws

============================================================
Agglomerative k=250 — Cluster 209 (n=65)
------------------------------------------------------------
Nearest to centroid: cooking, casserole, cook up, cookery, needs cooking, cooks, cook, get cooked, getting cooked, being cooked

============================================================
Agglomerative k=250 — Cluster 210 (n=52)
------------------------------------------------------------
Nearest to centroid: bouncing around, bouncing, when bustling about, clowning, swirling about, slings, bouncing back, shuffling, swarming, swirling around

============================================================
Agglomerative k=250 — Cluster 211 (n=50)
------------------------------------------------------------
Nearest to centroid: to accept, being accepted, to be accepted, to admit, accepted by, accepting, having to accept, accepted, will accept, accepted in

============================================================
Agglomerative k=250 — Cluster 212 (n=28)
------------------------------------------------------------
Nearest to centroid: making use of, use, used about, used, using up, used for, used by, used in, uses, used on

============================================================
Agglomerative k=250 — Cluster 213 (n=60)
------------------------------------------------------------
Nearest to centroid: being arranged, arrange for, get arranged, after arranging, having arranged, after arrangement, having been organised, for arrangement, arrange, when arranged

============================================================
Agglomerative k=250 — Cluster 214 (n=57)
------------------------------------------------------------
Nearest to centroid: infiltrates, infiltrated, infesting, entrapped, impounding, impregnates, impregnating, infiltrating, infiltrated by, inhales

============================================================
Agglomerative k=250 — Cluster 215 (n=31)
------------------------------------------------------------
Nearest to centroid: covering, coverage, the cover of, is covered by, when covered in, covered, cover, for the cover, covered by, cover for

============================================================
Agglomerative k=250 — Cluster 216 (n=23)
------------------------------------------------------------
Nearest to centroid: jockey, jackal, junk, jerk, jerks, judders, jazz, jockeys, to jockey, jar

============================================================
Agglomerative k=250 — Cluster 217 (n=73)
------------------------------------------------------------
Nearest to centroid: squiggly, squidgy, scruffy, squally, chatty, chirpy, splotchy, sloppily, scatty, squiffy

============================================================
Agglomerative k=250 — Cluster 218 (n=21)
------------------------------------------------------------
Nearest to centroid: plugged in, getting to, logged, connected by, accessing, had, in which you get, in which to get, will be in, having

============================================================
Agglomerative k=250 — Cluster 219 (n=11)
------------------------------------------------------------
Nearest to centroid: winding up, winds up, windswept, wind, winding in and out, needing wind, winds, windy, winding, winding back

============================================================
Agglomerative k=250 — Cluster 220 (n=39)
------------------------------------------------------------
Nearest to centroid: for improvisation, improvised, artfully, illusion, improvising, with imagination, imaginatively, artful, idea, creatively

============================================================
Agglomerative k=250 — Cluster 221 (n=25)
------------------------------------------------------------
Nearest to centroid: extract, selection from, an extract from, is extractable from this, extractable from, selections, item taken from, extracted from, extract from, extracted

============================================================
Agglomerative k=250 — Cluster 222 (n=32)
------------------------------------------------------------
Nearest to centroid: that will be broadcast, to be broadcast, for broadcast, to broadcast, broadcast of, be broadcast, broadcasts, broadcasting, gets broadcast, live

============================================================
Agglomerative k=250 — Cluster 223 (n=34)
------------------------------------------------------------
Nearest to centroid: stocked by, store for, in stock, to store, stored up, store, store in, stored, stored by, stored in

============================================================
Agglomerative k=250 — Cluster 224 (n=16)
------------------------------------------------------------
Nearest to centroid: resorted, remitted, recidivist, recoiling, resorting, relapsing, resort to, recurrence, recurring, can resort

============================================================
Agglomerative k=250 — Cluster 225 (n=17)
------------------------------------------------------------
Nearest to centroid: with the shakes, shaken about, shake, shaken, to shake, shaken all about, shaken up, shakes, when shaken, to be shaken

============================================================
Agglomerative k=250 — Cluster 226 (n=26)
------------------------------------------------------------
Nearest to centroid: distorting, diverted, distorted, distortion, diverting, diversion, diverging, after a diversion, is shown in diversion, on diversion

============================================================
Agglomerative k=250 — Cluster 227 (n=85)
------------------------------------------------------------
Nearest to centroid: swaggering, waltzing, swashbuckling, for wagging, wagging, rippling, wrinkling, wafting, wags, wrangle

============================================================
Agglomerative k=250 — Cluster 228 (n=8)
------------------------------------------------------------
Nearest to centroid: being smuggled into, smuggled therein, to smuggle in, smuggling, smuggled into, smuggled in, smuggled by, smuggling in

============================================================
Agglomerative k=250 — Cluster 229 (n=25)
------------------------------------------------------------
Nearest to centroid: failed, after failing, failed in the end, fails, failure, having failed, fail, failing, when failing, to fail

============================================================
Agglomerative k=250 — Cluster 230 (n=32)
------------------------------------------------------------
Nearest to centroid: to chew, to swallow, for chewing, chewed up, get chewed, what could chew, chew, chewed, must swallow, chewing

============================================================
Agglomerative k=250 — Cluster 231 (n=64)
------------------------------------------------------------
Nearest to centroid: baffling, puzzled, in confusion, perplexing, perplex, perplexed by, perplexedly, perplexed, confusedly, confusingly

============================================================
Agglomerative k=250 — Cluster 232 (n=66)
------------------------------------------------------------
Nearest to centroid: dropping off, dropping the bass, drops out, i being a dropout, surrendering, drops off, quits, need to drop, having dropped, two at the top must leave

============================================================
Agglomerative k=250 — Cluster 233 (n=22)
------------------------------------------------------------
Nearest to centroid: in audition, in the audience, for recital, in auditorium, giving the audience, in audience, fans, for audience, for the audience, to audience

============================================================
Agglomerative k=250 — Cluster 234 (n=47)
------------------------------------------------------------
Nearest to centroid: dipped in, dipping in, dips into, bathed in, diving in, plunged into, dipping into, plunging into, dives into, diving into

============================================================
Agglomerative k=250 — Cluster 235 (n=12)
------------------------------------------------------------
Nearest to centroid: electric, erect, elastic, express, erection, erecting, elite, energetically, excised, energetic

============================================================
Agglomerative k=250 — Cluster 236 (n=8)
------------------------------------------------------------
Nearest to centroid: after setback, setback, having suffered setback, facing setback, suffering setback, suffers setback, after a setback, partial setback

============================================================
Agglomerative k=250 — Cluster 237 (n=13)
------------------------------------------------------------
Nearest to centroid: to host, being host to, host to, having to host, hosted, host, hosted by, hosts, hosting, when hosting

============================================================
Agglomerative k=250 — Cluster 238 (n=11)
------------------------------------------------------------
Nearest to centroid: pulled up, pulling up, heeled, hauled up, hailed, pulling in, hoisted, stacked up, pushing up, pushed up

============================================================
Agglomerative k=250 — Cluster 239 (n=11)
------------------------------------------------------------
Nearest to centroid: flowing, flowing over, flows, flow, flowing round, flowing into, flows northwards, flowing west, keels over, keeling over

============================================================
Agglomerative k=250 — Cluster 240 (n=69)
------------------------------------------------------------
Nearest to centroid: going in to, going into, getting into it, getting in, coming in to, is entering, coming in to divide, got into, entering into, to come in

============================================================
Agglomerative k=250 — Cluster 241 (n=9)
------------------------------------------------------------
Nearest to centroid: stuffed in, stuffed, stuffed with, stuffed by, getting stuffed by, stuffing for, stuffed into, for stuffing, stuffing

============================================================
Agglomerative k=250 — Cluster 242 (n=78)
------------------------------------------------------------
Nearest to centroid: made complex, made compact, was made, made out, creates, made into, made, creating, generates, made in

============================================================
Agglomerative k=250 — Cluster 243 (n=8)
------------------------------------------------------------
Nearest to centroid: briefly, bit, bits, brief, in bits, in short, shortly, various bits

============================================================
Agglomerative k=250 — Cluster 244 (n=15)
------------------------------------------------------------
Nearest to centroid: in replacement, to replace, replacement, replacing, for replacing, on replacement, replacement for, replace, substitute, substituted

============================================================
Agglomerative k=250 — Cluster 245 (n=34)
------------------------------------------------------------
Nearest to centroid: improper, improperly, immature, use indecently, negligently, impure, illicitly, improperly used, illegal, illegally

============================================================
Agglomerative k=250 — Cluster 246 (n=34)
------------------------------------------------------------
Nearest to centroid: giving sanctuary to, providing shelter for, to provide home for, accommodation on the outskirts of, shelters in, lodging, providing accommodation for, lodging in, given refuge, provide hiding place for

============================================================
Agglomerative k=250 — Cluster 247 (n=29)
------------------------------------------------------------
Nearest to centroid: needed, are needed, needs, required, needed in, needed for, requires, needed by, required for, required during

============================================================
Agglomerative k=250 — Cluster 248 (n=37)
------------------------------------------------------------
Nearest to centroid: beset by, that has been gripped by, gripped by, ensnared by, engrossed by, possessed by, imbued with, entrenched by, anchored, to be besieged by

============================================================
Agglomerative k=250 — Cluster 249 (n=22)
------------------------------------------------------------
Nearest to centroid: carried around, carry, carried, carrying, has to be carried around, carried in, carries around, when carrying, carried about, to be carried by

# For HDBSCAN, inspect only the largest clusters to keep output concise.
# At most MAX_HDBSCAN_CLUSTERS_SHOWN clusters are passed to inspect_clusters.
MAX_HDBSCAN_CLUSTERS_SHOWN = 15

unique_hdbscan_labels = sorted(set(best_hdbscan_labels))
# Label -1 is excluded from the cluster count (it is treated as noise here).
n_hdbscan_clusters = sum(1 for label in unique_hdbscan_labels if label != -1)

if n_hdbscan_clusters > MAX_HDBSCAN_CLUSTERS_SHOWN:
    # Rank the non-noise clusters by size and keep the largest ones.
    cluster_sizes = pd.Series(best_hdbscan_labels[best_hdbscan_labels != -1]).value_counts()
    top_15_labels = set(cluster_sizes.head(MAX_HDBSCAN_CLUSTERS_SHOWN).index)

    # Create a filtered label array: keep the largest clusters, set rest to -1
    filtered_labels = np.where(
        np.isin(best_hdbscan_labels, list(top_15_labels)),
        best_hdbscan_labels,
        -1,
    )
    labels_to_inspect = filtered_labels

    print(f'HDBSCAN found {n_hdbscan_clusters} clusters. '
          f'Showing the {MAX_HDBSCAN_CLUSTERS_SHOWN} largest below.')

    # The relabelling above folds every hidden cluster into -1, so the
    # "Noise points" figure printed by inspect_clusters is inflated by the
    # members of those hidden clusters. State the real breakdown up front so
    # the listing is not misread as the clustering's noise level.
    n_true_noise = int(np.sum(best_hdbscan_labels == -1))
    n_hidden_points = int(np.sum(filtered_labels == -1)) - n_true_noise
    print(f'Note: the noise count below combines {n_true_noise} true noise points '
          f'with {n_hidden_points} points from the '
          f'{n_hdbscan_clusters - MAX_HDBSCAN_CLUSTERS_SHOWN} smaller clusters not shown.')
else:
    # Few enough clusters to show all of them unmodified.
    labels_to_inspect = best_hdbscan_labels

inspect_clusters(
    labels_to_inspect,
    embeddings_10d,
    indicator_names,
    method_name=f'HDBSCAN (eps={best_eps})'
)

HDBSCAN found 281 clusters. Showing the 15 largest below.

============================================================
HDBSCAN (eps=0.0) — Noise points: 11059
============================================================

============================================================
HDBSCAN (eps=0.0) — Cluster 21 (n=133)
------------------------------------------------------------
Nearest to centroid: bringing back, to return, getting back, to bring back, to come back, to return in, to get back, about to go back, coming back in, needing to go back

============================================================
HDBSCAN (eps=0.0) — Cluster 28 (n=69)
------------------------------------------------------------
Nearest to centroid: turned, turned to, gets turned, turn to, turning, turning turtle, stomach turning, having turned, after turning, in turn

============================================================
HDBSCAN (eps=0.0) — Cluster 51 (n=64)
------------------------------------------------------------
Nearest to centroid: changes, for a change, affected by change, change of, to change, for changing, to alter, change in, exhibits change, change for

============================================================
HDBSCAN (eps=0.0) — Cluster 53 (n=273)
------------------------------------------------------------
Nearest to centroid: getting reinvented, retrofit, being reselected, retuned, to be reappointed to, reinvention, redesign, being rebuilt, rebuilt, retrained

============================================================
HDBSCAN (eps=0.0) — Cluster 76 (n=70)
------------------------------------------------------------
Nearest to centroid: concealed in, concealing, being hidden, camouflaged, hid, conceals, concealed presence of, hidden, concealed, hiding key

============================================================
HDBSCAN (eps=0.0) — Cluster 102 (n=76)
------------------------------------------------------------
Nearest to centroid: becomes involved, get involved, gets involved, getting involved, being involved, to be involved in, to become involved, involving church, involvement in, getting involved in

============================================================
HDBSCAN (eps=0.0) — Cluster 130 (n=129)
------------------------------------------------------------
Nearest to centroid: going up, having gone up, celebrity on the up, goes belly up, goes up, has upturn, yields up, after an upswing, rising up, on the up

============================================================
HDBSCAN (eps=0.0) — Cluster 133 (n=81)
------------------------------------------------------------
Nearest to centroid: for us to hear, being heard, to be heard, be heard, will be listened to, hear, be listened to, overheard, listened, heard to be

============================================================
HDBSCAN (eps=0.0) — Cluster 139 (n=65)
------------------------------------------------------------
Nearest to centroid: getting in, going in to, is entering, entering into, going into, getting into it, coming in to, to come in, to get in, coming in to divide

============================================================
HDBSCAN (eps=0.0) — Cluster 140 (n=110)
------------------------------------------------------------
Nearest to centroid: insert, inlay, insert this, innards, inlaid, innards from, ingrained, inserting, inane, to incorporate

============================================================
HDBSCAN (eps=0.0) — Cluster 168 (n=95)
------------------------------------------------------------
Nearest to centroid: made complex, made compact, to generate, was made, manufactured, generates, is fabricated, to create, made in, made out

============================================================
HDBSCAN (eps=0.0) — Cluster 230 (n=122)
------------------------------------------------------------
Nearest to centroid: slovenly, shabby, shaggy, sally, shoddily, scantily, zippily, pushily, shakily, cheeky

============================================================
HDBSCAN (eps=0.0) — Cluster 253 (n=81)
------------------------------------------------------------
Nearest to centroid: waltzing, swaggering, wagging, wafting, swashbuckling, wags, wrinkling, waggling, rippling, wagged

============================================================
HDBSCAN (eps=0.0) — Cluster 258 (n=113)
------------------------------------------------------------
Nearest to centroid: strangely enough, peculiar, appears unusually, oddly enough but developed, unusually, oddly enough, to be peculiar, bizarrely illustrated, strangely, in peculiar guise

============================================================
HDBSCAN (eps=0.0) — Cluster 271 (n=82)
------------------------------------------------------------
Nearest to centroid: dreadfully, awful, abominable, atrociously, is dreadful, hideous, heinous, awfully, woefully, atrocious

# Write one CSV of HDBSCAN cluster assignments per epsilon value.
# Each file has an 'indicator' column and a 'cluster_label' column and is
# saved under DATA_DIR.
for eps, labels in hdbscan_labels_dict.items():
    # Four-decimal epsilon with '.' swapped for 'p', e.g. 0.1500 -> 0p1500
    eps_str = format(eps, '.4f').replace('.', 'p')
    fname = f'cluster_labels_hdbscan_eps_{eps_str}.csv'

    out_df = pd.DataFrame({'indicator': indicator_names, 'cluster_label': labels})
    out_df.to_csv(DATA_DIR / fname, index=False)

    print(f'Saved: {fname}')

Saved: cluster_labels_hdbscan_eps_0p0000.csv

Saved: cluster_labels_hdbscan_eps_0p2140.csv
Saved: cluster_labels_hdbscan_eps_0p4280.csv
Saved: cluster_labels_hdbscan_eps_0p6420.csv
Saved: cluster_labels_hdbscan_eps_0p7788.csv

Saved: cluster_labels_hdbscan_eps_0p8560.csv
Saved: cluster_labels_hdbscan_eps_1p0700.csv
Saved: cluster_labels_hdbscan_eps_1p2840.csv

Saved: cluster_labels_hdbscan_eps_1p4980.csv

Saved: cluster_labels_hdbscan_eps_1p9327.csv
Saved: cluster_labels_hdbscan_eps_2p2334.csv
Saved: cluster_labels_hdbscan_eps_2p4729.csv

Saved: cluster_labels_hdbscan_eps_2p6847.csv

# Write one CSV of agglomerative cluster assignments per value of k.
# Each file has an 'indicator' column and a 'cluster_label' column and is
# saved under DATA_DIR.
for k, labels in agglo_labels_dict.items():
    fname = f'cluster_labels_agglo_k{k}.csv'

    out_df = pd.DataFrame({'indicator': indicator_names, 'cluster_label': labels})
    out_df.to_csv(DATA_DIR / fname, index=False)

    print(f'Saved: {fname}')

Saved: cluster_labels_agglo_k4.csv
Saved: cluster_labels_agglo_k6.csv

Saved: cluster_labels_agglo_k8.csv
Saved: cluster_labels_agglo_k9.csv

Saved: cluster_labels_agglo_k10.csv
Saved: cluster_labels_agglo_k11.csv
Saved: cluster_labels_agglo_k12.csv

Saved: cluster_labels_agglo_k16.csv
Saved: cluster_labels_agglo_k20.csv

Saved: cluster_labels_agglo_k26.csv
Saved: cluster_labels_agglo_k34.csv

Saved: cluster_labels_agglo_k50.csv
Saved: cluster_labels_agglo_k75.csv

Saved: cluster_labels_agglo_k100.csv
Saved: cluster_labels_agglo_k150.csv
Saved: cluster_labels_agglo_k200.csv
Saved: cluster_labels_agglo_k250.csv

# Combine the per-run metrics of both clustering methods into one table,
# save it as CSV under OUTPUT_DIR, and print it in full.


def _hdbscan_metrics_record(row):
    """Return the summary record for one row of df_hdbscan."""
    # NOTE(review): 'min_cluster_size=10' is a literal in this label rather
    # than a value read from the run -- confirm it matches the HDBSCAN setup.
    return {
        'method': 'HDBSCAN',
        'parameters': f'min_cluster_size=10, eps={row["epsilon"]}',
        'n_clusters': int(row['n_clusters']),
        'n_noise': int(row['n_noise']),
        'noise_pct': row['noise_pct'],
        'silhouette': row['silhouette'],
        'davies_bouldin': row['davies_bouldin'],
        # not computed for HDBSCAN (noise points)
        'calinski_harabasz': float('nan'),
    }


def _agglo_metrics_record(row):
    """Return the summary record for one row of df_agglo."""
    n_clusters = int(row['k'])
    return {
        'method': 'Agglomerative (Ward)',
        'parameters': f'k={n_clusters}',
        'n_clusters': n_clusters,
        # These runs are recorded with zero noise points.
        'n_noise': 0,
        'noise_pct': 0.0,
        'silhouette': row['silhouette'],
        'davies_bouldin': row['davies_bouldin'],
        'calinski_harabasz': row['calinski_harabasz'],
    }


# HDBSCAN runs first, then agglomerative runs, each in DataFrame row order.
all_metrics = [_hdbscan_metrics_record(rec) for _, rec in df_hdbscan.iterrows()]
all_metrics.extend(_agglo_metrics_record(rec) for _, rec in df_agglo.iterrows())

df_all_metrics = pd.DataFrame(all_metrics)
metrics_path = OUTPUT_DIR / 'clustering_metrics_summary.csv'
df_all_metrics.to_csv(metrics_path, index=False)

print(f'Saved: {metrics_path}')
print('\n--- Full Metrics Summary ---')
print(df_all_metrics.to_string(index=False))

Saved: /home/vwinters/ccc-project/indicator_clustering/outputs/clustering_metrics_summary.csv

--- Full Metrics Summary ---
              method                      parameters  n_clusters  n_noise  noise_pct  silhouette  davies_bouldin  calinski_harabasz
             HDBSCAN    min_cluster_size=10, eps=0.0         281     4193  33.219775    0.630275        0.473026                NaN
             HDBSCAN  min_cluster_size=10, eps=0.214         243     3764  29.820948    0.583843        0.512302                NaN
             HDBSCAN  min_cluster_size=10, eps=0.428          61     1280  10.141024   -0.119369        1.045560                NaN
             HDBSCAN  min_cluster_size=10, eps=0.642          17      311   2.463952   -0.296469        0.977529                NaN
             HDBSCAN min_cluster_size=10, eps=0.7788          11      114   0.903185   -0.185984        0.775157                NaN
             HDBSCAN  min_cluster_size=10, eps=0.856          10       28   0.221835   -0.167519        0.797766                NaN
             HDBSCAN   min_cluster_size=10, eps=1.07           6        0   0.000000   -0.120162        0.782393                NaN
             HDBSCAN  min_cluster_size=10, eps=1.284           6        0   0.000000   -0.120162        0.782393                NaN
             HDBSCAN  min_cluster_size=10, eps=1.498           4        0   0.000000    0.230049        0.549086                NaN
             HDBSCAN min_cluster_size=10, eps=1.9327           4        0   0.000000    0.230049        0.549086                NaN
             HDBSCAN min_cluster_size=10, eps=2.2334           3        0   0.000000    0.385651        0.461465                NaN
             HDBSCAN min_cluster_size=10, eps=2.4729           3        0   0.000000    0.385651        0.461465                NaN
             HDBSCAN min_cluster_size=10, eps=2.6847           3        0   0.000000    0.385651        0.461465                NaN
Agglomerative (Ward)                             k=4           4        0   0.000000    0.246021        1.456096        3909.220873
Agglomerative (Ward)                             k=6           6        0   0.000000    0.259034        1.322581        3920.874006
Agglomerative (Ward)                             k=8           8        0   0.000000    0.272435        1.267374        3897.154308
Agglomerative (Ward)                             k=9           9        0   0.000000    0.288711        1.184741        3844.271002
Agglomerative (Ward)                            k=10          10        0   0.000000    0.298516        1.163678        3788.683596
Agglomerative (Ward)                            k=11          11        0   0.000000    0.281040        1.184305        3654.531355
Agglomerative (Ward)                            k=12          12        0   0.000000    0.281408        1.225761        3552.264089
Agglomerative (Ward)                            k=16          16        0   0.000000    0.278622        1.274775        3319.894167
Agglomerative (Ward)                            k=20          20        0   0.000000    0.295055        1.193753        3203.470119
Agglomerative (Ward)                            k=26          26        0   0.000000    0.304119        1.172194        3072.139975
Agglomerative (Ward)                            k=34          34        0   0.000000    0.321995        1.067513        2992.948844
Agglomerative (Ward)                            k=50          50        0   0.000000    0.343506        1.010616        2944.236720
Agglomerative (Ward)                            k=75          75        0   0.000000    0.370112        1.002773        2838.968311
Agglomerative (Ward)                           k=100         100        0   0.000000    0.377600        0.978082        2786.891253
Agglomerative (Ward)                           k=150         150        0   0.000000    0.388251        0.959118        2734.306732
Agglomerative (Ward)                           k=200         200        0   0.000000    0.418647        0.890381        2744.780282
Agglomerative (Ward)                           k=250         250        0   0.000000    0.431290        0.883640        2801.724667

# Final file listing: print an inventory of the files this notebook produced,
# grouped by cluster labels, metrics summary, and figures.
def _print_file_names(paths):
    """Print the file name of each path, in sorted order, indented by two spaces."""
    for path in sorted(paths):
        print(f'  {path.name}')


print('=== All Output Files ===')

print(f'\nCluster labels (in {DATA_DIR}):')
_print_file_names(DATA_DIR.glob('cluster_labels_*.csv'))

# The metrics summary is a single fixed-name file, so it is listed literally
# rather than discovered with a glob.
print(f'\nMetrics summary (in {OUTPUT_DIR}):')
print('  clustering_metrics_summary.csv')

print(f'\nFigures (in {FIGURES_DIR}):')
_print_file_names(FIGURES_DIR.glob('*.png'))

print('\nDone. All outputs saved for Notebook 05.')

=== All Output Files ===

Cluster labels (in /home/vwinters/ccc-project/indicator_clustering/data):
  cluster_labels_agglo_k10.csv
  cluster_labels_agglo_k100.csv
  cluster_labels_agglo_k11.csv
  cluster_labels_agglo_k12.csv
  cluster_labels_agglo_k150.csv
  cluster_labels_agglo_k16.csv
  cluster_labels_agglo_k20.csv
  cluster_labels_agglo_k200.csv
  cluster_labels_agglo_k250.csv
  cluster_labels_agglo_k26.csv
  cluster_labels_agglo_k34.csv
  cluster_labels_agglo_k4.csv
  cluster_labels_agglo_k50.csv
  cluster_labels_agglo_k6.csv
  cluster_labels_agglo_k75.csv
  cluster_labels_agglo_k8.csv
  cluster_labels_agglo_k9.csv
  cluster_labels_hdbscan_eps_0p0000.csv
  cluster_labels_hdbscan_eps_0p2140.csv
  cluster_labels_hdbscan_eps_0p4280.csv
  cluster_labels_hdbscan_eps_0p6420.csv
  cluster_labels_hdbscan_eps_0p7788.csv
  cluster_labels_hdbscan_eps_0p8560.csv
  cluster_labels_hdbscan_eps_1p0700.csv
  cluster_labels_hdbscan_eps_1p2840.csv
  cluster_labels_hdbscan_eps_1p4980.csv
  cluster_labels_hdbscan_eps_1p9327.csv
  cluster_labels_hdbscan_eps_2p2334.csv
  cluster_labels_hdbscan_eps_2p4729.csv
  cluster_labels_hdbscan_eps_2p6847.csv

Metrics summary (in /home/vwinters/ccc-project/indicator_clustering/outputs):
  clustering_metrics_summary.csv

Figures (in /home/vwinters/ccc-project/indicator_clustering/outputs/figures):
  agglo_k10_clusters.png
  agglo_k250_clusters.png
  agglo_k34_clusters.png
  agglo_k8_clusters.png
  agglo_k8_ho_alternation.png
  agglo_k8_ho_anagram.png
  agglo_k8_ho_container.png
  agglo_k8_ho_deletion.png
  agglo_k8_ho_hidden.png
  agglo_k8_ho_homophone.png
  agglo_k8_ho_insertion.png
  agglo_k8_ho_reversal.png
  agglo_metrics_vs_k.png
  dendrogram_k10_centroids.png
  dendrogram_k250_centroids.png
  dendrogram_k34_centroids.png
  dendrogram_k8_centroids.png
  dendrogram_truncated_top50.png
  hdbscan_best_clusters.png
  hdbscan_best_ho_alternation.png
  hdbscan_best_ho_anagram.png
  hdbscan_best_ho_container.png
  hdbscan_best_ho_deletion.png
  hdbscan_best_ho_hidden.png
  hdbscan_best_ho_homophone.png
  hdbscan_best_ho_insertion.png
  hdbscan_best_ho_reversal.png
  hdbscan_epsilon_sensitivity.png
  pairwise_distance_distribution.png

Done. All outputs saved for Notebook 05.

File	Produced by	Description
`embeddings_umap_10d.npy`	Stage 3	10D UMAP embeddings for clustering
`embeddings_umap_2d.npy`	Stage 3	2D UMAP embeddings for visualization
`indicator_index_all.csv`	Stage 2	Row-to-indicator-string mapping

File	Location	Description
`cluster_labels_hdbscan_eps_{value}.csv`	`DATA_DIR`	Labels per HDBSCAN epsilon run
`cluster_labels_agglo_k{value}.csv`	`DATA_DIR`	Labels per agglomerative k (all swept values)
`clustering_metrics_summary.csv`	`OUTPUT_DIR`	All metrics from all runs
`pairwise_distance_distribution.png`	`FIGURES_DIR`	Distance histogram
`hdbscan_epsilon_sensitivity.png`	`FIGURES_DIR`	HDBSCAN sensitivity analysis
`agglo_metrics_vs_k.png`	`FIGURES_DIR`	Agglomerative metrics vs k plot
`agglo_k{value}_clusters.png`	`FIGURES_DIR`	Scatter plots for selected k values
`hdbscan_best_clusters.png`	`FIGURES_DIR`	Best HDBSCAN scatter plot
`dendrogram_truncated_top50.png`	`FIGURES_DIR`	Truncated dendrogram (full data)
`dendrogram_k34_centroids.png`	`FIGURES_DIR`	Dendrogram of k=34 centroids
`dendrogram_k{k}_centroids.png`	`FIGURES_DIR`	Centroid dendrograms for each selected k ≠ 34

Stage 4: Clustering (Unconstrained Exploration)¶

Running on Google Colab¶

Section 1: Setup and Data Preparation¶

Imports¶

Environment Auto-Detection and Paths¶

Input File Validation¶

Load Input Files¶

Section 2: Pairwise Distance Analysis¶

Distance Distribution Histogram¶

Section 3: HDBSCAN with Epsilon Sensitivity Analysis¶

What is HDBSCAN?¶

Key Parameters¶

Interpreting the Sensitivity Analysis¶

Select Epsilon Candidates¶

Run HDBSCAN Sweep¶

Sensitivity Plot¶

Identify Best HDBSCAN Run¶

Section 4: Agglomerative Clustering with Ward's Method¶

Why Ward's Method?¶

Two-Part Strategy¶

Part A: Full Metric Sweep¶

Interpreting the Metrics¶

Metrics vs. k: Finding the Data's Preferred Granularity¶

Part B: Selected k Values¶

Dendrogram Visualization¶

Section 5: Cluster Visualization¶

Visualization Helper Functions¶

Selected Agglomerative Visualizations¶

Best HDBSCAN Visualization¶

Section 6: Qualitative Inspection¶

Selected Agglomerative — Cluster Inspection¶

Best HDBSCAN — Cluster Inspection¶

Section 7: Save All Outputs¶

Interpretation of Results¶

HDBSCAN Epsilon Sensitivity¶

Agglomerative Metrics Across k Values¶

Qualitative Cluster Coherence¶

Dendrogram Structure¶

What This Notebook Does NOT Tell Us¶