# imports
import os
from pathlib import Path
import pandas as pd
import numpy as np
import re
import string
import unicodedata
import matplotlib.pyplot as plt

# ==========================
# PATHS & CONFIG
# ==========================
# 1. Detect environment
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues')
else:
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / "data"
OUTPUT_DIR = PROJECT_ROOT / "outputs"

# Read each CSV file into a DataFrame
df_clues = pd.read_csv(f'{DATA_DIR}/clues_raw.csv')
df_indicators = pd.read_csv(f'{DATA_DIR}/indicators_raw.csv')
df_ind_by_clue = pd.read_csv(f'{DATA_DIR}/indicators_by_clue_raw.csv')
df_ind_consolidated = pd.read_csv(f'{DATA_DIR}/indicators_consolidated_raw.csv')
df_charades = pd.read_csv(f'{DATA_DIR}/charades_raw.csv')
df_charades_by_clue = pd.read_csv(f'{DATA_DIR}/charades_by_clue_raw.csv')

# Uncomment to see how the clue_id data looks before cleaning
#df_indicators.sample().style.set_properties(**{"white-space": "pre-wrap"})

# Instead of a string with redundant indices, extract only the clue_ids in
# brackets to create a list of integers
df_indicators["clue_ids"] = (
    df_indicators["clue_ids"]
    .str.findall(r"\[(\d+)\]")
    .apply(lambda xs: [int(x) for x in xs])
)

# Include a new column to keep track of how many clues have this indicator
df_indicators["num_clues"] = df_indicators["clue_ids"].apply(len)

df_indicators.sample(3).style.set_properties(**{"white-space": "pre-wrap"})

# Uncomment to see what the clue_ids look like before cleaning
#df_charades.sample().style.set_properties(**{"white-space": "pre-wrap"})

# Instead of a string with redundant indices, extract only the clue_ids in
# brackets to create a list of integers
df_charades["clue_ids"] = (
    df_charades["clue_ids"]
    .str.findall(r"\[(\d+)\]")
    .apply(lambda xs: [int(x) for x in xs])
)

# Include a new column to keep track of how many clues have this charade
df_charades["num_clues"] = df_charades["clue_ids"].apply(len)

df_charades.sample(3).style.set_properties(**{"white-space": "pre-wrap"})

# View all the info for a specific clue (by clue_id), including
# clue surface, answer, definition, charades, and indicators
def clue_info(n):
  clue_cols = ['clue_id', 'clue', 'answer', 'definition', 'source_url']
  display(
      df_clues[df_clues['clue_id'] == n][clue_cols].style.set_properties(
        subset=["clue", 'source_url'],
        **{"white-space": "pre-wrap"}
    )
      )
  print()
  display(df_charades_by_clue[df_charades_by_clue['clue_id']== n])
  print()
  display(df_ind_by_clue[df_ind_by_clue["clue_id"] == n])
  print()
  display(df_indicators[df_indicators['clue_ids'].apply(lambda lst: n in lst)])

clue_info(623961)

# Normalize takes a string (clue surface, indicator, definition, answer),
# And returns the same text but with punctuation (including dashes) and
# accents removed, and all lowercase.
def normalize(s: str) -> str:
  # remove accents and punctuation, convert to lowercase
  s_normalized = ''.join(
      ch for ch in unicodedata.normalize('NFD', s)
      if unicodedata.category(ch).startswith(('L', 'N', 'Zs'))
  ).lower()

  return s_normalized

def count_unique_clues(series):
  """
  Calculates the total number of unique elements across all lists in a pandas
  Series. Applied to a column of `clue_ids`, this will count the number of
  unique clues represented in an indicator dataframe.

  Args:
    series (pandas.Series): A Series where each element is a list.

  Returns:
    int: The total count of unique elements.
  """
  unique_elements = set()
  for sublist in series:
    if isinstance(sublist, list):
      unique_elements.update(sublist)
  return len(unique_elements)

df_indicators.sample(3).style.set_properties(
        subset=["clue_ids"],
        **{"white-space": "pre-wrap"}
    )

# Uncomment to prove that `indicator` is already normalized - no accents,
# punctuation (including dashes), or capital letters

# Create a column of normalized indicators
#df_indicators['indicator_normalized'] = df_indicators['indicator'].apply(normalize)

# Check out all rows where normalization changed the indicator
#df_indicators.loc[df_indicators['indicator'] != df_indicators['indicator_normalized']]

df_ind_by_clue.head()

df_ind_consolidated

# Create a dictionary where the key is the wordplay type, and the value is
# the list of associated unique indicators.
ind_by_wordplay_dict = {}

for wordplay in df_ind_consolidated.columns:
  ind_by_wordplay_dict[wordplay] = df_ind_consolidated[wordplay].values[0].split('\n')

# Uncomment or change key to view all indicators for that wordplay
#ind_by_wordplay_dict['insertion']

# See how many unique indicators there are for each type of wordplay
for wordplay in ind_by_wordplay_dict:
  print(f"{wordplay}: {len(ind_by_wordplay_dict[wordplay])}")

alternation: 244
anagram: 7346
container: 1950
deletion: 877
hidden: 1110
homophone: 669
insertion: 2173
reversal: 1692

# Uncomment to see how many rows have NaN for 'clue', 'answer', or 'definition'
#df_clues['clue'].value_counts(dropna=False).head()
#df_clues['answer'].value_counts(dropna=False).head()
#df_clues['definition'].value_counts(dropna=False).head()

# Drop all rows where the clue or answer is NaN (they are type float, and we want clue to be a string)
df_clues.dropna(subset=['clue', 'answer'], inplace=True)

# Surface: remove trailing numeric parentheses in clue
df_clues['surface'] = df_clues['clue'].astype(str).apply(lambda x: re.sub(r'\s*\(\d+(?:[,\s-]+\d+)*\)$', '', x))

# Create surface normalized - no accents, punctuation, capitalized letters
df_clues['surface_normalized'] = df_clues['surface'].astype(str).apply(normalize)

# Create answer normalized - no accents, punctuation, capitalized letters
df_clues['answer_normalized'] = df_clues['answer'].astype(str).apply(normalize)

# Create definition normalized - no accents, punctuation, capitalized letters
#df_clues['definition_normalized'] = df_clues['definition'].astype(str).apply(normalize)

# Extract the required answer format (the content inside the parentheses in clue)
df_clues['req_ans_format'] = df_clues['clue'].astype(str).str.extract(r'\((\d+(?:[,\s-]+\d+)*)\)$')

# Sums the numbers found in the answer format string, which specifies
# the required answer letter count.
df_clues['req_ans_letter_count'] = df_clues['req_ans_format'].apply(
    lambda x: sum(int(n) for n in re.findall(r'\d+', str(x))) if pd.notnull(x) else 0
)

# Create a column that determines whether the answer format is valid
# This will be handy later when we're determine ground truth labels for
# wordplay types, and algorithmically searching for permutations of the
# answer inside the clue text.
def check_format_match(row):
    answer = str(row['answer'])
    req_format = str(row['req_ans_format'])
    
    # 1. Extract all numbers from the format (e.g., '5,2,3,4' -> ['5', '2', '3', '4'])
    required_lengths = [int(n) for n in re.findall(r'\d+', req_format)]
    
    # 2. Extract all word segments from the ORIGINAL answer, ignoring punctuation
    # This splits 'knock on the head' or 'modern-day' into clean lists
    answer_segments = re.findall(r'[a-zA-Z0-9]+', answer)
    answer_lengths = [len(segment) for segment in answer_segments]
    
    # 3. Compare the two lists
    return required_lengths == answer_lengths

# Apply to your dataframe
df_clues['ans_format_valid'] = df_clues.apply(check_format_match, axis=1)

df_clues.head()

df_charades_by_clue.sample(3)

df_charades.sample(3).style.set_properties(
        subset=["clue_ids"],
        **{"white-space": "pre-wrap"}
    )

# Create a dataframe and add the counts from Indicators
df_ind_counts = pd.DataFrame(columns=["unique_inds"])
df_ind_counts['unique_inds'] = df_indicators.groupby(by=['wordplay']).count()['indicator']

# Also keep track of the total number of indicators
ind_total = df_ind_counts['unique_inds'].sum()

# Include a column that counts indicators by clue, which will
# double-count any indicator appearing in multiple clues
df_ind_counts['all_instances'] = df_ind_by_clue.count()

# Rearrange the columns to go from large to small, remove counts from
# ind_consolidated because they don't have associated clue IDs.
df_ind_counts = df_ind_counts[['all_instances', 'unique_inds']]

print(f"Total Number of Clues: {len(df_clues):,}")
print(f"Total Unique Indicators: {ind_total:,}")
print(f"Total Instances of Indicators in All Clues: {df_ind_counts['all_instances'].sum():,}")
print(f"Total Number of Clues Containing Indicator(s): {df_ind_by_clue['clue_id'].count():,}")

Total Number of Clues: 658,031
Total Unique Indicators: 15,735
Total Instances of Indicators in All Clues: 93,867
Total Number of Clues Containing Indicator(s): 88,037

df_ind_counts

# Add a column with a list of VERIFIED clue IDs: where we know the indicator
# appeared in the surface text as intact words.

# Build fast lookup dictionary
clue_lookup = df_clues.set_index("clue_id")["surface_normalized"].to_dict()

# Given an indicator and its list of clue_ids where it appears,
# return a new list of clue_ids where the indicator definitely
# appears in the normalized clue surface as intact words.
def verify_clues(indicator, clue_ids):
    if not clue_ids:
        return []

    # Escape regex special characters inside indicator
    pattern = rf"\b{re.escape(indicator)}\b"

    verified = []

    for cid in clue_ids:
        surface = clue_lookup.get(cid)

        if surface and re.search(pattern, surface):
            verified.append(cid)

    return verified


# add the column for the list of verified clue_ids
df_indicators["clue_ids_verified"] = df_indicators.apply(
    lambda row: verify_clues(row["indicator"], row["clue_ids"]),
    axis=1
)

# Add a column that counts the number of verified clue_ids for each indicator
df_indicators['num_clues_verified'] = df_indicators['clue_ids_verified'].apply(len)

# Uncomment to inspect the indicators table
df_indicators.sample(3)

# Inspect some clues where the indicators were invalid
#clue_info(635505) # indicator not in clue or on webpage
#clue_info(591484) # indicator not in clue or on webpage
#clue_info(627621) # indicator not in clue, defn NaN
#clue_info(422350) # indicator is a partial word in clue bc blogger error
#clue_info(76808) # misparsed 'hidden' formatting, the identified indicator is actually fodder

# Keep track of how many indicators are left if we keep only ones with
# at least one verified clue_id
mask = df_indicators['num_clues_verified'] > 0
df_ind_counts['verified_inds'] = df_indicators[mask].groupby(by=['wordplay']).count()['indicator']

df_ind_counts.style.format('{:,}')

df_ind_counts.sum()

all_instances    93867
unique_inds      15735
verified_inds    14195
dtype: int64

# Create a column for the number of characters in the indicator phrase
df_indicators['num_chars'] = df_indicators['indicator'].apply(len)

# See the counts for each indicator length, just for verified indicators
mask = (df_indicators['num_clues_verified'] > 0)
display(df_indicators[mask]['num_chars'].value_counts(dropna=False).sort_index())

num_chars
2       12
3       83
4      342
5      595
6      919
7     1445
8     1555
9     1501
10    1501
11    1270
12    1078
13     911
14     704
15     653
16     424
17     350
18     248
19     170
20     110
21     110
22      58
23      51
24      36
25      19
26      15
27      11
28      10
29       5
31       5
32       2
33       1
36       1
Name: count, dtype: int64

# Visualize the distribution of indicator length (as number of characters)
# just for unique indicators with verified clues
df_indicators[mask]['num_chars'].value_counts().sort_index().plot(kind='bar')

<Axes: xlabel='num_chars'>

# Uncomment to manually inspect 2-character verified indicators
#cols = ['wordplay', 'indicator', 'clue_ids_verified', 'num_clues_verified', 'num_clues']
#mask = (df_indicators['num_clues_verified'] > 0) & (df_indicators['num_chars'] == 2)
#df_indicators[mask][cols].head(12).sort_values(by='num_clues_verified', ascending=False)

# Uncomment to manually inspect 3-character verified indicators
#cols = ['wordplay', 'indicator', 'clue_ids_verified', 'num_clues_verified', 'num_clues']
#mask = (df_indicators['num_clues_verified'] > 0) & (df_indicators['num_chars'] == 3)
#df_indicators[mask][cols].head(83).sort_values(by='num_clues_verified', ascending=False)

# Uncomment to manually inspect the longest verified indicators
#cols = ['wordplay', 'indicator', 'clue_ids_verified', 'num_clues', 'num_chars']
#mask = (df_indicators['num_clues_verified'] > 0) & (df_indicators['num_chars'] > 25)
#df_indicators[mask][cols].sort_values(by='num_chars', ascending=False)

df_clues.sample()

df_indicators.sample()

# Compute hidden_fwd and hidden_rev
# Helper function to remove all whitespace for hidden word search
def remove_all_whitespace(text: str) -> str:
    if isinstance(text, str):
        return text.replace(" ", "")
    return ""

# Create 'answer_no_spaces' from 'answer_normalized'
df_clues['answer_no_spaces'] = df_clues['answer_normalized'].apply(remove_all_whitespace)

# Create 'surface_no_spaces' from 'surface_normalized'
df_clues['surface_no_spaces'] = df_clues['surface_normalized'].apply(remove_all_whitespace)

# Calculate 'hidden_fwd'
df_clues['hidden_fwd'] = df_clues.apply(
    lambda row: row['answer_no_spaces'] in row['surface_no_spaces'],
    axis=1
)

# Calculate 'hidden_rev'
df_clues['answer_no_spaces_rev'] = df_clues['answer_no_spaces'].apply(lambda x: x[::-1])
df_clues['hidden_rev'] = df_clues.apply(
    lambda row: row['answer_no_spaces_rev'] in row['surface_no_spaces'],
    axis=1
)

df_clues[df_clues['hidden_fwd']].shape[0]

23054

df_clues[df_clues['hidden_rev']].shape[0]

6798

df_clues.head()

# Compute answer letter count (needed for hidden_fwd filtering below and later exports)
df_clues['answer_letter_count'] = df_clues['answer_no_spaces'].apply(len)

# See some examples of 2-letter, 3-letter, and 4-letter answers, just for verified hidden_fwd,
# to determine if they are likely to be real words.
cols = ['clue', 'answer', 'definition']

for i in [2, 3, 4]:
    mask = (df_clues['hidden_fwd']) & (df_clues['answer_letter_count'] == i)
    display(df_clues[mask][cols].sample(5).style.set_properties(**{"white-space": "pre-wrap"}))

# An efficient way to find alternation

# We cache the regex pattern to avoid re-compiling inside the loop
# This looks for the answer characters with exactly one char between them
def check_alternation_seq(ans, clue):
    if not ans or not clue:
        return False
    # Creates "A.N.S.W.E.R"
    pattern = ".".join(re.escape(c) for c in ans)
    return bool(re.search(pattern, clue))

# Applying to the dataframe
df_clues['alternation'] = [
    check_alternation_seq(ans, clue)
    for ans, clue in zip(df_clues['answer_no_spaces'], df_clues['surface_no_spaces'])
]

df_clues[df_clues['alternation']].shape[0]

4213

df_ind_counts.style.format('{:,}')

df_ind_counts.sum().to_frame().T.style.format('{:,}')

df_ind_counts.sort_values(by='all_instances').plot.barh(stacked=False, figsize=(8, 5))

<Axes: ylabel='wordplay'>

# Add a column for the number of words within an indicator
df_indicators['ind_wc'] = df_indicators['indicator'].apply(lambda x: len(x.split()))

# Visualize the valid indicators by word count
mask = df_indicators['num_clues_verified'] > 0
df_indicators[mask]['ind_wc'].value_counts().sort_index().plot(kind='bar')

<Axes: xlabel='ind_wc'>

# Visualize the prevalence/redundancy of valid indicators
mask = df_indicators['num_clues_verified'] > 0
df_indicators[mask]['num_clues_verified'].value_counts().head(15).sort_index().plot(kind='bar', figsize=(8, 5))

<Axes: xlabel='num_clues_verified'>

# View some examples of the most common indicators
df_indicators[['num_clues_verified', 'indicator', 'wordplay']].sort_values(by='num_clues_verified', ascending=False).head(10)

df_indicators.head()

mask = df_indicators['num_clues_verified'] > 0
print(len(df_indicators['indicator'].unique()))
print(len(df_indicators[mask]['indicator'].unique()))

13920
12622

# Deduplicated list of unique indicator strings for Stage 2 embedding input
mask = df_indicators['num_clues_verified'] > 0
unique_indicators = (
    df_indicators[mask]['indicator']
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
unique_indicators.to_csv(
    DATA_DIR / 'verified_indicators_unique.csv',
    index=False,
    header=['indicator']
)
print(f"Saved {len(unique_indicators)} unique indicator strings to verified_indicators_unique.csv")

Saved 12622 unique indicator strings to verified_indicators_unique.csv

df_ind_counts.style.format('{:,}')

df_ind_counts.sum().to_frame().T.style.format('{:,}')

alternation_clue_ids = set(df_clues[df_clues['alternation'] == True]['clue_id'])

df_alternation = df_indicators[df_indicators['wordplay'] == 'alternation'].copy()

df_alternation['clue_ids_verified'] = df_alternation['clue_ids_verified'].apply(
    lambda x: [clue_id for clue_id in x if clue_id in alternation_clue_ids]
)

# Filter out rows where clue_ids_verified is empty after refinement
df_alternation = df_alternation[df_alternation['clue_ids_verified'].apply(len) > 0]

df_alternation = df_alternation[['wordplay', 'indicator', 'clue_ids_verified']]

df_alternation.head()

df_alternation.shape[0]

121

df_indicators[df_indicators['wordplay'] == 'alternation'].shape[0]

244

count_unique_clues(df_alternation['clue_ids_verified'])

255

count_unique_clues(df_indicators['clue_ids_verified'])

70959

count_unique_clues(df_indicators[df_indicators['wordplay'] == 'alternation']['clue_ids_verified'])

654

def check_anagram_in_surface(answer_no_spaces_text, surface_no_spaces_text):
    if not answer_no_spaces_text or not surface_no_spaces_text:
        return False

    answer_len = len(answer_no_spaces_text)
    if answer_len == 0:
        return False

    sorted_answer_chars = sorted(answer_no_spaces_text)

    for i in range(len(surface_no_spaces_text) - answer_len + 1):
        substring = surface_no_spaces_text[i : i + answer_len]
        if sorted(substring) == sorted_answer_chars:
            return True
    return False

# Apply this function to df_clues
df_clues['is_anagram_in_surface'] = df_clues.apply(
    lambda row: check_anagram_in_surface(row['answer_no_spaces'], row['surface_no_spaces']),
    axis=1
)

# Display the DataFrame with the new column
df_clues.head()

df_clues[df_clues['is_anagram_in_surface']].sample(10)

clue_info(261488)

# answer_letter_count was computed earlier (after hidden detection)
# Verify it exists
assert 'answer_letter_count' in df_clues.columns, "answer_letter_count missing from df_clues"

df_clues['answer_letter_count'].value_counts(dropna=False)

answer_letter_count
7     115192
6     107437
8      97656
5      94664
9      75354
4      66124
10     38343
11     12504
12     12183
3      11777
13      9254
15      7154
14      6588
1       1528
2        841
16       431
17       257
18       188
19       136
20       104
22        77
21        75
23        53
24        30
26        23
28        14
25        10
30         9
27         5
29         5
33         3
31         3
32         2
39         2
37         1
45         1
46         1
92         1
38         1
Name: count, dtype: int64

df_clues.columns

Index(['clue_id', 'clue', 'answer', 'definition', 'clue_number', 'puzzle_date',
       'puzzle_name', 'source_url', 'source', 'surface', 'surface_normalized',
       'answer_normalized', 'req_ans_format', 'req_ans_letter_count',
       'ans_format_valid', 'answer_no_spaces', 'surface_no_spaces',
       'hidden_fwd', 'answer_no_spaces_rev', 'hidden_rev',
       'answer_letter_count', 'alternation', 'is_anagram_in_surface'],
      dtype='str')

# === Step A: Explode clue_ids_verified and join with df_clues ===

# Start with verified indicators only
df_export = (
    df_indicators[df_indicators['num_clues_verified'] > 0]
    [['wordplay', 'indicator', 'clue_ids_verified']]
    .copy()
)

# Explode so each row is one (indicator, clue_id) pair
df_export = df_export.explode('clue_ids_verified').rename(
    columns={'clue_ids_verified': 'clue_id', 'wordplay': 'wordplay_ho'}
)

# Ensure clue_id is int for the merge
df_export['clue_id'] = df_export['clue_id'].astype(int)

# Merge with df_clues to get pattern detection columns and answer length
clue_cols = ['clue_id', 'hidden_fwd', 'hidden_rev', 'alternation',
             'is_anagram_in_surface', 'answer_letter_count', 'ans_format_valid']
df_export = df_export.merge(df_clues[clue_cols], on='clue_id', how='left')

print(f"Rows after explode + merge: {len(df_export):,}")
print(f"Unique clue_ids: {df_export['clue_id'].nunique():,}")
print(f"Unique indicators: {df_export['indicator'].nunique():,}")

# === Step B: Compute ground-truth labels ===

# Gate all ground truth on answer length >= 3 and the answer format is valid
answer_ok = (df_export['answer_letter_count'] >= 3) & df_export['ans_format_valid']

# Define pattern columns and their corresponding labels, in priority order
gt_checks = [
    ('hidden_fwd', 'hidden'),
    ('hidden_rev', 'reversal'),
    ('alternation', 'alternation'),
    ('is_anagram_in_surface', 'anagram'),
]

# wordplay_gt_all: all labels that fired (comma-separated), null if none or short answer
fired_labels = pd.DataFrame({
    label: df_export[col].fillna(False) & answer_ok
    for col, label in gt_checks
})
df_export['wordplay_gt_all'] = fired_labels.apply(
    lambda row: ','.join(col for col in fired_labels.columns if row[col]) or None,
    axis=1
)

# wordplay_gt: single winning label using priority order (first match wins)
df_export['wordplay_gt'] = None
for col, label in reversed(gt_checks):
    mask = df_export[col].fillna(False) & answer_ok
    df_export.loc[mask, 'wordplay_gt'] = label

# label_match: does the Ho label agree with our ground-truth label?
df_export['label_match'] = df_export['wordplay_ho'] == df_export['wordplay_gt']

# === Step C: Select final columns and save ===

final_cols = ['clue_id', 'indicator', 'wordplay_ho', 'wordplay_gt',
              'wordplay_gt_all', 'answer_letter_count', 'ans_format_valid', 'label_match']
df_export = df_export[final_cols]

df_export.to_csv(DATA_DIR / 'verified_clues_labeled.csv', index=False)
print(f"\nSaved {len(df_export):,} rows to verified_clues_labeled.csv")

Rows after explode + merge: 76,015
Unique clue_ids: 70,959
Unique indicators: 12,622

Saved 76,015 rows to verified_clues_labeled.csv

# === Summary statistics for verified_clues_labeled.csv ===

print("=== wordplay_ho (Ho blog label) distribution ===")
print(df_export['wordplay_ho'].value_counts().to_string())

print(f"\n=== wordplay_gt (ground-truth label) distribution ===")
print(df_export['wordplay_gt'].value_counts(dropna=False).to_string())

print(f"\n=== label_match ===")
# Only meaningful where wordplay_gt is not null
has_gt = df_export['wordplay_gt'].notna()
print(f"Rows with a ground-truth label: {has_gt.sum():,} / {len(df_export):,}")
if has_gt.sum() > 0:
    match_rate = df_export.loc[has_gt, 'label_match'].mean()
    print(f"Label match rate (where GT exists): {match_rate:.1%}")

print(f"\n=== Duplicate (clue_id, indicator) pairs (multi-label cases) ===")
dupes = df_export.duplicated(subset=['clue_id', 'indicator'], keep=False)
print(f"Rows involved in multi-label pairs: {dupes.sum():,}")
print(f"Unique (clue_id, indicator) pairs with >1 wordplay_ho: "
      f"{df_export[dupes].groupby(['clue_id', 'indicator']).ngroups:,}")

print(f"\n=== Sample rows ===")
df_export.sample(5, random_state=42)

=== wordplay_ho (Ho blog label) distribution ===
wordplay_ho
anagram        38226
container      10836
reversal       10149
insertion       8305
homophone       3642
hidden          2595
deletion        1608
alternation      654

=== wordplay_gt (ground-truth label) distribution ===
wordplay_gt
None           56348
anagram        15346
hidden          2556
reversal        1506
alternation      259

=== label_match ===
Rows with a ground-truth label: 19,667 / 76,015
Label match rate (where GT exists): 92.4%

=== Duplicate (clue_id, indicator) pairs (multi-label cases) ===
Rows involved in multi-label pairs: 620
Unique (clue_id, indicator) pairs with >1 wordplay_ho: 310

=== Sample rows ===

	ind_id	wordplay	indicator	clue_ids	num_clues
1190	1191	anagram	chafe	[428423]	1
12341	12342	insertion	content of	[16848, 414963]	2
12823	12824	insertion	hide	[414618]	1

	charade_id	charade	charade_answer	clue_ids	num_clues
10896	10897	copier	APER	[12750]	1
54290	54291	venison	DEER	[590126, 626466]	2
48765	48766	surrogate ruler	REGENT	[36395]	1

	ind_id	wordplay	indicator	clue_ids	num_clues
3391	3392	anagram	ingredients	[484990, 494059]	2
5742	5743	anagram	squalid	[348311, 373769, 430650, 458054]	4
1118	1119	anagram	can be a source of	[412196]	1

	clue_id	clue	answer	definition	clue_number	puzzle_date	puzzle_name	source_url	source	surface	surface_normalized	answer_normalized	req_ans_format	req_ans_letter_count	ans_format_valid
0	1	Acquisitive chap, as we see it (8)	COVETOUS	Acquisitive	1a	2019-08-08	Times 27424	https://times-xwd-times.livejournal.com/218581...	times_xwd_times	Acquisitive chap, as we see it	acquisitive chap as we see it	covetous	8	8	True
1	2	Back yard fencing weak and sagging (6)	DROOPY	sagging	5a	2019-08-08	Times 27424	https://times-xwd-times.livejournal.com/218581...	times_xwd_times	Back yard fencing weak and sagging	back yard fencing weak and sagging	droopy	6	6	True
2	3	Stripping off uniform, love holding colonel's ...	UNCLOTHING	Stripping	8a	2019-08-08	Times 27424	https://times-xwd-times.livejournal.com/218581...	times_xwd_times	Stripping off uniform, love holding colonel's ...	stripping off uniform love holding colonels coat	unclothing	10	10	True
3	4	Without a mark where they should be gained (4)	EXAM	where they should be gained	9a	2019-08-08	Times 27424	https://times-xwd-times.livejournal.com/218581...	times_xwd_times	Without a mark where they should be gained	without a mark where they should be gained	exam	4	4	True
4	5	Put a stop to Rugby's foul school leader (5,2,...	KNOCK ON THE HEAD	Put a stop to	10a	2019-08-08	Times 27424	https://times-xwd-times.livejournal.com/218581...	times_xwd_times	Put a stop to Rugby's foul school leader	put a stop to rugbys foul school leader	knock on the head	5,2,3,4	14	True

	clue_id	charade	charade_answer
121254	633952	delicate fabric	LACE
18182	47471	falls	DIPS
125932	660397	fellow	MAN

Stage 1: EDA and Data Cleaning for Indicator Clustering¶

Imports¶

Loading the Data¶

Reformat `clue_ids`¶

Indicators Table `clue_ids`¶

Charades Table `clue_ids`¶

Helper Functions¶

`clue_info()` - Investigate A Clue¶

`normalize()` - Remove punctuation, accents, make lowercase¶

Normalization Question: Remove Dashes in Answer?¶

`count_unique_clues()`¶

All Available Tables¶

Indicators¶

Indicators by Clue¶

Indicators Consolidated¶

Dictionary for Indicators Consolidated¶

Clues¶

Charades by Clue¶

Charades¶

Data Requirements & Unresolved Dilemmas¶

Indicator word(s) must appear in the clue surface text¶

Character Lengths of Indicators Must Be Reasonable¶

Issue: Some indicator phrases may contain some fodder¶

Verifiable Wordplay Types¶

Hiddens (FWD & REV)¶

Alternation¶

Summary of Indicators¶

Export Verified Indicators for Downstream Stages¶

Export Deduplicated Unique Indicators¶

Task¶

Extract Alternation Clue IDs¶

Subtask:¶

Summary:¶

Data Analysis Key Findings¶

Insights or Next Steps¶

Export Verified Clues with Labels¶

Schema¶

Ground-truth priority ordering¶

Note on duplicate rows¶

Row Count Reconciliation¶

	clue_id	alternation	anagram	container	deletion	hidden	homophone	insertion	reversal
0	90	NaN	transforming	NaN	NaN	NaN	NaN	NaN	NaN
1	97	NaN	ground	NaN	NaN	NaN	NaN	NaN	NaN
2	101	NaN	NaN	NaN	NaN	NaN	verbally	NaN	NaN
3	142	NaN	NaN	NaN	NaN	NaN	NaN	NaN	about to go back
4	145	NaN	NaN	NaN	NaN	NaN	NaN	NaN	returned

	charade_id	charade	charade_answer	clue_ids	num_clues
31310	31311	most recent	LATEST	[589267]	1
14103	14104	dog knocked out	BOOMERANGED	[450441]	1
15435	15436	encountered	I MET	[266032]	1

	all_instances	unique_inds
wordplay
alternation	769	244
anagram	45648	7121
container	14144	1909
deletion	2093	873
hidden	3381	1110
homophone	4672	663
insertion	11171	2155
reversal	11989	1660

	ind_id	wordplay	indicator	clue_ids	num_clues	clue_ids_verified	num_clues_verified
12612	12613	insertion	filling of	[361169]	1	[361169]	1
1934	1935	anagram	done right	[595222]	1	[]	0
7608	7609	container	claiming	[33737, 122731, 136566, 142842, 177543, 233123...	14	[33737, 122731, 136566, 142842, 177543, 233123...	14

	clue	answer	definition
420790	English cricket team, paid up, to pursue island obsession (4,4)	ID	nan
437927	Come back into section of green territory (2-5)	RE	nan
310209	Had old money dropped repeatedly in ditch (2-2)	HA	ditch
455513	Policeman leading one group — assistant taking control? (2-5)	CO	nan
388487	I note the French workers’ leader indicating what union may provide (2-3)	IN	what union may provide

	clue	answer	definition
573797	Reaction a constructor hopes for in Tallahassee? (3)	AHA	nan
441719	Selection of modern poetry (3)	ODE	poetry
703	Grounded winger involved in three murders (3)	EMU	Grounded winger
267604	What’s written in Cleopatra’s stars (3)	LEO	stars
4900	Star not entirely graceful (3)	ACE	Star

	clue	answer	definition
286767	River in Far North	ARNO	nan
624597	Characters in "Follies" targeted celebrity (4)	STAR	nan
329069	Test part of door alarmed (4)	ORAL	Test
275244	Written communication from troublesome Moscow (4)	MEMO	Written communication
613378	Test for all except the first and last (4)	ORAL	nan

	num_clues_verified	indicator	wordplay
12878	1487	in	insertion
7367	1251	about	container
4468	789	out	anagram
14173	660	back	reversal
15558	603	up	reversal
4278	540	new	anagram
11706	490	reportedly	homophone
14081	426	about	reversal
281	410	about	anagram
4651	390	possibly	anagram

	ind_id	wordplay	indicator	clue_ids	num_clues	clue_ids_verified	num_clues_verified	num_chars	ind_wc
0	1	alternation	abnormal	[623961]	1	[]	0	8	1
1	2	alternation	after odd losses	[139327]	1	[139327]	1	16	3
2	3	alternation	after regular excisions	[107211]	1	[107211]	1	23	3
3	4	alternation	alternately	[407055]	1	[407055]	1	11	1
4	5	alternation	alternating	[449798]	1	[449798]	1	11	1

	clue_id	clue	answer	definition	clue_number	puzzle_date	puzzle_name	source_url	source	surface	...	req_ans_letter_count	ans_format_valid	answer_no_spaces	surface_no_spaces	hidden_fwd	answer_no_spaces_rev	hidden_rev	answer_letter_count	alternation	is_anagram_in_surface
22506	22507	Initiation ceremony unsettled MPs a bit (7)	BAPTISM	Initiation ceremony	1d	2017-07-26	Quick Cryptic 882	https://times-xwd-times.livejournal.com/178215...	times_xwd_times	Initiation ceremony unsettled MPs a bit	...	7	True	baptism	initiationceremonyunsettledmpsabit	False	msitpab	False	7	False	True
585114	588154	Carelessly gash sole in footwear (8)	GALOSHES	footwear	13d	2016-02-13	Saturday, February 13, 2016 — No Bench Warmers...	https://natpostcryptic.blogspot.com/2016/02/sa...	natpostcryptic	Carelessly gash sole in footwear	...	8	True	galoshes	carelesslygashsoleinfootwear	False	sehsolag	False	8	False	True
483246	483247	Liquid nitrogen alert (8)	RINGTONE	NaN	4a	2011-06-18	NTSPP – 071	http://bigdave44.com/2011/06/18/ntspp-071-review/	bigdave44	Liquid nitrogen alert	...	8	True	ringtone	liquidnitrogenalert	False	enotgnir	False	8	False	True
203801	203802	The crux of recent research (6)	CENTRE	The crux	19d	2017-08-09	Guardian Cryptic 27271 by Arachne	https://www.fifteensquared.net/2017/08/09/guar...	fifteensquared	The crux of recent research	...	6	True	centre	thecruxofrecentresearch	True	ertnec	False	6	False	True
98347	98348	Drop an ugly misbehaving group of children her...	PLAYGROUND	children here	13d	2021-03-08	Guardian Quiptic 1112 Matilda	https://www.fifteensquared.net/2021/03/08/guar...	fifteensquared	Drop an ugly misbehaving group of children here	...	10	True	playground	dropanuglymisbehavinggroupofchildrenhere	False	dnuorgyalp	False	10	False	True
339468	339469	Religious celebration at centre of belief I es...	FIESTA	Religious celebration	22d	2017-05-18	Sunday Telegraph 2898	http://bigdave44.com/2017/05/18/st-2898/	bigdave44	Religious celebration at centre of belief I es...	...	6	True	fiesta	religiouscelebrationatcentreofbeliefiestablish	True	atseif	False	6	False	True
631057	634097	A little spirit, Nat’s nightcap, served up wit...	INSTANT	without delay	6d	2022-07-22	Daily Telegraph 30046	http://bigdave44.com/2022/07/22/dt-30046/	bigdave44	A little spirit, Nat’s nightcap, served up wit...	...	7	True	instant	alittlespiritnatsnightcapservedupwithoutdelay	False	tnatsni	True	7	False	True
207502	207503	It’s safe to break up for holidays (7)	FIESTAS	holidays	23a	2017-05-16	Financial Times 15550 by Armonie	https://www.fifteensquared.net/2017/05/16/fina...	fifteensquared	It’s safe to break up for holidays	...	7	True	fiestas	itssafetobreakupforholidays	False	satseif	False	7	False	True
474764	474765	One is creating a din (5)	NOISE	NaN	3d	2010-11-08	Daily Telegraph 26392	http://bigdave44.com/2010/11/08/dt-26392/	bigdave44	One is creating a din	...	5	True	noise	oneiscreatingadin	False	esion	False	5	False	True
490932	490933	Delights to ensure trap is set (10)	ENRAPTURES	NaN	8a	2014-07-14	Rookie Corner 014	http://bigdave44.com/2014/07/14/rookie-corner-...	bigdave44	Delights to ensure trap is set	...	10	True	enraptures	delightstoensuretrapisset	False	serutparne	False	10	False	True

	all_instances	unique_inds	verified_inds
wordplay
alternation	769	244	216
anagram	45,648	7,121	6,610
container	14,144	1,909	1,728
deletion	2,093	873	695
hidden	3,381	1,110	971
homophone	4,672	663	565
insertion	11,171	2,155	1,915
reversal	11,989	1,660	1,495

	all_instances	unique_inds	verified_inds
wordplay
alternation	769	244	216
anagram	45,648	7,121	6,610
container	14,144	1,909	1,728
deletion	2,093	873	695
hidden	3,381	1,110	971
homophone	4,672	663	565
insertion	11,171	2,155	1,915
reversal	11,989	1,660	1,495

	all_instances	unique_inds	verified_inds
wordplay
alternation	769	244	216
anagram	45,648	7,121	6,610
container	14,144	1,909	1,728
deletion	2,093	873	695
hidden	3,381	1,110	971
homophone	4,672	663	565
insertion	11,171	2,155	1,915
reversal	11,989	1,660	1,495

Column	Description
`clue_id`	The clue ID from `df_clues`
`indicator`	The verified indicator string
`wordplay_ho`	The wordplay type label assigned by George Ho's blog parser (one of: alternation, anagram, container, deletion, hidden, homophone, insertion, reversal)
`wordplay_gt`	A ground-truth label derived from pattern detection on the clue surface. See priority ordering below. Null if no pattern fires or if `answer_letter_count < 4`.
`wordplay_gt_all`	Comma-separated list of ALL ground-truth labels that fired before priority resolution (for debugging/analysis). Also null if `answer_letter_count < 4`.
`answer_letter_count`	Number of letters in the answer, so downstream users can apply their own length filters
`ans_format_valid`	Boolean: True if `answer` adheres to the format given in `clue`
`label_match`	Boolean: True if `wordplay_ho == wordplay_gt`

	clue_id	indicator	wordplay_ho	wordplay_gt	wordplay_gt_all	answer_letter_count	ans_format_valid	label_match
12590	435311	elaborated	anagram	None	NaN	9	True	False
74894	222190	up	reversal	None	NaN	4	True	False
56763	379939	soundly	homophone	None	NaN	4	True	False
19428	495282	misguided	anagram	None	NaN	12	True	False
30216	129705	sorted	anagram	None	NaN	8	True	False

Stage 1: EDA and Data Cleaning for Indicator Clustering¶

Imports¶

Loading the Data¶

Reformat clue_ids¶

Indicators Table clue_ids¶

Charades Table clue_ids¶

Helper Functions¶

clue_info() - Investigate A Clue¶

normalize() - Remove punctuation, accents, make lowercase¶

Normalization Question: Remove Dashes in Answer?¶

count_unique_clues()¶

All Available Tables¶

Indicators¶

Indicators by Clue¶

Indicators Consolidated¶

Dictionary for Indicators Consolidated¶

Clues¶

Charades by Clue¶

Charades¶

Data Requirements & Unresolved Dilemmas¶

Indicator word(s) must appear in the clue surface text¶

Character Lengths of Indicators Must Be Reasonable¶

Issue: Some indicator phrases may contain some fodder¶

Verifiable Wordplay Types¶

Hiddens (FWD & REV)¶

Alternation¶

Summary of Indicators¶

Export Verified Indicators for Downstream Stages¶

Export Deduplicated Unique Indicators¶

Task¶

Extract Alternation Clue IDs¶

Subtask:¶

Summary:¶

Data Analysis Key Findings¶

Insights or Next Steps¶

Export Verified Clues with Labels¶

Schema¶

Ground-truth priority ordering¶

Note on duplicate rows¶

Row Count Reconciliation¶

Reformat `clue_ids`¶

Indicators Table `clue_ids`¶

Charades Table `clue_ids`¶

`clue_info()` - Investigate A Clue¶

`normalize()` - Remove punctuation, accents, make lowercase¶

`count_unique_clues()`¶