Stage 1: EDA and Data Cleaning for Indicator Clustering¶

Primary author: Victoria

Builds on:

  • Data Cleaning for Indicator Clustering - Single Word Indicators.ipynb (Victoria/Sahana — single-word filtering and validation approach)
  • Data Cleaning for Indicator Clustering copy.ipynb (Victoria — checksum verification method)

Prompt engineering: Victoria
AI assistance: Claude (Anthropic), Gemini (Google)
Environment: Local or Colab

Unsupervised Learning Component of Milestone II group project: Exploring Wordplay and Misdirection in Cryptic Crossword Clues with Natural Language Processing

Imports¶

In [1]:
# imports
import os
from pathlib import Path
import pandas as pd
import numpy as np
import re
import string
import unicodedata
import matplotlib.pyplot as plt

Loading the Data¶

In [2]:
# ==========================
# PATHS & CONFIG
# ==========================
# 1. Detect environment
try:
    IS_COLAB = 'google.colab' in str(get_ipython())
except NameError:
    IS_COLAB = False

if IS_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/SIADS 692 Milestone II/Milestone II - NLP Cryptic Crossword Clues')
else:
    try:
        PROJECT_ROOT = Path(__file__).resolve().parent.parent
    except NameError:
        PROJECT_ROOT = Path.cwd().parent

DATA_DIR = PROJECT_ROOT / "data"
OUTPUT_DIR = PROJECT_ROOT / "outputs"
In [3]:
# Read each CSV file into a DataFrame
df_clues = pd.read_csv(f'{DATA_DIR}/clues_raw.csv')
df_indicators = pd.read_csv(f'{DATA_DIR}/indicators_raw.csv')
df_ind_by_clue = pd.read_csv(f'{DATA_DIR}/indicators_by_clue_raw.csv')
df_ind_consolidated = pd.read_csv(f'{DATA_DIR}/indicators_consolidated_raw.csv')
df_charades = pd.read_csv(f'{DATA_DIR}/charades_raw.csv')
df_charades_by_clue = pd.read_csv(f'{DATA_DIR}/charades_by_clue_raw.csv')

Reformat clue_ids¶

Indicators Table clue_ids¶

In [4]:
# Uncomment to see how the clue_id data looks before cleaning
#df_indicators.sample().style.set_properties(**{"white-space": "pre-wrap"})
In [5]:
# Instead of a string with redundant indices, extract only the clue_ids in
# brackets to create a list of integers
df_indicators["clue_ids"] = (
    df_indicators["clue_ids"]
    .str.findall(r"\[(\d+)\]")
    .apply(lambda xs: [int(x) for x in xs])
)

# Include a new column to keep track of how many clues have this indicator
df_indicators["num_clues"] = df_indicators["clue_ids"].apply(len)
In [6]:
df_indicators.sample(3).style.set_properties(**{"white-space": "pre-wrap"})
Out[6]:
  ind_id wordplay indicator clue_ids num_clues
7003 7004 anagram when excited [116576, 188494, 659369, 666437, 667328] 5
14436 14437 reversal fixed up [326282] 1
2242 2243 anagram flickered [622828] 1

Charades Table clue_ids¶

In [7]:
# Uncomment to see what the clue_ids look like before cleaning
#df_charades.sample().style.set_properties(**{"white-space": "pre-wrap"})
In [8]:
# Instead of a string with redundant indices, extract only the clue_ids in
# brackets to create a list of integers
df_charades["clue_ids"] = (
    df_charades["clue_ids"]
    .str.findall(r"\[(\d+)\]")
    .apply(lambda xs: [int(x) for x in xs])
)

# Include a new column to keep track of how many clues have this charade
df_charades["num_clues"] = df_charades["clue_ids"].apply(len)
In [9]:
df_charades.sample(3).style.set_properties(**{"white-space": "pre-wrap"})
Out[9]:
  charade_id charade charade_answer clue_ids num_clues
49198 49199 take up barbering COME A CROPPER [52257] 1
34774 34775 orgy RIOT [6409] 1
16011 16012 evaluates MARKS [76483] 1

Helper Functions¶

clue_info() - Investigate A Clue¶

clue_info(n) displays all the basic and derived information for the clue with clue_id = n.

In [10]:
# View all the info for a specific clue (by clue_id), including
# clue surface, answer, definition, charades, and indicators
def clue_info(n):
  clue_cols = ['clue_id', 'clue', 'answer', 'definition', 'source_url']
  display(
      df_clues[df_clues['clue_id'] == n][clue_cols].style.set_properties(
        subset=["clue", 'source_url'],
        **{"white-space": "pre-wrap"}
    )
      )
  print()
  display(df_charades_by_clue[df_charades_by_clue['clue_id']== n])
  print()
  display(df_ind_by_clue[df_ind_by_clue["clue_id"] == n])
  print()
  display(df_indicators[df_indicators['clue_ids'].apply(lambda lst: n in lst)])
In [11]:
clue_info(623961)
  clue_id clue answer definition source_url
620921 623961 Crêpe in France, Breton one containing local milk, not Italian (6) FLAUNE Crêpe https://www.fifteensquared.net/2022/03/29/azed-2597/

clue_id charade charade_answer

clue_id alternation anagram container deletion hidden homophone insertion reversal
81182 623961 abnormal NaN NaN NaN NaN NaN NaN NaN

ind_id wordplay indicator clue_ids num_clues
0 1 alternation abnormal [623961] 1

normalize() - Remove punctuation, accents, make lowercase¶

In [12]:
# Normalize takes a string (clue surface, indicator, definition, answer)
# and returns the same text with punctuation (including dashes) and
# accents removed, all lowercase.
def normalize(s: str) -> str:
  # remove accents and punctuation, convert to lowercase
  s_normalized = ''.join(
      ch for ch in unicodedata.normalize('NFD', s)
      if unicodedata.category(ch).startswith(('L', 'N', 'Zs'))
  ).lower()

  return s_normalized

Normalization Question: Remove Dashes in Answer?¶

See Clue 624269. Should LA-DI-DA be normalized as:

  • la di da
  • la-di-da
  • ladida
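As currently written, normalize() drops dashes along with all other punctuation, so the third option (ladida) is what we get today. A minimal self-contained sketch, re-declaring normalize exactly as defined above:

```python
import unicodedata

def normalize(s: str) -> str:
    # keep only letters (L), digits (N), and spaces (Zs); strip accents; lowercase
    return ''.join(
        ch for ch in unicodedata.normalize('NFD', s)
        if unicodedata.category(ch).startswith(('L', 'N', 'Zs'))
    ).lower()

print(normalize('LA-DI-DA'))  # dashes are dropped entirely -> 'ladida'
print(normalize('Crêpe'))     # combining accents are stripped -> 'crepe'
```

If we instead wanted la di da, dashes would need to be replaced with spaces before the filtering step rather than removed.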

count_unique_clues()¶

This helper function will let us count how many unique clues are represented in an indicator DataFrame.

In [13]:
def count_unique_clues(series):
  """
  Calculates the total number of unique elements across all lists in a pandas
  Series. Applied to a column of `clue_ids`, this will count the number of
  unique clues represented in an indicator dataframe.

  Args:
    series (pandas.Series): A Series where each element is a list.

  Returns:
    int: The total count of unique elements.
  """
  unique_elements = set()
  for sublist in series:
    if isinstance(sublist, list):
      unique_elements.update(sublist)
  return len(unique_elements)

All Available Tables¶

  • Indicators
  • Indicator By Clue
  • Indicators Consolidated
  • Bonus Dictionary Version of Indicators Consolidated
  • Clue
  • Charade
  • Charade by Clue

Indicators¶

In [14]:
df_indicators.sample(3).style.set_properties(
        subset=["clue_ids"],
        **{"white-space": "pre-wrap"}
    )
Out[14]:
  ind_id wordplay indicator clue_ids num_clues
8430 8431 container out of [117865, 203629, 344177] 3
10658 10659 hidden in evidence [412071] 1
11365 11366 homophone by the ear [411101] 1
In [15]:
# Uncomment to prove that `indicator` is already normalized - no accents,
# punctuation (including dashes), or capital letters

# Create a column of normalized indicators
#df_indicators['indicator_normalized'] = df_indicators['indicator'].apply(normalize)

# Check out all rows where normalization changed the indicator
#df_indicators.loc[df_indicators['indicator'] != df_indicators['indicator_normalized']]
In [16]:
# How many indicators do we have?
len(df_indicators)
Out[16]:
15735

Indicators by Clue¶

In [17]:
df_ind_by_clue.head()
Out[17]:
clue_id alternation anagram container deletion hidden homophone insertion reversal
0 90 NaN transforming NaN NaN NaN NaN NaN NaN
1 97 NaN ground NaN NaN NaN NaN NaN NaN
2 101 NaN NaN NaN NaN NaN verbally NaN NaN
3 142 NaN NaN NaN NaN NaN NaN NaN about to go back
4 145 NaN NaN NaN NaN NaN NaN NaN returned

Indicators Consolidated¶

This dataframe contains eight columns, one for each type of wordplay, and a single row: each cell is a newline-delimited string of all consolidated indicators found in George Ho's dataset.

This data is better represented as a dictionary, so we create ind_by_wordplay_dict from df_ind_consolidated.

In [18]:
df_ind_consolidated
Out[18]:
alternation anagram container deletion hidden homophone insertion reversal
0 abnormal\nafter odd losses\nafter regular exci... a bad way\na bit differently\na brew of\na coc... a single\naboard\nabout\nabout t\nabout/confin... a certain amount off\nabandoned\nabandoned by\... a bit\na bit of\na bit of this\na bunch of\na ... a report on\naccording to announcement\naccord... a bit of\na contingent of\na feature of\na fic... a cleric raised\na fastening device put up\na ...

Dictionary for Indicators Consolidated¶

ind_by_wordplay_dict is a dictionary with wordplay types for the keys and a list of all indicators consolidated for each wordplay type.

Nathan points out that some words in this dictionary contain '' or other suspicious characters. But because we use the Indicators table instead (it has clue IDs for each indicator), we're not bothering to clean this dictionary.

In [19]:
# Create a dictionary where the key is the wordplay type, and the value is
# the list of associated unique indicators.
ind_by_wordplay_dict = {}

for wordplay in df_ind_consolidated.columns:
  ind_by_wordplay_dict[wordplay] = df_ind_consolidated[wordplay].values[0].split('\n')
In [20]:
# Uncomment or change key to view all indicators for that wordplay
#ind_by_wordplay_dict['insertion']
In [21]:
# See how many unique indicators there are for each type of wordplay
for wordplay in ind_by_wordplay_dict:
  print(f"{wordplay}: {len(ind_by_wordplay_dict[wordplay])}")
alternation: 244
anagram: 7346
container: 1950
deletion: 877
hidden: 1110
homophone: 669
insertion: 2173
reversal: 1692

Clues¶

Create normalized entries for the clue, answer and definition by removing punctuation and accents and making them all lowercase:

  • surface: The clue without the '(n)' at the end. The surface reading only, with capitalization and punctuation preserved.
  • surface_normalized: The clue surface without capitalization, punctuation, or accents.
  • answer_normalized: The answer in lower case with punctuation (hyphens) and accents removed.
  • definition_normalized: The definition in lower case with punctuation and accents removed.

There are 323 rows with NaN for a clue, 2,259 rows with NaN for answer, and 149,096 rows with NaN for definition. Because we're only concerned here with indicators (and verifying that the indicators are found in the clue), we drop only the rows with NaN for clue or answer and keep the rows missing a definition.

We will also create a field, req_ans_format, derived from the clue text, that captures the letter count of the word(s) in the answer, and another field, req_ans_letter_count, that specifies the total number of letters (not counting spaces or dashes) required in the answer according to the clue text.

Finally, we will include a column ans_format_valid that determines whether the original answer adheres to the format specified in parentheses in the clue.

In [22]:
# Uncomment to see how many rows have NaN for 'clue', 'answer', or 'definition'
#df_clues['clue'].value_counts(dropna=False).head()
#df_clues['answer'].value_counts(dropna=False).head()
#df_clues['definition'].value_counts(dropna=False).head()

# Drop all rows where the clue or answer is NaN (they are type float, and we want clue to be a string)
df_clues.dropna(subset=['clue', 'answer'], inplace=True)
In [23]:
# Surface: remove trailing numeric parentheses in clue
df_clues['surface'] = df_clues['clue'].astype(str).apply(lambda x: re.sub(r'\s*\(\d+(?:[,\s-]+\d+)*\)$', '', x))
In [24]:
# Create surface normalized - no accents, punctuation, capitalized letters
df_clues['surface_normalized'] = df_clues['surface'].astype(str).apply(normalize)

# Create answer normalized - no accents, punctuation, capitalized letters
df_clues['answer_normalized'] = df_clues['answer'].astype(str).apply(normalize)

# Create definition normalized - no accents, punctuation, capitalized letters
#df_clues['definition_normalized'] = df_clues['definition'].astype(str).apply(normalize)
In [25]:
# Extract the required answer format (the content inside the parentheses in clue)
df_clues['req_ans_format'] = df_clues['clue'].astype(str).str.extract(r'\((\d+(?:[,\s-]+\d+)*)\)$')

# Sums the numbers found in the answer format string, which specifies
# the required answer letter count.
df_clues['req_ans_letter_count'] = df_clues['req_ans_format'].apply(
    lambda x: sum(int(n) for n in re.findall(r'\d+', str(x))) if pd.notnull(x) else 0
)
In [26]:
# Create a column that determines whether the answer format is valid
# This will be handy later when we're determining ground-truth labels for
# wordplay types and algorithmically searching for permutations of the
# answer inside the clue text.
def check_format_match(row):
    answer = str(row['answer'])
    req_format = str(row['req_ans_format'])
    
    # 1. Extract all numbers from the format (e.g., '5,2,3,4' -> ['5', '2', '3', '4'])
    required_lengths = [int(n) for n in re.findall(r'\d+', req_format)]
    
    # 2. Extract all word segments from the ORIGINAL answer, ignoring punctuation
    # This splits 'knock on the head' or 'modern-day' into clean lists
    answer_segments = re.findall(r'[a-zA-Z0-9]+', answer)
    answer_lengths = [len(segment) for segment in answer_segments]
    
    # 3. Compare the two lists
    return required_lengths == answer_lengths

# Apply the check to every row
df_clues['ans_format_valid'] = df_clues.apply(check_format_match, axis=1)
In [27]:
# Get the date range for the dataset
df_clues['puzzle_date'] = pd.to_datetime(df_clues['puzzle_date'], exact=False)
print(df_clues['puzzle_date'].min())
print(df_clues['puzzle_date'].max())
2009-01-01 00:00:00
2023-07-15 00:00:00

Charades by Clue¶

In [28]:
df_charades_by_clue.sample(3)
Out[28]:
clue_id charade charade_answer
59680 172653 pious type ST
40209 108575 floozie TART
28058 74075 church CH

Charades¶

In [29]:
df_charades.sample(3).style.set_properties(
        subset=["clue_ids"],
        **{"white-space": "pre-wrap"}
    )
Out[29]:
  charade_id charade charade_answer clue_ids num_clues
1803 1804 against VS [84574] 1
18966 18967 for no special reason IDLY [22255] 1
33022 33023 novelist FITZGERALD [95214] 1

Data Requirements & Unresolved Dilemmas¶

As we apply the requirements, our dataset of valid indicators will keep decreasing. Create a dataframe to keep track of how much data we're losing at each step.

  • Once we restrict our dataset, do we have enough indicators for clustering (assume $2 < k < 12$)?
In [30]:
# Create a dataframe and add the counts from Indicators
df_ind_counts = pd.DataFrame(columns=["unique_inds"])
df_ind_counts['unique_inds'] = df_indicators.groupby(by=['wordplay']).count()['indicator']

# Also keep track of the total number of indicators
ind_total = df_ind_counts['unique_inds'].sum()
In [31]:
# Include a column that counts indicators by clue, which will
# double-count any indicator appearing in multiple clues
df_ind_counts['all_instances'] = df_ind_by_clue.count()

# Rearrange the columns to go from large to small. (Counts from
# Indicators Consolidated are excluded because that table has no
# associated clue IDs.)
df_ind_counts = df_ind_counts[['all_instances', 'unique_inds']]
In [32]:
print(f"Total Number of Clues: {len(df_clues):,}")
print(f"Total Unique Indicators: {ind_total:,}")
print(f"Total Instances of Indicators in All Clues: {df_ind_counts['all_instances'].sum():,}")
print(f"Total Number of Clues Containing Indicator(s): {df_ind_by_clue['clue_id'].count():,}")
Total Number of Clues: 658,031
Total Unique Indicators: 15,735
Total Instances of Indicators in All Clues: 93,867
Total Number of Clues Containing Indicator(s): 88,037
In [33]:
df_ind_counts
Out[33]:
all_instances unique_inds
wordplay
alternation 769 244
anagram 45648 7121
container 14144 1909
deletion 2093 873
hidden 3381 1110
homophone 4672 663
insertion 11171 2155
reversal 11989 1660

Summary:

  • Of the entire dataset of 660,613 cryptic crossword clues, 88,037 clues came from blog posts where indicators could be identified. (from df_ind_by_clue)
  • Because sometimes clues have more than one indicator, a total of 93,867 indicators were found in the dataset, and are associated with a parsed clue. (from df_ind_by_clue)
  • CCCs reuse indicators. Of the 93,867 indicators identified in the data, only 15,735 are unique.
  • More unique indicators appear in df_ind_consolidated (16,061) than in df_indicators (15,735). We cannot easily discover why because the Indicators Consolidated table was stripped of context.
  • We will use the Indicators table going forward because it cites which clues used that indicator. We can verify the quality of the data better.
  • Note that a common indicator like "within" may be counted twice: once as a hidden indicator and once as a container indicator. Therefore, if we were to export the 15,735 indicator words, there would be duplicates for the different types of wordplay.

Indicator word(s) must appear in the clue surface text¶

To make sure that the indicator word wasn't incorrectly parsed, it must appear in the clue as a fully intact word, not just a segment of a word.

This will exclude some clues that use a compound word to contain both the indicator and fodder, like Minute Cryptic's "This semicircle encircles you (4)". Semi is a selection indicator telling you to take half of "circle".
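The verification below hinges on regex word boundaries (\b). A small self-contained sketch of why "semi" inside "semicircle" does not count as intact (toy strings; appears_intact is a hypothetical helper, not pipeline code):

```python
import re

def appears_intact(indicator: str, surface: str) -> bool:
    # \b requires the indicator to both start and end at a word boundary,
    # so a match inside a longer word fails
    return bool(re.search(rf"\b{re.escape(indicator)}\b", surface))

print(appears_intact('semi', 'this semicircle encircles you'))  # False: part of a compound word
print(appears_intact('about', 'talk about a reversal'))         # True: intact word
```

re.escape matters for multi-word indicators containing characters like apostrophes that would otherwise be interpreted as regex syntax.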

In [34]:
# Add a column with a list of VERIFIED clue IDs: where we know the indicator
# appeared in the surface text as intact words.

# Build fast lookup dictionary
clue_lookup = df_clues.set_index("clue_id")["surface_normalized"].to_dict()

# Given an indicator and its list of clue_ids where it appears,
# return a new list of clue_ids where the indicator definitely
# appears in the normalized clue surface as intact words.
def verify_clues(indicator, clue_ids):
    if not clue_ids:
        return []

    # Escape regex special characters inside indicator
    pattern = rf"\b{re.escape(indicator)}\b"

    verified = []

    for cid in clue_ids:
        surface = clue_lookup.get(cid)

        if surface and re.search(pattern, surface):
            verified.append(cid)

    return verified


# add the column for the list of verified clue_ids
df_indicators["clue_ids_verified"] = df_indicators.apply(
    lambda row: verify_clues(row["indicator"], row["clue_ids"]),
    axis=1
)
In [35]:
# Add a column that counts the number of verified clue_ids for each indicator
df_indicators['num_clues_verified'] = df_indicators['clue_ids_verified'].apply(len)
In [36]:
# Uncomment to inspect the indicators table
df_indicators.sample(3)
Out[36]:
ind_id wordplay indicator clue_ids num_clues clue_ids_verified num_clues_verified
338 339 anagram afloat [314967] 1 [314967] 1
9594 9595 deletion has taken [641230] 1 [] 0
5880 5881 anagram suspected [378483] 1 [378483] 1
In [37]:
# Inspect some clues where the indicators were invalid
#clue_info(635505) # indicator not in clue or on webpage
#clue_info(591484) # indicator not in clue or on webpage
#clue_info(627621) # indicator not in clue, defn NaN
#clue_info(422350) # indicator is a partial word in clue bc blogger error
#clue_info(76808) # misparsed 'hidden' formatting, the identified indicator is actually fodder
In [38]:
# Keep track of how many indicators are left if we keep only ones with
# at least one verified clue_id
mask = df_indicators['num_clues_verified'] > 0
df_ind_counts['verified_inds'] = df_indicators[mask].groupby(by=['wordplay']).count()['indicator']
In [39]:
df_ind_counts.style.format('{:,}')
Out[39]:
  all_instances unique_inds verified_inds
wordplay      
alternation 769 244 216
anagram 45,648 7,121 6,610
container 14,144 1,909 1,728
deletion 2,093 873 695
hidden 3,381 1,110 971
homophone 4,672 663 565
insertion 11,171 2,155 1,915
reversal 11,989 1,660 1,495
In [40]:
df_ind_counts.sum()
Out[40]:
all_instances    93867
unique_inds      15735
verified_inds    14195
dtype: int64

Character Lengths of Indicators Must Be Reasonable¶

Investigate indicators that are 1, 2, or 3 characters long for invalid words. These may already be caught when we excluded indicators that did not appear intact in the clue.

Also investigate the longest indicators.

NOTE: Once we limit ourselves to verified indicators (those that appear as intact words in the clue surface), the indicators that were suspicious because of their length are all excluded. The remaining shortest and longest indicators look like real words.

In [41]:
# Create a column for the number of characters in the indicator phrase
df_indicators['num_chars'] = df_indicators['indicator'].apply(len)
In [42]:
# See the counts for each indicator length, just for verified indicators
mask = (df_indicators['num_clues_verified'] > 0)
display(df_indicators[mask]['num_chars'].value_counts(dropna=False).sort_index())
num_chars
2       12
3       83
4      342
5      595
6      919
7     1445
8     1555
9     1501
10    1501
11    1270
12    1078
13     911
14     704
15     653
16     424
17     350
18     248
19     170
20     110
21     110
22      58
23      51
24      36
25      19
26      15
27      11
28      10
29       5
31       5
32       2
33       1
36       1
Name: count, dtype: int64
In [43]:
# Visualize the distribution of indicator length (as number of characters)
# just for unique indicators with verified clues
df_indicators[mask]['num_chars'].value_counts().sort_index().plot(kind='bar')
Out[43]:
<Axes: xlabel='num_chars'>
In [44]:
# Uncomment to manually inspect 2-character verified indicators
#cols = ['wordplay', 'indicator', 'clue_ids_verified', 'num_clues_verified', 'num_clues']
#mask = (df_indicators['num_clues_verified'] > 0) & (df_indicators['num_chars'] == 2)
#df_indicators[mask][cols].head(12).sort_values(by='num_clues_verified', ascending=False)
In [45]:
# Uncomment to manually inspect 3-character verified indicators
#cols = ['wordplay', 'indicator', 'clue_ids_verified', 'num_clues_verified', 'num_clues']
#mask = (df_indicators['num_clues_verified'] > 0) & (df_indicators['num_chars'] == 3)
#df_indicators[mask][cols].head(83).sort_values(by='num_clues_verified', ascending=False)
In [46]:
# Uncomment to manually inspect the longest verified indicators
#cols = ['wordplay', 'indicator', 'clue_ids_verified', 'num_clues', 'num_chars']
#mask = (df_indicators['num_clues_verified'] > 0) & (df_indicators['num_chars'] > 25)
#df_indicators[mask][cols].sort_values(by='num_chars', ascending=False)

Issue: Some indicator phrases may contain some fodder¶

Inspecting the longest verified indicators, it's possible that some of these phrases contain more than just the indicator, but an indicator at least appears to be present in each.

If we later represent these as semantic vectors using a SentenceTransformer model, the extra fodder words could be a source of noise. We may want to exclude indicators with long character counts, or even add a data cleaning step that reduces these longer phrases to the known indicator phrases they contain.

Verifiable Wordplay Types¶

Hiddens (FWD & REV)¶

NOTE:

  • hidden_fwd is just the hidden wordplay type
  • hidden_rev is just the reversal wordplay type

The letters of a hidden answer appear directly in the clue surface, either forwards or in reverse, ignoring punctuation and spaces.

This finds 23,054 clues where the answer is hidden going forwards and 6,798 where the answer is hidden in reverse. However, these are overestimates because the answers have not been verified and include some very short malformed answers that are easy to find.

In [47]:
df_clues.sample()
Out[47]:
clue_id clue answer definition clue_number puzzle_date puzzle_name source_url source surface surface_normalized answer_normalized req_ans_format req_ans_letter_count ans_format_valid
592057 595097 What (as a joke) powered old radio Roger clums... STEREOGRAM It suggests solidity 11d 2021-10-17 Azed No 2574 Plain https://www.fifteensquared.net/2021/10/17/azed... fifteensquared What (as a joke) powered old radio Roger clums... what as a joke powered old radio roger clumsil... stereogram 10 10 True
In [48]:
df_indicators.sample()
Out[48]:
ind_id wordplay indicator clue_ids num_clues clue_ids_verified num_clues_verified num_chars
13898 13899 insertion to rest in [457284] 1 [457284] 1 10
In [49]:
# Compute hidden_fwd and hidden_rev
# Helper function to remove all whitespace for hidden word search
def remove_all_whitespace(text: str) -> str:
    if isinstance(text, str):
        return text.replace(" ", "")
    return ""

# Create 'answer_no_spaces' from 'answer_normalized'
df_clues['answer_no_spaces'] = df_clues['answer_normalized'].apply(remove_all_whitespace)

# Create 'surface_no_spaces' from 'surface_normalized'
df_clues['surface_no_spaces'] = df_clues['surface_normalized'].apply(remove_all_whitespace)

# Calculate 'hidden_fwd'
df_clues['hidden_fwd'] = df_clues.apply(
    lambda row: row['answer_no_spaces'] in row['surface_no_spaces'],
    axis=1
)

# Calculate 'hidden_rev'
df_clues['answer_no_spaces_rev'] = df_clues['answer_no_spaces'].apply(lambda x: x[::-1])
df_clues['hidden_rev'] = df_clues.apply(
    lambda row: row['answer_no_spaces_rev'] in row['surface_no_spaces'],
    axis=1
)
In [50]:
df_clues[df_clues['hidden_fwd']].shape[0]
Out[50]:
23054
In [51]:
df_clues[df_clues['hidden_rev']].shape[0]
Out[51]:
6798
In [52]:
df_clues.head()
Out[52]:
clue_id clue answer definition clue_number puzzle_date puzzle_name source_url source surface surface_normalized answer_normalized req_ans_format req_ans_letter_count ans_format_valid answer_no_spaces surface_no_spaces hidden_fwd answer_no_spaces_rev hidden_rev
0 1 Acquisitive chap, as we see it (8) COVETOUS Acquisitive 1a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Acquisitive chap, as we see it acquisitive chap as we see it covetous 8 8 True covetous acquisitivechapasweseeit False suotevoc False
1 2 Back yard fencing weak and sagging (6) DROOPY sagging 5a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Back yard fencing weak and sagging back yard fencing weak and sagging droopy 6 6 True droopy backyardfencingweakandsagging False ypoord False
2 3 Stripping off uniform, love holding colonel's ... UNCLOTHING Stripping 8a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Stripping off uniform, love holding colonel's ... stripping off uniform love holding colonels coat unclothing 10 10 True unclothing strippingoffuniformloveholdingcolonelscoat False gnihtolcnu False
3 4 Without a mark where they should be gained (4) EXAM where they should be gained 9a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Without a mark where they should be gained without a mark where they should be gained exam 4 4 True exam withoutamarkwheretheyshouldbegained False maxe False
4 5 Put a stop to Rugby's foul school leader (5,2,... KNOCK ON THE HEAD Put a stop to 10a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Put a stop to Rugby's foul school leader put a stop to rugbys foul school leader knock on the head 5,2,3,4 14 True knockonthehead putastoptorugbysfoulschoolleader False daehehtnokconk False
In [53]:
# Compute answer letter count (needed for hidden_fwd filtering below and later exports)
df_clues['answer_letter_count'] = df_clues['answer_no_spaces'].apply(len)
In [54]:
# See some examples of 2-letter, 3-letter, and 4-letter answers, just for verified hidden_fwd,
# to determine if they are likely to be real words.
cols = ['clue', 'answer', 'definition']

for i in [2, 3, 4]:
    mask = (df_clues['hidden_fwd']) & (df_clues['answer_letter_count'] == i)
    display(df_clues[mask][cols].sample(5).style.set_properties(**{"white-space": "pre-wrap"}))
  clue answer definition
105913 EG (for example; for one)+ an anagram of (order of) GREY – This construction doesn’t seem to fit the clue which requires the anagram of GREY to contain (swallows)  EG.  The containment only works if EG is reversed. EG nan
440981 Down-to-earth hearing perhaps after November in middle of day (2-8) NO Down-to-earth
371500 One degree Celsius in river in thaw (2-3) DE thaw
347602 Stir also around edge to dissolve (2-2) TO Stir
245139 I’m suffering with distemper over old PM PM nan
  clue answer definition
183546 To some extent, ideas perplex Serpent (3) ASP Serpent
64935 Fair enough concealing intense anger (3) IRE intense anger
622145 Some smear campaigns provide spin (3) ARC spin
228369 Good time to have some, if unhappy? (3 FUN Good time
223469 Salt found in the mustard (3) TAR nan
  clue answer definition
344642 Greek character confined to maze, taurine (4) ZETA Greek character
472372 Standard feature of Saturn or Mars (4) NORM nan
137177 . A source of pain in past years (4) STYE nan
100380 City covered in Virginia creepers ACRE City
612248 Different part of tunnel searched (4) ELSE nan

We will restrict to 4+ letter answers. Also verify that the answer has the correct letter count and format according to the (X) at the end of the clue.
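A minimal sketch of that restriction on a toy DataFrame (the column names mirror those computed above; the toy rows are invented for illustration, and the 4-letter threshold is our working assumption):

```python
import pandas as pd

# Toy rows mimicking the columns computed above
toy = pd.DataFrame({
    'answer_letter_count': [3, 4, 10, 6],
    'ans_format_valid':    [True, True, False, True],
    'hidden_fwd':          [True, True, True, False],
})

# Keep only hidden_fwd candidates with 4+ letters and a valid (X) format
mask = (toy['hidden_fwd']
        & (toy['answer_letter_count'] >= 4)
        & toy['ans_format_valid'])
print(mask.sum())  # only the second toy row survives -> 1
```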

Alternation¶

If the answer word appears in the surface as alternating letters, label it as verified alternation wordplay.

This found 4,220 clues with alternation, but some of those will be erroneous (short) answers.

In [55]:
# An efficient way to find alternation

# Build a regex from the answer characters with exactly one character
# between each pair (e.g. "ans" -> "a.n.s") and search the clue surface
def check_alternation_seq(ans, clue):
    if not ans or not clue:
        return False
    # Creates "A.N.S.W.E.R"
    pattern = ".".join(re.escape(c) for c in ans)
    return bool(re.search(pattern, clue))

# Applying to the dataframe
df_clues['alternation'] = [
    check_alternation_seq(ans, clue)
    for ans, clue in zip(df_clues['answer_no_spaces'], df_clues['surface_no_spaces'])
]
In [56]:
df_clues[df_clues['alternation']].shape[0]
Out[56]:
4213

Summary of Indicators¶

In [57]:
df_ind_counts.style.format('{:,}')
Out[57]:
  all_instances unique_inds verified_inds
wordplay      
alternation 769 244 216
anagram 45,648 7,121 6,610
container 14,144 1,909 1,728
deletion 2,093 873 695
hidden 3,381 1,110 971
homophone 4,672 663 565
insertion 11,171 2,155 1,915
reversal 11,989 1,660 1,495
In [58]:
df_ind_counts.sum().to_frame().T.style.format('{:,}')
Out[58]:
  all_instances unique_inds verified_inds
0 93,867 15,735 14,195
In [59]:
df_ind_counts.sort_values(by='all_instances').plot.barh(stacked=False, figsize=(8, 5))
Out[59]:
<Axes: ylabel='wordplay'>
In [60]:
# Add a column for the number of words within an indicator
df_indicators['ind_wc'] = df_indicators['indicator'].apply(lambda x: len(x.split()))
In [61]:
# Visualize the valid indicators by word count
mask = df_indicators['num_clues_verified'] > 0
df_indicators[mask]['ind_wc'].value_counts().sort_index().plot(kind='bar')
Out[61]:
<Axes: xlabel='ind_wc'>
In [62]:
# Visualize the prevalence/redundancy of valid indicators
mask = df_indicators['num_clues_verified'] > 0
df_indicators[mask]['num_clues_verified'].value_counts().head(15).sort_index().plot(kind='bar', figsize=(8, 5))
Out[62]:
<Axes: xlabel='num_clues_verified'>
In [63]:
# View some examples of the most common indicators
df_indicators[['num_clues_verified', 'indicator', 'wordplay']].sort_values(by='num_clues_verified', ascending=False).head(10)
Out[63]:
num_clues_verified indicator wordplay
12878 1487 in insertion
7367 1251 about container
4468 789 out anagram
14173 660 back reversal
15558 603 up reversal
4278 540 new anagram
11706 490 reportedly homophone
14081 426 about reversal
281 410 about anagram
4651 390 possibly anagram

Export Verified Indicators for Downstream Stages¶

This section produces the output files consumed by Stage 2 (embedding generation) and Stage 5 (evaluation).

Output files:

  • verified_indicators_unique.csv — One row per unique indicator string (12,622 rows). No labels. This is the input to Stage 2 (02_embedding_generation_Victoria.ipynb).
  • verified_clues_labeled.csv — One row per verified (clue_id, indicator) pair (76,015 rows). Includes Ho blog labels and algorithmic ground-truth labels. Used for evaluation.

Note that df_indicators contains 14,195 verified rows because the same indicator string can appear under multiple wordplay types (e.g., "about" appears as container, reversal, and anagram — three rows). The deduplicated export collapses these to 12,622 unique strings.
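The collapse from per-wordplay rows to unique strings can be sketched on a toy frame (hypothetical counts; column names mirror df_indicators):

```python
import pandas as pd

# Toy version of df_indicators: the same indicator string can appear
# under several wordplay types, one row each
toy = pd.DataFrame({
    'wordplay': ['container', 'reversal', 'anagram', 'anagram'],
    'indicator': ['about', 'about', 'about', 'mixed'],
    'num_clues_verified': [1251, 426, 410, 12],
})

verified = toy[toy['num_clues_verified'] > 0]
n_rows = len(verified)                      # 4 verified (wordplay, indicator) rows
n_unique = verified['indicator'].nunique()  # 2 unique strings after dedup
print(n_rows, n_unique)
```

The real data behaves the same way: 14,195 verified rows collapse to 12,622 unique strings.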

In [64]:
df_indicators.head()
Out[64]:
ind_id wordplay indicator clue_ids num_clues clue_ids_verified num_clues_verified num_chars ind_wc
0 1 alternation abnormal [623961] 1 [] 0 8 1
1 2 alternation after odd losses [139327] 1 [139327] 1 16 3
2 3 alternation after regular excisions [107211] 1 [107211] 1 23 3
3 4 alternation alternately [407055] 1 [407055] 1 11 1
4 5 alternation alternating [449798] 1 [449798] 1 11 1
In [65]:
mask = df_indicators['num_clues_verified'] > 0
print(len(df_indicators['indicator'].unique()))
print(len(df_indicators[mask]['indicator'].unique()))
13920
12622

Export Deduplicated Unique Indicators¶

Deduplicated list of 12,622 unique indicator strings for Stage 2 embedding input. Each indicator appears exactly once regardless of how many wordplay types it is associated with.

In [66]:
# Deduplicated list of unique indicator strings for Stage 2 embedding input
mask = df_indicators['num_clues_verified'] > 0
unique_indicators = (
    df_indicators[mask]['indicator']
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)
unique_indicators.to_csv(
    DATA_DIR / 'verified_indicators_unique.csv',
    index=False,
    header=['indicator']
)
print(f"Saved {len(unique_indicators)} unique indicator strings to verified_indicators_unique.csv")
Saved 12622 unique indicator strings to verified_indicators_unique.csv
In [67]:
df_ind_counts.style.format('{:,}')
Out[67]:
  all_instances unique_inds verified_inds
wordplay      
alternation 769 244 216
anagram 45,648 7,121 6,610
container 14,144 1,909 1,728
deletion 2,093 873 695
hidden 3,381 1,110 971
homophone 4,672 663 565
insertion 11,171 2,155 1,915
reversal 11,989 1,660 1,495
In [68]:
df_ind_counts.sum().to_frame().T.style.format('{:,}')
Out[68]:
  all_instances unique_inds verified_inds
0 93,867 15,735 14,195

Task¶

Create a DataFrame named df_alternation containing only indicators related to 'alternation' wordplay. This DataFrame should include the 'wordplay', 'indicator', and 'clue_ids_verified' columns, where 'clue_ids_verified' is a refined list of clue_ids for each indicator that corresponds to clues definitively identified as 'alternation' wordplay in df_clues. Finally, display the first few rows of df_alternation.

Extract Alternation Clue IDs¶

Subtask:¶

Create a set of clue_ids from df_clues where the 'alternation' column is True. This set will be used for efficient lookup.

Reasoning: To efficiently store and retrieve clue IDs that exhibit alternation wordplay, I will filter the df_clues DataFrame to include only rows where the 'alternation' column is True, then extract the 'clue_id' column and convert it into a Python set.

In [69]:
alternation_clue_ids = set(df_clues.loc[df_clues['alternation'], 'clue_id'])

Reasoning: Now that we have a set of verified clue_ids for alternation wordplay, the next step is to filter df_indicators to the 'alternation' rows and refine each clue_ids_verified list so it contains only clue_ids also present in alternation_clue_ids. This ensures that the indicators truly correspond to the 'alternation' wordplay identified in df_clues.

In [70]:
df_alternation = df_indicators[df_indicators['wordplay'] == 'alternation'].copy()

df_alternation['clue_ids_verified'] = df_alternation['clue_ids_verified'].apply(
    lambda x: [clue_id for clue_id in x if clue_id in alternation_clue_ids]
)

# Filter out rows where clue_ids_verified is empty after refinement
df_alternation = df_alternation[df_alternation['clue_ids_verified'].apply(len) > 0]

df_alternation = df_alternation[['wordplay', 'indicator', 'clue_ids_verified']]

df_alternation.head()
Out[70]:
wordplay indicator clue_ids_verified
2 alternation after regular excisions [107211]
3 alternation alternately [407055]
7 alternation alternatives [411337]
8 alternation an even distribution of fruit [412172]
10 alternation appearing alternately [473579]
In [71]:
df_alternation.shape[0]
Out[71]:
121
In [72]:
df_indicators[df_indicators['wordplay'] == 'alternation'].shape[0]
Out[72]:
244
In [73]:
count_unique_clues(df_alternation['clue_ids_verified'])
Out[73]:
255
In [74]:
count_unique_clues(df_indicators['clue_ids_verified'])
Out[74]:
70959
In [75]:
count_unique_clues(df_indicators[df_indicators['wordplay'] == 'alternation']['clue_ids_verified'])
Out[75]:
654

Summary:¶

Data Analysis Key Findings¶

  • A set named alternation_clue_ids was successfully created, containing the unique clue_ids corresponding to clues definitively identified as 'alternation' wordplay in the df_clues DataFrame.
  • A new DataFrame, df_alternation, was successfully constructed. It initially filtered df_indicators for rows where the 'wordplay' column was 'alternation'.
  • The clue_ids_verified column in df_alternation was refined to include only those clue_ids that were present in the alternation_clue_ids set, ensuring that each indicator is linked exclusively to verified 'alternation' clues.
  • Rows in df_alternation where the clue_ids_verified list became empty after refinement were removed, ensuring that all remaining indicators are associated with at least one verified alternation clue.
  • The final df_alternation DataFrame contains only the 'wordplay', 'indicator', and clue_ids_verified columns, with entries like 'after regular excisions', 'alternately', and 'alternatives' as indicators.

Insights or Next Steps¶

  • The df_alternation DataFrame now provides a clean, verified dataset of alternation wordplay indicators and their associated clue IDs, which can be used for training a model to identify 'alternation' wordplay or for further linguistic analysis.
  • Further analysis could involve examining the commonality of specific indicators within the clue_ids_verified lists to understand which indicators are most frequently used for 'alternation' wordplay.
In [76]:
def check_anagram_in_surface(answer_no_spaces_text, surface_no_spaces_text):
    if not answer_no_spaces_text or not surface_no_spaces_text:
        return False

    answer_len = len(answer_no_spaces_text)
    if answer_len == 0:
        return False

    sorted_answer_chars = sorted(answer_no_spaces_text)

    for i in range(len(surface_no_spaces_text) - answer_len + 1):
        substring = surface_no_spaces_text[i : i + answer_len]
        if sorted(substring) == sorted_answer_chars:
            return True
    return False

# Apply this function to df_clues
df_clues['is_anagram_in_surface'] = df_clues.apply(
    lambda row: check_anagram_in_surface(row['answer_no_spaces'], row['surface_no_spaces']),
    axis=1
)

# Display the DataFrame with the new column
df_clues.head()
Out[76]:
clue_id clue answer definition clue_number puzzle_date puzzle_name source_url source surface ... req_ans_letter_count ans_format_valid answer_no_spaces surface_no_spaces hidden_fwd answer_no_spaces_rev hidden_rev answer_letter_count alternation is_anagram_in_surface
0 1 Acquisitive chap, as we see it (8) COVETOUS Acquisitive 1a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Acquisitive chap, as we see it ... 8 True covetous acquisitivechapasweseeit False suotevoc False 8 False False
1 2 Back yard fencing weak and sagging (6) DROOPY sagging 5a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Back yard fencing weak and sagging ... 6 True droopy backyardfencingweakandsagging False ypoord False 6 False False
2 3 Stripping off uniform, love holding colonel's ... UNCLOTHING Stripping 8a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Stripping off uniform, love holding colonel's ... ... 10 True unclothing strippingoffuniformloveholdingcolonelscoat False gnihtolcnu False 10 False False
3 4 Without a mark where they should be gained (4) EXAM where they should be gained 9a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Without a mark where they should be gained ... 4 True exam withoutamarkwheretheyshouldbegained False maxe False 4 False False
4 5 Put a stop to Rugby's foul school leader (5,2,... KNOCK ON THE HEAD Put a stop to 10a 2019-08-08 Times 27424 https://times-xwd-times.livejournal.com/218581... times_xwd_times Put a stop to Rugby's foul school leader ... 14 True knockonthehead putastoptorugbysfoulschoolleader False daehehtnokconk False 14 False False

5 rows × 23 columns

In [77]:
df_clues[df_clues['is_anagram_in_surface']].sample(10)
Out[77]:
clue_id clue answer definition clue_number puzzle_date puzzle_name source_url source surface ... req_ans_letter_count ans_format_valid answer_no_spaces surface_no_spaces hidden_fwd answer_no_spaces_rev hidden_rev answer_letter_count alternation is_anagram_in_surface
407170 407171 Composed a bad tune with no let up (8) UNABATED NaN 16d 2019-10-21 Rookie Corner – 289 http://bigdave44.com/2019/10/21/rookie-corner-... bigdave44 Composed a bad tune with no let up ... 8 True unabated composedabadtunewithnoletup False detabanu False 8 False True
294692 294693 Lots of damaged blades cut OK (11) BUCKETLOADS Lots 7d 2021-03-05 Daily Telegraph 29610 http://bigdave44.com/2021/03/05/dt-29610/ bigdave44 Lots of damaged blades cut OK ... 11 True bucketloads lotsofdamagedbladescutok False sdaoltekcub False 11 False True
228074 228075 Jacob supplanted him in the wages audit (4) ESAU Jacob supplanted him 3d 2016-12-21 Financial Times 15421 Dante https://www.fifteensquared.net/2016/12/21/fina... fifteensquared Jacob supplanted him in the wages audit ... 4 True esau jacobsupplantedhiminthewagesaudit True uase False 4 False True
568965 571297 Stranger is using and supplying (7) ISSUING NaN 3d NaT **1/2 cru-cryptics/Cryptic189.puz cru_cryptics Stranger is using and supplying ... 7 True issuing strangerisusingandsupplying False gniussi False 7 False True
267793 267794 Undressed hair went wild (2,3,3) IN THE RAW Undressed 24a 2014-02-18 Financial Times 14549 With Picture Quiz https://www.fifteensquared.net/2014/02/18/fina... fifteensquared Undressed hair went wild ... 8 True intheraw undressedhairwentwild False warehtni False 8 False True
571351 573683 Parading on, playing instrument (5,5) GRANDPIANO NaN 14d NaT One-Star IV cru-cryptics/Cryptic143.puz cru_cryptics Parading on, playing instrument ... 10 False grandpiano paradingonplayinginstrument False onaipdnarg False 10 False True
74420 74421 Wind was single flute, perhaps? (9) WINEGLASS flute, perhaps? 1a 2018-01-22 Times 26941 https://times-xwd-times.livejournal.com/188212... times_xwd_times Wind was single flute, perhaps? ... 9 True wineglass windwassinglefluteperhaps False ssalgeniw False 9 False True
355259 355260 Outsiders in Sale are characters to be shifted... ALIENS Outsiders 24a 2018-04-19 Daily Telegraph 28717 http://bigdave44.com/2018/04/19/dt-28717/ bigdave44 Outsiders in Sale are characters to be shifted ... 6 True aliens outsidersinsalearecharacterstobeshifted False sneila False 6 False True
530233 530234 Ma shaky about face veil (7) YASHMAK face veil 10a 2016-09-26 No 11810, Monday 26 Sep 2016, Gridman https://thehinducrosswordcorner.blogspot.com/2... thehinducrosswordcorner Ma shaky about face veil ... 7 True yashmak mashakyaboutfaceveil False kamhsay False 7 False True
497935 497936 A co-host broadcast in this sort of collusion (7) CAHOOTS NaN 15a 2015-09-12 NTSPP – 292 http://bigdave44.com/2015/09/12/ntspp-292/ bigdave44 A co-host broadcast in this sort of collusion ... 7 True cahoots acohostbroadcastinthissortofcollusion False stoohac False 7 False True

10 rows × 23 columns

In [78]:
clue_info(261488)
  clue_id clue answer definition source_url
261487 261488 A deception fraudster used regularly (4) RUSE A deception https://www.fifteensquared.net/2014/03/07/financial-times-14564-by-alberich/

clue_id charade charade_answer

clue_id alternation anagram container deletion hidden homophone insertion reversal

ind_id wordplay indicator clue_ids num_clues clue_ids_verified num_clues_verified num_chars ind_wc
In [79]:
# answer_letter_count was computed earlier (after hidden detection)
# Verify it exists
assert 'answer_letter_count' in df_clues.columns, "answer_letter_count missing from df_clues"
In [80]:
df_clues['answer_letter_count'].value_counts(dropna=False)
Out[80]:
answer_letter_count
7     115192
6     107437
8      97656
5      94664
9      75354
4      66124
10     38343
11     12504
12     12183
3      11777
13      9254
15      7154
14      6588
1       1528
2        841
16       431
17       257
18       188
19       136
20       104
22        77
21        75
23        53
24        30
26        23
28        14
25        10
30         9
27         5
29         5
33         3
31         3
32         2
39         2
37         1
45         1
46         1
92         1
38         1
Name: count, dtype: int64

Export Verified Clues with Labels¶

This cell produces verified_clues_labeled.csv: one row per verified (clue, indicator) pair, with both the original Ho blog label and an algorithmically determined ground-truth label.

Schema¶

Column Description
clue_id The clue ID from df_clues
indicator The verified indicator string
wordplay_ho The wordplay type label assigned by George Ho's blog parser (one of: alternation, anagram, container, deletion, hidden, homophone, insertion, reversal)
wordplay_gt A ground-truth label derived from pattern detection on the clue surface. See priority ordering below. Null if no pattern fires, if answer_letter_count < 3, or if ans_format_valid is False.
wordplay_gt_all Comma-separated list of ALL ground-truth labels that fired before priority resolution (for debugging/analysis). Null under the same conditions.
answer_letter_count Number of letters in the answer, so downstream users can apply their own length filters
ans_format_valid Boolean: True if answer adheres to the format given in clue
label_match Boolean: True if wordplay_ho == wordplay_gt
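As a schema illustration (toy values, not real rows), note that the comma-separated wordplay_gt_all field is quoted in the CSV, so it survives a round trip through pd.read_csv as a single column:

```python
import io
import pandas as pd

# Toy row matching the export schema; the multi-valued wordplay_gt_all
# field is quoted so read_csv does not split it into extra columns
csv = io.StringIO(
    'clue_id,indicator,wordplay_ho,wordplay_gt,wordplay_gt_all,'
    'answer_letter_count,ans_format_valid,label_match\n'
    '1,"hidden in",hidden,hidden,"hidden,anagram",4,True,True\n'
)
df = pd.read_csv(csv)
print(df['wordplay_gt_all'].iloc[0])   # -> hidden,anagram
```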

Ground-truth priority ordering¶

When multiple patterns fire for the same clue, wordplay_gt is assigned by this priority:

  1. hidden (if hidden_fwd is True) — most constrained: the exact answer letters appear consecutively in the surface
  2. reversal (if hidden_rev is True) — same constraint but letters appear in reverse
  3. alternation (if alternation is True) — answer letters appear at every-other position
  4. anagram (if is_anagram_in_surface is True) — loosest: any permutation of a surface substring matches the answer

Hidden takes precedence over anagram because hidden is a strict subset of anagram (any hidden word is trivially also an anagram of the same substring). Without this priority, hidden clues would be mislabeled as anagram. Similarly, reversal is a strict subset of anagram. Alternation is prioritized over anagram because it requires a specific letter pattern rather than any permutation.

Answers shorter than 3 letters (or with an invalid answer format) are excluded from ground-truth labeling because short answers produce many false-positive pattern matches (e.g., a 2-letter answer is easily "hidden" in any surface by coincidence).
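A minimal illustration of why the priority matters, using simplified stand-ins for the detection logic (hypothetical helper names; the ESAU surface is taken from the sample output above):

```python
# A hidden answer also fires the window-anagram check, since the exact
# substring is trivially an anagram of itself; priority order decides the label.
def hidden_fwd(answer, surface):
    return answer in surface

def anagram_in_surface(answer, surface):
    target, k = sorted(answer), len(answer)
    return any(sorted(surface[i:i + k]) == target
               for i in range(len(surface) - k + 1))

surface = 'jacobsupplantedhiminthewagesaudit'   # ESAU clue surface
answer = 'esau'
fired = [label for label, check in [('hidden', hidden_fwd),
                                    ('anagram', anagram_in_surface)]
         if check(answer, surface)]
print(fired)      # both checks fire
print(fired[0])   # 'hidden' wins under the priority ordering
```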

Note on duplicate rows¶

The same (clue_id, indicator) pair may appear multiple times if the same indicator string is listed under different wordplay types in df_indicators. Each such row will have a different wordplay_ho. This preserves the multi-label structure of the data.
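If a downstream consumer needs exactly one row per (clue_id, indicator) pair, the duplicates can be collapsed with a groupby; a sketch on toy values:

```python
import pandas as pd

# Toy slice of verified_clues_labeled.csv (hypothetical values) with one
# multi-label pair: clue 101 / 'about' is listed as container and reversal
toy = pd.DataFrame({
    'clue_id': [101, 101, 202],
    'indicator': ['about', 'about', 'mixed'],
    'wordplay_ho': ['container', 'reversal', 'anagram'],
})

# Collapse to one row per pair, joining the Ho labels into one field
collapsed = (toy.groupby(['clue_id', 'indicator'], as_index=False)
                .agg(wordplay_ho=('wordplay_ho', lambda s: ','.join(sorted(s)))))
print(collapsed)
```

Whether to collapse or keep the multi-label rows depends on the downstream task; the export deliberately preserves the multi-label structure.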

In [81]:
df_clues.columns
Out[81]:
Index(['clue_id', 'clue', 'answer', 'definition', 'clue_number', 'puzzle_date',
       'puzzle_name', 'source_url', 'source', 'surface', 'surface_normalized',
       'answer_normalized', 'req_ans_format', 'req_ans_letter_count',
       'ans_format_valid', 'answer_no_spaces', 'surface_no_spaces',
       'hidden_fwd', 'answer_no_spaces_rev', 'hidden_rev',
       'answer_letter_count', 'alternation', 'is_anagram_in_surface'],
      dtype='str')
In [82]:
# === Step A: Explode clue_ids_verified and join with df_clues ===

# Start with verified indicators only
df_export = (
    df_indicators[df_indicators['num_clues_verified'] > 0]
    [['wordplay', 'indicator', 'clue_ids_verified']]
    .copy()
)

# Explode so each row is one (indicator, clue_id) pair
df_export = df_export.explode('clue_ids_verified').rename(
    columns={'clue_ids_verified': 'clue_id', 'wordplay': 'wordplay_ho'}
)

# Ensure clue_id is int for the merge
df_export['clue_id'] = df_export['clue_id'].astype(int)

# Merge with df_clues to get pattern detection columns and answer length
clue_cols = ['clue_id', 'hidden_fwd', 'hidden_rev', 'alternation',
             'is_anagram_in_surface', 'answer_letter_count', 'ans_format_valid']
df_export = df_export.merge(df_clues[clue_cols], on='clue_id', how='left')

print(f"Rows after explode + merge: {len(df_export):,}")
print(f"Unique clue_ids: {df_export['clue_id'].nunique():,}")
print(f"Unique indicators: {df_export['indicator'].nunique():,}")

# === Step B: Compute ground-truth labels ===

# Gate all ground truth on answer length >= 3 and a valid answer format
answer_ok = (df_export['answer_letter_count'] >= 3) & df_export['ans_format_valid']

# Define pattern columns and their corresponding labels, in priority order
gt_checks = [
    ('hidden_fwd', 'hidden'),
    ('hidden_rev', 'reversal'),
    ('alternation', 'alternation'),
    ('is_anagram_in_surface', 'anagram'),
]

# wordplay_gt_all: all labels that fired (comma-separated), null if none or short answer
fired_labels = pd.DataFrame({
    label: df_export[col].fillna(False) & answer_ok
    for col, label in gt_checks
})
df_export['wordplay_gt_all'] = fired_labels.apply(
    lambda row: ','.join(col for col in fired_labels.columns if row[col]) or None,
    axis=1
)

# wordplay_gt: single winning label using priority order (first match wins)
df_export['wordplay_gt'] = None
for col, label in reversed(gt_checks):
    mask = df_export[col].fillna(False) & answer_ok
    df_export.loc[mask, 'wordplay_gt'] = label

# label_match: does the Ho label agree with our ground-truth label?
df_export['label_match'] = df_export['wordplay_ho'] == df_export['wordplay_gt']

# === Step C: Select final columns and save ===

final_cols = ['clue_id', 'indicator', 'wordplay_ho', 'wordplay_gt',
              'wordplay_gt_all', 'answer_letter_count', 'ans_format_valid', 'label_match']
df_export = df_export[final_cols]

df_export.to_csv(DATA_DIR / 'verified_clues_labeled.csv', index=False)
print(f"\nSaved {len(df_export):,} rows to verified_clues_labeled.csv")
Rows after explode + merge: 76,015
Unique clue_ids: 70,959
Unique indicators: 12,622
Saved 76,015 rows to verified_clues_labeled.csv
In [83]:
# === Summary statistics for verified_clues_labeled.csv ===

print("=== wordplay_ho (Ho blog label) distribution ===")
print(df_export['wordplay_ho'].value_counts().to_string())

print(f"\n=== wordplay_gt (ground-truth label) distribution ===")
print(df_export['wordplay_gt'].value_counts(dropna=False).to_string())

print(f"\n=== label_match ===")
# Only meaningful where wordplay_gt is not null
has_gt = df_export['wordplay_gt'].notna()
print(f"Rows with a ground-truth label: {has_gt.sum():,} / {len(df_export):,}")
if has_gt.sum() > 0:
    match_rate = df_export.loc[has_gt, 'label_match'].mean()
    print(f"Label match rate (where GT exists): {match_rate:.1%}")

print(f"\n=== Duplicate (clue_id, indicator) pairs (multi-label cases) ===")
dupes = df_export.duplicated(subset=['clue_id', 'indicator'], keep=False)
print(f"Rows involved in multi-label pairs: {dupes.sum():,}")
print(f"Unique (clue_id, indicator) pairs with >1 wordplay_ho: "
      f"{df_export[dupes].groupby(['clue_id', 'indicator']).ngroups:,}")

print(f"\n=== Sample rows ===")
df_export.sample(5, random_state=42)
=== wordplay_ho (Ho blog label) distribution ===
wordplay_ho
anagram        38226
container      10836
reversal       10149
insertion       8305
homophone       3642
hidden          2595
deletion        1608
alternation      654

=== wordplay_gt (ground-truth label) distribution ===
wordplay_gt
None           56348
anagram        15346
hidden          2556
reversal        1506
alternation      259

=== label_match ===
Rows with a ground-truth label: 19,667 / 76,015
Label match rate (where GT exists): 92.4%

=== Duplicate (clue_id, indicator) pairs (multi-label cases) ===
Rows involved in multi-label pairs: 620
Unique (clue_id, indicator) pairs with >1 wordplay_ho: 310

=== Sample rows ===
Out[83]:
clue_id indicator wordplay_ho wordplay_gt wordplay_gt_all answer_letter_count ans_format_valid label_match
12590 435311 elaborated anagram None NaN 9 True False
74894 222190 up reversal None NaN 4 True False
56763 379939 soundly homophone None NaN 4 True False
19428 495282 misguided anagram None NaN 12 True False
30216 129705 sorted anagram None NaN 8 True False

Row Count Reconciliation¶

The raw dataset contains 93,867 total indicator instances across all clues (one row per clue-indicator pair in df_ind_by_clue). These span 15,735 (wordplay, indicator) pairs, i.e., 13,920 distinct indicator strings, across 8 wordplay types.

Victoria's checksum verification filters this to only instances where the indicator phrase appears as intact words in the normalized clue surface text. This removes misparsed indicators (e.g., blogger formatting artifacts, partial word matches) and leaves 14,195 verified (wordplay, indicator) pairs in df_indicators — covering 12,622 unique indicator strings (the difference reflects indicators like "about" that appear under multiple wordplay types).

The final export, verified_clues_labeled.csv, explodes each verified indicator's list of verified clue IDs to produce one row per (clue, indicator) pair. This yields 76,015 rows covering 70,959 unique clues. The difference between unique clues (70,959) and total rows (76,015) reflects clues that contributed more than one verified indicator — for example, a clue that uses both an anagram indicator and a container indicator will appear as two separate rows.
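The row-versus-unique-clue gap comes directly from the explode in Step A; a toy sketch of the same mechanics (hypothetical IDs):

```python
import pandas as pd

# Two indicators share clue 7, so the exploded frame has more rows (4)
# than unique clue_ids (3), just as 76,015 rows cover 70,959 clues above
toy = pd.DataFrame({
    'indicator': ['mixed', 'about'],
    'clue_ids_verified': [[7, 8], [7, 9]],
})
exploded = (toy.explode('clue_ids_verified')
               .rename(columns={'clue_ids_verified': 'clue_id'}))
print(len(exploded))                  # total (clue, indicator) rows
print(exploded['clue_id'].nunique())  # unique clues; clue 7 counted once
```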