Step 1: Filter and Clean the Raw Dataset¶

Primary author: Victoria

Builds on:

  • Data Cleaning for Clues - Pairs in WordNet.ipynb (Victoria — surface extraction regex, normalize() function, double-definition parsing via definition_list, answer format validation with pattern_from_A)
  • Data Cleaning for Clues.ipynb (Victoria/Sahana — earlier cleaning logic and data exploration)
  • Hans_Supervised_Learning_EDA.ipynb (Hans — word-boundary verification, bracketed-clue removal for mis-parsed entries)

Prompt engineering: Victoria
AI assistance: Claude Code (Anthropic)
Environment: Local / Colab


This notebook implements PLAN.md Step 1: filtering and cleaning the raw George Ho cryptic crossword dataset.

Input: ../data/clues_raw.csv (660,613 clues extracted from the sqlite DB by the indicator_clustering NB00).

Output: data/clues_filtered.csv — rows passing all filters, with derived columns (surface, surface_normalized, definition_list, answer_format, num_definitions, def_answer_pair_id).

Pipeline position: This is the entry point for the supervised learning component. Every downstream step (embedding generation, feature engineering, retrieval analysis, and classification) depends on the quality and scope of the filtered dataset produced here.

Imports¶

In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from pathlib import Path
import matplotlib.pyplot as plt

# We need WordNet throughout this notebook and the rest of the pipeline:
# - Step 1 (here): filter to rows where both definition and answer have at
#   least one WordNet synset, ensuring we can construct sense-specific
#   embeddings later.
# - Step 2: construct sense-specific embeddings (common vs. obscure synset)
#   for each definition and answer.
# - Step 3: compute the 21 WordNet relationship features (synonym, hypernym,
#   path similarity, etc.) between definition–answer pairs.
import nltk
from nltk.corpus import wordnet as wn

try:
    wn.synsets("test")
except LookupError:
    nltk.download("wordnet", quiet=True)

Configuration and Data Loading¶

In [2]:
# Detect environment: Google Colab vs. local
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")
    # Adjust this path to your shared Google Drive folder
    PROJECT_ROOT = Path("/content/drive/MyDrive/Milestone II - NLP Cryptic Crossword Clues")
else:
    # Local: this notebook lives in clue_misdirection/notebooks/
    PROJECT_ROOT = Path(".").resolve().parent.parent  # -> ccc-project/

# Shared data directory (contains clues_raw.csv from indicator_clustering NB00)
DATA_DIR = PROJECT_ROOT / "data"

# Output directory for this component's intermediate files
OUTPUT_DIR = PROJECT_ROOT / "clue_misdirection" / "data"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Environment:  {'Google Colab' if IN_COLAB else 'Local'}")
print(f"DATA_DIR:     {DATA_DIR}")
print(f"OUTPUT_DIR:   {OUTPUT_DIR}")
Environment:  Local
DATA_DIR:     /Users/victoria/Desktop/MADS/ccc-project/data
OUTPUT_DIR:   /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data
In [3]:
# Load the raw clue dataset.
# We only need four columns: clue_id (unique identifier), clue (full text
# including answer format), answer, and definition. Other columns like
# source_url and puzzle_date are metadata we don't need for this pipeline.
df_clues = pd.read_csv(
    DATA_DIR / "clues_raw.csv",
    usecols=["clue_id", "clue", "answer", "definition"],
    index_col="clue_id",
)

print(f"Loaded {df_clues.shape[0]:,} rows x {df_clues.shape[1]} columns")
df_clues.head(3)
Loaded 660,613 rows x 3 columns
Out[3]:
clue answer definition
clue_id
1 Acquisitive chap, as we see it (8) COVETOUS Acquisitive
2 Back yard fencing weak and sagging (6) DROOPY sagging
3 Stripping off uniform, love holding colonel's ... UNCLOTHING Stripping
In [4]:
# Initialize a cleaning log to track how many rows survive each filter step.
# This will be reported in the notebook summary and saved to cleaning_log.md.
cleaning_log = {"00_raw": df_clues.shape[0]}

Helper Function: normalize()¶

In [5]:
def normalize(s: str) -> str:
    """Normalize a string for comparison by removing accents, punctuation
    (including dashes and hyphens), and converting to lowercase.

    Uses NFD unicode decomposition to strip accent marks (e.g., é → e),
    then keeps only letters, digits, and spaces. This lets us compare
    definition text against clue surface text without being tripped up by
    formatting differences in capitalization, punctuation, or diacritics.

    Examples:
        normalize("LA-DI-DA")        -> "ladida"
        normalize("café")            -> "cafe"
        normalize("Don't stop!")     -> "dont stop"
    """
    # Decompose unicode characters so accented letters become base + combining mark
    # Then keep only letters (L*), digits (N*), and whitespace (Zs)
    return "".join(
        ch
        for ch in unicodedata.normalize("NFD", s)
        if unicodedata.category(ch).startswith(("L", "N", "Zs"))
    ).lower()


# Quick sanity check
assert normalize("LA-DI-DA") == "ladida"
assert normalize("café") == "cafe"
assert normalize("Plant in a garden party") == "plant in a garden party"
print("normalize() tests passed.")
normalize() tests passed.

Filter 1: Remove Rows with Missing Data¶

Every downstream step requires all three fields — clue, answer, and definition — to be present:

  • clue is needed to derive the surface text (what we embed) and to extract the required answer format for validation.
  • answer is obviously required — it's one half of the definition–answer pair we're studying.
  • definition is the other half. About 23% of rows in the raw dataset lack a parsed definition, which is the biggest single source of data loss. These rows were likely clues where the parser couldn't identify the definition substring.

We drop all rows missing any of these three fields up front, since there's no meaningful way to impute a missing clue, answer, or definition.

In [6]:
# Show how many nulls exist in each required column before filtering
print("Missing values before filtering:")
print(df_clues[["clue", "answer", "definition"]].isna().sum())
print()

n_before = df_clues.shape[0]
df_clues = df_clues.dropna(subset=["clue", "answer", "definition"])
n_after = df_clues.shape[0]

print(f"Removed {n_before - n_after:,} rows with missing data")
print(f"Remaining: {n_after:,} rows")

cleaning_log["01_no_missing"] = n_after
Missing values before filtering:
clue             323
answer          2259
definition    149096
dtype: int64

Removed 149,727 rows with missing data
Remaining: 510,886 rows

Filter 2: Remove Bracketed Clues¶

Some clue texts contain square brackets (e.g., [...]), which indicate mis-parsed entries in the source dataset — the parser picked up editorial notes or formatting artifacts rather than actual clue text. Hans identified this issue in his EDA notebook (Cell 13). These rows would produce garbage surface text and embeddings, so we remove them.

In [7]:
has_brackets = df_clues["clue"].str.contains(r"\[", na=False)

n_before = df_clues.shape[0]
df_clues = df_clues[~has_brackets]
n_after = df_clues.shape[0]

print(f"Removed {n_before - n_after:,} rows with square brackets in clue text")
print(f"Remaining: {n_after:,} rows")

cleaning_log["02_no_brackets"] = n_after
Removed 165 rows with square brackets in clue text
Remaining: 510,721 rows

Derive surface Column¶

The surface is the clue text with the trailing answer format stripped — for example, "Plant in a garden party (5)" becomes "Plant in a garden party".

This distinction matters because the answer format (e.g., (5) or (3,4)) is metadata about the answer's length, not part of the clue's natural language surface reading. When we generate embeddings in Step 2, we embed the surface, never the raw clue with format info included (see Decision 10 in DECISIONS.md). Including the format string would add noise to the embedding — the model would waste representational capacity encoding "(5)" instead of focusing on the semantic content of the clue.

In [8]:
# Strip the trailing answer format from the clue text.
# The regex matches patterns like (5), (3,4), (2-3), (4, 3, 2),
# (2-3,4), etc. at the end of the string, with optional leading whitespace.
df_clues["surface"] = df_clues["clue"].astype(str).apply(
    lambda x: re.sub(r"\s*\(\d+(?:[,\s-]+\d+)*\)$", "", x)
)

# Show a few examples so we can visually verify the regex is working
print("Clue → Surface examples:")
print("=" * 70)
for _, row in df_clues[["clue", "surface"]].sample(5, random_state=42).iterrows():
    print(f"  Clue:    {row['clue']}")
    print(f"  Surface: {row['surface']}")
    print()
Clue → Surface examples:
======================================================================
  Clue:    Squat is round behind back of lodgings (6)
  Surface: Squat is round behind back of lodgings

  Clue:    Photograph taken by mate, one in cinema (7,6)
  Surface: Photograph taken by mate, one in cinema

  Clue:    Turned round and put back into poetry
  Surface: Turned round and put back into poetry

  Clue:    Say no to limited editions (3-3)
  Surface: Say no to limited editions

  Clue:    Something in the salad that sent nine to bed? (7)
  Surface: Something in the salad that sent nine to bed?

Derive surface_normalized Column¶

Apply normalize() to the surface text to produce a lowercase, punctuation-free, accent-free version. We use surface_normalized for all downstream text-matching operations (e.g., verifying that the definition substring appears intact within the clue). Matching on normalized text avoids false negatives caused by differences in capitalization, punctuation, or diacritics between the definition field and the clue surface.

In [9]:
df_clues["surface_normalized"] = df_clues["surface"].astype(str).apply(normalize)

# Quick check
df_clues[["surface", "surface_normalized"]].sample(3, random_state=42)
Out[9]:
surface surface_normalized
clue_id
379867 Squat is round behind back of lodgings squat is round behind back of lodgings
639653 Photograph taken by mate, one in cinema photograph taken by mate one in cinema
136878 Turned round and put back into poetry turned round and put back into poetry

Extract and Validate Answer Format¶

Cryptic crossword clues encode the answer's length and structure in trailing parentheses — e.g., (5) means a 5-letter word, (3,4) means two words of 3 and 4 letters, and (4-3) means a hyphenated 4-3 compound. We need to:

  1. Extract the required answer format from the clue text.
  2. Compute the actual format of the answer string.
  3. Filter to rows where they match.

This filter catches data quality issues: rows where the answer doesn't match the clue's stated format likely have a parsing error in the source dataset (wrong answer mapped to the wrong clue, or a malformed clue entry). Keeping mismatched rows would introduce noise into our embeddings and features.

In [10]:
# (a) Extract the required answer format from the clue's trailing parentheses.
# For example, "Plant in a garden party (5)" → "5"
#              "Top man at last getting support (3-2)" → "3-2"
#              "Old piano is grand (4,5)" → "4,5"
# Spaces inside the parentheses are removed for consistent comparison.
df_clues["required_answer_format"] = (
    df_clues["clue"]
    .str.extract(r"\(\s*([\d,\s-]+)\s*\)\s*$", expand=False)
    .str.replace(" ", "", regex=False)
)
In [11]:
def pattern_from_A(s):
    """Compute the format pattern of an answer string, mirroring how the
    clue encodes it in trailing parentheses.

    Spaces become commas (separating word lengths), hyphens are preserved,
    and each alphabetic run contributes its length.

    Examples:
        pattern_from_A("PLANT")       -> "5"
        pattern_from_A("TOP-UP")      -> "3-2"
        pattern_from_A("OLD PIANO")   -> "3,5"
        pattern_from_A("JACK-IN-THE-BOX") -> "4-2-3-3"
    """
    if not isinstance(s, str):
        return None

    s = s.upper().strip()
    words = s.split()  # split on spaces → comma-separated word groups
    word_patterns = []

    for word in words:
        parts = re.findall(r"[A-Z]+", word)  # splits on hyphens/punctuation
        if not parts:
            continue

        if "-" in word:
            # Preserve hyphen structure: "TOP-UP" → "3-2"
            word_patterns.append("-".join(str(len(p)) for p in parts))
        else:
            # Non-hyphenated word: only the first alphabetic run counts
            word_patterns.append(str(len(parts[0])))

    return ",".join(word_patterns)


# (b) Compute the actual answer format from the answer string
df_clues["answer_format"] = df_clues["answer"].apply(pattern_from_A)

# Show a few examples of the two format columns side by side
df_clues[["clue", "answer", "required_answer_format", "answer_format"]].sample(
    5, random_state=42
)
Out[11]:
clue answer required_answer_format answer_format
clue_id
379867 Squat is round behind back of lodgings (6) STUBBY 6 6
639653 Photograph taken by mate, one in cinema (7,6) PICTURE PALACE 7,6 7,6
136878 Turned round and put back into poetry REVERSED NaN 8
512650 Say no to limited editions (3-3) BOO-BOO 3-3 3-3
171522 Something in the salad that sent nine to bed? (7) ICEBERG 7 7
In [12]:
# (c) Filter 3: Keep only rows where the answer matches the clue's stated format.
# First, drop rows where we couldn't extract a required format at all (the clue
# didn't have trailing parentheses with a number pattern — we can't validate these).
n_before = df_clues.shape[0]
df_clues = df_clues.dropna(subset=["required_answer_format", "answer_format"])
n_after_dropna = df_clues.shape[0]

print(f"Dropped {n_before - n_after_dropna:,} rows with no extractable answer format")

# Now keep only rows where the formats match
format_match = df_clues["required_answer_format"] == df_clues["answer_format"]
n_mismatch = (~format_match).sum()
df_clues = df_clues[format_match]
n_after = df_clues.shape[0]

print(f"Dropped {n_mismatch:,} rows where answer format didn't match clue's stated format")
print(f"Remaining: {n_after:,} rows")

cleaning_log["03_answer_format_valid"] = n_after
Dropped 26,574 rows with no extractable answer format
Dropped 12,333 rows where answer format didn't match clue's stated format
Remaining: 471,814 rows

Parse Double-Definition Clues and Build definition_list¶

About 6% of clues in the dataset have multiple definitions separated by / in the definition field. These are "double-definition" (or more generally, multi-definition) clues — the clue surface contains two or more independent definitions that each point to the same answer.

For example, clue 402435:

  • Clue: "Part of a garden scheme (4)"
  • Definition field: "Part of a garden/scheme"
  • Answer: PLOT

Here "Part of a garden" and "scheme" are two independent definitions of PLOT. Each one is a valid definition–answer pair for our analysis, and we want to preserve both.

For each clue, we:

  1. Split the definition field on / (handling malformed cases like //)
  2. Normalize whitespace in each fragment and deduplicate
  3. Verify each fragment appears as intact whole word(s) in surface_normalized using \b word boundaries — this prevents "art" from matching inside "garden party" when only "art" is the definition fragment
  4. Store the valid definitions in their original case (not normalized), since we need the original text for embedding in Step 2

The resulting definition_list column contains a Python list of valid definition strings for each clue.

In [13]:
# Pre-compile patterns reused across all rows for performance
_slash_splitter = re.compile(r"/+")
_ws_normalizer = re.compile(r"\s+")


def extract_valid_phrases(row):
    """Split a clue's definition field on '/' and return definitions that
    appear as intact whole words in the surface text.

    Returns original-case definition strings (not normalized), since
    downstream embedding steps need the text as it appears in the clue.
    """
    surface_norm = row["surface_normalized"]
    definition = row["definition"]

    if pd.isna(definition) or pd.isna(surface_norm):
        return []

    # 1. Split on one or more slashes (handles malformed '//////' cases)
    parts = _slash_splitter.split(str(definition))

    # 2. Clean whitespace, strip ends, and deduplicate. We use the
    #    normalized form as the dedup key so that case-only differences
    #    (e.g., "Scheme" vs "scheme") don't produce false duplicates.
    seen_norm = set()
    unique_parts = []  # list of (original_cleaned, normalized) tuples
    for p in parts:
        p_clean = _ws_normalizer.sub(" ", p).strip()
        if not p_clean:
            continue
        p_norm = normalize(p_clean)
        if not p_norm:  # fragment was purely punctuation
            continue
        if p_norm not in seen_norm:
            seen_norm.add(p_norm)
            unique_parts.append((p_clean, p_norm))

    # 3. Keep only definitions that appear as intact whole words in
    #    surface_normalized.  The \b word-boundary anchors ensure we match
    #    "bear" in "bear market" but NOT "ear" inside "bear".
    valid = []
    for p_orig, p_norm in unique_parts:
        if re.search(r"\b" + re.escape(p_norm) + r"\b", surface_norm):
            valid.append(p_orig)

    return valid


df_clues["definition_list"] = df_clues.apply(extract_valid_phrases, axis=1)

# ── Quick statistics ────────────────────────────────────────────────────────
n_multi = df_clues["definition"].str.contains("/", na=False).sum()
pct_multi = 100 * n_multi / len(df_clues)
print(f"Clues with '/' in definition field: {n_multi:,} ({pct_multi:.1f}%)\n")

def_counts = df_clues["definition_list"].apply(len)
print("Valid definitions per clue:")
print(def_counts.value_counts().sort_index())

# Show a few multi-definition examples for verification
multi_mask = def_counts > 1
print(f"\nExample multi-definition clues ({multi_mask.sum():,} total):")
df_clues.loc[multi_mask, ["clue", "definition", "definition_list", "answer"]].head(3)
Clues with '/' in definition field: 29,033 (6.2%)

Valid definitions per clue:
definition_list
0     11040
1    438938
2     21094
3       656
4        68
5        11
6         5
8         2
Name: count, dtype: int64

Example multi-definition clues (21,836 total):
Out[13]:
clue definition definition_list answer
clue_id
150 Ruined a sculpture (4) Ruined/a sculpture [Ruined, a sculpture] BUST
156 Great cricketer becomes more like a judge? (6) Great cricketer/becomes more like a judge [Great cricketer, becomes more like a judge] SOBERS
161 Back in a short while (6) Back /a short while [Back, a short while] SECOND

Filter 4: Require at Least One Valid Definition¶

Some clues have a definition field that doesn't actually appear as intact whole word(s) in the clue surface. This can happen when:

  • The parser extracted a substring that doesn't align with word boundaries (e.g., "castle" from "Castle's" — the apostrophe-s merges into "castles" after normalization, so \bcastle\b won't match)
  • The definition field contains a phrase that was paraphrased or reworded in the actual clue text
  • Data entry errors in the source dataset

If none of a clue's definition fragments pass the word-boundary verification from the previous step, definition_list is empty and we cannot use that clue for any downstream analysis. We drop these rows.

In [14]:
n_before = df_clues.shape[0]
empty_defs = df_clues["definition_list"].apply(len) == 0
df_clues = df_clues[~empty_defs]
n_after = df_clues.shape[0]

print(f"Removed {n_before - n_after:,} rows with no valid definition in surface text")
print(f"Remaining: {n_after:,} rows")

cleaning_log["04_has_valid_definition"] = n_after
Removed 11,040 rows with no valid definition in surface text
Remaining: 460,774 rows

Filter 5: Definition Must Appear at Start or End of Surface¶

In cryptic crosswords, the definition is conventionally placed at the very beginning or very end of the clue — this is one of the fundamental structural rules of cryptic clue construction. The rest of the clue (the middle) contains the wordplay component (anagram, reversal, charade, etc.), not the definition. Setters sometimes bend this rule, but the vast majority of well-formed clues follow it.

This convention gives us a strong data-quality signal: if none of a clue's validated definitions appear at the start or end of the normalized surface text, either the definition was incorrectly parsed or the clue doesn't follow standard cryptic structure. Either way, including such rows would add noise to our analysis of how the definition relates to the answer.

We check each definition using \b word boundaries, consistent with the verification in the previous step. A definition "passes" if it appears anchored to the start (^) or end ($) of the surface, with a word boundary separating it from the adjacent wordplay text.

In [15]:
def def_at_edge(surface_norm, definitions):
    """Check if at least one definition appears at the start or end of
    the normalized surface text.

    Uses \\b word boundaries so "green" matches at the start of
    "green light in spare room" but the partial token "gree" would not.
    """
    if not definitions:
        return False

    for d in definitions:
        d_norm = normalize(d)
        if not d_norm:
            continue
        escaped = re.escape(d_norm)
        # Definition anchored at the START of the surface
        if re.search(r"^" + escaped + r"\b", surface_norm):
            return True
        # Definition anchored at the END of the surface
        if re.search(r"\b" + escaped + r"$", surface_norm):
            return True

    return False


at_edge = [
    def_at_edge(sn, dl)
    for sn, dl in zip(df_clues["surface_normalized"], df_clues["definition_list"])
]

n_before = df_clues.shape[0]
df_clues = df_clues[at_edge]
n_after = df_clues.shape[0]

print(f"Removed {n_before - n_after:,} rows where no definition appears at start/end of surface")
print(f"Remaining: {n_after:,} rows")

cleaning_log["05_def_at_edge"] = n_after
Removed 4,449 rows where no definition appears at start/end of surface
Remaining: 456,325 rows

Expand Rows for Multiple Valid Definitions¶

Up to this point, each row in df_clues represents one clue, and the definition_list column may contain multiple valid definitions (for double-definition clues). For the rest of the pipeline, we need each row to have exactly one definition — this is the unit of analysis for embedding generation, feature computation, and classification.

For example, if clue 402435 has definition_list = ["Part of a garden", "scheme"] and answer PLOT, we expand it into two rows:

  • Row 1: definition = "Part of a garden", answer = PLOT
  • Row 2: definition = "scheme", answer = PLOT

Both rows share the same clue text, surface, and answer, but each represents a distinct definition–answer pair that will get its own embeddings and features. The num_definitions column records how many valid definitions the original clue had, so we can later analyze whether multi-definition clues show different misdirection patterns than single-definition ones.

In [16]:
# Record how many valid definitions each clue had before expansion.
# This stays constant across the expanded rows for a given clue.
df_clues["num_definitions"] = df_clues["definition_list"].apply(len)

n_before = df_clues.shape[0]

# Explode: each element of definition_list becomes its own row.
# The index (clue_id) is preserved so we can trace rows back to their
# source clue — duplicate clue_ids are expected after this step.
df_clues = df_clues.explode("definition_list", ignore_index=False)

# Overwrite the original slash-separated definition column with the single
# definition for this row (the value that came out of the explosion)
df_clues["definition"] = df_clues["definition_list"]

n_after = df_clues.shape[0]
n_new = n_after - n_before

print(f"Expanded {n_before:,} clue rows into {n_after:,} definition-answer rows")
print(f"  ({n_new:,} new rows created from multi-definition clues)")

cleaning_log["06_after_expansion"] = n_after
Expanded 456,325 clue rows into 478,981 definition-answer rows
  (22,656 new rows created from multi-definition clues)

Helper: WordNet Lookup with Article Stripping¶

We require both the definition and the answer to have at least one synset in WordNet. This is not just a data-quality filter — it’s a hard prerequisite for two downstream steps:

  • Step 2 (Embedding Generation): We construct sense-specific embeddings by looking up the most-common and least-common WordNet synsets for each definition and answer. If a word has no synsets, we cannot build these embeddings.
  • Step 3 (Feature Engineering): 21 of our 53 features are WordNet relationship features (synonym, hypernym, path similarity, etc.) computed between definition and answer synsets. These features are undefined without WordNet coverage.

A complication: some definitions in our dataset have the form "a [word]" (e.g., “a shade”, “a flower”). WordNet stores synsets under "shade" and "flower", not "a shade". As a simple recovery strategy, when the initial lookup fails we try stripping a leading "a " and retrying. This recovers a meaningful number of rows that would otherwise be lost.

Future improvement: More aggressive lemmatization (e.g., stripping "the ", "an ", or handling plural/inflected forms) could improve WordNet coverage further, but is left for a future iteration to keep the current pipeline simple and auditable.

In [17]:
def has_wordnet_synset(text: str) -> bool:
    """Check whether a definition or answer string has at least one
    WordNet synset.

    Performs two attempts:
    1. Look up the text as-is (lowercased, spaces replaced with
       underscores for multi-word WordNet entries like "ice_cream").
    2. If no synsets found and the text starts with "a ", strip the
       leading article and retry (recovers "a shade" -> "shade").

    Returns True if any synsets are found in either attempt.
    """
    t = str(text).lower()
    lookup = t.replace(" ", "_")
    if wn.synsets(lookup):
        return True
    # Try stripping a leading indefinite article
    if t.startswith("a ") and len(t) > 2:
        if wn.synsets(t[2:].replace(" ", "_")):
            return True
    return False


# Quick sanity checks
assert has_wordnet_synset("shade") is True       # direct match
assert has_wordnet_synset("a shade") is True     # recovered by stripping "a "
assert has_wordnet_synset("PLOT") is True        # uppercase answer
assert has_wordnet_synset("ice cream") is True   # multi-word
assert has_wordnet_synset("xyzzy") is False      # no synset
print("has_wordnet_synset() tests passed.")
has_wordnet_synset() tests passed.
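
The "future improvement" mentioned above could be sketched like this. It is illustrative only — `candidates`, `has_entry`, and the toy `vocab` are hypothetical names standing in for the real WordNet lookup, which lets the sketch run without NLTK. In the notebook, the `lookup` callable would be something like `lambda t: bool(wn.synsets(t.replace(" ", "_")))`, and NLTK's `wn.morphy()` could additionally handle plural/inflected forms.

```python
# Hypothetical generalization of has_wordnet_synset()'s recovery step:
# try stripping any leading article ("a ", "an ", "the ") before retrying.
ARTICLES = ("a ", "an ", "the ")


def candidates(text: str):
    """Yield lookup candidates: the text as-is, then (if applicable)
    the text with one leading article stripped."""
    t = str(text).lower().strip()
    yield t
    for art in ARTICLES:
        if t.startswith(art) and len(t) > len(art):
            yield t[len(art):]


def has_entry(text, lookup) -> bool:
    """Return True if any candidate form passes the injected lookup."""
    return any(lookup(c) for c in candidates(text))


# Toy vocabulary standing in for WordNet:
vocab = {"shade", "flower", "plot"}
assert has_entry("a shade", vocab.__contains__)        # recovered by stripping "a "
assert has_entry("THE FLOWER", vocab.__contains__)     # recovered by stripping "the "
assert not has_entry("a heady mixture", vocab.__contains__)
```

The lookup is injected as a callable so the same helper could be unit-tested against a small vocabulary and then pointed at WordNet in production.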

Filter 6: Definition Must Be in WordNet¶

Apply the WordNet synset check to every definition. Definitions that have no synset (even after article stripping) cannot participate in sense-specific embedding generation or relationship feature computation, so they are dropped.

We also track how many definitions were recovered by the article-stripping heuristic, so we can report this in the summary and assess whether more aggressive lemmatization would be worthwhile.

In [18]:
# Check which definitions have synsets (with article-stripping fallback)
wn_def_mask = df_clues["definition"].apply(has_wordnet_synset)

# Track article-stripping stats: how many pass ONLY after stripping "a "?
direct_def_mask = df_clues["definition"].apply(
    lambda x: bool(wn.synsets(str(x).lower().replace(" ", "_")))
)
n_def_stripped = int((wn_def_mask & ~direct_def_mask).sum())

# Show examples of definitions that failed both attempts
failing_defs = df_clues.loc[~wn_def_mask, "definition"].unique()
print(f"Definitions NOT found in WordNet: {len(failing_defs):,} unique values")
print("Examples of failing definitions:")
for d in failing_defs[:10]:
    print(f"  '{d}'")
print()

n_before = df_clues.shape[0]
df_clues = df_clues[wn_def_mask]
n_after = df_clues.shape[0]

print(f"Removed {n_before - n_after:,} rows where definition has no WordNet synset")
print(f"  ({n_def_stripped:,} definitions recovered by stripping leading 'a ')")
print(f"Remaining: {n_after:,} rows")

cleaning_log["07_def_in_wordnet"] = n_after
Definitions NOT found in WordNet: 181,043 unique values
Examples of failing definitions:
  'where they should be gained'
  'Put a stop to'
  'Heady mixture'
  'of merit'
  'show amusement'
  'puritanical views'
  'What handyman has'
  'Praise, primarily, to give the thing acclaim'
  'It’s still to be settled'
  'sort presenting a threat to workers'

Removed 216,436 rows where definition has no WordNet synset
  (3,049 definitions recovered by stripping leading 'a ')
Remaining: 262,545 rows

Filter 7: Answer Must Be in WordNet¶

The same WordNet requirement applies to answers. Answers are stored in uppercase in the source data (e.g., PLOT, CLEARANCE), so the lookup function lowercases them before querying WordNet. Multi-word answers like ICE CREAM are handled by replacing spaces with underscores (ice_cream), which is how WordNet indexes multi-word entries.

In [19]:
# Check which answers have synsets
wn_ans_mask = df_clues["answer"].apply(has_wordnet_synset)

# Track article-stripping stats for answers
direct_ans_mask = df_clues["answer"].apply(
    lambda x: bool(wn.synsets(str(x).lower().replace(" ", "_")))
)
n_ans_stripped = int((wn_ans_mask & ~direct_ans_mask).sum())

# Show examples of answers that failed
failing_answers = df_clues.loc[~wn_ans_mask, "answer"].unique()
print(f"Answers NOT found in WordNet: {len(failing_answers):,} unique values")
print("Examples of failing answers:")
for a in failing_answers[:10]:
    print(f"  '{a}'")
print()

n_before = df_clues.shape[0]
df_clues = df_clues[wn_ans_mask]
n_after = df_clues.shape[0]

print(f"Removed {n_before - n_after:,} rows where answer has no WordNet synset")
print(f"  ({n_ans_stripped:,} answers recovered by stripping leading 'a ')")
print(f"Remaining: {n_after:,} rows")

cleaning_log["08_answer_in_wordnet"] = n_after
Answers NOT found in WordNet: 12,318 unique values
Examples of failing answers:
  'AGITATO'
  'DOGLEGGED'
  'ERNEST'
  'DENARII'
  'TO A DEGREE'
  'ALEXANDRA'
  'LINDA'
  'ARABICA'
  'ASTI'
  'SEND-UP'

Removed 21,148 rows where answer has no WordNet synset
  (33 answers recovered by stripping leading 'a ')
Remaining: 241,397 rows

Assign def_answer_pair_id¶

Multiple clue rows can share the same (definition, answer) pair — different crossword constructors wrote different clues for the same underlying word relationship. For example, many different clues might pair the definition “anger” with the answer RAGE.

We assign a unique integer ID to each (definition, answer) pair. This ID serves a critical role in GroupKFold cross-validation (Decision 7): if two clue rows share the same (definition, answer) pair, their context-free features (15 cosine similarities computed from the same definition and answer embeddings) are identical. Placing one in the training fold and the other in the test fold would leak information. GroupKFold grouped by def_answer_pair_id prevents this by ensuring all rows for a given pair land in the same fold.

In [20]:
# Factorize on the lowercased (definition, answer) tuple to create a
# unique integer ID for each pair. Lowercasing ensures that "Shade"/SHADE
# and "shade"/SHADE map to the same pair.
pair_keys = df_clues["definition"].str.lower() + "|||" + df_clues["answer"].str.lower()
df_clues["def_answer_pair_id"] = pd.factorize(pair_keys)[0]

n_rows = df_clues.shape[0]
n_pairs = df_clues["def_answer_pair_id"].nunique()

print(f"Total rows:                    {n_rows:,}")
print(f"Unique (definition, answer) pairs: {n_pairs:,}")
print(f"Average clues per pair:        {n_rows / n_pairs:.1f}")
Total rows:                    241,397
Unique (definition, answer) pairs: 129,429
Average clues per pair:        1.9
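
How the grouping is meant to prevent leakage can be sketched as follows (assuming scikit-learn; the feature and label arrays are placeholders, not the notebook's real data): GroupKFold keeps every row sharing a `def_answer_pair_id` on the same side of each train/test split.

```python
import numpy as np
from sklearn.model_selection import GroupKFold

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 3))                       # placeholder feature matrix
y = rng.integers(0, 2, size=10)                    # placeholder labels
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])  # def_answer_pair_id values

n_folds = 0
for train_idx, test_idx in GroupKFold(n_splits=5).split(X, y, groups=groups):
    # A pair's rows never straddle train and test, so identical
    # context-free features cannot leak across the split.
    assert set(groups[train_idx]).isdisjoint(set(groups[test_idx]))
    n_folds += 1

assert n_folds == 5
```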

Final Column Selection and Export¶

Select and order the columns to match the clues_filtered.csv schema defined in DATA.md. Intermediate columns used only for validation (like required_answer_format, definition_list) are dropped — they served their purpose during cleaning and are not needed downstream.

In [21]:
# Select and order columns per DATA.md schema
output_cols = [
    "clue",
    "surface",
    "surface_normalized",
    "definition",
    "answer",
    "answer_format",
    "num_definitions",
    "def_answer_pair_id",
]
df_out = df_clues[output_cols].copy()

# Write with index=True so the named clue_id index is preserved as the
# first column of the CSV file.
df_out.to_csv(OUTPUT_DIR / "clues_filtered.csv", index=True)

print(f"Saved {df_out.shape[0]:,} rows x {df_out.shape[1] + 1} columns "  # +1 for clue_id index
      f"to {OUTPUT_DIR / 'clues_filtered.csv'}")
print()
df_out.head()
Saved 241,397 rows x 9 columns to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/clues_filtered.csv

Out[21]:
| clue_id | clue | surface | surface_normalized | definition | answer | answer_format | num_definitions | def_answer_pair_id |
|---|---|---|---|---|---|---|---|---|
| 1 | Acquisitive chap, as we see it (8) | Acquisitive chap, as we see it | acquisitive chap as we see it | Acquisitive | COVETOUS | 8 | 1 | 0 |
| 2 | Back yard fencing weak and sagging (6) | Back yard fencing weak and sagging | back yard fencing weak and sagging | sagging | DROOPY | 6 | 1 | 1 |
| 3 | Stripping off uniform, love holding colonel's ... | Stripping off uniform, love holding colonel's ... | stripping off uniform love holding colonels coat | Stripping | UNCLOTHING | 10 | 1 | 2 |
| 6 | Foreign letter coming in is the French letter (7) | Foreign letter coming in is the French letter | foreign letter coming in is the french letter | letter | EPISTLE | 7 | 1 | 3 |
| 8 | At first, bear one fruit or another (7) | At first, bear one fruit or another | at first bear one fruit or another | another | BANANAS | 7 | 1 | 4 |
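
Because the export uses `index=True` and the index is named `clue_id`, that name becomes the header of the first CSV column. A round-trip sketch (hypothetical one-row frame, temporary file) shows how to restore the index on read:

```python
import tempfile
from pathlib import Path

import pandas as pd

# Write a tiny frame with a named clue_id index, then read it back with
# index_col so the saved column is restored as the DataFrame index.
toy = pd.DataFrame(
    {"definition": ["anger"], "answer": ["RAGE"]},
    index=pd.Index([1], name="clue_id"),
)
with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "clues_filtered.csv"
    toy.to_csv(path, index=True)
    restored = pd.read_csv(path, index_col="clue_id")

print(restored.index.name)  # → clue_id
```

Downstream notebooks that read `clues_filtered.csv` without `index_col` will instead see `clue_id` as an ordinary column, which is also fine.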

Generate cleaning_log.md¶

Write a markdown file summarizing how many rows survived each filter step. This serves as a quick-reference audit trail alongside the more detailed explanations in this notebook.

In [22]:
# Build the cleaning log table
log_lines = [
    "# Cleaning Log \u2014 Step 1: Filter and Clean the Raw Dataset\n",
    "",
    "| Step | Description | Rows After | Rows Removed |",
    "|------|-------------|------------|--------------|",
]

step_descriptions = {
    "00_raw": "Raw dataset loaded",
    "01_no_missing": "Remove rows with missing clue/answer/definition",
    "02_no_brackets": "Remove clues with square brackets (mis-parsed)",
    "03_answer_format_valid": "Keep rows where answer format matches clue",
    "04_has_valid_definition": "Require definition appears as whole words in surface",
    "05_def_at_edge": "Require definition at start or end of surface",
    "06_after_expansion": "Expand multi-definition clues into separate rows",
    "07_def_in_wordnet": "Require definition has WordNet synset",
    "08_answer_in_wordnet": "Require answer has WordNet synset",
}

prev_count = None
for key, count in cleaning_log.items():
    desc = step_descriptions.get(key, key)
    removed = "\u2014" if prev_count is None else f"{prev_count - count:,}"
    log_lines.append(f"| {key} | {desc} | {count:,} | {removed} |")
    prev_count = count

log_lines.append("")
log_lines.append(f"**Final dataset:** {cleaning_log[list(cleaning_log.keys())[-1]]:,} rows")
log_lines.append("")
log_lines.append(f"**Unique (definition, answer) pairs:** {n_pairs:,}")
log_lines.append("")
log_lines.append("**Notes:**")
log_lines.append('- Article stripping (\'a \') was applied during WordNet lookup '
                 'to recover additional matches.')
log_lines.append(f"  - Definitions recovered by article stripping: {n_def_stripped:,}")
log_lines.append(f"  - Answers recovered by article stripping: {n_ans_stripped:,}")
log_lines.append("- More aggressive lemmatization is a potential future improvement.")

log_text = "\n".join(log_lines) + "\n"

(OUTPUT_DIR / "cleaning_log.md").write_text(log_text)
print(f"Saved cleaning log to {OUTPUT_DIR / 'cleaning_log.md'}\n")
print(log_text)
Saved cleaning log to /Users/victoria/Desktop/MADS/ccc-project/clue_misdirection/data/cleaning_log.md

# Cleaning Log — Step 1: Filter and Clean the Raw Dataset


| Step | Description | Rows After | Rows Removed |
|------|-------------|------------|--------------|
| 00_raw | Raw dataset loaded | 660,613 | — |
| 01_no_missing | Remove rows with missing clue/answer/definition | 510,886 | 149,727 |
| 02_no_brackets | Remove clues with square brackets (mis-parsed) | 510,721 | 165 |
| 03_answer_format_valid | Keep rows where answer format matches clue | 471,814 | 38,907 |
| 04_has_valid_definition | Require definition appears as whole words in surface | 460,774 | 11,040 |
| 05_def_at_edge | Require definition at start or end of surface | 456,325 | 4,449 |
| 06_after_expansion | Expand multi-definition clues into separate rows | 478,981 | -22,656 |
| 07_def_in_wordnet | Require definition has WordNet synset | 262,545 | 216,436 |
| 08_answer_in_wordnet | Require answer has WordNet synset | 241,397 | 21,148 |

**Final dataset:** 241,397 rows

**Unique (definition, answer) pairs:** 129,429

**Notes:**
- Article stripping ('a ') was applied during WordNet lookup to recover additional matches.
  - Definitions recovered by article stripping: 3,049
  - Answers recovered by article stripping: 33
- More aggressive lemmatization is a potential future improvement.

Summary¶

This notebook implemented PLAN.md Step 1: filtering and cleaning the raw George Ho cryptic crossword dataset (660,613 rows) into a validated set of definition–answer pairs ready for embedding generation and supervised learning.

Cleaning Pipeline¶

  1. Removed missing data — ~23% of rows lacked a parsed definition
  2. Removed bracketed clues — mis-parsed editorial artifacts
  3. Validated answer format — answer must match the clue’s stated length/structure
  4. Parsed double-definition clues — split on /, verified each fragment with \b word boundaries in the surface text
  5. Required at least one valid definition — definition must appear as intact whole words in the surface
  6. Required definition at start or end — enforced the structural rule that cryptic crossword definitions appear at clue edges
  7. Expanded multi-definition clues — one row per definition–answer pair
  8. Filtered by WordNet coverage — both definition and answer must have ≥1 synset (with article-stripping fallback)
  9. Assigned def_answer_pair_id — for GroupKFold cross-validation
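
Steps 4–6 can be re-created as a small regex check (a hedged sketch, not the notebook's exact implementation; `definition_at_edge` is a hypothetical helper): the definition must match as whole words at the start or end of the normalized surface.

```python
import re

def definition_at_edge(surface_normalized: str, definition: str) -> bool:
    """True if the definition appears as whole words at the start or end
    of the normalized surface text (illustrative re-creation of the filter)."""
    d = re.escape(definition.lower())
    s = surface_normalized.lower()
    return bool(re.match(rf"{d}\b", s)) or bool(re.search(rf"\b{d}$", s))

print(definition_at_edge("acquisitive chap as we see it", "Acquisitive"))  # → True
print(definition_at_edge("back yard fencing weak and sagging", "sagging"))  # → True
print(definition_at_edge("foreign letter coming in is the french letter", "coming"))  # → False
```

The `\b` word boundary rejects substring matches (e.g., "rag" inside "sagging"), mirroring the whole-word verification described above.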

Output¶

  • data/clues_filtered.csv — see printed output above and data/cleaning_log.md for exact row counts at each step. Final dataset: 241,397 rows covering 129,429 unique (definition, answer) pairs, averaging 1.9 clues per pair.
  • data/cleaning_log.md — tabular record of rows retained/removed

Key Observations¶

  • The biggest source of data loss is the missing-definition filter (~23% of raw rows have no parsed definition).
  • Article stripping ("a " removal) during WordNet lookup recovered 3,049 definitions and 33 answers that would otherwise have been lost (counts recorded in the cleaning log above). More aggressive lemmatization (e.g., handling "the ", "an ", or plural forms) could improve coverage further and is noted as a potential future improvement.
  • Multi-definition clue expansion enriches the dataset with additional definition–answer pairs from double-definition clues (~5% of the original data).
  • The ratio of unique (definition, answer) pairs to total rows reflects how much clue reuse exists — different constructors writing different clues for the same word relationship. This distinction matters for the retrieval analysis (Decision 5: report over unique pairs) and cross-validation (Decision 7: group by pair to prevent leakage).
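
The article-stripping fallback can be sketched as a thin wrapper around any lookup function (a minimal illustration; `with_article_fallback` and the toy `vocab` are hypothetical, not the notebook's exact code — with WordNet the lookup would be something like `lambda t: wn.synsets(t.replace(" ", "_"))`):

```python
def with_article_fallback(lookup, term):
    """Run `lookup` on the term; if it returns nothing and the term starts
    with a leading article 'a ', retry with the article stripped."""
    result = lookup(term)
    if not result and term.lower().startswith("a "):
        result = lookup(term[2:])
    return result

# Stand-in vocabulary instead of WordNet, for illustration:
vocab = {"lot": ["lot.n.01"], "rage": ["rage.n.01"]}
lookup = lambda t: vocab.get(t.lower(), [])

print(with_article_fallback(lookup, "a lot"))  # → ['lot.n.01']
print(with_article_fallback(lookup, "rage"))   # → ['rage.n.01']
print(with_article_fallback(lookup, "a dog"))  # → []
```

Extending the same pattern to "the ", "an ", or plural stripping is one way to pursue the future improvement noted above.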

For FINDINGS.md¶

After running this notebook, record in FINDINGS.md:

  • Final dataset size (total rows and unique pairs)
  • Number of rows recovered by article stripping
  • Any surprising patterns in the definitions or answers that failed WordNet lookup (common failure types may suggest additional recovery strategies)