{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e1c7e284",
   "metadata": {},
   "source": [
    "# Advanced Topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abaf523a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Locate the project root: the nearest directory, starting at the working\n",
    "# directory and walking upward, that contains _config.yml.\n",
    "start_dir = Path.cwd()\n",
    "project_root = next(\n",
    "    (folder for folder in (start_dir, *start_dir.parents)\n",
    "     if (folder / '_config.yml').exists()),\n",
    "    start_dir.parent.parent,  # no marker found: fall back to two levels up\n",
    ")\n",
    "\n",
    "# Put the project root at the front of the import search path\n",
    "sys.path.insert(0, str(project_root))\n",
    "\n",
    "# Import shared teaching helpers and cell magics\n",
    "from shared import thinkpython, diagram, jupyturtle, structshape\n",
    "from shared.download import download\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9f93d6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ee22e3c",
   "metadata": {},
   "source": [
    "## Flags\n",
    "\n",
    "Flags change matching behavior:\n",
    "\n",
    "| Flag | Shorthand | Meaning |\n",
    "|---|---|---|\n",
    "| `re.IGNORECASE` | `re.I` | Case-insensitive matching |\n",
    "| `re.MULTILINE` | `re.M` | `^`/`$` match at the start/end of every line, not just of the whole string |\n",
    "| `re.DOTALL` | `re.S` | `.` matches newline too |\n",
    "| `re.VERBOSE` | `re.X` | Allow comments/whitespace in pattern |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "4d3245a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Hello', 'HELLO', 'hello']\n",
      "['<div>\\nsome content\\n</div>']\n",
      "['hello@example.com', 'support@test.org']\n"
     ]
    }
   ],
   "source": [
    "# IGNORECASE — letters match regardless of case\n",
    "print(re.findall(r\"hello\", \"Hello HELLO hello\", re.I))  # ['Hello', 'HELLO', 'hello']\n",
    "\n",
    "# DOTALL — dot matches newline\n",
    "text = \"<div>\\nsome content\\n</div>\"\n",
    "print(re.findall(r\"<div>.*</div>\", text, re.DOTALL))  # matches across lines\n",
    "\n",
    "# VERBOSE — write readable patterns with comments\n",
    "# The domain is written as label(.label)+ so that every dot must be followed\n",
    "# by more characters; a sentence-ending period after an address is not captured.\n",
    "email_pattern = re.compile(r\"\"\"\n",
    "    [\\w.+-]+          # username\n",
    "    @                 # at sign\n",
    "    [\\w-]+            # first domain label\n",
    "    (?:\\.[\\w-]+)+     # one or more .label parts; the last one is the TLD\n",
    "\"\"\", re.VERBOSE)\n",
    "\n",
    "print(email_pattern.findall(\"Contact us at hello@example.com or support@test.org\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7afa4c2",
   "metadata": {
    "tags": [
     "thebe-interactive"
    ]
   },
   "outputs": [],
   "source": [
    "### EXERCISE: Regex Flags\n",
    "# Difficulty: Basic\n",
    "import re\n",
    "log = \"\"\"INFO: Server started\n",
    "error: disk full\n",
    "WARNING: low memory\n",
    "ERROR: connection lost\"\"\"\n",
    "# 1. Use re.findall() with re.IGNORECASE | re.MULTILINE to\n",
    "#    extract every line that begins with 'error'\n",
    "# 2. Print the list of matches\n",
    "### Your code starts here:\n",
    "\n",
    "\n",
    "\n",
    "### Your code ends here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15265b18",
   "metadata": {
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# Solution\n",
    "import re\n",
    "log = \"\"\"INFO: Server started\n",
    "error: disk full\n",
    "WARNING: low memory\n",
    "ERROR: connection lost\"\"\"\n",
    "results = re.findall(r'^error.*$', log, flags=re.IGNORECASE | re.MULTILINE)\n",
    "print(results)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e898c620",
   "metadata": {},
   "source": [
    "## Compiled Patterns\n",
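    "Use `re.compile()` when reusing the same pattern multiple times — the pattern gets a name, call sites stay cleaner, and repeated calls skip the per-call lookup in `re`'s internal pattern cache."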
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "1c2b8f29",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found: ['123-456-7890'] in 'Call me at 123-456-7890'\n",
      "Found: ['987.654.3210'] in 'My number is 987.654.3210'\n",
      "Found: ['555-123-4567', '800-999-0000'] in 'Reach us at 555-123-4567 or 800-999-0000'\n"
     ]
    }
   ],
   "source": [
    "# Build the pattern object once; every string below reuses it\n",
    "phone_pattern = re.compile(r\"\\b\\d{3}[-.]\\d{3}[-.]\\d{4}\\b\")\n",
    "\n",
    "texts = [\n",
    "    \"Call me at 123-456-7890\",\n",
    "    \"My number is 987.654.3210\",\n",
    "    \"No phone here\",\n",
    "    \"Reach us at 555-123-4567 or 800-999-0000\"\n",
    "]\n",
    "\n",
    "# Report only the strings that contain at least one phone number\n",
    "for sentence in texts:\n",
    "    if numbers := phone_pattern.findall(sentence):\n",
    "        print(f\"Found: {numbers} in '{sentence}'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c84d8f4",
   "metadata": {
    "tags": [
     "thebe-interactive"
    ]
   },
   "outputs": [],
   "source": [
    "### EXERCISE: Compiled Patterns\n",
    "# Difficulty: Basic\n",
    "import re\n",
    "emails = ['alice@example.com', 'not-an-email', 'bob@company.org', 'charlie_at_test.net']\n",
    "# 1. Compile a regex pattern that matches a simple email address\n",
    "# 2. For each string in `emails`, print it followed by True or False depending on\n",
    "#    whether the compiled pattern matches the whole string\n",
    "### Your code starts here:\n",
    "\n",
    "\n",
    "\n",
    "### Your code ends here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d023474f",
   "metadata": {
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# Solution\n",
    "import re\n",
    "emails = ['alice@example.com', 'not-an-email', 'bob@company.org', 'charlie_at_test.net']\n",
    "# Domain = label followed by one or more .label parts, so a trailing dot\n",
    "# ('alice@example.com.') or an empty label ('alice@example..com') is rejected.\n",
    "email_re = re.compile(r'[\\w.+-]+@[\\w-]+(?:\\.[\\w-]+)+')\n",
    "for e in emails:\n",
    "    # fullmatch: the whole string must be an address, not just a part of it\n",
    "    print(e, bool(email_re.fullmatch(e)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f90ad03b",
   "metadata": {},
   "source": [
    "## Lookahead & Lookbehind\n",
    "\n",
    "Match a pattern only if it is (or isn't) preceded/followed by another pattern — without including that other pattern in the match.\n",
    "\n",
    "| Syntax | Type | Meaning |\n",
    "|---|---|---|\n",
    "| `(?=...)` | Positive lookahead | Followed by |\n",
    "| `(?!...)` | Negative lookahead | NOT followed by |\n",
    "| `(?<=...)` | Positive lookbehind | Preceded by |\n",
    "| `(?<!...)` | Negative lookbehind | NOT preceded by |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "c84820d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['100', '300']\n",
      "['10', '200', '30']\n",
      "['42.99', '5.00']\n"
     ]
    }
   ],
   "source": [
    "# Positive lookahead — prices followed by USD\n",
    "text = \"100USD 200EUR 300USD\"\n",
    "print(re.findall(r\"\\d+(?=USD)\", text))     # ['100', '300']\n",
    "\n",
    "# Negative lookahead\n",
    "print(re.findall(r\"\\d+(?!USD)\", text))     # numbers NOT followed by USD\n",
    "\n",
    "# Positive lookbehind — extract amount after $\n",
    "text2 = \"Price: $42.99, discount: $5.00\"\n",
    "print(re.findall(r\"(?<=\\$)[\\d.]+\", text2)) # ['42.99', '5.00']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60e69911",
   "metadata": {
    "tags": [
     "thebe-interactive"
    ]
   },
   "outputs": [],
   "source": [
    "### EXERCISE: Lookahead & Lookbehind\n",
    "# Difficulty: Intermediate\n",
    "import re\n",
    "text = \"Alice scored 95pts, Bob scored 80pts, Charlie scored 73pts\"\n",
    "# 1. Use a positive lookahead to extract all numbers followed by 'pts'\n",
    "# 2. Use a positive lookbehind to extract numbers preceded by 'scored '\n",
    "### Your code starts here:\n",
    "\n",
    "\n",
    "\n",
    "### Your code ends here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5ae44a4",
   "metadata": {
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [],
   "source": [
    "# Solution\n",
    "import re\n",
    "text = \"Alice scored 95pts, Bob scored 80pts, Charlie scored 73pts\"\n",
    "# Lookahead: digits that are immediately followed by 'pts'\n",
    "before_pts = re.compile(r'\\d+(?=pts)')\n",
    "# Lookbehind: digits that come right after 'scored '\n",
    "after_scored = re.compile(r'(?<=scored )\\d+')\n",
    "for pattern in (before_pts, after_scored):\n",
    "    print(pattern.findall(text))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}