{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a13d93b5",
   "metadata": {},
   "source": [
    "# Markov analysis\n",
    "\n",
    "We can do better with Markov chain text analysis, which computes, for each word in a text, the list of words that come next.\n",
    "As an example, we'll analyze these lyrics from the Monty Python song *Eric, the Half a Bee*:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6f57594",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Find project root by looking for _config.yml\n",
    "current = Path.cwd()\n",
    "for parent in [current, *current.parents]:\n",
    "    if (parent / '_config.yml').exists():\n",
    "        project_root = parent\n",
    "        break\n",
    "else:\n",
    "    project_root = Path.cwd().parent.parent\n",
    "\n",
    "# Add project root to path\n",
    "sys.path.insert(0, str(project_root))\n",
    "\n",
    "# Import shared teaching helpers and cell magics\n",
    "from shared import thinkpython, diagram, jupyturtle, structshape\n",
    "from shared.download import download\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d96e2aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import unicodedata\n",
    "\n",
    "# Text-analysis setup shared by the split sections.\n",
    "data_dir = project_root / 'data'\n",
    "data_dir.mkdir(parents=True, exist_ok=True)\n",
    "raw_path = data_dir / 'pg43.txt'\n",
    "filename = data_dir / 'dr_jekyll.txt'\n",
    "\n",
    "if not filename.exists():\n",
    "    if not raw_path.exists():\n",
    "        download('https://www.gutenberg.org/cache/epub/43/pg43.txt', str(raw_path))\n",
    "\n",
    "    with open(raw_path, encoding='utf-8') as reader, open(filename, 'w', encoding='utf-8') as writer:\n",
    "        in_body = False\n",
    "        for line in reader:\n",
    "            if line.startswith('***'):\n",
    "                if not in_body:\n",
    "                    in_body = True\n",
    "                    continue\n",
    "                break\n",
    "            if in_body:\n",
    "                writer.write(line)\n",
    "\n",
    "def split_line(line):\n",
    "    return line.replace('—', ' ').split()\n",
    "\n",
    "punc_marks = {}\n",
    "for line in open(filename, encoding='utf-8'):\n",
    "    for char in line:\n",
    "        category = unicodedata.category(char)\n",
    "        if category.startswith('P'):\n",
    "            punc_marks[char] = 1\n",
    "\n",
    "punctuation = ''.join(punc_marks)\n",
    "\n",
    "def clean_word(word):\n",
    "    return word.strip(punctuation).lower()\n",
    "\n",
    "word_counter = {}\n",
    "for line in open(filename, encoding='utf-8'):\n",
    "    for word in split_line(line):\n",
    "        word = clean_word(word)\n",
    "        if word:\n",
    "            word_counter[word] = word_counter.get(word, 0) + 1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "3171d592",
   "metadata": {},
   "outputs": [],
   "source": [
    "song = \"\"\"\n",
    "Half a bee, philosophically,\n",
    "Must, ipso facto, half not be.\n",
    "But half the bee has got to be\n",
    "Vis a vis, its entity. D'you see?\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "583ab9f0",
   "metadata": {},
   "source": [
    "To store the results, we'll use a dictionary that maps from each word to the list of words that follow it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "3321e6a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "successor_map = {}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5d85b09",
   "metadata": {},
   "source": [
    "As an example, let's start with the first two words of the song."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "e4e55c71",
   "metadata": {},
   "outputs": [],
   "source": [
    "first = 'half'\n",
    "second = 'a'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0349fe78",
   "metadata": {},
   "source": [
    "If the first word is not in `successor_map`, we have to add a new item that maps from the first word to a list containing the second word."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "f25dcb5e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'half': ['not']}"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "successor_map[first] = [second]\n",
    "successor_map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55bb8df9",
   "metadata": {},
   "source": [
    "If the first word is already in the dictionary, we can look it up to get the list of successors we've seen so far, and append the new one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "id": "990354a0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'half': ['a', 'not']}"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first = 'half'\n",
    "second = 'not'\n",
    "\n",
    "successor_map[first].append(second)\n",
    "successor_map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6289cc32",
   "metadata": {},
   "source": [
    "The following function encapsulates these steps."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "b9371452",
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_bigram(bigram):\n",
    "    first, second = bigram\n",
    "    \n",
    "    if first not in successor_map:\n",
    "        successor_map[first] = [second]\n",
    "    else:\n",
    "        successor_map[first].append(second)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74a51700",
   "metadata": {},
   "source": [
    "If the same bigram appears more that once, the second word is added to the list more than once.\n",
    "In this way, `successor_map` keeps track of how many times each successor appears.\n",
    "\n",
    "As we did in the previous section, we'll use a list called `window` to store pairs of consecutive words.\n",
    "And we'll use the following function to process the words one at a time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "8c3f45c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_word_bigram(word):\n",
    "    window.append(word)\n",
    "    \n",
    "    if len(window) == 2:\n",
    "        add_bigram(window)\n",
    "        window.pop(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "861a60d9",
   "metadata": {},
   "source": [
    "Here's how we use it to process the words in the song."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "641990a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "successor_map = {}\n",
    "window = []\n",
    "\n",
    "for word in song.split():\n",
    "    word = clean_word(word)\n",
    "    process_word_bigram(word)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf490d67",
   "metadata": {},
   "source": [
    "And here are the results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "9322a49a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'half': ['a', 'not', 'the'],\n",
       " 'a': ['bee', 'vis'],\n",
       " 'bee': ['philosophically', 'has'],\n",
       " 'philosophically': ['must'],\n",
       " 'must': ['ipso'],\n",
       " 'ipso': ['facto'],\n",
       " 'facto': ['half'],\n",
       " 'not': ['be'],\n",
       " 'be': ['but', 'vis'],\n",
       " 'but': ['half'],\n",
       " 'the': ['bee'],\n",
       " 'has': ['got'],\n",
       " 'got': ['to'],\n",
       " 'to': ['be'],\n",
       " 'vis': ['a', 'its'],\n",
       " 'its': ['entity'],\n",
       " 'entity': [\"d'you\"],\n",
       " \"d'you\": ['see']}"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "successor_map"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff7bad74",
   "metadata": {},
   "source": [
    "The word `'half'` can be followed by `'a'`, `'not'`, or `'the'`.\n",
    "The word `'a'` can be followed by `'bee'` or `'vis'`.\n",
    "Most of the other words appear only once, so they are followed by only a single word.\n",
    "\n",
    "Now let's analyze the book."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "id": "45a60c52",
   "metadata": {},
   "outputs": [],
   "source": [
    "successor_map = {}\n",
    "window = []\n",
    "\n",
    "for line in open(filename):\n",
    "    for word in split_line(line):\n",
    "        word = clean_word(word)\n",
    "        process_word_bigram(word)\n",
    "        \n",
    "        \n",
    "# from collections import defaultdict\n",
    "# model = defaultdict(list)\n",
    "\n",
    "# for i in range(len(words) - 1):\n",
    "#     current_word = words[i]\n",
    "#     next_word = words[i + 1]\n",
    "#     model[current_word].append(next_word)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2676e2fb",
   "metadata": {},
   "source": [
    "We can look up any word and find the words that can follow it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "id": "3e86102c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "story ['of', 'of', 'indeed', 'but', 'for', 'of', 'that']\n",
      "incident ['of', 'of', 'at', 'of', 'of', 'at', 'this']\n",
      "lanyon’s ['narrative', 'there', 'face', 'manner', 'narrative', 'the', 'condemnation']\n",
      "common ['it', 'interest', 'friends', 'friends', 'observers', 'but', 'quarry']\n",
      "relief ['the', 'to', 'when', 'that', 'that', 'of', 'it']\n",
      "appearance ['of', 'well', 'something', 'he', 'amply', 'of', 'which']\n",
      "going ['east', 'in', 'to', 'to', 'up', 'to', 'of']\n",
      "till ['at', 'the', 'i', 'yesterday', 'the', 'that', 'weariness']\n",
      "walk ['and', 'into', 'with', 'all', 'steadfastly', 'attired', 'with']\n",
      "sounds ['nothing', 'carried', 'out', 'the', 'of', 'of', 'with']\n",
      "really ['like', 'damnable', 'can', 'a', 'a', 'not', 'be']\n",
      "does ['not', 'not', 'indeed', 'not', 'the', 'not', 'not']\n",
      "reply ['i', 'but', 'whose', 'i', 'some', 'that’s', 'i']\n",
      "continued ['mr', 'the', 'the', 'the', 'the', 'poole', 'utterson']\n",
      "seems ['scarcely', 'hardly', 'to', 'she', 'much', 'he', 'to']\n",
      "walked ['on', 'over', 'some', 'was', 'on', 'with', 'fast']\n",
      "that’s ['a', 'it', 'talking', 'very', 'not', 'such', 'not']\n",
      "although ['i', 'a', 'it', 'the', 'we', 'they', 'i']\n",
      "until ['the', 'the', 'they', 'they', 'the', 'to-morrow', 'i']\n",
      "disappearance ['or', 'the', 'of', 'of', 'here', 'and', 'but']\n",
      "step ['into', 'or', 'back', 'natural', 'into', 'of', 'leaping']\n",
      "wish ['the', 'you', 'to', 'you', 'to', 'i', 'to']\n",
      "aware ['of', 'of', 'of', 'that', 'jekyll’s', 'that', 'of']\n",
      "thank ['you', 'you', 'you', 'you', 'you', 'god', 'you']\n",
      "maid ['servant', 'described', 'fainted', 'had', 'calls', 'had', 'lifted']\n",
      "besides ['were', 'was', 'for', 'a', 'with', 'with', 'which']\n",
      "observed ['the', 'utterson', 'with', 'the', 'that', 'that', 'that']\n",
      "among ['other', 'the', 'the', 'the', 'my', 'my', 'temptations']\n"
     ]
    }
   ],
   "source": [
    "# I used this cell to find a predecessor with a good number of \n",
    "# possible successors and at least one repeated word.\n",
    "\n",
    "def has_duplicates(t):\n",
    "    return len(set(t)) < len(t)         ### A set is a collection of unique \n",
    "                                        ### elements, so if the length of \n",
    "                                        # the set is less than the length \n",
    "                                        # of the original list, it means \n",
    "                                        # there are duplicates in the list.\n",
    "\n",
    "for key, value in successor_map.items():\n",
    "    if len(value) == 7 and has_duplicates(value):\n",
    "        print(key, value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "e49d52f7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['east', 'in', 'to', 'to', 'up', 'to', 'of']"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "successor_map['going']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b777a9c",
   "metadata": {},
   "source": [
    "In this list of successors, notice that the word `'to'` appears three times -- the other successors only appear once."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "8b4c2ef8",
   "metadata": {
    "tags": [
     "thebe-interactive"
    ]
   },
   "outputs": [],
   "source": [
    "### EXERCISE: Building a Successor Map\n",
    "\n",
    "text = \"alice was nice alice was clever alice was kind\"\n",
    "# 1. Split the text into words\n",
    "# 2. Build a successor_map where each word maps to the list of words that follow it\n",
    "# 3. Print the map and verify that 'alice' maps to ['was', 'was', 'was']\n",
    "### Your code starts here:\n",
    "\n",
    "\n",
    "\n",
    "### Your code ends here.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "c6880389",
   "metadata": {
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'alice': ['was', 'was', 'was'], 'was': ['nice', 'clever', 'kind'], 'nice': ['alice'], 'clever': ['alice']}\n"
     ]
    }
   ],
   "source": [
    "# Solution\n",
    "text = \"alice was nice alice was clever alice was kind\"\n",
    "words_list = text.split()\n",
    "smap = {}\n",
    "for i in range(len(words_list) - 1):\n",
    "    first, second = words_list[i], words_list[i + 1]\n",
    "    if first not in smap:\n",
    "        smap[first] = [second]\n",
    "    else:\n",
    "        smap[first].append(second)\n",
    "print(smap)\n",
    "# 'alice' → ['was', 'was', 'was'], 'was' → ['nice', 'clever', 'kind']\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8bf85fc",
   "metadata": {},
   "source": [
    "## Generating text\n",
    "\n",
    "We can use the results from the previous section to generate new text with the same relationships between consecutive words as in the original.\n",
    "Here's how it works:\n",
    "\n",
    "* Starting with any word that appears in the text, we look up its possible successors and choose one **at random**.\n",
    "\n",
    "* Then, using the chosen word, we look up its possible successors, and choose one **at random**.\n",
    "\n",
    "We can repeat this process to generate as many words as we want.\n",
    "As an example, let's start with the word `'although'`.\n",
    "Here are the words that can follow it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "id": "15108884",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['i', 'a', 'it', 'the', 'we', 'they', 'i']"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word = 'although'\n",
    "successors = successor_map[word]\n",
    "successors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "747a41be",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# this cell initializes the random number generator so it \n",
    "# starts at the same point in the sequence each time this\n",
    "# notebook runs.\n",
    "\n",
    "random.seed(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b26a2ead",
   "metadata": {},
   "source": [
    "We can use `choice` to choose from the list with equal probability."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "5a4682dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'i'"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word = random.choice(successors)\n",
    "word"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9741beca",
   "metadata": {},
   "source": [
    "If the same word appears more than once in the list, it is more likely to be selected.\n",
    "\n",
    "Repeating these steps, we can use the following loop to generate a longer series."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "id": "36ee0f76",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "to his study a word or death of colour not "
     ]
    }
   ],
   "source": [
    "for i in range(10):\n",
    "    successors = successor_map[word]\n",
    "    word = random.choice(successors)\n",
    "    print(word, end=' ')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38a2d79a",
   "metadata": {},
   "source": [
    "The result sounds more like a real sentence, but it still doesn't make much sense.\n",
    "\n",
    "We can do better using more than one word as a key in `successor_map`.\n",
    "For example, we can make a dictionary that maps from each bigram -- or trigram -- to the list of words that come next. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "65541203",
   "metadata": {
    "tags": [
     "thebe-interactive"
    ]
   },
   "outputs": [],
   "source": [
    "### EXERCISE: Generate Text from Successor Map\n",
    "\n",
    "import random\n",
    "random.seed(7)\n",
    "# Using the successor_map built from Dr. Jekyll and Mr. Hyde:\n",
    "# 1. Start with the word 'jekyll'\n",
    "# 2. Generate a sequence of 8 more words by repeatedly looking up successors\n",
    "#    and choosing one at random with random.choice()\n",
    "# 3. Print the full generated sequence as a sentence\n",
    "### Your code starts here:\n",
    "\n",
    "\n",
    "\n",
    "### Your code ends here.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "41f77d8e",
   "metadata": {
    "tags": [
     "hide-input"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "jekyll god cried the situation tell you say is\n"
     ]
    }
   ],
   "source": [
    "# Solution\n",
    "import random\n",
    "random.seed(7)\n",
    "word = 'jekyll'\n",
    "result = [word]\n",
    "for i in range(8):\n",
    "    if word in successor_map and successor_map[word]:\n",
    "        word = random.choice(successor_map[word])\n",
    "        result.append(word)\n",
    "    else:\n",
    "        break\n",
    "print(' '.join(result))\n"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Tags",
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}