9.2.5. Advanced Topics

import sys
from pathlib import Path

# Locate the project root: the closest directory, starting at the current
# working directory and walking up through its ancestors, that holds _config.yml.
current = Path.cwd()
candidates = (current, *current.parents)
project_root = next(
    (folder for folder in candidates if (folder / '_config.yml').exists()),
    Path.cwd().parent.parent,  # fallback when no ancestor has the marker: two levels above cwd
)

# Make the project root the first entry on the import search path
sys.path.insert(0, str(project_root))

# Import shared teaching helpers and cell magics
from shared import thinkpython, diagram, jupyturtle, structshape
from shared.download import download
import re

9.2.5.1. Flags

Flags change matching behavior. Several flags can be combined with the bitwise OR operator |, e.g. re.IGNORECASE | re.MULTILINE:

| Flag | Shorthand | Meaning |
|---|---|---|
| `re.IGNORECASE` | `re.I` | Case-insensitive matching |
| `re.MULTILINE` | `re.M` | `^`/`$` match line start/end |
| `re.DOTALL` | `re.S` | `.` matches newline too |
| `re.VERBOSE` | `re.X` | Allow comments/whitespace in pattern |

# IGNORECASE — passed here via its shorthand re.I, so every casing of "hello" is found
print(re.findall(r"hello", "Hello HELLO hello", re.I))  # ['Hello', 'HELLO', 'hello']

# DOTALL — dot matches newline
# text has "\n" between the opening and closing tag, so ".*" can only span
# from <div> to </div> because DOTALL lets "." match the newlines as well.
text = "<div>\nsome content\n</div>"
print(re.findall(r"<div>.*</div>", text, re.DOTALL))  # matches across lines

# VERBOSE — write readable patterns with comments
# With re.VERBOSE the layout whitespace and the "# ..." notes inside the
# pattern string below are not part of what is matched; they only document
# each piece of the pattern.
email_pattern = re.compile(r"""
    [\w.+-]+       # username
    @              # at sign
    [\w-]+         # domain name
    \.             # dot
    [\w.]+         # TLD
""", re.VERBOSE)

# findall() on the compiled pattern returns every address found in the sentence
print(email_pattern.findall("Contact us at hello@example.com or support@test.org"))
['Hello', 'HELLO', 'hello']
['<div>\nsome content\n</div>']
['hello@example.com', 'support@test.org']
### EXERCISE: Regex Flags
# Difficulty: Basic
import re
log = """INFO: Server started
error: disk full
WARNING: low memory
ERROR: connection lost"""
# 1. Use re.findall() with re.IGNORECASE | re.MULTILINE to
#    extract every line that begins with 'error' (in any letter case)
# 2. Print the list of matches
# Expected output:
#   ['error: disk full', 'ERROR: connection lost']
### Your code starts here:



### Your code ends here.

Hide code cell source

# Solution
import re
log = """INFO: Server started
error: disk full
WARNING: low memory
ERROR: connection lost"""
# MULTILINE makes ^ and $ anchor at the start/end of every line of log rather
# than only at the start/end of the whole string; IGNORECASE lets 'error'
# match 'ERROR' too. DOTALL is not set, so '.*' stops at the end of the line
# and each match is exactly one log line.
results = re.findall(r'^error.*$', log, flags=re.IGNORECASE | re.MULTILINE)
print(results)  # ['error: disk full', 'ERROR: connection lost']
['error: disk full', 'ERROR: connection lost']

9.2.5.2. Compiled Patterns

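Use re.compile() when reusing the same pattern multiple times: the pattern is defined and named once, then reused through its own methods. This is cleaner, and it also skips the pattern-cache lookup that module-level functions such as re.findall() perform on every call (the re module already caches recently used patterns, so the speed gain is modest).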

# Compile once, use many times
# Pattern: three digits, a '-' or '.', three digits, a '-' or '.', four digits,
# with \b word boundaries at both ends.
phone_pattern = re.compile(r"\b\d{3}[-.]\d{3}[-.]\d{4}\b")

texts = [
    "Call me at 123-456-7890",
    "My number is 987.654.3210",
    "No phone here",
    "Reach us at 555-123-4567 or 800-999-0000"
]

for t in texts:
    # The same compiled pattern object is reused for every text
    matches = phone_pattern.findall(t)
    # findall() returns an empty list when nothing matches, so texts without
    # a phone number print nothing
    if matches:
        print(f"Found: {matches} in '{t}'")
Found: ['123-456-7890'] in 'Call me at 123-456-7890'
Found: ['987.654.3210'] in 'My number is 987.654.3210'
Found: ['555-123-4567', '800-999-0000'] in 'Reach us at 555-123-4567 or 800-999-0000'
### EXERCISE: Compiled Patterns
# Difficulty: Basic
import re
emails = ['alice@example.com', 'not-an-email', 'bob@company.org', 'charlie_at_test.net']
# 1. Compile a regex pattern that matches a simple email address
# 2. Print each email with True or False using the compiled pattern
# Expected output:
#   alice@example.com True
#   not-an-email False
#   bob@company.org True
#   charlie_at_test.net False
### Your code starts here:



### Your code ends here.

Hide code cell source

# Solution
import re

emails = ['alice@example.com', 'not-an-email', 'bob@company.org', 'charlie_at_test.net']

# Compiled once up front; the loop below reuses this single pattern object.
email_re = re.compile(r'[\w.+-]+@[\w-]+\.[\w.]+')

for candidate in emails:
    # fullmatch() yields a match object only when the pattern covers the
    # entire string, and None otherwise.
    is_email = email_re.fullmatch(candidate) is not None
    print(candidate, is_email)
alice@example.com True
not-an-email False
bob@company.org True
charlie_at_test.net False

9.2.5.3. Lookahead & Lookbehind

Match a pattern only if it is (or isn’t) preceded/followed by another pattern — without including that other pattern in the match.

| Syntax | Type | Meaning |
|---|---|---|
| `(?=...)` | Positive lookahead | Followed by |
| `(?!...)` | Negative lookahead | NOT followed by |
| `(?<=...)` | Positive lookbehind | Preceded by |
| `(?<!...)` | Negative lookbehind | NOT preceded by |

# Positive lookahead — prices followed by USD
text = "100USD 200EUR 300USD"
print(re.findall(r"\d+(?=USD)", text))     # ['100', '300']

# Negative lookahead — pitfall: \d+ is allowed to backtrack to fewer digits.
# In "100USD" the full "100" is rejected because USD follows it, but the
# shorter "10" is accepted: what follows it is "0USD", which does not start
# with "USD". The same happens with "300USD", giving ['10', '200', '30'].
# To get only whole numbers not followed by USD, also forbid stopping in the
# middle of a number: r"\d+(?!\d|USD)" returns ['200'] for this text.
print(re.findall(r"\d+(?!USD)", text))     # ['10', '200', '30'] — not just ['200']

# Positive lookbehind — extract amount after $
# The "$" must be present just before the match but is not part of the result
text2 = "Price: $42.99, discount: $5.00"
print(re.findall(r"(?<=\$)[\d.]+", text2)) # ['42.99', '5.00']
['100', '300']
['10', '200', '30']
['42.99', '5.00']
### EXERCISE: Lookahead & Lookbehind
# Difficulty: Intermediate
import re
text = "Alice scored 95pts, Bob scored 80pts, Charlie scored 73pts"
# 1. Use a positive lookahead to extract all numbers followed by 'pts'
# 2. Use a positive lookbehind to extract numbers preceded by 'scored '
# Expected output (one list per step):
#   ['95', '80', '73']
#   ['95', '80', '73']
### Your code starts here:



### Your code ends here.

Hide code cell source

# Solution
import re
text = "Alice scored 95pts, Bob scored 80pts, Charlie scored 73pts"
# (?=pts) requires 'pts' to come right after the digits but keeps it out of the match
print(re.findall(r'\d+(?=pts)', text))        # positive lookahead  -> ['95', '80', '73']
# (?<=scored ) requires 'scored ' (including the space) right before the digits
print(re.findall(r'(?<=scored )\d+', text))   # positive lookbehind -> ['95', '80', '73']
['95', '80', '73']
['95', '80', '73']