9.2.5. Advanced Topics
import sys
from pathlib import Path
# Find the project root: the nearest directory, starting at the current
# working directory and walking up through its parents, that holds _config.yml.
current = Path.cwd()
for parent in [current, *current.parents]:
    if (parent / '_config.yml').exists():
        project_root = parent
        break
else:
    # The loop finished without `break`, i.e. no _config.yml was found at any
    # level: fall back to the directory two levels above the working directory.
    project_root = Path.cwd().parent.parent

# Put the project root at the front of sys.path so modules located there
# take precedence when imported.
sys.path.insert(0, str(project_root))
# Import shared teaching helpers and cell magics
from shared import thinkpython, diagram, jupyturtle, structshape
from shared.download import download
import re
9.2.5.1. Flags
Flags change matching behavior:
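| Flag | Shorthand | Meaning |
|---|---|---|
| `re.IGNORECASE` | `re.I` | Case-insensitive matching |
| `re.MULTILINE` | `re.M` | `^` and `$` match at the start and end of every line |
| `re.DOTALL` | `re.S` | `.` matches any character, including newline |
| `re.VERBOSE` | `re.X` | Allow comments/whitespace in pattern |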
# IGNORECASE (re.I): letter case is ignored while matching
greeting_hits = re.findall(r"hello", "Hello HELLO hello", flags=re.IGNORECASE)
print(greeting_hits)  # ['Hello', 'HELLO', 'hello']

# DOTALL (re.S): "." also matches a newline, so ".*" can span several lines
text = "<div>\nsome content\n</div>"
div_hits = re.findall(r"<div>.*</div>", text, flags=re.S)
print(div_hits)  # a single match covering all three lines

# VERBOSE (re.X): whitespace and "#" comments inside the pattern are ignored,
# which lets each piece of the pattern be labelled on its own line
email_pattern = re.compile(r"""
[\w.+-]+ # username
@ # at sign
[\w-]+ # domain name
\. # dot
[\w.]+ # TLD
""", flags=re.X)
sample = "Contact us at hello@example.com or support@test.org"
print(email_pattern.findall(sample))
['Hello', 'HELLO', 'hello']
['<div>\nsome content\n</div>']
['hello@example.com', 'support@test.org']
### EXERCISE: Regex Flags
# Difficulty: Basic
# Goal: combine two flags in a single call so that the search ignores letter
# case and is applied to each line of `log` individually.
import re
# Four-line sample log; the severity labels differ in letter case.
log = """INFO: Server started
error: disk full
WARNING: low memory
ERROR: connection lost"""
# 1. Use re.findall() with re.IGNORECASE | re.MULTILINE to
#    extract every line that begins with 'error' (in any letter case)
# 2. Print the list of matches
#    Expected output: ['error: disk full', 'ERROR: connection lost']
### Your code starts here:
### Your code ends here.
['error: disk full', 'ERROR: connection lost']
9.2.5.2. Compiled Patterns
Use re.compile() when reusing the same pattern multiple times — more efficient and cleaner.
# Compile once, use many times.
# The pattern matches three digits, a separator, three digits, a separator and
# four digits; each separator may independently be "-" or ".", and \b keeps the
# match from starting or ending in the middle of a longer word/number.
phone_pattern = re.compile(r"\b\d{3}[-.]\d{3}[-.]\d{4}\b")
texts = [
    "Call me at 123-456-7890",
    "My number is 987.654.3210",
    "No phone here",
    "Reach us at 555-123-4567 or 800-999-0000",
]
for t in texts:
    matches = phone_pattern.findall(t)
    # findall() returns an empty (falsy) list when nothing matches, so texts
    # without a phone number are skipped silently.
    if matches:
        print(f"Found: {matches} in '{t}'")
Found: ['123-456-7890'] in 'Call me at 123-456-7890'
Found: ['987.654.3210'] in 'My number is 987.654.3210'
Found: ['555-123-4567', '800-999-0000'] in 'Reach us at 555-123-4567 or 800-999-0000'
### EXERCISE: Compiled Patterns
# Difficulty: Basic
# Goal: build one compiled pattern and reuse it for every entry in `emails`.
import re
# Two entries are well-formed addresses; the other two contain no '@'.
emails = ['alice@example.com', 'not-an-email', 'bob@company.org', 'charlie_at_test.net']
# 1. Compile a regex pattern that matches a simple email address
# 2. Print each email with True or False using the compiled pattern,
#    e.g. "alice@example.com True" and "not-an-email False"
### Your code starts here:
### Your code ends here.
alice@example.com True
not-an-email False
bob@company.org True
charlie_at_test.net False
9.2.5.3. Lookahead & Lookbehind
Match a pattern only if it is (or isn’t) preceded/followed by another pattern — without including that other pattern in the match.
| Syntax | Type | Meaning |
|---|---|---|
| `(?=...)` | Positive lookahead | Followed by `...` |
| `(?!...)` | Negative lookahead | NOT followed by `...` |
| `(?<=...)` | Positive lookbehind | Preceded by `...` |
| `(?<!...)` | Negative lookbehind | NOT preceded by `...` |
# Positive lookahead: digit runs immediately followed by "USD";
# the "USD" is checked but is not part of the match.
text = "100USD 200EUR 300USD"
usd_amounts = re.findall(r"\d+(?=USD)", text)
print(usd_amounts)  # ['100', '300']

# Negative lookahead: digit runs NOT immediately followed by "USD".
# Careful: \d+ backtracks, so "100USD" still yields "10" (its next character
# is "0", not "USD") and "300USD" yields "30"; the result is ['10', '200', '30'].
not_usd = re.findall(r"\d+(?!USD)", text)
print(not_usd)

# Positive lookbehind: digits/dots that come right after a literal "$"
text2 = "Price: $42.99, discount: $5.00"
dollar_amounts = re.findall(r"(?<=\$)[\d.]+", text2)
print(dollar_amounts)  # ['42.99', '5.00']
['100', '300']
['10', '200', '30']
['42.99', '5.00']
### EXERCISE: Lookahead & Lookbehind
# Difficulty: Intermediate
# Goal: extract the same three scores twice, once anchored on the text that
# follows each number and once on the text that precedes it.
import re
text = "Alice scored 95pts, Bob scored 80pts, Charlie scored 73pts"
# 1. Use a positive lookahead to extract all numbers followed by 'pts'
# 2. Use a positive lookbehind to extract numbers preceded by 'scored '
#    (note the trailing space inside the lookbehind)
# Print both lists; each should be ['95', '80', '73']
### Your code starts here:
### Your code ends here.
['95', '80', '73']
['95', '80', '73']