Python’s re module is one of the most powerful tools in a developer’s toolkit. In 2026, regular expressions remain essential for text parsing, validation, data extraction, and transformation. This guide covers Python regex from basics to advanced patterns that every developer should know.
📋 Table of Contents
Core Functions
import re
text = "Contact us at support@example.com or sales@company.org"

# One shared e-mail pattern for every example below: local part, "@",
# a domain label, then a lowercase TLD of at least two letters.
EMAIL_PATTERN = r'[\w.+-]+@[\w-]+\.[a-z]{2,}'

# re.search() — find first match anywhere in string
m = re.search(EMAIL_PATTERN, text)
if m:
    print(m.group())  # "support@example.com"

# re.match() — match at START of string only
m = re.match(r'Contact', text)  # matches
m = re.match(r'support', text)  # None (not at start)

# re.findall() — return all matches as list
emails = re.findall(EMAIL_PATTERN, text)
print(emails)  # ['support@example.com', 'sales@company.org']

# re.finditer() — return match objects (for groups + positions)
for match in re.finditer(EMAIL_PATTERN, text):
    print(f"{match.group()} at position {match.start()}-{match.end()}")

# re.sub() — replace matches
clean = re.sub(EMAIL_PATTERN, '[EMAIL]', text)
print(clean)  # "Contact us at [EMAIL] or [EMAIL]"
# re.sub() with function
def capitalize_words(m):
    """Return the text of match *m* converted to upper case."""
    matched = m.group()
    return matched.upper()


# The callable runs once per match; its return value replaces the match.
result = re.compile(r'[a-z]+').sub(capitalize_words, "hello world foo bar")
# re.split()
# Separators are runs of commas, semicolons and/or whitespace.
parts = re.compile(r'[,;\s]+').split("one, two;three four")
print(parts)  # ['one', 'two', 'three', 'four']
Groups and Named Groups
# Capturing groups ()
m = re.search(r'(\d{4})-(\d{2})-(\d{2})', '2026-05-29')
if m:
    # groups() returns every captured group, in order, as a tuple of strings.
    year, month, day = m.groups()
    print(year, month, day)  # 2026 05 29
# Named groups (?P<name>...)
m = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', '2026-05-29')
if m:
    print(m.group('year'))  # 2026
    # groupdict() maps each group name to the text it captured.
    print(m.groupdict())  # {'year': '2026', 'month': '05', 'day': '29'}
# Non-capturing group (?:...) — group without capturing
m = re.search(r'(?:https?|ftp)://(\S+)', 'Visit https://example.com/path')
if m:
    # The scheme alternation is grouped but not numbered, so the first
    # numbered group is the part after "://".
    print(m.group(1))  # 'example.com/path' (group 1, not the scheme)
# Backreferences: \1 or (?P=name) inside a pattern; \1 or \g<name> in a
# replacement string.
# Match repeated words: \1 must repeat exactly what group 1 captured, and the
# \b anchors stop partial-word hits such as the "is is" inside "this is".
m = re.search(r'\b(\w+)\s+\1\b', 'the the cat sat', re.IGNORECASE)
if m:
    print(f"Repeated word: {m.group(1)}")
# Substitution with groups: \1 and \2 in the replacement insert the text
# captured by the first and second group, here in swapped order.
result = re.sub(r'(\w+)\s(\w+)', r'\2 \1', 'first last')
print(result)  # 'last first'
Lookahead and Lookbehind
text = "price: $100, discount: $20, total: $80"
# Positive lookahead (?=...) — match if followed by
# Keeps each dollar amount that sits right before a comma or the end of the
# string; the lookahead itself consumes nothing.
prices = [hit.group() for hit in re.finditer(r'\$\d+(?=,|\s*$)', text)]
# Negative lookahead (?!...)
# Match whole words not followed by '.'. The \b anchors are required: without
# them \w+ backtracks by one character and still matches "Hell" from "Hello.".
words = re.findall(r'\b\w+\b(?!\.)', 'Hello. World! Python. Language')
# words == ['World', 'Language']
# Positive lookbehind (?<=...) — match if preceded by
# Digits count only when a literal "$" sits immediately before them; the "$"
# itself is not part of the match.
amounts = [hit.group() for hit in re.finditer(r'(?<=\$)\d+', text)]
print(amounts)  # ['100', '20', '80']
# Negative lookbehind (?<!...)
# Match digits not preceded by '-'. The lookbehind also rejects a preceding
# digit; otherwise "-45" would still yield "5" by starting one character late.
UNSIGNED_INT_RE = re.compile(r'(?<![-\d])\d+')
nums = UNSIGNED_INT_RE.findall("3 -4 10 -5")
print(nums)  # ['3', '10']
# Practical: extract quoted strings without quotes
# A capturing group between consumed quotes is used here on purpose: the
# lookaround-only form (?<=")[^"]+(?=") consumes no quotes, so the text
# between two quoted strings (' and ') would be returned as well.
strings = re.findall(r'"([^"]+)"', 'He said "hello" and "goodbye"')
print(strings)  # ['hello', 'goodbye']
Flags and Compilation
import re
# Common flags
re.IGNORECASE  # or re.I — case insensitive
re.MULTILINE  # or re.M — ^ and $ match line boundaries
re.DOTALL  # or re.S — . matches \n too
re.VERBOSE  # or re.X — allow comments and whitespace
# Verbose pattern (readable)
# With re.VERBOSE, whitespace and "#" comments inside the pattern are ignored,
# so each piece can sit on its own documented line.
email_pattern = re.compile(
    r'''
    [\w.+-]+          # local part
    @
    [\w-]+            # first domain label
    (?:\.[\w-]+)*     # any further labels (e.g. ".co" in "test.co.uk")
    \.[a-z]{2,}       # top-level domain, at least two letters
    ''',
    re.IGNORECASE | re.VERBOSE,
)
emails = email_pattern.findall("Contact: Alice@Example.COM, bob@test.co.uk")
print(emails)  # ['Alice@Example.COM', 'bob@test.co.uk']
# Compile for performance (reuse same pattern)
# The optional country code owns its trailing whitespace; a free-standing \s*
# at the front would pull the space before "800" into the second match.
phone_re = re.compile(r'(?:\+?1\s*)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}')
numbers = phone_re.findall("Call +1 (555) 123-4567 or 800.555.0100")
# numbers == ['+1 (555) 123-4567', '800.555.0100']
Real-World Patterns
import re
# URL parsing
# Every component after the domain is optional; a component that is absent
# shows up as None in groupdict().
URL_RE = re.compile(
    r'https?://'
    r'(?:www\.)?'                   # leading "www." is dropped from the domain
    r'(?P<domain>[a-zA-Z0-9.-]+)'
    r'(?P<port>:\d+)?'              # captured with its leading ":"
    r'(?P<path>/[^\s?#]*)?'
    r'(?P<query>\?[^\s#]*)?'        # captured with its leading "?"
    r'(?P<fragment>#\S*)?'          # captured with its leading "#"
)
m = URL_RE.match('https://api.example.com:8080/v1/users?page=1#top')
if m:
    print(m.groupdict())
# Parse log lines
# Expected layout: "YYYY-MM-DD HH:MM:SS [LEVEL] message"
LOG_RE = re.compile(''.join((
    r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})',
    r' \[(?P<level>DEBUG|INFO|WARNING|ERROR)\]',
    r' (?P<message>.+)',
)))
# Lines that do not match LOG_RE are skipped.
for line in log_file:
    if m := LOG_RE.match(line):  # walrus operator: bind and test in one step
        process_log(m.group('level'), m.group('message'))
# Extract code blocks from markdown
# Group 1 is the optional language tag after the opening fence (an empty
# string when absent); group 2 is the body. re.DOTALL lets ".*?" span lines,
# and the lazy quantifier stops at the first closing fence.
code_blocks = re.findall(r'```(\w+)?\n(.*?)```', markdown_text, re.DOTALL)
for lang, code in code_blocks:
    print(f"Language: {lang}, Code length: {len(code)}")
# Validate passwords (must have upper, lower, digit, special)
def is_strong_password(pwd: str) -> bool:
    """Return whether *pwd* meets the minimum strength rules.

    A password passes when it is at least 8 characters long and contains an
    uppercase letter, a lowercase letter, a digit and one of ``!@#$%^&*``.
    """
    patterns = [
        r'[A-Z]',       # uppercase
        r'[a-z]',       # lowercase
        r'\d',          # digit
        r'[!@#$%^&*]',  # special
    ]
    # Check the cheap length test first; all() stops at the first missing class.
    return len(pwd) >= 8 and all(re.search(p, pwd) for p in patterns)
# HTML tag removal
# Anything from "<" up to the next ">" is treated as a tag and deleted.
clean_html = re.compile(r'<[^>]+>').sub('', '<p>Hello <b>world</b></p>')
print(clean_html)  # "Hello world"
# CamelCase to snake_case
def camel_to_snake(name: str) -> str:
    """Convert a CamelCase identifier to snake_case.

    Runs of capitals are kept together as one word, so ``HTTPSConnection``
    becomes ``https_connection``.
    """
    # Split a run of capitals from the capitalised word that follows it.
    s = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', name)
    # Split a lowercase letter or digit from the capital that follows it.
    return re.sub(r'([a-z\d])([A-Z])', r'\1_\2', s).lower()


print(camel_to_snake('CamelCaseString'))  # 'camel_case_string'
print(camel_to_snake('HTTPSConnection'))  # 'https_connection'
In 2026, Python regex remains essential for any text-processing task. Master the core functions (search, findall, sub), understand groups and named groups for extraction, use lookaheads and lookbehinds for context-aware matching, and always compile frequently-used patterns. For complex parsing tasks, consider libraries like pyparsing or lark (formerly published as lark-parser) — regex gets unwieldy for deeply nested structures.
📚 You might also like
🔗 Share this article




✍️ Leave a Comment