Das re-Modul von Python ist eines der leistungsstärksten Tools im Toolkit eines Entwicklers. Auch im Jahr 2026 bleiben reguläre Ausdrücke für die Textanalyse, Validierung, Datenextraktion und Transformation unverzichtbar. Dieser Leitfaden behandelt Python-Regex von den Grundlagen bis hin zu fortgeschrittenen Mustern, die jeder Entwickler kennen sollte.
📋 Table of Contents
Kernfunktionen
import re
# Sample input shared by the search/match/findall/finditer/sub examples below.
text = "Contact us at support@example.com or sales@company.org"

# re.search() — find first match anywhere in string
m = re.search(r'[\w.+-]+@[\w-]+\.[a-z]{2,}', text)
if m:
    print(m.group())  # "support@example.com"

# re.match() — match at START of string only
m = re.match(r'Contact', text)  # matches
m = re.match(r'support', text)  # None (not at start)

# re.findall() — return all matches as list
emails = re.findall(r'[\w.+-]+@[\w-]+\.[a-z]{2,}', text)
print(emails)  # ['support@example.com', 'sales@company.org']

# re.finditer() — return match objects (for groups + positions)
for match in re.finditer(r'[\w.+-]+@[\w-]+\.[a-z]{2,}', text):
    # start()/end() give the half-open slice text[start:end] of the match.
    print(f"{match.group()} at position {match.start()}-{match.end()}")

# re.sub() — replace matches
clean = re.sub(r'[\w.+-]+@[\w-]+\.[a-z]{2,}', '[EMAIL]', text)
print(clean)  # "Contact us at [EMAIL] or [EMAIL]"
# re.sub() with a callable: the function receives each match object and
# returns the text that replaces it.
def capitalize_words(m):
    """Return the matched text converted to upper case."""
    matched_text = m.group()
    return matched_text.upper()


result = re.sub(
    pattern=r'[a-z]+',
    repl=capitalize_words,
    string="hello world foo bar",
)

# re.split() — split on any run of commas, semicolons or whitespace
parts = re.split(pattern=r'[,;\s]+', string="one, two;three four")
print(parts)  # ['one', 'two', 'three', 'four']
Gruppen und benannte Gruppen
# Capturing groups ()
m = re.search(r'(\d{4})-(\d{2})-(\d{2})', '2026-05-29')
if m:
    # groups() returns every captured group as a tuple of strings.
    year, month, day = m.groups()
    print(year, month, day)  # 2026 05 29

# Named groups (?P<name>...)
m = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})', '2026-05-29')
if m:
    print(m.group('year'))  # 2026
    print(m.groupdict())  # {'year': '2026', 'month': '05', 'day': '29'}

# Non-capturing group (?:...) — group without capturing
m = re.search(r'(?:https?|ftp)://(\S+)', 'Visit https://example.com/path')
if m:
    # The scheme alternation is non-capturing, so the first numbered group
    # is the part after "://".
    print(m.group(1))  # 'example.com/path' (group 1, not the scheme)
# Backreferences: \1 (or (?P=name)) inside a pattern,
# \1 or \g<name> inside a replacement string.

# Match repeated words: \1 must repeat exactly what group 1 captured
# (case-insensitively here). The \b anchors prevent partial hits such as
# "the there".
m = re.search(r'\b(\w+)\s+\1\b', 'the the cat sat', re.IGNORECASE)
if m:
    print(f"Repeated word: {m.group(1)}")  # Repeated word: the

# Substitution with groups: \2 \1 emits the two captured words swapped.
result = re.sub(r'(\w+)\s(\w+)', r'\2 \1', 'first last')
print(result)  # 'last first'
Lookahead und Lookbehind
# Sample input shared by the lookahead/lookbehind examples below.
text = "price: $100, discount: $20, total: $80"

# Positive lookahead (?=...) — match if followed by
prices = re.findall(r'\$\d+(?=,|\s*$)', text)  # prices before comma or end
print(prices)  # ['$100', '$20', '$80']

# Negative lookahead (?!...)
# Match words not followed by '.'
# The \b anchors are required: without them \w+ backtracks one character
# ("Hell" is not followed by '.') and truncated words are returned.
words = re.findall(r'\b\w+\b(?!\.)', 'Hello. World! Python. Language')
print(words)  # ['World', 'Language']

# Positive lookbehind (?<=...) — match if preceded by
amounts = re.findall(r'(?<=\$)\d+', text)  # numbers after $
print(amounts)  # ['100', '20', '80']

# Negative lookbehind (?<!...)
# Match digits not preceded by '-'
# Also reject a preceding digit, otherwise the tail of a multi-digit
# negative number (the "5" in "-45") would be reported as a match.
nums = re.findall(r'(?<![-\d])\d+', "3 -4 10 -5")
print(nums)  # ['3', '10']

# Practical: extract quoted strings without quotes
# A capturing group is used instead of lookarounds: lookarounds do not
# consume the quote characters, so the text BETWEEN two quoted strings
# (' and ') would also sit between quotes and be returned.
strings = re.findall(r'"([^"]+)"', 'He said "hello" and "goodbye"')
print(strings)  # ['hello', 'goodbye']
Flags und Kompilierung
import re
# Common flags (bare attribute references, listed for illustration only)
re.IGNORECASE  # or re.I — case insensitive
re.MULTILINE  # or re.M — ^ and $ match line boundaries
re.DOTALL  # or re.S — . matches \n too
re.VERBOSE  # or re.X — allow comments and whitespace

# Verbose pattern (readable): with re.VERBOSE, whitespace and #-comments
# inside the pattern are ignored, so each part can be annotated.
email_pattern = re.compile(
    r'''
    [\w.+-]+          # local part
    @
    [\w-]+            # first domain label
    (?:\.[\w-]+)*     # any further labels (e.g. ".co" in ".co.uk")
    \.[a-z]{2,}       # top-level domain
    ''',
    re.IGNORECASE | re.VERBOSE,
)
emails = email_pattern.findall("Contact: Alice@Example.COM, bob@test.co.uk")
print(emails)  # ['Alice@Example.COM', 'bob@test.co.uk']
# Compile for performance (reuse same pattern)
# The optional country code and the whitespace after it are grouped together,
# so whitespace is only consumed when a "1"/"+1" prefix is present; a bare
# leading \s* would pull the space before a number into the match.
phone_re = re.compile(r'(?:\+?1\s*)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}')
numbers = phone_re.findall("Call +1 (555) 123-4567 or 800.555.0100")
print(numbers)  # ['+1 (555) 123-4567', '800.555.0100']
Muster aus der realen Welt
import re
# URL parsing: each optional component gets its own named group. Note that
# the port, query and fragment groups include their leading ':', '?' and '#'.
URL_RE = re.compile(
    r'https?://'
    r'(?:www\.)?'
    r'(?P<domain>[a-zA-Z0-9.-]+)'
    r'(?P<port>:\d+)?'
    r'(?P<path>/[^\s?#]*)?'
    r'(?P<query>\?[^\s#]*)?'
    r'(?P<fragment>#\S*)?'
)
m = URL_RE.match('https://api.example.com:8080/v1/users?page=1#top')
if m:
    # {'domain': 'api.example.com', 'port': ':8080', 'path': '/v1/users',
    #  'query': '?page=1', 'fragment': '#top'}
    print(m.groupdict())
# Parse log lines of the form "YYYY-MM-DD HH:MM:SS [LEVEL] message"
LOG_RE = re.compile(
    r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
    r' \[(?P<level>DEBUG|INFO|WARNING|ERROR)\]'
    r' (?P<message>.+)'
)
# NOTE: log_file and process_log are not defined in this snippet — supply
# your own iterable of lines and your own handler.
for line in log_file:
    # Lines that do not match the expected layout are skipped.
    if m := LOG_RE.match(line):  # walrus operator
        process_log(m.group('level'), m.group('message'))
# Extract code blocks from markdown
# Group 1 is the optional language tag after the opening fence; group 2 is
# the body. re.DOTALL lets '.' span newlines, and the non-greedy '.*?' stops
# at the first closing fence.
# NOTE: markdown_text is not defined in this snippet — supply your own string.
code_blocks = re.findall(r'```(\w+)?\n(.*?)```', markdown_text, re.DOTALL)
for lang, code in code_blocks:
    # findall() yields '' for the language when the optional group is absent.
    print(f"Language: {lang}, Code length: {len(code)}")
# Validate passwords (must have upper, lower, digit, special)
def is_strong_password(pwd: str) -> bool:
    """Return True if *pwd* meets the minimum strength rules.

    A password passes when it is at least 8 characters long and contains at
    least one uppercase ASCII letter, one lowercase ASCII letter, one digit
    and one character from ``!@#$%^&*``.
    """
    patterns = [
        r'[A-Z]',  # uppercase
        r'[a-z]',  # lowercase
        r'\d',  # digit
        r'[!@#$%^&*]',  # special
    ]
    # The length check comes first so short inputs skip the regex searches.
    return (len(pwd) >= 8 and all(re.search(p, pwd) for p in patterns))
# HTML tag removal: delete every "<...>" run and keep the text in between
clean_html = re.sub(
    pattern=r'<[^>]+>',
    repl='',
    string='<p>Hello <b>world</b></p>',
)
print(clean_html)  # "Hello world"
# CamelCase to snake_case
def camel_to_snake(name: str) -> str:
    """Convert a CamelCase identifier to snake_case.

    Runs of capitals are treated as acronyms, so ``'HTTPSConnection'``
    becomes ``'https_connection'`` rather than ``'h_t_t_p_s_connection'``.
    """
    # Pass 1: split an acronym from the capitalised word that follows it
    # ("HTTPSConnection" -> "HTTPS_Connection").
    s = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', name)
    # Pass 2: split a lowercase letter or digit from the capital after it
    # ("CamelCase" -> "Camel_Case"), then lower-case the whole result.
    return re.sub(r'([a-z\d])([A-Z])', r'\1_\2', s).lower()


print(camel_to_snake('CamelCaseString'))  # 'camel_case_string'
print(camel_to_snake('HTTPSConnection'))  # 'https_connection'
Python-Regex im Jahr 2026: unverzichtbar für jede Textverarbeitungsaufgabe. Beherrschen Sie die Kernfunktionen (search, findall, sub), verstehen Sie Gruppen und benannte Gruppen für die Extraktion, verwenden Sie Lookaheads für kontextbezogenen Abgleich und kompilieren Sie häufig verwendete Muster stets vorab. Ziehen Sie für komplexe Parsing-Aufgaben Bibliotheken wie Pyparsing oder Lark-Parser in Betracht – Regex wird für tief verschachtelte Strukturen unhandlich.
🔗 Share this article
✍️ Leave a Comment