怎么在Python的正则表达式中找到每个匹配的确切位置

可以使用 re 模块中的 finditer() 函数。它会返回一个迭代器，每个元素都是 Match 对象，其中包含了匹配的起始和结束位置。

基本方法

import re

text = "Hello 123, Python 456!"
pattern = r'\d+'  # one or more consecutive digits

# Walk over every match and report where it sits inside `text`.
for match in re.finditer(pattern, text):
    # span() returns (start, end); `end` is exclusive, so
    # text[start_pos:end_pos] is exactly the matched text.
    start_pos, end_pos = match.span()
    matched_text = match.group()
    print(f"在位置 {start_pos}-{end_pos} 找到: '{matched_text}'")

完整示例

import re

text = "The quick brown fox jumps over 123 lazy dogs. Python 3.10 is awesome!"
pattern = r'\b\w+\b'  # runs of word characters delimited by word boundaries
separator = "="*50    # horizontal rule printed between the sections

print("文本:", text)
print(separator)

# Approach 1: number every word match and print its half-open [start, end) range.
print("方法1 - 使用finditer():")
word_matches = re.finditer(pattern, text)
for i, match in enumerate(word_matches, start=1):
    print(f"匹配{i}: '{match.group()}' 位置: [{match.start()}, {match.end()})")

print("\n" + separator)

# Approach 2: for each digit run, show the span tuple and confirm that slicing
# the text with it gives back the matched characters.
print("方法2 - 获取更多信息:")
digit_re = re.compile(r'\d+')
for match in digit_re.finditer(text):
    span = match.span()  # (start, end) tuple, end exclusive
    start, end = span
    print(f"数字 '{match.group()}' 在位置 {span}，即字符 '{text[start:end]}'")

print("\n" + separator)

# Approach 3: overlapping matches. The pattern is a zero-width lookahead, so
# each match consumes no characters and the next search can begin one
# position later; the two-character text is captured in group 1.
text2 = "aaa"
print("方法3 - 查找重叠匹配:")
overlap_re = re.compile(r'(?=(a{2}))')
for match in overlap_re.finditer(text2):
    print(f"找到 'aa' 在位置 {match.start()}")

处理多行文本

import re

multiline_text = """第一行: Python is great
第二行: 数字 123 和 456
第三行: 结束"""

pattern = r'\d+'

# Report each match by line number and by column within that line, in
# addition to its absolute offset in the whole string.
print("多行文本匹配:")
for match in re.finditer(pattern, multiline_text):
    # Index of the first character of the line holding the match. rfind()
    # returns -1 when no newline precedes the match, which the +1 turns
    # into 0 (start of the text).
    line_start = multiline_text.rfind('\n', 0, match.start()) + 1
    # 1-based line number: newlines before the match, plus one.
    line_number = multiline_text.count('\n', 0, match.start()) + 1
    # 0-based offset of the match inside its own line.
    column = match.start() - line_start
    print(f"第{line_number}行，列 {column}（全文位置 {match.start()}）: '{match.group()}'")

封装为函数

import re
from typing import List, Tuple

def find_all_positions(pattern: str, text: str, flags=0) -> List[Tuple[str, int, int]]:
    """Collect every match of ``pattern`` in ``text`` together with its position.

    Args:
        pattern: Regular expression to search for.
        text: String to scan.
        flags: ``re`` module flags forwarded to ``re.finditer``.

    Returns:
        A list of ``(matched_text, start, end)`` tuples in the order
        ``re.finditer`` yields them; ``end`` is exclusive. The list is empty
        when nothing matches.
    """
    # span() is (start, end), so unpacking it after the matched text yields
    # the same 3-tuple as (group(), start(), end()).
    return [(m.group(), *m.span()) for m in re.finditer(pattern, text, flags)]

def find_all_with_context(pattern: str, text: str, context_chars: int = 10,
                          flags: int = 0) -> List[dict]:
    """Find every match of ``pattern`` and attach the text surrounding it.

    Args:
        pattern: Regular expression to search for.
        text: String to scan.
        context_chars: Number of characters to include on each side of the
            match. Expected to be non-negative; the value is not validated.
        flags: ``re`` module flags forwarded to ``re.finditer`` (for example
            ``re.IGNORECASE``). Defaults to 0, i.e. no flags.

    Returns:
        One dict per match, in the order ``re.finditer`` yields them, with keys:
        ``'match'`` (matched text), ``'start'`` and ``'end'`` (half-open
        position in ``text``), ``'context'`` (the match plus up to
        ``context_chars`` characters on each side) and ``'full_match'``
        (the ``re.Match`` object itself).
    """
    results = []
    for match in re.finditer(pattern, text, flags):
        start, end = match.span()
        # Clamp the context window to the bounds of the text so matches near
        # either edge simply get a shorter context.
        context_start = max(0, start - context_chars)
        context_end = min(len(text), end + context_chars)

        results.append({
            'match': match.group(),
            'start': start,
            'end': end,
            'context': text[context_start:context_end],
            'full_match': match
        })
    return results

# Usage example for find_all_positions() and find_all_with_context().
text = "Python 3.10 发布于 2021年10月4日，Python 3.11 发布于 2022年10月24日"
pattern = r'Python \d+\.\d+'
separator = "="*50

print("函数使用示例:")
print(separator)

# Simple helper: each result is a (text, start, end) tuple.
matches = find_all_positions(pattern, text)
for match_text, start, end in matches:
    print(f"找到 '{match_text}' 在位置 [{start}, {end})")

print("\n" + separator)

# Context-aware helper: look for dates and show 15 characters on each side.
date_pattern = r'\d{4}年\d{1,2}月\d{1,2}日'
detailed_matches = find_all_with_context(date_pattern, text, context_chars=15)
for match_info in detailed_matches:
    print(f"日期: {match_info['match']}")
    print(f"位置: [{match_info['start']}, {match_info['end']})")
    print(f"上下文: ...{match_info['context']}...")
    print("-"*30)

实用技巧

import re

# 1. Pre-compile the pattern and call finditer() on the compiled object.
text = "测试文本中有很多123数字和456数字"
pattern = re.compile(r'\d+')

for match in pattern.finditer(text):
    print(f"找到数字: {match.group()} 在位置 {match.start()}")

# 2. Special characters: literal brackets, parentheses and braces are escaped
#    with a backslash; each alternative captures its inner text in its own group.
text = "查找 [括号] 内的内容 (还有圆括号) {以及花括号}"
pattern = r'\[(.*?)\]|\((.*?)\)|\{(.*?)\}'

for match in re.finditer(pattern, text):
    # Exactly one alternative (and therefore one group) takes part in each
    # match; lastindex is the index of that group, the others are None.
    actual_match = match.group(match.lastindex)
    print(f"找到: '{actual_match}' 在位置 {match.start()}")

# 3. 多模式匹配
def find_multiple_patterns(patterns, text):
    """Search ``text`` with several patterns and merge the hits by position.

    Args:
        patterns: Iterable of regular expressions passed to ``re.finditer``.
        text: String to scan.

    Returns:
        A list of dicts with keys ``'pattern'``, ``'match'``, ``'start'`` and
        ``'end'``, ordered by ascending ``'start'``. Hits that start at the
        same index stay in the order they were found (pattern by pattern).
    """
    hits = [
        {
            'pattern': pattern,
            'match': match.group(),
            'start': match.start(),
            'end': match.end()
        }
        for pattern in patterns
        for match in re.finditer(pattern, text)
    ]
    # list.sort() is stable, so equal start positions keep discovery order.
    hits.sort(key=lambda hit: hit['start'])
    return hits

# Usage example: pull a price, a percentage and an ISO-style date out of one string.
text = "价格: $99.99 折扣: 20% 日期: 2023-12-31"
patterns = [
    r'\$\d+\.?\d*',        # dollar amount with optional decimal part
    r'\d+%',               # percentage
    r'\d{4}-\d{2}-\d{2}',  # date written as YYYY-MM-DD
]
matches = find_multiple_patterns(patterns, text)

# Results arrive sorted by start position, regardless of pattern order.
for m in matches:
    print(f"模式 '{m['pattern']}' 找到: '{m['match']}' 在位置 {m['start']}")

注意事项

- 位置索引：Python 使用从 0 开始的索引，match.end() 返回的是匹配内容最后一个字符之后的索引（即结束位置不包含在匹配内）。
- 多行模式：使用 re.MULTILINE 标志时，^ 和 $ 会匹配每行的开头和结尾。
- Unicode字符：对于包含多字节字符的文本，位置对应的是字符数，不是字节数。
- 性能：对于大量文本或频繁匹配，使用 re.compile() 预编译正则表达式。

选择哪种方法取决于具体需求：finditer() 适合需要位置信息的情况，而 findall() 只返回匹配文本，不包含位置信息。