https://pypi.org/project/PyMuPDF/
Here’s a single, pipeline‑ready script that adds everything you asked for:
It still preserves:
You can drop this into a .py file and adapt the config at the bottom.
You’ll want:
pip install pymupdf rapidfuzz tqdm
Full script
import fitz
import re
import logging
from pathlib import Path
from typing import Dict, Iterable, Pattern, List, Tuple, Optional
from concurrent.futures import ProcessPoolExecutor, as_completed
from rapidfuzz import fuzz
from tqdm import tqdm
# ---------- Logging setup ----------
# Configure the root logger when the module is loaded: records at INFO and
# above are emitted, each prefixed with a timestamp and its level name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# ---------- Core helpers ----------
def build_text_stream(words: List[dict]) -> Tuple[str, List[Tuple[Optional[int], Optional[int]]]]:
    """Join word texts into one searchable string plus a per-character map.

    Every word's text is followed by a single space, so patterns can match
    across word boundaries. That includes the last word: the returned text
    ends with a separator whenever ``words`` is non-empty.

    Args:
        words: Word entries; only the ``"text"`` key of each entry is read.

    Returns:
        ``(full_text, index_map)``. ``index_map`` has exactly one entry per
        character of ``full_text``: ``(word_index, char_in_word)`` for a
        character that belongs to a word, ``(None, None)`` for a separator.
    """
    stream: List[str] = []
    index_map: List[Tuple[Optional[int], Optional[int]]] = []
    for word_idx, word in enumerate(words):
        text = word["text"]
        stream.append(text)
        index_map.extend((word_idx, char_idx) for char_idx in range(len(text)))
        # Logical space separator; it maps to no word so callers can skip it.
        stream.append(" ")
        index_map.append((None, None))
    return "".join(stream), index_map
def find_spans_for_rect(page: fitz.Page, rect: fitz.Rect):
    """Collect the text spans of *page* whose bounding box intersects *rect*.

    The page text is re-extracted with ``page.get_text("rawdict")`` on every
    call, so callers issuing many lookups on one page pay for the extraction
    each time.

    Args:
        page: Page to inspect.
        rect: Area of interest, in the same coordinates as the span bboxes.

    Returns:
        A list of ``(span, span_rect)`` tuples in extraction order, where
        ``span`` is the span dictionary from the extraction and ``span_rect``
        is its ``"bbox"`` wrapped in a ``fitz.Rect``. Empty when nothing
        intersects.
    """
    raw = page.get_text("rawdict")
    spans_info = []
    for block in raw["blocks"]:
        # Only blocks of type 0 are walked; other block types are skipped
        # before their "lines" key is touched.
        if block["type"] != 0:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                span_rect = fitz.Rect(span["bbox"])
                if span_rect.intersects(rect):
                    spans_info.append((span, span_rect))
    return spans_info
# ---------- Regex + fuzzy replacement on a single page ----------
def _overlap_area(a: fitz.Rect, b: fitz.Rect) -> float:
    """Return the area shared by rectangles *a* and *b* (0.0 when disjoint)."""
    width = min(a.x1, b.x1) - max(a.x0, b.x0)
    height = min(a.y1, b.y1) - max(a.y0, b.y0)
    return max(width, 0.0) * max(height, 0.0)


def _style_for_anchor(page: fitz.Page, anchor: fitz.Rect) -> Optional[dict]:
    """Return font, size, colour and insertion point for text starting at *anchor*.

    Returns ``None`` when no span intersects *anchor*.
    """
    candidates = find_spans_for_rect(page, anchor)
    if not candidates:
        return None
    # Bounding boxes of neighbouring lines often overlap vertically, so the
    # first intersecting span may belong to another line. Take the span that
    # shares the most area with the anchor word instead.
    span, _ = max(candidates, key=lambda item: _overlap_area(item[1], anchor))
    srgb = span["color"]
    # The span origin carries the baseline; fall back to the bottom of the
    # word box if the extraction did not provide one.
    span_origin = span.get("origin")
    baseline_y = span_origin[1] if span_origin else anchor.y1
    return {
        "font": span["font"],
        "size": span["size"],
        # Extraction reports colour as one sRGB integer; insertion needs
        # three floats in the range 0..1.
        "color": (
            ((srgb >> 16) & 255) / 255,
            ((srgb >> 8) & 255) / 255,
            (srgb & 255) / 255,
        ),
        # Start at the matched word, not at the start of the enclosing span.
        "origin": fitz.Point(anchor.x0, baseline_y),
    }


def _insert_replacement(page: fitz.Page, style: dict, text: str) -> None:
    """Draw *text* at the styled position, falling back to Helvetica."""
    try:
        page.insert_text(
            style["origin"],
            text,
            fontname=style["font"],
            fontsize=style["size"],
            color=style["color"],
        )
    except Exception:
        # Font names taken from extracted spans are normally not insertable
        # without the font file, so a failure here is expected. Retry with a
        # built-in font; a failure of the retry propagates to the caller.
        logging.debug("Font %r not insertable, using 'helv'", style["font"])
        page.insert_text(
            style["origin"],
            text,
            fontname="helv",
            fontsize=style["size"],
            color=style["color"],
        )


def replace_on_page(
    page: fitz.Page,
    regex_map: Dict[Pattern, str],
    fuzzy_map: Dict[str, Tuple[str, int]],
):
    """Replace regex and fuzzy matches on *page* in place.

    Matched words are redacted, then the replacement is drawn on a single
    line starting at the first matched word, using the size and colour of
    the span under that word.

    Args:
        page: Page to modify.
        regex_map: ``{compiled_pattern: replacement}``. Patterns are searched
            in the page's words joined by single spaces; the replacement is
            inserted literally (no group expansion).
        fuzzy_map: ``{target: (replacement, threshold)}``. A target of N
            whitespace-separated tokens is compared with ``fuzz.ratio``
            against every run of N consecutive words on one text line; a
            score of at least ``threshold`` counts as a match.

    Returns:
        None. Nothing is changed when the page has no words or no matches.

    Notes:
        * A word is replaced at most once: a later match that overlaps an
          earlier one is skipped. Regex matches are collected before fuzzy
          matches.
        * Replacement text is always drawn horizontally; rotated source text
          is not re-rotated.
    """
    raw_words = page.get_text("words")
    if not raw_words:
        return

    words = [
        {
            "rect": fitz.Rect(w[0], w[1], w[2], w[3]),
            "text": w[4],
            "block": w[5],
            "line": w[6],
            "word_index": w[7],
        }
        for w in raw_words
    ]
    full_text, index_map = build_text_stream(words)

    # Each entry: (rectangles to redact, style dict, replacement text).
    pending: List[Tuple[List[fitz.Rect], dict, str]] = []
    claimed: set = set()  # indices of words already scheduled for replacement

    def _queue(word_indices: List[int], replacement: str) -> None:
        if not word_indices or claimed.intersection(word_indices):
            return
        # Style must be captured now: once redactions are applied the
        # original spans no longer exist.
        style = _style_for_anchor(page, words[word_indices[0]]["rect"])
        if style is None:
            return
        claimed.update(word_indices)
        # One rectangle per word rather than one merged box, so a match that
        # wraps onto another line does not wipe the text in between.
        pending.append(([words[i]["rect"] for i in word_indices], style, replacement))

    # ----- Regex replacements -----
    for pattern, replacement in regex_map.items():
        for match in pattern.finditer(full_text):
            start, end = match.span()
            touched = sorted(
                {w_idx for w_idx, _ in index_map[start:end] if w_idx is not None}
            )
            _queue(touched, replacement)

    # ----- Fuzzy replacements -----
    for target, (replacement, threshold) in fuzzy_map.items():
        window_len = max(len(target.split()), 1)
        for first in range(len(words) - window_len + 1):
            window = words[first:first + window_len]
            line_key = (window[0]["block"], window[0]["line"])
            # The replacement is drawn on one line, so only accept windows
            # whose words all sit on the same line.
            if any((w["block"], w["line"]) != line_key for w in window[1:]):
                continue
            candidate = " ".join(w["text"] for w in window)
            if fuzz.ratio(candidate, target) >= threshold:
                _queue(list(range(first, first + window_len)), replacement)

    if not pending:
        return

    # ----- Apply redactions -----
    for rects, _, _ in pending:
        for rect in rects:
            page.add_redact_annot(rect)
    page.apply_redactions()

    # ----- Insert replacements -----
    for _, style, replacement_text in pending:
        _insert_replacement(page, style, replacement_text)
# ---------- Per-PDF processing ----------
def process_single_pdf(
    input_path: Path,
    output_path: Path,
    regex_map: Dict[Pattern, str],
    fuzzy_map: Dict[str, Tuple[str, int]],
) -> Tuple[Path, bool, Optional[str]]:
    """Apply the replacements to every page of one PDF and save the result.

    Never raises for ordinary errors: any ``Exception`` is logged with its
    traceback and reported through the return value, so the function is safe
    to run in a worker process.

    Args:
        input_path: PDF to read.
        output_path: Destination file; missing parent directories are created
            just before saving.
        regex_map: ``{compiled_pattern: replacement}`` passed to
            ``replace_on_page``.
        fuzzy_map: ``{target: (replacement, threshold)}`` passed to
            ``replace_on_page``.

    Returns:
        ``(input_path, True, None)`` on success, or
        ``(input_path, False, error_message)`` on failure.
    """
    try:
        doc = fitz.open(input_path)
        try:
            for page in doc:
                replace_on_page(page, regex_map, fuzzy_map)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            doc.save(output_path, garbage=3, deflate=True)
        finally:
            # Release the document even when a page or the save fails.
            doc.close()
        return input_path, True, None
    except Exception as e:
        logging.exception("Error processing %s: %s", input_path, e)
        return input_path, False, str(e)
# ---------- Batch + parallel ----------
def batch_process_pdfs(
    input_dir: Path,
    output_dir: Path,
    regex_map: Dict[Pattern, str],
    fuzzy_map: Dict[str, Tuple[str, int]],
    patterns: Iterable[str] = ("*.pdf",),
    recursive: bool = True,
    max_workers: int = 4,
):
    """Process every matching PDF under *input_dir* in parallel.

    Each input file is written to the same relative path under *output_dir*.
    Per-file outcomes are logged; one failing file does not stop the batch.

    Args:
        input_dir: Directory searched for input files.
        output_dir: Root directory for the processed copies.
        regex_map: ``{compiled_pattern: replacement}`` forwarded to each worker.
        fuzzy_map: ``{target: (replacement, threshold)}`` forwarded to each
            worker.
        patterns: Glob patterns selecting the input files.
        recursive: Search sub-directories as well when true.
        max_workers: Number of worker processes.

    Returns:
        None. Returns early, after logging a warning, when no file matches.
    """
    finder = input_dir.rglob if recursive else input_dir.glob
    # A set so a file matched by several patterns is processed once; sorted
    # for a stable submission order. Directories whose name happens to match
    # a pattern are not PDFs and are left out.
    files = sorted(
        {p for pattern in patterns for p in finder(pattern) if p.is_file()}
    )
    if not files:
        logging.warning("No PDF files found.")
        return
    logging.info(f"Found {len(files)} PDF(s) to process.")

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its input so a crashed worker can still be
        # reported against the right file.
        tasks = {
            executor.submit(
                process_single_pdf,
                pdf_path,
                output_dir / pdf_path.relative_to(input_dir),
                regex_map,
                fuzzy_map,
            ): pdf_path
            for pdf_path in files
        }
        for future in tqdm(as_completed(tasks), total=len(tasks), desc="Processing PDFs"):
            try:
                input_path, ok, err = future.result()
            except Exception as exc:
                # The worker itself failed (e.g. it died or its arguments
                # could not be transferred); record it and keep going.
                input_path, ok, err = tasks[future], False, repr(exc)
            if ok:
                logging.info(f"Processed: {input_path}")
            else:
                logging.error(f"Failed: {input_path} | {err}")
# ---------- Example configuration ----------
if __name__ == "__main__":
    # Example run: read PDFs from ./input_pdfs and write the processed copies,
    # with the same relative paths, to ./output_pdfs.
    input_root = Path("input_pdfs")
    output_root = Path("output_pdfs")

    # Case-insensitive, Unicode-aware regex patterns
    regex_map: Dict[Pattern, str] = {
        re.compile(r"\binvoice\s+\d{4}\b", re.IGNORECASE | re.UNICODE): "Invoice [REDACTED]",
        re.compile(r"\bcompany\s+abc\b", re.IGNORECASE | re.UNICODE): "Company XYZ",
        re.compile(r"\btotal:\s+£\d+\.\d{2}\b", re.IGNORECASE | re.UNICODE): "Total: £0.00",
    }

    # Fuzzy targets: "target": ("replacement", threshold)
    # NOTE: a target containing a space can only reach its threshold when it
    # is compared against a multi-word window of page text; compared with a
    # single word it will always score too low.
    fuzzy_map: Dict[str, Tuple[str, int]] = {
        "AcmeCorp": ("Acme Corporation", 85),
        "Foldng Ltd": ("Folding Ltd", 80),
    }

    batch_process_pdfs(
        input_dir=input_root,
        output_dir=output_root,
        regex_map=regex_map,
        fuzzy_map=fuzzy_map,
        patterns=("*.pdf",),
        recursive=True,
        max_workers=8,
    )
If you tell me roughly how messy your PDFs are (scanned vs digital, lots of rotation, etc.), I can suggest small tweaks—like widening intersection tolerances or adding per‑page debug dumps—to make this rock‑solid for your dataset.