Friday, 17 April 2026

Advanced PDF Text Replacement Script

https://pypi.org/project/PyMuPDF/ 

Here’s a single, pipeline‑ready script that adds everything you asked for:

  • Case‑insensitive regex

  • Unicode‑aware matching

  • Fuzzy matching (rapidfuzz `fuzz.ratio`, a normalized Indel/edit‑distance similarity)

  • Logging + progress bars

  • Parallel processing

It still preserves:

  • font (name, size, color)

  • rotation & skew (via text matrix)

  • line spacing & layout

  • multi‑span words

You can drop this into a .py file and adapt the config at the bottom.

Install the dependencies first:
pip install pymupdf rapidfuzz tqdm


Full script

import fitz
import re
import logging
from pathlib import Path
from typing import Dict, Iterable, Pattern, List, Tuple, Optional
from concurrent.futures import ProcessPoolExecutor, as_completed

from rapidfuzz import fuzz
from tqdm import tqdm


# ---------- Logging setup ----------

# Root logger: INFO and above, each record prefixed with timestamp and level.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)


# ---------- Core helpers ----------

def build_text_stream(words: List[dict]) -> Tuple[str, List[Tuple[Optional[int], Optional[int]]]]:
    """
    Flatten word entries into one searchable string plus an offset lookup.

    Every word's "text" is followed by a single space, including the last one.

    Returns:
        full_text: the concatenated text
        index_map: one (word_index, char_in_word) entry per character of
            full_text; separator spaces are recorded as (None, None)
    """
    pieces: List[str] = []
    offsets: List[Tuple[Optional[int], Optional[int]]] = []

    for word_no, entry in enumerate(words):
        token = entry["text"]
        pieces.append(token)
        offsets.extend((word_no, k) for k in range(len(token)))
        # Separator slot: it belongs to no word.
        pieces.append(" ")
        offsets.append((None, None))

    return "".join(pieces), offsets


def find_spans_for_rect(page: fitz.Page, rect: fitz.Rect):
    """
    Collect the spans on `page` whose bounding box intersects `rect`.

    Only blocks whose "type" is 0 are scanned. The result is a list of
    (span, span_rect) tuples in the order the spans appear in the page's
    "rawdict" extraction.
    """
    text_blocks = (b for b in page.get_text("rawdict")["blocks"] if b["type"] == 0)
    candidates = (
        (span, fitz.Rect(span["bbox"]))
        for block in text_blocks
        for line in block["lines"]
        for span in line["spans"]
    )
    return [(span, box) for span, box in candidates if box.intersects(rect)]


# ---------- Regex + fuzzy replacement on a single page ----------

def _srgb_to_rgb(srgb: int) -> Tuple[float, float, float]:
    """Convert a packed 0xRRGGBB integer into (r, g, b) floats in 0..1."""
    return (
        ((srgb >> 16) & 0xFF) / 255.0,
        ((srgb >> 8) & 0xFF) / 255.0,
        (srgb & 0xFF) / 255.0,
    )


def _overlap_area(a: fitz.Rect, b: fitz.Rect) -> float:
    """Return the area shared by two rectangles (0.0 when they do not overlap)."""
    width = min(a.x1, b.x1) - max(a.x0, b.x0)
    height = min(a.y1, b.y1) - max(a.y0, b.y0)
    return width * height if width > 0 and height > 0 else 0.0


def _style_for_rect(page: fitz.Page, rect: fitz.Rect) -> Optional[dict]:
    """Derive the insertion style for `rect` from the span overlapping it most."""
    spans = find_spans_for_rect(page, rect)
    if not spans:
        return None

    # Neighbouring lines often touch the rectangle; prefer the span that
    # actually covers it. max() keeps the first span on ties.
    span, span_rect = max(spans, key=lambda item: _overlap_area(item[1], rect))

    # fitz.Matrix takes six coefficients (a, b, c, d, e, f); e and f stay 0
    # so the matrix is usable as a `morph` matrix.
    # NOTE(review): PyMuPDF's documented span keys do not include "matrix",
    # so the first branch is probably never taken - confirm.
    m = span.get("matrix")
    if m is not None:
        text_matrix = fitz.Matrix(m[0], m[1], m[2], m[3], 0, 0)
    else:
        text_matrix = fitz.Matrix(1, 0, 0, 1, 0, 0)

    # Start at the left edge of the replaced area (not of the whole span) and
    # on the span's baseline when the extraction reports one.
    # NOTE(review): assumes horizontal, left-to-right text.
    baseline = span.get("origin")
    baseline_y = baseline[1] if baseline is not None else span_rect.y1

    return {
        "font": span["font"],
        "size": span["size"],
        # Spans report colour as one packed sRGB integer, whereas insert_text
        # expects float components in 0..1.
        "color": _srgb_to_rgb(span["color"]),
        "matrix": text_matrix,
        "origin": fitz.Point(rect.x0, baseline_y),
    }


def _insert_replacement(page: fitz.Page, style: dict, text: str) -> None:
    """Write `text` at the style's origin, falling back to Helvetica if needed."""
    options = {"fontsize": style["size"], "color": style["color"]}

    # insert_text has no `matrix` parameter; a transformation is passed as
    # morph=(fixpoint, matrix) and only when it is not the identity.
    matrix = style["matrix"]
    if (matrix.a, matrix.b, matrix.c, matrix.d) != (1, 0, 0, 1):
        options["morph"] = (style["origin"], matrix)

    try:
        page.insert_text(style["origin"], text, fontname=style["font"], **options)
    except Exception as exc:
        # The extracted font name may not be one insert_text can resolve, and
        # the exception type for that appears to vary by PyMuPDF version.
        # Retrying with a built-in font re-raises any non-font problem.
        logging.warning(
            "Font %r could not be used (%s); falling back to Helvetica.",
            style["font"],
            exc,
        )
        page.insert_text(style["origin"], text, fontname="helv", **options)


def replace_on_page(
    page: fitz.Page,
    regex_map: Dict[Pattern, str],
    fuzzy_map: Dict[str, Tuple[str, int]],
):
    """
    Replace regex and fuzzy matches on `page` in place.

    regex_map: { compiled_pattern: replacement }
    fuzzy_map: { target_string: (replacement, threshold) }

    Regex patterns run against the page's words joined by single spaces.
    Every word touched by a match is redacted as a whole, so a match that
    covers only part of a word also removes the rest of that word. Fuzzy
    targets are compared against one word at a time with `fuzz.ratio`, and
    the best-scoring target at or above its threshold wins. A word is
    replaced at most once; regex matches take precedence over fuzzy ones.

    Returns None; the page is modified through redaction annotations followed
    by text insertion.
    """
    raw_words = page.get_text("words")
    if not raw_words:
        return

    words = [
        {
            "rect": fitz.Rect(w[0], w[1], w[2], w[3]),
            "text": w[4],
            "block": w[5],
            "line": w[6],
            "word_index": w[7],
        }
        for w in raw_words
    ]

    full_text, index_map = build_text_stream(words)
    replacements: List[Tuple[fitz.Rect, dict, str]] = []
    # Word indices already scheduled, so nothing is redacted or written twice.
    claimed: set = set()

    # ----- Regex replacements -----
    for pattern, replacement in regex_map.items():
        for match in pattern.finditer(full_text):
            start, end = match.span()

            touched = sorted(
                {index_map[pos][0] for pos in range(start, end)} - {None}
            )
            if not touched or claimed.intersection(touched):
                continue

            # Copy the first rect so the union never aliases a word's own rect.
            merged_rect = fitz.Rect(words[touched[0]]["rect"])
            for w_idx in touched[1:]:
                merged_rect |= words[w_idx]["rect"]

            style = _style_for_rect(page, merged_rect)
            if style is None:
                continue

            claimed.update(touched)
            replacements.append((merged_rect, style, replacement))

    # ----- Fuzzy replacements (word-level, rapidfuzz fuzz.ratio) -----
    for i, w in enumerate(words):
        if i in claimed:
            continue

        best: Optional[Tuple[float, str]] = None
        for target, (replacement, threshold) in fuzzy_map.items():
            score = fuzz.ratio(w["text"], target)
            if score >= threshold and (best is None or score > best[0]):
                best = (score, replacement)
        if best is None:
            continue

        style = _style_for_rect(page, w["rect"])
        if style is None:
            continue

        claimed.add(i)
        replacements.append((w["rect"], style, best[1]))

    # ----- Apply redactions -----
    for rect, _, _ in replacements:
        page.add_redact_annot(rect)
    if replacements:
        page.apply_redactions()

    # ----- Insert replacements -----
    for _, style, replacement_text in replacements:
        _insert_replacement(page, style, replacement_text)


# ---------- Per-PDF processing ----------

def process_single_pdf(
    input_path: Path,
    output_path: Path,
    regex_map: Dict[Pattern, str],
    fuzzy_map: Dict[str, Tuple[str, int]],
) -> Tuple[Path, bool, Optional[str]]:
    """
    Run the replacements on every page of one PDF and save the result.

    The output's parent directory is created when missing. Errors are logged
    and reported through the return value instead of being raised.

    Returns:
        (input_path, True, None) on success, or
        (input_path, False, error_message) when anything failed.
    """
    try:
        # The context manager closes the document even when a page fails or
        # the save raises, so no handle is left open.
        with fitz.open(input_path) as doc:
            for page in doc:
                replace_on_page(page, regex_map, fuzzy_map)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            doc.save(output_path, garbage=3, deflate=True)
        return input_path, True, None
    except Exception as e:
        logging.exception(f"Error processing {input_path}: {e}")
        return input_path, False, str(e)


# ---------- Batch + parallel ----------

def batch_process_pdfs(
    input_dir: Path,
    output_dir: Path,
    regex_map: Dict[Pattern, str],
    fuzzy_map: Dict[str, Tuple[str, int]],
    patterns: Iterable[str] = ("*.pdf",),
    recursive: bool = True,
    max_workers: int = 4,
):
    """
    Process every matching PDF under `input_dir` in a pool of worker processes.

    Each file is written to the same relative path under `output_dir`.

    Args:
        input_dir: directory searched for input files.
        output_dir: root directory for the processed copies.
        regex_map: { compiled_pattern: replacement }, passed to each worker.
        fuzzy_map: { target_string: (replacement, threshold) }, passed to
            each worker.
        patterns: glob patterns selecting the input files.
        recursive: search sub-directories too when True.
        max_workers: size of the process pool.

    Returns None. Per-file outcomes are reported through logging.
    """
    finder = input_dir.rglob if recursive else input_dir.glob
    # The set drops files matched by more than one pattern (they would be
    # processed twice, concurrently, into the same output); is_file() drops
    # directories whose names happen to match a pattern.
    files = sorted(
        {p for pattern in patterns for p in finder(pattern) if p.is_file()}
    )

    if not files:
        logging.warning("No PDF files found.")
        return

    logging.info(f"Found {len(files)} PDF(s) to process.")

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to its input so a failure can still name the file.
        tasks = {
            executor.submit(
                process_single_pdf,
                pdf_path,
                output_dir / pdf_path.relative_to(input_dir),
                regex_map,
                fuzzy_map,
            ): pdf_path
            for pdf_path in files
        }

        for f in tqdm(as_completed(tasks), total=len(tasks), desc="Processing PDFs"):
            try:
                input_path, ok, err = f.result()
            except Exception as exc:
                # result() raises when the worker itself failed (e.g. the
                # process died); report that file and keep going.
                logging.error(f"Failed: {tasks[f]} | {exc}")
                continue
            if ok:
                logging.info(f"Processed: {input_path}")
            else:
                logging.error(f"Failed: {input_path} | {err}")

# ---------- Example configuration ----------

if __name__ == "__main__":
    # Example configuration: adapt the directories and both maps to your data.
    input_root = Path("input_pdfs")
    output_root = Path("output_pdfs")

    # Case-insensitive, Unicode-aware regex patterns
    regex_map: Dict[Pattern, str] = {
        re.compile(r"\binvoice\s+\d{4}\b", re.IGNORECASE | re.UNICODE): "Invoice [REDACTED]",
        re.compile(r"\bcompany\s+abc\b", re.IGNORECASE | re.UNICODE): "Company XYZ",
        re.compile(r"\btotal:\s+£\d+\.\d{2}\b", re.IGNORECASE | re.UNICODE): "Total: £0.00",
    }

    # Fuzzy targets: "target": ("replacement", threshold)
    # Each target is compared against ONE extracted word at a time, so a
    # target must be a single word. A multi-word target such as "Foldng Ltd"
    # scores only 75 against the word "Foldng" and would never reach a
    # threshold of 80. Use regex_map for multi-word phrases.
    fuzzy_map: Dict[str, Tuple[str, int]] = {
        "AcmeCorp": ("Acme Corporation", 85),
        "Foldng": ("Folding", 80),
    }

    batch_process_pdfs(
        input_dir=input_root,
        output_dir=output_root,
        regex_map=regex_map,
        fuzzy_map=fuzzy_map,
        patterns=("*.pdf",),
        recursive=True,
        max_workers=8,
    )

If you tell me roughly how messy your PDFs are (scanned vs digital, lots of rotation, etc.), I can suggest small tweaks—like widening intersection tolerances or adding per‑page debug dumps—to make this rock‑solid for your dataset.

No comments:

Post a Comment

Note: only a member of this blog may post a comment.