OCRmyPDF Tutorial: Convert Scanned Documents to Searchable PDF/A Files with Sidecar Text Extraction and Batch Processing

admin 36 minutes ago

0 0 2 minutes read

OCRmyPDF Tutorial: Convert Scanned Documents to Searchable PDF/A Files with Sidecar Text Extraction and Batch Processing

def _purge(*prefixes):
    """Remove cached modules matching *prefixes* from ``sys.modules``.

    A module matches when its name is exactly one of the prefixes or is a
    dotted submodule of one (``"PIL"`` matches ``"PIL"`` and ``"PIL.Image"``
    but not ``"PILLOW"``). Calling it with no prefixes removes nothing.
    """
    def _matches(module_name):
        for prefix in prefixes:
            if module_name == prefix or module_name.startswith(prefix + "."):
                return True
        return False

    # Collect the names first: sys.modules must not shrink while it is iterated.
    doomed = [module_name for module_name in tuple(sys.modules) if _matches(module_name)]
    for module_name in doomed:
        del sys.modules[module_name]
def _load_ocrmypdf():
    """Import ``ocrmypdf`` from scratch and return the module object.

    Any cached ``PIL`` and ``ocrmypdf`` modules are evicted from
    ``sys.modules`` first, so the import statement below re-executes them
    instead of reusing previously loaded copies.
    """
    _purge("PIL", "ocrmypdf")
    import ocrmypdf as fresh_module
    return fresh_module
# Import OCRmyPDF. When the import fails in a way that points at Pillow,
# force-reinstall "pillow<12" and retry once in the same interpreter.
try:
    ocrmypdf = _load_ocrmypdf()
except ImportError as e:
    # Only failures whose message mentions "_Ink" or "PIL" are treated as
    # repairable; any other ImportError propagates unchanged.
    if "_Ink" in str(e) or "PIL" in str(e):
        print("Repairing an incompatible Pillow (reinstalling pillow<12)...")
        # NOTE(review): `sh` is defined outside this section; presumably it
        # runs the given command line in a shell — confirm.
        sh(f'"{sys.executable}" -m pip install -q --force-reinstall "pillow<12"')
        try:
            ocrmypdf = _load_ocrmypdf()
        except Exception as retry_error:
            # Chain explicitly so the traceback reports the failed retry as
            # the direct cause of this error.
            raise RuntimeError(
                "Pillow is still incompatible in this session. Use the Colab menu: "
                "Runtime > Restart session, then run this cell again."
            ) from retry_error
        else:
            print("Pillow repaired — continuing without a restart.")
    else:
        raise
from ocrmypdf.exceptions import (
   ExitCode,
   PriorOcrFoundError,
   EncryptedPdfError,
   MissingDependencyError,
   TaggedPDFError,
   DigitalSignatureError,
   DpiError,
   InputFileError,
   UnsupportedImageFormatError,
)
from ocrmypdf.helpers import check_pdf
from ocrmypdf.pdfa import file_claims_pdfa
import img2pdf
from PIL import Image, ImageDraw, ImageFont, ImageFilter
# Root logger: WARNING and above, rendered as "LEVEL: message".
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING)
# Per-library thresholds: pdfminer only reports errors, while PIL and
# ocrmypdf report warnings and above.
logging.getLogger("PIL").setLevel(logging.WARNING)
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("ocrmypdf").setLevel(logging.WARNING)
# Sample paragraphs, one list entry per page. Each entry is a single string
# built from adjacent literals (implicit concatenation), so the trailing
# spaces inside the quotes are significant.
# NOTE(review): presumably this is the text rendered into the synthetic scan
# via build_scanned_pdf(); confirm against the caller.
SAMPLE_TEXT_PAGES = [
   "Optical Character Recognition, commonly abbreviated as OCR, is the "
   "process of converting images of typed or printed text into machine "
   "encoded text. This page was generated as a synthetic scan so that the "
   "OCRmyPDF pipeline has something realistic to recognize and search.",
   "On 14 March 2026 the archive contained 1,482 pages across 37 folders. "
   "Roughly 92 percent of those pages were scanned at 200 to 300 dots per "
   "inch. The remaining 8 percent were skewed and required deskewing before "
   "any reliable recognition was possible.",
   "After OCRmyPDF finishes, the output is a searchable PDF/A file. You can "
   "select text, copy it, and run full text search across thousands of "
   "documents. The original image resolution is preserved while a hidden "
   "text layer is placed accurately underneath the page image.",
]
def _find_font():
    """Return the path of the first known TrueType font found on disk.

    The DejaVu Sans location is tried before Liberation Sans. Returns
    ``None`` when neither file exists.
    """
    known_fonts = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
    )
    return next((path for path in known_fonts if os.path.exists(path)), None)
# Resolve the drawing font once at import time: a 40-point TrueType face when
# a font file was located, otherwise Pillow's built-in default font.
_FONT_PATH = _find_font()
if _FONT_PATH:
    FONT = ImageFont.truetype(_FONT_PATH, 40)
else:
    FONT = ImageFont.load_default()
def _add_speckle(img, n=6000, dark=60):
    """Scatter dark specks over *img* to imitate scanner noise (motivates --clean).

    Performs *n* random pixel writes, each setting one pixel to a value in
    ``0..dark``. The image is modified in place through ``img.load()`` and
    the same object is returned. Positions may repeat, so fewer than *n*
    distinct pixels can end up changed.
    """
    import random

    pixels = img.load()
    width, height = img.size
    last_col, last_row = width - 1, height - 1
    for _ in range(n):
        # Draw order is shade, then column, then row. That is the order in
        # which a one-line `pixels[x, y] = shade` would evaluate its random
        # calls, so seeded runs give the same result.
        shade = random.randint(0, dark)
        col = random.randint(0, last_col)
        row = random.randint(0, last_row)
        pixels[col, row] = shade
    return img
def render_page(text, skew=False):
    """Draw *text* as dark glyphs on a white A4 canvas (1654x2339 px ≈ 200 DPI).

    The text is wrapped at 58 characters and drawn with the module-level
    ``FONT``. With ``skew=True`` the page is additionally rotated by 6
    degrees, lightly blurred and speckled to look like an imperfect scan.
    Returns the resulting mode-"L" image.
    """
    canvas = Image.new("L", (1654, 2339), 255)
    wrapped = textwrap.fill(text, width=58)
    ImageDraw.Draw(canvas).multiline_text(
        (150, 180), wrapped, fill=25, font=FONT, spacing=18
    )
    if not skew:
        return canvas

    # Degrade the clean render: tilt, soften, then add noise.
    tilted = canvas.rotate(6, resample=Image.BICUBIC, expand=False, fillcolor=255)
    softened = tilted.filter(ImageFilter.GaussianBlur(0.6))
    return _add_speckle(softened)
def build_scanned_pdf(pdf_path: Path, pages_text, skew_index=1):
    """Render pages to PNGs and wrap them losslessly into an image-only PDF.

    Args:
        pdf_path: Destination PDF. Temporary PNGs are written next to it as
            ``_pg_<stem>_<i>.png``.
        pages_text: Iterable of strings, one per page.
        skew_index: Zero-based index of the page rendered with ``skew=True``.

    Returns:
        ``pdf_path``, once the PDF has been written.

    The temporary PNGs are removed even when rendering, saving or conversion
    raises. The PDF is opened for writing only after conversion succeeds, so
    a failure does not leave an empty output file behind.
    """
    pngs = []
    try:
        for i, text in enumerate(pages_text):
            img = render_page(text, skew=(i == skew_index))
            p = pdf_path.parent / f"_pg_{pdf_path.stem}_{i}.png"
            # Register the path before saving so a partially written PNG is
            # cleaned up as well.
            pngs.append(str(p))
            img.save(p, format="PNG", dpi=(200, 200))
        pdf_bytes = img2pdf.convert(pngs)
    finally:
        for p in pngs:
            try:
                os.remove(p)
            except FileNotFoundError:
                # The save never created this file; nothing to clean up.
                pass
    with open(pdf_path, "wb") as f:
        f.write(pdf_bytes)
    return pdf_path
def do_ocr(input_file, output_file, **kw):
    """Call ``ocrmypdf.ocr`` with the progress bar off by default and time it.

    Extra keyword arguments are forwarded unchanged; an explicit
    ``progress_bar`` value from the caller wins over the default.

    Returns a ``(result, seconds)`` tuple: whatever ``ocrmypdf.ocr``
    returned, and the elapsed time measured with ``time.perf_counter``.
    """
    if "progress_bar" not in kw:
        kw["progress_bar"] = False
    started = time.perf_counter()
    result = ocrmypdf.ocr(input_file, output_file, **kw)
    elapsed = time.perf_counter() - started
    return result, elapsed
def tokens(s: str):
    """Split *s* into lowercase runs of ASCII letters and digits.

    Every other character acts as a separator. Returns a list of strings,
    empty when *s* contains no such run.
    """
    lowered = s.lower()
    return [hit.group(0) for hit in re.finditer(r"[a-z0-9]+", lowered)]
def kb(path) -> str:
    """Return the size of the file at *path* as text in kilobytes.

    The byte count is divided by 1024 and shown with a thousands separator
    and one decimal place, e.g. ``"1,234.5 KB"``.
    """
    size_in_bytes = Path(path).stat().st_size
    kilobytes = size_in_bytes / 1024
    return f"{kilobytes:,.1f} KB"
def banner(title: str):
    """Print *title* as a section heading framed by two horizontal rules.

    Output is a blank line, a 74-character rule, the title indented by two
    spaces, and a second rule, each on its own line.
    """
    line = "─" * 74
    # The separators must be real newline escapes; a bare "n" would be
    # printed literally and run everything together on one line.
    print(f"\n{line}\n  {title}\n{line}")

admin 36 minutes ago

0 0 2 minutes read