Source code for pdftools_sdk.ocr.processor

from __future__ import annotations
import io
from typing import List, Iterator, Tuple, Optional, Any, TYPE_CHECKING, Callable
from ctypes import *
from datetime import datetime
from numbers import Number
from pdftools_sdk.internal import _lib
from pdftools_sdk.internal.utils import _string_to_utf16, _utf16_to_string
from pdftools_sdk.internal.streams import _StreamDescriptor, _NativeStream
from pdftools_sdk.internal.native_base import _NativeBase
from pdftools_sdk.internal.native_object import _NativeObject

import pdftools_sdk.internal

# The SDK types referenced in annotations are imported only for static type
# checkers (presumably to avoid import cycles at runtime -- confirm).
if TYPE_CHECKING:
    from pdftools_sdk.pdf.document import Document
    from pdftools_sdk.ocr.engine import Engine
    from pdftools_sdk.ocr.ocr_options import OcrOptions
    from pdftools_sdk.pdf.output_options import OutputOptions
    from pdftools_sdk.ocr.warning_category import WarningCategory

else:
    # At runtime the same names are bound to plain strings holding the
    # fully-qualified type names, so they only act as placeholders.
    Document = "pdftools_sdk.pdf.document.Document"
    Engine = "pdftools_sdk.ocr.engine.Engine"
    OcrOptions = "pdftools_sdk.ocr.ocr_options.OcrOptions"
    OutputOptions = "pdftools_sdk.pdf.output_options.OutputOptions"
    WarningCategory = "pdftools_sdk.ocr.warning_category.WarningCategory"


# Rebinds WarningCategory at runtime to its short name; this overrides the
# dotted string assigned in the `else` branch above and is the value that
# ends up inside the Callable alias below.
if not TYPE_CHECKING:
    WarningCategory = "WarningCategory"

# Signature of a warning event handler:
# (message, category, page number, context) -> None
WarningFunc = Callable[[str, WarningCategory, int, str], None]
"""
Event for warnings occurring during OCR processing

Non-critical issues during processing are reported via this event.
It is recommended to review the :class:`pdftools_sdk.ocr.warning_category.WarningCategory`  and handle
warnings if necessary for the application.



Args:
    message (str): 
        The message describing the warning

    category (pdftools_sdk.ocr.warning_category.WarningCategory): 
        The category of the warning

    pageNo (int): 
        The page number this warning is associated with, or `0` if not page-specific

    context (str): 
        A description of the context where the warning occurred


"""

class Processor(_NativeObject):
    """
    Process PDF documents with OCR

    The processor applies Optical Character Recognition (OCR) to PDF documents.
    It can make scanned documents searchable, fix text extraction issues and
    generate PDF tagging/structure.

    The processor is decoupled from the document - it takes a
    :class:`pdftools_sdk.pdf.document.Document` as input and produces a new
    :class:`pdftools_sdk.pdf.document.Document` as output.

    """

    # Event definition: C signature of the native warning callback
    # (event context, message, category, page number, context string).
    _WarningFunc = CFUNCTYPE(None, c_void_p, c_wchar_p, c_int, c_int, c_wchar_p)

    def _wrap_warning_func(self, py_callback: WarningFunc) -> Processor._WarningFunc:
        """
        Wrap a Python warning handler in a C-callable function pointer.

        Args:
            py_callback: The Python handler to invoke for each warning.

        Returns:
            A ``_WarningFunc`` instance. The caller must keep a reference to it
            for as long as native code may call it, otherwise the function
            pointer becomes invalid once the object is garbage-collected.
        """
        def _c_callback(event_context, message, category, page_no, context):
            # Imported lazily; at module level the name is only a placeholder string.
            from pdftools_sdk.ocr.warning_category import WarningCategory

            # Call the Python callback
            py_callback(_utf16_to_string(message), WarningCategory(category), page_no, _utf16_to_string(context))

        # Wrap the callback in CFUNCTYPE so it becomes a valid C function pointer
        return Processor._WarningFunc(_c_callback)

    def __init__(self):
        """
        Create a new native OCR processor with an empty warning handler registry.

        If the native constructor returns a null handle, the last native error
        is raised via ``_NativeBase._throw_last_error``.
        """
        _lib.PdfToolsOcr_Processor_New.argtypes = []
        _lib.PdfToolsOcr_Processor_New.restype = c_void_p
        ret_val = _lib.PdfToolsOcr_Processor_New()
        if ret_val is None:
            _NativeBase._throw_last_error(False)
        super()._initialize(ret_val)
        # Maps each Python handler to {'callback': C thunk, 'count': registrations}.
        self._warning_callback_map = {}

    def process(self, document: Document, engine: Optional[Engine], out_stream: io.IOBase, options: Optional[OcrOptions] = None, out_options: Optional[OutputOptions] = None) -> Document:
        """
        Apply OCR to a PDF document

        Process the input PDF document with OCR according to the specified options.
        The processed document is written to the output stream.

        Non-critical processing issues raise a :func:`pdftools_sdk.ocr.processor.WarningFunc` .
        It is recommended to review the :class:`pdftools_sdk.ocr.warning_category.WarningCategory`
        and handle them if necessary for the application.

        Args:
            document (pdftools_sdk.pdf.document.Document):
                The input PDF document to process

            engine (Optional[pdftools_sdk.ocr.engine.Engine]):
                The OCR engine to use for recognition.
                This parameter may be `None` for operations that do not require OCR,
                such as :attr:`pdftools_sdk.ocr.image_processing_mode.ImageProcessingMode.REMOVETEXT` .
                For all other modes, a valid engine must be provided.

            out_stream (io.IOBase):
                The stream to which the output PDF is written.
                The stream must support both random read and write access.

            options (Optional[pdftools_sdk.ocr.ocr_options.OcrOptions]):
                The OCR processing options.
                If `None`, default options are used.

            out_options (Optional[pdftools_sdk.pdf.output_options.OutputOptions]):
                The PDF output options, e.g. to encrypt the output document.

        Returns:
            pdftools_sdk.pdf.document.Document:
                The resulting output PDF which can be used as a new input for further processing.

                Note that this object must be disposed before the output stream object
                (method argument `out_stream`).

        Raises:
            TypeError:
                An argument is not of the expected type.

            pdftools_sdk.license_error.LicenseError:
                The license check has failed.

            OSError:
                Writing to the `out_stream` failed.

            pdftools_sdk.processing_error.ProcessingError:
                The document could not be processed.

            ValueError:
                An OCR engine is required for the specified options but `engine` is `None`.

            ValueError:
                The `options` specifies invalid or contradictory settings.

            ValueError:
                The `out_options` specifies document encryption for a PDF/A file, which is not allowed.

            pdftools_sdk.generic_error.GenericError:
                An unexpected failure occurred.

            pdftools_sdk.corrupt_error.CorruptError:
                An input image in the document is corrupt and cannot be read.

            pdftools_sdk.password_error.PasswordError:
                The document is encrypted and the password is invalid.

            pdftools_sdk.conformance_error.ConformanceError:
                The document has an invalid conformance level.

            pdftools_sdk.unsupported_feature_error.UnsupportedFeatureError:
                The input PDF contains unrendered XFA form fields.
                See :attr:`pdftools_sdk.pdf.document.Document.xfa` for more information.

        """
        # Real classes are imported here; the module-level names are only
        # placeholder strings at runtime.
        from pdftools_sdk.pdf.document import Document
        from pdftools_sdk.ocr.engine import Engine
        from pdftools_sdk.ocr.ocr_options import OcrOptions
        from pdftools_sdk.pdf.output_options import OutputOptions

        if not isinstance(document, Document):
            raise TypeError(f"Expected type {Document.__name__}, but got {type(document).__name__}.")
        if engine is not None and not isinstance(engine, Engine):
            raise TypeError(f"Expected type {Engine.__name__} or None, but got {type(engine).__name__}.")
        if not isinstance(out_stream, io.IOBase):
            raise TypeError(f"Expected type {io.IOBase.__name__}, but got {type(out_stream).__name__}.")
        if options is not None and not isinstance(options, OcrOptions):
            raise TypeError(f"Expected type {OcrOptions.__name__} or None, but got {type(options).__name__}.")
        if out_options is not None and not isinstance(out_options, OutputOptions):
            raise TypeError(f"Expected type {OutputOptions.__name__} or None, but got {type(out_options).__name__}.")

        _lib.PdfToolsOcr_Processor_Process.argtypes = [c_void_p, c_void_p, c_void_p, POINTER(pdftools_sdk.internal.streams._StreamDescriptor), c_void_p, c_void_p]
        _lib.PdfToolsOcr_Processor_Process.restype = c_void_p
        # NOTE(review): the _StreamDescriptor is a temporary; confirm that it keeps
        # its callbacks alive for as long as the returned document uses out_stream.
        ret_val = _lib.PdfToolsOcr_Processor_Process(
            self._handle,
            document._handle,
            engine._handle if engine is not None else None,
            _StreamDescriptor(out_stream),
            options._handle if options is not None else None,
            out_options._handle if out_options is not None else None,
        )
        if ret_val is None:
            _NativeBase._throw_last_error(False)
        return Document._create_dynamic_type(ret_val)

    def add_warning_handler(self, handler: WarningFunc) -> None:
        """
        Add handler for the :func:`WarningFunc` event.

        Args:
            handler: Event handler. If a handler is added that is already registered, it is ignored.

        """
        _lib.PdfToolsOcr_Processor_AddWarningHandlerW.argtypes = [c_void_p, c_void_p, self._WarningFunc]
        _lib.PdfToolsOcr_Processor_AddWarningHandlerW.restype = c_bool

        # Reuse the C thunk of an already-registered handler. Creating a fresh
        # thunk for a duplicate would hand native code a function pointer that
        # nothing on the Python side keeps alive (and that could never be
        # removed again, since removal passes the stored thunk).
        entry = self._warning_callback_map.get(handler)
        if entry is not None:
            _c_callback = entry['callback']
        else:
            # Wrap the handler with the C callback
            _c_callback = self._wrap_warning_func(handler)

        # Now pass the callback function as a proper C function type instance
        if not _lib.PdfToolsOcr_Processor_AddWarningHandlerW(self._handle, None, _c_callback):
            _NativeBase._throw_last_error()

        # Record the registration (increase count if already added)
        if entry is not None:
            entry['count'] += 1
        else:
            self._warning_callback_map[handler] = {'callback': _c_callback, 'count': 1}

    def remove_warning_handler(self, handler: WarningFunc) -> None:
        """
        Remove registered handler of the :func:`WarningFunc` event.

        Args:
            handler: Event handler that shall be removed. If a handler is not registered, it is ignored.

        """
        _lib.PdfToolsOcr_Processor_RemoveWarningHandlerW.argtypes = [c_void_p, c_void_p, self._WarningFunc]
        _lib.PdfToolsOcr_Processor_RemoveWarningHandlerW.restype = c_bool

        # A handler that was never registered is ignored.
        entry = self._warning_callback_map.get(handler)
        if entry is None:
            return

        from pdftools_sdk.not_found_error import NotFoundError

        try:
            if not _lib.PdfToolsOcr_Processor_RemoveWarningHandlerW(self._handle, None, entry['callback']):
                _NativeBase._throw_last_error()
        except NotFoundError:
            # The native side does not know the callback: drop the bookkeeping
            # entry and stop, so the count logic below never touches a deleted key.
            del self._warning_callback_map[handler]
            return

        # Decrease the count or remove the callback entirely
        if entry['count'] > 1:
            entry['count'] -= 1
        else:
            del self._warning_callback_map[handler]

    @staticmethod
    def _create_dynamic_type(handle):
        """Create a ``Processor`` instance for an existing native handle."""
        return Processor._from_handle(handle)

    @classmethod
    def _from_handle(cls, handle):
        """
        Internal factory method for constructing an instance using an internal handle.
        This method creates an instance of the class by bypassing the public constructor.
        """
        instance = Processor.__new__(cls)  # Bypass __init__
        instance._initialize(handle)
        return instance

    def _initialize(self, handle):
        """Bind the native handle and start with an empty warning handler registry."""
        super()._initialize(handle)
        self._warning_callback_map = {}