from __future__ import annotations
import io
from typing import List, Iterator, Tuple, Optional, Any, TYPE_CHECKING, Callable
from ctypes import *
from datetime import datetime
from numbers import Number
from pdftools_sdk.internal import _lib
from pdftools_sdk.internal.utils import _string_to_utf16, _utf16_to_string
from pdftools_sdk.internal.streams import _StreamDescriptor, _NativeStream
from pdftools_sdk.internal.native_base import _NativeBase
from pdftools_sdk.internal.native_object import _NativeObject
import pdftools_sdk.internal
if TYPE_CHECKING:
    # Imported for static type checkers only; never executed at runtime.
    from pdftools_sdk.pdf.document import Document
    from pdftools_sdk.ocr.engine import Engine
    from pdftools_sdk.ocr.ocr_options import OcrOptions
    from pdftools_sdk.pdf.output_options import OutputOptions
    from pdftools_sdk.ocr.warning_category import WarningCategory
else:
    # At runtime the names are bound to plain strings so that they can be used
    # as forward references without importing the modules at import time
    # (presumably to avoid circular imports - confirm against the package layout).
    Document = "pdftools_sdk.pdf.document.Document"
    Engine = "pdftools_sdk.ocr.engine.Engine"
    OcrOptions = "pdftools_sdk.ocr.ocr_options.OcrOptions"
    OutputOptions = "pdftools_sdk.pdf.output_options.OutputOptions"
    # The short name is intentional: it is the value that was effective before,
    # where a fully qualified string was assigned first and then immediately
    # overridden by a second `if not TYPE_CHECKING:` block.
    WarningCategory = "WarningCategory"

# `Callable[...]` is evaluated eagerly (it is an assignment, not an annotation),
# so `WarningCategory` must resolve to something at runtime - see the string above.
WarningFunc = Callable[[str, WarningCategory, int, str], None]
"""
Event for warnings occurring during OCR processing

Non-critical issues during processing are reported via this event.
It is recommended to review the :class:`pdftools_sdk.ocr.warning_category.WarningCategory` and handle
warnings if necessary for the application.

Args:
    message (str):
        The message describing the warning

    category (pdftools_sdk.ocr.warning_category.WarningCategory):
        The category of the warning

    pageNo (int):
        The page number this warning is associated with, or `0` if not page-specific

    context (str):
        A description of the context where the warning occurred

"""
[docs]
class Processor(_NativeObject):
    """
    Process PDF documents with OCR

    The processor applies Optical Character Recognition (OCR) to PDF documents.
    It can make scanned documents searchable, fix text extraction issues and
    generate PDF tagging/structure.

    The processor is decoupled from the document - it takes a :class:`pdftools_sdk.pdf.document.Document`
    as input and produces a new :class:`pdftools_sdk.pdf.document.Document` as output.

    """

    # Event definition.
    # C callback type: no return value; arguments are
    # (event_context: void*, message: wchar_t*, category: int, page_no: int, context: wchar_t*).
    _WarningFunc = CFUNCTYPE(None, c_void_p, c_wchar_p, c_int, c_int, c_wchar_p)

    def _wrap_warning_func(self, py_callback: WarningFunc) -> Processor._WarningFunc:
        """
        Wrap a Python warning handler in a C-callable function pointer.

        Args:
            py_callback: The Python handler to invoke for each warning.

        Returns:
            A `_WarningFunc` instance. The caller must keep a reference to it for
            as long as native code may call it; ctypes does not do so on its own.
        """

        def _c_callback(event_context, message, category, page_no, context):
            # Imported lazily; only a string placeholder exists at module level.
            from pdftools_sdk.ocr.warning_category import WarningCategory

            # Translate the native arguments and call the Python callback.
            # `event_context` is not forwarded to the handler.
            py_callback(_utf16_to_string(message), WarningCategory(category), page_no, _utf16_to_string(context))

        # Wrap the callback in CFUNCTYPE so it becomes a valid C function pointer
        return Processor._WarningFunc(_c_callback)

    def __init__(self):
        """
        Create a new processor backed by a native `PdfToolsOcr_Processor_New` object.

        If the native constructor returns a null handle,
        `_NativeBase._throw_last_error` is invoked to report the failure.
        """
        _lib.PdfToolsOcr_Processor_New.argtypes = []
        _lib.PdfToolsOcr_Processor_New.restype = c_void_p
        ret_val = _lib.PdfToolsOcr_Processor_New()
        if ret_val is None:
            _NativeBase._throw_last_error(False)
        super()._initialize(ret_val)
        # handler -> {'callback': <C function pointer>, 'count': <registrations>}
        self._warning_callback_map = {}

    def process(self, document: Document, engine: Optional[Engine], out_stream: io.IOBase, options: Optional[OcrOptions] = None, out_options: Optional[OutputOptions] = None) -> Document:
        """
        Apply OCR to a PDF document

        Process the input PDF document with OCR according to the specified options.
        The processed document is written to the output stream.

        Non-critical processing issues raise a :func:`pdftools_sdk.ocr.processor.WarningFunc` .
        It is recommended to review the :class:`pdftools_sdk.ocr.warning_category.WarningCategory` and handle
        them if necessary for the application.

        Args:
            document (pdftools_sdk.pdf.document.Document):
                The input PDF document to process

            engine (Optional[pdftools_sdk.ocr.engine.Engine]):
                The OCR engine to use for recognition.
                This parameter may be `None` for operations that do not require OCR,
                such as :attr:`pdftools_sdk.ocr.image_processing_mode.ImageProcessingMode.REMOVETEXT` .
                For all other modes, a valid engine must be provided.

            out_stream (io.IOBase):
                The stream to which the output PDF is written.
                The stream must support both random read and write access.

            options (Optional[pdftools_sdk.ocr.ocr_options.OcrOptions]):
                The OCR processing options.
                If `None`, default options are used.

            out_options (Optional[pdftools_sdk.pdf.output_options.OutputOptions]):
                The PDF output options, e.g. to encrypt the output document.

        Returns:
            pdftools_sdk.pdf.document.Document:
                The resulting output PDF which can be used as a new input
                for further processing.

                Note that this object must be disposed before the output stream
                object (method argument `out_stream`).

        Raises:
            TypeError:
                An argument is not of the expected type.

            pdftools_sdk.license_error.LicenseError:
                The license check has failed.

            OSError:
                Writing to the `out_stream` failed.

            pdftools_sdk.processing_error.ProcessingError:
                The document could not be processed.

            ValueError:
                An OCR engine is required for the specified options but `engine` is `None`.

            ValueError:
                The `options` specifies invalid or contradictory settings.

            ValueError:
                The `out_options` specifies document encryption for a PDF/A file, which is not allowed.

            pdftools_sdk.generic_error.GenericError:
                An unexpected failure occurred.

            pdftools_sdk.corrupt_error.CorruptError:
                An input image in the document is corrupt and cannot be read.

            pdftools_sdk.password_error.PasswordError:
                The document is encrypted and the password is invalid.

            pdftools_sdk.conformance_error.ConformanceError:
                The document has an invalid conformance level.

            pdftools_sdk.unsupported_feature_error.UnsupportedFeatureError:
                The input PDF contains unrendered XFA form fields.
                See :attr:`pdftools_sdk.pdf.document.Document.xfa` for more information.

        """
        # The real classes are imported here because the module-level names are
        # only string placeholders at runtime.
        from pdftools_sdk.pdf.document import Document
        from pdftools_sdk.ocr.engine import Engine
        from pdftools_sdk.ocr.ocr_options import OcrOptions
        from pdftools_sdk.pdf.output_options import OutputOptions

        # Validate argument types before handing anything to native code.
        if not isinstance(document, Document):
            raise TypeError(f"Expected type {Document.__name__}, but got {type(document).__name__}.")
        if engine is not None and not isinstance(engine, Engine):
            raise TypeError(f"Expected type {Engine.__name__} or None, but got {type(engine).__name__}.")
        if not isinstance(out_stream, io.IOBase):
            raise TypeError(f"Expected type {io.IOBase.__name__}, but got {type(out_stream).__name__}.")
        if options is not None and not isinstance(options, OcrOptions):
            raise TypeError(f"Expected type {OcrOptions.__name__} or None, but got {type(options).__name__}.")
        if out_options is not None and not isinstance(out_options, OutputOptions):
            raise TypeError(f"Expected type {OutputOptions.__name__} or None, but got {type(out_options).__name__}.")

        _lib.PdfToolsOcr_Processor_Process.argtypes = [c_void_p, c_void_p, c_void_p, POINTER(pdftools_sdk.internal.streams._StreamDescriptor), c_void_p, c_void_p]
        _lib.PdfToolsOcr_Processor_Process.restype = c_void_p

        # Optional arguments are passed to native code as null handles.
        ret_val = _lib.PdfToolsOcr_Processor_Process(
            self._handle,
            document._handle,
            engine._handle if engine is not None else None,
            _StreamDescriptor(out_stream),
            options._handle if options is not None else None,
            out_options._handle if out_options is not None else None,
        )
        if ret_val is None:
            _NativeBase._throw_last_error(False)
        return Document._create_dynamic_type(ret_val)

    def add_warning_handler(self, handler: WarningFunc) -> None:
        """
        Add handler for the :func:`WarningFunc` event.

        Args:
            handler: Event handler. If the handler is already registered, its
                existing native callback is registered again and its registration
                count is increased; no second native wrapper is created.
                NOTE(review): the native library presumably ignores a duplicate
                registration of the same callback - confirm.
        """
        _lib.PdfToolsOcr_Processor_AddWarningHandlerW.argtypes = [c_void_p, c_void_p, self._WarningFunc]
        _lib.PdfToolsOcr_Processor_AddWarningHandlerW.restype = c_bool

        # Reuse the wrapper of an already registered handler. Creating a fresh
        # CFUNCTYPE object here and not storing it would let it be garbage
        # collected while native code still holds its function pointer, and
        # removal would later pass a different pointer than the one registered.
        entry = self._warning_callback_map.get(handler)
        if entry is not None:
            _c_callback = entry['callback']
        else:
            _c_callback = self._wrap_warning_func(handler)

        # Now pass the callback function as a proper C function type instance
        if not _lib.PdfToolsOcr_Processor_AddWarningHandlerW(self._handle, None, _c_callback):
            _NativeBase._throw_last_error()

        # Record the registration only after the native call succeeded.
        if entry is not None:
            entry['count'] += 1
        else:
            self._warning_callback_map[handler] = {'callback': _c_callback, 'count': 1}

    def remove_warning_handler(self, handler: WarningFunc) -> None:
        """
        Remove registered handler of the :func:`WarningFunc` event.

        Args:
            handler: Event handler that shall be removed. If a handler is not registered, it is ignored.
        """
        _lib.PdfToolsOcr_Processor_RemoveWarningHandlerW.argtypes = [c_void_p, c_void_p, self._WarningFunc]
        _lib.PdfToolsOcr_Processor_RemoveWarningHandlerW.restype = c_bool

        entry = self._warning_callback_map.get(handler)
        if entry is None:
            # Unknown handler: nothing to do.
            return

        from pdftools_sdk.not_found_error import NotFoundError

        try:
            if not _lib.PdfToolsOcr_Processor_RemoveWarningHandlerW(self._handle, None, entry['callback']):
                _NativeBase._throw_last_error()
        except NotFoundError:
            # The native side does not know this callback, so the bookkeeping
            # entry is stale. Drop it and stop: touching the entry's counter
            # after deleting it would raise KeyError.
            self._warning_callback_map.pop(handler, None)
            return

        # Decrease the count or remove the callback entirely
        if entry['count'] > 1:
            entry['count'] -= 1
        else:
            self._warning_callback_map.pop(handler, None)

    @staticmethod
    def _create_dynamic_type(handle):
        """Create a `Processor` wrapper for an existing native handle."""
        return Processor._from_handle(handle)

    @classmethod
    def _from_handle(cls, handle):
        """
        Internal factory method for constructing an instance using an internal handle.
        This method creates an instance of the class by bypassing the public constructor.
        """
        instance = Processor.__new__(cls)  # Bypass __init__
        instance._initialize(handle)
        return instance

    def _initialize(self, handle):
        """Bind this instance to `handle` and start with an empty handler registry."""
        super()._initialize(handle)
        # handler -> {'callback': <C function pointer>, 'count': <registrations>}
        self._warning_callback_map = {}