# Source code for pdftools_sdk.extraction.extractor

from __future__ import annotations
import io
from typing import List, Iterator, Tuple, Optional, Any, TYPE_CHECKING, Callable
from ctypes import *
from datetime import datetime
from numbers import Number
from pdftools_sdk.internal import _lib
from pdftools_sdk.internal.utils import _string_to_utf16, _utf16_to_string
from pdftools_sdk.internal.streams import _StreamDescriptor, _NativeStream
from pdftools_sdk.internal.native_base import _NativeBase
from pdftools_sdk.internal.native_object import _NativeObject

import pdftools_sdk.internal

# Names used only in annotations. `TYPE_CHECKING` is False at runtime, so the
# real classes are imported here only when a static type checker analyses the
# module (presumably to avoid import cycles between SDK modules -- confirm).
if TYPE_CHECKING:
    from pdftools_sdk.pdf.document import Document
    from pdftools_sdk.extraction.text_options import TextOptions

else:
    # At runtime the names are bound to the dotted paths of the classes as
    # plain strings; code that needs the real classes (e.g. for `isinstance`
    # checks) imports them locally inside the function that uses them.
    Document = "pdftools_sdk.pdf.document.Document"
    TextOptions = "pdftools_sdk.extraction.text_options.TextOptions"


class Extractor(_NativeObject):
    """
    Allows for extracting page-wide content of a PDF.

    """

    def __init__(self):
        """
        Create a new extractor backed by a newly created native object.

        The native constructor `PdfToolsExtraction_Extractor_New` is called and
        the returned handle is stored via `_initialize`. If the native call
        returns a null handle, the pending native error is raised through
        `_NativeBase._throw_last_error`.

        """
        _lib.PdfToolsExtraction_Extractor_New.argtypes = []
        _lib.PdfToolsExtraction_Extractor_New.restype = c_void_p
        ret_val = _lib.PdfToolsExtraction_Extractor_New()
        # With `restype = c_void_p`, ctypes reports a NULL pointer as `None`.
        if ret_val is None:
            _NativeBase._throw_last_error(False)
        super()._initialize(ret_val)

    def extract_text(self, in_doc: Document, out_stream: io.IOBase, options: Optional[TextOptions] = None, first_page: Optional[int] = None, last_page: Optional[int] = None) -> None:
        """
        Extract text from a PDF document

        Args:
            in_doc (pdftools_sdk.pdf.document.Document):
                The input PDF document.

            out_stream (io.IOBase):
                The stream to which the extracted text is written.

            options (Optional[pdftools_sdk.extraction.text_options.TextOptions]):
                The option object that controls the text extraction.

            first_page (Optional[int]):
                Optional parameter denoting the index of the first page of the
                range to be processed. This index is one-based.
                If set, the number must be in the range of `1` (first page) to
                :attr:`pdftools_sdk.pdf.document.Document.page_count` (last page).
                If not set, `1` is used.

            last_page (Optional[int]):
                Optional parameter denoting the index of the last page of the
                range to be processed. This index is one-based.
                If set, the number must be in the range of `1` (first page) to
                :attr:`pdftools_sdk.pdf.document.Document.page_count` (last page).
                If not set, :attr:`pdftools_sdk.pdf.document.Document.page_count` is used.

        Raises:
            TypeError:
                An argument is not of the expected type.

            pdftools_sdk.license_error.LicenseError:
                The license check has failed.

            pdftools_sdk.processing_error.ProcessingError:
                The processing has failed.

            OSError:
                Writing to the output text file has failed.

            pdftools_sdk.generic_error.GenericError:
                A generic error occurred.

            ValueError:
                The `first_page` or `last_page` are not in the allowed range.

        """
        # Local imports: at module level these names are only string
        # placeholders at runtime, but `isinstance` needs the real classes.
        from pdftools_sdk.pdf.document import Document
        from pdftools_sdk.extraction.text_options import TextOptions

        if not isinstance(in_doc, Document):
            raise TypeError(f"Expected type {Document.__name__}, but got {type(in_doc).__name__}.")
        if not isinstance(out_stream, io.IOBase):
            raise TypeError(f"Expected type {io.IOBase.__name__}, but got {type(out_stream).__name__}.")
        if options is not None and not isinstance(options, TextOptions):
            raise TypeError(f"Expected type {TextOptions.__name__} or None, but got {type(options).__name__}.")
        if first_page is not None and not isinstance(first_page, int):
            raise TypeError(f"Expected type {int.__name__} or None, but got {type(first_page).__name__}.")
        if last_page is not None and not isinstance(last_page, int):
            raise TypeError(f"Expected type {int.__name__} or None, but got {type(last_page).__name__}.")

        # ctypes integer types perform no overflow checking: a Python int that
        # does not fit into a C `int` would silently wrap around and address a
        # different page. Reject such values explicitly instead.
        c_int_bits = sizeof(c_int) * 8
        c_int_min = -(1 << (c_int_bits - 1))
        c_int_max = (1 << (c_int_bits - 1)) - 1
        for arg_name, arg_value in (("first_page", first_page), ("last_page", last_page)):
            if arg_value is not None and not c_int_min <= arg_value <= c_int_max:
                raise ValueError(f"The `{arg_name}` value {arg_value} is not in the allowed range.")

        # Optional page indices are passed by pointer; a null pointer (None)
        # stands for "not set". The c_int objects are held in locals so they
        # stay alive for the duration of the native call.
        first_page_arg = c_int(first_page) if first_page is not None else None
        last_page_arg = c_int(last_page) if last_page is not None else None

        _lib.PdfToolsExtraction_Extractor_ExtractText.argtypes = [c_void_p, c_void_p, POINTER(pdftools_sdk.internal.streams._StreamDescriptor), c_void_p, POINTER(c_int), POINTER(c_int)]
        _lib.PdfToolsExtraction_Extractor_ExtractText.restype = c_bool
        succeeded = _lib.PdfToolsExtraction_Extractor_ExtractText(
            self._handle,
            in_doc._handle,
            _StreamDescriptor(out_stream),
            options._handle if options is not None else None,
            byref(first_page_arg) if first_page_arg is not None else None,
            byref(last_page_arg) if last_page_arg is not None else None,
        )
        if not succeeded:
            _NativeBase._throw_last_error(False)

    @staticmethod
    def _create_dynamic_type(handle):
        """Create an `Extractor` instance for an existing native handle."""
        return Extractor._from_handle(handle)

    @classmethod
    def _from_handle(cls, handle):
        """
        Internal factory method for constructing an instance using an internal handle.
        This method creates an instance of the class by bypassing the public constructor.
        """
        instance = Extractor.__new__(cls)  # Bypass __init__
        instance._initialize(handle)
        return instance

    def _initialize(self, handle):
        """Store the native handle by delegating to the base class."""
        super()._initialize(handle)