Generic PDF objects

Implementation of generic PDF objects (dictionary, number, string, …).

class pypdf.generic.BooleanObject(value: Any)[source]

Bases: PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → BooleanObject[source]: Clone object into pdf_dest.

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

static read_from_stream(stream: IO[Any]) → BooleanObject[source]

class pypdf.generic.FloatObject(value: Union[str, Any] = '0.0', context: Optional[Any] = None)[source]

Bases: float, PdfObject

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → FloatObject[source]: Clone object into pdf_dest.

myrepr() → str[source]

as_numeric() → float[source]

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

class pypdf.generic.NumberObject(value: Any)[source]

Bases: int, PdfObject

NumberPattern = re.compile(b'[^+-.0-9]')

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → NumberObject[source]: Clone object into pdf_dest.

as_numeric() → int[source]

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

static read_from_stream(stream: IO[Any]) → Union[NumberObject, FloatObject][source]

class pypdf.generic.NameObject[source]

Bases: str, PdfObject

delimiter_pattern = re.compile(b'\\s+|[\\(\\)<>\\[\\]{}/%]')

surfix = b'/'

renumber_table: ClassVar[Dict[str, bytes]] = {'\x00': b'#00', '\x01': b'#01', '\x02': b'#02', '\x03': b'#03', '\x04': b'#04', '\x05': b'#05', '\x06': b'#06', '\x07': b'#07', '\x08': b'#08', '\t': b'#09', '\n': b'#0A', '\x0b': b'#0B', '\x0c': b'#0C', '\r': b'#0D', '\x0e': b'#0E', '\x0f': b'#0F', '\x10': b'#10', '\x11': b'#11', '\x12': b'#12', '\x13': b'#13', '\x14': b'#14', '\x15': b'#15', '\x16': b'#16', '\x17': b'#17', '\x18': b'#18', '\x19': b'#19', '\x1a': b'#1A', '\x1b': b'#1B', '\x1c': b'#1C', '\x1d': b'#1D', '\x1e': b'#1E', '\x1f': b'#1F', ' ': b'#20', '#': b'#23', '%': b'#25', '(': b'#28', ')': b'#29', '/': b'#2F'}

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → NameObject[source]: Clone object into pdf_dest.

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

renumber() → bytes[source]

static unnumber(sin: bytes) → bytes[source]

static read_from_stream(stream: IO[Any], pdf: Any) → NameObject[source]

class pypdf.generic.IndirectObject(idnum: int, generation: int, pdf: Any)[source]

Bases: PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → IndirectObject[source]: Clone object into pdf_dest.

property indirect_reference: IndirectObject

get_object() → Optional[PdfObject][source]

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

static read_from_stream(stream: IO[Any], pdf: Any) → IndirectObject[source]

class pypdf.generic.NullObject(*args, **kwargs)[source]

Bases: PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → NullObject[source]: Clone object into pdf_dest.

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

static read_from_stream(stream: IO[Any]) → NullObject[source]

class pypdf.generic.PdfObject(*args, **kwargs)[source]

Bases: PdfObjectProtocol

hash_func(*, usedforsecurity=True): Returns a sha1 hash object; optionally initialized with a string

indirect_reference: Optional[IndirectObject]

hash_value_data() → bytes[source]

hash_value() → bytes[source]

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → PdfObject[source]

Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

By default, this method will call _reference_clone (see _reference).

Parameters

pdf_dest – Target to clone to.
force_duplicate – By default, if the object has already been cloned and referenced, the copy will be returned; when True, a new copy will be created. (Default value = False)
ignore_fields – List/tuple of field names (for dictionaries) that will be ignored during cloning (applies to children duplication as well). If a field is to be ignored for a limited number of levels only, add that number as an integer before it; for example, [1, "/B", "/TOTO"] means that "/B" will be ignored at the first level only, but "/TOTO" at all levels.

Returns

The cloned PdfObject

get_object() → Optional[PdfObject][source]: Resolve indirect references.

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

class pypdf.generic.TextStringObject(*args, **kwargs)[source]

Bases: str, PdfObject

A string object that has been decoded into a real unicode string.

If read from a PDF document, this string appeared to match the PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to occur.

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → TextStringObject[source]: Clone object into pdf_dest.

autodetect_pdfdocencoding = False

autodetect_utf16 = False

property original_bytes: bytes: It is occasionally possible that a text string object gets created where a byte string object was expected due to the autodetection mechanism – if that occurs, this “original_bytes” property can be used to back-calculate what the original encoded bytes were.

get_original_bytes() → bytes[source]

get_encoded_bytes() → bytes[source]

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

class pypdf.generic.ByteStringObject(*args, **kwargs)[source]

Bases: bytes, PdfObject

Represents a string object where the text encoding could not be determined.

This occurs quite often, as the PDF spec doesn’t provide an alternate way to represent strings – for example, the encryption data stored in files (like /O) is clearly not text, but is still stored in a “String” object.

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → ByteStringObject[source]: Clone object into pdf_dest.

property original_bytes: bytes: For compatibility with TextStringObject.original_bytes.

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

class pypdf.generic.AnnotationBuilder[source]

Bases: object

The AnnotationBuilder is deprecated.

Instead, use the annotation classes in pypdf.annotations.

See adding PDF annotations for its usage combined with PdfWriter.

static text(rect: Union[RectangleObject, Tuple[float, float, float, float]], text: str, open: bool = False, flags: int = 0) → DictionaryObject[source]

Add text annotation.

Parameters

rect – array of four integers [xLL, yLL, xUR, yUR] specifying the clickable rectangular area
text – The text that is added to the document
open –
flags –

Returns

A dictionary object representing the annotation.

static free_text(text: str, rect: Union[RectangleObject, Tuple[float, float, float, float]], font: str = 'Helvetica', bold: bool = False, italic: bool = False, font_size: str = '14pt', font_color: str = '000000', border_color: Optional[str] = '000000', background_color: Optional[str] = 'ffffff') → DictionaryObject[source]

Add text in a rectangle to a page.

Parameters

text – Text to be added
rect – array of four integers [xLL, yLL, xUR, yUR] specifying the clickable rectangular area
font – Name of the Font, e.g. ‘Helvetica’
bold – Print the text in bold
italic – Print the text in italic
font_size – How big the text will be, e.g. ‘14pt’
font_color – Hex-string for the color, e.g. cdcdcd
border_color – Hex-string for the border color, e.g. cdcdcd. Use None for no border.
background_color – Hex-string for the background of the annotation, e.g. cdcdcd. Use None for transparent background.

Returns

A dictionary object representing the annotation.

static popup(*, rect: Union[RectangleObject, Tuple[float, float, float, float]], flags: int = 0, parent: Optional[DictionaryObject] = None, open: bool = False) → DictionaryObject[source]

Add a popup to the document.

Parameters

rect – Specifies the clickable rectangular area as [xLL, yLL, xUR, yUR]
flags – 1 - invisible, 2 - hidden, 3 - print, 4 - no zoom, 5 - no rotate, 6 - no view, 7 - read only, 8 - locked, 9 - toggle no view, 10 - locked contents
open – Whether the popup should be shown directly (default is False).
parent – The contents of the popup. Create this via the AnnotationBuilder.

Returns

A dictionary object representing the annotation.

static line(p1: Tuple[float, float], p2: Tuple[float, float], rect: Union[RectangleObject, Tuple[float, float, float, float]], text: str = '', title_bar: Optional[str] = None) → DictionaryObject[source]

Draw a line on the PDF.

Parameters

p1 – First point
p2 – Second point
rect – array of four integers [xLL, yLL, xUR, yUR] specifying the clickable rectangular area
text – Text to be displayed as the line annotation
title_bar – Text to be displayed in the title bar of the annotation; by convention this is the name of the author

Returns

A dictionary object representing the annotation.

static polyline(vertices: List[Tuple[float, float]]) → DictionaryObject[source]

Draw a polyline on the PDF.

Parameters: vertices – Array specifying the vertices (x, y) coordinates of the poly-line.
Returns: A dictionary object representing the annotation.

static rectangle(rect: Union[RectangleObject, Tuple[float, float, float, float]], interiour_color: Optional[str] = None) → DictionaryObject[source]

Draw a rectangle on the PDF.

This method uses the /Square annotation type of the PDF format.

Parameters

rect – array of four integers [xLL, yLL, xUR, yUR] specifying the clickable rectangular area
interiour_color – None or hex-string for the color, e.g. cdcdcd. If None is used, the interior is transparent.

Returns

A dictionary object representing the annotation.

static highlight(*, rect: Union[RectangleObject, Tuple[float, float, float, float]], quad_points: ArrayObject, highlight_color: str = 'ff0000') → DictionaryObject[source]

Add a highlight annotation to the document.

Parameters

rect – Array of four integers [xLL, yLL, xUR, yUR] specifying the highlighted area
quad_points – An ArrayObject of 8 FloatObjects. Must match a word or a group of words, otherwise no highlight will be shown.
highlight_color – The color used for the highlight

Returns

A dictionary object representing the annotation.

static ellipse(rect: Union[RectangleObject, Tuple[float, float, float, float]], interiour_color: Optional[str] = None) → DictionaryObject[source]

Draw an ellipse on the PDF.

This method uses the /Circle annotation type of the PDF format.

Parameters

rect – array of four integers [xLL, yLL, xUR, yUR] specifying the bounding box of the ellipse
interiour_color – None or hex-string for the color, e.g. cdcdcd. If None is used, the interior is transparent.

Returns

A dictionary object representing the annotation.

static polygon(vertices: List[Tuple[float, float]]) → DictionaryObject[source]

DEFAULT_FIT = <pypdf.generic._fit.Fit object>

static link(rect: Union[RectangleObject, Tuple[float, float, float, float]], border: Optional[ArrayObject] = None, url: Optional[str] = None, target_page_index: Optional[int] = None, fit: Fit = <pypdf.generic._fit.Fit object>) → DictionaryObject[source]

Add a link to the document.

The link can either be an external link or an internal link.

An external link requires the url parameter. An internal link requires target_page_index and, optionally, fit.

Parameters

rect – array of four integers [xLL, yLL, xUR, yUR] specifying the clickable rectangular area
border – if provided, an array describing border-drawing properties: horizontal corner radius, vertical corner radius, border width and, optionally, a dash array. See the PDF spec for details. No border will be drawn if this argument is omitted.
url – Link to a website (if you want to make an external link)
target_page_index – index of the page to which the link should go (if you want to make an internal link)
fit – Page fit or ‘zoom’ option.

Returns

A dictionary object representing the annotation.

class pypdf.generic.ArrayObject(iterable=(), /)[source]

Bases: List[Any], PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → ArrayObject[source]: Clone object into pdf_dest.

items() → Iterable[Any][source]: Emulate DictionaryObject.items for a list (index, object).

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

static read_from_stream(stream: IO[Any], pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None) → ArrayObject[source]

class pypdf.generic.DictionaryObject[source]

Bases: Dict[Any, Any], PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → DictionaryObject[source]: Clone object into pdf_dest.

raw_get(key: Any) → Any[source]

setdefault(key: Any, value: Optional[Any] = None) → Any[source]

property xmp_metadata: Optional[PdfObject]

Retrieve XMP (Extensible Metadata Platform) data relevant to this object, if available.

Stability: Added in v1.12, will exist for all future v1.x releases. See Table 315 – Additional entries in a metadata stream dictionary

Returns: An xmp.XmpInformation instance that can be used to access XMP metadata from the document. Can also return None if no metadata was found on the document root.

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

static read_from_stream(stream: IO[Any], pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None) → DictionaryObject[source]

class pypdf.generic.TreeObject(dct: Optional[DictionaryObject] = None)[source]

Bases: DictionaryObject

hasChildren() → bool[source]

has_children() → bool[source]

children() → Iterable[Any][source]

add_child(child: Any, pdf: PdfWriterProtocol) → None[source]

inc_parent_counter_default(parent: Union[None, IndirectObject, TreeObject], n: int) → None[source]

inc_parent_counter_outline(parent: Union[None, IndirectObject, TreeObject], n: int) → None[source]

insert_child(child: Any, before: Any, pdf: PdfWriterProtocol, inc_parent_counter: Optional[Callable[[...], Any]] = None) → IndirectObject[source]

remove_child(child: Any) → None[source]

remove_from_tree() → None[source]: Remove the object from the tree it is in.

emptyTree() → None[source]

empty_tree() → None[source]

class pypdf.generic.StreamObject[source]

Bases: DictionaryObject

get_data() → Union[bytes, str][source]

set_data(data: bytes) → None[source]

hash_value_data() → bytes[source]

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

static initializeFromDictionary(data: Dict[str, Any]) → Union[EncodedStreamObject, DecodedStreamObject][source]

static initialize_from_dictionary(data: Dict[str, Any]) → Union[EncodedStreamObject, DecodedStreamObject][source]

flate_encode(level: int = -1) → EncodedStreamObject[source]

class pypdf.generic.DecodedStreamObject[source]: Bases: StreamObject

class pypdf.generic.EncodedStreamObject[source]

Bases: StreamObject

get_data() → Union[bytes, str][source]

set_data(data: bytes) → None[source]

class pypdf.generic.ContentStream(stream: Any, pdf: Any, forced_encoding: Union[None, str, List[str], Dict[int, str]] = None)[source]

Bases: DecodedStreamObject

In order to be fast, this data structure can contain either:
* raw data in ._data
* parsed stream operations in ._operations

At any time, a ContentStream object has at least one of those fields defined; at most one of them is set to None.

Those fields are “rebuilt” lazily, when accessed:
* when .get_data() is called, if ._data is None, it is rebuilt from ._operations
* when .operations is accessed, if ._operations is None, it is rebuilt from ._data

On the other hand, those fields can be invalidated:
* when .set_data() is called, ._operations is set to None
* when .operations is set, ._data is set to None

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Sequence[Union[str, int]]] = ()) → ContentStream[source]

Clone object into pdf_dest.

Parameters

pdf_dest –
force_duplicate –
ignore_fields –

Returns

The cloned ContentStream

get_data() → bytes[source]

set_data(data: bytes) → None[source]

property operations: List[Tuple[Any, Any]]

isolate_graphics_state() → None[source]

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

class pypdf.generic.ViewerPreferences(value: Any = None)[source]: Bases: DictionaryObject

class pypdf.generic.OutlineItem(title: str, page: Union[NumberObject, IndirectObject, NullObject, DictionaryObject], fit: Fit)[source]

Bases: Destination

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

class pypdf.generic.OutlineFontFlag(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: IntFlag

A class used as an enumerable flag for formatting an outline font.

italic = 1

bold = 2

pypdf.generic.read_object(stream: IO[Any], pdf: Optional[PdfReaderProtocol], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None) → Union[PdfObject, int, str, ContentStream][source]

pypdf.generic.create_string_object(string: Union[str, bytes], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None) → Union[TextStringObject, ByteStringObject][source]

Create a ByteStringObject or a TextStringObject from a string to represent the string.

Parameters

string – The data being used
forced_encoding – Typically None, or an encoding string

Returns

A TextStringObject or a ByteStringObject

Raises

TypeError – If string is not of type str or bytes.

pypdf.generic.encode_pdfdocencoding(unicode_string: str) → bytes[source]

pypdf.generic.decode_pdfdocencoding(byte_array: bytes) → str[source]

pypdf.generic.hex_to_rgb(value: str) → Tuple[float, float, float][source]

pypdf.generic.read_hex_string_from_stream(stream: IO[Any], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None) → Union[TextStringObject, ByteStringObject][source]

pypdf.generic.read_string_from_stream(stream: IO[Any], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None) → Union[TextStringObject, ByteStringObject][source]

class pypdf._protocols.PdfObjectProtocol(*args, **kwargs)[source]

Bases: Protocol

indirect_reference: Any

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Optional[Union[Tuple[str, ...], List[str]]] = ()) → Any[source]

get_object() → Optional[PdfObjectProtocol][source]

hash_value() → bytes[source]

write_to_stream(stream: IO[Any], encryption_key: Union[None, str, bytes] = None) → None[source]

class pypdf._protocols.PdfReaderProtocol(*args, **kwargs)[source]

Bases: Protocol

property pdf_header: str

property strict: bool

property xref: Dict[int, Dict[int, Any]]

property pages: List[Any]

property trailer: Dict[str, Any]

get_object(indirect_reference: Any) → Optional[PdfObjectProtocol][source]

class pypdf._protocols.PdfWriterProtocol(*args, **kwargs)[source]

Bases: Protocol

get_object(indirect_reference: Any) → Optional[PdfObjectProtocol][source]

write(stream: Union[Path, str, IO[Any]]) → Tuple[bool, IO[Any]][source]

property pages: List[Any]

property pdf_header: bytes