Generic PDF objects

Implementation of generic PDF objects (dictionary, number, string, …).

class pypdf.generic.AnnotationBuilder[source]

Bases: object

The AnnotationBuilder is deprecated.

Instead, use the annotation classes in pypdf.annotations.

See adding PDF annotations for its usage combined with PdfWriter.

static text(rect: RectangleObject | Tuple[float, float, float, float], text: str, open: bool = False, flags: int = 0) None[source]
static free_text(text: str, rect: RectangleObject | Tuple[float, float, float, float], font: str = 'Helvetica', bold: bool = False, italic: bool = False, font_size: str = '14pt', font_color: str = '000000', border_color: str | None = '000000', background_color: str | None = 'ffffff') None[source]
static popup(*, rect: RectangleObject | Tuple[float, float, float, float], flags: int = 0, parent: DictionaryObject | None = None, open: bool = False) None[source]
static line(p1: Tuple[float, float], p2: Tuple[float, float], rect: RectangleObject | Tuple[float, float, float, float], text: str = '', title_bar: str | None = None) None[source]
static polyline(vertices: List[Tuple[float, float]]) None[source]
static rectangle(rect: RectangleObject | Tuple[float, float, float, float], interiour_color: str | None = None) None[source]
static highlight(*, rect: RectangleObject | Tuple[float, float, float, float], quad_points: ArrayObject, highlight_color: str = 'ff0000', printing: bool = False) None[source]
static ellipse(rect: RectangleObject | Tuple[float, float, float, float], interiour_color: str | None = None) None[source]
static polygon(vertices: List[Tuple[float, float]]) None[source]
DEFAULT_FIT = <pypdf.generic._fit.Fit object>
class pypdf.generic.ArrayObject(iterable=(), /)[source]

Bases: List[Any], PdfObject

replicate(pdf_dest: PdfWriterProtocol) ArrayObject[source]
clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) ArrayObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

items() Iterable[Any][source]

Emulate DictionaryObject.items for a list (index, object).

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
static read_from_stream(stream: IO[Any], pdf: PdfReaderProtocol | None, forced_encoding: None | str | List[str] | Dict[int, str] = None) ArrayObject[source]
class pypdf.generic.BooleanObject(value: Any)[source]

Bases: PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) BooleanObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
static read_from_stream(stream: IO[Any]) BooleanObject[source]
class pypdf.generic.ByteStringObject(*args, **kwargs)[source]

Bases: bytes, PdfObject

Represents a string object where the text encoding could not be determined.

This occurs quite often, as the PDF spec doesn’t provide an alternate way to represent strings – for example, the encryption data stored in files (like /O) is clearly not text, but is still stored in a “String” object.

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) ByteStringObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

property original_bytes: bytes

For compatibility with TextStringObject.original_bytes.

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
class pypdf.generic.ContentStream(stream: Any, pdf: Any, forced_encoding: None | str | List[str] | Dict[int, str] = None)[source]

Bases: DecodedStreamObject

In order to be fast, this data structure can contain either:

  • raw data in ._data

  • parsed stream operations in ._operations.

At any time, a ContentStream object can either have both of those fields defined, or one field defined and the other set to None.

These fields are “rebuilt” lazily, when accessed:

  • when .get_data() is called, if ._data is None, it is rebuilt from ._operations.

  • when .operations is accessed, if ._operations is None, it is rebuilt from ._data.

Conversely, these fields can be invalidated:

  • when .set_data() is called, ._operations is set to None.

  • when .operations is set, ._data is set to None.

replicate(pdf_dest: PdfWriterProtocol) ContentStream[source]
clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) ContentStream[source]

Clone object into pdf_dest.

Parameters:
  • pdf_dest

  • force_duplicate

  • ignore_fields

Returns:

The cloned ContentStream

get_data() bytes[source]
set_data(data: bytes) None[source]
property operations: List[Tuple[Any, bytes]]
isolate_graphics_state() None[source]
write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
class pypdf.generic.DecodedStreamObject[source]

Bases: StreamObject

class pypdf.generic.DictionaryObject[source]

Bases: Dict[Any, Any], PdfObject

replicate(pdf_dest: PdfWriterProtocol) DictionaryObject[source]
clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) DictionaryObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

raw_get(key: Any) Any[source]
get_inherited(key: str, default: Any = None) Any[source]

Returns the value of a key, falling back to the value from the parent if the key is not found on this object. If the key is not found at all, returns default.

Parameters:
  • key – string identifying the field to return

  • default – default value to return

Returns:

Current key or inherited one, otherwise default value.

setdefault(key: Any, value: Any | None = None) Any[source]
property xmp_metadata: XmpInformationProtocol | None

Retrieve XMP (Extensible Metadata Platform) data relevant to this object, if available.

See Table 347 — Additional entries in a metadata stream dictionary.

Returns:

An XmpInformation instance that can be used to access XMP metadata from the document. Can also be None if no metadata was found on the document root.

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
static read_from_stream(stream: IO[Any], pdf: PdfReaderProtocol | None, forced_encoding: None | str | List[str] | Dict[int, str] = None) DictionaryObject[source]
class pypdf.generic.EmbeddedFile(name: str, pdf_object: DictionaryObject)[source]

Bases: object

Container holding the information on an embedded file.

Attributes are evaluated lazily if possible.

Further information on embedded files can be found in section 7.11 of the PDF 2.0 specification.

property alternative_name: str | None

Retrieve the alternative name (file specification).

property description: str | None

Retrieve the description.

property associated_file_relationship: str

Retrieve the relationship of the referring document to this embedded file.

property subtype: str | None

Retrieve the subtype. This is a MIME media type, prefixed by a slash.

property content: bytes

Retrieve the actual file content.

property size: int | None

Retrieve the size of the uncompressed file in bytes.

property creation_date: datetime | None

Retrieve the file creation datetime.

property modification_date: datetime | None

Retrieve the datetime of the last file modification.

property checksum: bytes | None

Retrieve the MD5 checksum of the (uncompressed) file.

class pypdf.generic.EncodedStreamObject[source]

Bases: StreamObject

get_data() bytes[source]
set_data(data: bytes) None[source]
class pypdf.generic.FloatObject(value: Any = '0.0', context: Any | None = None)[source]

Bases: float, PdfObject

clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) FloatObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

myrepr() str[source]
as_numeric() float[source]
write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
class pypdf.generic.IndirectObject(idnum: int, generation: int, pdf: Any)[source]

Bases: PdfObject

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

replicate(pdf_dest: PdfWriterProtocol) PdfObject[source]
clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) IndirectObject[source]

Clone object into pdf_dest.

property indirect_reference: IndirectObject
get_object() PdfObject | None[source]
write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
static read_from_stream(stream: IO[Any], pdf: Any) IndirectObject[source]
class pypdf.generic.NameObject[source]

Bases: str, PdfObject

delimiter_pattern = re.compile(b'\\s+|[\\(\\)<>\\[\\]{}/%]')
surfix = b'/'
renumber_table: ClassVar[Dict[str, bytes]] = {'\x00': b'#00', '\x01': b'#01', '\x02': b'#02', '\x03': b'#03', '\x04': b'#04', '\x05': b'#05', '\x06': b'#06', '\x07': b'#07', '\x08': b'#08', '\t': b'#09', '\n': b'#0A', '\x0b': b'#0B', '\x0c': b'#0C', '\r': b'#0D', '\x0e': b'#0E', '\x0f': b'#0F', '\x10': b'#10', '\x11': b'#11', '\x12': b'#12', '\x13': b'#13', '\x14': b'#14', '\x15': b'#15', '\x16': b'#16', '\x17': b'#17', '\x18': b'#18', '\x19': b'#19', '\x1a': b'#1A', '\x1b': b'#1B', '\x1c': b'#1C', '\x1d': b'#1D', '\x1e': b'#1E', '\x1f': b'#1F', ' ': b'#20', '#': b'#23', '%': b'#25', '(': b'#28', ')': b'#29', '/': b'#2F'}
clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) NameObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
renumber() bytes[source]
static unnumber(sin: bytes) bytes[source]
CHARSETS = ('utf-8', 'gbk', 'latin1')
static read_from_stream(stream: IO[Any], pdf: Any) NameObject[source]
class pypdf.generic.NullObject(*args, **kwargs)[source]

Bases: PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) NullObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
static read_from_stream(stream: IO[Any]) NullObject[source]
class pypdf.generic.NumberObject(value: Any)[source]

Bases: int, PdfObject

NumberPattern = re.compile(b'[^+-.0-9]')
clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) NumberObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

as_numeric() int[source]
write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
static read_from_stream(stream: IO[Any]) NumberObject | FloatObject[source]
class pypdf.generic.OutlineFontFlag(value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: IntFlag

A class used as an enumerable flag for formatting an outline font.

italic = 1
bold = 2
class pypdf.generic.OutlineItem(title: str, page: NumberObject | IndirectObject | NullObject | DictionaryObject, fit: Fit)[source]

Bases: Destination

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
class pypdf.generic.PdfObject(*args, **kwargs)[source]

Bases: PdfObjectProtocol

hash_func(*, usedforsecurity=True)

Returns a sha1 hash object; optionally initialized with a string

indirect_reference: IndirectObject | None
hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

hash_value_data() bytes[source]
hash_value() bytes[source]
replicate(pdf_dest: PdfWriterProtocol) PdfObject[source]

Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter) without ensuring links. This is used in clone_document_from_root with incremental = True.

Parameters:

pdf_dest – Target to clone to.

Returns:

The cloned PdfObject

clone(pdf_dest: PdfWriterProtocol, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) PdfObject[source]

Clone object into pdf_dest (PdfWriterProtocol which is an interface for PdfWriter).

By default, this method will call _reference_clone (see _reference).

Parameters:
  • pdf_dest – Target to clone to.

  • force_duplicate – By default, if the object has already been cloned and referenced, the copy will be returned; when True, a new copy will be created. (Default value = False)

  • ignore_fields – List/tuple of field names (for dictionaries) that will be ignored during cloning (applies to children duplication as well). If fields are to be considered for a limited number of levels, you have to add it as integer, for example [1,"/B","/TOTO"] means that "/B" will be ignored at the first level only but "/TOTO" on all levels.

Returns:

The cloned PdfObject

get_object() PdfObject | None[source]

Resolve indirect references.

write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
class pypdf.generic.StreamObject[source]

Bases: DictionaryObject

replicate(pdf_dest: PdfWriterProtocol) StreamObject[source]
hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

get_data() bytes[source]
set_data(data: bytes) None[source]
hash_value_data() bytes[source]
write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
static initializeFromDictionary(data: Dict[str, Any]) None[source]
static initialize_from_dictionary(data: Dict[str, Any]) EncodedStreamObject | DecodedStreamObject[source]
flate_encode(level: int = -1) EncodedStreamObject[source]
decode_as_image() Any[source]

Try to decode the stream object as an image.

Returns:

a PIL image if proper decoding has been found

Raises:

Exception – Any exception raised during decoding, whether due to an invalid object or to errors in the decoding itself, will be reported. It is recommended to catch exceptions to prevent stops in your program.

class pypdf.generic.TextStringObject(value: Any)[source]

Bases: str, PdfObject

A string object that has been decoded into a real unicode string.

If read from a PDF document, this string appeared to match the PDFDocEncoding, or contained a UTF-16BE BOM mark to cause UTF-16 decoding to occur.

autodetect_pdfdocencoding: bool
autodetect_utf16: bool
utf16_bom: bytes
clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Sequence[str | int] | None = ()) TextStringObject[source]

Clone object into pdf_dest.

hash_bin() int[source]

Used to detect modified object.

Returns:

Hash considering type and value.

property original_bytes: bytes

It is occasionally possible that a text string object gets created where a byte string object was expected due to the autodetection mechanism – if that occurs, this “original_bytes” property can be used to back-calculate what the original encoded bytes were.

get_original_bytes() bytes[source]
get_encoded_bytes() bytes[source]
write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
class pypdf.generic.TreeObject(dct: DictionaryObject | None = None)[source]

Bases: DictionaryObject

has_children() bool[source]
children() Iterable[Any][source]
add_child(child: Any, pdf: PdfWriterProtocol) None[source]
inc_parent_counter_default(parent: None | IndirectObject | TreeObject, n: int) None[source]
inc_parent_counter_outline(parent: None | IndirectObject | TreeObject, n: int) None[source]
insert_child(child: Any, before: Any, pdf: PdfWriterProtocol, inc_parent_counter: Callable[[...], Any] | None = None) IndirectObject[source]
remove_child(child: Any) None[source]
remove_from_tree() None[source]

Remove the object from the tree it is in.

empty_tree() None[source]
class pypdf.generic.ViewerPreferences(value: Any = None)[source]

Bases: DictionaryObject

property PRINT_SCALING: NameObject
pypdf.generic.create_string_object(string: str | bytes, forced_encoding: None | str | List[str] | Dict[int, str] = None) TextStringObject | ByteStringObject[source]

Create a ByteStringObject or a TextStringObject from a string to represent the string.

Parameters:
  • string – The data being used

  • forced_encoding – Typically None, or an encoding string

Returns:

A ByteStringObject or a TextStringObject.

Raises:

TypeError – If string is not of type str or bytes.

pypdf.generic.decode_pdfdocencoding(byte_array: bytes) str[source]
pypdf.generic.encode_pdfdocencoding(unicode_string: str) bytes[source]
pypdf.generic.hex_to_rgb(value: str) Tuple[float, float, float][source]
pypdf.generic.is_null_or_none(x: Any) TypeGuard[None | NullObject | IndirectObject][source]
Returns:

True if x is None or NullObject.

pypdf.generic.read_hex_string_from_stream(stream: IO[Any], forced_encoding: None | str | List[str] | Dict[int, str] = None) TextStringObject | ByteStringObject[source]
pypdf.generic.read_object(stream: IO[Any], pdf: PdfReaderProtocol | None, forced_encoding: None | str | List[str] | Dict[int, str] = None) PdfObject | int | str | ContentStream[source]
pypdf.generic.read_string_from_stream(stream: IO[Any], forced_encoding: None | str | List[str] | Dict[int, str] = None) TextStringObject | ByteStringObject[source]
class pypdf._protocols.PdfObjectProtocol(*args, **kwargs)[source]

Bases: Protocol

indirect_reference: Any
clone(pdf_dest: Any, force_duplicate: bool = False, ignore_fields: Tuple[str, ...] | List[str] | None = ()) Any[source]
get_object() PdfObjectProtocol | None[source]
hash_value() bytes[source]
write_to_stream(stream: IO[Any], encryption_key: None | str | bytes = None) None[source]
class pypdf._protocols.XmpInformationProtocol(*args, **kwargs)[source]

Bases: PdfObjectProtocol

class pypdf._protocols.PdfCommonDocProtocol(*args, **kwargs)[source]

Bases: Protocol

property pdf_header: str
property pages: List[Any]
property root_object: PdfObjectProtocol
get_object(indirect_reference: Any) PdfObjectProtocol | None[source]
property strict: bool
class pypdf._protocols.PdfReaderProtocol(*args, **kwargs)[source]

Bases: PdfCommonDocProtocol, Protocol

abstract property xref: Dict[int, Dict[int, Any]]
abstract property trailer: Dict[str, Any]
class pypdf._protocols.PdfWriterProtocol(*args, **kwargs)[source]

Bases: PdfCommonDocProtocol, Protocol

incremental: bool
abstract write(stream: Path | str | IO[Any]) Tuple[bool, IO[Any]][source]