module Pdftext: sig .. end
Parsing fonts and extracting text from content streams and PDF strings
type
type3_glpyhs = {
|
fontbbox : |
|
fontmatrix : |
|
charprocs : |
|
type3_resources : |
}
type
simple_fonttype =
| |
Type1 |
| |
MMType1 |
| |
Type3 of |
| |
Truetype |
type
fontfile =
| |
FontFile of |
| |
FontFile2 of |
| |
FontFile3 of |
type
fontdescriptor = {
|
ascent : |
|
descent : |
|
avgwidth : |
|
maxwidth : |
|
flags : |
|
fontbbox : |
|
italicangle : |
|
capheight : |
|
xheight : |
|
stemv : |
|
fontfile : |
|
charset : |
|
tounicode : |
}
type differences =
(string * int) list
type
encoding =
| |
ImplicitInFontFile |
| |
StandardEncoding |
| |
MacRomanEncoding |
| |
WinAnsiEncoding |
| |
MacExpertEncoding |
| |
CustomEncoding of |
| |
FillUndefinedWithStandard of |
type fontmetrics =
float array
type
simple_font = {
|
fonttype : |
|
basefont : |
|
firstchar : |
|
lastchar : |
|
widths : |
|
fontdescriptor : |
|
fontmetrics : |
|
encoding : |
}
type
standard_font =
| |
TimesRoman |
| |
TimesBold |
| |
TimesItalic |
| |
TimesBoldItalic |
| |
Helvetica |
| |
HelveticaBold |
| |
HelveticaOblique |
| |
HelveticaBoldOblique |
| |
Courier |
| |
CourierBold |
| |
CourierOblique |
| |
CourierBoldOblique |
| |
Symbol |
| |
ZapfDingbats |
type
cid_system_info = {
|
registry : |
|
ordering : |
|
supplement : |
}
type
composite_CIDfont = {
|
cid_system_info : |
|
cid_basefont : |
|
cid_fontdescriptor : |
|
cid_widths : |
|
cid_default_width : |
}
type
cmap_encoding =
| |
Predefined of |
| |
CMap of |
type
font =
| |
StandardFont of |
| |
SimpleFont of |
| |
CIDKeyedFont of |
val string_of_standard_font : standard_font -> string
Returns a string such as "Times-Bold" for Pdftext.TimesBold etc.
val standard_font_of_name : string -> standard_font option
Parses a string such as "/Times-Bold" or "/TimesNewRoman,Bold" to Pdftext.TimesBold etc.
val string_of_font : font -> string
A debug string for the whole font datatype.
val read_font : Pdf.t -> Pdf.pdfobject -> font
Read a font from a given document and object
val write_font : ?objnum:int -> Pdf.t -> font -> int
Write a font to a given document, returning the object number for the main font dictionary
val is_unicode : string -> bool
Is a PDF string UTF16BE (i.e., does it have a byte order marker at the beginning)?
val is_identity_h : font -> bool
Is a font Identity H?
val codepoints_of_utf8 : string -> int list
A list of unicode codepoints for a UTF8 string
val utf8_of_codepoints : int list -> string
A UTF8 string for a list of unicode codepoints
val codepoints_of_utf16be : string -> int list
A list of unicode codepoints for a UTF16BE string
val utf16be_of_codepoints : int list -> string
A UTF16BE string for a list of unicode codepoints (with BOM)
val utf8_of_pdfdocstring : string -> string
Take a pdf string (which will be either pdfdocencoding or UTF16BE) and return a string representing the same unicode codepoints in UTF8
val pdfdocstring_of_utf8 : string -> string
Take a UTF8 string and convert to pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are)
val pdfdocstring_of_codepoints : int list -> string
Build a pdf string in pdfdocencoding (if no unicode-only characters are used) or UTF16BE (if they are)
val codepoints_of_pdfdocstring : string -> int list
Produce a list of unicode codepoints from a pdfdocencoding or UTF16BE pdf document string
val simplify_utf16be : string -> string
Remake a UTF16BE string into a PDFDocEncoding string if all characters are in PDFDocEncoding
type
text_extractor
The type of text extractors.
val text_extractor_of_font : Pdf.t -> Pdf.pdfobject -> text_extractor
Build a text extractor from a document and font object
val text_extractor_of_font_real : font -> text_extractor
Build a text extractor from a font
val codepoints_of_text : text_extractor -> string -> int list
Return a list of unicode codepoints from a given extractor and string (for example from a Pdfpages.Op_Tj or Op_TJ operator).
val glyphnames_of_text : text_extractor -> string -> string list
Return a list of glyph names from a given extractor and string
val charcode_extractor_of_font : ?debug:bool -> Pdf.t -> Pdf.pdfobject -> int -> int option
Return the character code for a given unicode codepoint, if it exists in the encoding and font object. If debug is set (default false), missing characters are reported to stderr.
val charcode_extractor_of_font_real : ?debug:bool -> font -> int -> int option
Return the character code for a given unicode codepoint, if it exists in the encoding and font. If debug is set (default false), missing characters are reported to stderr.
val table_of_encoding : encoding -> (int, string) Stdlib.Hashtbl.t
Table of all the entries in an encoding.
val reverse_table_of_encoding : encoding -> (string, int) Stdlib.Hashtbl.t
Reverse table of all the entries in an encoding.
val parse_tounicode : Pdf.t -> Pdf.pdfobject -> (int * string) list
Parse a /ToUnicode entry.