← Back to docs

API Reference

Complete documentation for the JustHTML public API.

JustHTML

The main parser class.

from justhtml import JustHTML

Constructor

JustHTML(
    html,
    *,
    sanitize=True,
    safe=None,
    policy=None,
    collect_errors=False,
    track_node_locations=False,
    debug=False,
    encoding=None,
    fragment=False,
    fragment_context=None,
    iframe_srcdoc=False,
    strict=False,
    tokenizer_opts=None,
    tree_builder=None,
    transforms=None,
)

| Parameter | Type | Default | Description | |———–|——|———|————-| | html | str \| bytes \| bytearray \| memoryview | required | HTML input to parse. Bytes are decoded using HTML encoding sniffing. | | sanitize | bool | True | Sanitize untrusted HTML during construction | | safe | bool \| None | None | Backwards-compatible alias for sanitize (prefer sanitize) | | policy | SanitizationPolicy \| None | None | Override the default sanitization policy | | collect_errors | bool | False | Collect all parse errors (enables errors property) | | track_node_locations | bool | False | Track line/column positions for nodes (slower) | | debug | bool | False | Enable debug mode (internal) | | encoding | str \| None | None | Transport-supplied encoding label used as an override for byte input. See Encoding & Byte Input. | | fragment | bool | False | Parse as a fragment in a default <div> context (convenience). | | fragment_context | FragmentContext | None | Parse as fragment inside this context element | | scripting_enabled | bool | True | While this library does not support executing javascript inside <script> tags, this flag controls how the HTML5 algorithm parses noscript tags. Do NOT set this flag to False while sanitizing untrusted input; disabling scripting increases mXSS risk. | | strict | bool | False | Raise StrictModeError on the earliest parse error by source position | | transforms | list[Transform] \| None | None | Optional DOM transforms applied after parsing. See Transforms. | | iframe_srcdoc | bool | False | Parse whole document as if it’s inside an iframe srcdoc (HTML parsing quirk) | | tokenizer_opts | TokenizerOpts \| None | None | Advanced tokenizer configuration | | tree_builder | TreeBuilder \| None | None | Supply a custom tree builder |

Properties

Property Type Description
root Document \| DocumentFragment The document root
errors list[ParseError] Parse errors, ordered by source position (only if collect_errors=True)

Methods

to_html(pretty=True, indent_size=2, context=None, quote='"')

Serialize the document to HTML.

from justhtml import HTMLContext, JustHTML

doc = JustHTML("<p>Hello</p>")
doc.to_html()  # Pretty-printed HTML
doc.to_html(pretty=False)  # Compact HTML
doc.to_html(context=HTMLContext.JS_STRING)  # HTML -> JS string literal

# With enum:
# from justhtml import HTMLContext
# doc.to_html(context=HTMLContext.JS_STRING)

Parameters:

escape_js_string(value, quote='"')

Escape a value for safe inclusion in a JavaScript string literal.

from justhtml import JustHTML

JustHTML.escape_js_string('He said "hi"')
# => He said \"hi\"

escape_attr_value(value, quote='"')

Escape a value for safe inclusion in a quoted HTML attribute value.

from justhtml import JustHTML

JustHTML.escape_attr_value('" onerror="alert(1)')
# => &quot; onerror=&quot;alert(1)

escape_url_value(value)

Percent-encode a URL value.

from justhtml import JustHTML

JustHTML.escape_url_value('/path with space?x=1&y=2')
# => /path%20with%20space?x=1&y=2

escape_url_in_js_string(value, quote='"')

Convenience helper: URL-encode, then JS-string escape.

from justhtml import JustHTML

JustHTML.escape_url_in_js_string('/path with space?x=1&y=2')
# => /path%20with%20space?x=1&y=2

clean_url_value(value, url_rule)

Validate and rewrite a URL value using an explicit UrlRule. Returns None if the URL is disallowed.

from justhtml import JustHTML, UrlRule

url_rule = UrlRule(allowed_schemes={"https"})
JustHTML.clean_url_value(value="https://example.com/", url_rule=url_rule)
# => https://example.com/

clean_url_in_js_string(value, url_rule, quote='"')

Convenience helper: clean a URL, then percent-encode it and JS-string escape it. Returns None if the URL is disallowed.

from justhtml import JustHTML, UrlRule

url_rule = UrlRule(allowed_schemes={"https"})
JustHTML.clean_url_in_js_string(value="https://example.com/a b", url_rule=url_rule)
# => https://example.com/a%20b

to_text()

Return the document’s concatenated text.

doc = JustHTML("<p>Hello <b>world</b></p>")
doc.to_text()  # => Hello world

Parameters:

Sanitization happens at construction time. Use JustHTML(..., sanitize=False) for trusted input or JustHTML(..., policy=...) to customize the policy.

to_markdown(html_passthrough=False)

Return a pragmatic subset of GitHub Flavored Markdown (GFM).

Tables (<table>) and images (<img>) are preserved as raw HTML. Raw HTML tags like <script>, <style>, and <textarea> are dropped by default; pass html_passthrough=True to preserve them (including their contents).

doc = JustHTML("<h1>Title</h1><p>Hello <b>world</b></p>")
doc.to_markdown()  # => # Title
# =>
# => Hello **world**
doc.to_markdown(html_passthrough=True)

Sanitization happens at construction time. Use JustHTML(..., sanitize=False) for trusted input or JustHTML(..., policy=...) to customize the policy.

query(selector)

Find all nodes matching a CSS selector.

doc.query("div.container > p")  # Returns list of matching nodes
doc.query(":comment")           # Returns comment nodes

query_one(selector)

Return the first matching descendant for a CSS selector, or None.

node = doc.query_one("div.container > p")

Node

Base type for all DOM nodes.

Node types:

Template nodes expose a template_content document fragment (HTML namespace only), which holds the template’s children.

Direct DOM edits are supported, but transforms are the preferred way to make changes because they preserve ordering semantics and make sanitization explicit. See Transforms for the recommended workflow. If you mutate the DOM after construction, sanitization has already happened; re-sanitize by using sanitize_dom(...) or rebuild the document with a Sanitize(...) transform in the construction pipeline.

Properties

Property Type Description
name str Tag name (e.g., "div") or "#text", "#comment", "#document"
attrs dict \| None Attribute dictionary (None for comments/doctypes)
children list \| None Child nodes (None for comments/doctypes)
parent Node Parent node (or None for root)
text str Node-local text value. For text nodes this is the node data, otherwise "". Use to_text() for textContent semantics.
namespace str \| None Namespace for the node ("html" by default for elements).

Methods

to_html(indent=0, indent_size=2, pretty=True, context=None, quote='"')

Serialize the node to HTML string.

from justhtml import HTMLContext

node.to_html()                      # Pretty-printed HTML
node.to_html(pretty=False)          # Compact HTML
node.to_html(indent_size=4)         # 4-space indent
node.to_html(indent=2, indent_size=4)  # Start with 2 indents
node.to_html(context=HTMLContext.JS_STRING)  # HTML -> JS string literal

# Or use the enum from the public namespace:
# from justhtml import HTMLContext
# node.to_html(context=HTMLContext.JS_STRING)

# Context options:
# - HTMLContext.HTML (default): no extra escaping
# - HTMLContext.JS_STRING: JS-string escape (serialized HTML markup)
# - HTMLContext.HTML_ATTR_VALUE: escape the serialized HTML for a quoted HTML attribute value
#
# If you need to put plain text into `innerHTML` via a JS string, use:
# - JustHTML.escape_html_text_in_js_string(...)
#
# For escaping plain strings (no DOM required), use:
# - JustHTML.escape_js_string(...)
# - JustHTML.escape_attr_value(...)
# - JustHTML.escape_url_value(...)
# - JustHTML.escape_url_in_js_string(...)
# - JustHTML.clean_url_value(...)
# - JustHTML.clean_url_in_js_string(...)

# Safety happens at construction time:
# - default: JustHTML(..., sanitize=True)
# - raw/trusted: JustHTML(..., sanitize=False)
# - custom policy: JustHTML(..., policy=policy)

query(selector)

Find descendants matching a CSS selector.

div.query("p.intro")  # Search within this node

query_one(selector)

Return the first matching descendant for a CSS selector, or None.

p = div.query_one("p.intro")

to_text()

Return the node’s concatenated text.

node.to_text()

Parameters:

Text extraction is safe-by-default when you build documents with JustHTML(..., sanitize=True) (the default). Use sanitize=False at construction for trusted input.

to_markdown(html_passthrough=False)

Return a pragmatic subset of GitHub Flavored Markdown (GFM) for this subtree.

node.to_markdown()
node.to_markdown(html_passthrough=True)

Markdown output is safe-by-default when you build documents with JustHTML(..., sanitize=True) (the default). Use sanitize=False at construction for trusted input.

append_child(node)

Append a child node to this node.

insert_before(node, reference_node)

Insert node before reference_node (or append if reference_node is None).

remove_child(node)

Remove a direct child node.

replace_child(new_node, old_node)

Replace a direct child node with a new node.

clone_node(deep=False, override_attrs=None)

Clone this node. If deep=True, children are cloned recursively.

has_child_nodes()

Return True if this node has children.


Sanitization

JustHTML includes a built-in, policy-driven HTML sanitizer.

Guides:

from justhtml import DEFAULT_POLICY, SanitizationPolicy, UrlPolicy, UrlProxy, UrlRule, sanitize_dom

Sanitizing output vs sanitizing the DOM

from justhtml import JustHTML, Sanitize

doc = JustHTML(user_html, fragment=True, transforms=[Sanitize()])
clean_root = doc.root

sanitize_dom(node, *, policy=None, errors=None)

Re-sanitize a DOM tree after direct edits. For document roots (#document or #document-fragment), this mutates the tree in place. For other nodes, the node is sanitized as if it were the only child of a document fragment; the returned node may need to be reattached by the caller.

from justhtml import sanitize_dom

sanitize_dom(doc.root)  # In-place for document roots

DEFAULT_POLICY

Conservative built-in policy used for safe-by-default sanitization.

DEFAULT_DOCUMENT_POLICY

Conservative built-in policy used when sanitizing full documents (preserves <html>, <head>, and <body> wrappers).

SanitizationPolicy

Defines allowlists for tags and attributes, URL validation rules, and optional inline-style allowlisting.

Notable options:

UrlPolicy

Wraps URL rules and controls what happens to URL-valued attributes.

UrlPolicy(
    default_handling="strip",  # or "allow" / "proxy"
    default_allow_relative=True,
    allow_rules={},
    url_filter=None,
    proxy=None,
)

UrlProxy

Proxy rewrite configuration used when default_handling="proxy".

UrlProxy(
    url="/proxy",
    param="url",
)

UrlRule

Controls how URL-valued attributes like a[href] and img[src] are validated.

UrlRule(
    allow_fragment=True,
    resolve_protocol_relative="https",
    allowed_schemes=set(),
    allowed_hosts=None,
    proxy=None,
)
Parameter Type Default Description
allow_fragment bool True Allow fragment-only URLs (e.g. #anchor)
resolve_protocol_relative str \| None "https" Scheme to resolve protocol-relative URLs (//...) to before checking. If None, they are dropped.
allowed_schemes set[str] set() Allowed schemes for absolute URLs (e.g. {"https", "mailto"})
allowed_hosts set[str] \| None None If set, only allow these hosts (e.g. {"example.com"})
proxy UrlProxy \| None None Per-rule proxy override used when UrlPolicy.default_handling="proxy"

stream

Memory-efficient streaming parser.

from justhtml import stream

for event, data in stream(html):
    ...

stream() accepts the same input types as JustHTML. If you pass bytes, it will decode using HTML encoding sniffing. To override the encoding for byte input, pass encoding=....

Events

Event Data Description
"start" (tag_name, attrs_dict) Opening tag
"end" tag_name Closing tag
"text" text_content Text content
"comment" comment_text HTML comment
"doctype" doctype_name DOCTYPE declaration

FragmentContext

Specifies the context element for fragment parsing. See Fragment Parsing for detailed usage.

from justhtml.context import FragmentContext

Constructor

FragmentContext(tag_name, namespace=None)
Parameter Type Default Description
tag_name str required Context element tag name (e.g., "div", "tbody")
namespace str \| None None None for HTML, "svg" for SVG, "math" for MathML

Example

from justhtml import JustHTML
from justhtml.context import FragmentContext

# Parse table rows in correct context
ctx = FragmentContext("tbody")
doc = JustHTML("<tr><td>cell</td></tr>", fragment_context=ctx)

ParseError

Represents a parse error with location information.

from justhtml import ParseError

Properties

Property Type Description
code str Error code (e.g., "eof-in-tag")
line int Line number (1-indexed)
column int Column number (1-indexed)
message str Human-readable error message

Methods

as_exception()

Convert to a SyntaxError with source highlighting (Python 3.11+).

error.as_exception()  # Returns SyntaxError

StrictModeError

Exception raised when parsing with strict=True.

from justhtml import StrictModeError

Inherits from SyntaxError, so it displays source location in tracebacks.


Standalone Functions

query(node, selector)

Query a node without using the method syntax.

from justhtml import query
results = query(doc.root, "div.main")

matches(node, selector)

Check if a node matches a selector.

from justhtml import matches
if matches(node, "div.active"):
    ...

to_html(node, indent=0, indent_size=2, pretty=True, context=None, quote='"')

Serialize a node to HTML.

from justhtml import HTMLContext, to_html
html_string = to_html(node)
escaped = to_html(node, context=HTMLContext.JS_STRING)

# With enum:
# from justhtml import HTMLContext
# escaped = to_html(node, context=HTMLContext.JS_STRING)

# Context options:
# - HTMLContext.HTML (default)
# - HTMLContext.JS_STRING
# - HTMLContext.HTML_ATTR_VALUE
#
# For escaping plain strings (no DOM required), use:
# - JustHTML.escape_js_string(...)
# - JustHTML.escape_attr_value(...)
# - JustHTML.escape_url_value(...)
# - JustHTML.escape_url_in_js_string(...)
# - JustHTML.clean_url_value(...)
# - JustHTML.clean_url_in_js_string(...)

SelectorError

Exception raised for invalid CSS selectors.

from justhtml import SelectorError

try:
    doc.query("div[invalid")
except SelectorError as e:
    print(e)