refactor(assets): extract cursor JSON escaping helper; size wire cap above per-field caps
Some checks are pending
Python Linting / Run Ruff (push) Waiting to run
Python Linting / Run Pylint (push) Waiting to run

Addresses review feedback on cursor.py:

- Extract the inline escape chain into _apply_wire_compatible_json_escapes()
  with a comment pinning it to the wire format's escape set, so the parity
  intent is explicit rather than reading as an ad-hoc transform.
- Raise MAX_ENCODED_CURSOR_LENGTH to 8192 (comfortably above the ~5.2KB
  worst-case the per-field caps can produce) and drop the mint-time length
  guard. Encoder/decoder symmetry now holds by construction: the encoder
  can't produce a cursor the decode path rejects, so there is no confusing
  user-visible 'cursor too long' failure at mint time.
- Rewrite the two over-wire-cap tests to assert worst-case multibyte and
  escape-heavy values mint and round-trip, instead of being rejected.
This commit is contained in:
Matt Miller 2026-06-08 16:56:10 -07:00
parent 546d7d2eb6
commit f7558232fa
2 changed files with 58 additions and 38 deletions

View File

@ -42,7 +42,14 @@ class InvalidCursorError(ValueError):
# MAX_CURSOR_VALUE_LENGTH is 512 to fit the `AssetReference.name` column max
# (`String(512)`) — otherwise a long-named asset would mint a cursor the same
# server then refuses on the next request.
MAX_ENCODED_CURSOR_LENGTH = 1024
#
# MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above
# the largest cursor the per-field caps can produce. Worst case is value + id
# at their caps with every character escape-expanding to the six-byte `\uXXXX`
# form, which is ~5.2 KB once base64url-encoded. At 8192 the encoder can never
# mint a cursor that exceeds it, so a freshly minted cursor always decodes on
# the next request and there is no user-visible "cursor too long" failure.
MAX_ENCODED_CURSOR_LENGTH = 8192
MAX_CURSOR_VALUE_LENGTH = 512
MAX_CURSOR_ID_LENGTH = 128
@ -58,6 +65,27 @@ class CursorPayload:
_VALID_ORDERS = ("asc", "desc")
def _apply_wire_compatible_json_escapes(raw: str) -> str:
"""Escape the characters the cursor wire format requires escaped.
The wire format escapes `<`, `>`, `&`, U+2028, and U+2029 and nothing
else, leaving other non-ASCII as literal UTF-8 so a value carrying any of
them encodes to identical bytes across every compatible implementation of
the payload format. None of these characters appear in JSON structural
syntax, so a global replace on the serialized output can only touch encoded
string values. Explicit `\\uXXXX` escapes for U+2028 / U+2029 keep this
source stable against editor / git tooling that normalizes those invisible
separators.
"""
return (
raw.replace("<", "\\u003c")
.replace(">", "\\u003e")
.replace("&", "\\u0026")
.replace("\u2028", "\\u2028")
.replace("\u2029", "\\u2029")
)
def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str:
"""Encode a cursor payload as a base64url (no-padding) string.
@ -78,29 +106,13 @@ def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") ->
raise InvalidCursorError("value exceeds maximum length")
payload = {"s": sort_field, "v": value, "id": id, "o": order}
raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
# Match the default JSON escaping of HTML-significant characters and JS
# line/paragraph separators (U+2028 / U+2029) so an asset name carrying
# any of them encodes to identical bytes across runtimes. None of these
# characters appear in JSON structural syntax, so a global replace on the
# serialized output can only touch encoded values. Use explicit \uXXXX
# escapes for U+2028 / U+2029 so the source survives any editor / git
# tooling that normalizes invisible separators.
raw = (
raw.replace("<", "\\u003c")
.replace(">", "\\u003e")
.replace("&", "\\u0026")
.replace("\u2028", "\\u2028")
.replace("\u2029", "\\u2029")
)
raw = _apply_wire_compatible_json_escapes(raw)
encoded = base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii")
# Final wire-size guard: the per-field caps above are char-counted, but the
# wire cap applies to the base64url of the UTF-8-encoded, escape-expanded
# payload. A value full of multibyte or HTML-significant characters (e.g.
# 512 \u00d7 "\u00e9" or 512 \u00d7 "<") inflates well past MAX_ENCODED_CURSOR_LENGTH even
# though it passes the char-count check. Refuse to mint a cursor the decoder
# on the next request would reject.
if len(encoded) > MAX_ENCODED_CURSOR_LENGTH:
raise InvalidCursorError("encoded cursor exceeds maximum length")
# No mint-time length guard is needed: the per-field caps above bound the
# encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition),
# so the encoder can never produce a cursor the decode path would reject.
# This keeps encoder/decoder symmetry without a user-visible failure when a
# value happens to be multibyte- or escape-heavy.
return encoded

View File

@ -190,8 +190,10 @@ class TestDatetimeOverflow:
class TestEncoderDecoderSymmetry:
"""The encoder must reject inputs the decoder rejects, or the same server
will mint a cursor it then 400s on the next request.
"""The encoder must never mint a cursor the decoder would reject, or the
same server would 400 on a cursor it just handed out. Per-field caps keep
the encoded length below the wire cap, so a freshly minted cursor always
round-trips.
"""
def test_long_name_within_cap_round_trips(self):
@ -215,20 +217,26 @@ class TestEncoderDecoderSymmetry:
with pytest.raises(InvalidCursorError, match="value exceeds maximum length"):
encode_cursor("name", "v" * (MAX_CURSOR_VALUE_LENGTH + 1), "id-1")
def test_encoder_rejects_multibyte_value_over_wire_cap(self):
"""A value that passes the char-count cap can still inflate past the
wire cap once UTF-8-encoded. Asset name made of 512 × multibyte
characters (e.g. 'é' = 2 bytes) must be rejected at encode time, not
minted into a cursor the next request will 400."""
with pytest.raises(InvalidCursorError, match="encoded cursor exceeds maximum length"):
encode_cursor("name", "é" * MAX_CURSOR_VALUE_LENGTH, "asset-multibyte")
def test_multibyte_value_at_cap_round_trips(self):
"""A value at the char-count cap made of multibyte characters
(e.g. 'é' = 2 UTF-8 bytes) stays under the wire cap, so it mints and
round-trips the per-field caps, not a mint-time length check, are
what bound cursor size."""
value = "é" * MAX_CURSOR_VALUE_LENGTH
encoded = encode_cursor("name", value, "asset-multibyte")
assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH
payload = decode_cursor(encoded, ALLOWED)
assert payload.value == value
def test_encoder_rejects_escape_heavy_value_over_wire_cap(self):
"""Same wire-cap concern via escape expansion: each `<` serializes to
the six-byte sequence `\\u003c`, so 512 of them blow past the encoded
cap even though the raw char count is within the per-field limit."""
with pytest.raises(InvalidCursorError, match="encoded cursor exceeds maximum length"):
encode_cursor("name", "<" * MAX_CURSOR_VALUE_LENGTH, "asset-escape")
def test_escape_heavy_value_at_cap_round_trips(self):
"""Escape expansion is the worst case: each `<` serializes to the
six-byte `\\u003c`. A value of 512 of them is the largest a cursor can
get, and it still fits the wire cap, mints, and round-trips."""
value = "<" * MAX_CURSOR_VALUE_LENGTH
encoded = encode_cursor("name", value, "asset-escape")
assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH
payload = decode_cursor(encoded, ALLOWED)
assert payload.value == value
class TestOrderBinding: