refactor(assets): drop cross-runtime cursor escaping; cursors are opaque

The custom JSON escaping of <, >, &, U+2028, and U+2029 existed only to
keep the encoded cursor byte-identical with the Cloud implementation of
the same payload format. Cursors are opaque tokens, so byte-level
compatibility across implementations is not needed — plain json.dumps
output is sufficient. Remove the escaping helper and the byte-identity
test fixtures that pinned the wire format; keep round-trip coverage for
the affected characters.
This commit is contained in:
Matt Miller 2026-06-09 20:55:40 -07:00
parent f7558232fa
commit 9341fc6894
2 changed files with 20 additions and 128 deletions

View File

@ -10,10 +10,10 @@ so replaying a `desc` cursor against an `asc` request fails with
`o` is mandatory on every payload — a cursor without it is rejected as `o` is mandatory on every payload — a cursor without it is rejected as
malformed. malformed.
Encoding is base64url with no padding. JSON serialization escapes `<`, Encoding is base64url with no padding. Cursors are opaque tokens: the
`>`, `&`, U+2028, and U+2029 in encoded string values so asset names payload format is internal to this server, and clients must treat a
containing those characters produce a stable, byte-identical wire form cursor as a black box handed back via `next_cursor`. No byte-level
across any compatible implementation of the same payload format. compatibility with any other implementation is required.
Time values are serialized as Unix microseconds (UTC) — microsecond Time values are serialized as Unix microseconds (UTC) — microsecond
precision is sufficient to round-trip the timestamps stored by the precision is sufficient to round-trip the timestamps stored by the
@ -45,10 +45,11 @@ class InvalidCursorError(ValueError):
# #
# MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above # MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above
# the largest cursor the per-field caps can produce. Worst case is value + id # the largest cursor the per-field caps can produce. Worst case is value + id
# at their caps with every character escape-expanding to the six-byte `\uXXXX` # at their caps with every character JSON-escaping to the six-byte `\uXXXX`
# form, which is ~5.2 KB once base64url-encoded. At 8192 the encoder can never # form (control characters), which is ~5.2 KB once base64url-encoded. At 8192
# mint a cursor that exceeds it, so a freshly minted cursor always decodes on # the encoder can never mint a cursor that exceeds it, so a freshly minted
# the next request and there is no user-visible "cursor too long" failure. # cursor always decodes on the next request and there is no user-visible
# "cursor too long" failure.
MAX_ENCODED_CURSOR_LENGTH = 8192 MAX_ENCODED_CURSOR_LENGTH = 8192
MAX_CURSOR_VALUE_LENGTH = 512 MAX_CURSOR_VALUE_LENGTH = 512
MAX_CURSOR_ID_LENGTH = 128 MAX_CURSOR_ID_LENGTH = 128
@ -65,27 +66,6 @@ class CursorPayload:
_VALID_ORDERS = ("asc", "desc") _VALID_ORDERS = ("asc", "desc")
# Characters the cursor wire format escapes, mapped to their JSON `\uXXXX`
# spelling. Built once at import time so every call is a single pass over the
# serialized payload. The U+2028 / U+2029 keys are written as escapes so this
# source stays stable against editor / git tooling that normalizes those
# invisible separators.
_WIRE_ESCAPE_TABLE = str.maketrans(
    {
        "<": "\\u003c",
        ">": "\\u003e",
        "&": "\\u0026",
        "\u2028": "\\u2028",
        "\u2029": "\\u2029",
    }
)


def _apply_wire_compatible_json_escapes(raw: str) -> str:
    """Escape the characters the cursor wire format requires escaped.

    The wire format escapes `<`, `>`, `&`, U+2028, and U+2029 and nothing
    else, leaving other non-ASCII as literal UTF-8 so a value carrying any of
    them encodes to identical bytes across every compatible implementation of
    the payload format. None of these characters appear in JSON structural
    syntax, so a global translation of the serialized output can only touch
    encoded string values.

    Args:
        raw: Serialized JSON text of a cursor payload.

    Returns:
        ``raw`` with each of the five characters replaced by its six-character
        `\\uXXXX` escape; every other character is passed through unchanged.
    """
    # One translate() pass replaces the five chained full-string scans; the
    # replacement text never contains a character that is itself escaped, so
    # the result matches what sequential replacement would produce.
    return raw.translate(_WIRE_ESCAPE_TABLE)
def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str: def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str:
"""Encode a cursor payload as a base64url (no-padding) string. """Encode a cursor payload as a base64url (no-padding) string.
@ -106,14 +86,10 @@ def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") ->
raise InvalidCursorError("value exceeds maximum length") raise InvalidCursorError("value exceeds maximum length")
payload = {"s": sort_field, "v": value, "id": id, "o": order} payload = {"s": sort_field, "v": value, "id": id, "o": order}
raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False) raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
raw = _apply_wire_compatible_json_escapes(raw)
encoded = base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii")
# No mint-time length guard is needed: the per-field caps above bound the # No mint-time length guard is needed: the per-field caps above bound the
# encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition), # encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition),
# so the encoder can never produce a cursor the decode path would reject. # so the encoder can never produce a cursor the decode path would reject.
# This keeps encoder/decoder symmetry without a user-visible failure when a return base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii")
# value happens to be multibyte- or escape-heavy.
return encoded
def encode_cursor_from_time(sort_field: str, t: datetime, id: str, order: str = "desc") -> str: def encode_cursor_from_time(sort_field: str, t: datetime, id: str, order: str = "desc") -> str:

View File

@ -1,9 +1,8 @@
"""Tests for app.assets.services.cursor. """Tests for app.assets.services.cursor.
The byte-identity fixtures below pin the wire format so a parallel Cursors are opaque tokens internal to this server — these tests cover
implementation in another runtime can mint exchange-compatible cursors round-tripping, validation, and length caps, not any particular wire
for the same payload. Drift here would break frontend pagination against byte layout.
any compatible backend.
""" """
from __future__ import annotations from __future__ import annotations
@ -37,6 +36,8 @@ class TestRoundTrip:
("size", "1024", "asset-123"), ("size", "1024", "asset-123"),
("name", "my-asset.png", "asset-abc"), ("name", "my-asset.png", "asset-abc"),
("name", "résumé.txt", "asset-uni"), ("name", "résumé.txt", "asset-uni"),
("name", "foo<&>bar.png", "asset-html"),
("name", 'quo"te\\back\nnewline.png', "asset-esc"),
], ],
) )
def test_encode_decode(self, sort_field, value, id): def test_encode_decode(self, sort_field, value, id):
@ -229,10 +230,11 @@ class TestEncoderDecoderSymmetry:
assert payload.value == value assert payload.value == value
def test_escape_heavy_value_at_cap_round_trips(self): def test_escape_heavy_value_at_cap_round_trips(self):
"""Escape expansion is the worst case: each `<` serializes to the """JSON escape expansion is the worst case: each control character
six-byte `\\u003c`. A value of 512 of them is the largest a cursor can serializes to the six-byte `\\uXXXX` form. A value of 512 of them is
get, and it still fits the wire cap, mints, and round-trips.""" the largest a cursor can get, and it still fits the wire cap, mints,
value = "<" * MAX_CURSOR_VALUE_LENGTH and round-trips."""
value = "\x01" * MAX_CURSOR_VALUE_LENGTH
encoded = encode_cursor("name", value, "asset-escape") encoded = encode_cursor("name", value, "asset-escape")
assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH
payload = decode_cursor(encoded, ALLOWED) payload = decode_cursor(encoded, ALLOWED)
@ -274,89 +276,3 @@ class TestOrderBinding:
encoded = base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii") encoded = base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")
with pytest.raises(InvalidCursorError, match="missing or non-string o"): with pytest.raises(InvalidCursorError, match="missing or non-string o"):
decode_cursor(encoded, ALLOWED, expected_order="desc") decode_cursor(encoded, ALLOWED, expected_order="desc")
class TestHtmlSignificantCharEscaping:
    """An asset name containing `<`, `>`, `&`, U+2028, or U+2029 must encode
    to the same escaped wire bytes as any compatible implementation of the
    same payload format. Drift here breaks cross-runtime byte-identity for
    those characters.
    """

    @pytest.mark.parametrize(
        "value, escaped_substring",
        [
            ("foo<bar>.png", "\\u003c"),  # `<` escaped
            ("foo<bar>.png", "\\u003e"),  # `>` escaped
            ("foo&bar.png", "\\u0026"),  # `&` escaped
            # The separators are spelled as `\u2028` / `\u2029` escapes rather
            # than literal characters: they are invisible, and a literal one
            # that gets stripped or normalized by tooling leaves a fixture
            # that no longer contains the character under test.
            ("foo\u2028bar.png", "\\u2028"),  # JS line separator
            ("foo\u2029bar.png", "\\u2029"),  # JS paragraph separator
        ],
    )
    def test_html_significant_chars_escaped(self, value, escaped_substring):
        """The serialized payload carries the `\\uXXXX` escape for each
        wire-escaped character present in the value."""
        encoded = encode_cursor("name", value, "id-1")
        # Cursors are minted without base64 padding; restore it to decode.
        decoded_bytes = base64.urlsafe_b64decode(encoded + "=" * (-len(encoded) % 4))
        # Decoding as ASCII doubles as a check that nothing unescaped and
        # non-ASCII was left in the payload for these inputs.
        assert escaped_substring in decoded_bytes.decode("ascii"), (
            f"Expected {escaped_substring!r} in serialized payload, got: {decoded_bytes!r}"
        )

    def test_value_round_trips_through_escape(self):
        """Encoding then decoding a value with `<>&` should yield the original
        string — the escape only affects the wire form, not the decoded value."""
        original = "foo<&>bar.png"
        encoded = encode_cursor("name", original, "id-1")
        payload = decode_cursor(encoded, ALLOWED)
        assert payload.value == original
class TestByteIdentityFixtures:
    """Guard the cursor wire format against silent drift.

    Every fixture compares the decoded JSON payload with an exact expected
    string — a change in key order, escape choice, separator whitespace, or
    anything else that shifts a byte fails the test loudly instead of
    diverging silently from any external consumer of the same payload format.
    """

    @pytest.mark.parametrize(
        "sort_field, value, id, order, expected_payload",
        [
            (
                "created_at",
                "1716200000000000",
                "a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7",
                "desc",
                '{"s":"created_at","v":"1716200000000000","id":"a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7","o":"desc"}',
            ),
            (
                "size",
                "1024",
                "asset-123",
                "asc",
                '{"s":"size","v":"1024","id":"asset-123","o":"asc"}',
            ),
            (
                "name",
                "my-asset.png",
                "asset-abc",
                "desc",
                '{"s":"name","v":"my-asset.png","id":"asset-abc","o":"desc"}',
            ),
            (
                "name",
                "foo<bar>&baz.png",
                "asset-html",
                "desc",
                # `<`, `>`, `&` appear in the value as their `\u003c`,
                # `\u003e`, `\u0026` escapes.
                '{"s":"name","v":"foo\\u003cbar\\u003e\\u0026baz.png","id":"asset-html","o":"desc"}',
            ),
        ],
    )
    def test_encoded_payload_shape_pinned(self, sort_field, value, id, order, expected_payload):
        """The decoded payload text equals the pinned fixture exactly."""
        token = encode_cursor(sort_field, value, id, order=order)
        # Cursors are minted without base64 padding; put it back to decode.
        padding = "=" * (-len(token) % 4)
        actual_payload = base64.urlsafe_b64decode(token + padding).decode("utf-8")
        assert actual_payload == expected_payload, (
            f"wire format drifted for sort={sort_field!r}, value={value!r}:\n"
            f" expected: {expected_payload!r}\n"
            f" actual: {actual_payload!r}"
        )