refactor(assets): drop cross-runtime cursor escaping; cursors are opaque

The custom JSON escaping of <, >, &, U+2028, and U+2029 existed only to
keep the encoded cursor byte-identical with the Cloud implementation of
the same payload format. Cursors are opaque tokens, so byte-level
compatibility across implementations is not needed — plain json.dumps
output is sufficient. Remove the escaping helper and the byte-identity
test fixtures that pinned the wire format; keep round-trip coverage for
the affected characters.
This commit is contained in:
Matt Miller 2026-06-09 20:55:40 -07:00
parent f7558232fa
commit 9341fc6894
2 changed files with 20 additions and 128 deletions

View File

@ -10,10 +10,10 @@ so replaying a `desc` cursor against an `asc` request fails with
`o` is mandatory on every payload — a cursor without it is rejected as
malformed.
Encoding is base64url with no padding. JSON serialization escapes `<`,
`>`, `&`, U+2028, and U+2029 in encoded string values so asset names
containing those characters produce a stable, byte-identical wire form
across any compatible implementation of the same payload format.
Encoding is base64url with no padding. Cursors are opaque tokens: the
payload format is internal to this server, and clients must treat a
cursor as a black box handed back via `next_cursor`. No byte-level
compatibility with any other implementation is required.
Time values are serialized as Unix microseconds (UTC) — microsecond
precision is sufficient to round-trip the timestamps stored by the
@ -45,10 +45,11 @@ class InvalidCursorError(ValueError):
#
# MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above
# the largest cursor the per-field caps can produce. Worst case is value + id
# at their caps with every character escape-expanding to the six-byte `\uXXXX`
# form, which is ~5.2 KB once base64url-encoded. At 8192 the encoder can never
# mint a cursor that exceeds it, so a freshly minted cursor always decodes on
# the next request and there is no user-visible "cursor too long" failure.
# at their caps with every character JSON-escaping to the six-byte `\uXXXX`
# form (control characters), which is ~5.2 KB once base64url-encoded. At 8192
# the encoder can never mint a cursor that exceeds it, so a freshly minted
# cursor always decodes on the next request and there is no user-visible
# "cursor too long" failure.
MAX_ENCODED_CURSOR_LENGTH = 8192
MAX_CURSOR_VALUE_LENGTH = 512
MAX_CURSOR_ID_LENGTH = 128
@ -65,27 +66,6 @@ class CursorPayload:
_VALID_ORDERS = ("asc", "desc")
def _apply_wire_compatible_json_escapes(raw: str) -> str:
"""Escape the characters the cursor wire format requires escaped.
The wire format escapes `<`, `>`, `&`, U+2028, and U+2029 and nothing
else, leaving other non-ASCII as literal UTF-8 so a value carrying any of
them encodes to identical bytes across every compatible implementation of
the payload format. None of these characters appear in JSON structural
syntax, so a global replace on the serialized output can only touch encoded
string values. Explicit `\\uXXXX` escapes for U+2028 / U+2029 keep this
source stable against editor / git tooling that normalizes those invisible
separators.
"""
return (
raw.replace("<", "\\u003c")
.replace(">", "\\u003e")
.replace("&", "\\u0026")
.replace("\u2028", "\\u2028")
.replace("\u2029", "\\u2029")
)
def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str:
"""Encode a cursor payload as a base64url (no-padding) string.
@ -106,14 +86,10 @@ def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") ->
raise InvalidCursorError("value exceeds maximum length")
payload = {"s": sort_field, "v": value, "id": id, "o": order}
raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
raw = _apply_wire_compatible_json_escapes(raw)
encoded = base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii")
# No mint-time length guard is needed: the per-field caps above bound the
# encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition),
# so the encoder can never produce a cursor the decode path would reject.
# This keeps encoder/decoder symmetry without a user-visible failure when a
# value happens to be multibyte- or escape-heavy.
return encoded
return base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii")
def encode_cursor_from_time(sort_field: str, t: datetime, id: str, order: str = "desc") -> str:

View File

@ -1,9 +1,8 @@
"""Tests for app.assets.services.cursor.
The byte-identity fixtures below pin the wire format so a parallel
implementation in another runtime can mint exchange-compatible cursors
for the same payload. Drift here would break frontend pagination against
any compatible backend.
Cursors are opaque tokens internal to this server — these tests cover
round-tripping, validation, and length caps, not any particular wire
byte layout.
"""
from __future__ import annotations
@ -37,6 +36,8 @@ class TestRoundTrip:
("size", "1024", "asset-123"),
("name", "my-asset.png", "asset-abc"),
("name", "résumé.txt", "asset-uni"),
("name", "foo<&>bar.png", "asset-html"),
("name", 'quo"te\\back\nnewline.png', "asset-esc"),
],
)
def test_encode_decode(self, sort_field, value, id):
@ -229,10 +230,11 @@ class TestEncoderDecoderSymmetry:
assert payload.value == value
def test_escape_heavy_value_at_cap_round_trips(self):
"""Escape expansion is the worst case: each `<` serializes to the
six-byte `\\u003c`. A value of 512 of them is the largest a cursor can
get, and it still fits the wire cap, mints, and round-trips."""
value = "<" * MAX_CURSOR_VALUE_LENGTH
"""JSON escape expansion is the worst case: each control character
serializes to the six-byte `\\uXXXX` form. A value of 512 of them is
the largest a cursor can get, and it still fits the wire cap, mints,
and round-trips."""
value = "\x01" * MAX_CURSOR_VALUE_LENGTH
encoded = encode_cursor("name", value, "asset-escape")
assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH
payload = decode_cursor(encoded, ALLOWED)
@ -274,89 +276,3 @@ class TestOrderBinding:
encoded = base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")
with pytest.raises(InvalidCursorError, match="missing or non-string o"):
decode_cursor(encoded, ALLOWED, expected_order="desc")
class TestHtmlSignificantCharEscaping:
    """An asset name containing `<`, `>`, `&`, U+2028, or U+2029 must encode
    to the same escaped wire bytes as any compatible implementation of the
    same payload format. Drift here breaks cross-runtime byte-identity for
    those characters.
    """

    @pytest.mark.parametrize(
        "value, escaped_substring",
        [
            ("foo<bar>.png", "\\u003c"),  # `<` escaped
            ("foo<bar>.png", "\\u003e"),  # `>` escaped
            ("foo&bar.png", "\\u0026"),  # `&` escaped
            # The separators are written as explicit escapes so the inputs
            # stay visible and survive editor / git tooling that normalizes
            # or drops the invisible characters.
            ("foo\u2028bar.png", "\\u2028"),  # JS line separator
            ("foo\u2029bar.png", "\\u2029"),  # JS paragraph separator
        ],
    )
    def test_html_significant_chars_escaped(self, value, escaped_substring):
        """The serialized payload contains the expected escape for `value`."""
        encoded = encode_cursor("name", value, "id-1")
        # Cursors are minted without base64 padding; restore it to decode.
        decoded_bytes = base64.urlsafe_b64decode(encoded + "=" * (-len(encoded) % 4))
        assert escaped_substring in decoded_bytes.decode("ascii"), (
            f"Expected {escaped_substring!r} in serialized payload, got: {decoded_bytes!r}"
        )

    def test_value_round_trips_through_escape(self):
        """Encoding then decoding a value with `<>&` should yield the original
        string: the escape only affects the wire form, not the decoded value."""
        original = "foo<&>bar.png"
        encoded = encode_cursor("name", original, "id-1")
        payload = decode_cursor(encoded, ALLOWED)
        assert payload.value == original
class TestByteIdentityFixtures:
    """Pin the wire format so it doesn't drift silently.

    Each fixture asserts exact equality of the encoded JSON payload: a change
    in key order, escape choice, separator whitespace, or anything else that
    shifts a byte fails the test loudly rather than diverging silently from
    any external consumer of the same payload format.
    """

    @pytest.mark.parametrize(
        "sort_field, value, id, order, expected_payload",
        [
            (
                "created_at",
                "1716200000000000",
                "a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7",
                "desc",
                '{"s":"created_at","v":"1716200000000000","id":"a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7","o":"desc"}',
            ),
            (
                "size",
                "1024",
                "asset-123",
                "asc",
                '{"s":"size","v":"1024","id":"asset-123","o":"asc"}',
            ),
            (
                "name",
                "my-asset.png",
                "asset-abc",
                "desc",
                '{"s":"name","v":"my-asset.png","id":"asset-abc","o":"desc"}',
            ),
            (
                "name",
                "foo<bar>&baz.png",
                "asset-html",
                "desc",
                # `<`, `>`, `&` escape to \u003c, \u003e, \u0026 in the value.
                '{"s":"name","v":"foo\\u003cbar\\u003e\\u0026baz.png","id":"asset-html","o":"desc"}',
            ),
        ],
    )
    def test_encoded_payload_shape_pinned(self, sort_field, value, id, order, expected_payload):
        """The decoded cursor text equals the pinned payload exactly."""
        token = encode_cursor(sort_field, value, id, order=order)
        # Cursors carry no base64 padding; add back what the decoder needs.
        padding = "=" * (-len(token) % 4)
        actual = base64.urlsafe_b64decode(token + padding).decode("utf-8")
        assert actual == expected_payload, (
            f"wire format drifted for sort={sort_field!r}, value={value!r}:\n"
            f" expected: {expected_payload!r}\n"
            f" actual: {actual!r}"
        )