diff --git a/app/assets/services/cursor.py b/app/assets/services/cursor.py index 4b95be2df..6c7791528 100644 --- a/app/assets/services/cursor.py +++ b/app/assets/services/cursor.py @@ -10,10 +10,10 @@ so replaying a `desc` cursor against an `asc` request fails with `o` is mandatory on every payload — a cursor without it is rejected as malformed. -Encoding is base64url with no padding. JSON serialization escapes `<`, -`>`, `&`, U+2028, and U+2029 in encoded string values so asset names -containing those characters produce a stable, byte-identical wire form -across any compatible implementation of the same payload format. +Encoding is base64url with no padding. Cursors are opaque tokens: the +payload format is internal to this server, and clients must treat a +cursor as a black box handed back via `next_cursor`. No byte-level +compatibility with any other implementation is required. Time values are serialized as Unix microseconds (UTC) — microsecond precision is sufficient to round-trip the timestamps stored by the @@ -45,10 +45,11 @@ class InvalidCursorError(ValueError): # # MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above # the largest cursor the per-field caps can produce. Worst case is value + id -# at their caps with every character escape-expanding to the six-byte `\uXXXX` -# form, which is ~5.2 KB once base64url-encoded. At 8192 the encoder can never -# mint a cursor that exceeds it, so a freshly minted cursor always decodes on -# the next request and there is no user-visible "cursor too long" failure. +# at their caps with every character JSON-escaping to the six-byte `\uXXXX` +# form (control characters), which is ~5.2 KB once base64url-encoded. At 8192 +# the encoder can never mint a cursor that exceeds it, so a freshly minted +# cursor always decodes on the next request and there is no user-visible +# "cursor too long" failure. MAX_ENCODED_CURSOR_LENGTH = 8192 MAX_CURSOR_VALUE_LENGTH = 512 MAX_CURSOR_ID_LENGTH = 128 @@ -65,27 +66,6 @@ class CursorPayload: _VALID_ORDERS = ("asc", "desc") -def _apply_wire_compatible_json_escapes(raw: str) -> str: - """Escape the characters the cursor wire format requires escaped. - - The wire format escapes `<`, `>`, `&`, U+2028, and U+2029 — and nothing - else, leaving other non-ASCII as literal UTF-8 — so a value carrying any of - them encodes to identical bytes across every compatible implementation of - the payload format. None of these characters appear in JSON structural - syntax, so a global replace on the serialized output can only touch encoded - string values. Explicit `\\uXXXX` escapes for U+2028 / U+2029 keep this - source stable against editor / git tooling that normalizes those invisible - separators. - """ - return ( - raw.replace("<", "\\u003c") - .replace(">", "\\u003e") - .replace("&", "\\u0026") - .replace("\u2028", "\\u2028") - .replace("\u2029", "\\u2029") - ) - - def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str: """Encode a cursor payload as a base64url (no-padding) string. @@ -106,14 +86,10 @@ def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> raise InvalidCursorError("value exceeds maximum length") payload = {"s": sort_field, "v": value, "id": id, "o": order} raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False) - raw = _apply_wire_compatible_json_escapes(raw) - encoded = base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii") # No mint-time length guard is needed: the per-field caps above bound the # encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition), # so the encoder can never produce a cursor the decode path would reject. - # This keeps encoder/decoder symmetry without a user-visible failure when a - # value happens to be multibyte- or escape-heavy. - return encoded + return base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii") def encode_cursor_from_time(sort_field: str, t: datetime, id: str, order: str = "desc") -> str: diff --git a/tests-unit/assets_test/services/test_cursor.py b/tests-unit/assets_test/services/test_cursor.py index 0729b809b..47970e168 100644 --- a/tests-unit/assets_test/services/test_cursor.py +++ b/tests-unit/assets_test/services/test_cursor.py @@ -1,9 +1,8 @@ """Tests for app.assets.services.cursor. -The byte-identity fixtures below pin the wire format so a parallel -implementation in another runtime can mint exchange-compatible cursors -for the same payload. Drift here would break frontend pagination against -any compatible backend. +Cursors are opaque tokens internal to this server — these tests cover +round-tripping, validation, and length caps, not any particular wire +byte layout. """ from __future__ import annotations @@ -37,6 +36,8 @@ class TestRoundTrip: ("size", "1024", "asset-123"), ("name", "my-asset.png", "asset-abc"), ("name", "résumé.txt", "asset-uni"), + ("name", "foo<&>bar.png", "asset-html"), + ("name", 'quo"te\\back\nnewline.png', "asset-esc"), ], ) def test_encode_decode(self, sort_field, value, id): @@ -229,10 +230,11 @@ class TestEncoderDecoderSymmetry: assert payload.value == value def test_escape_heavy_value_at_cap_round_trips(self): - """Escape expansion is the worst case: each `<` serializes to the - six-byte `\\u003c`. A value of 512 of them is the largest a cursor can - get, and it still fits the wire cap, mints, and round-trips.""" - value = "<" * MAX_CURSOR_VALUE_LENGTH + """JSON escape expansion is the worst case: each control character + serializes to the six-byte `\\uXXXX` form. A value of 512 of them is + the largest a cursor can get, and it still fits the wire cap, mints, + and round-trips.""" + value = "\x01" * MAX_CURSOR_VALUE_LENGTH encoded = encode_cursor("name", value, "asset-escape") assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH payload = decode_cursor(encoded, ALLOWED) @@ -274,89 +276,3 @@ class TestOrderBinding: encoded = base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii") with pytest.raises(InvalidCursorError, match="missing or non-string o"): decode_cursor(encoded, ALLOWED, expected_order="desc") - - -class TestHtmlSignificantCharEscaping: - """An asset name containing `<`, `>`, `&`, U+2028, or U+2029 must encode - to the same escaped wire bytes as any compatible implementation of the - same payload format. Drift here breaks cross-runtime byte-identity for - those characters. - """ - - @pytest.mark.parametrize( - "value, escaped_substring", - [ - ("foo.png", "\\u003c"), # `<` escaped - ("foo.png", "\\u003e"), # `>` escaped - ("foo&bar.png", "\\u0026"), - ("foo
bar.png", "\\u2028"), # JS line separator - ("foo
bar.png", "\\u2029"), # JS paragraph separator - ], - ) - def test_html_significant_chars_escaped(self, value, escaped_substring): - encoded = encode_cursor("name", value, "id-1") - decoded_bytes = base64.urlsafe_b64decode(encoded + "=" * (-len(encoded) % 4)) - assert escaped_substring in decoded_bytes.decode("ascii"), ( - f"Expected {escaped_substring!r} in serialized payload, got: {decoded_bytes!r}" - ) - - def test_value_round_trips_through_escape(self): - """Encoding then decoding a value with `<>&` should yield the original - string — the escape only affects the wire form, not the decoded value.""" - original = "foo<&>bar.png" - encoded = encode_cursor("name", original, "id-1") - payload = decode_cursor(encoded, ALLOWED) - assert payload.value == original - - -class TestByteIdentityFixtures: - """Pin the wire format so it doesn't drift silently. - - These fixtures assert exact byte equality of the encoded JSON payload — - a change in key order, escape choice, separator whitespace, or anything - else that shifts a byte fails the test loudly rather than diverging - silently from any external consumer of the same payload format. - """ - - @pytest.mark.parametrize( - "sort_field, value, id, order, expected_payload", - [ - ( - "created_at", - "1716200000000000", - "a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7", - "desc", - '{"s":"created_at","v":"1716200000000000","id":"a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7","o":"desc"}', - ), - ( - "size", - "1024", - "asset-123", - "asc", - '{"s":"size","v":"1024","id":"asset-123","o":"asc"}', - ), - ( - "name", - "my-asset.png", - "asset-abc", - "desc", - '{"s":"name","v":"my-asset.png","id":"asset-abc","o":"desc"}', - ), - ( - "name", - "foo&baz.png", - "asset-html", - "desc", - # `<`, `>`, `&` escape to <, >, & in the value. - '{"s":"name","v":"foo\\u003cbar\\u003e\\u0026baz.png","id":"asset-html","o":"desc"}', - ), - ], - ) - def test_encoded_payload_shape_pinned(self, sort_field, value, id, order, expected_payload): - encoded = encode_cursor(sort_field, value, id, order=order) - decoded_bytes = base64.urlsafe_b64decode(encoded + "=" * (-len(encoded) % 4)) - assert decoded_bytes.decode("utf-8") == expected_payload, ( - f"wire format drifted for sort={sort_field!r}, value={value!r}:\n" - f" expected: {expected_payload!r}\n" - f" actual: {decoded_bytes.decode('utf-8')!r}" - )