diff --git a/app/assets/services/cursor.py b/app/assets/services/cursor.py index 41ad99573..4b95be2df 100644 --- a/app/assets/services/cursor.py +++ b/app/assets/services/cursor.py @@ -42,7 +42,14 @@ class InvalidCursorError(ValueError): # MAX_CURSOR_VALUE_LENGTH is 512 to fit the `AssetReference.name` column max # (`String(512)`) — otherwise a long-named asset would mint a cursor the same # server then refuses on the next request. -MAX_ENCODED_CURSOR_LENGTH = 1024 +# +# MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above +# the largest cursor the per-field caps can produce. Worst case is value + id +# at their caps with every character escape-expanding to the six-byte `\uXXXX` +# form, which is ~5.2 KB once base64url-encoded. At 8192 the encoder can never +# mint a cursor that exceeds it, so a freshly minted cursor always decodes on +# the next request and there is no user-visible "cursor too long" failure. +MAX_ENCODED_CURSOR_LENGTH = 8192 MAX_CURSOR_VALUE_LENGTH = 512 MAX_CURSOR_ID_LENGTH = 128 @@ -58,6 +65,27 @@ class CursorPayload: _VALID_ORDERS = ("asc", "desc") +def _apply_wire_compatible_json_escapes(raw: str) -> str: + """Escape the characters the cursor wire format requires escaped. + + Beyond JSON's own escapes, the wire format escapes `<`, `>`, `&`, U+2028, + and U+2029 only, leaving other non-ASCII as literal UTF-8 — so a value + carrying any of them encodes to identical bytes across every compatible + implementation of the payload format. None of these characters appear in + JSON structural syntax, so a global replace on the serialized output can + only touch encoded string values. Explicit `\\uXXXX` escapes for U+2028 / + U+2029 keep this source stable against editor / git tooling that normalizes + those invisible separators. 
+ """ + return ( + raw.replace("<", "\\u003c") + .replace(">", "\\u003e") + .replace("&", "\\u0026") + .replace("\u2028", "\\u2028") + .replace("\u2029", "\\u2029") + ) + + def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str: """Encode a cursor payload as a base64url (no-padding) string. @@ -78,29 +106,13 @@ def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> raise InvalidCursorError("value exceeds maximum length") payload = {"s": sort_field, "v": value, "id": id, "o": order} raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False) - # Match the default JSON escaping of HTML-significant characters and JS - # line/paragraph separators (U+2028 / U+2029) so an asset name carrying - # any of them encodes to identical bytes across runtimes. None of these - # characters appear in JSON structural syntax, so a global replace on the - # serialized output can only touch encoded values. Use explicit \uXXXX - # escapes for U+2028 / U+2029 so the source survives any editor / git - # tooling that normalizes invisible separators. - raw = ( - raw.replace("<", "\\u003c") - .replace(">", "\\u003e") - .replace("&", "\\u0026") - .replace("\u2028", "\\u2028") - .replace("\u2029", "\\u2029") - ) + raw = _apply_wire_compatible_json_escapes(raw) encoded = base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii") - # Final wire-size guard: the per-field caps above are char-counted, but the - # wire cap applies to the base64url of the UTF-8-encoded, escape-expanded - # payload. A value full of multibyte or HTML-significant characters (e.g. - # 512 \u00d7 "\u00e9" or 512 \u00d7 "<") inflates well past MAX_ENCODED_CURSOR_LENGTH even - # though it passes the char-count check. Refuse to mint a cursor the decoder - # on the next request would reject. 
- if len(encoded) > MAX_ENCODED_CURSOR_LENGTH: - raise InvalidCursorError("encoded cursor exceeds maximum length") + # No mint-time length guard is needed: the per-field caps above bound the + # encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition), + # so the encoder can never produce a cursor the decode path would reject. + # This keeps encoder/decoder symmetry without a user-visible failure when a + # value happens to be multibyte- or escape-heavy. return encoded diff --git a/tests-unit/assets_test/services/test_cursor.py b/tests-unit/assets_test/services/test_cursor.py index db114ad47..0729b809b 100644 --- a/tests-unit/assets_test/services/test_cursor.py +++ b/tests-unit/assets_test/services/test_cursor.py @@ -190,8 +190,10 @@ class TestDatetimeOverflow: class TestEncoderDecoderSymmetry: - """The encoder must reject inputs the decoder rejects, or the same server - will mint a cursor it then 400s on the next request. + """The encoder must never mint a cursor the decoder would reject, or the + same server would 400 on a cursor it just handed out. Per-field caps keep + the encoded length below the wire cap, so a freshly minted cursor always + round-trips. """ def test_long_name_within_cap_round_trips(self): @@ -215,20 +217,26 @@ class TestEncoderDecoderSymmetry: with pytest.raises(InvalidCursorError, match="value exceeds maximum length"): encode_cursor("name", "v" * (MAX_CURSOR_VALUE_LENGTH + 1), "id-1") - def test_encoder_rejects_multibyte_value_over_wire_cap(self): - """A value that passes the char-count cap can still inflate past the - wire cap once UTF-8-encoded. Asset name made of 512 × multibyte - characters (e.g. 
'é' = 2 bytes) must be rejected at encode time, not - minted into a cursor the next request will 400.""" - with pytest.raises(InvalidCursorError, match="encoded cursor exceeds maximum length"): - encode_cursor("name", "é" * MAX_CURSOR_VALUE_LENGTH, "asset-multibyte") + def test_multibyte_value_at_cap_round_trips(self): + """A value at the char-count cap made of multibyte characters + (e.g. 'é' = 2 UTF-8 bytes) stays under the wire cap, so it mints and + round-trips — the per-field caps, not a mint-time length check, are + what bound cursor size.""" + value = "é" * MAX_CURSOR_VALUE_LENGTH + encoded = encode_cursor("name", value, "asset-multibyte") + assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH + payload = decode_cursor(encoded, ALLOWED) + assert payload.value == value - def test_encoder_rejects_escape_heavy_value_over_wire_cap(self): - """Same wire-cap concern via escape expansion: each `<` serializes to - the six-byte sequence `\\u003c`, so 512 of them blow past the encoded - cap even though the raw char count is within the per-field limit.""" - with pytest.raises(InvalidCursorError, match="encoded cursor exceeds maximum length"): - encode_cursor("name", "<" * MAX_CURSOR_VALUE_LENGTH, "asset-escape") + def test_escape_heavy_value_at_cap_round_trips(self): + """Escape expansion is the worst case: each `<` serializes to the + six-byte `\\u003c`. 512 of them is the most the value field alone can + inflate a cursor, and it still fits the wire cap and round-trips.""" + value = "<" * MAX_CURSOR_VALUE_LENGTH + encoded = encode_cursor("name", value, "asset-escape") + assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH + payload = decode_cursor(encoded, ALLOWED) + assert payload.value == value class TestOrderBinding: