refactor(assets): extract cursor JSON escaping helper; size wire cap above per-field caps

Addresses review feedback on cursor.py: - Extract the inline escape chain into _apply_wire_compatible_json_escapes() with a comment pinning it to the wire format's escape set, so the parity intent is explicit rather than reading as an ad-hoc transform. - Raise MAX_ENCODED_CURSOR_LENGTH to 8192 (comfortably above the ~5.2KB worst-case the per-field caps can produce) and drop the mint-time length guard. Encoder/decoder symmetry now holds by construction: the encoder can't produce a cursor the decode path rejects, so there is no confusing user-visible 'cursor too long' failure at mint time. - Rewrite the two over-wire-cap tests to assert worst-case multibyte and escape-heavy values mint and round-trip, instead of being rejected.
2026-07-01 04:09:24 +08:00 · 2026-06-08 16:56:10 -07:00 · 2026-06-08 16:56:10 -07:00 · f7558232fa
commit f7558232fa
parent 546d7d2eb6
2 changed files with 58 additions and 38 deletions
--- a/app/assets/services/cursor.py
+++ b/app/assets/services/cursor.py
@ -42,7 +42,14 @@ class InvalidCursorError(ValueError):
 # MAX_CURSOR_VALUE_LENGTH is 512 to fit the `AssetReference.name` column max
 # (`String(512)`) — otherwise a long-named asset would mint a cursor the same
 # server then refuses on the next request.
-MAX_ENCODED_CURSOR_LENGTH = 1024
+#
+# MAX_ENCODED_CURSOR_LENGTH is the decode-path guard, sized comfortably above
+# the largest cursor the per-field caps can produce. Worst case is value + id
+# at their caps with every character escape-expanding to the six-byte `\uXXXX`
+# form, which is ~5.2 KB once base64url-encoded. At 8192 the encoder can never
+# mint a cursor that exceeds it, so a freshly minted cursor always decodes on
+# the next request and there is no user-visible "cursor too long" failure.
+MAX_ENCODED_CURSOR_LENGTH = 8192
 MAX_CURSOR_VALUE_LENGTH = 512
 MAX_CURSOR_ID_LENGTH = 128

@ -58,6 +65,27 @@ class CursorPayload:
 _VALID_ORDERS = ("asc", "desc")


+def _apply_wire_compatible_json_escapes(raw: str) -> str:
+    """Escape the characters the cursor wire format requires escaped.
+
+    The wire format escapes `<`, `>`, `&`, U+2028, and U+2029 — and nothing
+    else, leaving other non-ASCII as literal UTF-8 — so a value carrying any of
+    them encodes to identical bytes across every compatible implementation of
+    the payload format. None of these characters appear in JSON structural
+    syntax, so a global replace on the serialized output can only touch encoded
+    string values. Explicit `\\uXXXX` escapes for U+2028 / U+2029 keep this
+    source stable against editor / git tooling that normalizes those invisible
+    separators.
+    """
+    return (
+        raw.replace("<", "\\u003c")
+           .replace(">", "\\u003e")
+           .replace("&", "\\u0026")
+           .replace("\u2028", "\\u2028")
+           .replace("\u2029", "\\u2029")
+    )
+
+
 def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> str:
    """Encode a cursor payload as a base64url (no-padding) string.

@ -78,29 +106,13 @@ def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") ->
        raise InvalidCursorError("value exceeds maximum length")
    payload = {"s": sort_field, "v": value, "id": id, "o": order}
    raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
-    # Match the default JSON escaping of HTML-significant characters and JS
-    # line/paragraph separators (U+2028 / U+2029) so an asset name carrying
-    # any of them encodes to identical bytes across runtimes. None of these
-    # characters appear in JSON structural syntax, so a global replace on the
-    # serialized output can only touch encoded values. Use explicit \uXXXX
-    # escapes for U+2028 / U+2029 so the source survives any editor / git
-    # tooling that normalizes invisible separators.
-    raw = (
-        raw.replace("<", "\\u003c")
-           .replace(">", "\\u003e")
-           .replace("&", "\\u0026")
-           .replace("\u2028", "\\u2028")
-           .replace("\u2029", "\\u2029")
-    )
+    raw = _apply_wire_compatible_json_escapes(raw)
    encoded = base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii")
-    # Final wire-size guard: the per-field caps above are char-counted, but the
-    # wire cap applies to the base64url of the UTF-8-encoded, escape-expanded
-    # payload. A value full of multibyte or HTML-significant characters (e.g.
-    # 512 \u00d7 "\u00e9" or 512 \u00d7 "<") inflates well past MAX_ENCODED_CURSOR_LENGTH even
-    # though it passes the char-count check. Refuse to mint a cursor the decoder
-    # on the next request would reject.
-    if len(encoded) > MAX_ENCODED_CURSOR_LENGTH:
-        raise InvalidCursorError("encoded cursor exceeds maximum length")
+    # No mint-time length guard is needed: the per-field caps above bound the
+    # encoded length well below MAX_ENCODED_CURSOR_LENGTH (see its definition),
+    # so the encoder can never produce a cursor the decode path would reject.
+    # This keeps encoder/decoder symmetry without a user-visible failure when a
+    # value happens to be multibyte- or escape-heavy.
    return encoded


--- a/tests-unit/assets_test/services/test_cursor.py
+++ b/tests-unit/assets_test/services/test_cursor.py
@ -190,8 +190,10 @@ class TestDatetimeOverflow:


 class TestEncoderDecoderSymmetry:
-    """The encoder must reject inputs the decoder rejects, or the same server
-    will mint a cursor it then 400s on the next request.
+    """The encoder must never mint a cursor the decoder would reject, or the
+    same server would 400 on a cursor it just handed out. Per-field caps keep
+    the encoded length below the wire cap, so a freshly minted cursor always
+    round-trips.
    """

    def test_long_name_within_cap_round_trips(self):
@ -215,20 +217,26 @@ class TestEncoderDecoderSymmetry:
        with pytest.raises(InvalidCursorError, match="value exceeds maximum length"):
            encode_cursor("name", "v" * (MAX_CURSOR_VALUE_LENGTH + 1), "id-1")

-    def test_encoder_rejects_multibyte_value_over_wire_cap(self):
-        """A value that passes the char-count cap can still inflate past the
-        wire cap once UTF-8-encoded. Asset name made of 512 × multibyte
-        characters (e.g. 'é' = 2 bytes) must be rejected at encode time, not
-        minted into a cursor the next request will 400."""
-        with pytest.raises(InvalidCursorError, match="encoded cursor exceeds maximum length"):
-            encode_cursor("name", "é" * MAX_CURSOR_VALUE_LENGTH, "asset-multibyte")
+    def test_multibyte_value_at_cap_round_trips(self):
+        """A value at the char-count cap made of multibyte characters
+        (e.g. 'é' = 2 UTF-8 bytes) stays under the wire cap, so it mints and
+        round-trips — the per-field caps, not a mint-time length check, are
+        what bound cursor size."""
+        value = "é" * MAX_CURSOR_VALUE_LENGTH
+        encoded = encode_cursor("name", value, "asset-multibyte")
+        assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH
+        payload = decode_cursor(encoded, ALLOWED)
+        assert payload.value == value

-    def test_encoder_rejects_escape_heavy_value_over_wire_cap(self):
-        """Same wire-cap concern via escape expansion: each `<` serializes to
-        the six-byte sequence `\\u003c`, so 512 of them blow past the encoded
-        cap even though the raw char count is within the per-field limit."""
-        with pytest.raises(InvalidCursorError, match="encoded cursor exceeds maximum length"):
-            encode_cursor("name", "<" * MAX_CURSOR_VALUE_LENGTH, "asset-escape")
+    def test_escape_heavy_value_at_cap_round_trips(self):
+        """Escape expansion is the worst case: each `<` serializes to the
+        six-byte `\\u003c`. A value of 512 of them is the largest a cursor can
+        get, and it still fits the wire cap, mints, and round-trips."""
+        value = "<" * MAX_CURSOR_VALUE_LENGTH
+        encoded = encode_cursor("name", value, "asset-escape")
+        assert len(encoded) <= MAX_ENCODED_CURSOR_LENGTH
+        payload = decode_cursor(encoded, ALLOWED)
+        assert payload.value == value


 class TestOrderBinding: