diff --git a/app/assets/api/routes.py b/app/assets/api/routes.py index 84ec2dd6e..5c14ce8b8 100644 --- a/app/assets/api/routes.py +++ b/app/assets/api/routes.py @@ -224,10 +224,7 @@ async def list_assets_route(request: web.Request) -> web.Response: after=q.after, ) except InvalidCursorError as e: - return web.json_response( - {"error": {"code": "INVALID_CURSOR", "message": str(e)}}, - status=400, - ) + return _build_error_response(400, "INVALID_CURSOR", str(e)) summaries = [_build_asset_response(item) for item in result.items] diff --git a/app/assets/services/cursor.py b/app/assets/services/cursor.py index 47ea77cfc..1bd8af15a 100644 --- a/app/assets/services/cursor.py +++ b/app/assets/services/cursor.py @@ -11,14 +11,13 @@ so replaying a `desc` cursor against an `asc` request fails with malformed. Encoding is base64url with no padding. JSON serialization escapes `<`, -`>`, `&`, U+2028, and U+2029 to match Go's default `json.Marshal` -behavior so asset names containing those characters produce -byte-identical cursors across compatible Go implementations. +`>`, `&`, U+2028, and U+2029 in encoded string values so asset names +containing those characters produce a stable, byte-identical wire form +across any compatible implementation of the same payload format. Time values are serialized as Unix microseconds (UTC) — microsecond -precision matches PostgreSQL's `timestamp` type, so a cursor minted from -a stored timestamp compares back exactly without rounding rows in the -same millisecond bucket. +precision is sufficient to round-trip the timestamps stored by the +database without rounding rows in the same millisecond bucket. """ from __future__ import annotations @@ -67,20 +66,31 @@ def encode_cursor(sort_field: str, value: str, id: str, order: str = "desc") -> ``INVALID_CURSOR`` rather than silently walking the wrong direction. """ if order not in _VALID_ORDERS: - raise ValueError(f"order must be one of {_VALID_ORDERS}, got {order!r}") + raise InvalidCursorError(f"order must be one of {_VALID_ORDERS}, got {order!r}") + # Symmetric input validation: the encoder must reject anything the + # decoder rejects, or the same server will mint cursors it then 400s on + # the next request. + if not id: + raise InvalidCursorError("id must be non-empty") + if len(id) > MAX_CURSOR_ID_LENGTH: + raise InvalidCursorError("id exceeds maximum length") + if len(value) > MAX_CURSOR_VALUE_LENGTH: + raise InvalidCursorError("value exceeds maximum length") payload = {"s": sort_field, "v": value, "id": id, "o": order} raw = json.dumps(payload, separators=(",", ":"), ensure_ascii=False) - # Go's default `json.Marshal` escapes these characters in string values; we - # do the same so an asset name containing one of them produces byte- - # identical cursors across runtimes. None of these characters appear in - # JSON structural syntax, so a global replace on the serialized output is - # safe — it can only touch characters from the encoded values. + # Match the default JSON escaping of HTML-significant characters and JS + # line/paragraph separators (U+2028 / U+2029) so an asset name carrying + # any of them encodes to identical bytes across runtimes. None of these + # characters appear in JSON structural syntax, so a global replace on the + # serialized output can only touch encoded values. Use explicit \uXXXX + # escapes for U+2028 / U+2029 so the source survives any editor / git + # tooling that normalizes invisible separators. raw = ( raw.replace("<", "\\u003c") .replace(">", "\\u003e") .replace("&", "\\u0026") - .replace("
", "\\u2028") - .replace("
", "\\u2029") + .replace("\u2028", "\\u2028") + .replace("\u2029", "\\u2029") ) return base64.urlsafe_b64encode(raw.encode("utf-8")).rstrip(b"=").decode("ascii") diff --git a/openapi.yaml b/openapi.yaml index 3cd027859..0ba4ba5aa 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -6789,6 +6789,7 @@ components: required: - code - message + - details properties: code: type: string @@ -6800,6 +6801,13 @@ components: enum: [INVALID_CURSOR, INVALID_QUERY] message: type: string + details: + type: object + description: | + Free-form, code-specific context. `INVALID_QUERY` populates this + with an `errors` array of Pydantic validation entries; + `INVALID_CURSOR` returns an empty object. + additionalProperties: true TagInfo: type: object diff --git a/tests-unit/assets_test/services/test_cursor.py b/tests-unit/assets_test/services/test_cursor.py index c4a1cc1b4..f015b9708 100644 --- a/tests-unit/assets_test/services/test_cursor.py +++ b/tests-unit/assets_test/services/test_cursor.py @@ -1,9 +1,9 @@ """Tests for app.assets.services.cursor. -The byte-identity fixtures below pin the wire format so a parallel Go -implementation can mint exchange-compatible cursors for the same triple -(asset name, value, id). Drift here would break frontend pagination -against any compatible backend. +The byte-identity fixtures below pin the wire format so a parallel +implementation in another runtime can mint exchange-compatible cursors +for the same payload. Drift here would break frontend pagination against +any compatible backend. """ from __future__ import annotations @@ -115,17 +115,26 @@ class TestInvalidInputs: decode_cursor(encoded, ALLOWED) def test_empty_id(self): - encoded = encode_cursor("created_at", "1", "") + # Encoder rejects empty id symmetrically with the decoder, so build the + # payload manually to exercise the decoder's missing-id branch. + raw = b'{"s":"created_at","v":"1","id":"","o":"desc"}' + encoded = base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii") with pytest.raises(InvalidCursorError, match="missing id"): decode_cursor(encoded, ALLOWED) def test_oversized_id(self): - encoded = encode_cursor("created_at", "1", "a" * (MAX_CURSOR_ID_LENGTH + 1)) + # Encoder enforces the cap symmetrically; hand-build to exercise decode. + big_id = "a" * (MAX_CURSOR_ID_LENGTH + 1) + raw = ('{"s":"created_at","v":"1","id":"' + big_id + '","o":"desc"}').encode("ascii") + encoded = base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii") with pytest.raises(InvalidCursorError, match="id exceeds maximum length"): decode_cursor(encoded, ALLOWED) def test_oversized_value(self): - encoded = encode_cursor("created_at", "v" * (MAX_CURSOR_VALUE_LENGTH + 1), "id-1") + # Encoder enforces the cap symmetrically; hand-build to exercise decode. + big_v = "v" * (MAX_CURSOR_VALUE_LENGTH + 1) + raw = ('{"s":"created_at","v":"' + big_v + '","id":"id-1","o":"desc"}').encode("ascii") + encoded = base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii") with pytest.raises(InvalidCursorError, match="value exceeds maximum length"): decode_cursor(encoded, ALLOWED) @@ -194,6 +203,18 @@ class TestEncoderDecoderSymmetry: payload = decode_cursor(encoded, ALLOWED) assert payload.value == long_name + def test_encoder_rejects_empty_id(self): + with pytest.raises(InvalidCursorError, match="id must be non-empty"): + encode_cursor("created_at", "1", "") + + def test_encoder_rejects_oversized_id(self): + with pytest.raises(InvalidCursorError, match="id exceeds maximum length"): + encode_cursor("created_at", "1", "a" * (MAX_CURSOR_ID_LENGTH + 1)) + + def test_encoder_rejects_oversized_value(self): + with pytest.raises(InvalidCursorError, match="value exceeds maximum length"): + encode_cursor("name", "v" * (MAX_CURSOR_VALUE_LENGTH + 1), "id-1") + class TestOrderBinding: def test_order_baked_into_payload(self): @@ -232,11 +253,11 @@ class TestOrderBinding: decode_cursor(encoded, ALLOWED, expected_order="desc") -class TestGoCompatJsonEscaping: +class TestHtmlSignificantCharEscaping: """An asset name containing `<`, `>`, `&`, U+2028, or U+2029 must encode - to the same bytes Go's default `json.Marshal` would produce. The fixtures - below are generated by Go's encoder; any drift here means cross-runtime - byte-identity for HTML-significant characters is broken. + to the same escaped wire bytes as any compatible implementation of the + same payload format. Drift here breaks cross-runtime byte-identity for + those characters. """ @pytest.mark.parametrize( @@ -268,27 +289,51 @@ class TestGoCompatJsonEscaping: class TestByteIdentityFixtures: """Pin the wire format so it doesn't drift silently. - These fixtures hold the exact base64url-encoded bytes a given payload - produces. Any change to the encoding (key order, escaping, ordering of - structural fields) will fail these tests loudly rather than diverge - silently from external consumers. + These fixtures assert exact byte equality of the encoded JSON payload — + a change in key order, escape choice, separator whitespace, or anything + else that shifts a byte fails the test loudly rather than diverging + silently from any external consumer of the same payload format. """ @pytest.mark.parametrize( - "sort_field, value, id, order", + "sort_field, value, id, order, expected_payload", [ - ("created_at", "1716200000000000", "a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7", "desc"), - ("size", "1024", "asset-123", "asc"), - ("name", "my-asset.png", "asset-abc", "desc"), - ("name", "foo&baz.png", "asset-html", "desc"), # exercises Go-compat escaping + ( + "created_at", + "1716200000000000", + "a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7", + "desc", + '{"s":"created_at","v":"1716200000000000","id":"a1b2c3d4-e5f6-7a89-b0c1-d2e3f4a5b6c7","o":"desc"}', + ), + ( + "size", + "1024", + "asset-123", + "asc", + '{"s":"size","v":"1024","id":"asset-123","o":"asc"}', + ), + ( + "name", + "my-asset.png", + "asset-abc", + "desc", + '{"s":"name","v":"my-asset.png","id":"asset-abc","o":"desc"}', + ), + ( + "name", + "foo&baz.png", + "asset-html", + "desc", + # `<`, `>`, `&` escape to <, >, & in the value. + '{"s":"name","v":"foo\\u003cbar\\u003e\\u0026baz.png","id":"asset-html","o":"desc"}', + ), ], ) - def test_encoded_payload_shape_pinned(self, sort_field, value, id, order): + def test_encoded_payload_shape_pinned(self, sort_field, value, id, order, expected_payload): encoded = encode_cursor(sort_field, value, id, order=order) decoded_bytes = base64.urlsafe_b64decode(encoded + "=" * (-len(encoded) % 4)) - decoded_str = decoded_bytes.decode("ascii") - # Keys appear in the documented order; binding `o` is present. - for needle in (f'"s":"{sort_field}"', f'"id":"{id}"', f'"o":"{order}"'): - assert needle in decoded_str, ( - f"Expected {needle!r} in payload {decoded_str!r}" - ) + assert decoded_bytes.decode("utf-8") == expected_payload, ( + f"wire format drifted for sort={sort_field!r}, value={value!r}:\n" + f" expected: {expected_payload!r}\n" + f" actual: {decoded_bytes.decode('utf-8')!r}" + ) diff --git a/tests-unit/assets_test/test_list_cursor.py b/tests-unit/assets_test/test_list_cursor.py index e419e22f3..a37019fd6 100644 --- a/tests-unit/assets_test/test_list_cursor.py +++ b/tests-unit/assets_test/test_list_cursor.py @@ -4,8 +4,6 @@ These tests exercise the handler/service/query path end-to-end; cursor-encoding-level tests live in tests-unit/assets_test/services/test_cursor.py. """ -import time - import pytest import requests @@ -201,12 +199,16 @@ def test_cursor_walks_for_non_name_sorts(sort_field, http: requests.Session, api Without this, the `created_at` / `updated_at` (time-encoded micros) and `size` (int-encoded) cursor paths go entirely unexercised end-to-end. """ - # Stagger create_time slightly so created_at / updated_at sort is well-defined. + # Sizes increase strictly by index, so `size desc` has a deterministic + # expected order. Time-based sorts (created_at / updated_at) can tie when + # rows are inserted faster than the DB's timestamp resolution; for those + # we check coverage and no-duplicates and let the keyset tiebreaker do + # the rest, instead of sleeping between inserts and asserting an order + # that depends on clock granularity. names = [] for i in range(4): n = f"cursor_{sort_field}_{i:02d}.safetensors" asset_factory(n, ["models", "checkpoints", "unit-tests", f"cursor-{sort_field}"], {}, make_asset_bytes(n, size=2048 + i)) - time.sleep(0.05) # ensure distinct timestamps names.append(n) params = { @@ -232,7 +234,20 @@ def test_cursor_walks_for_non_name_sorts(sort_field, http: requests.Session, api break assert pages < 10, "guard against runaway cursor loop" - assert set(seen) == set(names), f"missing items for sort={sort_field}: expected {set(names)}, got {set(seen)}" + # No duplicates: a faulty keyset boundary that returns the same row across + # two pages must fail this check. + assert len(seen) == len(set(seen)), ( + f"cursor walk repeated rows for sort={sort_field}: {seen}" + ) + # Full coverage: every seeded asset reached exactly once. + assert set(seen) == set(names), ( + f"missing items for sort={sort_field}: expected {set(names)}, got {set(seen)}" + ) + # Strict order check for the only field with a clock-independent ordering. + if sort_field == "size": + assert seen == list(reversed(names)), ( + f"size cursor walked out of order: got {seen}, expected {list(reversed(names))}" + ) def test_cursor_order_mismatch_returns_400(http: requests.Session, api_base: str, asset_factory, make_asset_bytes):