mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-01-10 06:10:50 +08:00
1256 lines
53 KiB
Python
1256 lines
53 KiB
Python
"""
|
|
Integration tests for distributed tracing across RabbitMQ and services.
|
|
|
|
These tests validate that trace context propagates correctly from frontend
|
|
to backend workers through RabbitMQ, and that Jaeger can reconstruct the
|
|
full distributed trace.
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
import uuid
|
|
|
|
import pytest
|
|
import requests
|
|
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
|
from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
|
|
from opentelemetry.sdk.resources import Resource
|
|
from opentelemetry.sdk.trace import TracerProvider
|
|
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
from opentelemetry.semconv.attributes import service_attributes
|
|
from testcontainers.core.container import DockerContainer
|
|
from testcontainers.core.waiting_utils import wait_for_logs
|
|
from testcontainers.nginx import NginxContainer
|
|
from testcontainers.rabbitmq import RabbitMqContainer
|
|
|
|
from comfy.client.sdxl_with_refiner_workflow import sdxl_workflow_with_refiner
|
|
|
|
# Emit INFO-level records so fixture and trace diagnostics show up in test output.
logging.basicConfig(level=logging.INFO)
# Module-scoped logger used by the fixtures, helpers and tests in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class JaegerContainer(DockerContainer):
    """Testcontainer for Jaeger all-in-one with OTLP support.

    Three container ports are published: the query UI/API, the OTLP HTTP
    receiver and the Jaeger HTTP collector.
    """

    # Container-side ports published by this test container.
    _QUERY_PORT = 16686  # UI
    _OTLP_HTTP_PORT = 4318  # OTLP HTTP
    _JAEGER_HTTP_PORT = 14268  # Jaeger HTTP

    def __init__(self, image: str = "jaegertracing/all-in-one:latest"):
        """Configure the container image, published ports and OTLP ingestion.

        Args:
            image: Docker image reference to run.
        """
        super().__init__(image)
        self.with_exposed_ports(
            self._QUERY_PORT,
            self._OTLP_HTTP_PORT,
            self._JAEGER_HTTP_PORT,
        )
        self.with_env("COLLECTOR_OTLP_ENABLED", "true")

    def _mapped_http_url(self, container_port: int) -> str:
        """Return ``http://<host>:<mapped port>`` for a published container port."""
        host = self.get_container_host_ip()
        port = self.get_exposed_port(container_port)
        return f"http://{host}:{port}"

    def get_query_url(self) -> str:
        """Get Jaeger Query API URL."""
        return self._mapped_http_url(self._QUERY_PORT)

    def get_otlp_endpoint(self) -> str:
        """Get OTLP HTTP endpoint for sending traces."""
        return self._mapped_http_url(self._OTLP_HTTP_PORT)

    def start(self):
        """Start the container and wait for Jaeger's startup log line.

        Returns:
            This container, so calls can be chained.
        """
        super().start()
        # Waits (up to 30 s) for the log line that announces the gRPC server.
        wait_for_logs(self, ".*Starting GRPC server.*", timeout=30)
        return self
|
|
|
|
|
|
@pytest.fixture(scope="function")
def nginx_proxy(frontend_backend_worker_with_rabbitmq):
    """
    Provide an nginx proxy in front of the ComfyUI frontend.

    This tests if nginx is blocking W3C trace context propagation.

    The proxy runs in a Docker container and forwards every request to the
    frontend's port on the Docker bridge gateway address, which is how a
    container reaches the host on Linux. Only the port of the frontend URL is
    used for the upstream.

    Args:
        frontend_backend_worker_with_rabbitmq: Base URL of the frontend,
            e.g. "http://127.0.0.1:19001".

    Yields:
        Base URL of the nginx proxy as seen from the test process.

    Raises:
        ValueError: If the frontend URL cannot be parsed.
    """
    import re
    import subprocess

    frontend_url = frontend_backend_worker_with_rabbitmq
    # frontend_url is like "http://127.0.0.1:19001"
    match = re.match(r'http://([^:]+):(\d+)', frontend_url)
    if not match:
        raise ValueError(f"Could not parse frontend URL: {frontend_url}")

    frontend_port = match.group(2)
    nginx_port = 8085

    # Get the Docker bridge gateway IP (this is how containers reach the host on Linux)
    fallback_gateway = "172.17.0.1"  # Default Docker bridge gateway on Linux
    try:
        result = subprocess.run(
            ["docker", "network", "inspect", "bridge", "-f", "{{range .IPAM.Config}}{{.Gateway}}{{end}}"],
            capture_output=True,
            text=True,
            check=True,
            timeout=5,  # never let a wedged docker CLI hang the test session
        )
    except (subprocess.SubprocessError, OSError) as e:
        docker_gateway = fallback_gateway
        logger.warning(f"Could not detect Docker gateway, using default: {docker_gateway} ({e})")
    else:
        docker_gateway = result.stdout.strip()
        if docker_gateway:
            logger.info(f"Using Docker gateway IP: {docker_gateway}")
        else:
            # The command succeeded but printed nothing; an empty upstream
            # host would produce an invalid nginx config.
            docker_gateway = fallback_gateway
            logger.warning(f"Docker reported no bridge gateway, using default: {docker_gateway}")

    # Create nginx config that proxies to the frontend. No header is removed
    # here, so trace headers are expected to be forwarded unchanged.
    nginx_conf = f"""
events {{
    worker_connections 1024;
}}

http {{
    upstream backend {{
        server {docker_gateway}:{frontend_port};
    }}

    server {{
        listen {nginx_port};

        location / {{
            proxy_pass http://backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }}
    }}
}}
"""

    # Write config to a temporary file (delete=False: it must outlive this
    # `with` so it can be mounted into the container; removed in `finally`).
    with tempfile.NamedTemporaryFile(mode='w', suffix='.conf', delete=False) as f:
        f.write(nginx_conf)
        nginx_conf_path = f.name

    nginx = None
    try:
        # Start nginx container with the config
        nginx = NginxContainer(port=nginx_port)
        nginx.with_volume_mapping(nginx_conf_path, "/etc/nginx/nginx.conf")
        nginx.start()

        # Get the nginx URL
        host = nginx.get_container_host_ip()
        port = nginx.get_exposed_port(nginx_port)
        nginx_url = f"http://{host}:{port}"

        logger.info(f"Nginx proxy started at {nginx_url} -> {frontend_url}")

        # Wait for nginx to be ready: any HTTP response at all means it is up.
        for _ in range(30):
            try:
                requests.get(nginx_url, timeout=1)
            except requests.RequestException:
                time.sleep(0.5)
            else:
                break
        else:
            # Best effort: let the test itself report the failure with context.
            logger.warning(f"Nginx proxy at {nginx_url} did not respond during startup wait")

        yield nginx_url
    finally:
        try:
            # `nginx` is still None when the constructor itself failed.
            if nginx is not None:
                nginx.stop()
        finally:
            # Remove the temp config even if stopping the container raised.
            os.unlink(nginx_conf_path)
|
|
|
|
|
|
@pytest.fixture(scope="module")
def jaeger_container():
    """
    Provide a Jaeger container for collecting traces.

    This fixture automatically sets OTEL_EXPORTER_OTLP_ENDPOINT to point to the
    Jaeger container, and restores the previous value (or removes the
    variable) before the container is stopped.

    Yields:
        The started JaegerContainer.
    """
    container = JaegerContainer()
    try:
        # Started inside the `try` so the container is also stopped when
        # start-up or the readiness wait below raises.
        container.start()

        query_url = container.get_query_url()
        otlp_endpoint = container.get_otlp_endpoint()

        # Wait for Jaeger to be fully ready
        for _ in range(30):
            try:
                # The timeout keeps one stuck connection from blocking forever.
                response = requests.get(f"{query_url}/api/services", timeout=5)
            except requests.RequestException:
                pass
            else:
                if response.status_code == 200:
                    logger.info(f"Jaeger ready at {query_url}")
                    logger.info(f"OTLP endpoint: {otlp_endpoint}")
                    break
            time.sleep(1)
        else:
            # Best effort: continue, but leave a trace of why tests may fail.
            logger.warning(f"Jaeger query API at {query_url} did not become ready during startup wait")

        # Set OTEL_EXPORTER_OTLP_ENDPOINT for the duration of the test
        old_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
        os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = otlp_endpoint
        logger.info(f"Set OTEL_EXPORTER_OTLP_ENDPOINT={otlp_endpoint}")

        try:
            yield container
        finally:
            # Restore original OTEL_EXPORTER_OTLP_ENDPOINT
            if old_endpoint is not None:
                os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = old_endpoint
                logger.info(f"Restored OTEL_EXPORTER_OTLP_ENDPOINT={old_endpoint}")
            else:
                os.environ.pop("OTEL_EXPORTER_OTLP_ENDPOINT", None)
                logger.info("Removed OTEL_EXPORTER_OTLP_ENDPOINT")
    finally:
        container.stop()
|
|
|
|
|
|
def query_jaeger_traces(jaeger_url: str, service: str, operation: "str | None" = None,
                        lookback: str = "1h", limit: int = 100,
                        timeout: float = 10.0) -> dict:
    """
    Query Jaeger for traces.

    Args:
        jaeger_url: Base URL of Jaeger query service
        service: Service name to query
        operation: Optional operation name filter
        lookback: Lookback period (e.g., "1h", "30m")
        limit: Maximum number of traces to return
        timeout: Seconds to wait for the HTTP request before giving up

    Returns:
        JSON response from Jaeger API

    Raises:
        requests.HTTPError: If Jaeger answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    params = {
        "service": service,
        "lookback": lookback,
        "limit": limit,
    }
    if operation:
        params["operation"] = operation

    # Without a timeout a stalled Jaeger would hang the whole test run.
    response = requests.get(f"{jaeger_url}/api/traces", params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def find_trace_by_operation(traces_response: dict, operation_name: str) -> "dict | None":
    """Find a specific trace by operation name.

    Args:
        traces_response: Response dict whose "data" entry is a list of traces,
            each holding a "spans" list.
        operation_name: Value to match against each span's "operationName".

    Returns:
        The first trace containing a span with that operation name, or None
        when no trace matches (including when the response is empty or its
        "data"/"spans" entries are missing or null).
    """
    # `or` fallbacks also cover keys that are present but set to None,
    # which `.get(key, [])` alone would pass through to the loop.
    for candidate in (traces_response or {}).get("data") or []:
        for span in candidate.get("spans") or []:
            if span.get("operationName") == operation_name:
                return candidate
    return None
|
|
|
|
|
|
def verify_trace_continuity(trace: dict, expected_services: list[str]) -> bool:
    """
    Check that one trace covers the expected services and links its spans.

    The trace passes when all of the following hold:
      * it has at least one span,
      * every name in ``expected_services`` appears as the ``serviceName`` of
        a process referenced by some span,
      * all spans carry the same ``traceID``,
      * at least one span has a ``CHILD_OF`` reference to another span of
        this trace.

    Args:
        trace: Jaeger trace object
        expected_services: List of service names expected in the trace

    Returns:
        True if every check above passes, otherwise False.
    """
    spans = trace.get("spans", []) if trace else []
    if not spans:
        return False

    # Resolve each span's process to its service name.
    trace_services = set()
    for span in spans:
        process_id = span.get("processID")
        if not process_id:
            continue
        service_name = trace.get("processes", {}).get(process_id, {}).get("serviceName")
        if service_name:
            trace_services.add(service_name)

    logger.info(f"Trace contains services: {trace_services}")
    logger.info(f"Expected services: {set(expected_services)}")

    # Only the first missing service (in the caller's order) is reported.
    missing = [name for name in expected_services if name not in trace_services]
    if missing:
        service = missing[0]
        logger.warning(f"Expected service '{service}' not found in trace")
        return False

    # Every span must belong to one and the same trace.
    trace_ids = {span.get("traceID") for span in spans}
    if len(trace_ids) != 1:
        logger.warning(f"Multiple trace IDs found: {trace_ids}")
        return False

    # Look for CHILD_OF references whose parent is a span of this trace.
    known_span_ids = {span.get("spanID") for span in spans}
    linked = False
    for span in spans:
        for ref in span.get("references", []):
            if ref.get("refType") != "CHILD_OF":
                continue
            parent_span_id = ref.get("spanID")
            if parent_span_id not in known_span_ids:
                continue
            linked = True
            logger.info(f"Found parent-child relationship: {parent_span_id} -> {span.get('spanID')}")

    if linked:
        return True

    logger.warning("No parent-child relationships found in trace")
    return False
|
|
|
|
|
|
# order matters, execute jaeger_container first
@pytest.mark.skip
@pytest.mark.asyncio
async def test_tracing_integration(jaeger_container, nginx_proxy):
    """
    Integration test for distributed tracing across services with nginx proxy.

    This test:
    1. Starts ComfyUI frontend and worker with RabbitMQ
    2. Starts nginx proxy in front of the frontend to test trace context propagation through nginx
    3. Configures OTLP export to Jaeger testcontainer
    4. Submits a workflow through the nginx proxy
    5. Queries Jaeger to verify trace propagation
    6. Validates that the trace spans multiple services with proper relationships

    This specifically tests if nginx is blocking W3C trace context (traceparent/tracestate headers).

    Args:
        jaeger_container: Started Jaeger container fixture.
        nginx_proxy: Base URL of the nginx proxy in front of the frontend.
    """
    # Imported under an alias so that variables holding Jaeger trace objects
    # can never rebind the OpenTelemetry API module inside this function.
    from opentelemetry import trace as otel_trace

    server_address = nginx_proxy
    jaeger_url = jaeger_container.get_query_url()
    otlp_endpoint = jaeger_container.get_otlp_endpoint()

    logger.info(f"Frontend server: {server_address}")
    logger.info(f"Jaeger UI: {jaeger_url}")
    logger.info(f"OTLP endpoint: {otlp_endpoint}")

    # Set up tracing for the async HTTP client
    resource = Resource.create({
        service_attributes.SERVICE_NAME: "comfyui-client",
    })
    provider = TracerProvider(resource=resource)
    exporter = OTLPSpanExporter(endpoint=f"{otlp_endpoint}/v1/traces")
    provider.add_span_processor(BatchSpanProcessor(exporter))
    otel_trace.set_tracer_provider(provider)

    # Instrument aiohttp client
    AioHttpClientInstrumentor().instrument()

    # we have to call this very late, so that the instrumentation isn't initialized too early
    from comfy.client.aio_client import AsyncRemoteComfyClient

    # Note: In a real integration test, you'd need to configure the ComfyUI
    # services to export traces to this Jaeger instance. For now, this test
    # documents the expected behavior.

    # Create a unique prompt to identify our trace
    test_id = str(uuid.uuid4())[:8]
    prompt = sdxl_workflow_with_refiner(f"test_trace_{test_id}", inference_steps=1, refiner_steps=1)

    # Get the tracer for the client
    client_tracer = otel_trace.get_tracer("test_tracing_integration")

    # Submit the workflow - wrap in a span to capture the trace ID
    with client_tracer.start_as_current_span("submit_workflow") as workflow_span:
        trace_id = format(workflow_span.get_span_context().trace_id, '032x')
        logger.info(f"Started trace with trace_id: {trace_id}")

        async with AsyncRemoteComfyClient(server_address=server_address) as client:
            logger.info(f"Submitting workflow with test_id: {test_id}")

            # Queue the prompt with async response
            task_id = await client.queue_and_forget_prompt_api(prompt, prefer_header="respond-async")
            assert task_id is not None, "Failed to get task ID"

            logger.info(f"Queued task: {task_id}")

            # Poll for completion
            status_code, result = await client.poll_prompt_until_done(task_id, max_attempts=60, poll_interval=1.0)
            assert status_code == 200, f"Task failed with status {status_code}"
            logger.info("Task completed successfully")

    # The batch processor exports on a timer, so push the client spans out
    # explicitly instead of racing that timer with the sleep below.
    provider.force_flush()

    # Give Jaeger time to receive and process spans
    await asyncio.sleep(5)

    # Query Jaeger for traces
    # Note: The actual service names depend on how your services are configured
    # Common service names might be: "slack-bot", "comfyui-frontend", "comfyui-worker"
    expected_services = ["comfyui", "comfyui-client"]  # Adjust based on actual service names

    logger.info(f"Querying Jaeger for traces with trace_id: {trace_id}...")

    # First, try to find our specific trace by trace_id, one service at a time
    our_trace = None
    for service in expected_services:
        try:
            traces_response = query_jaeger_traces(jaeger_url, service, lookback="5m")
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            logger.warning(f"Could not query traces for service '{service}': {e}")
            continue

        candidates = traces_response.get("data") or []
        if candidates:
            logger.info(f"Found {len(candidates)} traces for service '{service}'")
        our_trace = next((c for c in candidates if c.get("traceID") == trace_id), None)
        if our_trace is not None:
            logger.info(f"Found our trace in service '{service}'")
            break

    # Assert we can find the trace we just created
    assert our_trace is not None, (
        f"Could not find trace with trace_id {trace_id} in Jaeger. "
        f"This indicates that spans from comfyui-client are not being exported correctly."
    )

    logger.info(f"Successfully found trace with trace_id {trace_id}")

    spans = our_trace.get("spans", [])
    processes = our_trace.get("processes", {})

    # Extract services from the trace
    trace_services = set()
    for span in spans:
        process_id = span.get("processID")
        if process_id:
            service_name = processes.get(process_id, {}).get("serviceName")
            if service_name:
                trace_services.add(service_name)

    logger.info(f"Services found in trace: {trace_services}")

    # Assert that comfyui-client service is present (since we instrumented it)
    assert "comfyui-client" in trace_services, (
        f"Expected 'comfyui-client' service in trace, but found only: {trace_services}. "
        f"This indicates the client instrumentation is not working."
    )

    # Validate trace structure
    logger.info(f"Analyzing trace with {len(spans)} spans")

    # Log all spans for debugging
    for span in spans:
        process = processes.get(span.get("processID"), {})
        service_name = process.get("serviceName", "unknown")
        operation = span.get("operationName", "unknown")
        logger.info(f"  Span: {service_name}.{operation}")

    # The server-side service must be part of the same trace before the
    # parent-child structure is checked.
    assert "comfyui" in trace_services, (
        f"Expected 'comfyui' service in trace, but found only: {trace_services}."
    )
    is_continuous = verify_trace_continuity(our_trace, expected_services)

    # This assertion documents what SHOULD happen when distributed tracing works
    assert is_continuous, (
        "Trace does not show proper distributed tracing. "
        "Expected to see spans from multiple services with parent-child relationships. "
        "This indicates that trace context is not being propagated correctly through RabbitMQ."
    )
|
|
|
|
@pytest.mark.asyncio
async def test_trace_context_in_http_headers(frontend_backend_worker_with_rabbitmq):
    """
    Test that the frontend accepts requests carrying a traceparent header.

    A plain request to /system_stats is made first as a baseline, so that a
    failure of the traceparent request can be attributed to the header and
    not to the endpoint itself.

    Args:
        frontend_backend_worker_with_rabbitmq: Base URL of the frontend.
    """
    server_address = frontend_backend_worker_with_rabbitmq
    stats_url = f"{server_address}/system_stats"
    request_timeout = 10  # seconds; keeps a stalled server from hanging the test

    # Baseline request without any trace headers; the response headers are
    # logged for manual inspection of whether the server is trace-aware.
    response = requests.get(stats_url, timeout=request_timeout)
    logger.info(f"Response headers: {dict(response.headers)}")
    assert response.status_code == 200, (
        f"Baseline request to {stats_url} failed with status {response.status_code}"
    )

    # The server should be instrumented and may include trace context in responses
    # or at minimum, should accept traceparent headers in requests

    # Test sending a traceparent header
    test_traceparent = "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"
    response_with_trace = requests.get(
        stats_url,
        headers={"traceparent": test_traceparent},
        timeout=request_timeout,
    )

    # Should not error when traceparent is provided
    assert response_with_trace.status_code == 200, "Server should accept traceparent header"

    logger.info("✓ Server accepts traceparent headers in HTTP requests")
|
|
|
|
|
|
@pytest.mark.asyncio
@pytest.mark.skip
async def test_multiple_requests_different_traces(frontend_backend_worker_with_rabbitmq, jaeger_container):
    """
    Check that independent requests end up in separate traces.

    Three workflows are queued and awaited, then Jaeger is asked for recent
    "comfyui" traces; at least two distinct trace IDs must be present.
    """
    from comfy.client.aio_client import AsyncRemoteComfyClient

    server_address = frontend_backend_worker_with_rabbitmq
    task_ids = []

    async with AsyncRemoteComfyClient(server_address=server_address) as client:
        # Queue every workflow first ...
        for i in range(3):
            prompt = sdxl_workflow_with_refiner(f"test_{i}", inference_steps=1, refiner_steps=1)
            task_id = await client.queue_and_forget_prompt_api(prompt, prefer_header="respond-async")
            task_ids.append(task_id)
            logger.info(f"Queued task {i}: {task_id}")

        # ... then wait for each of them in submission order.
        for i, task_id in enumerate(task_ids):
            status_code, _ = await client.poll_prompt_until_done(task_id, max_attempts=60, poll_interval=1.0)
            assert status_code == 200, f"Task {i} failed"
            logger.info(f"Task {i} completed")

    # Give Jaeger time to receive spans
    await asyncio.sleep(5)

    # Fetch the recent traces recorded for the service.
    jaeger_url = jaeger_container.get_query_url()
    traces = query_jaeger_traces(jaeger_url, "comfyui", lookback="5m", limit=10).get("data", [])

    assert len(traces) >= 2

    # Collapse to the distinct trace IDs.
    unique_trace_ids = {entry.get("traceID") for entry in traces}

    logger.info(f"Found {len(unique_trace_ids)} unique traces")

    assert len(unique_trace_ids) >= 2, (
        f"Expected at least 2 distinct traces, found {len(unique_trace_ids)}. "
        "Each request should create its own trace."
    )
|
|
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_trace_contains_rabbitmq_operations(frontend_backend_worker_with_rabbitmq, jaeger_container):
    """
    Check that recorded traces contain RabbitMQ publish/consume operations.

    This is critical for distributed tracing - the RabbitMQ operations
    are what link the frontend and backend spans together.

    A span counts as RabbitMQ-related when its lower-cased operation name
    contains one of a fixed list of publish/consume style keywords.
    """
    from comfy.client.aio_client import AsyncRemoteComfyClient

    server_address = frontend_backend_worker_with_rabbitmq
    jaeger_url = jaeger_container.get_query_url()

    # Run one workflow to completion so there is something to trace.
    async with AsyncRemoteComfyClient(server_address=server_address) as client:
        prompt = sdxl_workflow_with_refiner("test_rmq", inference_steps=1, refiner_steps=1)
        task_id = await client.queue_and_forget_prompt_api(prompt)
        status_code, _ = await client.poll_prompt_until_done(task_id, max_attempts=60)
        assert status_code == 200

    # Leave time for spans to arrive in Jaeger before querying.
    await asyncio.sleep(5)

    traces = query_jaeger_traces(jaeger_url, "comfyui", lookback="5m").get("data", [])

    # Substrings that mark an operation name as RabbitMQ-related.
    rabbitmq_operations = [
        "publish", "consume", "amq_queue_publish", "amq_queue_consume",
        "amq.basic.publish", "amq.basic.consume", "send", "receive"
    ]

    # One entry per (span, matching keyword) pair, across all traces.
    found_rabbitmq_ops = [
        op_name
        for entry in traces
        for op_name in (span.get("operationName", "").lower() for span in entry.get("spans", []))
        for keyword in rabbitmq_operations
        if keyword in op_name
    ]

    assert found_rabbitmq_ops, "No RabbitMQ-related operations found in traces"
|
|
|
|
@pytest.mark.skip
|
|
@pytest.mark.asyncio
|
|
@pytest.mark.parametrize("docker_image,otlp_endpoint,jaeger_url", [
|
|
pytest.param(
|
|
"ghcr.io/hiddenswitch/comfyui:latest",
|
|
None, # Will use jaeger_container
|
|
None, # Will use jaeger_container
|
|
id="test-containers"
|
|
),
|
|
# pytest.param(
|
|
# "ghcr.io/hiddenswitch/comfyui:latest",
|
|
# "http://10.152.184.34:4318", # otlp-collector IP
|
|
# "http://10.152.184.50:16686", # jaeger-production-query IP
|
|
# id="production-infrastructure"
|
|
# ),
|
|
])
|
|
async def test_full_docker_stack_trace_propagation(
|
|
jaeger_container,
|
|
docker_image,
|
|
otlp_endpoint,
|
|
jaeger_url
|
|
):
|
|
"""
|
|
Full integration test with frontend and backend running as Docker containers.
|
|
|
|
This test mirrors the production setup to diagnose trace context propagation issues:
|
|
1. Starts RabbitMQ container
|
|
2. Uses Jaeger container OR production infrastructure (via parametrization)
|
|
3. Starts backend worker container(s) with comfyui-worker
|
|
4. Starts frontend container with comfyui
|
|
5. Submits a workflow
|
|
6. Verifies that trace context propagates from frontend -> RabbitMQ -> backend
|
|
|
|
The test is parameterized to run with:
|
|
- test-containers: Uses ephemeral Jaeger container
|
|
- production-infrastructure: Uses cluster's Jaeger/OTLP services
|
|
"""
|
|
use_production = otlp_endpoint is not None
|
|
|
|
if use_production:
|
|
logger.info("=" * 80)
|
|
logger.info("Using PRODUCTION infrastructure:")
|
|
logger.info(f" Docker image: {docker_image}")
|
|
logger.info(f" OTLP endpoint: {otlp_endpoint}")
|
|
logger.info(f" Jaeger query URL: {jaeger_url}")
|
|
logger.info("=" * 80)
|
|
else:
|
|
# Use test container
|
|
jaeger_url = jaeger_container.get_query_url()
|
|
otlp_endpoint = jaeger_container.get_otlp_endpoint()
|
|
otlp_port = jaeger_container.get_exposed_port(4318)
|
|
|
|
logger.info("=" * 80)
|
|
logger.info("Using TEST container infrastructure:")
|
|
logger.info(f" Docker image: {docker_image}")
|
|
logger.info(f" OTLP endpoint: {otlp_endpoint}")
|
|
logger.info(f" Jaeger query URL: {jaeger_url}")
|
|
logger.info("=" * 80)
|
|
|
|
# Get Docker bridge gateway for container-to-host communication
|
|
if not use_production:
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "network", "inspect", "bridge", "-f", "{{(index .IPAM.Config 0).Gateway}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
check=True,
|
|
timeout=5
|
|
)
|
|
docker_host = result.stdout.strip()
|
|
if not docker_host:
|
|
docker_host = "host.docker.internal"
|
|
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
|
|
docker_host = "host.docker.internal"
|
|
|
|
logger.info(f"Docker host for container-to-host communication: {docker_host}")
|
|
|
|
# Set OTLP endpoint accessible from containers
|
|
otlp_endpoint_container = f"http://{docker_host}:{otlp_port}"
|
|
else:
|
|
# Production services are accessible directly by DNS
|
|
otlp_endpoint_container = otlp_endpoint
|
|
|
|
# Get docker_host for RabbitMQ connectivity
|
|
try:
|
|
result = subprocess.run(
|
|
["docker", "network", "inspect", "bridge", "-f", "{{(index .IPAM.Config 0).Gateway}}"],
|
|
capture_output=True,
|
|
text=True,
|
|
check=True,
|
|
timeout=5
|
|
)
|
|
docker_host = result.stdout.strip()
|
|
if not docker_host:
|
|
docker_host = "host.docker.internal"
|
|
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
|
|
docker_host = "host.docker.internal"
|
|
|
|
with RabbitMqContainer("rabbitmq:latest") as rabbitmq:
|
|
rabbitmq_params = rabbitmq.get_connection_params()
|
|
rabbitmq_uri_container = f"amqp://guest:guest@{docker_host}:{rabbitmq_params.port}"
|
|
logger.info(f"RabbitMQ URI (from containers): {rabbitmq_uri_container}")
|
|
|
|
# Start backend workers (similar to production StatefulSet)
|
|
backend_containers = []
|
|
num_backends = 2
|
|
use_gpu = False # Run in CPU mode for tests
|
|
|
|
for i in range(num_backends):
|
|
backend = DockerContainer(docker_image)
|
|
backend.with_exposed_ports(9090) # health check port
|
|
|
|
backend_env = {
|
|
"OTEL_SERVICE_NAME": "comfyui", # Use same service name as frontend
|
|
"OTEL_EXPORTER_OTLP_ENDPOINT": otlp_endpoint_container,
|
|
"OTEL_METRICS_EXPORTER": "none",
|
|
"OTEL_LOGS_EXPORTER": "none",
|
|
# Configure BatchSpanProcessor to avoid silently dropping spans
|
|
"OTEL_BSP_MAX_QUEUE_SIZE": "10000", # Increased from default 2048
|
|
"OTEL_BSP_SCHEDULE_DELAY": "500", # Decreased from default 5000ms
|
|
"OTEL_BSP_MAX_EXPORT_BATCH_SIZE": "512",
|
|
"OTEL_BSP_EXPORT_TIMEOUT": "30000",
|
|
"COMFYUI_DISTRIBUTED_QUEUE_CONNECTION_URI": rabbitmq_uri_container,
|
|
"COMFYUI_EXECUTOR_FACTORY": "ThreadPoolExecutor",
|
|
"COMFYUI_LOGGING_LEVEL": "INFO",
|
|
}
|
|
|
|
for key, value in backend_env.items():
|
|
backend.with_env(key, value)
|
|
|
|
if use_gpu:
|
|
logger.info(f"Configuring backend {i+1} with GPU support")
|
|
# Add GPU support for backends
|
|
backend.with_kwargs(
|
|
device_requests=[
|
|
{
|
|
"Driver": "nvidia",
|
|
"Count": -1, # -1 means all GPUs
|
|
"Capabilities": [["gpu"]],
|
|
}
|
|
]
|
|
)
|
|
# Use comfyui-worker command like in production
|
|
# ENV vars are automatically picked up by cli_args
|
|
backend.with_command("comfyui-worker")
|
|
else:
|
|
logger.info(f"Configuring backend {i+1} with CPU mode (set USE_GPU=true for GPU support)")
|
|
# Run in CPU mode without GPU
|
|
# ENV vars are automatically picked up by cli_args
|
|
backend.with_command("comfyui-worker --cpu")
|
|
|
|
backend.start()
|
|
backend_containers.append(backend)
|
|
logger.info(f"Started backend worker {i+1}/{num_backends}")
|
|
|
|
try:
|
|
# Wait for backends to be ready
|
|
time.sleep(5)
|
|
|
|
# Verify backend health
|
|
for i, backend in enumerate(backend_containers):
|
|
backend_host = backend.get_container_host_ip()
|
|
backend_port = backend.get_exposed_port(9090)
|
|
health_url = f"http://{backend_host}:{backend_port}/health"
|
|
|
|
healthy = False
|
|
for attempt in range(10):
|
|
try:
|
|
response = requests.get(health_url, timeout=2)
|
|
if response.status_code == 200:
|
|
healthy = True
|
|
logger.info(f"Backend {i+1} is healthy at {health_url}")
|
|
break
|
|
except Exception as e:
|
|
logger.debug(f"Backend {i+1} health check attempt {attempt+1}: {e}")
|
|
time.sleep(2)
|
|
|
|
if not healthy:
|
|
logger.warning(f"Backend {i+1} health check failed, but continuing...")
|
|
|
|
# Start frontend container (similar to production Deployment)
|
|
frontend = DockerContainer(docker_image)
|
|
frontend.with_exposed_ports(8188)
|
|
|
|
frontend_env = {
|
|
"OTEL_SERVICE_NAME": "comfyui", # Use same service name as backend
|
|
"OTEL_EXPORTER_OTLP_ENDPOINT": otlp_endpoint_container,
|
|
"OTEL_METRICS_EXPORTER": "none",
|
|
"OTEL_LOGS_EXPORTER": "none",
|
|
# Configure BatchSpanProcessor to avoid silently dropping spans
|
|
"OTEL_BSP_MAX_QUEUE_SIZE": "10000", # Increased from default 2048
|
|
"OTEL_BSP_SCHEDULE_DELAY": "500", # Decreased from default 5000ms
|
|
"OTEL_BSP_MAX_EXPORT_BATCH_SIZE": "512",
|
|
"OTEL_BSP_EXPORT_TIMEOUT": "30000",
|
|
"COMFYUI_DISTRIBUTED_QUEUE_CONNECTION_URI": rabbitmq_uri_container,
|
|
"COMFYUI_DISTRIBUTED_QUEUE_FRONTEND": "1",
|
|
"COMFYUI_LOGGING_LEVEL": "INFO",
|
|
}
|
|
|
|
for key, value in frontend_env.items():
|
|
frontend.with_env(key, value)
|
|
|
|
# Use comfyui command like in production
|
|
# ENV vars are automatically picked up by cli_args
|
|
frontend.with_command("comfyui --listen 0.0.0.0 --port 8188 --cpu")
|
|
|
|
frontend.start()
|
|
logger.info("Started frontend container")
|
|
|
|
try:
|
|
frontend_host = frontend.get_container_host_ip()
|
|
frontend_port = frontend.get_exposed_port(8188)
|
|
frontend_url = f"http://{frontend_host}:{frontend_port}"
|
|
|
|
logger.info(f"Frontend URL: {frontend_url}")
|
|
|
|
# Wait for frontend to be ready
|
|
connected = False
|
|
for attempt in range(30):
|
|
try:
|
|
response = requests.get(frontend_url, timeout=2)
|
|
if response.status_code == 200:
|
|
connected = True
|
|
logger.info(f"Frontend is ready at {frontend_url}")
|
|
break
|
|
except Exception as e:
|
|
logger.debug(f"Frontend connection attempt {attempt+1}: {e}")
|
|
time.sleep(2)
|
|
|
|
if not connected:
|
|
# Capture frontend logs before failing
|
|
logger.error("=" * 80)
|
|
logger.error("FRONTEND FAILED TO START - Diagnostic Information:")
|
|
logger.error("=" * 80)
|
|
logger.error(f"Frontend URL: {frontend_url}")
|
|
logger.error("\n--- Frontend Container Logs ---")
|
|
try:
|
|
frontend_logs = frontend.get_logs()
|
|
if isinstance(frontend_logs, tuple):
|
|
frontend_logs = frontend_logs[0] + frontend_logs[1]
|
|
log_text = frontend_logs.decode('utf-8') if isinstance(frontend_logs, bytes) else str(frontend_logs)
|
|
for line in log_text.split('\n')[-200:]:
|
|
logger.error(line)
|
|
except Exception as e:
|
|
logger.error(f"Could not retrieve frontend logs: {e}")
|
|
logger.error("=" * 80)
|
|
|
|
assert connected, f"Could not connect to frontend at {frontend_url}. Check logs above."
|
|
|
|
# Set up tracing for the test client with properly configured BatchSpanProcessor
|
|
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
|
from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
|
|
from opentelemetry.sdk.resources import Resource
|
|
from opentelemetry.sdk.trace import TracerProvider
|
|
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
from opentelemetry.semconv.attributes import service_attributes
|
|
from opentelemetry import trace
|
|
|
|
resource = Resource.create({
|
|
service_attributes.SERVICE_NAME: "test-client",
|
|
})
|
|
provider = TracerProvider(resource=resource)
|
|
exporter = OTLPSpanExporter(endpoint=f"{otlp_endpoint}/v1/traces")
|
|
|
|
# Configure BatchSpanProcessor to avoid silently dropping spans
|
|
# Default: maxQueueSize=2048, scheduleDelayMillis=5000
|
|
# Problem: High span volume workflows (>1000 spans) fill the queue and spans are silently dropped
|
|
# Solution: Increase queue size and decrease delay
|
|
processor = BatchSpanProcessor(
|
|
exporter,
|
|
max_queue_size=10000, # Increased from default 2048
|
|
schedule_delay_millis=500, # Decreased from default 5000ms
|
|
max_export_batch_size=512, # Keep default
|
|
export_timeout_millis=30000, # Keep default
|
|
)
|
|
provider.add_span_processor(processor)
|
|
trace.set_tracer_provider(provider)
|
|
|
|
# Instrument aiohttp client
|
|
AioHttpClientInstrumentor().instrument()
|
|
|
|
# Import client AFTER instrumentation
|
|
from comfy.client.aio_client import AsyncRemoteComfyClient
|
|
|
|
test_id = str(uuid.uuid4())[:8]
|
|
prompt = sdxl_workflow_with_refiner(f"docker_stack_{test_id}", inference_steps=1, refiner_steps=1)
|
|
|
|
logger.info(f"Submitting workflow with test_id: {test_id}")
|
|
|
|
# Get the tracer for the test client
|
|
client_tracer = trace.get_tracer("test_full_docker_stack_trace_propagation")
|
|
|
|
# Wrap the request in a span to capture the trace ID
|
|
with client_tracer.start_as_current_span("submit_workflow") as workflow_span:
|
|
trace_id = format(workflow_span.get_span_context().trace_id, '032x')
|
|
logger.info(f"Started trace with trace_id: {trace_id}")
|
|
|
|
async with AsyncRemoteComfyClient(server_address=frontend_url) as client:
|
|
task_id = await client.queue_and_forget_prompt_api(prompt, prefer_header="respond-async")
|
|
assert task_id is not None, "Failed to get task ID"
|
|
|
|
logger.info(f"Queued task: {task_id}")
|
|
|
|
status_code, result = await client.poll_prompt_until_done(task_id, max_attempts=120, poll_interval=2.0)
|
|
|
|
if status_code != 200:
|
|
# Capture logs from all containers
|
|
logger.error("=" * 80)
|
|
logger.error("TASK FAILED - Diagnostic Information:")
|
|
logger.error("=" * 80)
|
|
logger.error(f"Task ID: {task_id}")
|
|
logger.error(f"Status Code: {status_code}")
|
|
logger.error(f"Result: {result}")
|
|
|
|
logger.error("\n--- Frontend Container Logs (last 100 lines) ---")
|
|
frontend_logs = frontend.get_logs().decode('utf-8').split('\n')
|
|
for line in frontend_logs[-100:]:
|
|
logger.error(line)
|
|
|
|
for i, backend in enumerate(backend_containers):
|
|
logger.error(f"\n--- Backend {i+1} Container Logs (last 100 lines) ---")
|
|
backend_logs = backend.get_logs().decode('utf-8').split('\n')
|
|
for line in backend_logs[-100:]:
|
|
logger.error(line)
|
|
|
|
logger.error("=" * 80)
|
|
|
|
assert status_code == 200, f"Task failed with status {status_code}. Check logs above."
|
|
logger.info("Task completed successfully")
|
|
|
|
# Give Jaeger time to receive and process spans
|
|
logger.info("Waiting for Jaeger to collect spans...")
|
|
await asyncio.sleep(10)
|
|
|
|
# Query Jaeger for OUR specific trace by trace_id
|
|
logger.info(f"Querying Jaeger for trace_id: {trace_id}...")
|
|
|
|
# Query both services to find our trace
|
|
test_client_traces = query_jaeger_traces(jaeger_url, "test-client", lookback="10m").get("data", [])
|
|
comfyui_traces = query_jaeger_traces(jaeger_url, "comfyui", lookback="10m").get("data", [])
|
|
|
|
logger.info(f"Found {len(test_client_traces)} traces from 'test-client' service")
|
|
logger.info(f"Found {len(comfyui_traces)} traces from 'comfyui' service")
|
|
|
|
# Find our specific trace
|
|
our_trace = None
|
|
for trace in test_client_traces + comfyui_traces:
|
|
if trace.get("traceID") == trace_id:
|
|
our_trace = trace
|
|
logger.info(f"Found our trace {trace_id[:16]} with {len(trace.get('spans', []))} spans")
|
|
break
|
|
|
|
assert our_trace is not None, (
|
|
f"Could not find trace with trace_id {trace_id} in Jaeger. "
|
|
f"This indicates that spans from test-client are not being exported correctly. "
|
|
f"Jaeger UI: {jaeger_url}"
|
|
)
|
|
|
|
# Analyze our specific trace for the expected span hierarchy
|
|
spans = our_trace.get("spans", [])
|
|
logger.info(f"\nAnalyzing trace {trace_id[:16]} with {len(spans)} spans")
|
|
|
|
# Categorize spans
|
|
client_spans = []
|
|
server_spans = []
|
|
worker_spans = []
|
|
rabbitmq_spans = []
|
|
|
|
# Build a map of span_id -> span for reference lookup
|
|
span_map = {span.get("spanID"): span for span in spans}
|
|
|
|
for span in spans:
|
|
op_name = span.get("operationName", "")
|
|
span_id = span.get("spanID")
|
|
process_id = span.get("processID")
|
|
process = our_trace.get("processes", {}).get(process_id, {})
|
|
service_name = process.get("serviceName", "unknown")
|
|
|
|
logger.info(f" Span: {service_name}.{op_name} (id={span_id[:8]})")
|
|
|
|
# Categorize by operation and service
|
|
if service_name == "test-client":
|
|
client_spans.append(span)
|
|
elif "/api/v1/prompts" in op_name:
|
|
server_spans.append(span)
|
|
elif any(worker_op in op_name for worker_op in ["Execute", "execute", "queue_prompt", "Load", "Sample"]):
|
|
worker_spans.append(span)
|
|
elif any(rmq in op_name.lower() for rmq in ["publish", "consume", "send", "receive"]):
|
|
rabbitmq_spans.append(span)
|
|
|
|
logger.info(f"\nSpan summary:")
|
|
logger.info(f" Client spans: {len(client_spans)}")
|
|
logger.info(f" Server spans (/api/v1/prompts): {len(server_spans)}")
|
|
logger.info(f" Worker spans (Execute/Load/Sample): {len(worker_spans)}")
|
|
logger.info(f" RabbitMQ spans: {len(rabbitmq_spans)}")
|
|
|
|
# CRITICAL CHECKS: Verify complete trace propagation
|
|
|
|
# 1. Check that we have client spans
|
|
assert len(client_spans) > 0, (
|
|
f"NO CLIENT SPANS FOUND in trace {trace_id[:16]}!\n"
|
|
f"Expected to see spans from 'test-client' service but found none.\n"
|
|
f"This indicates the test client is not properly instrumented.\n"
|
|
f"Jaeger UI: {jaeger_url}/trace/{trace_id}"
|
|
)
|
|
logger.info(f"✓ Found {len(client_spans)} client span(s)")
|
|
|
|
# 2. Check that we have the server span /api/v1/prompts
|
|
assert len(server_spans) > 0, (
|
|
f"NO SERVER SPAN (/api/v1/prompts) FOUND in trace {trace_id[:16]}!\n"
|
|
f"Expected to see the HTTP server span but found none.\n"
|
|
f"This indicates the frontend is not properly instrumented or not in the same trace.\n"
|
|
f"Jaeger UI: {jaeger_url}/trace/{trace_id}"
|
|
)
|
|
logger.info(f"✓ Found {len(server_spans)} server span(s) for /api/v1/prompts")
|
|
|
|
# 3. Verify the server span is a CHILD of a client span
|
|
server_span = server_spans[0]
|
|
server_span_id = server_span.get("spanID")
|
|
server_references = server_span.get("references", [])
|
|
|
|
server_parent_found = False
|
|
for ref in server_references:
|
|
if ref.get("refType") == "CHILD_OF":
|
|
parent_span_id = ref.get("spanID")
|
|
if parent_span_id in span_map:
|
|
parent_span = span_map[parent_span_id]
|
|
parent_process_id = parent_span.get("processID")
|
|
parent_process = our_trace.get("processes", {}).get(parent_process_id, {})
|
|
parent_service = parent_process.get("serviceName", "unknown")
|
|
logger.info(f" Server span parent: {parent_service}.{parent_span.get('operationName', 'unknown')}")
|
|
|
|
if parent_service == "test-client":
|
|
server_parent_found = True
|
|
break
|
|
|
|
assert server_parent_found, (
|
|
f"SERVER SPAN IS NOT A CHILD OF CLIENT SPAN!\n"
|
|
f"The /api/v1/prompts span exists but is not linked to the test-client request.\n"
|
|
f"This indicates trace context (traceparent header) is not being propagated from client to server.\n"
|
|
f"Server span references: {server_references}\n"
|
|
f"Jaeger UI: {jaeger_url}/trace/{trace_id}"
|
|
)
|
|
logger.info("✓ Server span is correctly a child of client span")
|
|
|
|
# 4. Check that we have worker spans (Execute Node, etc.)
|
|
assert len(worker_spans) > 0, (
|
|
f"NO WORKER SPANS FOUND in trace {trace_id[:16]}!\n"
|
|
f"Expected to see worker spans like 'Execute Node', 'Load Checkpoint', etc.\n"
|
|
f"Found only:\n"
|
|
f" - Client spans: {len(client_spans)}\n"
|
|
f" - Server spans: {len(server_spans)}\n"
|
|
f" - RabbitMQ spans: {len(rabbitmq_spans)}\n"
|
|
f"\n"
|
|
f"THIS IS THE PRODUCTION ISSUE!\n"
|
|
f"Trace context is NOT propagating from frontend -> RabbitMQ -> worker.\n"
|
|
f"\n"
|
|
f"Possible causes:\n"
|
|
f" 1. aio-pika is not instrumented on frontend or worker\n"
|
|
f" 2. W3C trace context headers are not in AMQP message headers\n"
|
|
f" 3. OpenTelemetry context propagation is not configured correctly\n"
|
|
f" 4. OpenTelemetry Collector or Jaeger is dropping/corrupting spans\n"
|
|
f"\n"
|
|
f"Jaeger UI: {jaeger_url}/trace/{trace_id}"
|
|
)
|
|
logger.info(f"✓ Found {len(worker_spans)} worker span(s)")
|
|
|
|
# 5. Log all worker spans found
|
|
logger.info("\nWorker spans found:")
|
|
for worker_span in worker_spans:
|
|
logger.info(f" - {worker_span.get('operationName', 'unknown')}")
|
|
|
|
logger.info(f"\n✓✓✓ TRACE PROPAGATION SUCCESSFUL ✓✓✓")
|
|
logger.info(f"Trace {trace_id[:16]} contains complete span hierarchy:")
|
|
logger.info(f" Client ({len(client_spans)}) -> Server ({len(server_spans)}) -> Worker ({len(worker_spans)})")
|
|
logger.info(f"Jaeger UI: {jaeger_url}/trace/{trace_id}")
|
|
|
|
finally:
|
|
logger.info("Stopping frontend container...")
|
|
frontend.stop()
|
|
|
|
finally:
|
|
logger.info("Stopping backend containers...")
|
|
for i, backend in enumerate(backend_containers):
|
|
logger.info(f"Stopping backend {i+1}/{num_backends}...")
|
|
backend.stop()
|
|
|
|
def _stop_process(process: subprocess.Popen, timeout: float = 10.0) -> None:
    """Stop *process*, escalating from terminate to kill if it does not exit.

    Does nothing when the process has already exited, so it is safe to call
    more than once (e.g. once before reading its logs and again in cleanup).
    """
    if process.poll() is not None:
        return
    process.terminate()
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        # The process ignored the terminate request; do not leave it running
        # and do not let TimeoutExpired mask the original test failure.
        process.kill()
        process.wait()


@pytest.mark.skip
@pytest.mark.asyncio
async def test_aiohttp_and_aio_pika_spans_with_docker_frontend(jaeger_container):
    """
    Test that both aiohttp and aio_pika instrumentation work in the Docker image.

    This test helps diagnose if there's a dependency issue in the Docker image preventing
    instrumentation from working correctly by:
    1. Starting the ComfyUI frontend in a Docker container
    2. Starting a local worker process
    3. Submitting a workflow
    4. Querying Jaeger to verify both aiohttp and aio_pika spans are present

    Set COMFYUI_IMAGE env var to override default image, e.g.:
    COMFYUI_IMAGE=ghcr.io/hiddenswitch/comfyui:latest
    """
    docker_image = os.environ.get("COMFYUI_IMAGE", "ghcr.io/hiddenswitch/comfyui:latest")

    jaeger_url = jaeger_container.get_query_url()
    otlp_endpoint = jaeger_container.get_otlp_endpoint()
    otlp_port = jaeger_container.get_exposed_port(4318)

    with RabbitMqContainer("rabbitmq:latest") as rabbitmq:
        params = rabbitmq.get_connection_params()

        # Get Docker bridge gateway for container-to-host communication.
        # Fall back to host.docker.internal when the docker CLI is missing,
        # fails, times out, or prints nothing.
        try:
            result = subprocess.run(
                ["docker", "network", "inspect", "bridge", "-f", "{{(index .IPAM.Config 0).Gateway}}"],
                capture_output=True,
                text=True,
                check=True,
                timeout=5,
            )
            docker_host = result.stdout.strip() or "host.docker.internal"
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError):
            docker_host = "host.docker.internal"

        # The container reaches RabbitMQ through the bridge gateway, while the
        # local worker reaches it through loopback on the same mapped port.
        connection_uri_container = f"amqp://guest:guest@{docker_host}:{params.port}"
        connection_uri_local = f"amqp://guest:guest@127.0.0.1:{params.port}"

        # Start frontend in Docker container
        frontend_container = DockerContainer(docker_image)
        frontend_container.with_exposed_ports(8188)

        otlp_endpoint_container = f"http://{docker_host}:{otlp_port}"
        env_vars = {
            "OTEL_SERVICE_NAME": "comfyui-docker-frontend",
            "OTEL_EXPORTER_OTLP_ENDPOINT": otlp_endpoint_container,
        }
        for key, value in env_vars.items():
            frontend_container.with_env(key, value)

        frontend_container.with_command(
            "python -m comfy.cmd.main --listen 0.0.0.0 --port 8188 "
            "--cpu --distributed-queue-frontend "
            f"--distributed-queue-connection-uri={connection_uri_container}"
        )

        frontend_container.start()

        try:
            frontend_host = frontend_container.get_container_host_ip()
            frontend_port = frontend_container.get_exposed_port(8188)
            frontend_url = f"http://{frontend_host}:{frontend_port}"

            # Wait for frontend to be ready (up to 15 attempts, ~1s apart).
            connected = False
            for _ in range(15):
                try:
                    response = requests.get(frontend_url, timeout=1)
                except requests.RequestException as exc:
                    # Expected while the server is still starting up.
                    logger.debug("Frontend not ready yet: %s", exc)
                else:
                    if response.status_code == 200:
                        connected = True
                        break
                await asyncio.sleep(1)

            assert connected, f"Could not connect to Docker frontend at {frontend_url}"

            # Start local worker
            worker_env = os.environ.copy()
            worker_env["OTEL_SERVICE_NAME"] = "comfyui-worker"
            worker_env["OTEL_EXPORTER_OTLP_ENDPOINT"] = otlp_endpoint

            # Send worker output to a temp file instead of a pipe: nothing
            # drains a pipe while the workflow runs, so a chatty worker could
            # fill the pipe buffer and stall, and reading a pipe of a live
            # process blocks until EOF.
            with tempfile.TemporaryFile(mode="w+", encoding="utf-8", errors="replace") as worker_log:
                worker_process = subprocess.Popen(
                    [
                        "comfyui-worker",
                        "--port=19099",
                        f"--distributed-queue-connection-uri={connection_uri_local}",
                        "--executor-factory=ThreadPoolExecutor",
                    ],
                    stdout=worker_log,
                    stderr=subprocess.STDOUT,
                    env=worker_env,
                )

                try:
                    # Give the worker a moment to start before submitting work.
                    await asyncio.sleep(5)

                    from comfy.client.aio_client import AsyncRemoteComfyClient

                    test_id = str(uuid.uuid4())[:8]
                    prompt = sdxl_workflow_with_refiner(f"docker_test_{test_id}", inference_steps=1, refiner_steps=1)

                    async with AsyncRemoteComfyClient(server_address=frontend_url) as client:
                        task_id = await client.queue_and_forget_prompt_api(prompt, prefer_header="respond-async")
                        assert task_id is not None, "Failed to get task ID"

                        status_code, result = await client.poll_prompt_until_done(
                            task_id, max_attempts=60, poll_interval=2.0
                        )

                    if status_code != 200:
                        # Stop the worker first so its log is complete and no
                        # longer being written while we read it back.
                        _stop_process(worker_process)
                        worker_log.seek(0)
                        worker_output = worker_log.read()

                        # Get frontend container logs
                        frontend_logs = frontend_container.get_logs()

                        logger.error("=" * 80)
                        logger.error("TASK FAILED - Diagnostic Information:")
                        logger.error("=" * 80)
                        logger.error("Task ID: %s", task_id)
                        logger.error("Status Code: %s", status_code)
                        logger.error("Result: %s", result)
                        logger.error("\n--- Frontend Container Logs (last 100 lines) ---")
                        frontend_log_lines = frontend_logs.decode('utf-8').split('\n')
                        for line in frontend_log_lines[-100:]:
                            logger.error(line)
                        logger.error("\n--- Worker Process Output ---")
                        for line in worker_output.split('\n')[-100:]:
                            logger.error(line)
                        logger.error("=" * 80)

                    assert status_code == 200, f"Task failed with status {status_code}. Check logs above for details."

                    # Allow exported spans to reach Jaeger before querying.
                    await asyncio.sleep(5)

                    # Query Jaeger for traces from both services
                    frontend_traces = query_jaeger_traces(
                        jaeger_url, "comfyui-docker-frontend", lookback="5m"
                    ).get("data", [])
                    worker_traces = query_jaeger_traces(
                        jaeger_url, "comfyui-worker", lookback="5m"
                    ).get("data", [])

                    assert frontend_traces, (
                        "No traces found in Jaeger for service 'comfyui-docker-frontend'. "
                        f"Check that OTEL export is working from Docker container. Jaeger UI: {jaeger_url}"
                    )

                    assert worker_traces, (
                        "No traces found in Jaeger for service 'comfyui-worker'. "
                        f"Check that OTEL export is working from worker. Jaeger UI: {jaeger_url}"
                    )

                    # Analyze span types from both services. Classification is
                    # a substring heuristic on the operation name.
                    http_methods = ("GET", "POST", "PUT", "DELETE", "PATCH")
                    aiohttp_spans = []
                    aio_pika_frontend_spans = []
                    aio_pika_worker_spans = []

                    for trace_item in frontend_traces:
                        for span in trace_item.get("spans", []):
                            operation_name = span.get("operationName", "")
                            upper_name = operation_name.upper()
                            lower_name = operation_name.lower()
                            if any(http_op in upper_name for http_op in http_methods):
                                aiohttp_spans.append(operation_name)
                            elif "publish" in lower_name or "send" in lower_name:
                                aio_pika_frontend_spans.append(operation_name)

                    for trace_item in worker_traces:
                        for span in trace_item.get("spans", []):
                            operation_name = span.get("operationName", "")
                            lower_name = operation_name.lower()
                            if "consume" in lower_name or "receive" in lower_name or "publish" in lower_name:
                                aio_pika_worker_spans.append(operation_name)

                    assert aiohttp_spans, (
                        "No aiohttp server spans found in traces from Docker frontend. "
                        "This indicates aiohttp server instrumentation is not working in the Docker image. "
                        f"Image: {docker_image}. Jaeger UI: {jaeger_url}"
                    )

                    total_aio_pika_spans = len(aio_pika_frontend_spans) + len(aio_pika_worker_spans)
                    assert total_aio_pika_spans > 0, (
                        "No aio_pika spans found in traces. "
                        f"Frontend aio_pika spans: {len(aio_pika_frontend_spans)}, Worker aio_pika spans: {len(aio_pika_worker_spans)}. "
                        "Expected messaging spans for distributed queue operations. "
                        f"This indicates aio_pika instrumentation is not working. Jaeger UI: {jaeger_url}"
                    )

                finally:
                    # Idempotent: a no-op if the failure path already stopped it.
                    _stop_process(worker_process)

        finally:
            frontend_container.stop()