Object detection
In this guide you’ll wire a Python sidecar that subscribes to a Smelter input’s video side channel, runs YOLO object detection on every frame, and posts a scene update with bounding boxes that animate as people / objects move through the frame.
The full sidecar lives in one detect.py file built up across the steps below.
-
Start the Smelter server (learn more) with
`SMELTER_SIDE_CHANNEL_SOCKET_DIR` set to a directory the sidecar can also read, and install the sidecar’s dependencies. The input→output delay is configured per input via `side_channel.delay_ms` in step 3 (set to `200` below). The delay gives the sidecar time to run YOLO and schedule each scene update relative to the source frame’s own pts (step 4 schedules it 100 ms early so the 200 ms transition is half-complete), so the box lands at the moment that frame is rendered on the delayed output. `pip install smelter-sdk ultralytics opencv-python` -
Define the scene builders.
`box_view` turns one detection into a bordered `view`, and `boxes_scene` overlays those boxes on the input. Each box uses a stable `id` so Smelter can interpolate position between updates, and a 200 ms `transition` so it animates smoothly to its new place. detect.py INPUT_ID = "input"OUTPUT_W, OUTPUT_H = 1920, 1080def box_view(det: dict) -> dict:x1, y1, x2, y2 = det["xyxy"]return {"type": "view","id": f"det-{det['id']}" if det["id"] is not None else None,"left": int(x1 / det["w"] * OUTPUT_W),"top": int(y1 / det["h"] * OUTPUT_H),"width": max(2, int((x2 - x1) / det["w"] * OUTPUT_W)),"height": max(2, int((y2 - y1) / det["h"] * OUTPUT_H)),"border_width": 4,"border_color": "#00FF88FF","border_radius": 6,"transition": {"duration_ms": 200},}def boxes_scene(detections: list[dict]) -> dict:return {"type": "view","children": [{"type": "rescaler","child": {"type": "input_stream", "input_id": INPUT_ID}},*(box_view(d) for d in detections),],} -
Define
`register_pipeline`, which:
- registers a WHIP input with the video side channel enabled,
- registers a WHEP output whose `initial` scene is `boxes_scene([])`,
- starts the pipeline.
detect.py import jsonimport urllib.request29 collapsed linesINPUT_ID = "input"OUTPUT_W, OUTPUT_H = 1920, 1080def box_view(det: dict) -> dict:x1, y1, x2, y2 = det["xyxy"]return {"type": "view","id": f"det-{det['id']}" if det["id"] is not None else None,"left": int(x1 / det["w"] * OUTPUT_W),"top": int(y1 / det["h"] * OUTPUT_H),"width": max(2, int((x2 - x1) / det["w"] * OUTPUT_W)),"height": max(2, int((y2 - y1) / det["h"] * OUTPUT_H)),"border_width": 4,"border_color": "#00FF88FF","border_radius": 6,"transition": {"duration_ms": 200},}def boxes_scene(detections: list[dict]) -> dict:return {"type": "view","children": [{"type": "rescaler","child": {"type": "input_stream", "input_id": INPUT_ID}},*(box_view(d) for d in detections),],}SMELTER_API = "http://127.0.0.1:8081"OUTPUT_ID = "output"def api_post(path: str, body: dict | None = None):req = urllib.request.Request(f"{SMELTER_API}{path}",data=json.dumps(body or {}).encode(),headers={"Content-Type": "application/json"},method="POST",)with urllib.request.urlopen(req) as r:return r.read()def register_pipeline():api_post(f"/api/input/{INPUT_ID}/register", {"type": "whip_server","bearer_token": "example","side_channel": {"video": True, "delay_ms": 200},})api_post(f"/api/output/{OUTPUT_ID}/register", {"type": "whep_server","bearer_token": "example","video": {"resolution": {"width": OUTPUT_W, "height": OUTPUT_H},"encoder": {"type": "ffmpeg_h264", "preset": "ultrafast"},"initial": {"root": boxes_scene([])},},"audio": {"encoder": {"type": "opus"},"initial": {"inputs": [{"input_id": INPUT_ID}]},},})api_post("/api/start")The
`initial` scene is just the input (no boxes yet); each `update` call swaps in the latest detections. -
Subscribe to the video side channel and run YOLO on every frame.
`model.track` persists a per-target `id` across frames, so the boxes interpolate smoothly rather than jumping between detections. detect.py import cv2from smelter import subscribe_video_channelfrom ultralytics import YOLO69 collapsed linesINPUT_ID = "input"OUTPUT_W, OUTPUT_H = 1920, 1080def box_view(det: dict) -> dict:x1, y1, x2, y2 = det["xyxy"]return {"type": "view","id": f"det-{det['id']}" if det["id"] is not None else None,"left": int(x1 / det["w"] * OUTPUT_W),"top": int(y1 / det["h"] * OUTPUT_H),"width": max(2, int((x2 - x1) / det["w"] * OUTPUT_W)),"height": max(2, int((y2 - y1) / det["h"] * OUTPUT_H)),"border_width": 4,"border_color": "#00FF88FF","border_radius": 6,"transition": {"duration_ms": 200},}def boxes_scene(detections: list[dict]) -> dict:return {"type": "view","children": [{"type": "rescaler","child": {"type": "input_stream", "input_id": INPUT_ID}},*(box_view(d) for d in detections),],}import jsonimport urllib.requestSMELTER_API = "http://127.0.0.1:8081"OUTPUT_ID = "output"def api_post(path: str, body: dict | None = None):req = urllib.request.Request(f"{SMELTER_API}{path}",data=json.dumps(body or {}).encode(),headers={"Content-Type": "application/json"},method="POST",)with urllib.request.urlopen(req) as r:return r.read()def register_pipeline():api_post(f"/api/input/{INPUT_ID}/register", {"type": "whip_server","bearer_token": "example","side_channel": {"video": True, "delay_ms": 200},})api_post(f"/api/output/{OUTPUT_ID}/register", {"type": "whep_server","bearer_token": "example","video": {"resolution": {"width": OUTPUT_W, "height": OUTPUT_H},"encoder": {"type": "ffmpeg_h264", "preset": "ultrafast"},"initial": {"root": boxes_scene([])},},"audio": {"encoder": {"type": "opus"},"initial": {"inputs": [{"input_id": INPUT_ID}]},},})api_post("/api/start")MIN_CONFIDENCE = 0.5def main():register_pipeline()model = YOLO("yolov8n.pt")for frame in subscribe_video_channel(INPUT_ID):bgr = cv2.cvtColor(frame.rgba, cv2.COLOR_RGBA2BGR)results = model.track(bgr, persist=True, 
verbose=False, classes=[0])if not results or results[0].boxes is None:continueboxes = results[0].boxesxyxy = boxes.xyxy.cpu().numpy()conf = boxes.conf.cpu().numpy()ids = boxes.id.cpu().numpy().astype(int).tolist() if boxes.id is not None else [None] * len(xyxy)detections = [{"xyxy": tuple(box), "id": tid, "w": frame.width, "h": frame.height}for box, p, tid in zip(xyxy, conf, ids)if p >= MIN_CONFIDENCE]# Schedule 100 ms before the frame's pts so the 200 ms transition animation# is half-complete when the matching output frame is rendered.schedule_ms = (frame.pts_nanos - 100_000_000) / 1e6api_post(f"/api/output/{OUTPUT_ID}/update", {"video": {"root": boxes_scene(detections)},"audio": {"inputs": [{"input_id": INPUT_ID}]},"schedule_time_ms": schedule_ms,})if __name__ == "__main__":main() `classes=[0]` restricts detection to people (COCO class 0); drop it or pass other class IDs to detect different objects. See the ultralytics docs for the full class list and other YOLO knobs (model size, GPU, NMS thresholds). Run it with
python detect.py. -
Stream a test source and watch the result with Smelter’s hosted browser tools (no install required):
- Publish your camera or screen with the WHIP streamer.
- Watch the composed output with the WHEP player.
Each detection appears as a green rectangle that follows its target across frames.