#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/rf_detr/modular_rf_detr.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_rf_detr.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
from typing import Any, Optional

import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms.v2 import functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_transforms import (
    center_to_corners_format,
    corners_to_center_format,
    get_size_with_aspect_ratio,
    safe_squeeze,
)
from ...image_utils import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    AnnotationFormat,
    AnnotationType,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    SizeDict,
    get_image_size_for_max_height_width,
    get_max_height_width,
    validate_annotations,
)
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import TensorType, auto_docstring


class RfDetrImageProcessorKwargs(ImagesKwargs, total=False):
    r"""
    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
    do_convert_annotations (`bool`, *optional*, defaults to `True`):
        Controls whether to convert the annotations to the format expected by the RF_DETR model. Converts the
        bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
        Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
    """

    format: str | AnnotationFormat
    do_convert_annotations: bool


SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


def binary_mask_to_rle(mask):
    """
    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        mask (`torch.Tensor` or `numpy.array`):
            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
            segment_id or class_id.
    Returns:
        `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
        format.
    """
    from ...utils import is_torch_tensor

    if is_torch_tensor(mask):
        mask = mask.numpy()

    pixels = mask.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return list(runs)


def convert_segmentation_to_rle(segmentation):
    """
    Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        segmentation (`torch.Tensor` or `numpy.array`):
            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
    Returns:
        `list[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
    """
    segment_ids = torch.unique(segmentation)

    run_length_encodings = []
    for idx in segment_ids:
        mask = torch.where(segmentation == idx, 1, 0)
        rle = binary_mask_to_rle(mask)
        run_length_encodings.append(rle)

    return run_length_encodings


# inspired by https://github.com/facebookresearch/rf_detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
    """
    Convert a COCO polygon annotation to a mask.

    Args:
        segmentations (`list[list[float]]`):
            List of polygons, each polygon represented by a list of x-y coordinates.
        height (`int`):
            Height of the mask.
        width (`int`):
            Width of the mask.
    """
    try:
        from pycocotools import mask as coco_mask
    except ImportError:
        raise ImportError("Pycocotools is not installed in your environment.")

    masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        mask = coco_mask.decode(rles)
        if len(mask.shape) < 3:
            mask = mask[..., None]
        mask = torch.as_tensor(mask, dtype=torch.uint8, device=device)
        mask = torch.any(mask, axis=2)
        masks.append(mask)
    if masks:
        masks = torch.stack(masks, axis=0)
    else:
        masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device)

    return masks


# inspired by https://github.com/facebookresearch/rf_detr/blob/master/datasets/coco.py#L50
def prepare_coco_detection_annotation(
    image,
    target,
    return_segmentation_masks: bool = False,
    input_data_format: ChannelDimension | str | None = None,
):
    """
    Convert the target in COCO format into the format expected by RF_DETR.
    """
    image_height, image_width = image.size()[-2:]

    image_id = target["image_id"]
    image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)

    # Get all COCO annotations for the given image.
    annotations = target["annotations"]
    classes = []
    area = []
    boxes = []
    keypoints = []
    for obj in annotations:
        if "iscrowd" not in obj or obj["iscrowd"] == 0:
            classes.append(obj["category_id"])
            area.append(obj["area"])
            boxes.append(obj["bbox"])
            if "keypoints" in obj:
                keypoints.append(obj["keypoints"])

    classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
    area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
    iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
    # guard against no boxes via resizing
    boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
    boxes[:, 2:] += boxes[:, :2]
    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)

    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])

    new_target = {
        "image_id": image_id,
        "class_labels": classes[keep],
        "boxes": boxes[keep],
        "area": area[keep],
        "iscrowd": iscrowd[keep],
        "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
    }

    if keypoints:
        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
        # Apply the keep mask here to filter the relevant annotations
        keypoints = keypoints[keep]
        num_keypoints = keypoints.shape[0]
        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
        new_target["keypoints"] = keypoints

    if return_segmentation_masks:
        segmentation_masks = [obj["segmentation"] for obj in annotations]
        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device)
        new_target["masks"] = masks[keep]

    return new_target


@auto_docstring
class RfDetrImageProcessor(TorchvisionBackend):
    valid_kwargs = RfDetrImageProcessorKwargs
    resample = PILImageResampling.BILINEAR
    image_mean = IMAGENET_DEFAULT_MEAN
    image_std = IMAGENET_DEFAULT_STD
    format = AnnotationFormat.COCO_DETECTION
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_pad = True
    size = {"shortest_edge": 800, "longest_edge": 1333}
    default_to_square = False
    model_input_names = ["pixel_values", "pixel_mask"]

    def __init__(self, **kwargs: Unpack[RfDetrImageProcessorKwargs]) -> None:
        kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad))

        size = kwargs.pop("size", None)
        max_size = None if size is None else kwargs.pop("max_size", 1333)
        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
        # Convert size dict for backwards compat with max_size parameter
        kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False)

        # Backwards compatibility
        do_convert_annotations = kwargs.get("do_convert_annotations")
        do_normalize = kwargs.get("do_normalize")
        if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
            self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize

        super().__init__(**kwargs)

    def prepare_annotation(
        self,
        image: torch.Tensor,
        target: dict,
        format: AnnotationFormat | None = None,
        return_segmentation_masks: bool | None = None,
        masks_path: str | pathlib.Path | None = None,
        input_data_format: str | ChannelDimension | None = None,
    ) -> dict:
        """
        Prepare an annotation for feeding into RF_DETR model.
        """
        format = format if format is not None else self.format

        if format == AnnotationFormat.COCO_DETECTION:
            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
            target = prepare_coco_detection_annotation(
                image, target, return_segmentation_masks, input_data_format=input_data_format
            )
        else:
            raise ValueError(f"Format {format} is not supported.")
        return target

    def resize(
        self,
        image: torch.Tensor,
        size: SizeDict,
        resample: Optional["PILImageResampling | tvF.InterpolationMode | int"] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
        int, smaller edge of the image will be matched to this number.

        Args:
            image (`torch.Tensor`):
                Image to resize.
            size (`SizeDict`):
                Size of the image's `(height, width)` dimensions after resizing. Available options are:
                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                        Do NOT keep the aspect ratio.
                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                        less or equal to `longest_edge`.
                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                        `max_width`.
            resample (`PILImageResampling | tvF.InterpolationMode | int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use if resizing the image.
        """
        if size.shortest_edge and size.longest_edge:
            # Resize the image so that the shortest edge or the longest edge is of the given size
            # while maintaining the aspect ratio of the original image.
            new_size = get_size_with_aspect_ratio(image.shape[-2:], size.shortest_edge, size.longest_edge)
        elif size.max_height and size.max_width:
            new_size = get_image_size_for_max_height_width(image.shape[-2:], size.max_height, size.max_width)
        elif size.height and size.width:
            new_size = (size.height, size.width)
        else:
            raise ValueError(
                f"Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got {size}."
            )

        image = super().resize(
            image, size=SizeDict(height=new_size[0], width=new_size[1]), resample=resample, **kwargs
        )
        return image

    def resize_annotation(
        self,
        annotation: dict[str, Any],
        orig_size: tuple[int, int],
        target_size: tuple[int, int],
        threshold: float = 0.5,
        resample: Optional["PILImageResampling | tvF.InterpolationMode | int"] = PILImageResampling.NEAREST,
    ):
        """
        Resizes an annotation to a target size.

        Args:
            annotation (`dict[str, Any]`):
                The annotation dictionary.
            orig_size (`tuple[int, int]`):
                The original size of the input image.
            target_size (`tuple[int, int]`):
                The target size of the image, as returned by the preprocessing `resize` step.
            threshold (`float`, *optional*, defaults to 0.5):
                The threshold used to binarize the segmentation masks.
            resample (`PILImageResampling | tvF.InterpolationMode | int`, defaults to `tvF.InterpolationMode.NEAREST_EXACT`):
                The resampling filter to use when resizing the masks.
        """
        ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]

        new_annotation = {}
        new_annotation["size"] = target_size

        for key, value in annotation.items():
            if key == "boxes":
                boxes = value
                scaled_boxes = boxes * torch.as_tensor(
                    [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
                )
                new_annotation["boxes"] = scaled_boxes
            elif key == "area":
                area = value
                scaled_area = area * (ratio_width * ratio_height)
                new_annotation["area"] = scaled_area
            elif key == "masks":
                masks = value[:, None]
                masks = [
                    super(RfDetrImageProcessor, self).resize(
                        mask, size=SizeDict(height=target_size[0], width=target_size[1]), resample=resample
                    )
                    for mask in masks
                ]
                masks = torch.stack(masks).to(torch.float32)
                masks = masks[:, 0] > threshold
                new_annotation["masks"] = masks
            elif key == "size":
                new_annotation["size"] = target_size
            else:
                new_annotation[key] = value

        return new_annotation

    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
        image_height, image_width = image_size
        norm_annotation = {}
        for key, value in annotation.items():
            if key == "boxes":
                boxes = value
                boxes = corners_to_center_format(boxes)
                boxes /= torch.as_tensor(
                    [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
                )
                norm_annotation[key] = boxes
            else:
                norm_annotation[key] = value
        return norm_annotation

    def _update_annotation_for_padded_image(
        self,
        annotation: dict,
        input_image_size: tuple[int, int],
        output_image_size: tuple[int, int],
        padding,
        update_bboxes,
    ) -> dict:
        """
        Update the annotation for a padded image.
        """
        new_annotation = {}
        new_annotation["size"] = output_image_size
        ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))

        for key, value in annotation.items():
            if key == "masks":
                masks = value
                masks = tvF.pad(
                    masks,
                    padding,
                    fill=0,
                )
                masks = safe_squeeze(masks, 1)
                new_annotation["masks"] = masks
            elif key == "boxes" and update_bboxes:
                boxes = value
                boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
                new_annotation["boxes"] = boxes
            elif key == "size":
                new_annotation["size"] = output_image_size
            else:
                new_annotation[key] = value
        return new_annotation

    def pad(
        self,
        image: torch.Tensor,
        padded_size: tuple[int, int],
        annotation: dict[str, Any] | None = None,
        update_bboxes: bool = True,
        fill: int = 0,
    ):
        original_size = image.size()[-2:]
        padding_bottom = padded_size[0] - original_size[0]
        padding_right = padded_size[1] - original_size[1]
        if padding_bottom < 0 or padding_right < 0:
            raise ValueError(
                f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
                f"original size. Got padded size: {padded_size}, original size: {original_size}."
            )
        if original_size != padded_size:
            padding = [0, 0, padding_right, padding_bottom]
            image = tvF.pad(image, padding, fill=fill)
            if annotation is not None:
                annotation = self._update_annotation_for_padded_image(
                    annotation, original_size, padded_size, padding, update_bboxes
                )

        # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
        pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
        pixel_mask[: original_size[0], : original_size[1]] = 1

        return image, pixel_mask, annotation

    @auto_docstring
    def preprocess(
        self,
        images: ImageInput,
        annotations: AnnotationType | list[AnnotationType] | None = None,
        return_segmentation_masks: bool | None = None,
        masks_path: str | pathlib.Path | None = None,
        **kwargs: Unpack[RfDetrImageProcessorKwargs],
    ) -> BatchFeature:
        r"""
        annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
            Annotations to transform according to the padding that is applied to the images.
        return_segmentation_masks (`bool`, *optional*, defaults to `self.return_segmentation_masks`):
            Whether to return segmentation masks.
        masks_path (`str` or `pathlib.Path`, *optional*):
            Path to the directory containing the segmentation masks.
        """
        return super().preprocess(images, annotations, return_segmentation_masks, masks_path, **kwargs)

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        annotations: AnnotationType | list[AnnotationType] | None,
        return_segmentation_masks: bool,
        masks_path: str | pathlib.Path | None,
        do_resize: bool,
        size: SizeDict,
        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        do_convert_annotations: bool,
        image_mean: float | list[float] | None,
        image_std: float | list[float] | None,
        do_pad: bool,
        pad_size: SizeDict | None,
        format: str | AnnotationFormat | None,
        return_tensors: str | TensorType | None,
        **kwargs,
    ) -> BatchFeature:
        """
        Preprocess an image or a batch of images so that it can be used by the model.
        """
        if annotations is not None and isinstance(annotations, dict):
            annotations = [annotations]

        if annotations is not None and len(images) != len(annotations):
            raise ValueError(
                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
            )

        format = AnnotationFormat(format)
        if annotations is not None:
            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)

        if (
            masks_path is not None
            and format == AnnotationFormat.COCO_PANOPTIC
            and not isinstance(masks_path, (pathlib.Path, str))
        ):
            raise ValueError(
                "The path to the directory containing the mask PNG files should be provided as a"
                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
            )

        data = {}

        processed_images = []
        processed_annotations = []
        pixel_masks = []  # Initialize pixel_masks here
        for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
            # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
            if annotations is not None:
                annotation = self.prepare_annotation(
                    image,
                    annotation,
                    format,
                    return_segmentation_masks=return_segmentation_masks,
                    masks_path=masks_path,
                    input_data_format=ChannelDimension.FIRST,
                )
            # Rescale then resize like in the original RF-DETR implementation
            if do_rescale:
                image = self.rescale(image, rescale_factor)
            if do_resize:
                resized_image = self.resize(image, size=size, resample=resample)
                if annotations is not None:
                    annotation = self.resize_annotation(
                        annotation,
                        orig_size=image.size()[-2:],
                        target_size=resized_image.size()[-2:],
                    )
                image = resized_image
            if do_normalize:
                image = self.normalize(image, image_mean, image_std)
            if do_convert_annotations and annotations is not None:
                annotation = self.normalize_annotation(annotation, image.shape[-2:])

            processed_images.append(image)
            processed_annotations.append(annotation)
        images = processed_images
        annotations = processed_annotations if annotations is not None else None

        if do_pad:
            # depends on all resized image shapes so we need another loop
            if pad_size is not None:
                padded_size = (pad_size.height, pad_size.width)
            else:
                padded_size = get_max_height_width(images)

            padded_images = []
            padded_annotations = []
            for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
                # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
                if padded_size == image.size()[-2:]:
                    padded_images.append(image)
                    pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
                    padded_annotations.append(annotation)
                    continue
                image, pixel_mask, annotation = self.pad(
                    image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
                )
                padded_images.append(image)
                padded_annotations.append(annotation)
                pixel_masks.append(pixel_mask)
            images = padded_images
            annotations = padded_annotations if annotations is not None else None
            data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})

        data.update({"pixel_values": torch.stack(images, dim=0)})
        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
        if annotations is not None:
            encoded_inputs["labels"] = [
                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
            ]
        return encoded_inputs

    def post_process_object_detection(
        self,
        outputs,
        threshold: float = 0.5,
        target_sizes: TensorType | list[tuple] | None = None,
        top_k: int | None = None,
    ):
        """
        Converts the raw output of [`RfDetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format. Only supports PyTorch.

        Args:
            outputs ([`RfDetrObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        Returns:
            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        """
        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
        if target_sizes is not None:
            if len(out_logits) != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )
        top_k = top_k if top_k is not None else out_logits.shape[1]
        prob = out_logits.sigmoid()
        batch_size, num_queries, num_classes = prob.shape
        k_value = min(top_k, num_queries * num_classes)
        scores, topk_indexes = torch.topk(prob.view(batch_size, -1), k_value, dim=1)
        topk_boxes = topk_indexes // num_classes
        labels = topk_indexes % num_classes
        boxes = center_to_corners_format(out_bbox)
        boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))

        if target_sizes is not None:
            if isinstance(target_sizes, list):
                img_h = torch.tensor([i[0] for i in target_sizes], device=boxes.device)
                img_w = torch.tensor([i[1] for i in target_sizes], device=boxes.device)
            else:
                img_h, img_w = target_sizes.to(boxes.device).unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]

        results = []
        for score, label, box in zip(scores, labels, boxes):
            keep = score > threshold
            results.append({"scores": score[keep], "labels": label[keep], "boxes": box[keep]})
        return results

    def post_process_instance_segmentation(
        self,
        outputs,
        threshold: float = 0.5,
        mask_threshold: float = 0.0,
        target_sizes: list[tuple[int, int]] | None = None,
        return_coco_annotation: bool = False,
        return_binary_maps: bool = False,
        top_k: int | None = None,
    ) -> list[dict[str, Any]]:
        """
        Converts the output of [`RfDetrForInstanceSegmentation`] into instance segmentation predictions.

        Args:
            outputs ([`RfDetrInstanceSegmentationOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.5):
                Score threshold to keep predicted instance masks.
            mask_threshold (`float`, *optional*, defaults to 0.0):
                Threshold to binarize predicted masks.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                Target ``(height, width)`` for each image. If unset, masks are not resized.
            return_coco_annotation (`bool`, *optional*, defaults to `False`):
                If `True`, return segmentation maps as COCO run-length encoding instead of tensors.
                Mutually exclusive with `return_binary_maps`.
            return_binary_maps (`bool`, *optional*, defaults to `False`):
                If `True`, return segmentation maps as a stacked tensor of binary instance masks
                (one per detected instance), without overlap resolution. This matches the output
                format of the original ``rfdetr`` package. Mutually exclusive with
                `return_coco_annotation`.
            top_k (`int`, *optional*):
                Maximum number of candidate queries evaluated before score thresholding.
                Defaults to the total number of queries.

        Returns:
            `list[dict]`: One dict per image with keys:
            - **segmentation** -- `Tensor[H, W]` of ``int32`` segment ids (``-1`` = background),
              `Tensor[num_instances, H, W]` of bool binary masks when `return_binary_maps=True`,
              or a list of RLE encodings when `return_coco_annotation=True`.
            - **segments_info** -- List of dicts with keys ``id``, ``label_id``, and ``score``.
        """
        if return_coco_annotation and return_binary_maps:
            raise ValueError("`return_coco_annotation` and `return_binary_maps` cannot both be `True`.")
        out_logits, out_masks = outputs.logits, outputs.pred_masks
        top_k = top_k if top_k is not None else out_logits.shape[1]

        prob = out_logits.sigmoid()
        batch_size, num_queries, num_classes = prob.shape
        top_k = min(top_k, num_queries * num_classes)

        scores, topk_indexes = torch.topk(prob.view(batch_size, -1), top_k, dim=1)
        topk_queries = topk_indexes // num_classes
        labels = topk_indexes % num_classes
        masks = torch.gather(
            out_masks,
            1,
            topk_queries.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, out_masks.shape[-2], out_masks.shape[-1]),
        )

        results: list[dict[str, Any]] = []
        for batch_idx, (batch_scores, batch_labels, batch_masks) in enumerate(zip(scores, labels, masks)):
            keep = batch_scores > threshold
            pred_scores = batch_scores[keep]
            pred_labels = batch_labels[keep]
            mask_logits = batch_masks[keep]

            if target_sizes is not None:
                height, width = int(target_sizes[batch_idx][0]), int(target_sizes[batch_idx][1])
            else:
                height, width = mask_logits.shape[-2], mask_logits.shape[-1]

            if pred_scores.shape[0] == 0:
                segmentation = torch.full((height, width), -1, dtype=torch.int32, device=out_logits.device)
                results.append({"segmentation": segmentation, "segments_info": []})
                continue

            mask_logits_resized = F.interpolate(
                mask_logits.unsqueeze(1).float(),
                size=(height, width),
                mode="bilinear",
                align_corners=False,
            ).squeeze(1)

            segmentation = torch.full((height, width), -1, dtype=torch.int32, device=out_logits.device)
            segments: list[dict] = []
            instance_maps: list = []
            current_id = 0

            for i in range(pred_scores.shape[0]):
                binary_mask = mask_logits_resized[i] > mask_threshold
                if not binary_mask.any():
                    continue
                if return_binary_maps:
                    instance_maps.append(binary_mask)
                else:
                    pixels_to_paint = binary_mask & (segmentation == -1)
                    if not pixels_to_paint.any():
                        continue
                current_id += 1
                if not return_binary_maps:
                    segmentation[pixels_to_paint] = current_id
                segments.append(
                    {"id": current_id, "label_id": int(pred_labels[i]), "score": round(pred_scores[i].item(), 6)}
                )

            if return_coco_annotation:
                segmentation = convert_segmentation_to_rle(segmentation)
            elif return_binary_maps:
                segmentation = (
                    torch.stack(instance_maps, dim=0)
                    if instance_maps
                    else torch.zeros(0, height, width, dtype=torch.bool, device=out_logits.device)
                )

            results.append({"segmentation": segmentation, "segments_info": segments})

        return results


__all__ = ["RfDetrImageProcessor"]