| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| from typing import Optional, Union |
|
|
| import torch |
|
|
| from transformers.image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution |
| from transformers.image_processing_utils_fast import ( |
| BaseImageProcessorFast, |
| DefaultFastImageProcessorKwargs, |
| divide_to_patches, |
| group_images_by_shape, |
| reorder_images, |
| ) |
| from transformers.image_utils import ( |
| OPENAI_CLIP_MEAN, |
| OPENAI_CLIP_STD, |
| ChannelDimension, |
| ImageInput, |
| PILImageResampling, |
| SizeDict, |
| get_image_size, |
| make_flat_list_of_images, |
| ) |
| from transformers.processing_utils import Unpack |
| from transformers.utils import TensorType, auto_docstring, is_torchvision_v2_available |
|
|
|
|
| if is_torchvision_v2_available(): |
| from torchvision.transforms.v2 import functional as F |
| else: |
| from torchvision.transforms import functional as F |
|
|
|
|
class RFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    """
    image_grid_pinpoints (`list[list[int]]`, *optional*):
        Candidate `(height, width)` resolutions used when splitting a high-resolution image into patches. The
        resolution that best fits the input image size is selected from this list.
    do_pad (`bool`, *optional*):
        Whether to pad the patch dimension of every image in the batch with zeros (appended at the end) so that
        all images have as many patches as the image with the most patches.
    """

    image_grid_pinpoints: Optional[list[list[int]]]
    do_pad: Optional[bool]
|
|
|
|
@auto_docstring
class RImageProcessorFast(BaseImageProcessorFast):
    # Default preprocessing settings for this processor.
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"height": 384, "width": 384}
    default_to_square = False
    crop_size = None
    do_resize = True
    do_center_crop = None
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    do_pad = True
    # Must be a plain list of `(height, width)` pairs: `_get_image_patches` rejects anything else.
    # (A trailing comma after the literal would silently wrap it in a 1-tuple.)
    image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
    valid_kwargs = RFastImageProcessorKwargs
    # Names of the entries placed in the `BatchFeature` returned by `_preprocess`.
    model_input_names = ["pixel_values", "image_sizes", "batch_num_images"]

    def __init__(self, **kwargs: Unpack[RFastImageProcessorKwargs]):
        super().__init__(**kwargs)

    @auto_docstring
    def preprocess(self, images: ImageInput, **kwargs: Unpack[RFastImageProcessorKwargs]) -> BatchFeature:
        # Record how many images each sample holds *before* the input is flattened by
        # `_prepare_images_structure`; `_preprocess` uses it to decide which images get patched.
        if isinstance(images, (tuple, list)) and images and isinstance(images[0], (tuple, list)):
            # Nested input: one inner sequence per sample.
            batch_num_images = [len(x) for x in images]
        elif isinstance(images, (tuple, list)):
            # Flat input: every element is its own single-image sample.
            batch_num_images = [1] * len(images)
        else:
            # A single image.
            batch_num_images = [1]
        kwargs["batch_num_images"] = batch_num_images
        return super().preprocess(images, **kwargs)

    def _prepare_images_structure(
        self,
        images: ImageInput,
    ) -> ImageInput:
        """
        Flatten the (possibly nested) input into a flat list of images.

        Args:
            images (`ImageInput`):
                The images passed to `preprocess`.

        Returns:
            `ImageInput`: The result of `make_flat_list_of_images(images)`.
        """
        return make_flat_list_of_images(images)

    def _resize_for_patching(
        self,
        image: "torch.Tensor",
        target_resolution: tuple,
        interpolation: "F.InterpolationMode",
        input_data_format: ChannelDimension,
    ) -> "torch.Tensor":
        """
        Resize an image to the size computed by `get_patch_output_size` for the target resolution.

        Args:
            image (`torch.Tensor`):
                The image to resize.
            target_resolution (`tuple`):
                The `(height, width)` the image is being fitted into.
            interpolation (`InterpolationMode`):
                Resampling filter used by the resize.
            input_data_format (`ChannelDimension`):
                Channel layout of `image`, forwarded to `get_patch_output_size`.

        Returns:
            `torch.Tensor`: The resized image.
        """
        new_height, new_width = get_patch_output_size(image, target_resolution, input_data_format)
        return F.resize(image, (new_height, new_width), interpolation=interpolation)

    def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
        """
        Compute the border widths needed to centre an image inside a larger canvas.

        Args:
            original_resolution (`tuple`):
                Current `(height, width)` of the image.
            target_resolution (`tuple`):
                Desired `(height, width)` after padding.

        Returns:
            `list[int]`: `[left, top, right, bottom]` padding. When a size difference is odd, the extra pixel is
            added on the right / bottom.
        """
        original_height, original_width = original_resolution
        target_height, target_width = target_resolution
        paste_x, r_x = divmod(target_width - original_width, 2)
        paste_y, r_y = divmod(target_height - original_height, 2)
        return [paste_x, paste_y, paste_x + r_x, paste_y + r_y]

    def _pad_for_patching(
        self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
    ) -> "torch.Tensor":
        """
        Pad an image to a target resolution while maintaining aspect ratio.

        Args:
            image (`torch.Tensor`):
                The image to pad (expected to already be resized by `_resize_for_patching`).
            target_resolution (`tuple`):
                The `(height, width)` to pad to.
            input_data_format (`ChannelDimension`):
                Channel layout of `image`, forwarded to `get_patch_output_size`.

        Returns:
            `torch.Tensor`: The image padded with the default fill value of `F.pad`.
        """
        new_resolution = get_patch_output_size(image, target_resolution, input_data_format)
        padding = self._get_padding_size(new_resolution, target_resolution)
        return F.pad(image, padding=padding)

    def _get_image_patches(
        self,
        image: "torch.Tensor",
        grid_pinpoints,
        size: tuple,
        patch_size: int,
        interpolation: "F.InterpolationMode",
    ) -> list["torch.Tensor"]:
        """
        Process an image with variable resolutions by dividing it into patches.

        Args:
            image (`torch.Tensor`):
                The input image to be processed, in channels-first layout.
            grid_pinpoints (`list`):
                A list of possible `(height, width)` resolutions.
            size (`tuple`):
                Size to resize the original image to.
            patch_size (`int`):
                Size of the patches to divide the image into.
            interpolation (`InterpolationMode`):
                Resampling filter to use if resizing the image.

        Returns:
            `list[torch.Tensor]`: The resized original image followed by the patches cut from the resized and
            padded image.

        Raises:
            TypeError: If `grid_pinpoints` is not a list.
        """
        if not isinstance(grid_pinpoints, list):
            raise TypeError("grid_pinpoints must be a list of possible resolutions.")

        possible_resolutions = grid_pinpoints

        image_size = get_image_size(image, channel_dim=ChannelDimension.FIRST)
        best_resolution = select_best_resolution(image_size, possible_resolutions)
        resized_image = self._resize_for_patching(
            image, best_resolution, interpolation=interpolation, input_data_format=ChannelDimension.FIRST
        )
        padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=ChannelDimension.FIRST)
        patches = divide_to_patches(padded_image, patch_size=patch_size)
        resized_original_image = F.resize(image, size=size, interpolation=interpolation)

        # The downscaled full image always comes first, followed by the high-resolution patches.
        return [resized_original_image] + patches

    def _pad_for_batching(
        self,
        pixel_values: list["torch.Tensor"],
    ) -> list["torch.Tensor"]:
        """
        Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.

        Args:
            pixel_values (`list[torch.Tensor]`):
                One 4D tensor per image whose first dimension is the number of patches.

        Returns:
            `list[torch.Tensor]`: The padded images.
        """
        max_patch = max(len(x) for x in pixel_values)
        # `pad` lists (before, after) pairs starting from the last dimension, so the final pair targets the
        # first (patch) dimension: zeros are appended after the existing patches.
        pixel_values = [
            torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]])
            for image in pixel_values
        ]

        return pixel_values

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        image_grid_pinpoints: list[list[int]],
        interpolation: Optional["F.InterpolationMode"],
        do_center_crop: bool,
        crop_size: SizeDict,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: Optional[Union[float, list[float]]],
        image_std: Optional[Union[float, list[float]]],
        do_pad: bool,
        batch_num_images: list[int],
        return_tensors: Optional[Union[str, TensorType]],
    ) -> BatchFeature:
        """
        Turn a flat list of images into patch tensors and bundle them in a `BatchFeature`.

        Images belonging to single-image samples are split into patches with `_get_image_patches`; images from
        multi-image samples are only padded to a square. Each resulting patch is then optionally resized,
        center-cropped, rescaled and normalized.

        Returns:
            `BatchFeature`: Holds `pixel_values`, `image_sizes` (size of each input image before patching) and
            `batch_num_images`.
        """
        processed_images = []
        image_sizes = []

        # One flag per flattened image: only images that are alone in their sample are patched.
        need_patching = [n == 1 for n in batch_num_images for _ in range(n)]

        # Size of the downscaled "overview" image produced alongside the patches.
        if size and size.height and size.width:
            size_tuple = (size.height, size.width)
        else:
            size_tuple = (size.shortest_edge, size.shortest_edge)

        # Side length of the patches, taken from the first setting that is available.
        if crop_size and crop_size.height:
            patch_size = crop_size.height
        elif size and size.height:
            patch_size = size.height
        else:
            patch_size = size.shortest_edge

        for i, image in enumerate(images):
            if need_patching[i]:
                image_patches = self._get_image_patches(
                    image,
                    image_grid_pinpoints,
                    size=size_tuple,
                    patch_size=patch_size,
                    interpolation=interpolation,
                )
            else:
                # The mean colour is scaled by 255 because padding happens before rescaling.
                padded_image = self.pad_to_square(
                    images=image, background_color=tuple(int(x * 255) for x in self.image_mean)
                )
                image_patches = [padded_image]

            # Patches of equal shape are stacked so that each transform runs once per shape.
            processed_image_patches_grouped = {}
            grouped_image_patches, grouped_image_patches_index = group_images_by_shape(image_patches)
            for shape, stacked_image_patches in grouped_image_patches.items():
                if do_resize:
                    stacked_image_patches = self.resize(
                        image=stacked_image_patches,
                        size=size,
                        interpolation=interpolation,
                    )
                if do_center_crop:
                    stacked_image_patches = self.center_crop(stacked_image_patches, crop_size)

                stacked_image_patches = self.rescale_and_normalize(
                    stacked_image_patches, do_rescale, rescale_factor, do_normalize, image_mean, image_std
                )
                processed_image_patches_grouped[shape] = stacked_image_patches
            processed_image_patches = reorder_images(processed_image_patches_grouped, grouped_image_patches_index)
            processed_image_patches = (
                torch.stack(processed_image_patches, dim=0) if return_tensors else processed_image_patches
            )
            processed_images.append(processed_image_patches)
            image_sizes.append(get_image_size(image, ChannelDimension.FIRST))

        if do_pad:
            processed_images = self._pad_for_batching(processed_images)
        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
        return BatchFeature(
            data={"pixel_values": processed_images, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
            tensor_type=return_tensors,
        )

    def pad_to_square(
        self,
        images: "torch.Tensor",
        background_color: Union[int, tuple[int, int, int]] = 0,
    ) -> "torch.Tensor":
        """
        Pads an image to a square based on the longest edge.

        Args:
            images (`torch.Tensor`):
                The images to pad, in channels-first layout (3D, or 4D with a leading batch dimension).
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers for multi-channel images. If passed as integer
                in multi-channel mode, it will default to `0` in subsequent channels.

        Returns:
            `torch.Tensor`: The padded images, or the input unchanged when it is already square.

        Raises:
            ValueError: If `background_color` is a sequence whose length differs from the number of channels.
        """
        height, width = get_image_size(images, ChannelDimension.FIRST)

        if height == width:
            return images

        num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0]
        if isinstance(background_color, int):
            background_color = [background_color] + [0] * (num_channels - 1)
        elif len(background_color) != num_channels:
            raise ValueError(
                f"background_color must have exactly {num_channels} elements to match the number of channels"
            )

        # Centre the image on the square canvas; any odd leftover pixel goes to the right / bottom.
        max_dim = max(height, width)
        paste_x_left = (max_dim - width) // 2
        paste_y_left = (max_dim - height) // 2
        paste_x_right = max_dim - width - paste_x_left
        paste_y_right = max_dim - height - paste_y_left
        padded_images = F.pad(
            images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color
        )

        return padded_images
|
|
|
|
| __all__ = ["RImageProcessorFast"] |
|
|