Instructions to use YannQi/R-4B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use YannQi/R-4B with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# trust_remote_code=True allows the custom Python code stored in the model repo to run.
pipe = pipeline(
    task="image-text-to-text",
    model="YannQi/R-4B",
    trust_remote_code=True,
)

# A single user turn made of one image (given by URL) and one text question.
image_part = {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}
question_part = {"type": "text", "text": "What animal is on the candy?"}
messages = [
    {"role": "user", "content": [image_part, question_part]},
]
pipe(text=messages)

# Load model directly
from transformers import AutoModel

# dtype="auto" leaves the choice of weight dtype to the library instead of fixing one here.
model = AutoModel.from_pretrained(
    "YannQi/R-4B",
    trust_remote_code=True,
    dtype="auto",
)

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use YannQi/R-4B with vLLM:

Install from pip and serve model

# Step 1 - install vLLM with pip:
pip install vllm
# Step 2 - start the vLLM server for this model:
vllm serve "YannQi/R-4B"
# Step 3 - query the server with curl (OpenAI-compatible API); the request targets port 8000:
curl --request POST "http://localhost:8000/v1/chat/completions" \
	--header "Content-Type: application/json" \
	--data '{
		"model": "YannQi/R-4B",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

# Run the model directly from its hf.co reference with `docker model run`.
docker model run hf.co/YannQi/R-4B

SGLang

How to use YannQi/R-4B with SGLang:

Install from pip and serve model

# Step 1 - install SGLang with pip:
pip install sglang
# Step 2 - start the SGLang server, listening on all interfaces at port 30000:
python3 -m sglang.launch_server --model-path "YannQi/R-4B" --host 0.0.0.0 --port 30000
# Step 3 - query the server with curl (OpenAI-compatible API):
curl --request POST "http://localhost:30000/v1/chat/completions" \
	--header "Content-Type: application/json" \
	--data '{
		"model": "YannQi/R-4B",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest image.
# The host Hugging Face cache directory is mounted into the container, and port 30000 is published.
# Replace <secret> with your own Hugging Face token.
docker run --gpus all \
    --shm-size 32g \
    --publish 30000:30000 \
    --volume ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server --model-path "YannQi/R-4B" --host 0.0.0.0 --port 30000
# Query the server with curl (OpenAI-compatible API):
curl --request POST "http://localhost:30000/v1/chat/completions" \
	--header "Content-Type: application/json" \
	--data '{
		"model": "YannQi/R-4B",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use YannQi/R-4B with Docker Model Runner:
```
# Run the model with Docker Model Runner, referencing it by its hf.co path.
docker model run hf.co/YannQi/R-4B
```

R-4B / image_processing_r_fast.py

YannQi

Upload folder using huggingface_hub

d68afe0 verified 9 months ago

raw

history blame contribute delete

12.6 kB

	# coding=utf-8
	# Copyright 2024 HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	from typing import Optional, Union

	import torch

	from transformers.image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
	from transformers.image_processing_utils_fast import (
	BaseImageProcessorFast,
	DefaultFastImageProcessorKwargs,
	divide_to_patches,
	group_images_by_shape,
	reorder_images,
	)
	from transformers.image_utils import (
	OPENAI_CLIP_MEAN,
	OPENAI_CLIP_STD,
	ChannelDimension,
	ImageInput,
	PILImageResampling,
	SizeDict,
	get_image_size,
	make_flat_list_of_images,
	)
	from transformers.processing_utils import Unpack
	from transformers.utils import TensorType, auto_docstring, is_torchvision_v2_available


	if is_torchvision_v2_available():
	from torchvision.transforms.v2 import functional as F
	else:
	from torchvision.transforms import functional as F


	class RFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
	    # Keyword arguments accepted by `RImageProcessorFast` in addition to those
	    # already declared by `DefaultFastImageProcessorKwargs`.

	    # Candidate resolutions passed on to `_get_image_patches`.
	    image_grid_pinpoints: Optional[list[list[int]]]

	    # Whether `_preprocess` pads the per-image patch stacks to a common length.
	    do_pad: Optional[bool]


	@auto_docstring
	class RImageProcessorFast(BaseImageProcessorFast):
	    # Image processor for R-4B built on `BaseImageProcessorFast`. The attributes
	    # below are the class-level defaults for the preprocessing options.
	    resample = PILImageResampling.BICUBIC
	    image_mean = OPENAI_CLIP_MEAN
	    image_std = OPENAI_CLIP_STD
	    size = {"height": 384, "width": 384}
	    default_to_square = False
	    crop_size = None
	    do_resize = True
	    do_center_crop = None
	    do_rescale = True
	    do_normalize = True
	    do_convert_rgb = True
	    do_pad = True
	    # Candidate resolutions used when tiling a single image. This must be a plain
	    # list: `_get_image_patches` raises TypeError for any other type, and a trailing
	    # comma after the literal would silently wrap it in a one-element tuple.
	    image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
	    valid_kwargs = RFastImageProcessorKwargs
	    # NOTE(review): `_preprocess` returns the keys "pixel_values", "image_sizes" and
	    # "batch_num_images"; confirm that "pixel_values_videos" is the intended name here.
	    model_input_names = ["pixel_values_videos"]

	    def __init__(self, **kwargs: Unpack[RFastImageProcessorKwargs]):
	        # All keyword arguments are forwarded unchanged to the base class.
	        super().__init__(**kwargs)

	@auto_docstring
	def preprocess(self, images: ImageInput, **kwargs: Unpack[RFastImageProcessorKwargs]) -> BatchFeature:
	    # Record how many images each sample contributes, then delegate to the base
	    # class. The counts travel in kwargs under "batch_num_images".
	    if not isinstance(images, (tuple, list)):
	        # A lone image counts as one sample holding one image.
	        images_per_sample = [1]
	    elif isinstance(images[0], (tuple, list)):
	        # The first element is a list, so all elements are assumed to be lists.
	        images_per_sample = [len(sample) for sample in images]
	    else:
	        # Flat list: every image is treated as its own single-image sample
	        # (kept for backward compatibility).
	        images_per_sample = [1] * len(images)

	    kwargs["batch_num_images"] = images_per_sample
	    return super().preprocess(images, **kwargs)

	def _prepare_images_structure(
	    self,
	    images: ImageInput,
	) -> ImageInput:
	    """Return `images` as the flat list produced by `make_flat_list_of_images`."""
	    flat_images = make_flat_list_of_images(images)
	    return flat_images

	def _resize_for_patching(
	    self,
	    image: "torch.Tensor",
	    target_resolution: tuple,
	    interpolation: "F.InterpolationMode",
	    input_data_format: ChannelDimension,
	) -> "torch.Tensor":
	    """
	    Resize `image` to the size that `get_patch_output_size` computes for `target_resolution`.

	    Args:
	        image (`torch.Tensor`):
	            The image to resize.
	        target_resolution (`tuple`):
	            The resolution the image is being fitted to.
	        interpolation (`F.InterpolationMode`):
	            Resampling filter passed to `F.resize`.
	        input_data_format (`ChannelDimension`):
	            Channel layout of `image`, forwarded to `get_patch_output_size`.

	    Returns:
	        `torch.Tensor`: The resized image.
	    """
	    output_size = get_patch_output_size(image, target_resolution, input_data_format)
	    height, width = output_size
	    return F.resize(image, (height, width), interpolation=interpolation)

	def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
	original_height, original_width = original_resolution
	target_height, target_width = target_resolution
	paste_x, r_x = divmod(target_width - original_width, 2)
	paste_y, r_y = divmod(target_height - original_height, 2)
	return [paste_x, paste_y, paste_x + r_x, paste_y + r_y]

	def _pad_for_patching(
	    self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
	) -> "torch.Tensor":
	    """
	    Pad an image up to `target_resolution`, splitting the padding between both sides.

	    The content size is taken from `get_patch_output_size` and the per-side amounts
	    from `_get_padding_size`.
	    """
	    content_resolution = get_patch_output_size(image, target_resolution, input_data_format)
	    per_side_padding = self._get_padding_size(content_resolution, target_resolution)
	    return F.pad(image, padding=per_side_padding)

	def _get_image_patches(
	    self,
	    image: "torch.Tensor",
	    grid_pinpoints,
	    size: tuple,
	    patch_size: int,
	    interpolation: "F.InterpolationMode",
	) -> list["torch.Tensor"]:
	    """
	    Split an image into patches at the best-matching candidate resolution.

	    The image is resized and padded to the resolution chosen by `select_best_resolution`,
	    then cut into patches with `divide_to_patches`. A copy of the whole image resized to
	    `size` is placed in front of those patches.

	    Args:
	        image (`torch.Tensor`):
	            The input image; its size is read in channels-first layout.
	        grid_pinpoints (`list`):
	            The list of candidate resolutions to choose from.
	        size (`tuple`):
	            Size the whole image is resized to for the first entry of the result.
	        patch_size (`int`):
	            Size of the patches to divide the padded image into.
	        interpolation (`F.InterpolationMode`):
	            Resampling filter used for both resize operations.

	    Returns:
	        `list[torch.Tensor]`: The resized whole image followed by the patches.

	    Raises:
	        TypeError: If `grid_pinpoints` is not a `list`.
	    """
	    if not isinstance(grid_pinpoints, list):
	        raise TypeError("grid_pinpoints must be a list of possible resolutions.")

	    # Choose the candidate resolution for this image's (height, width).
	    target_resolution = select_best_resolution(
	        get_image_size(image, channel_dim=ChannelDimension.FIRST), grid_pinpoints
	    )

	    # Fit the image to that resolution: resize first, then pad.
	    fitted_image = self._resize_for_patching(
	        image, target_resolution, interpolation=interpolation, input_data_format=ChannelDimension.FIRST
	    )
	    fitted_image = self._pad_for_patching(
	        fitted_image, target_resolution, input_data_format=ChannelDimension.FIRST
	    )
	    tiles = divide_to_patches(fitted_image, patch_size=patch_size)

	    # The whole image, resized to `size`, always comes first.
	    overview = F.resize(image, size=size, interpolation=interpolation)
	    return [overview] + tiles

	def _pad_for_batching(
	self,
	pixel_values: list["torch.Tensor"],
	) -> list["torch.Tensor"]:
	"""
	Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.

	Args:
	pixel_values (`list[torch.Tensor]`):
	An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)

	Returns:
	list[`torch.Tensor`]: The padded images.
	"""
	max_patch = max(len(x) for x in pixel_values)
	pixel_values = [
	torch.nn.functional.pad(image, pad=[0, 0, 0, 0, 0, 0, 0, max_patch - image.shape[0]])
	for image in pixel_values
	]

	return pixel_values

	def _preprocess(
	    self,
	    images: list["torch.Tensor"],
	    do_resize: bool,
	    size: SizeDict,
	    image_grid_pinpoints: list[list[int]],
	    interpolation: Optional["F.InterpolationMode"],
	    do_center_crop: bool,
	    crop_size: SizeDict,
	    do_rescale: bool,
	    rescale_factor: float,
	    do_normalize: bool,
	    image_mean: Optional[Union[float, list[float]]],
	    image_std: Optional[Union[float, list[float]]],
	    do_pad: bool,
	    batch_num_images: list[int],
	    return_tensors: Optional[Union[str, TensorType]],
	) -> BatchFeature:
	    """
	    Turn a flat list of images into per-image patch stacks and wrap them in a `BatchFeature`.

	    An image that is the only image of its sample is tiled with `_get_image_patches`. An image
	    from a multi-image sample is only padded to a square. Every resulting patch is then
	    optionally resized and center-cropped, and finally rescaled/normalized.

	    Note that the square padding colour is derived from `self.image_mean`, not from the
	    `image_mean` argument.

	    Returns:
	        `BatchFeature`: Holds "pixel_values", "image_sizes" (the size of each input image
	        as read by `get_image_size`) and "batch_num_images".
	    """
	    # One flag per flattened image: only single-image samples are patched.
	    patch_flags = []
	    for count in batch_num_images:
	        patch_flags.extend([count == 1] * count)

	    # Size used for the whole-image entry that precedes the patches.
	    has_explicit_size = size and size.height and size.width
	    if has_explicit_size:
	        full_image_size = (size.height, size.width)
	    else:
	        full_image_size = (size.shortest_edge, size.shortest_edge)

	    # Patch edge length: crop height first, then size height, then shortest edge.
	    if crop_size and crop_size.height:
	        tile_size = crop_size.height
	    else:
	        tile_size = size.height if size and size.height else size.shortest_edge

	    pixel_values = []
	    image_sizes = []
	    for index, image in enumerate(images):
	        if patch_flags[index]:
	            candidates = self._get_image_patches(
	                image,
	                image_grid_pinpoints,
	                size=full_image_size,
	                patch_size=tile_size,
	                interpolation=interpolation,
	            )
	        else:
	            fill_color = tuple(int(channel_mean * 255) for channel_mean in self.image_mean)
	            candidates = [self.pad_to_square(images=image, background_color=fill_color)]

	        # Process same-shaped patches together, then restore their original order.
	        shape_groups, shape_index = group_images_by_shape(candidates)
	        processed_groups = {}
	        for shape, batch in shape_groups.items():
	            if do_resize:
	                batch = self.resize(image=batch, size=size, interpolation=interpolation)
	            if do_center_crop:
	                batch = self.center_crop(batch, crop_size)
	            # Fused rescale and normalize
	            processed_groups[shape] = self.rescale_and_normalize(
	                batch, do_rescale, rescale_factor, do_normalize, image_mean, image_std
	            )
	        ordered_patches = reorder_images(processed_groups, shape_index)
	        if return_tensors:
	            ordered_patches = torch.stack(ordered_patches, dim=0)

	        pixel_values.append(ordered_patches)
	        image_sizes.append(get_image_size(image, ChannelDimension.FIRST))

	    if do_pad:
	        pixel_values = self._pad_for_batching(pixel_values)
	    if return_tensors:
	        pixel_values = torch.stack(pixel_values, dim=0)

	    return BatchFeature(
	        data={"pixel_values": pixel_values, "image_sizes": image_sizes, "batch_num_images": batch_num_images},
	        tensor_type=return_tensors,
	    )

	# Adapted from transformers.models.llava.image_processing_llava_fast.LlavaImageProcessorFast.pad_to_square
	def pad_to_square(
	    self,
	    images: "torch.Tensor",
	    background_color: Union[int, tuple[int, int, int]] = 0,
	) -> "torch.Tensor":
	    """
	    Pads an image to a square based on the longest edge.

	    Args:
	        images (`torch.Tensor`):
	            The images to pad, in channels-first layout. A 4-dimensional tensor is treated as a
	            batch (channels in dimension 1), anything else as a single image (channels in
	            dimension 0).
	        background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
	            The color to use for the padding. Can be an integer for single channel or a
	            tuple of integers for multi-channel images. If passed as an integer
	            in multi-channel mode, it will default to `0` in subsequent channels.

	    Returns:
	        `torch.Tensor`: The padded images, or the input itself when it is already square.

	    Raises:
	        ValueError: If `background_color` is a sequence whose length differs from the number
	            of channels.
	    """
	    height, width = get_image_size(images, ChannelDimension.FIRST)

	    if height == width:
	        return images

	    num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0]
	    if isinstance(background_color, int):
	        background_color = [background_color] + [0] * (num_channels - 1)
	    elif len(background_color) != num_channels:
	        # The check rejects sequences that are too short as well as too long.
	        raise ValueError(
	            f"background_color must have exactly {num_channels} elements to match the number of channels"
	        )

	    # Centre the image on the square; an odd difference puts the extra pixel on the right/bottom.
	    max_dim = max(height, width)
	    paste_x_left = (max_dim - width) // 2
	    paste_y_left = (max_dim - height) // 2
	    paste_x_right = max_dim - width - paste_x_left
	    paste_y_right = max_dim - height - paste_y_left
	    padded_images = F.pad(
	        images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color
	    )

	    return padded_images


	# Public API of this module: only the image processor class is exported.
	__all__ = ["RImageProcessorFast"]