Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/transformers/models/fuyu/processing_fuyu.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,8 +355,9 @@ def __init__(self, image_processor, tokenizer, **kwargs):
self.max_position_embeddings = 16384 # TODO Can't derive this from model files: where to set it?
self.pad_token_id = 0
self.dummy_image_index = -1
self.image_token_id = tokenizer.encode("|SPEAKER|", add_special_tokens=False)[1]
self.image_newline_id = tokenizer.encode("|NEWLINE|", add_special_tokens=False)[1]
vocab = tokenizer.get_vocab()
self.image_token_id = vocab["|SPEAKER|"]
self.image_newline_id = vocab["|NEWLINE|"]
Comment thread
ydshieh marked this conversation as resolved.

@property
def image_token_ids(self) -> list[int]:
Expand Down
23 changes: 15 additions & 8 deletions tests/models/fuyu/test_processing_fuyu.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = FuyuProcessor
model_id = "adept/fuyu-8b"
# Fuyu uses a tokenizer with a very large vocabulary (~262K tokens), making tests slow and
# memory-intensive. tiny_model_id points to a trimmed tokenizer repo to keep tests lightweight.
tiny_model_id = "hf-internal-testing/tiny-processor-fuyu"

@classmethod
def _setup_test_attributes(cls, processor):
Expand Down Expand Up @@ -75,7 +78,8 @@ def test_fuyu_processing(self):
EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64)
EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 
71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64)

one_image_bus_model_inputs = self.get_processor()(text=self.text_prompt, images=self.bus_image_pil)
processor = self.get_processor(use_tiny_ckpt=False)
one_image_bus_model_inputs = processor(text=self.text_prompt, images=self.bus_image_pil)

# fmt: on
torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS)
Expand Down Expand Up @@ -137,9 +141,13 @@ def test_fuyu_processing_multiple_image_sample(self):
SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[ 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122]])
# fmt: on

# Load once and reuse across all assertions in this test to avoid repeatedly loading the
# full processor (which carries the large 262K-vocab tokenizer).
processor = self.get_processor(use_tiny_ckpt=False)

# Batch of two images - equally sized
images = [self.bus_image_pil, self.bus_image_pil]
processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)
processor_outputs = processor(text=[self.text_prompt, self.text_prompt], images=images)

self.assertTrue(
(
Expand All @@ -156,18 +164,18 @@ def test_fuyu_processing_multiple_image_sample(self):

# Processes single images with different sizes as expected
images = [self.bus_image_pil]
processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
processor_outputs = processor(text=self.text_prompt, images=images)
self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all())
self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all())

images = [self.bus_image_pil.resize((64, 300))]
processor_outputs = self.get_processor()(text=self.text_prompt, images=images)
processor_outputs = processor(text=self.text_prompt, images=images)
self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all())
self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all())

# Batch of two images - different sizes. Left-pads the smaller image inputs
images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))]
processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images)
processor_outputs = processor(text=[self.text_prompt, self.text_prompt], images=images)

padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1]
padded_single_resized_image_patch = torch.cat(
Expand Down Expand Up @@ -318,9 +326,8 @@ def test_unstructured_kwargs(self):
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.get_attributes():
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")

image_processor = self.get_component("image_processor", use_tiny_ckpt=False)
tokenizer = self.get_component("tokenizer", use_tiny_ckpt=False)
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)

Expand Down
84 changes: 64 additions & 20 deletions tests/test_processing_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,13 @@ def floats_list(shape, scale=1.0, rng=None, name=None):
@require_vision
class ProcessorTesterMixin:
processor_class = None
model_id = (
None # Optional: set this to load from a specific pretrained model instead of creating generic components
)
# Optional: set this to a real Hub repo containing a complete set of processor files
# (tokenizer, image processor, etc.) so all components can be loaded via from_pretrained.
model_id = None
# Optional: set this to a Hub repo containing a complete set of processor files where some
# components represent a tiny version (e.g. a tokenizer with a trimmed vocab) for
# memory-sensitive tests. Must be a real Hub repo with all components loadable via from_pretrained.
tiny_model_id = None
text_input_name = "input_ids"
images_input_name = "pixel_values"
videos_input_name = "pixel_values_videos"
Expand Down Expand Up @@ -138,17 +142,40 @@ def setUpClass(cls):
)

cls.tmpdirname = tempfile.mkdtemp()

# If model_id is specified, load components from that model
if cls.model_id is not None:
processor = cls._setup_from_pretrained(cls.model_id)
cls.full_tmpdirname = None

if cls.tiny_model_id is not None:
# tiny_model_id is set: tmpdirname holds the lightweight processor (used by tests by default),
# full_tmpdirname holds the full processor (used only by tests that pass use_tiny_ckpt=False to get_processor() or get_component()).
tiny_processor = cls._setup_from_pretrained(cls.tiny_model_id)
cls._setup_test_attributes(tiny_processor)
tiny_processor.save_pretrained(cls.tmpdirname)

cls.full_tmpdirname = tempfile.mkdtemp()
# If model_id is specified, load components from that model
if cls.model_id is not None:
full_processor = cls._setup_from_pretrained(cls.model_id)
else:
# Otherwise, create generic components
full_processor = cls._setup_from_components()
# TODO: make this more robust. We intentionally do NOT call _setup_test_attributes(full_processor)
# here because it would overwrite the class attributes already set from tiny_processor (e.g.
# image_token, video_token, audio_token). We assume these special tokens are identical between
# the tiny and full processor — but this is not guaranteed: if the tiny tokenizer is built
# differently (e.g. missing special tokens or using different token strings), cls.image_token
# etc. will silently reflect the wrong values for tests that use the full processor.
full_processor.save_pretrained(cls.full_tmpdirname)
else:
# Otherwise, create generic components
processor = cls._setup_from_components()

# setup test attributes
cls._setup_test_attributes(processor)
processor.save_pretrained(cls.tmpdirname)
# No tiny_model_id: tmpdirname holds the only processor.
# If model_id is specified, load components from that model
if cls.model_id is not None:
processor = cls._setup_from_pretrained(cls.model_id)
else:
# Otherwise, create generic components
processor = cls._setup_from_components()
# setup test attributes
cls._setup_test_attributes(processor)
processor.save_pretrained(cls.tmpdirname)

@classmethod
def _setup_test_attributes(cls, processor):
Expand All @@ -160,14 +187,19 @@ def _setup_test_attributes(cls, processor):

@classmethod
def _setup_from_pretrained(cls, model_id, **kwargs):
"""Load all components from a pretrained model."""
"""Load all components from model_id to build the processor.

If any component is provided via a _setup_<attribute>() hook, all remaining components
are loaded individually from model_id so that processor_class.__init__ receives a complete
set of components (all must be passed together when any one is customized).
"""
# check if there are any custom components to setup
custom_components = {}
for attribute in cls.processor_class.get_attributes():
if hasattr(cls, f"_setup_{attribute}"):
custom_method = getattr(cls, f"_setup_{attribute}")
custom_components[attribute] = custom_method()

# if there is one custom component, we need to add all the other ones (with from_pretrained)
if custom_components:
for attribute in cls.processor_class.get_attributes():
Expand Down Expand Up @@ -348,19 +380,26 @@ def tearDownClass(cls):
"""Clean up the temporary directory."""
if hasattr(cls, "tmpdirname"):
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
if hasattr(cls, "full_tmpdirname") and cls.full_tmpdirname is not None:
shutil.rmtree(cls.full_tmpdirname, ignore_errors=True)

@staticmethod
def prepare_processor_dict():
"""Override this method to provide custom kwargs for processor initialization."""
return {}

def get_component(self, attribute, **kwargs):
def get_component(self, attribute, use_tiny_ckpt=True, **kwargs):
# use_tiny_ckpt only has an effect when tiny_model_id is set. In that case, tmpdirname holds the
# lightweight processor and full_tmpdirname holds the full one. If tiny_model_id is not set,
# tmpdirname already contains the full processor (loaded from cls.model_id or built from generic
# components), so this function returns a component of the full processor even with use_tiny_ckpt=True.
dirpath = self.tmpdirname if (use_tiny_ckpt or self.full_tmpdirname is None) else self.full_tmpdirname
if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
component = auto_processor_class.from_pretrained(self.tmpdirname, subfolder=attribute, **kwargs) # noqa
component = auto_processor_class.from_pretrained(dirpath, subfolder=attribute, **kwargs) # noqa
else:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute]
component = auto_processor_class.from_pretrained(self.tmpdirname, **kwargs) # noqa
component = auto_processor_class.from_pretrained(dirpath, **kwargs) # noqa
if "tokenizer" in attribute and not component.pad_token:
component.pad_token = "[TEST_PAD]"
if component.pad_token_id is None:
Expand All @@ -376,9 +415,14 @@ def prepare_components(self, **kwargs):

return components

def get_processor(self):
processor = self.processor_class.from_pretrained(self.tmpdirname)
return processor
def get_processor(self, use_tiny_ckpt=True):
# use_tiny_ckpt only has an effect when tiny_model_id is set. In that case, tmpdirname holds the
# lightweight processor and full_tmpdirname holds the full one. If tiny_model_id is not set,
# tmpdirname already contains the full processor (loaded from cls.model_id or built from generic
# components), so this function returns the full processor even with use_tiny_ckpt=True.
if not use_tiny_ckpt and self.full_tmpdirname is not None:
return self.processor_class.from_pretrained(self.full_tmpdirname)
return self.processor_class.from_pretrained(self.tmpdirname)

def prepare_text_inputs(self, batch_size: int | None = None, modalities: str | list | None = None):
if isinstance(modalities, str):
Expand Down
Loading