diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index b93c3ce242e4..8b2871021dcc 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -355,8 +355,9 @@ def __init__(self, image_processor, tokenizer, **kwargs): self.max_position_embeddings = 16384 # TODO Can't derive this from model files: where to set it? self.pad_token_id = 0 self.dummy_image_index = -1 - self.image_token_id = tokenizer.encode("|SPEAKER|", add_special_tokens=False)[1] - self.image_newline_id = tokenizer.encode("|NEWLINE|", add_special_tokens=False)[1] + vocab = tokenizer.get_vocab() + self.image_token_id = vocab["|SPEAKER|"] + self.image_newline_id = vocab["|NEWLINE|"] @property def image_token_ids(self) -> list[int]: diff --git a/tests/models/fuyu/test_processing_fuyu.py b/tests/models/fuyu/test_processing_fuyu.py index 0421220610aa..81a19d8111c7 100644 --- a/tests/models/fuyu/test_processing_fuyu.py +++ b/tests/models/fuyu/test_processing_fuyu.py @@ -38,6 +38,9 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase): processor_class = FuyuProcessor model_id = "adept/fuyu-8b" + # Fuyu uses a tokenizer with a very large vocabulary (~262K tokens), making tests slow and + # memory-intensive. tiny_model_id points to a trimmed tokenizer repo to keep tests lightweight. 
+ tiny_model_id = "hf-internal-testing/tiny-processor-fuyu" @classmethod def _setup_test_attributes(cls, processor): @@ -75,7 +78,8 @@ def test_fuyu_processing(self): EXPECTED_IMAGE_PATCH_INPUTS = torch.Tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, -1, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, -1, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, -1, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, -1, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, -1, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, -1, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, -1, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, -1, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, -1, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, -1, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, -1, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, -1, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, -1, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,]]).to(torch.int64) EXPECTED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 
71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71011, 71011, 
71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122,]]).to(torch.int64) - one_image_bus_model_inputs = self.get_processor()(text=self.text_prompt, images=self.bus_image_pil) + processor = self.get_processor(use_tiny_ckpt=False) + one_image_bus_model_inputs = processor(text=self.text_prompt, images=self.bus_image_pil) # fmt: on torch.testing.assert_close(one_image_bus_model_inputs["image_patches_indices"], EXPECTED_IMAGE_PATCH_INPUTS) @@ -137,9 +141,13 @@ def test_fuyu_processing_multiple_image_sample(self): SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS = torch.Tensor([[ 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 71011, 71011, 71011, 71019, 1, 128340, 71374, 71389, 120412, 71377, 71835, 71374, 73615, 71375, 71399, 71435, 71122]]) # fmt: on + # Load once and reuse across all assertions in this test to avoid repeatedly loading the + # full processor (which carries the large 262K-vocab tokenizer). 
+ processor = self.get_processor(use_tiny_ckpt=False) + # Batch of two images - equally sized images = [self.bus_image_pil, self.bus_image_pil] - processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images) + processor_outputs = processor(text=[self.text_prompt, self.text_prompt], images=images) self.assertTrue( ( @@ -156,18 +164,18 @@ def test_fuyu_processing_multiple_image_sample(self): # Processes single images with different sizes as expected images = [self.bus_image_pil] - processor_outputs = self.get_processor()(text=self.text_prompt, images=images) + processor_outputs = processor(text=self.text_prompt, images=images) self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_IMAGE_PATCH_INPUTS).all()) self.assertTrue((processor_outputs["input_ids"] == SINGLE_PADDED_UNPACKED_TOKEN_INPUTS).all()) images = [self.bus_image_pil.resize((64, 300))] - processor_outputs = self.get_processor()(text=self.text_prompt, images=images) + processor_outputs = processor(text=self.text_prompt, images=images) self.assertTrue((processor_outputs["image_patches_indices"] == SINGLE_RESIZED_IMAGE_PATCH_INPUTS).all()) self.assertTrue((processor_outputs["input_ids"] == SINGLE_RESIZED_PADDED_UNPACKED_TOKEN_INPUTS).all()) # Batch of two images - different sizes. 
Left-pads the smaller image inputs images = [self.bus_image_pil, self.bus_image_pil.resize((64, 300))] - processor_outputs = self.get_processor()(text=[self.text_prompt, self.text_prompt], images=images) + processor_outputs = processor(text=[self.text_prompt, self.text_prompt], images=images) padding_len_patch = SINGLE_IMAGE_PATCH_INPUTS.shape[1] - SINGLE_RESIZED_IMAGE_PATCH_INPUTS.shape[1] padded_single_resized_image_patch = torch.cat( @@ -318,9 +326,8 @@ def test_unstructured_kwargs(self): def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.get_attributes(): self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - + image_processor = self.get_component("image_processor", use_tiny_ckpt=False) + tokenizer = self.get_component("tokenizer", use_tiny_ckpt=False) processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 559f9d616e59..710660a06969 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -97,9 +97,13 @@ def floats_list(shape, scale=1.0, rng=None, name=None): @require_vision class ProcessorTesterMixin: processor_class = None - model_id = ( - None # Optional: set this to load from a specific pretrained model instead of creating generic components - ) + # Optional: set this to a real Hub repo containing a complete set of processor files + # (tokenizer, image processor, etc.) so all components can be loaded via from_pretrained. + model_id = None + # Optional: set this to a Hub repo containing a complete set of processor files where some + # components represent a tiny version (e.g. a tokenizer with a trimmed vocab) for + # memory-sensitive tests. 
Must be a real Hub repo with all components loadable via from_pretrained. + tiny_model_id = None text_input_name = "input_ids" images_input_name = "pixel_values" videos_input_name = "pixel_values_videos" @@ -138,17 +142,40 @@ def setUpClass(cls): ) cls.tmpdirname = tempfile.mkdtemp() - - # If model_id is specified, load components from that model - if cls.model_id is not None: - processor = cls._setup_from_pretrained(cls.model_id) + cls.full_tmpdirname = None + + if cls.tiny_model_id is not None: + # tiny_model_id is set: tmpdirname holds the lightweight processor (used by default), + # full_tmpdirname holds the full processor (used only by tests that pass use_tiny_ckpt=False to get_processor/get_component). + tiny_processor = cls._setup_from_pretrained(cls.tiny_model_id) + cls._setup_test_attributes(tiny_processor) + tiny_processor.save_pretrained(cls.tmpdirname) + + cls.full_tmpdirname = tempfile.mkdtemp() + # If model_id is specified, load components from that model + if cls.model_id is not None: + full_processor = cls._setup_from_pretrained(cls.model_id) + else: + # Otherwise, create generic components + full_processor = cls._setup_from_components() + # TODO: make this more robust. We intentionally do NOT call _setup_test_attributes(full_processor) + # here because it would overwrite the class attributes already set from tiny_processor (e.g. + # image_token, video_token, audio_token). We assume these special tokens are identical between + # the tiny and full processor — but this is not guaranteed: if the tiny tokenizer is built + # differently (e.g. missing special tokens or using different token strings), cls.image_token + # etc. will silently reflect the wrong values for tests that use the full processor. 
+ full_processor.save_pretrained(cls.full_tmpdirname) else: - # Otherwise, create generic components - processor = cls._setup_from_components() - - # setup test attributes - cls._setup_test_attributes(processor) - processor.save_pretrained(cls.tmpdirname) + # No tiny_model_id: tmpdirname holds the only processor. + # If model_id is specified, load components from that model + if cls.model_id is not None: + processor = cls._setup_from_pretrained(cls.model_id) + else: + # Otherwise, create generic components + processor = cls._setup_from_components() + # setup test attributes + cls._setup_test_attributes(processor) + processor.save_pretrained(cls.tmpdirname) @classmethod def _setup_test_attributes(cls, processor): @@ -160,14 +187,19 @@ def _setup_test_attributes(cls, processor): @classmethod def _setup_from_pretrained(cls, model_id, **kwargs): - """Load all components from a pretrained model.""" + """Load all components from model_id to build the processor. + If any component is provided via a _setup_<attribute>() hook, all remaining components + are loaded individually from model_id so that processor_class.__init__ receives a complete + set of components (all must be passed together when any one is customized). 
+ """ # check if there are any custom components to setup custom_components = {} for attribute in cls.processor_class.get_attributes(): if hasattr(cls, f"_setup_{attribute}"): custom_method = getattr(cls, f"_setup_{attribute}") custom_components[attribute] = custom_method() + # if there is one custom component, we need to add all the other ones (with from_pretrained) if custom_components: for attribute in cls.processor_class.get_attributes(): @@ -348,19 +380,26 @@ def tearDownClass(cls): """Clean up the temporary directory.""" if hasattr(cls, "tmpdirname"): shutil.rmtree(cls.tmpdirname, ignore_errors=True) + if hasattr(cls, "full_tmpdirname") and cls.full_tmpdirname is not None: + shutil.rmtree(cls.full_tmpdirname, ignore_errors=True) @staticmethod def prepare_processor_dict(): """Override this method to provide custom kwargs for processor initialization.""" return {} - def get_component(self, attribute, **kwargs): + def get_component(self, attribute, use_tiny_ckpt=True, **kwargs): + # use_tiny_ckpt only has an effect when tiny_model_id is set. In that case, tmpdirname holds the + # lightweight processor and full_tmpdirname holds the full one. If tiny_model_id is not set, + # tmpdirname already contains the full processor (from cls.model_id or generic components), and calling this + # function with use_tiny_ckpt=True still returns a component of the full processor. 
+ dirpath = self.tmpdirname if (use_tiny_ckpt or self.full_tmpdirname is None) else self.full_tmpdirname if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute: auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"] - component = auto_processor_class.from_pretrained(self.tmpdirname, subfolder=attribute, **kwargs) # noqa + component = auto_processor_class.from_pretrained(dirpath, subfolder=attribute, **kwargs) # noqa else: auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute] - component = auto_processor_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + component = auto_processor_class.from_pretrained(dirpath, **kwargs) # noqa if "tokenizer" in attribute and not component.pad_token: component.pad_token = "[TEST_PAD]" if component.pad_token_id is None: @@ -376,9 +415,14 @@ def prepare_components(self, **kwargs): return components - def get_processor(self): - processor = self.processor_class.from_pretrained(self.tmpdirname) - return processor + def get_processor(self, use_tiny_ckpt=True): + # use_tiny_ckpt only has effect when tiny_model_id is set. In that case, tmpdirname holds the + # lightweight processor and full_tmpdirname holds the full one. If tiny_model_id is not set, + # tmpdirname already contains the full processor loaded from cls.model_id, and calling this + # function with use_tiny_ckpt=True still returns a full processor. + if not use_tiny_ckpt and self.full_tmpdirname is not None: + return self.processor_class.from_pretrained(self.full_tmpdirname) + return self.processor_class.from_pretrained(self.tmpdirname) def prepare_text_inputs(self, batch_size: int | None = None, modalities: str | list | None = None): if isinstance(modalities, str):