From b00e4c526bc943579ba6b9731ca9354e070697ee Mon Sep 17 00:00:00 2001 From: federetyk Date: Thu, 15 Jan 2026 11:26:39 +0100 Subject: [PATCH 1/7] refactor: generalize dataset indexing from language-based to dataset_id-based --- CONTRIBUTING.md | 10 +- README.md | 2 +- examples/custom_task_example.py | 6 +- src/workrb/config.py | 10 +- src/workrb/results.py | 23 ++- src/workrb/run.py | 42 ++--- src/workrb/tasks/abstract/base.py | 102 +++++++--- .../tasks/abstract/classification_base.py | 92 ++++++--- src/workrb/tasks/abstract/ranking_base.py | 60 ++++-- src/workrb/tasks/classification/job2skill.py | 40 ++-- src/workrb/tasks/ranking/job2skill.py | 30 +-- src/workrb/tasks/ranking/job_similarity.py | 17 +- src/workrb/tasks/ranking/jobnorm.py | 16 +- src/workrb/tasks/ranking/skill2job.py | 27 ++- src/workrb/tasks/ranking/skill_extraction.py | 16 +- src/workrb/tasks/ranking/skill_similarity.py | 17 +- src/workrb/tasks/ranking/skillnorm.py | 18 +- tests/test_contextmatch_model.py | 2 +- tests/test_curriculum_encoder_model.py | 2 +- tests/test_model_task_compatibility.py | 46 ++--- tests/test_multi_dataset_task.py | 177 ++++++++++++++++++ tests/test_task_registry.py | 6 +- tests/test_utils.py | 20 +- 23 files changed, 570 insertions(+), 211 deletions(-) create mode 100644 tests/test_multi_dataset_task.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 94b2360..278e500 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -180,10 +180,10 @@ class MyCustomRankingTask(RankingTask): """Override default metrics if needed""" return ["map", "mrr", "recall@5", "recall@10"] - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: """ - Load dataset for a specific language and split. - + Load dataset for a specific dataset ID and split. + Returns: RankingDataset with query_texts, target_indices, and target_space """ @@ -195,12 +195,12 @@ class MyCustomRankingTask(RankingTask): [0, 2], # Software Engineer -> Python, SQL [0, 1], # Data Scientist -> Python, Machine Learning ] - + return RankingDataset( query_texts=query_texts, target_indices=target_indices, target_space=target_space, - language=language, + dataset_id=dataset_id, ) ``` diff --git a/README.md b/README.md index 7587625..5c6465a 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ Feel free to make a PR to add your models & tasks to the official package! See [ ### Checkpointing & Resuming -WorkRB automatically saves result checkpoints after each task completion in a specific language. +WorkRB automatically saves result checkpoints after each dataset evaluation within a task. **Automatic Resuming** - Simply rerun with the same `output_folder`: diff --git a/examples/custom_task_example.py b/examples/custom_task_example.py index cf44d8f..5b2eaa3 100644 --- a/examples/custom_task_example.py +++ b/examples/custom_task_example.py @@ -78,14 +78,14 @@ def supported_target_languages(self) -> list[Language]: """Supported target languages are English.""" return [Language.EN] - def load_monolingual_data(self, language: Language, split: DatasetSplit) -> RankingDataset: + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: """ Load data for evaluation. This method must return a RankingDataset. 
Args: - language: Language code (e.g., "en", "de", "fr") + dataset_id: Dataset identifier (e.g., "en", "de", "fr" for language-based tasks) split: Data split ("test", "validation", "train") Returns @@ -121,7 +121,7 @@ def load_monolingual_data(self, language: Language, split: DatasetSplit) -> Rank query_texts=queries, target_indices=labels, target_space=targets, - language=language, + dataset_id=dataset_id, ) # Note: The evaluate() method is inherited from RankingTask and doesn't need diff --git a/src/workrb/config.py b/src/workrb/config.py index b3294fe..0d5c2b3 100644 --- a/src/workrb/config.py +++ b/src/workrb/config.py @@ -208,21 +208,21 @@ def get_pending_work( ) -> list[tuple]: """Determine what work still needs to be done. - Work is defined as a (task, language) combination that is not completed. + Work is defined as a (task, dataset_id) combination that is not completed. """ pending_work = [] for task in tasks: - for language in task.languages: - # Successful completed (task, language) combination + for dataset_id in task.dataset_ids: + # Successful completed (task, dataset_id) combination if ( results is not None and task.name in results.task_results - and language in results.task_results[task.name].language_results + and dataset_id in results.task_results[task.name].language_results ): continue # Add to pending work - pending_work.append((task, language)) + pending_work.append((task, dataset_id)) return pending_work diff --git a/src/workrb/results.py b/src/workrb/results.py index 707039f..5537bcf 100644 --- a/src/workrb/results.py +++ b/src/workrb/results.py @@ -22,7 +22,7 @@ class TaskResultMetadata(BaseModel): class MetricsResult(BaseModel): """Metric results for a single evaluation run. - In the becnhmark, this is a single evaluation run for a single language. + In the benchmark, this is a single evaluation run for a single dataset. """ evaluation_time: float = Field(ge=0) @@ -34,8 +34,8 @@ class TaskResults(BaseModel): """Results for a task.""" metadata: TaskResultMetadata - language_results: dict[str, MetricsResult] # language -> results - """ Dictionary of language codes to their computed results. """ + language_results: dict[str, MetricsResult] # dataset_id -> results + """Dictionary of dataset IDs to their computed results.""" class BenchmarkMetadata(BaseModel): @@ -292,24 +292,27 @@ def _aggregate_per_language( ) -> dict[ResultTagString, float]: """Aggregate results per language. - Collects language-specific results over all tasks, and aggregates all availble results. + Collects language-specific results over all tasks, and aggregates all available results. Results may be imbalanced if tasks support different languages. 
""" # Collect metric values per task raw_results = defaultdict(list) for task_result in self.task_results.values(): - for language, metrics_result in task_result.language_results.items(): + for dataset_id, metrics_result in task_result.language_results.items(): for metric_name, metric_value in metrics_result.metrics_dict.items(): - raw_results[(language, metric_name)].append(metric_value) + raw_results[(dataset_id, metric_name)].append(metric_value) # Compute stats results = {} - for (language, metric_name), values in raw_results.items(): + for (dataset_id, metric_name), values in raw_results.items(): stats = self._compute_stats(values) for agg in aggregations: assert agg in stats, f"Aggregation {agg} not found in stats: {stats.keys()}" tag = ResultTagString( - name=tag_name, metric_name=metric_name, aggregation=agg, grouping_name=language + name=tag_name, + metric_name=metric_name, + aggregation=agg, + grouping_name=dataset_id, ) results[tag] = stats[agg] return results @@ -340,7 +343,7 @@ def _get_flat_dataframe(self) -> pd.DataFrame: """Get flat dataframe of the benchmark results with each metric value as a separate row.""" data = [] for task_name, task_result in self.task_results.items(): - for language, metrics_result in task_result.language_results.items(): + for dataset_id, metrics_result in task_result.language_results.items(): for metric_name, metric_value in metrics_result.metrics_dict.items(): data.append( { @@ -349,7 +352,7 @@ def _get_flat_dataframe(self) -> pd.DataFrame: "task_type": str(task_result.metadata.task_type), # "task_label_type": str(task_result.metadata.label_type), # "task_split": str(task_result.metadata.split), - "task_language": str(language), + "dataset_id": str(dataset_id), "metric_name": str(metric_name), "metric_value": float(metric_value), } diff --git a/src/workrb/run.py b/src/workrb/run.py index 1770512..7fd6e3a 100644 --- a/src/workrb/run.py +++ b/src/workrb/run.py @@ -22,7 +22,7 @@ TaskResultMetadata, TaskResults, ) -from workrb.tasks.abstract.base import Language, Task +from workrb.tasks.abstract.base import Task logger = logging.getLogger(__name__) setup_logger(__name__, verbose=False) @@ -80,10 +80,10 @@ def evaluate( # Group pending work by task for better organization work_by_task = {} - for task, language in pending_work: + for task, dataset_id in pending_work: if task.name not in work_by_task: - work_by_task[task.name] = {"task": task, "languages": []} - work_by_task[task.name]["languages"].append(language) + work_by_task[task.name] = {"task": task, "dataset_ids": []} + work_by_task[task.name]["dataset_ids"].append(dataset_id) # Run pending work start_time_benchmark = time.time() @@ -101,7 +101,7 @@ def evaluate( # Update metadata results.metadata.total_evaluation_time = time.time() - start_time_benchmark results.metadata.resumed_from_checkpoint = len(pending_work) < sum( - len(task.languages) for task in tasks + len(task.dataset_ids) for task in tasks ) # Save config and results @@ -206,11 +206,11 @@ def get_tasks_overview(tasks: Sequence[Task]) -> str: lines.append(f"{task_name:<40} {group:<20} {task_languages:<20}") - # Add size one-liner for each language - for lang in task.languages: - size_info = task.get_size_oneliner(lang) + # Add size one-liner for each dataset + for dataset_id in task.dataset_ids: + size_info = task.get_size_oneliner(dataset_id) if size_info: - lines.append(f" └─ {lang}: {size_info}") + lines.append(f" └─ {dataset_id}: {size_info}") lines.append("-" * 80) @@ -227,7 +227,7 @@ def _get_all_languages(tasks: 
Sequence[Task]) -> list[str]: def _get_total_evaluations(tasks: Sequence[Task]) -> int: """Get the total number of evaluations.""" - return sum(len(task.languages) for task in tasks) + return sum(len(task.dataset_ids) for task in tasks) def _init_checkpointing( @@ -307,12 +307,12 @@ def _run_pending_work( run_idx = results.get_num_evaluation_results() # Already completed evaluations for work_info in work_by_task.values(): task: Task = work_info["task"] - pending_languages: list[str] = work_info["languages"] + pending_dataset_ids: list[str] = work_info["dataset_ids"] logger.info(f"{'=' * 60}") logger.info(f"Evaluating task: {task.name}") logger.info(f"Completed {run_idx} / {_get_total_evaluations(tasks)} evaluations. ") - logger.info(f"Pending languages for this task: {len(pending_languages)}") + logger.info(f"Pending datasets for this task: {len(pending_dataset_ids)}") # Initialize task results if not exists if task.name not in results.task_results: @@ -327,11 +327,9 @@ def _run_pending_work( language_results={}, ) - # Evaluate pending languages - for language in pending_languages: - logger.info( - f"* Running language: {language} ({task.get_size_oneliner(Language(language))})" - ) + # Evaluate pending datasets + for dataset_id in pending_dataset_ids: + logger.info(f"* Running dataset: {dataset_id} ({task.get_size_oneliner(dataset_id)})") # Get metrics for this task task_metrics = None @@ -340,15 +338,15 @@ def _run_pending_work( try: start_time_eval = time.time() - lang_results: dict[str, float] = task.evaluate( - model=model, metrics=task_metrics, language=Language(language) + dataset_results: dict[str, float] = task.evaluate( + model=model, metrics=task_metrics, dataset_id=dataset_id ) evaluation_time = time.time() - start_time_eval # Store results - results.task_results[task.name].language_results[language] = MetricsResult( + results.task_results[task.name].language_results[dataset_id] = MetricsResult( evaluation_time=evaluation_time, - metrics_dict=lang_results, + metrics_dict=dataset_results, ) # Save incremental results to checkpoint @@ -357,7 +355,7 @@ def _run_pending_work( # Show key metrics key_metric = task.default_metrics[0] - logger.info(f"\t{key_metric}: {lang_results[key_metric]:.3f}") + logger.info(f"\t{key_metric}: {dataset_results[key_metric]:.3f}") run_idx += 1 except Exception as e: logger.error(f"Error: {e}") diff --git a/src/workrb/tasks/abstract/base.py b/src/workrb/tasks/abstract/base.py index 0ea733a..83b7f9b 100644 --- a/src/workrb/tasks/abstract/base.py +++ b/src/workrb/tasks/abstract/base.py @@ -55,10 +55,11 @@ def __init__( f"Invalid split: '{split}'. Supported splits: {list(DatasetSplit)}" ) from e - # Load datasets for all languages - self.lang_datasets = self._load_multilingual_data( - languages=self.languages, split=self.split - ) + # Select dataset identifiers that match the requested languages + self.dataset_ids = self.languages_to_dataset_ids(self.languages) + + # Load datasets for the selected dataset identifiers + self.datasets = self._load_datasets(dataset_ids=self.dataset_ids, split=self.split) def _parse_languages( self, languages: list[str], unsupported_lang_mode: Literal["error", "skip"] @@ -82,6 +83,41 @@ def _parse_languages( parsed_languages.append(lang) return parsed_languages + def languages_to_dataset_ids(self, languages: list[Language]) -> list[str]: + """Convert languages to dataset IDs. + + Default implementation returns language codes as dataset IDs (1:1 mapping). 
+ This provides automatic backward compatibility for tasks that are a union of + monolingual datasets. + + Other tasks with multiple datasets per language can override this method to + return all datasets that use only languages from the provided set. + + Args: + languages: List of Language enums requested for evaluation + + Returns + ------- + List of dataset identifier strings + """ + return [lang.value for lang in languages] + + def _load_datasets(self, dataset_ids: list[str], split: DatasetSplit) -> dict[str, Any]: + """Load datasets for specified IDs. + + Args: + dataset_ids: List of dataset identifiers to load + split: Dataset split to load + + Returns + ------- + Dictionary mapping dataset_id to dataset object + """ + datasets = {} + for dataset_id in dataset_ids: + datasets[dataset_id] = self.load_dataset(dataset_id=dataset_id, split=split) + return datasets + def get_task_config(self) -> dict[str, Any]: """Get task configuration.""" return { @@ -154,8 +190,16 @@ def split_test_fraction(self) -> float: """Default fraction of data to use for test split.""" return 0.8 - def get_size_oneliner(self, language: Language) -> str: - """Get dataset size summary to display status.""" + def get_size_oneliner(self, dataset_id: str) -> str: + """Get dataset size summary to display status. + + Args: + dataset_id: Dataset identifier + + Returns + ------- + Human-readable size string + """ return "" @final @@ -165,27 +209,35 @@ def split_val_fraction(self) -> float: assert 0 <= self.split_test_fraction <= 1, "Split test fraction must be between 0 and 1" return 1 - self.split_test_fraction - def _load_multilingual_data( - self, languages: list[Language], split: DatasetSplit - ) -> dict[Language, Any]: - """Load datasets for all languages.""" - lang_datasets: dict[Language, Any] = {} + @abstractmethod + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> Any: + """Load dataset for specific ID and split. - # Check if languages are supported - non_supported_languages = set(languages) - set(self.supported_languages) - if non_supported_languages: - raise ValueError( - f"The following languages are defined for '{self.name}' but are not supported: {non_supported_languages}. Supported languages: {self.supported_languages}" - ) + For tasks that are a union of monolingual datasets: dataset_id equals + language code (e.g., "en", "de"). - for lang in languages: - lang_datasets[lang] = self.load_monolingual_data(split=split, language=lang) - return lang_datasets + For other tasks: dataset_id can encode additional information like + country and languages (e.g., "deu_q_de_c_de"). - @abstractmethod - def load_monolingual_data(self, language: Language, split: DatasetSplit) -> Any: - pass + Args: + dataset_id: Unique identifier for the dataset + split: Dataset split to load + + Returns + ------- + Dataset object (RankingDataset or ClassificationDataset) + """ @abstractmethod - def evaluate(self, model, metrics=None, language: Language = Language.EN) -> dict[str, float]: - pass + def evaluate(self, model, metrics=None, dataset_id: str = "en") -> dict[str, float]: + """Evaluate model on specific dataset. + + Args: + model: Model to evaluate + metrics: List of metric names. 
If None, uses default_metrics + dataset_id: Dataset identifier to evaluate on + + Returns + ------- + Dictionary of metric names to values + """ diff --git a/src/workrb/tasks/abstract/classification_base.py b/src/workrb/tasks/abstract/classification_base.py index 78b97b7..999bea9 100644 --- a/src/workrb/tasks/abstract/classification_base.py +++ b/src/workrb/tasks/abstract/classification_base.py @@ -13,7 +13,6 @@ BaseTaskGroup, DatasetSplit, LabelType, - Language, Task, TaskType, ) @@ -43,7 +42,7 @@ def __init__( texts: list[str], labels: list[list[int]], label_space: list[str], - language: Language, + dataset_id: str, ): """Initialize classification dataset with validation. @@ -52,12 +51,12 @@ def __init__( labels: List with list of class indices corresponding to each text. Contains just 1 item per list for single-label classification. label_space: List of class names/labels (e.g., ["skill1", "skill2", "skill3"]) - language: Language enum + dataset_id: Unique identifier for this dataset """ self.texts = self._postprocess_texts(texts) self.labels = self._postprocess_labels(labels) self.label_space = self._postprocess_texts(label_space) - self.language = language + self.dataset_id = dataset_id self.validate_dataset() def validate_dataset( @@ -141,11 +140,10 @@ def get_labels_as_indicator_matrix(self) -> list[list[int]]: class ClassificationTask(Task): - """ - Abstract base class for classification tasks. + """Abstract base class for classification tasks. Supports both binary and multi-class classification. - Tasks should implement load_monolingual_data() to return ClassificationDataset. + Tasks should implement load_dataset() to return ClassificationDataset. """ @property @@ -177,8 +175,16 @@ def threshold(self) -> float | None: """Threshold to use for classification.""" @abstractmethod - def get_output_space_size(self, language: Language) -> int: - """Number of output classes for this classification task.""" + def get_output_space_size(self, dataset_id: str) -> int: + """Number of output classes for this classification task. + + Args: + dataset_id: Dataset identifier + + Returns + ------- + Number of classes in the output space + """ @property @abstractmethod @@ -186,24 +192,43 @@ def input_type(self) -> ModelInputType: """Input type for texts in the classification task.""" @abstractmethod - def load_monolingual_data( - self, split: DatasetSplit, language: Language - ) -> ClassificationDataset: - """Load dataset for a specific language.""" - - def get_size_oneliner(self, language: Language) -> str: - """Get dataset summary to display for progress.""" - dataset: ClassificationDataset = self.lang_datasets[language] + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> ClassificationDataset: + """Load dataset for specific ID and split. + + For tasks that are a union of monolingual datasets: dataset_id equals + language code. + + For other tasks: dataset_id can encode arbitrary information. + + Args: + dataset_id: Unique identifier for the dataset + split: Dataset split to load + + Returns + ------- + ClassificationDataset object + """ + + def get_size_oneliner(self, dataset_id: str) -> str: + """Get dataset summary to display for progress. 
+ + Args: + dataset_id: Dataset identifier + + Returns + ------- + Human-readable size string + """ + dataset: ClassificationDataset = self.datasets[dataset_id] return f"{len(dataset.texts)} samples, {len(dataset.label_space)} classes" def evaluate( self, model: ModelInterface, metrics: list[str] | None = None, - language: Language = Language.EN, + dataset_id: str = "en", ) -> dict[str, float]: - """ - Evaluate the model with threshold optimization. + """Evaluate the model with threshold optimization. For binary classification, this method: 1. Optimizes threshold on validation data @@ -213,7 +238,7 @@ def evaluate( Args: model: Model implementing classification interface metrics: List of metrics to compute - language: Language code for evaluation + dataset_id: Dataset identifier to evaluate on Returns ------- @@ -223,22 +248,22 @@ def evaluate( metrics = self.default_metrics # Get evaluation dataset - eval_dataset: ClassificationDataset = self.lang_datasets[language] + eval_dataset: ClassificationDataset = self.datasets[dataset_id] # Validate model output if it has a fixed classification label space model_label_space = model.classification_label_space if model_label_space is not None: # Model has fixed label space (e.g., classification head) - if len(model_label_space) != self.get_output_space_size(language): + if len(model_label_space) != self.get_output_space_size(dataset_id): raise ValueError( f"Model output size mismatch: model has {len(model_label_space)} outputs, " - f"but task requires {self.get_output_space_size(language)} outputs." + f"but task requires {self.get_output_space_size(dataset_id)} outputs." ) # Validate label order matches (critical for correct evaluation) self._validate_model_label_space(model_label_space, eval_dataset) best_threshold = ( - self.get_threshold_on_val_data(model, language) + self.get_threshold_on_val_data(model, dataset_id) if self.best_threshold_on_val_data else self.threshold ) @@ -307,12 +332,21 @@ def _validate_model_label_space( "The model must use the exact same label ordering as the task." ) - def get_threshold_on_val_data(self, model: ModelInterface, language: Language) -> float: - """Get the best threshold on validation data.""" + def get_threshold_on_val_data(self, model: ModelInterface, dataset_id: str) -> float: + """Get the best threshold on validation data. 
+ + Args: + model: Model to evaluate + dataset_id: Dataset identifier + + Returns + ------- + Optimized threshold value + """ # Step 1: Optimize threshold on validation data # Load validation data (even if we're evaluating on test) - logger.info(f"Optimizing threshold on validation data for {language}...") - val_dataset = self.load_monolingual_data(DatasetSplit.VAL, language) + logger.info(f"Optimizing threshold on validation data for {dataset_id}...") + val_dataset = self.load_dataset(dataset_id, DatasetSplit.VAL) val_predictions = model.compute_classification( texts=val_dataset.texts, targets=val_dataset.label_space, diff --git a/src/workrb/tasks/abstract/ranking_base.py b/src/workrb/tasks/abstract/ranking_base.py index e980168..20265d0 100644 --- a/src/workrb/tasks/abstract/ranking_base.py +++ b/src/workrb/tasks/abstract/ranking_base.py @@ -10,7 +10,7 @@ import torch from workrb.metrics.ranking import calculate_ranking_metrics -from workrb.tasks.abstract.base import BaseTaskGroup, DatasetSplit, Language, Task, TaskType +from workrb.tasks.abstract.base import BaseTaskGroup, DatasetSplit, Task, TaskType from workrb.types import ModelInputType if TYPE_CHECKING: @@ -30,26 +30,27 @@ class RankingTaskGroup(BaseTaskGroup, str, Enum): class RankingDataset: - """Structure for monolingualranking datasets.""" + """Structure for ranking datasets.""" def __init__( self, query_texts: list[str], target_indices: list[list[int]], target_space: list[str], - language: Language, + dataset_id: str, ): """Initialize ranking dataset with validation. Args: - query: List of query strings - target_label: List of lists containing indices into the target vocabulary - target: List of target vocabulary strings + query_texts: List of query strings + target_indices: List of lists containing indices into the target vocabulary + target_space: List of target vocabulary strings + dataset_id: Unique identifier for this dataset """ self.query_texts = self._postprocess_texts(query_texts) self.target_indices = self._postprocess_indices(target_indices) self.target_space = self._postprocess_texts(target_space) - self.language = language + self.dataset_id = dataset_id self.validate_dataset() def validate_dataset( @@ -135,26 +136,48 @@ def target_input_type(self) -> ModelInputType: """Input type for target texts in the ranking task.""" @abstractmethod - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """Load dataset for a specific language.""" + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load dataset for specific ID and split. - def get_size_oneliner(self, language: Language) -> str: - """Get dataset summary to display for progress.""" - return f"{len(self.lang_datasets[language].query_texts)} queries x {len(self.lang_datasets[language].target_space)} targets" + For tasks that are a union of monolingual datasets: dataset_id equals + language code. + + For other tasks: dataset_id can encode arbitrary information. + + Args: + dataset_id: Unique identifier for the dataset + split: Dataset split to load + + Returns + ------- + RankingDataset object + """ + + def get_size_oneliner(self, dataset_id: str) -> str: + """Get dataset summary to display for progress. 
+ + Args: + dataset_id: Dataset identifier + + Returns + ------- + Human-readable size string + """ + dataset = self.datasets[dataset_id] + return f"{len(dataset.query_texts)} queries x {len(dataset.target_space)} targets" def evaluate( self, model: ModelInterface, metrics: list[str] | None = None, - language: Language = Language.EN, + dataset_id: str = "en", ) -> dict[str, float]: - """ - Evaluate the model on this ranking task. + """Evaluate the model on this ranking task. Args: model: Model implementing ModelInterface (must have compute_rankings method) metrics: List of metrics to compute. If None, uses default_metrics - language: Language code for evaluation + dataset_id: Dataset identifier to evaluate on Returns ------- @@ -163,8 +186,8 @@ def evaluate( if metrics is None: metrics = self.default_metrics - # Use new dataset if available - dataset = self.lang_datasets[language] + # Retrieve dataset by ID + dataset = self.datasets[dataset_id] queries = dataset.query_texts targets = dataset.target_space labels = dataset.target_indices @@ -181,6 +204,7 @@ def evaluate( if isinstance(prediction_matrix, torch.Tensor): prediction_matrix = prediction_matrix.cpu().numpy() + # Calculate metrics metric_results = calculate_ranking_metrics( prediction_matrix=prediction_matrix, pos_label_idxs=labels, metrics=metrics ) diff --git a/src/workrb/tasks/classification/job2skill.py b/src/workrb/tasks/classification/job2skill.py index 7043907..431c0f5 100644 --- a/src/workrb/tasks/classification/job2skill.py +++ b/src/workrb/tasks/classification/job2skill.py @@ -68,32 +68,39 @@ def input_type(self) -> ModelInputType: """Input is job titles.""" return ModelInputType.JOB_TITLE - def get_output_space_size(self, language: Language) -> int: - """Number of output classes (skills) for this classification task.""" - ds: ClassificationDataset = self.lang_datasets[language] - return len(ds.label_space) + def get_output_space_size(self, dataset_id: str) -> int: + """Number of output classes (skills) for this classification task. + + Args: + dataset_id: Dataset identifier - def load_monolingual_data( - self, split: DatasetSplit, language: Language - ) -> ClassificationDataset: + Returns + ------- + Number of classes in the output space """ - Load job-skill classification data for specified language and split. + ds: ClassificationDataset = self.datasets[dataset_id] + return len(ds.label_space) + + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> ClassificationDataset: + """Load job-skill classification data for specified dataset and split. Args: + dataset_id: Dataset identifier (language code for this task) split: Data split (VAL or TEST) - language: Language code Returns ------- ClassificationDataset with job titles and multi-label skill assignments """ + language = Language(dataset_id) + if split == DatasetSplit.VAL: - return self._load_val(language) + return self._load_val(language=language, dataset_id=dataset_id) if split == DatasetSplit.TEST: - return self._load_test(language) + return self._load_test(language=language, dataset_id=dataset_id) raise ValueError(f"Split '{split}' not supported. 
Use VAL or TEST") - def _load_test(self, language: Language) -> ClassificationDataset: + def _load_test(self, language: Language, dataset_id: str) -> ClassificationDataset: """Load test data from ESCO occupation-skill relations.""" target_esco = ESCO(version=self.esco_version, language=language) skill_vocab = target_esco.get_skills_vocabulary() @@ -122,12 +129,11 @@ def _load_test(self, language: Language) -> ClassificationDataset: texts=texts, labels=labels, label_space=skill_vocab, - language=language, + dataset_id=dataset_id, ) - def _load_val(self, language: Language) -> ClassificationDataset: - """ - Load validation set based on vacancies with job titles. + def _load_val(self, language: Language, dataset_id: str) -> ClassificationDataset: + """Load validation set based on vacancies with job titles. Static validation set only available in English. """ @@ -187,5 +193,5 @@ def _load_val(self, language: Language) -> ClassificationDataset: texts=texts, labels=labels, label_space=skill_vocab, - language=language, + dataset_id=dataset_id, ) diff --git a/src/workrb/tasks/ranking/job2skill.py b/src/workrb/tasks/ranking/job2skill.py index 2632c96..f51f5bb 100644 --- a/src/workrb/tasks/ranking/job2skill.py +++ b/src/workrb/tasks/ranking/job2skill.py @@ -61,22 +61,31 @@ def target_input_type(self) -> ModelInputType: """Target input type for skills.""" return ModelInputType.SKILL_NAME - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """ - Load job-to-skills data for a specific split and language. + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load job-to-skills data for a specific split and dataset. Static validation set only available in English. Test set is generated from ESCO relations for the selected version and language. + + Args: + dataset_id: Dataset identifier (language code for this task) + split: Dataset split to load + + Returns + ------- + RankingDataset object """ + language = Language(dataset_id) + if split == DatasetSplit.TEST: - return self._load_test(language=language) + return self._load_test(language=language, dataset_id=dataset_id) if split == DatasetSplit.VAL: - return self._load_val(language=language) + return self._load_val(language=language, dataset_id=dataset_id) raise ValueError(f"Invalid split: {split}") - def _load_test(self, language: Language) -> RankingDataset: + def _load_test(self, language: Language, dataset_id: str) -> RankingDataset: """Load test data for a specific version and language.""" target_esco = ESCO(version=self.esco_version, language=language) skill_vocab = target_esco.get_skills_vocabulary() @@ -105,12 +114,11 @@ def _load_test(self, language: Language) -> RankingDataset: query_texts=query_texts, target_indices=target_indices, target_space=skill_vocab, - language=language, + dataset_id=dataset_id, ) - def _load_val(self, language: Language) -> RankingDataset: - """ - Validation set based on vacancies with job titles, where description is used to extract ESCO skills. + def _load_val(self, language: Language, dataset_id: str) -> RankingDataset: + """Validation set based on vacancies with job titles, where description is used to extract ESCO skills. Static validation set only available in English. 
""" @@ -162,7 +170,7 @@ def _load_val(self, language: Language) -> RankingDataset: query_texts=query_texts, target_indices=target_indices, target_space=skill_vocab, - language=language, + dataset_id=dataset_id, ) @property diff --git a/src/workrb/tasks/ranking/job_similarity.py b/src/workrb/tasks/ranking/job_similarity.py index 4edc043..d18a534 100644 --- a/src/workrb/tasks/ranking/job_similarity.py +++ b/src/workrb/tasks/ranking/job_similarity.py @@ -104,8 +104,19 @@ def target_input_type(self) -> ModelInputType: """Target input type for job titles.""" return ModelInputType.JOB_TITLE - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """Load Job Title Similarity data from the HuggingFace dataset.""" + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load Job Title Similarity data from the HuggingFace dataset. + + Args: + dataset_id: Dataset identifier (language code for this task) + split: Dataset split to load + + Returns + ------- + RankingDataset object + """ + language = Language(dataset_id) + if split != DatasetSplit.TEST: raise ValueError(f"Split '{split}' not supported. Use TEST") @@ -118,7 +129,7 @@ def load_monolingual_data(self, split: DatasetSplit, language: Language) -> Rank relevancy_labels = list(ds["queries"]["labels"]) corpus = list(ds["corpus"]["text"]) - return RankingDataset(queries, relevancy_labels, corpus, language=language) + return RankingDataset(queries, relevancy_labels, corpus, dataset_id=dataset_id) @property def citation(self) -> str: diff --git a/src/workrb/tasks/ranking/jobnorm.py b/src/workrb/tasks/ranking/jobnorm.py index 95151df..23a4bd3 100644 --- a/src/workrb/tasks/ranking/jobnorm.py +++ b/src/workrb/tasks/ranking/jobnorm.py @@ -61,8 +61,18 @@ def target_input_type(self) -> ModelInputType: """Target input type for ESCO occupations.""" return ModelInputType.JOB_TITLE - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """Load job normalization data.""" + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load job normalization data for a specific split and dataset. + + Args: + dataset_id: Dataset identifier (language code for this task) + split: Dataset split to load + + Returns + ------- + RankingDataset object + """ + language = Language(dataset_id) # Login using e.g. `huggingface-cli login` to access this dataset ds = load_dataset("TechWolf/JobBERT-evaluation-dataset") assert isinstance(ds, DatasetDict) @@ -115,7 +125,7 @@ def load_monolingual_data(self, split: DatasetSplit, language: Language) -> Rank query_texts=query_texts, target_indices=label_indices, target_space=job_vocab, - language=language, + dataset_id=dataset_id, ) @property diff --git a/src/workrb/tasks/ranking/skill2job.py b/src/workrb/tasks/ranking/skill2job.py index 13071d7..d96093b 100644 --- a/src/workrb/tasks/ranking/skill2job.py +++ b/src/workrb/tasks/ranking/skill2job.py @@ -60,22 +60,31 @@ def target_input_type(self) -> ModelInputType: """Target input type for jobs.""" return ModelInputType.JOB_TITLE - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """ - Load skill-to-job data for a specific split and language. + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load skill-to-job data for a specific split and dataset. Validation set is static and only available in English. 
Test set is generated from ESCO relations for the selected version and language. + + Args: + dataset_id: Dataset identifier (language code for this task) + split: Dataset split to load + + Returns + ------- + RankingDataset object """ + language = Language(dataset_id) + if split == DatasetSplit.TEST: - return self._load_test(language=language) + return self._load_test(language=language, dataset_id=dataset_id) if split == DatasetSplit.VAL: - return self._load_val(language=language) + return self._load_val(language=language, dataset_id=dataset_id) raise ValueError(f"Invalid split: {split}") - def _load_test(self, language: Language) -> RankingDataset: + def _load_test(self, language: Language, dataset_id: str) -> RankingDataset: """Load test data for a specific version and language.""" target_esco = ESCO(version=self.esco_version, language=language) @@ -109,10 +118,10 @@ def _load_test(self, language: Language) -> RankingDataset: query_texts=query_texts, target_indices=target_indices, target_space=job_vocab, - language=language, + dataset_id=dataset_id, ) - def _load_val(self, language: Language) -> RankingDataset: + def _load_val(self, language: Language, dataset_id: str) -> RankingDataset: """ Use vacancies with job titles where descriptions yield ESCO skills. @@ -178,7 +187,7 @@ def _load_val(self, language: Language) -> RankingDataset: query_texts=query_texts, target_indices=target_indices, target_space=job_vocab, - language=language, + dataset_id=dataset_id, ) @property diff --git a/src/workrb/tasks/ranking/skill_extraction.py b/src/workrb/tasks/ranking/skill_extraction.py index 543219a..9950e17 100644 --- a/src/workrb/tasks/ranking/skill_extraction.py +++ b/src/workrb/tasks/ranking/skill_extraction.py @@ -68,8 +68,18 @@ def target_input_type(self) -> ModelInputType: """Target input type for ESCO skills.""" return ModelInputType.SKILL_NAME - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """Load skill extraction house data.""" + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load skill extraction data for a specific split and dataset. + + Args: + dataset_id: Dataset identifier (language code for this task) + split: Dataset split to load + + Returns + ------- + RankingDataset object + """ + language = Language(dataset_id) # Load data split_names = {DatasetSplit.TEST: "test", DatasetSplit.VAL: "validation"} dataset = load_dataset(self.hf_name, split=split_names[split]) @@ -119,7 +129,7 @@ def load_monolingual_data(self, split: DatasetSplit, language: Language) -> Rank query_texts=filtered_queries, target_indices=filtered_labels, target_space=skill_vocab, - language=language, + dataset_id=dataset_id, ) diff --git a/src/workrb/tasks/ranking/skill_similarity.py b/src/workrb/tasks/ranking/skill_similarity.py index fdeeaf6..f321969 100644 --- a/src/workrb/tasks/ranking/skill_similarity.py +++ b/src/workrb/tasks/ranking/skill_similarity.py @@ -63,13 +63,22 @@ def target_input_type(self) -> ModelInputType: """Target input type for skills.""" return ModelInputType.SKILL_NAME - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """ - Load skill similarity data from SkillMatch dataset. + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load skill similarity data from SkillMatch dataset. Uses only the 1k related pairs from the SkillMatch dataset, but uses all skills from the SkillMatch dataset for the vocabulary. 
+ + Args: + dataset_id: Dataset identifier (language code for this task) + split: Dataset split to load + + Returns + ------- + RankingDataset object """ + language = Language(dataset_id) + if language != Language.EN: raise ValueError("The validation set of this task is only available in English.") @@ -105,7 +114,7 @@ def load_monolingual_data(self, split: DatasetSplit, language: Language) -> Rank selected_queries = query_test selected_labels = label_test - return RankingDataset(selected_queries, selected_labels, skill_vocab, language=language) + return RankingDataset(selected_queries, selected_labels, skill_vocab, dataset_id=dataset_id) @property def citation(self) -> str: diff --git a/src/workrb/tasks/ranking/skillnorm.py b/src/workrb/tasks/ranking/skillnorm.py index 01bb1e3..41bc2b3 100644 --- a/src/workrb/tasks/ranking/skillnorm.py +++ b/src/workrb/tasks/ranking/skillnorm.py @@ -101,9 +101,19 @@ def target_input_type(self) -> ModelInputType: """Target input type for canonical skill names.""" return ModelInputType.SKILL_NAME - def load_monolingual_data(self, split: DatasetSplit, language: Language) -> RankingDataset: - """Load skill normalization data from ESCO.""" - target_esco = ESCO(version=self.esco_version, language=Language(language)) + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load skill normalization data from ESCO. + + Args: + dataset_id: Dataset identifier (language code for this task) + split: Dataset split to load + + Returns + ------- + RankingDataset object + """ + language = Language(dataset_id) + target_esco = ESCO(version=self.esco_version, language=language) # Full vocab, even those without alternatives skill_vocab = target_esco.get_skills_vocabulary() @@ -121,7 +131,7 @@ def load_monolingual_data(self, split: DatasetSplit, language: Language) -> Rank alt2skills, skill2label, split ) - return RankingDataset(selected_queries, selected_labels, skill_vocab, language=language) + return RankingDataset(selected_queries, selected_labels, skill_vocab, dataset_id=dataset_id) def _rnd_split( self, alt2skills: dict[str, list[str]], skill2label: dict[str, int], split: DatasetSplit diff --git a/tests/test_contextmatch_model.py b/tests/test_contextmatch_model.py index 34f9696..6f92dc5 100644 --- a/tests/test_contextmatch_model.py +++ b/tests/test_contextmatch_model.py @@ -130,7 +130,7 @@ def test_tech_skill_extraction_benchmark_metrics(self): # Evaluate model on the task with the metrics from the paper metrics = ["mrr", "rp@1", "rp@5", "rp@10"] - results = task.evaluate(model=model, metrics=metrics, language=Language.EN) + results = task.evaluate(model=model, metrics=metrics, dataset_id=Language.EN.value) # Paper-reported values (RP metrics are percentages, convert to decimals) expected_mrr = 0.632 diff --git a/tests/test_curriculum_encoder_model.py b/tests/test_curriculum_encoder_model.py index 04e50c6..0585217 100644 --- a/tests/test_curriculum_encoder_model.py +++ b/tests/test_curriculum_encoder_model.py @@ -75,7 +75,7 @@ def test_skill_extraction_benchmark_metrics(self): # Evaluate model on the task with the metrics from the paper metrics = ["mrr", "rp@1", "rp@5", "rp@10"] - results = task.evaluate(model=model, metrics=metrics, language=Language.EN) + results = task.evaluate(model=model, metrics=metrics, dataset_id=Language.EN.value) # Paper-reported values (RP metrics are percentages, convert to decimals) [TECH] # expected_mrr = 0.5726 diff --git a/tests/test_model_task_compatibility.py b/tests/test_model_task_compatibility.py index 
59f6ca5..f9053d2 100644 --- a/tests/test_model_task_compatibility.py +++ b/tests/test_model_task_compatibility.py @@ -41,7 +41,7 @@ def test_classification_task_with_biencoder_works(self): model = BiEncoderModel("all-MiniLM-L6-v2") # Should work - BiEncoder computes similarity between texts and label space - results = task.evaluate(model, language=Language.EN) + results = task.evaluate(model, dataset_id=Language.EN.value) # Validate results assert "f1_macro" in results @@ -56,7 +56,7 @@ def test_classification_task_output_shape(self): model = BiEncoderModel("all-MiniLM-L6-v2") # Get dataset - dataset: ClassificationDataset = task.lang_datasets[Language.EN] + dataset: ClassificationDataset = task.datasets[Language.EN.value] # Compute predictions predictions = model.compute_classification( @@ -80,7 +80,7 @@ def test_classification_task_with_classification_model_works(self): task = ToyJobSkill(split="val", languages=["en"]) # Get the label space from the task - dataset = task.lang_datasets[Language.EN] + dataset = task.datasets[Language.EN.value] label_space = dataset.label_space # Create classification model with matching label space @@ -90,7 +90,7 @@ def test_classification_task_with_classification_model_works(self): ) # Should work - model has classification head with matching label space - results = task.evaluate(model, language=Language.EN) + results = task.evaluate(model, dataset_id=Language.EN.value) # Validate results assert "f1_macro" in results @@ -112,7 +112,7 @@ def test_classification_task_label_space_size_mismatch_fails(self): # Should fail with clear error about size mismatch with pytest.raises(ValueError, match="Model output size mismatch"): - task.evaluate(model, language=Language.EN) + task.evaluate(model, dataset_id=Language.EN.value) def test_classification_task_label_space_order_mismatch_fails(self): """Classification model with wrong label order should fail.""" @@ -120,7 +120,7 @@ def test_classification_task_label_space_order_mismatch_fails(self): task = ToyJobSkill(split="val", languages=["en"]) # Get the label space and shuffle it - dataset = task.lang_datasets[Language.EN] + dataset = task.datasets[Language.EN.value] wrong_order_labels = list(reversed(dataset.label_space)) model = RndESCOClassificationModel( @@ -130,7 +130,7 @@ def test_classification_task_label_space_order_mismatch_fails(self): # Should fail with clear error about order mismatch with pytest.raises(ValueError, match="label order doesn't match"): - task.evaluate(model, language=Language.EN) + task.evaluate(model, dataset_id=Language.EN.value) class TestRankingTaskWithBiEncoder: @@ -146,7 +146,7 @@ def test_ranking_task_with_biencoder_works(self): model = BiEncoderModel("all-MiniLM-L6-v2") # Should work - standard ranking behavior - results = task.evaluate(model, language=Language.EN) + results = task.evaluate(model, dataset_id=Language.EN.value) # Validate results assert "map" in results @@ -161,7 +161,7 @@ def test_ranking_task_output_shape(self): model = BiEncoderModel("all-MiniLM-L6-v2") # Get dataset - dataset = task.lang_datasets[Language.EN] + dataset = task.datasets[Language.EN.value] # Compute predictions predictions = model.compute_rankings( @@ -186,7 +186,7 @@ def test_ranking_task_with_classification_model_matching_label_space_works(self) task = ToySkillSim(split="val", languages=["en"]) # Get the target space from the task - dataset = task.lang_datasets[Language.EN] + dataset = task.datasets[Language.EN.value] target_space = dataset.target_space # Create classification model with matching 
label space @@ -196,7 +196,7 @@ def test_ranking_task_with_classification_model_matching_label_space_works(self) ) # Should work - model's label space matches ranking target space - results = task.evaluate(model, language=Language.EN) + results = task.evaluate(model, dataset_id=Language.EN.value) # Validate results assert "map" in results @@ -218,7 +218,7 @@ def test_ranking_task_with_classification_model_size_mismatch_fails(self): # Should fail with clear error about size mismatch with pytest.raises(ValueError, match="target space size mismatch"): - task.evaluate(model, language=Language.EN) + task.evaluate(model, dataset_id=Language.EN.value) def test_ranking_task_with_classification_model_label_mismatch_fails(self): """Classification model with wrong labels should fail.""" @@ -226,7 +226,7 @@ def test_ranking_task_with_classification_model_label_mismatch_fails(self): task = ToySkillSim(split="val", languages=["en"]) # Get target space and create different labels with same size - dataset = task.lang_datasets[Language.EN] + dataset = task.datasets[Language.EN.value] wrong_labels = [f"WrongLabel_{i}" for i in range(len(dataset.target_space))] model = RndESCOClassificationModel( @@ -236,7 +236,7 @@ def test_ranking_task_with_classification_model_label_mismatch_fails(self): # Should fail with clear error about label mismatch with pytest.raises(ValueError, match="target labels don't match"): - task.evaluate(model, language=Language.EN) + task.evaluate(model, dataset_id=Language.EN.value) def test_ranking_task_with_classification_model_order_mismatch_fails(self): """Classification model with wrong label order should fail.""" @@ -244,7 +244,7 @@ def test_ranking_task_with_classification_model_order_mismatch_fails(self): task = ToySkillSim(split="val", languages=["en"]) # Get target space and reverse order - dataset = task.lang_datasets[Language.EN] + dataset = task.datasets[Language.EN.value] wrong_order_labels = list(reversed(dataset.target_space)) model = RndESCOClassificationModel( @@ -254,7 +254,7 @@ def test_ranking_task_with_classification_model_order_mismatch_fails(self): # Should fail with clear error about order mismatch with pytest.raises(ValueError, match="target label order doesn't match"): - task.evaluate(model, language=Language.EN) + task.evaluate(model, dataset_id=Language.EN.value) class TestModelTaskCompatibilitySummary: @@ -273,8 +273,8 @@ def test_all_model_task_combinations(self): biencoder_model = BiEncoderModel("all-MiniLM-L6-v2") # Get label spaces - class_dataset = classification_task.lang_datasets[Language.EN] - rank_dataset = ranking_task.lang_datasets[Language.EN] + class_dataset = classification_task.datasets[Language.EN.value] + rank_dataset = ranking_task.datasets[Language.EN.value] classification_model_for_class = RndESCOClassificationModel( base_model_name="all-MiniLM-L6-v2", @@ -290,23 +290,25 @@ def test_all_model_task_combinations(self): # 1. Classification Task + BiEncoder (NEW) results["class_biencoder"] = classification_task.evaluate( - biencoder_model, language=Language.EN + biencoder_model, dataset_id=Language.EN.value ) assert "f1_macro" in results["class_biencoder"] # 2. Classification Task + Classification Model (EXISTING) results["class_classification"] = classification_task.evaluate( - classification_model_for_class, language=Language.EN + classification_model_for_class, dataset_id=Language.EN.value ) assert "f1_macro" in results["class_classification"] # 3. 
Ranking Task + BiEncoder (EXISTING) - results["rank_biencoder"] = ranking_task.evaluate(biencoder_model, language=Language.EN) + results["rank_biencoder"] = ranking_task.evaluate( + biencoder_model, dataset_id=Language.EN.value + ) assert "map" in results["rank_biencoder"] # 4. Ranking Task + Classification Model (CONDITIONAL) results["rank_classification"] = ranking_task.evaluate( - classification_model_for_rank, language=Language.EN + classification_model_for_rank, dataset_id=Language.EN.value ) assert "map" in results["rank_classification"] diff --git a/tests/test_multi_dataset_task.py b/tests/test_multi_dataset_task.py new file mode 100644 index 0000000..0303246 --- /dev/null +++ b/tests/test_multi_dataset_task.py @@ -0,0 +1,177 @@ +""" +Test multi-dataset tasks that return multiple dataset IDs per language. + +This test suite validates that tasks can override languages_to_dataset_ids() +to return multiple dataset identifiers for each language, supporting use cases +like MELO benchmark where datasets encode additional metadata beyond language. +""" + +import pytest + +from workrb.models import BiEncoderModel +from workrb.tasks import ESCOJob2SkillRanking, RankingDataset +from workrb.types import Language + + +class TestMultiDatasetTask: + """Test tasks that return multiple dataset IDs per language.""" + + def test_languages_to_dataset_ids_multiple_per_language(self): + """Test task that returns multiple dataset IDs per language.""" + + # Create a custom task class that overrides languages_to_dataset_ids + class MultiDatasetTask(ESCOJob2SkillRanking): + def languages_to_dataset_ids(self, languages: list[Language]) -> list[str]: + """Map languages to multiple dataset IDs with custom logic.""" + dataset_ids = [] + lang_set = set(languages) + + # English -> 4 datasets + if Language.EN in lang_set: + dataset_ids.extend(["en1", "en2", "en3_sea", "en3_land"]) + + # French -> 2 datasets + if Language.FR in lang_set: + dataset_ids.extend(["fr1", "fr2"]) + + # German -> 1 dataset + if Language.DE in lang_set: + dataset_ids.append("de") + + # Spanish -> 3 datasets + if Language.ES in lang_set: + dataset_ids.extend(["es1", "es2", "es3_air"]) + + # Cross-language datasets when both French and German are present + if Language.FR in lang_set and Language.DE in lang_set: + dataset_ids.extend(["fr_de_land", "fr_de_sea"]) + + return dataset_ids + + def load_dataset(self, dataset_id: str, split): + """Mock load_dataset to avoid loading real data.""" + # For testing, we just need to verify the dataset_ids are correct + # Return a minimal mock dataset structure + return RankingDataset( + query_texts=["mock query"], + target_indices=[[0]], + target_space=["mock target"], + dataset_id=dataset_id, + ) + + # Test 1: English only + task_en = MultiDatasetTask(split="val", languages=["en"]) + assert task_en.dataset_ids == ["en1", "en2", "en3_sea", "en3_land"] + assert len(task_en.datasets) == 4 + assert all(dataset_id in task_en.datasets for dataset_id in task_en.dataset_ids) + + # Test 2: French only + task_fr = MultiDatasetTask(split="val", languages=["fr"]) + assert task_fr.dataset_ids == ["fr1", "fr2"] + assert len(task_fr.datasets) == 2 + + # Test 3: German only + task_de = MultiDatasetTask(split="val", languages=["de"]) + assert task_de.dataset_ids == ["de"] + assert len(task_de.datasets) == 1 + + # Test 4: Spanish only + task_es = MultiDatasetTask(split="val", languages=["es"]) + assert task_es.dataset_ids == ["es1", "es2", "es3_air"] + assert len(task_es.datasets) == 3 + + # Test 5: French + German 
(includes cross-language datasets)
+        task_fr_de = MultiDatasetTask(split="val", languages=["fr", "de"])
+        assert set(task_fr_de.dataset_ids) == {
+            "fr1",
+            "fr2",
+            "de",
+            "fr_de_land",
+            "fr_de_sea",
+        }
+        assert len(task_fr_de.datasets) == 5
+
+        # Test 6: Multiple languages
+        task_multi = MultiDatasetTask(split="val", languages=["en", "fr", "es"])
+        expected = ["en1", "en2", "en3_sea", "en3_land", "fr1", "fr2", "es1", "es2", "es3_air"]
+        assert task_multi.dataset_ids == expected
+        assert len(task_multi.datasets) == 9
+
+    def test_multi_dataset_task_with_biencoder(self):
+        """Test that multi-dataset tasks work with actual model evaluation."""
+
+        class ToyMultiDatasetTask(ESCOJob2SkillRanking):
+            def languages_to_dataset_ids(self, languages: list[Language]) -> list[str]:
+                """Return multiple dataset IDs per language."""
+                dataset_ids = []
+                if Language.EN in languages:
+                    dataset_ids.extend(["en1", "en2"])
+                return dataset_ids
+
+            def load_dataset(self, dataset_id: str, split):
+                """Load minimal toy dataset."""
+                from workrb.tasks.abstract.ranking_base import RankingDataset
+
+                # Create tiny datasets for testing
+                return RankingDataset(
+                    query_texts=["Software Engineer", "Data Scientist"],
+                    target_indices=[[0, 1], [1, 2]],
+                    target_space=["Python", "Machine Learning", "SQL"],
+                    dataset_id=dataset_id,
+                )
+
+        # Create task with multiple datasets
+        task = ToyMultiDatasetTask(split="val", languages=["en"])
+        assert task.dataset_ids == ["en1", "en2"]
+
+        # Verify we can evaluate on each dataset
+        model = BiEncoderModel("all-MiniLM-L6-v2")
+
+        # Evaluate on first dataset
+        results_en1 = task.evaluate(model, dataset_id="en1")
+        assert "map" in results_en1
+        assert 0 <= results_en1["map"] <= 1
+
+        # Evaluate on second dataset
+        results_en2 = task.evaluate(model, dataset_id="en2")
+        assert "map" in results_en2
+        assert 0 <= results_en2["map"] <= 1
+
+    def test_multi_dataset_task_evaluation_all_datasets(self):
+        """Test that evaluation pipeline processes all datasets."""
+
+        class ToyMultiDatasetTask(ESCOJob2SkillRanking):
+            def languages_to_dataset_ids(self, languages: list[Language]) -> list[str]:
+                """Return 2 datasets for English."""
+                dataset_ids = []
+                if Language.EN in languages:
+                    dataset_ids.extend(["en_region_a", "en_region_b"])
+                return dataset_ids
+
+            def load_dataset(self, dataset_id: str, split):
+                """Load minimal toy dataset."""
+                from workrb.tasks.abstract.ranking_base import RankingDataset
+
+                return RankingDataset(
+                    query_texts=["test query"],
+                    target_indices=[[0]],
+                    target_space=["test target"],
+                    dataset_id=dataset_id,
+                )
+
+        task = ToyMultiDatasetTask(split="val", languages=["en"])
+
+        # Verify dataset_ids are correct
+        assert task.dataset_ids == ["en_region_a", "en_region_b"]
+
+        # Verify both datasets are loaded
+        assert "en_region_a" in task.datasets
+        assert "en_region_b" in task.datasets
+
+        # Verify dataset objects have correct dataset_id
+        assert task.datasets["en_region_a"].dataset_id == "en_region_a"
+        assert task.datasets["en_region_b"].dataset_id == "en_region_b"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/test_task_registry.py b/tests/test_task_registry.py
index 0121ddb..b91e30b 100644
--- a/tests/test_task_registry.py
+++ b/tests/test_task_registry.py
@@ -53,10 +53,10 @@ def supported_target_languages(self):
     def default_metrics(self):
         return ["accuracy"]
 
-    def load_monolingual_data(self, language, split):
-        return {"test": "data", "language": str(language), "split": str(split)}
+    def load_dataset(self, dataset_id,
split): + return {"test": "data", "dataset_id": dataset_id, "split": str(split)} - def evaluate(self, model, metrics=None, language="en"): + def evaluate(self, model, metrics=None, dataset_id="en"): return {"accuracy": 0.95, "test_metric": 1.0} diff --git a/tests/test_utils.py b/tests/test_utils.py index b8b4a85..7dcead9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -42,10 +42,10 @@ def label_type(self) -> LabelType: def default_metrics(self) -> list[str]: return ["map"] - def load_monolingual_data(self, language: Language, split: DatasetSplit) -> Any: + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> Any: return {} - def evaluate(self, model, metrics=None, language: Language = Language.EN) -> dict[str, float]: + def evaluate(self, model, metrics=None, dataset_id: str = "en") -> dict[str, float]: return {} @@ -122,7 +122,7 @@ def _limit_dataset(self, dataset: RankingDataset) -> RankingDataset: query_texts=filtered_queries, target_indices=remapped_indices, target_space=limited_target_space, - language=dataset.language, + dataset_id=dataset.dataset_id, ) @@ -161,7 +161,7 @@ def _limit_classification_dataset( texts=limited_texts, labels=limited_labels, label_space=dataset.label_space, # Keep full label space - language=dataset.language, + dataset_id=dataset.dataset_id, ) @@ -184,10 +184,8 @@ def create_toy_task_class( class ToyRankingTask(ToyTaskMixin, base_task_class): """Dynamically created toy ranking task.""" - def load_monolingual_data( - self, split: DatasetSplit, language: Language - ) -> RankingDataset: - full_dataset = super().load_monolingual_data(split=split, language=language) + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + full_dataset = super().load_dataset(dataset_id=dataset_id, split=split) return self._limit_dataset(full_dataset) return_cls = ToyRankingTask @@ -197,10 +195,8 @@ def load_monolingual_data( class ToyClassificationTask(ToyClassificationTaskMixin, base_task_class): """Dynamically created toy classification task.""" - def load_monolingual_data( - self, split: DatasetSplit, language: Language - ) -> ClassificationDataset: - full_dataset = super().load_monolingual_data(split=split, language=language) + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> ClassificationDataset: + full_dataset = super().load_dataset(dataset_id=dataset_id, split=split) return self._limit_classification_dataset(full_dataset) return_cls = ToyClassificationTask From 17b189753fbc1810af45c33c4cb0b73e03116fc0 Mon Sep 17 00:00:00 2001 From: federetyk Date: Thu, 15 Jan 2026 14:43:48 +0100 Subject: [PATCH 2/7] fix: solve issues in example files --- examples/custom_model_example.py | 3 +++ examples/custom_task_example.py | 1 + examples/run_multiple_models.py | 2 ++ src/workrb/config.py | 2 +- 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/custom_model_example.py b/examples/custom_model_example.py index 2a2d8f7..c2a5d4e 100644 --- a/examples/custom_model_example.py +++ b/examples/custom_model_example.py @@ -9,6 +9,7 @@ import torch from sentence_transformers import SentenceTransformer +import workrb from workrb.models.base import ModelInterface from workrb.registry import register_model from workrb.types import ModelInputType @@ -47,10 +48,12 @@ def __init__( self.encoder.to(device) self.encoder.eval() + @property def name(self) -> str: """Return the unique name of this model.""" return f"MyCustomModel-{self.base_model_name.split('/')[-1]}" + @property def description(self) -> str: """Return the 
description of this model.""" return "A custom model that demonstrates WorkRB extensibility" diff --git a/examples/custom_task_example.py b/examples/custom_task_example.py index 5b2eaa3..99edb3e 100644 --- a/examples/custom_task_example.py +++ b/examples/custom_task_example.py @@ -6,6 +6,7 @@ and implement the required abstract methods. """ +import workrb from workrb.registry import register_task from workrb.tasks.abstract.base import DatasetSplit, LabelType, Language from workrb.tasks.abstract.ranking_base import RankingDataset, RankingTaskGroup diff --git a/examples/run_multiple_models.py b/examples/run_multiple_models.py index ed36ad5..fb3fbd7 100644 --- a/examples/run_multiple_models.py +++ b/examples/run_multiple_models.py @@ -2,6 +2,8 @@ Reproduce benchmark results. """ +import workrb + if __name__ == "__main__": # 1. Setup model and tasks models = [ diff --git a/src/workrb/config.py b/src/workrb/config.py index 0d5c2b3..8a0597c 100644 --- a/src/workrb/config.py +++ b/src/workrb/config.py @@ -205,7 +205,7 @@ def get_pending_work( self, results: BenchmarkResults | None, tasks: Sequence[Task], - ) -> list[tuple]: + ) -> list[tuple[Task, str]]: """Determine what work still needs to be done. Work is defined as a (task, dataset_id) combination that is not completed. From e16f8ddf7c6baefb61c15366a00428f6f7d581c9 Mon Sep 17 00:00:00 2001 From: federetyk Date: Thu, 15 Jan 2026 18:43:09 +0100 Subject: [PATCH 3/7] fix: add language field to MetricsResult for proper per-language aggregation --- src/workrb/results.py | 21 ++++++++++++------ src/workrb/run.py | 2 ++ src/workrb/tasks/abstract/base.py | 36 +++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 6 deletions(-) diff --git a/src/workrb/results.py b/src/workrb/results.py index 5537bcf..6976113 100644 --- a/src/workrb/results.py +++ b/src/workrb/results.py @@ -28,6 +28,10 @@ class MetricsResult(BaseModel): evaluation_time: float = Field(ge=0) metrics_dict: dict[str, Any] = Field(default_factory=dict) """ Dictionary of metric names to their computed values. """ + language: str | None = Field( + default=None, + description="Language code if this is a monolingual dataset, None for cross-language datasets.", + ) class TaskResults(BaseModel): @@ -292,19 +296,24 @@ def _aggregate_per_language( ) -> dict[ResultTagString, float]: """Aggregate results per language. - Collects language-specific results over all tasks, and aggregates all available results. + Collects results for monolingual datasets and aggregates by language across all tasks. + Cross-language datasets (where language is None) are excluded from this aggregation. Results may be imbalanced if tasks support different languages. 
""" - # Collect metric values per task + # Collect metric values per language raw_results = defaultdict(list) for task_result in self.task_results.values(): - for dataset_id, metrics_result in task_result.language_results.items(): + for metrics_result in task_result.language_results.values(): + # Skip cross-language datasets + if metrics_result.language is None: + continue + for metric_name, metric_value in metrics_result.metrics_dict.items(): - raw_results[(dataset_id, metric_name)].append(metric_value) + raw_results[(metrics_result.language, metric_name)].append(metric_value) # Compute stats results = {} - for (dataset_id, metric_name), values in raw_results.items(): + for (language, metric_name), values in raw_results.items(): stats = self._compute_stats(values) for agg in aggregations: assert agg in stats, f"Aggregation {agg} not found in stats: {stats.keys()}" @@ -312,7 +321,7 @@ def _aggregate_per_language( name=tag_name, metric_name=metric_name, aggregation=agg, - grouping_name=dataset_id, + grouping_name=language, ) results[tag] = stats[agg] return results diff --git a/src/workrb/run.py b/src/workrb/run.py index 7fd6e3a..b4646f2 100644 --- a/src/workrb/run.py +++ b/src/workrb/run.py @@ -344,9 +344,11 @@ def _run_pending_work( evaluation_time = time.time() - start_time_eval # Store results + dataset_language = task.get_dataset_language(dataset_id) results.task_results[task.name].language_results[dataset_id] = MetricsResult( evaluation_time=evaluation_time, metrics_dict=dataset_results, + language=dataset_language.value if dataset_language else None, ) # Save incremental results to checkpoint diff --git a/src/workrb/tasks/abstract/base.py b/src/workrb/tasks/abstract/base.py index 83b7f9b..4affe76 100644 --- a/src/workrb/tasks/abstract/base.py +++ b/src/workrb/tasks/abstract/base.py @@ -202,6 +202,42 @@ def get_size_oneliner(self, dataset_id: str) -> str: """ return "" + def get_dataset_language(self, dataset_id: str) -> Language | None: + """Return the language of a dataset if it's monolingual, None otherwise. + + Default implementation assumes dataset_id equals language code (1:1 mapping). + This works for standard monolingual tasks where each dataset corresponds to + exactly one language. + + Tasks with arbitrary dataset IDs (not 1:1 with languages) must override this + method to return the appropriate language for each dataset, or None for + cross-language datasets. + + Args: + dataset_id: Dataset identifier + + Returns + ------- + Language enum if this is a monolingual dataset, None for cross-language datasets + + Raises + ------ + NotImplementedError: If dataset_id is not a valid language code and the + subclass has not overridden this method + """ + try: + lang = Language(dataset_id) + if lang in self.languages: + return lang + return None + except ValueError as e: + raise NotImplementedError( + f"Dataset ID '{dataset_id}' is not a valid language code. " + f"Task '{self.__class__.__name__}' uses arbitrary dataset IDs (not 1:1 with languages) " + f"and must override the 'get_dataset_language' method to map each dataset ID " + f"to its corresponding language, or return None for cross-language datasets." 
+ ) from e + @final @property def split_val_fraction(self) -> float: From e254bc2fd0d8a9efc6e722487e7aa0137e11da67 Mon Sep 17 00:00:00 2001 From: federetyk Date: Fri, 16 Jan 2026 10:01:03 +0100 Subject: [PATCH 4/7] style: update docstrings to comply with NumPy style --- src/workrb/tasks/abstract/base.py | 71 ++++++++++------ .../tasks/abstract/classification_base.py | 80 ++++++++++++------- src/workrb/tasks/abstract/ranking_base.py | 59 +++++++++----- 3 files changed, 138 insertions(+), 72 deletions(-) diff --git a/src/workrb/tasks/abstract/base.py b/src/workrb/tasks/abstract/base.py index 4affe76..8ab9516 100644 --- a/src/workrb/tasks/abstract/base.py +++ b/src/workrb/tasks/abstract/base.py @@ -93,25 +93,32 @@ def languages_to_dataset_ids(self, languages: list[Language]) -> list[str]: Other tasks with multiple datasets per language can override this method to return all datasets that use only languages from the provided set. - Args: - languages: List of Language enums requested for evaluation + Parameters + ---------- + languages : list[Language] + List of Language enums requested for evaluation. Returns ------- - List of dataset identifier strings + list[str] + List of dataset identifier strings. """ return [lang.value for lang in languages] def _load_datasets(self, dataset_ids: list[str], split: DatasetSplit) -> dict[str, Any]: """Load datasets for specified IDs. - Args: - dataset_ids: List of dataset identifiers to load - split: Dataset split to load + Parameters + ---------- + dataset_ids : list[str] + List of dataset identifiers to load. + split : DatasetSplit + Dataset split to load. Returns ------- - Dictionary mapping dataset_id to dataset object + dict[str, Any] + Dictionary mapping dataset_id to dataset object. """ datasets = {} for dataset_id in dataset_ids: @@ -193,12 +200,15 @@ def split_test_fraction(self) -> float: def get_size_oneliner(self, dataset_id: str) -> str: """Get dataset size summary to display status. - Args: - dataset_id: Dataset identifier + Parameters + ---------- + dataset_id : str + Dataset identifier. Returns ------- - Human-readable size string + str + Human-readable size string. """ return "" @@ -213,17 +223,21 @@ def get_dataset_language(self, dataset_id: str) -> Language | None: method to return the appropriate language for each dataset, or None for cross-language datasets. - Args: - dataset_id: Dataset identifier + Parameters + ---------- + dataset_id : str + Dataset identifier. Returns ------- - Language enum if this is a monolingual dataset, None for cross-language datasets + Language or None + Language enum if this is a monolingual dataset, None for cross-language datasets. Raises ------ - NotImplementedError: If dataset_id is not a valid language code and the - subclass has not overridden this method + NotImplementedError + If dataset_id is not a valid language code and the subclass has not + overridden this method. """ try: lang = Language(dataset_id) @@ -255,25 +269,34 @@ def load_dataset(self, dataset_id: str, split: DatasetSplit) -> Any: For other tasks: dataset_id can encode additional information like country and languages (e.g., "deu_q_de_c_de"). - Args: - dataset_id: Unique identifier for the dataset - split: Dataset split to load + Parameters + ---------- + dataset_id : str + Unique identifier for the dataset. + split : DatasetSplit + Dataset split to load. Returns ------- - Dataset object (RankingDataset or ClassificationDataset) + Any + Dataset object (RankingDataset or ClassificationDataset). 
""" @abstractmethod def evaluate(self, model, metrics=None, dataset_id: str = "en") -> dict[str, float]: """Evaluate model on specific dataset. - Args: - model: Model to evaluate - metrics: List of metric names. If None, uses default_metrics - dataset_id: Dataset identifier to evaluate on + Parameters + ---------- + model : ModelInterface + Model to evaluate. + metrics : list[str] or None, optional + List of metric names. If None, uses default_metrics. + dataset_id : str, optional + Dataset identifier to evaluate on. Default is "en". Returns ------- - Dictionary of metric names to values + dict[str, float] + Dictionary of metric names to values. """ diff --git a/src/workrb/tasks/abstract/classification_base.py b/src/workrb/tasks/abstract/classification_base.py index 999bea9..850bf4e 100644 --- a/src/workrb/tasks/abstract/classification_base.py +++ b/src/workrb/tasks/abstract/classification_base.py @@ -46,12 +46,17 @@ def __init__( ): """Initialize classification dataset with validation. - Args: - texts: List of input text strings - labels: List with list of class indices corresponding to each text. - Contains just 1 item per list for single-label classification. - label_space: List of class names/labels (e.g., ["skill1", "skill2", "skill3"]) - dataset_id: Unique identifier for this dataset + Parameters + ---------- + texts : list[str] + List of input text strings. + labels : list[list[int]] + List with list of class indices corresponding to each text. + Contains just 1 item per list for single-label classification. + label_space : list[str] + List of class names/labels (e.g., ["skill1", "skill2", "skill3"]). + dataset_id : str + Unique identifier for this dataset. """ self.texts = self._postprocess_texts(texts) self.labels = self._postprocess_labels(labels) @@ -178,12 +183,15 @@ def threshold(self) -> float | None: def get_output_space_size(self, dataset_id: str) -> int: """Number of output classes for this classification task. - Args: - dataset_id: Dataset identifier + Parameters + ---------- + dataset_id : str + Dataset identifier. Returns ------- - Number of classes in the output space + int + Number of classes in the output space. """ @property @@ -200,24 +208,31 @@ def load_dataset(self, dataset_id: str, split: DatasetSplit) -> ClassificationDa For other tasks: dataset_id can encode arbitrary information. - Args: - dataset_id: Unique identifier for the dataset - split: Dataset split to load + Parameters + ---------- + dataset_id : str + Unique identifier for the dataset. + split : DatasetSplit + Dataset split to load. Returns ------- - ClassificationDataset object + ClassificationDataset + ClassificationDataset object. """ def get_size_oneliner(self, dataset_id: str) -> str: """Get dataset summary to display for progress. - Args: - dataset_id: Dataset identifier + Parameters + ---------- + dataset_id : str + Dataset identifier. Returns ------- - Human-readable size string + str + Human-readable size string. """ dataset: ClassificationDataset = self.datasets[dataset_id] return f"{len(dataset.texts)} samples, {len(dataset.label_space)} classes" @@ -235,14 +250,19 @@ def evaluate( 2. Applies optimized threshold to test predictions 3. Calculates metrics on test data - Args: - model: Model implementing classification interface - metrics: List of metrics to compute - dataset_id: Dataset identifier to evaluate on + Parameters + ---------- + model : ModelInterface + Model implementing classification interface. + metrics : list[str] or None, optional + List of metrics to compute. 
+ dataset_id : str, optional + Dataset identifier to evaluate on. Default is "en". Returns ------- - Dictionary containing metric scores and evaluation metadata + dict[str, float] + Dictionary containing metric scores and evaluation metadata. """ if metrics is None: metrics = self.default_metrics @@ -335,13 +355,17 @@ def _validate_model_label_space( def get_threshold_on_val_data(self, model: ModelInterface, dataset_id: str) -> float: """Get the best threshold on validation data. - Args: - model: Model to evaluate - dataset_id: Dataset identifier + Parameters + ---------- + model : ModelInterface + Model to evaluate. + dataset_id : str + Dataset identifier. Returns ------- - Optimized threshold value + float + Optimized threshold value. """ # Step 1: Optimize threshold on validation data # Load validation data (even if we're evaluating on test) @@ -431,8 +455,10 @@ def __init__( ): """Initialize classification task. - Args: - **kwargs: Arguments passed to parent Task class (languages, split, etc.) + Parameters + ---------- + **kwargs + Arguments passed to parent Task class (languages, split, etc.). """ super().__init__(**kwargs) diff --git a/src/workrb/tasks/abstract/ranking_base.py b/src/workrb/tasks/abstract/ranking_base.py index 20265d0..635ae78 100644 --- a/src/workrb/tasks/abstract/ranking_base.py +++ b/src/workrb/tasks/abstract/ranking_base.py @@ -41,11 +41,16 @@ def __init__( ): """Initialize ranking dataset with validation. - Args: - query_texts: List of query strings - target_indices: List of lists containing indices into the target vocabulary - target_space: List of target vocabulary strings - dataset_id: Unique identifier for this dataset + Parameters + ---------- + query_texts : list[str] + List of query strings. + target_indices : list[list[int]] + List of lists containing indices into the target vocabulary. + target_space : list[str] + List of target vocabulary strings. + dataset_id : str + Unique identifier for this dataset. """ self.query_texts = self._postprocess_texts(query_texts) self.target_indices = self._postprocess_indices(target_indices) @@ -118,10 +123,10 @@ def __init__( ): """Initialize ranking task. - Args: - mode: Evaluation mode ("test" or "val") - language: Language code - **kwargs: Additional arguments for legacy compatibility + Parameters + ---------- + **kwargs + Additional arguments passed to parent Task class. """ super().__init__(**kwargs) @@ -144,24 +149,31 @@ def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: For other tasks: dataset_id can encode arbitrary information. - Args: - dataset_id: Unique identifier for the dataset - split: Dataset split to load + Parameters + ---------- + dataset_id : str + Unique identifier for the dataset. + split : DatasetSplit + Dataset split to load. Returns ------- - RankingDataset object + RankingDataset + RankingDataset object. """ def get_size_oneliner(self, dataset_id: str) -> str: """Get dataset summary to display for progress. - Args: - dataset_id: Dataset identifier + Parameters + ---------- + dataset_id : str + Dataset identifier. Returns ------- - Human-readable size string + str + Human-readable size string. """ dataset = self.datasets[dataset_id] return f"{len(dataset.query_texts)} queries x {len(dataset.target_space)} targets" @@ -174,14 +186,19 @@ def evaluate( ) -> dict[str, float]: """Evaluate the model on this ranking task. - Args: - model: Model implementing ModelInterface (must have compute_rankings method) - metrics: List of metrics to compute. 
If None, uses default_metrics - dataset_id: Dataset identifier to evaluate on + Parameters + ---------- + model : ModelInterface + Model implementing ModelInterface (must have compute_rankings method). + metrics : list[str] or None, optional + List of metrics to compute. If None, uses default_metrics. + dataset_id : str, optional + Dataset identifier to evaluate on. Default is "en". Returns ------- - Dictionary containing metric scores and evaluation metadata + dict[str, float] + Dictionary containing metric scores and evaluation metadata. """ if metrics is None: metrics = self.default_metrics From 054aef3e1998120fbe1e7e8a2d1ab86fb1688e14 Mon Sep 17 00:00:00 2001 From: federetyk Date: Fri, 16 Jan 2026 18:10:17 +0100 Subject: [PATCH 5/7] feat: implement a new ranking task for melo --- src/workrb/tasks/__init__.py | 2 + src/workrb/tasks/abstract/ranking_base.py | 4 +- src/workrb/tasks/ranking/__init__.py | 3 + src/workrb/tasks/ranking/melo.py | 251 ++++++++++++++++++++++ tests/test_task_loading.py | 73 +++++++ 5 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 src/workrb/tasks/ranking/melo.py diff --git a/src/workrb/tasks/__init__.py b/src/workrb/tasks/__init__.py index 7987260..eb1aab7 100644 --- a/src/workrb/tasks/__init__.py +++ b/src/workrb/tasks/__init__.py @@ -11,6 +11,7 @@ from .ranking.job2skill import ESCOJob2SkillRanking from .ranking.job_similarity import JobTitleSimilarityRanking from .ranking.jobnorm import JobBERTJobNormRanking +from .ranking.melo import MELORanking from .ranking.skill2job import ESCOSkill2JobRanking from .ranking.skill_extraction import ( HouseSkillExtractRanking, @@ -35,6 +36,7 @@ "ESCOSkillNormRanking", "JobBERTJobNormRanking", "JobTitleSimilarityRanking", + "MELORanking", "HouseSkillExtractRanking", "TechSkillExtractRanking", "SkillSkapeExtractRanking", diff --git a/src/workrb/tasks/abstract/ranking_base.py b/src/workrb/tasks/abstract/ranking_base.py index 635ae78..0b9bb3c 100644 --- a/src/workrb/tasks/abstract/ranking_base.py +++ b/src/workrb/tasks/abstract/ranking_base.py @@ -38,6 +38,8 @@ def __init__( target_indices: list[list[int]], target_space: list[str], dataset_id: str, + allow_duplicate_queries: bool = True, + allow_duplicate_targets: bool = False, ): """Initialize ranking dataset with validation. 
@@ -56,7 +58,7 @@ def __init__( self.target_indices = self._postprocess_indices(target_indices) self.target_space = self._postprocess_texts(target_space) self.dataset_id = dataset_id - self.validate_dataset() + self.validate_dataset(allow_duplicate_queries, allow_duplicate_targets) def validate_dataset( self, diff --git a/src/workrb/tasks/ranking/__init__.py b/src/workrb/tasks/ranking/__init__.py index 68c8fe6..f34b79b 100644 --- a/src/workrb/tasks/ranking/__init__.py +++ b/src/workrb/tasks/ranking/__init__.py @@ -10,9 +10,11 @@ from workrb.tasks.ranking.job2skill import ESCOJob2SkillRanking from workrb.tasks.ranking.job_similarity import JobTitleSimilarityRanking from workrb.tasks.ranking.jobnorm import JobBERTJobNormRanking +from workrb.tasks.ranking.melo import MELORanking from workrb.tasks.ranking.skill2job import ESCOSkill2JobRanking from workrb.tasks.ranking.skill_extraction import ( HouseSkillExtractRanking, + SkillSkapeExtractRanking, TechSkillExtractRanking, ) from workrb.tasks.ranking.skill_similarity import SkillMatch1kSkillSimilarityRanking @@ -25,6 +27,7 @@ "HouseSkillExtractRanking", "JobBERTJobNormRanking", "JobTitleSimilarityRanking", + "MELORanking", "SkillMatch1kSkillSimilarityRanking", "SkillSkapeExtractRanking", "TechSkillExtractRanking", diff --git a/src/workrb/tasks/ranking/melo.py b/src/workrb/tasks/ranking/melo.py new file mode 100644 index 0000000..0504164 --- /dev/null +++ b/src/workrb/tasks/ranking/melo.py @@ -0,0 +1,251 @@ +"""Job Normalization ranking task using monolingual datasets from MELO (Retyk et al., 2024).""" + +from datasets import load_dataset + +from workrb.registry import register_task +from workrb.tasks.abstract.base import DatasetSplit, LabelType, Language +from workrb.tasks.abstract.ranking_base import RankingDataset, RankingTask, RankingTaskGroup +from workrb.types import ModelInputType + + +@register_task() +class MELORanking(RankingTask): + """ + MELO: Job Normalization ranking task using monolingual datasets from MELO (Retyk et al., 2024). + + This task includes monolingual datasets from the MELO Benchmark, which involve Entity Linking + of occupations (job titles) into ESCO, posed as a ranking task. 
+ """ + + MELO_LANGUAGES = [ + Language.BG, + Language.CS, + Language.DA, + Language.DE, + Language.EN, + Language.ES, + Language.ET, + Language.FR, + Language.HR, + Language.HU, + Language.IT, + Language.LT, + Language.LV, + Language.NL, + Language.NO, + Language.PL, + Language.PT, + Language.RO, + Language.SK, + Language.SL, + Language.SV, + ] + + LANGUAGE_TO_DATASETS = [ + "bgr_q_bg_c_bg", + "bgr_q_bg_c_en", + "cze_q_cs_c_cs", + "cze_q_cs_c_en", + "deu_q_de_c_de", + "deu_q_de_c_en", + "dnk_q_da_c_da", + "dnk_q_da_c_en", + "esp_q_es_c_en", + "esp_q_es_c_es", + "est_q_et_c_en", + "est_q_et_c_et", + "fra_q_fr_c_en", + "fra_q_fr_c_fr", + "hrv_q_hr_c_en", + "hrv_q_hr_c_hr", + "hun_q_hu_c_en", + "hun_q_hu_c_hu", + "ita_q_it_c_en", + "ita_q_it_c_it", + "ltu_q_lt_c_en", + "ltu_q_lt_c_lt", + "lva_q_lv_c_en", + "lva_q_lv_c_lv", + "nld_q_nl_c_en", + "nld_q_nl_c_nl", + "nor_q_no_c_en", + "nor_q_no_c_no", + "pol_q_pl_c_en", + "pol_q_pl_c_pl", + "prt_q_pt_c_en", + "prt_q_pt_c_pt", + "rou_q_ro_c_en", + "rou_q_ro_c_ro", + "svk_q_sk_c_en", + "svk_q_sk_c_sk", + "svn_q_sl_c_en", + "svn_q_sl_c_sl", + "swe_q_sv_c_en", + "swe_q_sv_c_sv", + "usa_q_en_c_de_en_es_fr_it_nl_pl_pt", + "usa_q_en_c_en", + ] + + @property + def name(self) -> str: + """MELO task name.""" + return "MELO" + + @property + def description(self) -> str: + """MELO task description.""" + return "Job Normalization ranking task into ESCO." + + @property + def default_metrics(self) -> list[str]: + return ["mrr", "hit@1", "hit@5", "hit@10"] + + @property + def task_group(self) -> RankingTaskGroup: + """Job Normalization task group.""" + return RankingTaskGroup.JOB_NORMALIZATION + + @property + def supported_query_languages(self) -> list[Language]: + """Supported query languages.""" + return self.MELO_LANGUAGES + + @property + def supported_target_languages(self) -> list[Language]: + """Supported target languages.""" + return self.MELO_LANGUAGES + + @property + def split_test_fraction(self) -> float: + """Fraction of data to use for test split.""" + return 1.0 + + @property + def label_type(self) -> LabelType: + """Multi-label ranking for Job Normalization.""" + return LabelType.MULTI_LABEL + + @property + def query_input_type(self) -> ModelInputType: + """Query input type for job titles.""" + return ModelInputType.JOB_TITLE + + @property + def target_input_type(self) -> ModelInputType: + """Target input type for job titles.""" + return ModelInputType.JOB_TITLE + + def _parse_dataset_id(self, dataset_id: str) -> tuple[str, list[str]]: + """Parse dataset_id into query language and corpus languages. + + Parameters + ---------- + dataset_id : str + Dataset identifier in format: _q__c_[_]. + + Returns + ------- + tuple[str, list[str]] + Tuple of (query_language_code, list_of_corpus_language_codes). + """ + parts = dataset_id.split("_") + # Find the index of 'q' and 'c' markers + q_idx = parts.index("q") + c_idx = parts.index("c") + # Query language is between 'q' and 'c' + query_lang = "_".join(parts[q_idx + 1 : c_idx]) + # Corpus languages are everything after 'c' + corpus_langs = parts[c_idx + 1 :] + return query_lang, corpus_langs + + def languages_to_dataset_ids(self, languages: list[Language]) -> list[str]: + """Filter datasets based on the target languages. + + Parameters + ---------- + languages : list[Language] + List of Language enums requested for evaluation. + + Returns + ------- + list[str] + List of dataset identifier strings. 
+ """ + lang_codes = {lang.value for lang in languages} + + result = [] + for dataset_id in self.LANGUAGE_TO_DATASETS: + query_lang, corpus_langs = self._parse_dataset_id(dataset_id) + # Check if all involved languages are in the target set + all_langs = {query_lang} | set(corpus_langs) + if all_langs <= lang_codes: + result.append(dataset_id) + + return result + + def get_dataset_language(self, dataset_id: str) -> Language | None: + """Return the language of a dataset if it's monolingual, None otherwise. + + Parameters + ---------- + dataset_id : str + Dataset identifier. + + Returns + ------- + Language or None + Language enum if this is a monolingual dataset, None for cross-language datasets. + """ + query_lang, corpus_langs = self._parse_dataset_id(dataset_id) + + # Not monolingual if multiple corpus languages or query != corpus + if len(corpus_langs) != 1 or query_lang != corpus_langs[0]: + return None + + return Language(query_lang) + + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load MELO data from the HuggingFace dataset. + + Parameters + ---------- + dataset_id : str + Unique identifier for the dataset. + split : DatasetSplit + Dataset split to load. + + Returns + ------- + RankingDataset + RankingDataset object. + """ + if split != DatasetSplit.TEST: + raise ValueError(f"Split '{split}' not supported. Use TEST") + + if dataset_id not in self.dataset_ids: + raise ValueError(f"Dataset '{dataset_id}' not supported.") + + ds = load_dataset("Avature/MELO-Benchmark", dataset_id) + + queries = list(ds["queries"]["text"]) + relevancy_labels = list(ds["queries"]["labels"]) + corpus = list(ds["corpus"]["text"]) + + return RankingDataset( + queries, relevancy_labels, corpus, dataset_id=dataset_id, allow_duplicate_targets=True + ) + + @property + def citation(self) -> str: + """MELO task citation.""" + return """ +@inproceedings{retyk2024melo, + title = {{MELO: An Evaluation Benchmark for Multilingual Entity Linking of Occupations}}, + author = {Federico Retyk and Luis Gasco and Casimiro Pio Carrino and Daniel Deniz and Rabih Zbib}, + booktitle = {Proceedings of the 4th Workshop on Recommender Systems for Human Resources + (RecSys in {HR} 2024), in conjunction with the 18th {ACM} Conference on + Recommender Systems}, + year = {2024}, + url = {https://recsyshr.aau.dk/wp-content/uploads/2024/10/RecSysHR2024-paper_2.pdf}, +} +""" diff --git a/tests/test_task_loading.py b/tests/test_task_loading.py index 5ff1d72..3eac51f 100644 --- a/tests/test_task_loading.py +++ b/tests/test_task_loading.py @@ -13,6 +13,7 @@ HouseSkillExtractRanking, JobBERTJobNormRanking, JobTitleSimilarityRanking, + MELORanking, SkillMatch1kSkillSimilarityRanking, SkillSkapeExtractRanking, TechSkillExtractRanking, @@ -44,6 +45,7 @@ def test_ranking_tasks_init_en_splits(): ("ESCOSkillNormRanking", ESCOSkillNormRanking), ("JobNormRanking", JobBERTJobNormRanking), ("JobTitleSimilarityRanking", JobTitleSimilarityRanking), + ("MELORanking", MELORanking), ("SkillExtractHouseRanking", HouseSkillExtractRanking), ("SkillExtractTechRanking", TechSkillExtractRanking), ("SkillExtractSkillSkapeRanking", SkillSkapeExtractRanking), @@ -52,6 +54,7 @@ def test_ranking_tasks_init_en_splits(): tasks_with_only_test_set = [ "JobTitleSimilarityRanking", + "MELORanking", ] results = {"success": [], "failures": []} @@ -208,3 +211,73 @@ def test_init_task_with_skip_on_unsupported_languages( split=self.split, unsupported_lang_mode=unsupported_lang_mode, ) + + +class TestMELORankingDatasetIds: + """Test 
MELORanking.languages_to_dataset_ids filtering logic.""" + + @pytest.mark.parametrize( + "languages,expected_dataset_ids", + [ + # Single language: only monolingual English dataset + ( + [Language.EN], + {"usa_q_en_c_en"}, + ), + # Bulgarian only: only monolingual Bulgarian dataset + ( + [Language.BG], + {"bgr_q_bg_c_bg"}, + ), + # Bulgarian + English: monolingual + cross-lingual datasets for both + ( + [Language.BG, Language.EN], + {"bgr_q_bg_c_bg", "bgr_q_bg_c_en", "usa_q_en_c_en"}, + ), + # Czech + English: monolingual + cross-lingual datasets + ( + [Language.CS, Language.EN], + {"cze_q_cs_c_cs", "cze_q_cs_c_en", "usa_q_en_c_en"}, + ), + # All languages needed for usa_q_en_c_de_en_es_fr_it_nl_pl_pt + ( + [ + Language.EN, + Language.DE, + Language.ES, + Language.FR, + Language.IT, + Language.NL, + Language.PL, + Language.PT, + ], + { + "deu_q_de_c_de", + "deu_q_de_c_en", + "esp_q_es_c_en", + "esp_q_es_c_es", + "fra_q_fr_c_en", + "fra_q_fr_c_fr", + "ita_q_it_c_en", + "ita_q_it_c_it", + "nld_q_nl_c_en", + "nld_q_nl_c_nl", + "pol_q_pl_c_en", + "pol_q_pl_c_pl", + "prt_q_pt_c_en", + "prt_q_pt_c_pt", + "usa_q_en_c_de_en_es_fr_it_nl_pl_pt", + "usa_q_en_c_en", + }, + ), + # Two languages without cross-lingual datasets between them + ( + [Language.BG, Language.CS], + {"bgr_q_bg_c_bg", "cze_q_cs_c_cs"}, + ), + ], + ) + def test_languages_to_dataset_ids(self, languages, expected_dataset_ids): + """Test that dataset_ids matches expected for given language combinations.""" + task = MELORanking(split="test", languages=[lang.value for lang in languages]) + assert set(task.dataset_ids) == expected_dataset_ids From fa861045100f0c60898f5042aa5a871f0e634f05 Mon Sep 17 00:00:00 2001 From: federetyk Date: Fri, 16 Jan 2026 19:30:09 +0100 Subject: [PATCH 6/7] feat: implement a new ranking task for mels --- src/workrb/tasks/__init__.py | 2 + src/workrb/tasks/ranking/__init__.py | 2 + src/workrb/tasks/ranking/melo.py | 6 +- src/workrb/tasks/ranking/mels.py | 208 +++++++++++++++++++++++++++ tests/test_e2e_toy_benchmark.py | 3 +- tests/test_task_loading.py | 3 + 6 files changed, 220 insertions(+), 4 deletions(-) create mode 100644 src/workrb/tasks/ranking/mels.py diff --git a/src/workrb/tasks/__init__.py b/src/workrb/tasks/__init__.py index eb1aab7..eba7398 100644 --- a/src/workrb/tasks/__init__.py +++ b/src/workrb/tasks/__init__.py @@ -12,6 +12,7 @@ from .ranking.job_similarity import JobTitleSimilarityRanking from .ranking.jobnorm import JobBERTJobNormRanking from .ranking.melo import MELORanking +from .ranking.mels import MELSRanking from .ranking.skill2job import ESCOSkill2JobRanking from .ranking.skill_extraction import ( HouseSkillExtractRanking, @@ -37,6 +38,7 @@ "JobBERTJobNormRanking", "JobTitleSimilarityRanking", "MELORanking", + "MELSRanking", "HouseSkillExtractRanking", "TechSkillExtractRanking", "SkillSkapeExtractRanking", diff --git a/src/workrb/tasks/ranking/__init__.py b/src/workrb/tasks/ranking/__init__.py index f34b79b..9f4f2f0 100644 --- a/src/workrb/tasks/ranking/__init__.py +++ b/src/workrb/tasks/ranking/__init__.py @@ -11,6 +11,7 @@ from workrb.tasks.ranking.job_similarity import JobTitleSimilarityRanking from workrb.tasks.ranking.jobnorm import JobBERTJobNormRanking from workrb.tasks.ranking.melo import MELORanking +from workrb.tasks.ranking.mels import MELSRanking from workrb.tasks.ranking.skill2job import ESCOSkill2JobRanking from workrb.tasks.ranking.skill_extraction import ( HouseSkillExtractRanking, @@ -28,6 +29,7 @@ "JobBERTJobNormRanking", "JobTitleSimilarityRanking", "MELORanking", + 
"MELSRanking", "SkillMatch1kSkillSimilarityRanking", "SkillSkapeExtractRanking", "TechSkillExtractRanking", diff --git a/src/workrb/tasks/ranking/melo.py b/src/workrb/tasks/ranking/melo.py index 0504164..5967534 100644 --- a/src/workrb/tasks/ranking/melo.py +++ b/src/workrb/tasks/ranking/melo.py @@ -1,4 +1,4 @@ -"""Job Normalization ranking task using monolingual datasets from MELO (Retyk et al., 2024).""" +"""Job Normalization ranking task using datasets from MELO (Retyk et al., 2024).""" from datasets import load_dataset @@ -11,9 +11,9 @@ @register_task() class MELORanking(RankingTask): """ - MELO: Job Normalization ranking task using monolingual datasets from MELO (Retyk et al., 2024). + MELO: Job Normalization ranking task using datasets from MELO (Retyk et al., 2024). - This task includes monolingual datasets from the MELO Benchmark, which involve Entity Linking + This task includes datasets from the MELO Benchmark, which involve Entity Linking of occupations (job titles) into ESCO, posed as a ranking task. """ diff --git a/src/workrb/tasks/ranking/mels.py b/src/workrb/tasks/ranking/mels.py new file mode 100644 index 0000000..533d9b7 --- /dev/null +++ b/src/workrb/tasks/ranking/mels.py @@ -0,0 +1,208 @@ +"""Skill Normalization ranking task using datasets inspired in MELO (Retyk et al., 2024).""" + +from datasets import load_dataset + +from workrb.registry import register_task +from workrb.tasks.abstract.base import DatasetSplit, LabelType, Language +from workrb.tasks.abstract.ranking_base import RankingDataset, RankingTask, RankingTaskGroup +from workrb.types import ModelInputType + + +@register_task() +class MELSRanking(RankingTask): + """ + MELS (Multilingual Entity Linking of Skills): Skills Normalization ranking task using datasets + that were inspired in MELO (Retyk et al., 2024). + + MELS is a sibling dataset to MELO (Multilingual Entity Linking of Occupations). Both datasets + were built using the same methodology and the same type of source data: crosswalks between + national taxonomies and ESCO, published by official labor-related organizations from EU member + states. + + The difference is the entity type: + - MELO links occupation mentions (job titles) to ESCO Occupations + - MELS links skill mentions to ESCO Skills + """ + + MELS_LANGUAGES = [ + Language.DE, + Language.EN, + Language.FR, + Language.NL, + Language.SV, + ] + + LANGUAGE_TO_DATASETS = [ + "bel_q_fr_c_fr", + "bel_q_fr_c_en", + "bel_q_nl_c_nl", + "bel_q_nl_c_en", + "deu_q_de_c_de", + "deu_q_de_c_en", + "swe_q_sv_c_sv", + "swe_q_sv_c_en", + ] + + @property + def name(self) -> str: + """MELS task name.""" + return "MELS" + + @property + def description(self) -> str: + """MELS task description.""" + return "Skill Normalization ranking task into ESCO." 
+ + @property + def default_metrics(self) -> list[str]: + return ["mrr", "hit@1", "hit@5", "hit@10"] + + @property + def task_group(self) -> RankingTaskGroup: + """Skill Normalization task group.""" + return RankingTaskGroup.SKILL_NORMALIZATION + + @property + def supported_query_languages(self) -> list[Language]: + """Supported query languages.""" + return self.MELS_LANGUAGES + + @property + def supported_target_languages(self) -> list[Language]: + """Supported target languages.""" + return self.MELS_LANGUAGES + + @property + def split_test_fraction(self) -> float: + """Fraction of data to use for test split.""" + return 1.0 + + @property + def label_type(self) -> LabelType: + """Multi-label ranking for Skill Normalization.""" + return LabelType.MULTI_LABEL + + @property + def query_input_type(self) -> ModelInputType: + """Query input type for skill names.""" + return ModelInputType.SKILL_NAME + + @property + def target_input_type(self) -> ModelInputType: + """Target input type for skill names.""" + return ModelInputType.SKILL_NAME + + def _parse_dataset_id(self, dataset_id: str) -> tuple[str, list[str]]: + """Parse dataset_id into query language and corpus languages. + + Parameters + ---------- + dataset_id : str + Dataset identifier in format: _q__c_[_]. + + Returns + ------- + tuple[str, list[str]] + Tuple of (query_language_code, list_of_corpus_language_codes). + """ + parts = dataset_id.split("_") + # Find the index of 'q' and 'c' markers + q_idx = parts.index("q") + c_idx = parts.index("c") + # Query language is between 'q' and 'c' + query_lang = "_".join(parts[q_idx + 1 : c_idx]) + # Corpus languages are everything after 'c' + corpus_langs = parts[c_idx + 1 :] + return query_lang, corpus_langs + + def languages_to_dataset_ids(self, languages: list[Language]) -> list[str]: + """Filter datasets based on the target languages. + + Parameters + ---------- + languages : list[Language] + List of Language enums requested for evaluation. + + Returns + ------- + list[str] + List of dataset identifier strings. + """ + lang_codes = {lang.value for lang in languages} + + result = [] + for dataset_id in self.LANGUAGE_TO_DATASETS: + query_lang, corpus_langs = self._parse_dataset_id(dataset_id) + # Check if all involved languages are in the target set + all_langs = {query_lang} | set(corpus_langs) + if all_langs <= lang_codes: + result.append(dataset_id) + + return result + + def get_dataset_language(self, dataset_id: str) -> Language | None: + """Return the language of a dataset if it's monolingual, None otherwise. + + Parameters + ---------- + dataset_id : str + Dataset identifier. + + Returns + ------- + Language or None + Language enum if this is a monolingual dataset, None for cross-language datasets. + """ + query_lang, corpus_langs = self._parse_dataset_id(dataset_id) + + # Not monolingual if multiple corpus languages or query != corpus + if len(corpus_langs) != 1 or query_lang != corpus_langs[0]: + return None + + return Language(query_lang) + + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + """Load MELS data from the HuggingFace dataset. + + Parameters + ---------- + dataset_id : str + Unique identifier for the dataset. + split : DatasetSplit + Dataset split to load. + + Returns + ------- + RankingDataset + RankingDataset object. + """ + if split != DatasetSplit.TEST: + raise ValueError(f"Split '{split}' not supported. 
Use TEST") + + if dataset_id not in self.dataset_ids: + raise ValueError(f"Dataset '{dataset_id}' not supported.") + + ds = load_dataset("Avature/MELS-Benchmark", dataset_id) + + queries = list(ds["queries"]["text"]) + relevancy_labels = list(ds["queries"]["labels"]) + corpus = list(ds["corpus"]["text"]) + + return RankingDataset( + queries, relevancy_labels, corpus, dataset_id=dataset_id, allow_duplicate_targets=True + ) + + @property + def citation(self) -> str: + """MELS task citation.""" + return """ +@inproceedings{retyk2024melo, + title = {{MELO: An Evaluation Benchmark for Multilingual Entity Linking of Occupations}}, + author = {Federico Retyk and Luis Gasco and Casimiro Pio Carrino and Daniel Deniz and Rabih Zbib}, + booktitle = {Proceedings of the 4th Workshop on Recommender Systems for Human Resources + (RecSys in {HR} 2024), in conjunction with the 18th {ACM} Conference on + Recommender Systems}, + year = {2024}, + url = {https://recsyshr.aau.dk/wp-content/uploads/2024/10/RecSysHR2024-paper_2.pdf}, +} +""" diff --git a/tests/test_e2e_toy_benchmark.py b/tests/test_e2e_toy_benchmark.py index 5964780..5761bea 100644 --- a/tests/test_e2e_toy_benchmark.py +++ b/tests/test_e2e_toy_benchmark.py @@ -62,7 +62,8 @@ def get_all_tasks(split: str = "val", languages: list[str] | None = None) -> lis # Some tasks may have required parameters beyond split/languages try: task_instance = toy_task_class(split=split, languages=languages) - toy_tasks.append(task_instance) + if len(task_instance.datasets) > 0: + toy_tasks.append(task_instance) except TypeError as e: # Task might require additional parameters skipped_tasks.append((task_name, f"Instantiation error: {e}")) diff --git a/tests/test_task_loading.py b/tests/test_task_loading.py index 3eac51f..5be98c4 100644 --- a/tests/test_task_loading.py +++ b/tests/test_task_loading.py @@ -14,6 +14,7 @@ JobBERTJobNormRanking, JobTitleSimilarityRanking, MELORanking, + MELSRanking, SkillMatch1kSkillSimilarityRanking, SkillSkapeExtractRanking, TechSkillExtractRanking, @@ -46,6 +47,7 @@ def test_ranking_tasks_init_en_splits(): ("JobNormRanking", JobBERTJobNormRanking), ("JobTitleSimilarityRanking", JobTitleSimilarityRanking), ("MELORanking", MELORanking), + ("MELSRanking", MELSRanking), ("SkillExtractHouseRanking", HouseSkillExtractRanking), ("SkillExtractTechRanking", TechSkillExtractRanking), ("SkillExtractSkillSkapeRanking", SkillSkapeExtractRanking), @@ -55,6 +57,7 @@ def test_ranking_tasks_init_en_splits(): tasks_with_only_test_set = [ "JobTitleSimilarityRanking", "MELORanking", + "MELSRanking", ] results = {"success": [], "failures": []} From f6daad2ab22b30116d61da1791e87dbdd4be35de Mon Sep 17 00:00:00 2001 From: federetyk Date: Fri, 16 Jan 2026 19:49:20 +0100 Subject: [PATCH 7/7] docs: add new ranking tasks melo and mels to the table of tasks in the readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 08faaef..1b47c73 100644 --- a/README.md +++ b/README.md @@ -207,12 +207,14 @@ lang_result_ci = summary["mean_per_language/en/f1_macro/ci_margin"] | Job to Skills WorkBench | multi_label | 3039 queries x 13939 targets | 28 | | Job Title Similarity | multi_label | 105 queries x 2619 targets | 11 | | Job Normalization | single_label | 15463 queries x 2942 targets | 28 | +| Job Normalization MELO | multi_label | 633 queries x 33813 targets | 21 | | Skill to Job WorkBench | multi_label | 13492 queries x 3039 targets | 28 | | Skill Extraction House | multi_label | 262 queries x 13891 targets | 28 | | Skill 
Extraction Tech | multi_label | 338 queries x 13891 targets | 28 | | Skill Extraction SkillSkape | multi_label | 1191 queries x 13891 targets | 28 | | Skill Similarity SkillMatch-1K | single_label | 900 queries x 2648 targets | 1 | | Skill Normalization ESCO | multi_label | 72008 queries x 13939 targets | 28 | +| Skill Normalization MELS | multi_label | 1722 queries x 19466 targets | 5 | | **Classification** | Job-Skill Classification | multi_label | 3039 samples, 13939 classes | 28 |
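
As a quick orientation for reviewers, the sketch below shows how the MELO and MELS tasks added in this series plug into the dataset_id-based API introduced in the first patch. It is a hypothetical usage example, not part of the diff: the task constructors, `dataset_ids`, and `evaluate(model, dataset_id=...)` calls mirror the tests above, while the `BiEncoderModel` import path and checkpoint name are assumptions borrowed from the test suite and may need adjusting.

```python
# Hypothetical usage sketch (not part of the patch).
# Assumption: BiEncoderModel is importable like this; the tests construct
# BiEncoderModel("all-MiniLM-L6-v2"), but the exact module path may differ.
from workrb.models import BiEncoderModel
from workrb.tasks import MELORanking, MELSRanking

# Both tasks only ship a TEST split; the requested languages are expanded
# into dataset IDs such as "deu_q_de_c_de" or "usa_q_en_c_en".
melo = MELORanking(split="test", languages=["en", "de"])
mels = MELSRanking(split="test", languages=["en", "de"])

model = BiEncoderModel("all-MiniLM-L6-v2")

for task in (melo, mels):
    for dataset_id in task.dataset_ids:
        metrics = task.evaluate(model, dataset_id=dataset_id)
        print(f"{task.name} / {dataset_id}: mrr={metrics['mrr']:.3f}")
```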