diff --git a/dalla_data_processing/cli.py b/dalla_data_processing/cli.py index bf02f7e..e891459 100644 --- a/dalla_data_processing/cli.py +++ b/dalla_data_processing/cli.py @@ -651,7 +651,15 @@ def pack( tokenizer = RBPETokenizer.from_pretrained(config_data["tokenizer_path"]) except ImportError: logger.error("Missing rbpe package") - logger.error("Install with: pip install rbpe") + logger.error( + "rbpe is not included in the default installation due to " + "dependency conflicts with camel-tools (transformers version requirements)" + ) + logger.error("Install separately with: pip install rbpe") + logger.error( + "Note: Installing rbpe may require a separate environment " + "if you also use dedup/stem/quality features" + ) sys.exit(1) else: try: diff --git a/dalla_data_processing/quality/checker.py b/dalla_data_processing/quality/checker.py index 077bbb1..f986ed6 100644 --- a/dalla_data_processing/quality/checker.py +++ b/dalla_data_processing/quality/checker.py @@ -11,6 +11,7 @@ from types import MethodType from typing import Any +from camel_tools.data.catalogue import Catalogue from camel_tools.disambig.bert import BERTUnfactoredDisambiguator from camel_tools.disambig.mle import MLEDisambiguator from datasets import Dataset @@ -53,6 +54,25 @@ def __init__(self, timeout: int = 3600, model: str = "mle", use_gpu: bool = Fals def _init_disambiguator(self): """Initialize and configure the disambiguator with caching.""" + # Install required CAMeL Tools packages based on model type + logger.info("Checking CAMeL Tools data packages...") + catalogue = Catalogue.load_catalogue() + + try: + catalogue.download_package("morphology-db-msa-r13") + catalogue.download_package("disambig-mle-calima-msa-r13") + logger.info("msa-r13 packages installed") + except Exception as e: + logger.warning(f"Package installation warning: {e}") + + # Install BERT package if using BERT model + if self.model == "bert": + try: + catalogue.download_package("disambig-bert-unfactored-all") + logger.info("BERT package installed") + except Exception as e: + logger.warning(f"BERT package installation warning: {e}") + if self.model == "mle": self.disambiguator = MLEDisambiguator.pretrained() logger.info("MLE disambiguator loaded") diff --git a/dalla_data_processing/stemming/stemmer.py b/dalla_data_processing/stemming/stemmer.py index 8bed063..7cbe0df 100644 --- a/dalla_data_processing/stemming/stemmer.py +++ b/dalla_data_processing/stemming/stemmer.py @@ -473,12 +473,19 @@ def stem_dataset( catalogue = Catalogue.load_catalogue() try: catalogue.download_package("morphology-db-msa-r13") - if model == "mle": - catalogue.download_package("disambig-mle-calima-msa-r13") - # For BERT, let it download automatically when pretrained() is called - logger.info("CAMeL Tools data packages ready") + catalogue.download_package("disambig-mle-calima-msa-r13") + logger.info("msa-r13 packages installed") except Exception as e: - logger.warning(f"Could not verify CAMeL packages: {e}") + logger.warning(f"Package installation warning: {e}") + + if model == "bert": + try: + catalogue.download_package("disambig-bert-unfactored-all") + logger.info("BERT package installed") + except Exception as e: + logger.warning(f"BERT package installation warning: {e}") + + logger.info("CAMeL Tools data packages ready") logger.info("Loading additional words lists...") words_dir = os.path.join(os.path.dirname(__file__), "data") @@ -597,15 +604,21 @@ def stem( if not all(isinstance(t, str) for t in text_list): raise TypeError("All items in text list must be strings") - # Initialize disambiguator (cached globally if possible) logger.info(f"Initializing {model.upper()} disambiguator...") catalogue = Catalogue.load_catalogue() try: catalogue.download_package("morphology-db-msa-r13") - if model == "mle": - catalogue.download_package("disambig-mle-calima-msa-r13") + catalogue.download_package("disambig-mle-calima-msa-r13") + logger.info("msa-r13 packages installed") except Exception as e: - logger.warning(f"Could not verify CAMeL packages: {e}") + logger.warning(f"Package installation warning: {e}") + + if model == "bert": + try: + catalogue.download_package("disambig-bert-unfactored-all") + logger.info("BERT package installed") + except Exception as e: + logger.warning(f"BERT package installation warning: {e}") if model == "mle": disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000) diff --git a/pyproject.toml b/pyproject.toml index cbf1851..c0dd637 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ {name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"} ] readme = "README.md" -requires-python = ">=3.12" +requires-python = ">=3.12,<3.13" keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"] classifiers = [ "Intended Audience :: Developers", @@ -39,23 +39,24 @@ dev = [ "pre-commit>=3.0.0", ] dedup = [ - "camel-tools>=1.5.0", + "camel-tools==1.5.7", ] dedup-native = [ "cffi>=1.15.0", ] stem = [ - "camel-tools>=1.5.0", + "camel-tools==1.5.7", ] quality = [ - "camel-tools>=1.5.0", + "camel-tools==1.5.7", ] readability = [ "textstat>=0.7.0", ] pack = [ "sentencepiece>=0.2.0", - "rbpe", + # "rbpe", # excluded due to transformers version conflict with camel-tools + # users should install separately if needed: pip install rbpe "pyyaml", ] all = [