diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 72af194..c471bb0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -18,6 +18,8 @@ jobs:
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch full history for setuptools-scm
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
@@ -58,6 +60,8 @@ jobs:
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch full history for setuptools-scm
- name: Install build dependencies
run: |
@@ -86,6 +90,8 @@ jobs:
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch full history for setuptools-scm
- name: Set up Python
uses: actions/setup-python@v5
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3df56ac..babc49c 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -22,6 +22,8 @@ jobs:
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch full history for setuptools-scm
- name: Install build dependencies
run: |
@@ -60,6 +62,8 @@ jobs:
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch full history for setuptools-scm
- name: Create bin directory
run: mkdir -p dalla_data_processing/deduplication/bin
@@ -111,6 +115,8 @@ jobs:
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch full history for setuptools-scm
- name: Set up Python
uses: actions/setup-python@v5
@@ -167,6 +173,8 @@ jobs:
steps:
- uses: actions/checkout@v4
+ with:
+ fetch-depth: 0 # Fetch full history for setuptools-scm
- uses: actions/download-artifact@v4
with:
diff --git a/.gitignore b/.gitignore
index 5c952c5..54b501a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,9 @@ wheels/
.installed.cfg
*.egg
+# setuptools-scm version file
+dalla_data_processing/_version.py
+
# Virtual environments
venv/
env/
diff --git a/README.md b/README.md
index 030cb6e..41f91a9 100644
--- a/README.md
+++ b/README.md
@@ -10,49 +10,90 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
## Installation
+### Quick Start (All Features)
+
+For most users, install with all features enabled:
+
Using uv
```bash
-# Install the package
-uv pip install dalla-data-processing
+uv pip install "dalla-data-processing[all]"
```
-
Using pip
```bash
-# Install the package
+pip install "dalla-data-processing[all]"
+```
+
+### Modular Installation (Advanced)
+
+Install only the components you need to keep dependencies minimal:
+
+```bash
+# Base installation (no processing features, only core dependencies)
pip install dalla-data-processing
+
+# Install specific features
+pip install "dalla-data-processing[dedup]" # Deduplication only
+pip install "dalla-data-processing[stem]" # Stemming only
+pip install "dalla-data-processing[quality]" # Quality checking only
+pip install "dalla-data-processing[readability]" # Readability scoring only
+pip install "dalla-data-processing[pack]" # Dataset packing only
+
+# Combine multiple features
+pip install "dalla-data-processing[dedup,stem,quality]"
```
+### Development Installation
-From Source
+From Source (with uv - recommended)
```bash
git clone https://github.com/U4RASD/dalla-data-processing.git
cd dalla-data-processing
-# Using uv
-uv pip install -e .
+# Install all features and dev dependencies
+uv sync --all-extras
-# Or using pip
-pip install -e .
+# Or install with specific extras only
+uv sync --extra dedup --extra stem
+```
+
+From Source (with pip)
+
+```bash
+git clone https://github.com/U4RASD/dalla-data-processing.git
+cd dalla-data-processing
+
+# Install with all features for development
+pip install -e ".[all,dev]"
```
## Components
+> **Note:** Each component requires its corresponding extra to be installed. Install with `[all]` to enable all features, or see [Modular Installation](#modular-installation-advanced) to install only what you need.
+
### 1. [Deduplication](dalla_data_processing/deduplication/README.md)
Detect and remove duplicate or near-duplicate documents from your datasets using the Onion algorithm.
+- **Requires:** `[dedup]` extra
### 2. [Stemming](dalla_data_processing/stemming/README.md)
Apply morphological analysis and stemming using CAMeL Tools.
+- **Requires:** `[stem]` extra
### 3. [Quality Checking](dalla_data_processing/quality/README.md)
Check text quality using morphological analysis to detect errors and foreign words.
+- **Requires:** `[quality]` extra
### 4. [Readability Scoring](dalla_data_processing/readability/README.md)
Calculate readability scores using Flesch Reading Ease and Osman methods.
Also includes ranking according to both scores
+- **Requires:** `[readability]` extra
+
+### 5. [Dataset Packing](dalla_data_processing/packing/README.md)
+Pack and prepare datasets for training.
+- **Requires:** `[pack]` extra
## Links
diff --git a/dalla_data_processing/__init__.py b/dalla_data_processing/__init__.py
index bd9bb00..9231721 100644
--- a/dalla_data_processing/__init__.py
+++ b/dalla_data_processing/__init__.py
@@ -8,31 +8,49 @@
- Readability scoring
"""
-__version__ = "0.0.1"
-
try:
- from dalla_data_processing.core.dataset import DatasetManager
-
- _has_dataset = True
+ from dalla_data_processing._version import version as __version__
except ImportError:
- _has_dataset = False
- DatasetManager = None
+ # Fallback for development without installation
+ try:
+ from importlib.metadata import PackageNotFoundError, version
-try:
- from dalla_data_processing.utils.tokenize import simple_word_tokenize
+ __version__ = version("dalla-data-processing")
+ except PackageNotFoundError:
+ __version__ = "0.0.0+unknown"
- _has_tokenize = True
-except ImportError:
- _has_tokenize = False
- simple_word_tokenize = None
-try:
- from dalla_data_processing.stemming import stem, stem_dataset
+# Lazy imports - only import when actually used, not at package load time
+def __getattr__(name):
+ """Lazy load heavy modules only when accessed."""
+ if name == "DatasetManager":
+ from dalla_data_processing.core.dataset import DatasetManager
+
+ return DatasetManager
+ elif name == "simple_word_tokenize":
+ from dalla_data_processing.utils.tokenize import simple_word_tokenize
+
+ return simple_word_tokenize
+ elif name == "stem":
+ from dalla_data_processing.stemming import stem
+
+ return stem
+ elif name == "stem_dataset":
+ from dalla_data_processing.stemming import stem_dataset
+
+ return stem_dataset
+ elif name == "DatasetPacker":
+ from dalla_data_processing.packing import DatasetPacker
+
+ return DatasetPacker
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
- _has_stemming = True
-except ImportError:
- _has_stemming = False
- stem = None
- stem_dataset = None
-__all__ = ["DatasetManager", "simple_word_tokenize", "stem", "stem_dataset", "__version__"]
+__all__ = [
+ "DatasetManager",
+ "simple_word_tokenize",
+ "stem",
+ "stem_dataset",
+ "DatasetPacker",
+ "__version__",
+]
diff --git a/dalla_data_processing/cli.py b/dalla_data_processing/cli.py
index 252b717..bf02f7e 100644
--- a/dalla_data_processing/cli.py
+++ b/dalla_data_processing/cli.py
@@ -1,5 +1,5 @@
"""
-Main CLI entry point for dalla-process.
+Main CLI entry point for dalla-data-processing (dalla-dp).
This module provides the unified command-line interface for all
Arabic data processing operations.
@@ -7,16 +7,18 @@
import sys
from pathlib import Path
+from typing import TYPE_CHECKING
import click
-from datasets import Dataset, DatasetDict
from dalla_data_processing import __version__
-from dalla_data_processing.core.dataset import DatasetManager
-from dalla_data_processing.utils import get_logger, setup_logging
+from dalla_data_processing.utils.logger import get_logger
-setup_logging(log_format="console", log_level="INFO")
-logger = get_logger(__name__)
+if TYPE_CHECKING:
+ from datasets import Dataset
+
+
+logger = get_logger("dalla.cli")
class Context:
@@ -30,16 +32,103 @@ def __init__(self):
self.verbose: bool = False
self.overwrite: bool = False
self.dataset: Dataset | None = None
- self.dataset_manager = DatasetManager()
+ self._dataset_manager = None
+
+ @property
+ def dataset_manager(self):
+ """Lazy load DatasetManager."""
+ if self._dataset_manager is None:
+ from dalla_data_processing.core.dataset import DatasetManager
+
+ self._dataset_manager = DatasetManager()
+ return self._dataset_manager
pass_context = click.make_pass_decorator(Context, ensure=True)
-@click.group(
- context_settings={"help_option_names": ["-h", "--help"], "allow_interspersed_args": True}
-)
-@click.version_option(version=__version__, prog_name="dalla-process")
+def common_options(func):
+ """Decorator to add common dataset processing options to commands.
+
+ This allows options to be specified either before or after the subcommand,
+ and ensures they appear in each subcommand's help text.
+ """
+ decorators = [
+ click.option(
+ "--input-dataset",
+ "-i",
+ type=click.Path(exists=True, path_type=Path),
+ help="Path to input HuggingFace dataset",
+ ),
+ click.option(
+ "--output-dataset",
+ "-o",
+ type=click.Path(path_type=Path),
+ help="Path to save output HuggingFace dataset",
+ ),
+ click.option(
+ "--column",
+ "-c",
+ default="text",
+ help="Column name to process (default: 'text')",
+ ),
+ click.option(
+ "--num-workers",
+ "-w",
+ type=int,
+ help="Number of parallel workers (default: auto)",
+ ),
+ click.option(
+ "--verbose",
+ "-v",
+ is_flag=True,
+ help="Enable verbose output",
+ ),
+ click.option(
+ "--quiet",
+ "-q",
+ is_flag=True,
+ help="Suppress non-error output",
+ ),
+ click.option(
+ "--overwrite",
+ is_flag=True,
+ help="Overwrite output dataset if it already exists",
+ ),
+ ]
+ for decorator in reversed(decorators):
+ func = decorator(func)
+ return func
+
+
+def _setup_context_and_logging(
+ ctx: Context,
+ input_dataset: Path | None,
+ output_dataset: Path | None,
+ column: str,
+ num_workers: int | None,
+ verbose: bool,
+ quiet: bool,
+ overwrite: bool,
+):
+ """Helper to populate context from command-level parameters and setup logging."""
+ ctx.input_dataset = input_dataset or ctx.input_dataset
+ ctx.output_dataset = output_dataset or ctx.output_dataset
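+ # Prefer the subcommand's column when it differs from the default; otherwise keep the group-level value (or the default)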
+ ctx.column = column if column != "text" else ctx.column or column
+ ctx.num_workers = num_workers or ctx.num_workers
+ ctx.verbose = verbose or ctx.verbose
+ ctx.overwrite = overwrite or ctx.overwrite
+
+ from dalla_data_processing.utils import setup_logging
+
+ if quiet:
+ setup_logging(log_format="console", log_level="ERROR")
+ elif ctx.verbose:
+ setup_logging(log_format="console", log_level="DEBUG")
+
+
+@click.group(context_settings={"help_option_names": ["-h", "--help"]})
+@click.version_option(version=__version__, prog_name="dalla-data-processing")
@click.option(
"--input-dataset",
"-i",
@@ -95,22 +184,12 @@ def cli(
"""
Dalla Data Processing - Unified Arabic Data Processing Pipeline
- A comprehensive toolkit for processing Arabic text data with support for:
- Deduplication using onion algorithm
- Stemming and morphological analysis
- Quality checking
- Readability scoring
+ - Dataset packing for training
- Examples:
-
- # Deduplicate a dataset
- dalla-dp -i ./data/raw -o ./data/deduped deduplicate
-
- # Stem text with 8 workers
- dalla-dp -i ./data/raw -o ./data/stemmed -w 8 stem
-
- # Check quality with custom column
- dalla-dp -i ./data/raw -o ./data/quality -c content quality-check
"""
ctx.input_dataset = input_dataset
ctx.output_dataset = output_dataset
@@ -119,13 +198,17 @@ def cli(
ctx.verbose = verbose
ctx.overwrite = overwrite
- if quiet:
- setup_logging(log_format="console", log_level="ERROR")
- elif verbose:
- setup_logging(log_format="console", log_level="DEBUG")
+ if quiet or verbose:
+ from dalla_data_processing.utils import setup_logging
+
+ if quiet:
+ setup_logging(log_format="console", log_level="ERROR")
+ elif verbose:
+ setup_logging(log_format="console", log_level="DEBUG")
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
+@common_options
@click.option(
"--threshold",
"-t",
@@ -161,6 +244,13 @@ def cli(
@pass_context
def deduplicate(
ctx: Context,
+ input_dataset: Path | None,
+ output_dataset: Path | None,
+ column: str,
+ num_workers: int | None,
+ verbose: bool,
+ quiet: bool,
+ overwrite: bool,
threshold: float,
return_pairs: bool,
keep_vert_files: bool,
@@ -169,20 +259,28 @@ def deduplicate(
onion_binary: str | None,
):
"""Remove duplicate entries using onion algorithm."""
+ _setup_context_and_logging(
+ ctx, input_dataset, output_dataset, column, num_workers, verbose, quiet, overwrite
+ )
_require_io_paths(ctx)
- click.echo(f"Loading dataset from {ctx.input_dataset}")
+ logger.info(f"Loading dataset from {ctx.input_dataset}")
dataset = ctx.dataset_manager.load(ctx.input_dataset)
dataset = _handle_dataset_dict(dataset)
mode = "pairs" if return_pairs else "filter"
- click.echo(f"Deduplicating with threshold={threshold}, mode={mode}")
+ logger.info(f"Deduplicating with threshold={threshold}, mode={mode}")
if calculate_scores:
- click.echo(" Phase 2: ON (calculating similarity scores)")
+ logger.info(" Phase 2: ON (calculating similarity scores)")
else:
- click.echo(" Phase 2: OFF (faster, sufficient for most use cases)")
+ logger.info(" Phase 2: OFF (faster, sufficient for most use cases)")
- from dalla_data_processing.deduplication import deduplicate_dataset
+ try:
+ from dalla_data_processing.deduplication import deduplicate_dataset
+ except ImportError:
+ logger.error("Missing dependencies for deduplication")
+ logger.error("Install with: pip install 'dalla-data-processing[dedup]'")
+ sys.exit(1)
deduplicated = deduplicate_dataset(
dataset,
@@ -195,28 +293,31 @@ def deduplicate(
onion_binary=Path(onion_binary) if onion_binary else None,
)
- click.echo(f"Saving deduplicated dataset to {ctx.output_dataset}")
+ logger.info(f"Saving deduplicated dataset to {ctx.output_dataset}")
ctx.dataset_manager.save(deduplicated, ctx.output_dataset, overwrite=ctx.overwrite)
+ from dalla_data_processing.core.dataset import DatasetManager
+
original_size = DatasetManager.get_size(dataset)
final_size = DatasetManager.get_size(deduplicated)
- click.echo(click.style("✓ Deduplication complete", fg="green"))
- click.echo(f" Original: {original_size:,} examples")
+ logger.info("✓ Deduplication complete")
+ logger.info(f" Original: {original_size:,} examples")
if return_pairs:
num_dups = sum(1 for ex in deduplicated if ex.get("is_duplicate", False))
- click.echo(
+ logger.info(
f" Documents with duplicates: {num_dups:,} ({num_dups / original_size * 100:.1f}%)"
)
- click.echo(" Added columns: duplicate_cluster, is_duplicate, duplicate_count")
+ logger.info(" Added columns: duplicate_cluster, is_duplicate, duplicate_count")
else:
removed = original_size - final_size
- click.echo(f" Removed: {removed:,} duplicates ({removed / original_size * 100:.1f}%)")
- click.echo(f" Final: {final_size:,} examples")
+ logger.info(f" Removed: {removed:,} duplicates ({removed / original_size * 100:.1f}%)")
+ logger.info(f" Final: {final_size:,} examples")
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
+@common_options
@click.option(
"--sep-token",
default="<+>",
@@ -245,19 +346,39 @@ def deduplicate(
)
@pass_context
def stem(
- ctx: Context, sep_token: str, normalize: bool, keep_diacritics: bool, model: str, use_gpu: bool
+ ctx: Context,
+ input_dataset: Path | None,
+ output_dataset: Path | None,
+ column: str,
+ num_workers: int | None,
+ verbose: bool,
+ quiet: bool,
+ overwrite: bool,
+ sep_token: str,
+ normalize: bool,
+ keep_diacritics: bool,
+ model: str,
+ use_gpu: bool,
):
"""Apply stemming and morphological analysis."""
+ _setup_context_and_logging(
+ ctx, input_dataset, output_dataset, column, num_workers, verbose, quiet, overwrite
+ )
_require_io_paths(ctx)
- click.echo(f"Loading dataset from {ctx.input_dataset}")
+ logger.info(f"Loading dataset from {ctx.input_dataset}")
dataset = ctx.dataset_manager.load(ctx.input_dataset)
dataset = _handle_dataset_dict(dataset)
- click.echo(f"Stemming {ctx.column} column (workers={ctx.num_workers or 'auto'})")
- click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")
+ logger.info(f"Stemming {ctx.column} column (workers={ctx.num_workers or 'auto'})")
+ logger.info(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")
- from dalla_data_processing.stemming import stem_dataset
+ try:
+ from dalla_data_processing.stemming import stem_dataset
+ except ImportError:
+ logger.error("Missing dependencies for stemming")
+ logger.error("Install with: pip install 'dalla-data-processing[stem]'")
+ sys.exit(1)
stemmed = stem_dataset(
dataset,
@@ -270,13 +391,14 @@ def stem(
use_gpu=use_gpu,
)
- click.echo(f"Saving stemmed dataset to {ctx.output_dataset}")
+ logger.info(f"Saving stemmed dataset to {ctx.output_dataset}")
ctx.dataset_manager.save(stemmed, ctx.output_dataset, overwrite=ctx.overwrite)
- click.echo(click.style("✓ Stemming complete", fg="green"))
+ logger.info("✓ Stemming complete")
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
+@common_options
@click.option(
"--min-score",
type=float,
@@ -300,18 +422,39 @@ def stem(
help="Use GPU for BERT model (only applicable when --model=bert)",
)
@pass_context
-def quality_check(ctx: Context, min_score: float, save_errors: bool, model: str, use_gpu: bool):
+def quality_check(
+ ctx: Context,
+ input_dataset: Path | None,
+ output_dataset: Path | None,
+ column: str,
+ num_workers: int | None,
+ verbose: bool,
+ quiet: bool,
+ overwrite: bool,
+ min_score: float,
+ save_errors: bool,
+ model: str,
+ use_gpu: bool,
+):
"""Check text quality and calculate scores."""
+ _setup_context_and_logging(
+ ctx, input_dataset, output_dataset, column, num_workers, verbose, quiet, overwrite
+ )
_require_io_paths(ctx)
- click.echo(f"Loading dataset from {ctx.input_dataset}")
+ logger.info(f"Loading dataset from {ctx.input_dataset}")
dataset = ctx.dataset_manager.load(ctx.input_dataset)
dataset = _handle_dataset_dict(dataset)
- click.echo(f"Checking quality of {ctx.column} column")
- click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")
+ logger.info(f"Checking quality of {ctx.column} column")
+ logger.info(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")
- from dalla_data_processing.quality import check_quality
+ try:
+ from dalla_data_processing.quality import check_quality
+ except ImportError:
+ logger.error("Missing dependencies for quality checking")
+ logger.error("Install with: pip install 'dalla-data-processing[quality]'")
+ sys.exit(1)
scored = check_quality(
dataset,
@@ -323,40 +466,61 @@ def quality_check(ctx: Context, min_score: float, save_errors: bool, model: str,
use_gpu=use_gpu,
)
- click.echo(f"Saving quality-checked dataset to {ctx.output_dataset}")
+ logger.info(f"Saving quality-checked dataset to {ctx.output_dataset}")
ctx.dataset_manager.save(scored, ctx.output_dataset, overwrite=ctx.overwrite)
+ from dalla_data_processing.core.dataset import DatasetManager
+
original_size = DatasetManager.get_size(dataset)
final_size = DatasetManager.get_size(scored)
- click.echo(click.style("✓ Quality check complete", fg="green"))
+ logger.info("✓ Quality check complete")
if min_score > 0:
removed = original_size - final_size
- click.echo(
+ logger.info(
f" Filtered {removed:,} low-quality examples ({removed / original_size * 100:.1f}%)"
)
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
+@common_options
@click.option(
"--add-ranks/--no-ranks",
default=True,
help="Add ranking and level columns (default: True)",
)
@pass_context
-def readability(ctx: Context, add_ranks: bool):
+def readability(
+ ctx: Context,
+ input_dataset: Path | None,
+ output_dataset: Path | None,
+ column: str,
+ num_workers: int | None,
+ verbose: bool,
+ quiet: bool,
+ overwrite: bool,
+ add_ranks: bool,
+):
"""Calculate readability scores using Flesch and Osman methods."""
+ _setup_context_and_logging(
+ ctx, input_dataset, output_dataset, column, num_workers, verbose, quiet, overwrite
+ )
_require_io_paths(ctx)
- click.echo(f"Loading dataset from {ctx.input_dataset}")
+ logger.info(f"Loading dataset from {ctx.input_dataset}")
dataset = ctx.dataset_manager.load(ctx.input_dataset)
dataset = _handle_dataset_dict(dataset)
- click.echo(f"Calculating readability scores for {ctx.column} column")
+ logger.info(f"Calculating readability scores for {ctx.column} column")
if add_ranks:
- click.echo(" Including ranking and difficulty levels (0-4)")
+ logger.info(" Including ranking and difficulty levels (0-4)")
- from dalla_data_processing.readability import score_readability
+ try:
+ from dalla_data_processing.readability import score_readability
+ except ImportError:
+ logger.error("Missing dependencies for readability scoring")
+ logger.error("Install with: pip install 'dalla-data-processing[readability]'")
+ sys.exit(1)
scored = score_readability(
dataset,
@@ -365,16 +529,168 @@ def readability(ctx: Context, add_ranks: bool):
num_proc=ctx.num_workers,
)
- click.echo(f"Saving scored dataset to {ctx.output_dataset}")
+ logger.info(f"Saving scored dataset to {ctx.output_dataset}")
ctx.dataset_manager.save(scored, ctx.output_dataset, overwrite=ctx.overwrite)
- click.echo(click.style("✓ Readability scoring complete", fg="green"))
+ logger.info("✓ Readability scoring complete")
if add_ranks:
- click.echo(" Added columns: flesch_score, osman_score, flesch_rank, osman_rank,")
- click.echo(" readability_level")
+ logger.info(" Added columns: flesch_score, osman_score, flesch_rank, osman_rank,")
+ logger.info(" readability_level")
+ else:
+ logger.info(" Added columns: flesch_score, osman_score")
+
+
+@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
+@common_options
+@click.option(
+ "--config",
+ type=click.Path(exists=True, path_type=Path),
+ help="Path to config YAML file (optional)",
+)
+@click.option(
+ "--tokenizer-path",
+ type=str,
+ help="Path to tokenizer",
+)
+@click.option(
+ "--max-seq-length",
+ type=int,
+ help="Maximum sequence length for packing",
+)
+@click.option(
+ "--chunk-size-gb",
+ type=float,
+ help="Size of each processing chunk in GB",
+)
+@click.option(
+ "--subset-order",
+ multiple=True,
+ help="Subset processing order (can be specified multiple times)",
+)
+@click.option(
+ "--sft",
+ is_flag=True,
+ help="Enable SFT mode (uses tokenizer's chat template)",
+)
+@click.option(
+ "--rbpe",
+ is_flag=True,
+ help="Use R-BPE tokenizer",
+)
+@click.option(
+ "--text-column",
+ type=str,
+ help="Column name containing text data (defaults: 'text' for non-SFT, 'messages' for SFT)",
+)
+@pass_context
+def pack(
+ ctx: Context,
+ input_dataset: Path | None,
+ output_dataset: Path | None,
+ column: str,
+ num_workers: int | None,
+ verbose: bool,
+ quiet: bool,
+ overwrite: bool,
+ config: Path | None,
+ tokenizer_path: str | None,
+ max_seq_length: int | None,
+ chunk_size_gb: float | None,
+ subset_order: tuple[str, ...],
+ sft: bool,
+ rbpe: bool,
+ text_column: str | None,
+):
+ """Pack datasets for efficient LLM training.
+
+ Combines multiple examples into fixed-length sequences, optimizing for
+ efficient training. Preserves data integrity by ensuring no example is
+ split across multiple sequences.
+ """
+ _setup_context_and_logging(
+ ctx, input_dataset, output_dataset, column, num_workers, verbose, quiet, overwrite
+ )
+ _require_io_paths(ctx)
+
+ try:
+ import yaml
+ except ImportError:
+ logger.error("Missing dependencies for packing")
+ logger.error("Install with: pip install 'dalla-data-processing[pack]'")
+ sys.exit(1)
+
+ config_data = {}
+ if config:
+ with open(config) as f:
+ config_data = yaml.safe_load(f) or {}
+
+ if tokenizer_path:
+ config_data["tokenizer_path"] = tokenizer_path
+ if rbpe:
+ config_data["rbpe"] = True
+ if sft:
+ config_data["sft"] = True
+ if max_seq_length is not None:
+ config_data["max_seq_length"] = max_seq_length
+ if chunk_size_gb is not None:
+ config_data["chunk_size_gb"] = chunk_size_gb
+ if subset_order:
+ config_data["subset_order"] = list(subset_order)
+ if text_column:
+ config_data["text_column"] = text_column
+
+ if "tokenizer_path" not in config_data:
+ logger.error("Error: --tokenizer-path is required")
+ sys.exit(1)
+
+ if config_data.get("rbpe"):
+ try:
+ from rbpe import RBPETokenizer
+
+ tokenizer = RBPETokenizer.from_pretrained(config_data["tokenizer_path"])
+ except ImportError:
+ logger.error("Missing rbpe package")
+ logger.error("Install with: pip install rbpe")
+ sys.exit(1)
else:
- click.echo(" Added columns: flesch_score, osman_score")
+ try:
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained(config_data["tokenizer_path"])
+ except ImportError:
+ logger.error("Missing transformers package")
+ logger.error("Install with: pip install transformers")
+ sys.exit(1)
+
+ if "text_column" not in config_data:
+ config_data["text_column"] = "messages" if config_data.get("sft", False) else "text"
+
+ logger.info("Starting dataset processing")
+ logger.info(f"Input path: {ctx.input_dataset}")
+ logger.info(f"Output directory: {ctx.output_dataset}")
+ logger.info(f"Using RBPE tokenizer: {config_data.get('rbpe', False)}")
+ logger.info(f"Chunk size: {config_data.get('chunk_size_gb', 2)}GB")
+ logger.info(f"Max sequence length: {config_data.get('max_seq_length', 2048)}")
+
+ from dalla_data_processing.packing import DatasetPacker
+
+ processor = DatasetPacker(
+ input_dataset=str(ctx.input_dataset),
+ output_dataset=str(ctx.output_dataset),
+ tokenizer=tokenizer,
+ subset_order=config_data.get("subset_order"),
+ num_workers=ctx.num_workers or 4,
+ chunk_size_gb=config_data.get("chunk_size_gb", 2.0),
+ max_seq_length=config_data.get("max_seq_length", 2048),
+ sft=config_data.get("sft", False),
+ rbpe=config_data.get("rbpe", False),
+ text_column=config_data.get("text_column"),
+ )
+
+ final_path = processor.process()
+ logger.info("Processing completed successfully!")
+ logger.info(f"Final dataset saved to: {final_path}")
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
@@ -388,31 +704,34 @@ def readability(ctx: Context, add_ranks: bool):
)
def info(dataset_path: Path, split: str | None):
"""Display information about a dataset."""
+ from dalla_data_processing.core.dataset import DatasetManager
+
dm = DatasetManager()
try:
dataset = dm.load(dataset_path, split=split)
dm.print_info(dataset)
except Exception as e:
- click.echo(click.style(f"Error loading dataset: {e}", fg="red"), err=True)
+ logger.error(f"Error loading dataset: {e}")
sys.exit(1)
def _handle_dataset_dict(dataset, split_preference: str = "train"):
"""Handle DatasetDict by selecting appropriate split."""
+ from datasets import DatasetDict
if isinstance(dataset, DatasetDict):
splits = list(dataset.keys())
- click.echo(f"Dataset has multiple splits: {', '.join(splits)}")
+ logger.info(f"Dataset has multiple splits: {', '.join(splits)}")
if split_preference in dataset:
- click.echo(
+ logger.info(
f"Using '{split_preference}' split ({len(dataset[split_preference])} examples)"
)
return dataset[split_preference]
else:
first_split = splits[0]
- click.echo(f"Using '{first_split}' split ({len(dataset[first_split])} examples)")
+ logger.info(f"Using '{first_split}' split ({len(dataset[first_split])} examples)")
return dataset[first_split]
else:
return dataset
@@ -421,19 +740,13 @@ def _handle_dataset_dict(dataset, split_preference: str = "train"):
def _require_io_paths(ctx: Context):
"""Ensure input and output paths are provided."""
if ctx.input_dataset is None:
- click.echo(
- click.style("Error: --input-dataset is required", fg="red"),
- err=True,
- )
- click.echo("Use --help for usage information")
+ logger.error("Error: --input-dataset is required")
+ logger.info("Use --help for usage information")
sys.exit(1)
if ctx.output_dataset is None:
- click.echo(
- click.style("Error: --output-dataset is required", fg="red"),
- err=True,
- )
- click.echo("Use --help for usage information")
+ logger.error("Error: --output-dataset is required")
+ logger.info("Use --help for usage information")
sys.exit(1)
@@ -442,10 +755,10 @@ def main():
try:
cli(obj=Context())
except KeyboardInterrupt:
- click.echo("\n" + click.style("Interrupted by user", fg="yellow"))
+ logger.warning("Interrupted by user")
sys.exit(130)
except Exception as e:
- click.echo(click.style(f"Error: {e}", fg="red"), err=True)
+ logger.error(f"Error: {e}")
if "--verbose" in sys.argv or "-v" in sys.argv:
raise
sys.exit(1)
diff --git a/dalla_data_processing/core/dataset.py b/dalla_data_processing/core/dataset.py
index bf19068..84a4fe0 100644
--- a/dalla_data_processing/core/dataset.py
+++ b/dalla_data_processing/core/dataset.py
@@ -1,8 +1,5 @@
"""
Dataset I/O utilities for unified HuggingFace dataset handling.
-
-This module provides a consistent interface for loading, saving, and manipulating
-HuggingFace datasets across all dalla-process components.
"""
from collections.abc import Callable
diff --git a/dalla_data_processing/deduplication/README.md b/dalla_data_processing/deduplication/README.md
index 50fa4cb..18072d3 100644
--- a/dalla_data_processing/deduplication/README.md
+++ b/dalla_data_processing/deduplication/README.md
@@ -2,6 +2,15 @@
Detect and remove duplicate or near-duplicate documents from your datasets using the Onion algorithm.
+## Installation
+
+This feature requires the `[dedup]` extra:
+
+```bash
+pip install "dalla-data-processing[dedup]"
+# or install all features: pip install "dalla-data-processing[all]"
+```
+
## CLI Usage
**Command:** `dalla-dp deduplicate [OPTIONS]`
diff --git a/dalla_data_processing/packing/README.md b/dalla_data_processing/packing/README.md
new file mode 100644
index 0000000..a0fe008
--- /dev/null
+++ b/dalla_data_processing/packing/README.md
@@ -0,0 +1,244 @@
+# Dataset Packing
+
+Pack datasets efficiently for LLM training by combining multiple examples into fixed-length sequences. Pre-pack your datasets locally to avoid wasting expensive GPU time on servers.
+
+## Why Use This Tool?
+
+When training large language models, dataset packing is essential for efficient training. This tool allows you to:
+- **Save GPU time**: Pack datasets on your local machine before uploading to training servers
+- **Preserve data integrity**: Ensures no example is split across multiple packed sequences
+- **Handle large datasets**: Process datasets in chunks to manage memory efficiently
+- **Flexible tokenization**: Support for both standard text data and chat-formatted (SFT) data
+
+## Installation
+
+This feature requires the `[pack]` extra:
+
+```bash
+pip install "dalla-data-processing[pack]"
+# or install all features: pip install "dalla-data-processing[all]"
+```
+
+## CLI Usage
+
+The packing functionality is integrated into the unified `dalla-dp` CLI:
+
+### Basic Usage
+
+```bash
+dalla-dp -i input_dataset -o output_dataset pack --tokenizer-path /path/to/tokenizer
+```
+
+### Common Options (from main CLI)
+
+- `-i, --input-dataset`: Path to input HuggingFace dataset (required)
+- `-o, --output-dataset`: Path to save output dataset (required)
+- `-w, --num-workers`: Number of parallel workers for packing (default: 4)
+- `-v, --verbose`: Enable verbose output
+- `--overwrite`: Overwrite output if it exists
+
+### Pack-Specific Options
+
+- `--config`: Path to config YAML file (optional)
+- `--tokenizer-path`: Path to tokenizer (required unless set in the config file)
+- `--max-seq-length`: Maximum sequence length (default: 2048)
+- `--chunk-size-gb`: Chunk size in GB (default: 2.0)
+- `--text-column`: Text column name (default: 'text' or 'messages' for SFT)
+- `--subset-order`: Subset processing order
+- `--sft`: Enable SFT mode
+- `--rbpe`: Use R-BPE tokenizer
+
+### Examples
+
+**Basic packing:**
+```bash
+dalla-dp -i my_dataset -o packed_dataset pack --tokenizer-path /path/to/tokenizer
+```
+
+**Using a config file:**
+```bash
+dalla-dp -i my_dataset -o packed_dataset pack --config pack_config.yaml
+```
+
+**Override config with command line:**
+```bash
+dalla-dp -i my_dataset -o packed_dataset pack \
+ --config pack_config.yaml \
+ --max-seq-length 4096
+```
+
+**With custom sequence length and workers:**
+```bash
+dalla-dp -i my_dataset -o packed_dataset -w 8 pack \
+ --tokenizer-path /path/to/tokenizer \
+ --max-seq-length 4096
+```
+
+**SFT mode with subset order:**
+```bash
+dalla-dp -i my_dataset -o packed_dataset pack \
+ --tokenizer-path /path/to/tokenizer \
+ --sft \
+ --subset-order train --subset-order validation
+```
+
+**Using a custom text column:**
+```bash
+dalla-dp -i my_dataset -o packed_dataset pack \
+ --tokenizer-path /path/to/tokenizer \
+ --text-column content
+```
+
+**With verbose output:**
+```bash
+dalla-dp -i my_dataset -o packed_dataset -v pack \
+ --tokenizer-path /path/to/tokenizer \
+ --chunk-size-gb 1.0
+```
+
+## Configuration File
+
+```yaml
+tokenizer_path: "/path/to/tokenizer"
+max_seq_length: 2048
+chunk_size_gb: 2.0
+sft: false
+rbpe: false
+text_column: "content"
+
+subset_order:
+ - "train"
+ - "validation"
+```
+
+CLI arguments override config values.
+
+## Python API Usage
+
+You can also use the packing functionality directly in Python:
+
+```python
+from dalla_data_processing.packing import DatasetPacker
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("path/to/tokenizer")
+
+packer = DatasetPacker(
+ input_dataset="my_dataset",
+ output_dataset="packed_dataset",
+ tokenizer=tokenizer,
+ num_workers=4,
+ max_seq_length=2048,
+ chunk_size_gb=2.0,
+ text_column="content",
+)
+
+final_path = packer.process()
+```
+
+### API Parameters
+
+- `input_dataset` (str): Path to input dataset
+- `output_dataset` (str): Path for output dataset
+- `tokenizer`: HuggingFace tokenizer instance
+- `subset_order` (list[str], optional): Order to process subsets
+- `num_workers` (int): Number of parallel packing processes (default: 4)
+- `chunk_size_gb` (float): Size of processing chunks in GB (default: 2.0)
+- `max_seq_length` (int): Maximum sequence length (default: 2048)
+- `sft` (bool): Enable SFT mode (default: False)
+- `rbpe` (bool): Use R-BPE tokenizer (default: False)
+- `text_column` (str, optional): Text column name
+
+## The Packing Method
+
+- Packs multiple examples into a single sequence up to `max_seq_length`
+- Adds EOS token between examples
+- Pads remaining space with PAD tokens
+- **Guarantees no example is cut in the middle** - if an example doesn't fit, it starts in the next sequence
+- Best for preserving data integrity
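+
+A minimal sketch of this rule (illustrative only; the actual implementation additionally handles label masking and the R-BPE/SFT token settings):
+
+```python
+def pack(token_lists, max_seq_length, eos_id, pad_id):
+    """Greedy packing: EOS between examples, PAD to fill, never split an example."""
+    packed, current = [], []
+    for ids in token_lists:
+        if len(ids) + 1 > max_seq_length:  # too long even on its own: skipped
+            continue
+        if len(current) + len(ids) + 1 > max_seq_length:  # would overflow: flush
+            packed.append(current + [pad_id] * (max_seq_length - len(current)))
+            current = []
+        current = current + ids + [eos_id]  # EOS separates examples
+    if current:
+        packed.append(current + [pad_id] * (max_seq_length - len(current)))
+    return packed
+```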
+
+## SFT Mode
+
+Enable SFT mode when working with chat-formatted data:
+
+```bash
+dalla-dp -i my_dataset -o packed_dataset pack \
+ --tokenizer-path /path/to/tokenizer \
+ --sft
+```
+
+**When to use SFT mode:**
+- Your dataset has a `messages` field with chat conversations
+- Your tokenizer has a chat template defined
+- You're doing supervised fine-tuning (SFT) on conversational data
+
+**When NOT to use SFT mode:**
+- Continued pre-training (CPT) on plain text
+- Your tokenizer doesn't have a chat template
+- Your dataset only has a `text` field
+
+**Input format for SFT mode:**
+```python
+{
+ "messages": [
+ {"role": "user", "content": "Hello"},
+ {"role": "assistant", "content": "Hi there!"}
+ ]
+}
+```
+
+## Custom Text Column
+
+Defaults: `text` (non-SFT) or `messages` (SFT). Override with `--text-column`.
+
+## Dataset Format
+
+### Input
+Your dataset should be in Hugging Face datasets format:
+```
+my_dataset/
+ train/
+ data-00000-of-00001.arrow
+ dataset_info.json
+ state.json
+ validation/
+ data-00000-of-00001.arrow
+ dataset_info.json
+ state.json
+ dataset_dict.json
+```
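+
+If you need to produce this layout, saving a `DatasetDict` with `save_to_disk` creates it; a toy example (illustrative data and path):
+
+```python
+from datasets import Dataset, DatasetDict
+
+DatasetDict({
+    "train": Dataset.from_dict({"text": ["نص تجريبي أول", "نص تجريبي ثان"]}),
+    "validation": Dataset.from_dict({"text": ["نص للتحقق"]}),
+}).save_to_disk("my_dataset")
+```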
+
+### Output
+The packed dataset will be saved to:
+```
+packed_dataset/
+ final_dataset/
+ train/
+ data-00000-of-00001.arrow
+ dataset_info.json
+ state.json
+```
+
+Each example in the final dataset will have:
+- `input_ids`: Token IDs (length = `max_seq_length`)
+- `labels`: Same as `input_ids` (or masked for SFT)
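+
+A quick way to sanity-check the result (paths taken from the examples above):
+
+```python
+from datasets import load_from_disk
+
+packed = load_from_disk("packed_dataset/final_dataset")["train"]
+example = packed[0]
+print(len(example["input_ids"]), len(example["labels"]))  # both equal max_seq_length
+```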
+
+## Memory Considerations
+
+The `--chunk-size-gb` parameter controls memory usage:
+- **Smaller values** (0.5-1 GB): Lower memory, more chunks
+- **Larger values** (2-4 GB): Higher memory, fewer chunks
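+
+Each subset is split into roughly `ceil(subset_size_gb / chunk_size_gb)` pieces, so a 10 GB subset with `--chunk-size-gb 2.0` is processed as 5 chunks.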
+
+## How It Works
+
+1. **Analyze**: Calculate sizes of dataset subsets
+2. **Split**: Divide datasets into manageable chunks
+3. **Tokenize**: Convert text to token IDs using parallel processing
+4. **Pack**: Combine multiple examples into fixed-length sequences
+5. **Concatenate**: Merge all packed chunks into final dataset
+
+The tool automatically:
+- Preserves subset ordering
+- Removes intermediate files to save disk space
+- Handles empty subsets gracefully
+- Skips examples longer than `max_seq_length`
diff --git a/dalla_data_processing/packing/__init__.py b/dalla_data_processing/packing/__init__.py
new file mode 100644
index 0000000..678838c
--- /dev/null
+++ b/dalla_data_processing/packing/__init__.py
@@ -0,0 +1,10 @@
+"""
+Dataset packing module for efficient LLM training.
+
+This module provides functionality to pack datasets by combining multiple
+examples into fixed-length sequences, optimizing for efficient training.
+"""
+
+from dalla_data_processing.packing.dataset_packer import DatasetPacker
+
+__all__ = ["DatasetPacker"]
diff --git a/dalla_data_processing/packing/dataset_packer.py b/dalla_data_processing/packing/dataset_packer.py
new file mode 100644
index 0000000..92015a9
--- /dev/null
+++ b/dalla_data_processing/packing/dataset_packer.py
@@ -0,0 +1,449 @@
+import contextlib
+import math
+import os
+import shutil
+from multiprocessing import Pool, cpu_count
+
+from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
+from tqdm import tqdm
+
+from dalla_data_processing.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+def get_directory_size(path):
+ total_size = 0
+ for dirpath, _dirnames, filenames in os.walk(path):
+ for filename in filenames:
+ filepath = os.path.join(dirpath, filename)
+ # Skip if it's a symbolic link
+ if not os.path.islink(filepath):
+ with contextlib.suppress(OSError):
+ total_size += os.path.getsize(filepath)
+ return total_size
+
+
+def remove_path(path):
+ """Safely remove a file, symlink, or directory tree."""
+ try:
+ if os.path.islink(path) or os.path.isfile(path):
+ os.remove(path)
+ elif os.path.isdir(path):
+ shutil.rmtree(path)
+ except Exception as e:
+ logger.warning("Failed to remove path", path=path, error=str(e))
+
+
+class DatasetPacker:
+ def __init__(
+ self,
+ input_dataset,
+ output_dataset,
+ tokenizer,
+ subset_order=None,
+ num_workers=4,
+ chunk_size_gb=2,
+ max_seq_length=2048,
+ sft=False,
+ rbpe=False,
+ text_column=None,
+ ):
+ self.input_dataset = input_dataset
+ self.output_dataset = output_dataset
+ self.tokenizer = tokenizer
+ self.num_workers = num_workers
+ self.chunk_size_bytes = int(chunk_size_gb * 1024**3)
+ self.max_seq_length = max_seq_length
+ self.rbpe = rbpe
+ self.subset_order = subset_order
+ self.sft = sft
+ # Set text_column: use provided value, or default based on sft mode
+ if text_column:
+ self.text_column = text_column
+ else:
+ self.text_column = "messages" if sft else "text"
+ if self.rbpe:
+ self.parallel = False
+ if self.sft:
+ self.add_special_tokens = False
+ self.append_concat_token = True
+ else:
+ self.add_special_tokens = True
+ self.append_concat_token = False
+ else:
+ self.parallel = True
+ self.add_special_tokens = True
+ self.append_concat_token = True
+ os.makedirs(output_dataset, exist_ok=True)
+
+ def get_directory_sizes(self, base_path):
+ # Handle both directory and direct dataset paths
+ if os.path.isfile(os.path.join(base_path, "dataset_info.json")):
+ # Direct dataset path
+ size_bytes = 0
+ for root, _dirs, files in os.walk(base_path):
+ for file in files:
+ file_path = os.path.join(root, file)
+ size_bytes += os.path.getsize(file_path)
+ return {"dataset": size_bytes}
+ else:
+ sizes = {}
+ dirs_to_check = self.subset_order if self.subset_order else os.listdir(base_path)
+ for dir_name in dirs_to_check:
+ path = os.path.join(base_path, dir_name)
+ if not os.path.exists(path):
+ logger.warning("Directory not found", directory=dir_name, base_path=base_path)
+ continue
+ size_bytes = 0
+ for root, _dirs, files in os.walk(path):
+ for file in files:
+ file_path = os.path.join(root, file)
+ size_bytes += os.path.getsize(file_path)
+ sizes[dir_name] = size_bytes
+ return sizes
+
+ def split_dataset(self, base_path, sizes):
+ counter = 0
+ splits = []
+
+ # Handle single dataset case
+ if "dataset" in sizes:
+ dataset = load_from_disk(base_path)
+ size = sizes["dataset"]
+ num_splits = math.ceil(size / self.chunk_size_bytes)
+ total_size = len(dataset)
+ subset_size = total_size // num_splits
+ remainder = total_size % num_splits
+ start_idx = 0
+
+ for i in range(num_splits):
+ current_size = subset_size + (1 if i < remainder else 0)
+ end_idx = start_idx + current_size
+
+ subset_data = dataset.select(range(start_idx, end_idx))
+ name = f"{self.output_dataset}/split_{counter}"
+
+ counter += 1
+ subset_data.save_to_disk(name)
+ size_bytes = get_directory_size(name)
+ splits.append((name, size_bytes))
+ del subset_data
+ start_idx = end_idx
+ return splits
+
+ # Multiple datasets case
+ logger.info("Splitting datasets", subset_order=self.subset_order)
+ for subset in self.subset_order or sizes.keys():
+ size = sizes[subset]
+ dataset = load_from_disk(f"{base_path}/{subset}")
+ num_splits = math.ceil(size / self.chunk_size_bytes)
+ total_size = len(dataset)
+ subset_size = total_size // num_splits
+ remainder = total_size % num_splits
+ start_idx = 0
+
+ for i in range(num_splits):
+ # Add one extra item to early splits if there's a remainder
+ current_size = subset_size + (1 if i < remainder else 0)
+ end_idx = start_idx + current_size
+
+ subset_data = dataset.select(range(start_idx, end_idx))
+ name = f"{self.output_dataset}/split_{counter}"
+
+ counter += 1
+ subset_data.save_to_disk(name)
+ size_bytes = get_directory_size(name)
+ splits.append((name, size_bytes))
+ del subset_data
+ start_idx = end_idx
+ return splits
+
+ def create_chunks(self, sizes):
+ chunks = []
+ current_chunk = []
+ current_size = 0
+
+ # Use subset_order instead of sorting
+ for dir_name, size in sizes:
+ if current_size + size > self.chunk_size_bytes and current_chunk:
+ chunks.append(current_chunk)
+ current_chunk = []
+ current_size = 0
+ current_chunk.append(dir_name)
+ current_size += size
+
+ if current_chunk:
+ chunks.append(current_chunk)
+ return chunks
+
+ def tokenize_batch(self, texts):
+ if self.sft:
+ return self.tokenizer.apply_chat_template(
+ texts,
+ truncation=False,
+ padding=False,
+ return_assistant_tokens_mask=True,
+ return_dict=True,
+ add_special_tokens=self.add_special_tokens,
+ )
+ else:
+ return self.tokenizer(texts, truncation=False, padding=False)
+
+ def process_chunk(self, chunk_ranges, chunk_idx):
+ # Load and concatenate datasets in chunk
+ dataset_splits = []
+ for range_name in chunk_ranges:
+ try:
+ split = load_from_disk(range_name)
+ dataset_splits.append(split)
+ logger.info("Loaded split", split_name=range_name)
+ except Exception as e:
+ logger.error("Error loading split", split_name=range_name, error=str(e))
+
+ if not dataset_splits:
+ return None
+ # Concatenate splits
+ concatenated = concatenate_datasets(dataset_splits)
+ del split
+ del dataset_splits
+ # delete splits
+ for split in chunk_ranges:
+ remove_path(split)
+ # Tokenize
+ logger.info("Tokenizing chunk", chunk_idx=chunk_idx)
+ texts = concatenated[self.text_column]
+
+ num_cores = max(1, cpu_count() - 1)  # Leave some cores free, but use at least one
+ text_chunk_size = max(1, len(texts) // num_cores)
+ text_chunks = [
+ texts[i : i + text_chunk_size] for i in range(0, len(texts), text_chunk_size)
+ ]
+
+ if self.parallel:
+ logger.info("Tokenizing in parallel", num_cores=num_cores)
+ pool = Pool(num_cores)
+ try:
+ tokenized_chunks = list(
+ tqdm(
+ pool.imap(self.tokenize_batch, text_chunks),
+ total=len(text_chunks),
+ desc="Tokenizing",
+ )
+ )
+ finally:
+ pool.close()
+ pool.join()
+ else:
+ tokenized_chunks = [
+ self.tokenize_batch(text_chunk)
+ for text_chunk in tqdm(text_chunks, desc="Tokenizing")
+ ]
+
+ input_ids = [i["input_ids"] for i in tokenized_chunks]
+ assistant_masks = (
+ [i["assistant_masks"] for i in tokenized_chunks] if self.sft else input_ids
+ )
+ all_input_ids = [item for sublist in input_ids for item in sublist]
+ all_assistant_masks = (
+ [item for sublist in assistant_masks for item in sublist] if self.sft else all_input_ids
+ )
+ zeros = False
+ for assistant_mask in all_assistant_masks:
+ if all(i == 0 for i in assistant_mask):
+ zeros = True
+ break
+ if zeros:
+ tokenized_dataset = Dataset.from_dict(
+ {"input_ids": all_input_ids, "labels": all_input_ids}
+ )
+ else:
+ logger.debug("Assistant masks not all zeros")
+ new_labels = []
+ for assistant_mask, input_id in zip(all_assistant_masks, all_input_ids, strict=True):
+ # If the assistant mask is 0 at a position, mask the label with -100; otherwise keep the input token id
+ new_labels.append(
+ [-100 if i == 0 else j for i, j in zip(assistant_mask, input_id, strict=True)]
+ )
+
+ tokenized_dataset = Dataset.from_dict(
+ {"input_ids": all_input_ids, "labels": new_labels}
+ )
+
+ # Save tokenized dataset
+ chunk_name = f"chunk_{chunk_idx}_tokenized"
+ tokenized_path = os.path.join(self.output_dataset, chunk_name)
+ tokenized_dataset.save_to_disk(tokenized_path)
+
+ return tokenized_path
+
+ def pack_sequences(self, input_path, output_path):
+ packed_sequences = []
+ batch_input_ids = []
+ batch_labels = []
+ batch_len = 0
+ skipped_examples = []
+ eos_token_id = self.tokenizer.eos_token_id
+ pad_token_id = self.tokenizer.pad_token_id
+ tokenized = load_from_disk(input_path)
+ logger.debug("Processing tokenized dataset", input_path=input_path)
+ for n, example in tqdm(enumerate(tokenized), desc="Packing sequences"):
+ masking = True
+ if example["input_ids"] == example["labels"]:
+ masking = False
+
+ example_len = len(example["input_ids"])
+
+ # Account for separator token if appending concat token
+ sep_len = 1 if self.append_concat_token else 0
+ if example_len + sep_len > self.max_seq_length:
+ skipped_examples.append(n)
+ continue
+
+ if batch_len + example_len + sep_len > self.max_seq_length:
+ # Pad and add current batch
+ batch_input_ids.extend([pad_token_id] * (self.max_seq_length - batch_len))
+ if masking:
+ batch_labels.extend([-100] * (self.max_seq_length - batch_len))
+ else:
+ batch_labels.extend([pad_token_id] * (self.max_seq_length - batch_len))
+ packed_sequences.append({"input_ids": batch_input_ids, "labels": batch_labels})
+ batch_input_ids = []
+ batch_labels = []
+ batch_len = 0
+
+ batch_input_ids.extend(example["input_ids"])
+ batch_labels.extend(example["labels"])
+ if self.append_concat_token:
+ batch_input_ids.append(eos_token_id) # Add separator token
+ if masking:
+ batch_labels.append(-100)
+ else:
+ batch_labels.append(eos_token_id)
+ batch_len += example_len + 1
+ else:
+ batch_len += example_len
+
+ # Handle last batch if not empty
+ if batch_input_ids:
+ batch_input_ids.extend([pad_token_id] * (self.max_seq_length - batch_len))
+ if masking:
+ batch_labels.extend([-100] * (self.max_seq_length - batch_len))
+ else:
+ batch_labels.extend([pad_token_id] * (self.max_seq_length - batch_len))
+ packed_sequences.append({"input_ids": batch_input_ids, "labels": batch_labels})
+ logger.info(
+ "Skipped examples that exceeded max sequence length", num_skipped=len(skipped_examples)
+ )
+ del tokenized
+ packed_dataset = Dataset.from_list(packed_sequences)
+ packed_dataset.save_to_disk(output_path)
+ remove_path(input_path)
+ return output_path
+
+ def _pack_dataset_wrapper(self, paths):
+ input_path, output_path = paths
+ try:
+ self.pack_sequences(input_path, output_path)
+ return output_path
+ except Exception as e:
+ logger.error("Error packing dataset", input_path=input_path, error=str(e))
+ return None
+
+ def pack_datasets_sequentially(self, tokenized_paths):
+ """Pack datasets one at a time without parallel processing"""
+ packed_paths = []
+ for input_path in tokenized_paths:
+ output_path = input_path.replace("_tokenized", "_packed")
+ logger.info("Packing dataset", input_path=input_path)
+ try:
+ self.pack_sequences(input_path, output_path)
+ packed_paths.append(output_path)
+ except Exception as e:
+ logger.error("Error packing dataset", input_path=input_path, error=str(e))
+ return packed_paths
+
+ def run_parallel_packing(self, tokenized_paths):
+ # Prepare input/output path pairs
+ pack_args = [
+ (input_path, input_path.replace("_tokenized", "_packed"))
+ for input_path in tokenized_paths
+ ]
+
+ pool = Pool(processes=self.num_workers)
+ try:
+ completed_paths = list(
+ tqdm(
+ pool.imap(self._pack_dataset_wrapper, pack_args),
+ total=len(pack_args),
+ desc="Packing datasets",
+ )
+ )
+ finally:
+ pool.close()
+ pool.join()
+
+ # Filter out None values (failed packing attempts)
+ logger.debug("Completed packing paths", completed_paths=completed_paths)
+ return [path for path in completed_paths if path is not None]
+
+ def concatenate_final_dataset(self, packed_paths):
+ # Create a mapping of chunk index to dataset
+ chunk_datasets = {}
+ for path in packed_paths:
+ # Extract chunk index from path
+ chunk_idx = int(os.path.basename(path).split("_")[1])
+ chunk_datasets[chunk_idx] = load_from_disk(path)
+
+ # Load datasets in the original chunk order
+ datasets = [chunk_datasets[i] for i in range(len(chunk_datasets))]
+ final_dataset = concatenate_datasets(datasets)
+ final_path = os.path.join(self.output_dataset, "final_dataset")
+ dataset_dict = DatasetDict({"train": final_dataset})
+ dataset_dict.save_to_disk(final_path)
+ return final_path
+
+ def process(self):
+ # Step 1: Analyze and create chunks
+ logger.info("Analyzing directory sizes")
+ log_data = {}
+ sizes = self.get_directory_sizes(self.input_dataset)
+ log_data["sizes"] = sizes
+
+ splits = self.split_dataset(self.input_dataset, sizes)
+ log_data["splits"] = splits
+
+ chunks = self.create_chunks(splits)
+ log_data["chunks"] = chunks
+
+ # Step 2: Process each chunk (concatenate and tokenize)
+ tokenized_paths = []
+ for i, chunk in enumerate(chunks):
+ logger.info("Processing chunk", chunk_num=i + 1, total_chunks=len(chunks))
+ tokenized_path = self.process_chunk(chunk, i)
+ if tokenized_path:
+ tokenized_paths.append(tokenized_path)
+
+ log_data["tokenized"] = tokenized_paths
+
+ # Step 3: Pack datasets (parallel or sequential)
+ if self.parallel:
+ logger.info("Packing datasets in parallel")
+ packed_paths = self.run_parallel_packing(tokenized_paths)
+ else:
+ logger.info("Packing datasets sequentially")
+ packed_paths = self.pack_datasets_sequentially(tokenized_paths)
+
+ log_data["packed"] = packed_paths
+
+ # Step 4: Concatenate final dataset
+ logger.info("Concatenating final dataset")
+ final_path = self.concatenate_final_dataset(packed_paths)
+ for path in packed_paths:
+ remove_path(path)
+
+ log_data["final"] = final_path
+ self.log_data = log_data
+ logger.info("Processing complete! Final dataset saved", final_path=final_path)
+ return final_path
diff --git a/dalla_data_processing/packing/pack_config.example.yaml b/dalla_data_processing/packing/pack_config.example.yaml
new file mode 100644
index 0000000..ed73582
--- /dev/null
+++ b/dalla_data_processing/packing/pack_config.example.yaml
@@ -0,0 +1,11 @@
+tokenizer_path: "/path/to/tokenizer"
+max_seq_length: 2048
+chunk_size_gb: 2.0
+rbpe: false
+sft: false
+
+# text_column: "content"
+
+subset_order:
+ - "train"
+ - "validation"
diff --git a/dalla_data_processing/quality/README.md b/dalla_data_processing/quality/README.md
index 7d1fefa..5d6f33c 100644
--- a/dalla_data_processing/quality/README.md
+++ b/dalla_data_processing/quality/README.md
@@ -2,6 +2,15 @@
Check text quality using morphological analysis to detect errors and foreign words.
+## Installation
+
+This feature requires the `[quality]` extra:
+
+```bash
+pip install "dalla-data-processing[quality]"
+# or install all features: pip install "dalla-data-processing[all]"
+```
+
## CLI Usage
**Command:** `dalla-dp quality-check [OPTIONS]`
diff --git a/dalla_data_processing/readability/README.md b/dalla_data_processing/readability/README.md
index 1edb3d6..39b987a 100644
--- a/dalla_data_processing/readability/README.md
+++ b/dalla_data_processing/readability/README.md
@@ -2,6 +2,15 @@
Calculate readability scores using Flesch Reading Ease and Osman methods.
+## Installation
+
+This feature requires the `[readability]` extra:
+
+```bash
+pip install "dalla-data-processing[readability]"
+# or install all features: pip install "dalla-data-processing[all]"
+```
+
## CLI Usage
**Command:** `dalla-dp readability [OPTIONS]`
diff --git a/dalla_data_processing/stemming/README.md b/dalla_data_processing/stemming/README.md
index 3c8d206..5ad4f5a 100644
--- a/dalla_data_processing/stemming/README.md
+++ b/dalla_data_processing/stemming/README.md
@@ -2,6 +2,15 @@
Apply morphological analysis and stemming using CAMeL Tools.
+## Installation
+
+This feature requires the `[stem]` extra:
+
+```bash
+pip install "dalla-data-processing[stem]"
+# or install all features: pip install "dalla-data-processing[all]"
+```
+
## CLI Usage
**Command:** `dalla-dp stem [OPTIONS]`
diff --git a/dalla_data_processing/stemming/__init__.py b/dalla_data_processing/stemming/__init__.py
index febd264..cc12608 100644
--- a/dalla_data_processing/stemming/__init__.py
+++ b/dalla_data_processing/stemming/__init__.py
@@ -1,668 +1,9 @@
"""Stemming and morphological analysis module.
-This module is completely self-contained and does not depend on external scripts.
-All necessary functions are included here.
+This module provides Arabic stemming and morphological tokenization
+functionality using CAMeL Tools disambiguators.
"""
-import os
-import re
-from collections import deque
-from types import MethodType
-
-from camel_tools.data.catalogue import Catalogue
-from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
-from camel_tools.disambig.mle import MLEDisambiguator
-from camel_tools.utils.dediac import dediac_ar
-from datasets import Dataset
-
-from dalla_data_processing.utils.logger import get_logger
-from dalla_data_processing.utils.tokenize import simple_word_tokenize
-
-logger = get_logger(__name__)
-
-
-def normalize_arabic(text: str) -> str:
- """Normalize Arabic text."""
- _DIAC_RE = re.compile(
- r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED\u08D3-\u08FF]"
- )
- _TATWEEL_RE = re.compile(r"\u0640")
- _ALIF_RE = re.compile(r"[آأإٱ]")
- _ALIF_MAK_RE = re.compile(r"ى")
- _TEH_MARB_RE = re.compile(r"ة")
-
- text = _DIAC_RE.sub("", text)
- text = _TATWEEL_RE.sub("", text)
- text = _ALIF_RE.sub("ا", text)
- text = _ALIF_MAK_RE.sub("ي", text)
- text = _TEH_MARB_RE.sub("ه", text)
- return text
-
-
-def has_diacritics(word):
- """Check if word has diacritics."""
- diacritic_marks = {
- "\u064b",
- "\u064c",
- "\u064d",
- "\u064e",
- "\u064f",
- "\u0650",
- "\u0651",
- "\u0652",
- "\u0670",
- }
- return any(char in diacritic_marks for char in word)
-
-
-def apply_diacritics_to_segments_keep_markers(segments, diacritized_word, sep_token="<+>"):
- """Apply diacritics from original word to segmented tokens."""
- result = []
- diacritic_marks = {
- "\u064b",
- "\u064c",
- "\u064d",
- "\u064e",
- "\u064f",
- "\u0650",
- "\u0651",
- "\u0652",
- "\u0670",
- }
- sep_len = len(sep_token)
-
- leading_diacritics = []
- i = 0
- while i < len(diacritized_word) and diacritized_word[i] in diacritic_marks:
- leading_diacritics.append(diacritized_word[i])
- i += 1
-
- diacritic_index = len(leading_diacritics)
-
- for segment_idx, segment in enumerate(segments):
- if segment == sep_token:
- result.append(segment)
- else:
- diacritized_segment = []
-
- if segment_idx == 0 and leading_diacritics:
- diacritized_segment.extend(leading_diacritics)
-
- i = 0
- while i < len(segment):
- char = segment[i]
- if segment[i : i + sep_len] == sep_token:
- diacritized_segment.append(sep_token)
- i += sep_len
- continue
-
- if diacritic_index < len(diacritized_word):
- while (
- diacritic_index < len(diacritized_word)
- and diacritized_word[diacritic_index] in diacritic_marks
- ):
- diacritic_index += 1
-
- if (
- diacritic_index < len(diacritized_word)
- and diacritized_word[diacritic_index] == char
- ):
- diacritized_segment.append(char)
- diacritic_index += 1
-
- while (
- diacritic_index < len(diacritized_word)
- and diacritized_word[diacritic_index] in diacritic_marks
- ):
- diacritized_segment.append(diacritized_word[diacritic_index])
- diacritic_index += 1
- else:
- diacritized_segment.append(char)
- else:
- diacritized_segment.append(char)
-
- i += 1
-
- result.append("".join(diacritized_segment))
-
- return result
-
-
-def read_and_dediacritize(file_name):
- """Read words from file and dediacritize them."""
- words = []
- with open(file_name, encoding="utf-8") as file:
- for line in file:
- word = line.strip()
- dediacritized_word = dediac_ar(word)
- words.append(dediacritized_word)
- return words
-
-
-def par_is_utf8_encoded(paragraph):
- """Check if paragraph is UTF-8 encoded."""
- try:
- paragraph.encode("utf-8")
- return True
- except UnicodeEncodeError:
- return False
-
-
-def tokenize(text):
- """Tokenize text into words."""
- if par_is_utf8_encoded(text):
- text_list = simple_word_tokenize(text)
- return text_list
- else:
- return None
-
-
-def merge_alef_and_alef_lam(input_list, sep_token="<+>"):
- """Merge specific Arabic morpheme patterns."""
- pattern = [f"\u0644{sep_token}".encode(), f"\u0627\u0644{sep_token}".encode()]
- replacement = f"\u0644\u0644{sep_token}"
-
- modified_list = []
- i = 0
-
- while i < len(input_list):
- if i < len(input_list) - 1:
- current_element = input_list[i].encode("utf-8")
- next_element = input_list[i + 1].encode("utf-8")
-
- if current_element == pattern[0] and next_element == pattern[1]:
- modified_list.append(replacement)
- i += 2
- continue
-
- modified_list.append(input_list[i])
- i += 1
-
- return modified_list
-
-
-def process_NOAN_word(list_al_t, list_al, list_t, word, sep_token="<+>"):
- """Process words marked as NOAN (no analysis)."""
- alef_lam = b"\xd8\xa7\xd9\x84"
- taa_marbouta_detached = b"\xef\xba\x93"
- taa_marbouta_attached = b"\xd8\xa9"
- word_bytes = word.encode("utf-8")
-
- if (
- word_bytes.startswith(alef_lam)
- and (
- word_bytes.endswith(taa_marbouta_detached) or word_bytes.endswith(taa_marbouta_attached)
- )
- and word in list_al_t
- ):
- stripped_word = word[2:-1]
- first_part = word[0:2] + sep_token
- last_part = sep_token + word[-1]
- return [first_part, stripped_word, last_part]
-
- if word_bytes.startswith(alef_lam) and word in list_al:
- stripped_word = word[2:]
- first_part = word[0:2] + sep_token
- return [first_part, stripped_word]
-
- if word_bytes.endswith(taa_marbouta_detached) or word_bytes.endswith(taa_marbouta_attached):
- if word in list_t:
- stripped_word = word[:-1]
- last_part = sep_token + word[-1]
- return [stripped_word, last_part]
-
- return [word]
-
-
-def merge_tokens(tokens, original_word, sep_token="<+>"):
- """Merge tokenized segments back into a word."""
- parts = []
- sep_len = len(sep_token)
- for tok in tokens:
- if tok == sep_token:
- parts.append("_")
- elif tok.endswith(sep_token):
- tok = tok[:-sep_len]
- parts.append(tok)
- elif tok.startswith(sep_token):
- tok = tok[sep_len:]
- parts.append(tok)
- elif tok.endswith("+"):
- tok = tok[:-1]
- parts.append(tok)
- elif tok.startswith("+"):
- tok = tok[1:]
- parts.append(tok)
- else:
- parts.append(tok)
-
- merged_word = "".join(parts)
- return merged_word
-
-
-def split_token_on_t(list_toks, sep_token="<+>"):
- """Split tokens on taa marbouta character."""
- new_list = []
- taa_marbouta_detached = b"\xef\xba\x93"
- taa_marbouta_attached = b"\xd8\xa9"
- haa_attached = b"\xd9\x87"
-
- for token in list_toks:
- token_bytes = token.encode("utf-8")
- if (
- token_bytes.endswith(taa_marbouta_detached)
- or token_bytes.endswith(taa_marbouta_attached)
- or token_bytes.endswith(haa_attached)
- ):
- if token_bytes == b"\xd9\x87":
- token = sep_token + taa_marbouta_attached.decode("utf-8")
- new_list.append(token)
- else:
- part1 = token[:-1]
- part2 = sep_token + token[-1]
- new_list.append(part1)
- new_list.append(part2)
- else:
- new_list.append(token)
-
- return new_list
-
-
-def replace_separator(toks, sep_token="<+>"):
- """Replace + with sep_token in tokens."""
- result = list(toks)
-
- for i, tok in enumerate(result):
- if tok.startswith("+"):
- result[i] = sep_token + tok[1:]
- if tok.endswith("+"):
- result[i] = tok[:-1] + sep_token
- return result
-
-
-def morph_tokenize(
- words, disambiguator, list_al_t, list_al, list_t, scheme="d3tok", split=True, sep_token="<+>"
-):
- """Generate morphological tokens for a list of words."""
- disambig_words = disambiguator.disambiguate(words)
- result = deque()
- err_disambig = []
- err_camel = []
- has_diacritics_in_par = False
-
- for original, disambig_word in zip(words, disambig_words, strict=False):
- scored_analyses = disambig_word.analyses
- original_word = original
- dediac_word = dediac_ar(original_word)
-
- if has_diacritics(original_word):
- has_diacritics_in_par = True
-
- if not scored_analyses:
- result.append(original_word)
- continue
-
- analysis = scored_analyses[0].analysis
- tok = dediac_ar(analysis.get(scheme, None))
- tok_bw = dediac_ar(analysis.get("bwtok", None))
- seg_d3 = dediac_ar(analysis.get("d3seg", None))
-
- taa_marbouta_detached = b"\xef\xba\x93"
- taa_marbouta_attached = b"\xd8\xa9"
- original_word_bytes = dediac_word.encode("utf-8")
-
- if original_word_bytes.endswith(taa_marbouta_attached) or original_word_bytes.endswith(
- taa_marbouta_detached
- ):
- if "+ة_+" in tok_bw or "+ه" in tok_bw or "+ة" in tok_bw:
- toks = tok.split("_")
- toks = split_token_on_t(toks, sep_token)
- toks = replace_separator(toks, sep_token)
- toks = merge_alef_and_alef_lam(toks, sep_token)
- merged_toks = dediac_ar(merge_tokens(toks, dediac_word, sep_token))
-
- d3_seg_tok = seg_d3.split("_")
- d3_seg_tok = split_token_on_t(d3_seg_tok, sep_token)
- d3_seg_tok = replace_separator(d3_seg_tok, sep_token)
- d3_seg_tok = merge_alef_and_alef_lam(d3_seg_tok, sep_token)
- merged_toks_seg = dediac_ar(merge_tokens(d3_seg_tok, dediac_word, sep_token))
-
- bw_toks = tok_bw.split("_")
- bw_toks = split_token_on_t(bw_toks, sep_token)
- bw_toks = replace_separator(bw_toks, sep_token)
- bw_toks = merge_alef_and_alef_lam(bw_toks, sep_token)
- merged_toks_bw = dediac_ar(merge_tokens(bw_toks, dediac_word, sep_token))
-
- if merged_toks == dediac_word and len(toks) > 1:
- if has_diacritics(original):
- toks = apply_diacritics_to_segments_keep_markers(toks, original, sep_token)
- result.extend(toks)
- continue
-
- elif merged_toks_seg == dediac_word and len(d3_seg_tok) > 1:
- if has_diacritics(original):
- d3_seg_tok = apply_diacritics_to_segments_keep_markers(
- d3_seg_tok, original, sep_token
- )
- result.extend(d3_seg_tok)
- continue
-
- elif merged_toks_bw == dediac_word and len(bw_toks) > 1:
- if has_diacritics(original):
- bw_toks = apply_diacritics_to_segments_keep_markers(
- bw_toks, original, sep_token
- )
- result.extend(bw_toks)
- continue
-
- else:
- result.append(original_word)
- err_disambig.append(dediac_word)
- err_camel.append(merged_toks)
- continue
-
- if tok is None or "NOAN" in tok:
- tok = process_NOAN_word(list_al_t, list_al, list_t, dediac_word, sep_token)
- if has_diacritics(original):
- toks = apply_diacritics_to_segments_keep_markers(tok, original, sep_token)
- else:
- toks = tok
- result.extend(toks)
-
- elif split:
- tok = dediac_ar(tok)
- toks = tok.split("_")
- toks = replace_separator(toks, sep_token)
- toks = merge_alef_and_alef_lam(toks, sep_token)
- merged_toks = dediac_ar(merge_tokens(toks, dediac_word, sep_token))
-
- bw_toks = tok_bw.split("_")
- bw_toks = replace_separator(bw_toks, sep_token)
- bw_toks = merge_alef_and_alef_lam(bw_toks, sep_token)
- merged_toks_bw = dediac_ar(merge_tokens(bw_toks, dediac_word, sep_token))
-
- d3_seg_tok = seg_d3.split("_")
- d3_seg_tok = replace_separator(d3_seg_tok, sep_token)
- d3_seg_tok = merge_alef_and_alef_lam(d3_seg_tok, sep_token)
- merged_toks_seg = dediac_ar(merge_tokens(d3_seg_tok, dediac_word, sep_token))
-
- if merged_toks == dediac_word and len(toks) > 1:
- if has_diacritics(original):
- toks = apply_diacritics_to_segments_keep_markers(toks, original, sep_token)
- result.extend(toks)
- elif merged_toks_seg == dediac_word and len(d3_seg_tok) > 1:
- if has_diacritics(original):
- d3_seg_tok = apply_diacritics_to_segments_keep_markers(
- d3_seg_tok, original, sep_token
- )
- result.extend(d3_seg_tok)
- elif merged_toks_bw == dediac_word and len(bw_toks) > 1:
- if has_diacritics(original):
- bw_toks = apply_diacritics_to_segments_keep_markers(
- bw_toks, original, sep_token
- )
- result.extend(bw_toks)
- else:
- result.append(original_word)
- err_disambig.append(dediac_word)
- err_camel.append(merged_toks)
-
- else:
- tok = dediac_ar(tok)
- if tok == dediac_word:
- result.append(original_word)
- else:
- result.append(original_word)
- err_disambig.append(dediac_word)
- err_camel.append(tok)
-
- return list(result), err_disambig, err_camel, has_diacritics_in_par
-
-
-def stem_dataset(
- dataset: Dataset,
- column: str = "text",
- sep_token: str = "<+>",
- normalize: bool = False,
- keep_diacritics: bool = True,
- num_proc: int | None = None,
- model: str = "mle",
- use_gpu: bool = False,
-) -> Dataset:
- """
- Apply stemming and morphological analysis to dataset.
-
- Args:
- dataset: HuggingFace dataset
- column: Column to process
- sep_token: Separator token for morphological splits (default: '<+>')
- normalize: Apply Arabic normalization (default: False)
- keep_diacritics: Keep dediacritized column (default: True)
- num_proc: Number of parallel processes
- model: Disambiguator model to use - "mle" or "bert" (default: "mle")
- use_gpu: Whether to use GPU for BERT model (default: False)
-
- Returns:
- Dataset with {column}_stemmed and optionally {column}_dediac columns
-
- Example:
- >>> # Stem with defaults (MLE, keeps diacritics)
- >>> stemmed = stem_dataset(dataset)
- >>> # Result has 'text_stemmed' and 'text_dediac' columns
-
- >>> # Stem using BERT with GPU
- >>> stemmed = stem_dataset(dataset, model="bert", use_gpu=True)
-
- >>> # Stem without keeping diacritics
- >>> stemmed = stem_dataset(dataset, keep_diacritics=False)
- >>> # Result has only 'text_stemmed' column
- """
- model = model.lower()
- if model not in ["mle", "bert"]:
- raise ValueError(f"Invalid model '{model}'. Must be 'mle' or 'bert'")
-
- logger.info(f"Starting stemming of {len(dataset)} examples")
- logger.info(
- f"Model: {model.upper()}, Column: {column}, Sep token: {sep_token}, Normalize: {normalize}"
- )
- logger.info(f"Keep diacritics: {keep_diacritics}, Workers: {num_proc or 'auto'}")
- if model == "bert":
- logger.info(f"GPU: {use_gpu}")
-
- logger.info("Checking CAMeL Tools data packages...")
- catalogue = Catalogue.load_catalogue()
- try:
- catalogue.download_package("morphology-db-msa-r13")
- if model == "mle":
- catalogue.download_package("disambig-mle-calima-msa-r13")
- # For BERT, let it download automatically when pretrained() is called
- logger.info("CAMeL Tools data packages ready")
- except Exception as e:
- logger.warning(f"Could not verify CAMeL packages: {e}")
-
- logger.info("Loading additional words lists...")
- words_dir = os.path.join(os.path.dirname(__file__), "data")
- list_al_t = set(read_and_dediacritize(os.path.join(words_dir, "words_al_t.txt")))
- list_al = set(read_and_dediacritize(os.path.join(words_dir, "words_al.txt")))
- list_t = set(read_and_dediacritize(os.path.join(words_dir, "words_t.txt")))
- logger.info("Loaded word list entries")
-
- logger.info(f"Initializing {model.upper()} disambiguator...")
- if model == "mle":
- disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
- else: # bert
- disambiguator = BERTUnfactoredDisambiguator.pretrained(use_gpu=use_gpu)
- logger.info("Disambiguator ready")
-
- def new_scored_analysis(self, word_dd):
- if word_dd in self._cache:
- return self._cache[word_dd]
- result = self._scored_analyses(word_dd)
- self._cache[word_dd] = result
- return result
-
- disambiguator._scored_analyses_cached = MethodType(new_scored_analysis, disambiguator)
- disambiguator._score_fn = disambiguator._scored_analyses_cached
-
- def process_row(row):
- text = row.get(column, "")
- if not text:
- row[f"{column}_stemmed"] = ""
- if keep_diacritics:
- row[f"{column}_dediac"] = ""
- return row
-
- word_list = tokenize(text)
- if word_list is None:
- row[f"{column}_stemmed"] = text
- if keep_diacritics:
- row[f"{column}_dediac"] = dediac_ar(text)
- return row
-
- tokenized, _, _, has_diacs = morph_tokenize(
- word_list, disambiguator, list_al_t, list_al, list_t, sep_token=sep_token
- )
-
- if tokenized is not None:
- tokenized = merge_alef_and_alef_lam(tokenized, sep_token)
- stemmed = "".join(tokenized)
-
- if normalize:
- stemmed = normalize_arabic(stemmed)
-
- row[f"{column}_stemmed"] = stemmed
-
- if keep_diacritics:
- row[f"{column}_dediac"] = dediac_ar(stemmed)
- else:
- row[f"{column}_stemmed"] = text
- if keep_diacritics:
- row[f"{column}_dediac"] = dediac_ar(text)
-
- return row
-
- logger.info("Starting morphological tokenization...")
- result = dataset.map(process_row, num_proc=num_proc, desc="Stemming")
-
- logger.info(f"Stemming complete! Processed {len(result)} examples")
- return result
-
-
-def stem(
- text: str | list[str],
- sep_token: str = "<+>",
- normalize: bool = False,
- keep_diacritics: bool = False,
- model: str = "mle",
- use_gpu: bool = False,
-) -> str | list[str]:
- """
- Stem Arabic text or list of texts.
-
- Args:
- text: Single string or list of strings to stem
- sep_token: Separator token for morphological splits (default: '<+>')
- normalize: Apply Arabic normalization (default: False)
- keep_diacritics: Keep diacritics in output (default: False)
- model: Disambiguator model to use - "mle" or "bert" (default: "mle")
- use_gpu: Whether to use GPU for BERT model (default: False)
-
- Returns:
- Stemmed text in the same format as input (string or list of strings)
-
- Example:
- >>> # Stem a single string
- >>> stemmed = stem("النص العربي")
- >>> # Returns: "ال<+>نص ال<+>عربي"
-
- >>> # Stem a list of strings
- >>> stemmed = stem(["النص العربي", "مثال آخر"])
- >>> # Returns: ["ال<+>نص ال<+>عربي", "مثال آخر"]
-
- >>> # Stem with BERT model and GPU
- >>> stemmed = stem("النص", model="bert", use_gpu=True)
- """
- # Validate model parameter
- model = model.lower()
- if model not in ["mle", "bert"]:
- raise ValueError(f"Invalid model '{model}'. Must be 'mle' or 'bert'")
-
- # Track whether input was a single string
- is_single_string = isinstance(text, str)
-
- # Convert single string to list for uniform processing
- text_list = [text] if is_single_string else text
-
- # Validate all items are strings
- if not all(isinstance(t, str) for t in text_list):
- raise TypeError("All items in text list must be strings")
-
- # Initialize disambiguator (cached globally if possible)
- logger.info(f"Initializing {model.upper()} disambiguator...")
- catalogue = Catalogue.load_catalogue()
- try:
- catalogue.download_package("morphology-db-msa-r13")
- if model == "mle":
- catalogue.download_package("disambig-mle-calima-msa-r13")
- except Exception as e:
- logger.warning(f"Could not verify CAMeL packages: {e}")
-
- if model == "mle":
- disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
- else: # bert
- disambiguator = BERTUnfactoredDisambiguator.pretrained(use_gpu=use_gpu)
-
- # Add caching to disambiguator
- def new_scored_analysis(self, word_dd):
- if word_dd in self._cache:
- return self._cache[word_dd]
- result = self._scored_analyses(word_dd)
- self._cache[word_dd] = result
- return result
-
- disambiguator._scored_analyses_cached = MethodType(new_scored_analysis, disambiguator)
- disambiguator._score_fn = disambiguator._scored_analyses_cached
-
- # Load word lists
- words_dir = os.path.join(os.path.dirname(__file__), "data")
- list_al_t = set(read_and_dediacritize(os.path.join(words_dir, "words_al_t.txt")))
- list_al = set(read_and_dediacritize(os.path.join(words_dir, "words_al.txt")))
- list_t = set(read_and_dediacritize(os.path.join(words_dir, "words_t.txt")))
-
- # Process each text
- results = []
- for txt in text_list:
- if not txt:
- results.append("")
- continue
-
- word_list = tokenize(txt)
- if word_list is None:
- stemmed = dediac_ar(txt) if not keep_diacritics else txt
- results.append(stemmed)
- continue
-
- tokenized, _, _, has_diacs = morph_tokenize(
- word_list, disambiguator, list_al_t, list_al, list_t, sep_token=sep_token
- )
-
- if tokenized is not None:
- tokenized = merge_alef_and_alef_lam(tokenized, sep_token)
- stemmed = "".join(tokenized)
-
- if normalize:
- stemmed = normalize_arabic(stemmed)
-
- if not keep_diacritics:
- stemmed = dediac_ar(stemmed)
-
- results.append(stemmed)
- else:
- stemmed = dediac_ar(txt) if not keep_diacritics else txt
- results.append(stemmed)
-
- # Return in the same format as input
- return results[0] if is_single_string else results
-
+from dalla_data_processing.stemming.stemmer import stem, stem_dataset
__all__ = ["stem_dataset", "stem"]
diff --git a/dalla_data_processing/stemming/stemmer.py b/dalla_data_processing/stemming/stemmer.py
new file mode 100644
index 0000000..8bed063
--- /dev/null
+++ b/dalla_data_processing/stemming/stemmer.py
@@ -0,0 +1,679 @@
+"""Stemming and morphological analysis implementation.
+
+This module contains all the implementation details for Arabic stemming
+and morphological tokenization using CAMeL Tools.
+"""
+
+import os
+import re
+from collections import deque
+from types import MethodType
+
+from camel_tools.data.catalogue import Catalogue
+from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
+from camel_tools.disambig.mle import MLEDisambiguator
+from camel_tools.utils.dediac import dediac_ar
+from datasets import Dataset
+
+from dalla_data_processing.utils.logger import get_logger
+from dalla_data_processing.utils.tokenize import simple_word_tokenize
+
+logger = get_logger(__name__)
+
+
+def normalize_arabic(text: str) -> str:
+ """Normalize Arabic text."""
+ _DIAC_RE = re.compile(
+ r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED\u08D3-\u08FF]"
+ )
+ _TATWEEL_RE = re.compile(r"\u0640")
+ _ALIF_RE = re.compile(r"[آأإٱ]")
+ _ALIF_MAK_RE = re.compile(r"ى")
+ _TEH_MARB_RE = re.compile(r"ة")
+
+ text = _DIAC_RE.sub("", text)
+ text = _TATWEEL_RE.sub("", text)
+ text = _ALIF_RE.sub("ا", text)
+ text = _ALIF_MAK_RE.sub("ي", text)
+ text = _TEH_MARB_RE.sub("ه", text)
+ return text
+
+
+def has_diacritics(word):
+ """Check if word has diacritics."""
+ diacritic_marks = {
+ "\u064b",
+ "\u064c",
+ "\u064d",
+ "\u064e",
+ "\u064f",
+ "\u0650",
+ "\u0651",
+ "\u0652",
+ "\u0670",
+ }
+ return any(char in diacritic_marks for char in word)
+
+
+def apply_diacritics_to_segments_keep_markers(segments, diacritized_word, sep_token="<+>"):
+ """Apply diacritics from original word to segmented tokens."""
+ result = []
+ diacritic_marks = {
+ "\u064b",
+ "\u064c",
+ "\u064d",
+ "\u064e",
+ "\u064f",
+ "\u0650",
+ "\u0651",
+ "\u0652",
+ "\u0670",
+ }
+ sep_len = len(sep_token)
+
+ leading_diacritics = []
+ i = 0
+ while i < len(diacritized_word) and diacritized_word[i] in diacritic_marks:
+ leading_diacritics.append(diacritized_word[i])
+ i += 1
+
+ diacritic_index = len(leading_diacritics)
+
+ for segment_idx, segment in enumerate(segments):
+ if segment == sep_token:
+ result.append(segment)
+ else:
+ diacritized_segment = []
+
+ if segment_idx == 0 and leading_diacritics:
+ diacritized_segment.extend(leading_diacritics)
+
+ i = 0
+ while i < len(segment):
+ char = segment[i]
+ if segment[i : i + sep_len] == sep_token:
+ diacritized_segment.append(sep_token)
+ i += sep_len
+ continue
+
+ if diacritic_index < len(diacritized_word):
+ while (
+ diacritic_index < len(diacritized_word)
+ and diacritized_word[diacritic_index] in diacritic_marks
+ ):
+ diacritic_index += 1
+
+ if (
+ diacritic_index < len(diacritized_word)
+ and diacritized_word[diacritic_index] == char
+ ):
+ diacritized_segment.append(char)
+ diacritic_index += 1
+
+ while (
+ diacritic_index < len(diacritized_word)
+ and diacritized_word[diacritic_index] in diacritic_marks
+ ):
+ diacritized_segment.append(diacritized_word[diacritic_index])
+ diacritic_index += 1
+ else:
+ diacritized_segment.append(char)
+ else:
+ diacritized_segment.append(char)
+
+ i += 1
+
+ result.append("".join(diacritized_segment))
+
+ return result
+
+
+def read_and_dediacritize(file_name):
+ """Read words from file and dediacritize them."""
+ words = []
+ with open(file_name, encoding="utf-8") as file:
+ for line in file:
+ word = line.strip()
+ dediacritized_word = dediac_ar(word)
+ words.append(dediacritized_word)
+ return words
+
+
+def par_is_utf8_encoded(paragraph):
+ """Check if paragraph is UTF-8 encoded."""
+ try:
+ paragraph.encode("utf-8")
+ return True
+ except UnicodeEncodeError:
+ return False
+
+
+def tokenize(text):
+ """Tokenize text into words."""
+ if par_is_utf8_encoded(text):
+ text_list = simple_word_tokenize(text)
+ return text_list
+ else:
+ return None
+
+
+def merge_alef_and_alef_lam(input_list, sep_token="<+>"):
+ """Merge specific Arabic morpheme patterns."""
+ pattern = [f"\u0644{sep_token}".encode(), f"\u0627\u0644{sep_token}".encode()]
+ replacement = f"\u0644\u0644{sep_token}"
+
+ modified_list = []
+ i = 0
+
+ while i < len(input_list):
+ if i < len(input_list) - 1:
+ current_element = input_list[i].encode("utf-8")
+ next_element = input_list[i + 1].encode("utf-8")
+
+ if current_element == pattern[0] and next_element == pattern[1]:
+ modified_list.append(replacement)
+ i += 2
+ continue
+
+ modified_list.append(input_list[i])
+ i += 1
+
+ return modified_list
+
+
+def process_NOAN_word(list_al_t, list_al, list_t, word, sep_token="<+>"):
+ """Process words marked as NOAN (no analysis)."""
+ alef_lam = b"\xd8\xa7\xd9\x84"
+ taa_marbouta_detached = b"\xef\xba\x93"
+ taa_marbouta_attached = b"\xd8\xa9"
+ word_bytes = word.encode("utf-8")
+
+ if (
+ word_bytes.startswith(alef_lam)
+ and (
+ word_bytes.endswith(taa_marbouta_detached) or word_bytes.endswith(taa_marbouta_attached)
+ )
+ and word in list_al_t
+ ):
+ stripped_word = word[2:-1]
+ first_part = word[0:2] + sep_token
+ last_part = sep_token + word[-1]
+ return [first_part, stripped_word, last_part]
+
+ if word_bytes.startswith(alef_lam) and word in list_al:
+ stripped_word = word[2:]
+ first_part = word[0:2] + sep_token
+ return [first_part, stripped_word]
+
+ if word_bytes.endswith(taa_marbouta_detached) or word_bytes.endswith(taa_marbouta_attached):
+ if word in list_t:
+ stripped_word = word[:-1]
+ last_part = sep_token + word[-1]
+ return [stripped_word, last_part]
+
+ return [word]
+
+
+def merge_tokens(tokens, original_word, sep_token="<+>"):
+ """Merge tokenized segments back into a word."""
+ parts = []
+ sep_len = len(sep_token)
+ for tok in tokens:
+ if tok == sep_token:
+ parts.append("_")
+ elif tok.endswith(sep_token):
+ tok = tok[:-sep_len]
+ parts.append(tok)
+ elif tok.startswith(sep_token):
+ tok = tok[sep_len:]
+ parts.append(tok)
+ elif tok.endswith("+"):
+ tok = tok[:-1]
+ parts.append(tok)
+ elif tok.startswith("+"):
+ tok = tok[1:]
+ parts.append(tok)
+ else:
+ parts.append(tok)
+
+ merged_word = "".join(parts)
+ return merged_word
+
+
+def split_token_on_t(list_toks, sep_token="<+>"):
+ """Split tokens on taa marbouta character."""
+ new_list = []
+ taa_marbouta_detached = b"\xef\xba\x93"
+ taa_marbouta_attached = b"\xd8\xa9"
+ haa_attached = b"\xd9\x87"
+
+ for token in list_toks:
+ token_bytes = token.encode("utf-8")
+ if (
+ token_bytes.endswith(taa_marbouta_detached)
+ or token_bytes.endswith(taa_marbouta_attached)
+ or token_bytes.endswith(haa_attached)
+ ):
+ if token_bytes == b"\xd9\x87":
+ token = sep_token + taa_marbouta_attached.decode("utf-8")
+ new_list.append(token)
+ else:
+ part1 = token[:-1]
+ part2 = sep_token + token[-1]
+ new_list.append(part1)
+ new_list.append(part2)
+ else:
+ new_list.append(token)
+
+ return new_list
+
+
+def replace_separator(toks, sep_token="<+>"):
+ """Replace + with sep_token in tokens."""
+ result = list(toks)
+
+ for i, tok in enumerate(result):
+ if tok.startswith("+"):
+ result[i] = sep_token + tok[1:]
+ if tok.endswith("+"):
+ result[i] = tok[:-1] + sep_token
+ return result
+
+
+def morph_tokenize(
+ words, disambiguator, list_al_t, list_al, list_t, scheme="d3tok", split=True, sep_token="<+>"
+):
+ """Generate morphological tokens for a list of words."""
+ disambig_words = disambiguator.disambiguate(words)
+ result = deque()
+ err_disambig = []
+ err_camel = []
+ has_diacritics_in_par = False
+
+ for original, disambig_word in zip(words, disambig_words, strict=False):
+ scored_analyses = disambig_word.analyses
+ original_word = original
+ dediac_word = dediac_ar(original_word)
+
+ if has_diacritics(original_word):
+ has_diacritics_in_par = True
+
+ if not scored_analyses:
+ result.append(original_word)
+ continue
+
+ analysis = scored_analyses[0].analysis
+ tok = dediac_ar(analysis.get(scheme, None))
+ tok_bw = dediac_ar(analysis.get("bwtok", None))
+ seg_d3 = dediac_ar(analysis.get("d3seg", None))
+
+ taa_marbouta_detached = b"\xef\xba\x93"
+ taa_marbouta_attached = b"\xd8\xa9"
+ original_word_bytes = dediac_word.encode("utf-8")
+
+ if original_word_bytes.endswith(taa_marbouta_attached) or original_word_bytes.endswith(
+ taa_marbouta_detached
+ ):
+ if "+ة_+" in tok_bw or "+ه" in tok_bw or "+ة" in tok_bw:
+ toks = tok.split("_")
+ toks = split_token_on_t(toks, sep_token)
+ toks = replace_separator(toks, sep_token)
+ toks = merge_alef_and_alef_lam(toks, sep_token)
+ merged_toks = dediac_ar(merge_tokens(toks, dediac_word, sep_token))
+
+ d3_seg_tok = seg_d3.split("_")
+ d3_seg_tok = split_token_on_t(d3_seg_tok, sep_token)
+ d3_seg_tok = replace_separator(d3_seg_tok, sep_token)
+ d3_seg_tok = merge_alef_and_alef_lam(d3_seg_tok, sep_token)
+ merged_toks_seg = dediac_ar(merge_tokens(d3_seg_tok, dediac_word, sep_token))
+
+ bw_toks = tok_bw.split("_")
+ bw_toks = split_token_on_t(bw_toks, sep_token)
+ bw_toks = replace_separator(bw_toks, sep_token)
+ bw_toks = merge_alef_and_alef_lam(bw_toks, sep_token)
+ merged_toks_bw = dediac_ar(merge_tokens(bw_toks, dediac_word, sep_token))
+
+ if merged_toks == dediac_word and len(toks) > 1:
+ if has_diacritics(original):
+ toks = apply_diacritics_to_segments_keep_markers(toks, original, sep_token)
+ result.extend(toks)
+ continue
+
+ elif merged_toks_seg == dediac_word and len(d3_seg_tok) > 1:
+ if has_diacritics(original):
+ d3_seg_tok = apply_diacritics_to_segments_keep_markers(
+ d3_seg_tok, original, sep_token
+ )
+ result.extend(d3_seg_tok)
+ continue
+
+ elif merged_toks_bw == dediac_word and len(bw_toks) > 1:
+ if has_diacritics(original):
+ bw_toks = apply_diacritics_to_segments_keep_markers(
+ bw_toks, original, sep_token
+ )
+ result.extend(bw_toks)
+ continue
+
+ else:
+ result.append(original_word)
+ err_disambig.append(dediac_word)
+ err_camel.append(merged_toks)
+ continue
+
+ if tok is None or "NOAN" in tok:
+ tok = process_NOAN_word(list_al_t, list_al, list_t, dediac_word, sep_token)
+ if has_diacritics(original):
+ toks = apply_diacritics_to_segments_keep_markers(tok, original, sep_token)
+ else:
+ toks = tok
+ result.extend(toks)
+
+ elif split:
+ tok = dediac_ar(tok)
+ toks = tok.split("_")
+ toks = replace_separator(toks, sep_token)
+ toks = merge_alef_and_alef_lam(toks, sep_token)
+ merged_toks = dediac_ar(merge_tokens(toks, dediac_word, sep_token))
+
+ bw_toks = tok_bw.split("_")
+ bw_toks = replace_separator(bw_toks, sep_token)
+ bw_toks = merge_alef_and_alef_lam(bw_toks, sep_token)
+ merged_toks_bw = dediac_ar(merge_tokens(bw_toks, dediac_word, sep_token))
+
+ d3_seg_tok = seg_d3.split("_")
+ d3_seg_tok = replace_separator(d3_seg_tok, sep_token)
+ d3_seg_tok = merge_alef_and_alef_lam(d3_seg_tok, sep_token)
+ merged_toks_seg = dediac_ar(merge_tokens(d3_seg_tok, dediac_word, sep_token))
+
+ if merged_toks == dediac_word and len(toks) > 1:
+ if has_diacritics(original):
+ toks = apply_diacritics_to_segments_keep_markers(toks, original, sep_token)
+ result.extend(toks)
+ elif merged_toks_seg == dediac_word and len(d3_seg_tok) > 1:
+ if has_diacritics(original):
+ d3_seg_tok = apply_diacritics_to_segments_keep_markers(
+ d3_seg_tok, original, sep_token
+ )
+ result.extend(d3_seg_tok)
+ elif merged_toks_bw == dediac_word and len(bw_toks) > 1:
+ if has_diacritics(original):
+ bw_toks = apply_diacritics_to_segments_keep_markers(
+ bw_toks, original, sep_token
+ )
+ result.extend(bw_toks)
+ else:
+ result.append(original_word)
+ err_disambig.append(dediac_word)
+ err_camel.append(merged_toks)
+
+ else:
+ tok = dediac_ar(tok)
+ if tok == dediac_word:
+ result.append(original_word)
+ else:
+ result.append(original_word)
+ err_disambig.append(dediac_word)
+ err_camel.append(tok)
+
+ return list(result), err_disambig, err_camel, has_diacritics_in_par
+
+
+def stem_dataset(
+ dataset: Dataset,
+ column: str = "text",
+ sep_token: str = "<+>",
+ normalize: bool = False,
+ keep_diacritics: bool = True,
+ num_proc: int | None = None,
+ model: str = "mle",
+ use_gpu: bool = False,
+) -> Dataset:
+ """
+ Apply stemming and morphological analysis to dataset.
+
+ Args:
+ dataset: HuggingFace dataset
+ column: Column to process
+ sep_token: Separator token for morphological splits (default: '<+>')
+ normalize: Apply Arabic normalization (default: False)
+ keep_diacritics: Keep dediacritized column (default: True)
+ num_proc: Number of parallel processes
+ model: Disambiguator model to use - "mle" or "bert" (default: "mle")
+ use_gpu: Whether to use GPU for BERT model (default: False)
+
+ Returns:
+ Dataset with {column}_stemmed and optionally {column}_dediac columns
+
+ Example:
+ >>> # Stem with defaults (MLE, keeps diacritics)
+ >>> stemmed = stem_dataset(dataset)
+ >>> # Result has 'text_stemmed' and 'text_dediac' columns
+
+ >>> # Stem using BERT with GPU
+ >>> stemmed = stem_dataset(dataset, model="bert", use_gpu=True)
+
+ >>> # Stem without keeping diacritics
+ >>> stemmed = stem_dataset(dataset, keep_diacritics=False)
+ >>> # Result has only 'text_stemmed' column
+ """
+ model = model.lower()
+ if model not in ["mle", "bert"]:
+ raise ValueError(f"Invalid model '{model}'. Must be 'mle' or 'bert'")
+
+ logger.info(f"Starting stemming of {len(dataset)} examples")
+ logger.info(
+ f"Model: {model.upper()}, Column: {column}, Sep token: {sep_token}, Normalize: {normalize}"
+ )
+ logger.info(f"Keep diacritics: {keep_diacritics}, Workers: {num_proc or 'auto'}")
+ if model == "bert":
+ logger.info(f"GPU: {use_gpu}")
+
+ logger.info("Checking CAMeL Tools data packages...")
+ catalogue = Catalogue.load_catalogue()
+ try:
+ catalogue.download_package("morphology-db-msa-r13")
+ if model == "mle":
+ catalogue.download_package("disambig-mle-calima-msa-r13")
+ # For BERT, let it download automatically when pretrained() is called
+ logger.info("CAMeL Tools data packages ready")
+ except Exception as e:
+ logger.warning(f"Could not verify CAMeL packages: {e}")
+
+ logger.info("Loading additional words lists...")
+ words_dir = os.path.join(os.path.dirname(__file__), "data")
+ list_al_t = set(read_and_dediacritize(os.path.join(words_dir, "words_al_t.txt")))
+ list_al = set(read_and_dediacritize(os.path.join(words_dir, "words_al.txt")))
+ list_t = set(read_and_dediacritize(os.path.join(words_dir, "words_t.txt")))
+ logger.info("Loaded word list entries")
+
+ logger.info(f"Initializing {model.upper()} disambiguator...")
+ if model == "mle":
+ disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
+ else: # bert
+ disambiguator = BERTUnfactoredDisambiguator.pretrained(use_gpu=use_gpu)
+ logger.info("Disambiguator ready")
+
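+ # Memoize per-word analyses in the disambiguator's internal cache to avoid re-scoring
+ # repeated words; this relies on CAMeL Tools' private attributes (_cache, _scored_analyses, _score_fn).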
+ def new_scored_analysis(self, word_dd):
+ if word_dd in self._cache:
+ return self._cache[word_dd]
+ result = self._scored_analyses(word_dd)
+ self._cache[word_dd] = result
+ return result
+
+ disambiguator._scored_analyses_cached = MethodType(new_scored_analysis, disambiguator)
+ disambiguator._score_fn = disambiguator._scored_analyses_cached
+
+ def process_row(row):
+ text = row.get(column, "")
+ if not text:
+ row[f"{column}_stemmed"] = ""
+ if keep_diacritics:
+ row[f"{column}_dediac"] = ""
+ return row
+
+ word_list = tokenize(text)
+ if word_list is None:
+ row[f"{column}_stemmed"] = text
+ if keep_diacritics:
+ row[f"{column}_dediac"] = dediac_ar(text)
+ return row
+
+ tokenized, _, _, has_diacs = morph_tokenize(
+ word_list, disambiguator, list_al_t, list_al, list_t, sep_token=sep_token
+ )
+
+ if tokenized is not None:
+ tokenized = merge_alef_and_alef_lam(tokenized, sep_token)
+ stemmed = "".join(tokenized)
+
+ if normalize:
+ stemmed = normalize_arabic(stemmed)
+
+ row[f"{column}_stemmed"] = stemmed
+
+ if keep_diacritics:
+ row[f"{column}_dediac"] = dediac_ar(stemmed)
+ else:
+ row[f"{column}_stemmed"] = text
+ if keep_diacritics:
+ row[f"{column}_dediac"] = dediac_ar(text)
+
+ return row
+
+ logger.info("Starting morphological tokenization...")
+ result = dataset.map(process_row, num_proc=num_proc, desc="Stemming")
+
+ logger.info(f"Stemming complete! Processed {len(result)} examples")
+ return result
+
+
+def stem(
+ text: str | list[str],
+ sep_token: str = "<+>",
+ normalize: bool = False,
+ keep_diacritics: bool = False,
+ model: str = "mle",
+ use_gpu: bool = False,
+) -> str | list[str]:
+ """
+ Stem Arabic text or list of texts.
+
+ Args:
+ text: Single string or list of strings to stem
+ sep_token: Separator token for morphological splits (default: '<+>')
+ normalize: Apply Arabic normalization (default: False)
+ keep_diacritics: Keep diacritics in output (default: False)
+ model: Disambiguator model to use - "mle" or "bert" (default: "mle")
+ use_gpu: Whether to use GPU for BERT model (default: False)
+
+ Returns:
+ Stemmed text in the same format as input (string or list of strings)
+
+ Example:
+ >>> # Stem a single string
+ >>> stemmed = stem("النص العربي")
+ >>> # Returns: "ال<+>نص ال<+>عربي"
+
+ >>> # Stem a list of strings
+ >>> stemmed = stem(["النص العربي", "مثال آخر"])
+ >>> # Returns: ["ال<+>نص ال<+>عربي", "مثال آخر"]
+
+ >>> # Stem with BERT model and GPU
+ >>> stemmed = stem("النص", model="bert", use_gpu=True)
+ """
+ # Validate model parameter
+ model = model.lower()
+ if model not in ["mle", "bert"]:
+ raise ValueError(f"Invalid model '{model}'. Must be 'mle' or 'bert'")
+
+ # Track whether input was a single string
+ is_single_string = isinstance(text, str)
+
+ # Convert single string to list for uniform processing
+ text_list = [text] if is_single_string else text
+
+ # Validate all items are strings
+ if not all(isinstance(t, str) for t in text_list):
+ raise TypeError("All items in text list must be strings")
+
+ # Initialize the disambiguator (a new instance is created on each call)
+ logger.info(f"Initializing {model.upper()} disambiguator...")
+ catalogue = Catalogue.load_catalogue()
+ try:
+ catalogue.download_package("morphology-db-msa-r13")
+ if model == "mle":
+ catalogue.download_package("disambig-mle-calima-msa-r13")
+ except Exception as e:
+ logger.warning(f"Could not verify CAMeL packages: {e}")
+
+ if model == "mle":
+ disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
+ else: # bert
+ disambiguator = BERTUnfactoredDisambiguator.pretrained(use_gpu=use_gpu)
+
+ # Add caching to disambiguator
+ def new_scored_analysis(self, word_dd):
+ if word_dd in self._cache:
+ return self._cache[word_dd]
+ result = self._scored_analyses(word_dd)
+ self._cache[word_dd] = result
+ return result
+
+ disambiguator._scored_analyses_cached = MethodType(new_scored_analysis, disambiguator)
+ disambiguator._score_fn = disambiguator._scored_analyses_cached
+
+ # Load word lists
+ words_dir = os.path.join(os.path.dirname(__file__), "data")
+ list_al_t = set(read_and_dediacritize(os.path.join(words_dir, "words_al_t.txt")))
+ list_al = set(read_and_dediacritize(os.path.join(words_dir, "words_al.txt")))
+ list_t = set(read_and_dediacritize(os.path.join(words_dir, "words_t.txt")))
+
+ # Process each text
+ results = []
+ for txt in text_list:
+ if not txt:
+ results.append("")
+ continue
+
+ word_list = tokenize(txt)
+ if word_list is None:
+ stemmed = dediac_ar(txt) if not keep_diacritics else txt
+ results.append(stemmed)
+ continue
+
+ tokenized, _, _, has_diacs = morph_tokenize(
+ word_list, disambiguator, list_al_t, list_al, list_t, sep_token=sep_token
+ )
+
+ if tokenized is not None:
+ tokenized = merge_alef_and_alef_lam(tokenized, sep_token)
+ stemmed = "".join(tokenized)
+
+ if normalize:
+ stemmed = normalize_arabic(stemmed)
+
+ if not keep_diacritics:
+ stemmed = dediac_ar(stemmed)
+
+ results.append(stemmed)
+ else:
+ stemmed = dediac_ar(txt) if not keep_diacritics else txt
+ results.append(stemmed)
+
+ # Return in the same format as input
+ return results[0] if is_single_string else results
diff --git a/dalla_data_processing/utils/__init__.py b/dalla_data_processing/utils/__init__.py
index acd2474..9912b42 100644
--- a/dalla_data_processing/utils/__init__.py
+++ b/dalla_data_processing/utils/__init__.py
@@ -5,6 +5,14 @@
"""
from dalla_data_processing.utils.logger import get_logger, logger, setup_logging
-from dalla_data_processing.utils.tokenize import simple_word_tokenize
__all__ = ["simple_word_tokenize", "logger", "get_logger", "setup_logging"]
+
+
+def __getattr__(name):
+ """Lazy load modules with optional dependencies."""
+ if name == "simple_word_tokenize":
+ from dalla_data_processing.utils.tokenize import simple_word_tokenize
+
+ return simple_word_tokenize
+ raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
diff --git a/pyproject.toml b/pyproject.toml
index 0f02da0..cbf1851 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,10 @@
[build-system]
-requires = ["setuptools>=61.0", "wheel"]
+requires = ["setuptools>=61.0", "setuptools-scm>=8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "dalla-data-processing"
-version = "0.0.1"
+dynamic = ["version"]
description = "data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models"
authors = [
{name = "Hadi Hamoud", email = "hhamoud@dohainstitute.edu.qa"},
@@ -25,13 +25,9 @@ classifiers = [
dependencies = [
"datasets>=2.14.0",
"transformers>=4.30.0",
- "camel-tools>=1.5.0",
"click>=8.0.0",
"tqdm>=4.65.0",
- "pandas>=2.0.0",
- "numpy>=1.24.0",
"pyarrow>=12.0.0",
- "textstat>=0.7.0",
"structlog>=24.0.0",
]
@@ -42,11 +38,28 @@ dev = [
"ruff>=0.1.0",
"pre-commit>=3.0.0",
]
+dedup = [
+ "camel-tools>=1.5.0",
+]
dedup-native = [
"cffi>=1.15.0",
]
+stem = [
+ "camel-tools>=1.5.0",
+]
+quality = [
+ "camel-tools>=1.5.0",
+]
+readability = [
+ "textstat>=0.7.0",
+]
+pack = [
+ "sentencepiece>=0.2.0",
+ "rbpe",
+ "pyyaml",
+]
all = [
- "dalla-data-processing[dev,dedup-native]",
+ "dalla-data-processing[dev,dedup,dedup-native,stem,quality,readability,pack]",
]
[project.scripts]
@@ -59,7 +72,7 @@ Repository = "https://github.com/U4RASD/dalla-data-processing"
"Bug Tracker" = "https://github.com/U4RASD/dalla-data-processing/issues"
[tool.setuptools]
-packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
+packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.packing", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
include-package-data = true
[tool.setuptools.package-data]
@@ -86,6 +99,9 @@ select = [
]
ignore = [
"E501",
+ "SIM102",
+ "N802",
+ "N806"
]
@@ -102,8 +118,11 @@ known-first-party = ["dalla_data_processing"]
"dalla_data_processing/deduplication/onion/**/*.py" = ["N", "SIM", "UP"]
"dalla_data_processing/stemming/__init__.py" = ["N802", "N806", "SIM102"]
-[tool.uv]
-dev-dependencies = [
+[tool.setuptools_scm]
+version_file = "dalla_data_processing/_version.py"
+
+[dependency-groups]
+dev = [
"pytest>=7.0.0",
"pytest-cov>=4.0.0",
"ruff>=0.1.0",