Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
 include README.md
 include LICENSE
-recursive-include dalla *.py
-recursive-include dalla/stemming/data *.txt
-recursive-include dalla/deduplication/bin *
-recursive-include dalla/deduplication/onion *.c *.h Makefile*
+recursive-include dalla_data_processing *.py
+recursive-include dalla_data_processing/stemming/data *.txt
+recursive-include dalla_data_processing/deduplication/bin *
+recursive-include dalla_data_processing/deduplication/onion *.c *.h Makefile*
 global-exclude __pycache__
 global-exclude *.py[co]
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --onion-binary /path/to/oni

```python
from datasets import load_from_disk
from dalla.deduplication import deduplicate_dataset
from dalla_data_processing.deduplication import deduplicate_dataset

# Load dataset
dataset = load_from_disk("./data/raw")
Expand Down Expand Up @@ -137,7 +137,7 @@ dalla-dp -i ./data/raw -o ./data/stemmed stem --keep-diacritics

```python
from datasets import load_from_disk
from dalla.stemming import stem_dataset
from dalla_data_processing.stemming import stem_dataset

# Load dataset
dataset = load_from_disk("./data/raw")
Expand Down Expand Up @@ -166,7 +166,7 @@ stemmed.save_to_disk("./data/stemmed")
**Direct Text Processing**

```python
from dalla.stemming import stem
from dalla_data_processing.stemming import stem

text = "الكتاب الجميل"
result = stem(text)
Expand Down Expand Up @@ -210,7 +210,7 @@ dalla-dp -i ./data/raw -o ./data/quality -c content quality-check

```python
from datasets import load_from_disk
from dalla.quality import check_quality
from dalla_data_processing.quality import check_quality

dataset = load_from_disk("./data/raw")

Expand Down Expand Up @@ -258,7 +258,7 @@ dalla-dp -i ./data/raw -o ./data/scored -c content readability

```python
from datasets import load_from_disk
from dalla.readability import score_readability
from dalla_data_processing.readability import score_readability

# Load dataset
dataset = load_from_disk("./data/raw")
Expand Down Expand Up @@ -298,7 +298,7 @@ dalla-dp info ./data/my_dataset
**Python API**

```python
from dalla.core.dataset import DatasetManager
from dalla_data_processing.core.dataset import DatasetManager

dm = DatasetManager()

Expand Down Expand Up @@ -330,7 +330,7 @@ splits = dm.train_test_split(dataset, test_size=0.2, seed=42)

```python
from datasets import DatasetDict, load_from_disk
from dalla.quality import check_quality
from dalla_data_processing.quality import check_quality

dataset_dict = load_from_disk("./data/my_dataset")

Expand All @@ -349,7 +349,7 @@ train_processed = check_quality(dataset_dict['train'], min_score=60.0)
 The onion deduplication tool needs to be compiled for your system:

 ```bash
-cd dalla/deduplication/onion/src_sc
+cd dalla_data_processing/deduplication/onion/src_sc

 # Compile
 make -f Makefile.g
Expand Down
6 changes: 0 additions & 6 deletions dalla/core/__init__.py

This file was deleted.

10 changes: 6 additions & 4 deletions dalla/__init__.py → dalla_data_processing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,31 @@
 - Readability scoring
 """

+__version__ = "0.0.1"
+
 try:
-    from dalla.core.dataset import DatasetManager
+    from dalla_data_processing.core.dataset import DatasetManager

     _has_dataset = True
 except ImportError:
     _has_dataset = False
     DatasetManager = None

 try:
-    from dalla.utils.tokenize import simple_word_tokenize
+    from dalla_data_processing.utils.tokenize import simple_word_tokenize

     _has_tokenize = True
 except ImportError:
     _has_tokenize = False
     simple_word_tokenize = None

 try:
-    from dalla.stemming import stem, stem_dataset
+    from dalla_data_processing.stemming import stem, stem_dataset

     _has_stemming = True
 except ImportError:
     _has_stemming = False
     stem = None
     stem_dataset = None

-__all__ = ["DatasetManager", "simple_word_tokenize", "stem", "stem_dataset"]
+__all__ = ["DatasetManager", "simple_word_tokenize", "stem", "stem_dataset", "__version__"]
14 changes: 7 additions & 7 deletions dalla/cli.py → dalla_data_processing/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
import click
from datasets import Dataset, DatasetDict

from dalla import __version__
from dalla.core.dataset import DatasetManager
from dalla.utils import get_logger, setup_logging
from dalla_data_processing import __version__
from dalla_data_processing.core.dataset import DatasetManager
from dalla_data_processing.utils import get_logger, setup_logging

setup_logging(log_format="console", log_level="INFO")
logger = get_logger(__name__)
Expand Down Expand Up @@ -182,7 +182,7 @@ def deduplicate(
else:
click.echo(" Phase 2: OFF (faster, sufficient for most use cases)")

from dalla.deduplication import deduplicate_dataset
from dalla_data_processing.deduplication import deduplicate_dataset

deduplicated = deduplicate_dataset(
dataset,
Expand Down Expand Up @@ -257,7 +257,7 @@ def stem(
click.echo(f"Stemming {ctx.column} column (workers={ctx.num_workers or 'auto'})")
click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")

from dalla.stemming import stem_dataset
from dalla_data_processing.stemming import stem_dataset

stemmed = stem_dataset(
dataset,
Expand Down Expand Up @@ -311,7 +311,7 @@ def quality_check(ctx: Context, min_score: float, save_errors: bool, model: str,
click.echo(f"Checking quality of {ctx.column} column")
click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")

from dalla.quality import check_quality
from dalla_data_processing.quality import check_quality

scored = check_quality(
dataset,
Expand Down Expand Up @@ -356,7 +356,7 @@ def readability(ctx: Context, add_ranks: bool):
if add_ranks:
click.echo(" Including ranking and difficulty levels (0-4)")

from dalla.readability import score_readability
from dalla_data_processing.readability import score_readability

scored = score_readability(
dataset,
Expand Down
6 changes: 6 additions & 0 deletions dalla_data_processing/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
+"""Core utilities for dataset I/O and parallel processing."""
+
+from dalla_data_processing.core.dataset import DatasetManager
+from dalla_data_processing.core.parallel import ParallelProcessor
+
+__all__ = ["DatasetManager", "ParallelProcessor"]
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk

from dalla.utils.logger import get_logger
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from datasets import Dataset
from tqdm import tqdm

from dalla.utils.logger import get_logger
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from datasets import Dataset
from tqdm import tqdm

from dalla.deduplication.onion_wrapper import find_onion_binary, run_onion
from dalla.deduplication.postprocessing import extract_duplicates_from_csvs
from dalla.deduplication.preprocessing import create_file_list, create_vert_files
from dalla.utils.logger import get_logger
from dalla_data_processing.deduplication.onion_wrapper import find_onion_binary, run_onion
from dalla_data_processing.deduplication.postprocessing import extract_duplicates_from_csvs
from dalla_data_processing.deduplication.preprocessing import create_file_list, create_vert_files
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import subprocess
from pathlib import Path

from dalla.utils.logger import get_logger
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from tqdm import tqdm

from dalla.utils.logger import get_logger
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from camel_tools.utils.dediac import dediac_ar
from tqdm import tqdm

from dalla.utils.logger import get_logger
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Quality checking module for text quality assessment."""

from dalla.quality.checker import QualityChecker, check_quality
from dalla_data_processing.quality.checker import QualityChecker, check_quality

__all__ = ["check_quality", "QualityChecker"]
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from camel_tools.disambig.mle import MLEDisambiguator
from datasets import Dataset

from dalla.core.parallel import ParallelProcessor
from dalla.utils.logger import get_logger
from dalla_data_processing.core.parallel import ParallelProcessor
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from datasets import Dataset

from dalla.readability.ranking import compute_ranks_and_levels
from dalla.readability.scorer import ReadabilityScorer
from dalla.utils.logger import get_logger
from dalla_data_processing.readability.ranking import compute_ranks_and_levels
from dalla_data_processing.readability.scorer import ReadabilityScorer
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down Expand Up @@ -37,7 +37,7 @@ def score_readability(
Dataset with readability scores and optional rankings

Example:
>>> from dalla.readability import score_readability
>>> from dalla_data_processing.readability import score_readability
>>> scored = score_readability(dataset)
>>> # Columns: flesch_score, osman_score, readability_level, etc.
"""
Expand All @@ -54,7 +54,7 @@ def score_readability(

def score_example(example):
# Create scorer inside worker (for multiprocessing compatibility)
from dalla.readability.scorer import ReadabilityScorer
from dalla_data_processing.readability.scorer import ReadabilityScorer

worker_scorer = ReadabilityScorer()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Converts raw Flesch and Osman scores into 5-level difficulty rankings.
"""

from dalla.utils.logger import get_logger
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import textstat

from dalla.utils.logger import get_logger
from dalla_data_processing.utils.logger import get_logger

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from camel_tools.utils.dediac import dediac_ar
from datasets import Dataset

from dalla.utils.logger import get_logger
from dalla.utils.tokenize import simple_word_tokenize
from dalla_data_processing.utils.logger import get_logger
from dalla_data_processing.utils.tokenize import simple_word_tokenize

logger = get_logger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
This module provides utilities for tokenization, text manipulation, and logging.
"""

from dalla.utils.logger import get_logger, logger, setup_logging
from dalla.utils.tokenize import simple_word_tokenize
from dalla_data_processing.utils.logger import get_logger, logger, setup_logging
from dalla_data_processing.utils.tokenize import simple_word_tokenize

__all__ = ["simple_word_tokenize", "logger", "get_logger", "setup_logging"]
File renamed without changes.
File renamed without changes.
18 changes: 9 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ all = [
 ]

 [project.scripts]
-dalla-dp = "dalla.cli:main"
+dalla-dp = "dalla_data_processing.cli:main"

 [project.urls]
 Homepage = "https://github.com/U4RASD/dalla-data-processing"
Expand All @@ -59,18 +59,18 @@ Repository = "https://github.com/U4RASD/dalla-data-processing"
"Bug Tracker" = "https://github.com/U4RASD/dalla-data-processing/issues"

 [tool.setuptools]
-packages = ["dalla", "dalla.core", "dalla.deduplication", "dalla.stemming", "dalla.quality", "dalla.readability", "dalla.utils"]
+packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
 include-package-data = true

 [tool.setuptools.package-data]
-dalla = ["py.typed"]
-"dalla.stemming" = ["data/*.txt"]
-"dalla.deduplication" = ["bin/*", "onion/**/*"]
+dalla_data_processing = ["py.typed"]
+"dalla_data_processing.stemming" = ["data/*.txt"]
+"dalla_data_processing.deduplication" = ["bin/*", "onion/**/*"]

 [tool.ruff]
 line-length = 100
 target-version = "py312"
-src = ["dalla"]
+src = ["dalla_data_processing"]

 [tool.ruff.lint]
 select = [
Expand All @@ -96,11 +96,11 @@ skip-magic-trailing-comma = false
 line-ending = "auto"

 [tool.ruff.lint.isort]
-known-first-party = ["dalla"]
+known-first-party = ["dalla_data_processing"]

 [tool.ruff.lint.per-file-ignores]
-"dalla/deduplication/onion/**/*.py" = ["N", "SIM", "UP"]
-"dalla/stemming/__init__.py" = ["N802", "N806", "SIM102"]
+"dalla_data_processing/deduplication/onion/**/*.py" = ["N", "SIM", "UP"]
+"dalla_data_processing/stemming/__init__.py" = ["N802", "N806", "SIM102"]

 [tool.uv]
 dev-dependencies = [
Expand Down
4 changes: 2 additions & 2 deletions scripts/build_onion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ echo -e "${GREEN}=== Building Onion Binary ===${NC}"
 # Get script directory and project root
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
-ONION_SOURCE="$PROJECT_ROOT/dalla/deduplication/onion/src_sc"
-OUTPUT_DIR="$PROJECT_ROOT/dalla/deduplication/bin"
+ONION_SOURCE="$PROJECT_ROOT/dalla_data_processing/deduplication/onion/src_sc"
+OUTPUT_DIR="$PROJECT_ROOT/dalla_data_processing/deduplication/bin"

 # Check if source exists
 if [ ! -d "$ONION_SOURCE" ]; then
Expand Down
Loading