From d0bfb79df57ec665501d5be6fb30c14fd65f68eb Mon Sep 17 00:00:00 2001
From: Hadi Hamoud
Date: Thu, 20 Nov 2025 14:35:37 +0300
Subject: [PATCH] changed from dalla. to dalla_data_processing.

---
 MANIFEST.in | 8 ++++----
 README.md | 16 ++++++++--------
 dalla/core/__init__.py | 6 ------
 {dalla => dalla_data_processing}/__init__.py | 10 ++++++----
 {dalla => dalla_data_processing}/cli.py | 14 +++++++-------
 dalla_data_processing/core/__init__.py | 6 ++++++
 .../core/dataset.py | 2 +-
 .../core/parallel.py | 2 +-
 .../deduplication/__init__.py | 8 ++++----
 .../deduplication/bin/.gitignore | 0
 .../deduplication/onion/COPYING | 0
 .../deduplication/onion/Makefile | 0
 .../deduplication/onion/Makefile.config | 0
 .../deduplication/onion/README.md | 0
 .../deduplication/onion/src/Makefile | 0
 .../deduplication/onion/src/Makefile.g | 0
 .../deduplication/onion/src/buzhash.c | 0
 .../deduplication/onion/src/buzhash.h | 0
 .../deduplication/onion/src/hashdup.c | 0
 .../deduplication/onion/src/hashgen.c | 0
 .../deduplication/onion/src/onion | Bin
 .../deduplication/onion/src/onion.c | 0
 .../deduplication/onion/src/onion_dup.c | 0
 .../deduplication/onion/src/version.c | 0
 .../deduplication/onion/src/version.h | 0
 .../deduplication/onion/src_sc/.gitignore | 0
 .../deduplication/onion/src_sc/Makefile | 0
 .../deduplication/onion/src_sc/Makefile.g | 0
 .../deduplication/onion/src_sc/buzhash.c | 0
 .../deduplication/onion/src_sc/buzhash.h | 0
 .../deduplication/onion/src_sc/hashdup | Bin
 .../deduplication/onion/src_sc/hashdup.c | 0
 .../deduplication/onion/src_sc/hashgen | Bin
 .../deduplication/onion/src_sc/hashgen.c | 0
 .../deduplication/onion/src_sc/onion.c | 0
 .../deduplication/onion/src_sc/onion_dup.c | 0
 .../deduplication/onion/src_sc/version.c | 0
 .../deduplication/onion/src_sc/version.h | 0
 .../deduplication/onion_wrapper.py | 2 +-
 .../deduplication/postprocessing.py | 2 +-
 .../deduplication/preprocessing.py | 2 +-
 .../quality/__init__.py | 2 +-
 .../quality/checker.py | 4 ++--
 .../readability/__init__.py | 10 +++++-----
 .../readability/ranking.py | 2 +-
 .../readability/scorer.py | 2 +-
 .../stemming/__init__.py | 4 ++--
 .../stemming/data/words_al.txt | 0
 .../stemming/data/words_al_t.txt | 0
 .../stemming/data/words_t.txt | 0
 .../utils/__init__.py | 4 ++--
 .../utils/logger.py | 0
 .../utils/tokenize.py | 0
 pyproject.toml | 18 +++++++++---------
 scripts/build_onion.sh | 4 ++--
 55 files changed, 65 insertions(+), 63 deletions(-)
 delete mode 100644 dalla/core/__init__.py
 rename {dalla => dalla_data_processing}/__init__.py (69%)
 rename {dalla => dalla_data_processing}/cli.py (96%)
 create mode 100644 dalla_data_processing/core/__init__.py
 rename {dalla => dalla_data_processing}/core/dataset.py (99%)
 rename {dalla => dalla_data_processing}/core/parallel.py (99%)
 rename {dalla => dalla_data_processing}/deduplication/__init__.py (97%)
 rename {dalla => dalla_data_processing}/deduplication/bin/.gitignore (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/COPYING (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/Makefile (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/Makefile.config (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/README.md (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/Makefile (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/Makefile.g (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/buzhash.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/buzhash.h (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/hashdup.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/hashgen.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/onion (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/onion.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/onion_dup.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/version.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src/version.h (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/.gitignore (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/Makefile (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/Makefile.g (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/buzhash.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/buzhash.h (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/hashdup (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/hashdup.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/hashgen (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/hashgen.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/onion.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/onion_dup.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/version.c (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion/src_sc/version.h (100%)
 rename {dalla => dalla_data_processing}/deduplication/onion_wrapper.py (99%)
 rename {dalla => dalla_data_processing}/deduplication/postprocessing.py (99%)
 rename {dalla => dalla_data_processing}/deduplication/preprocessing.py (98%)
 rename {dalla => dalla_data_processing}/quality/__init__.py (57%)
 rename {dalla => dalla_data_processing}/quality/checker.py (99%)
 rename {dalla => dalla_data_processing}/readability/__init__.py (94%)
 rename {dalla => dalla_data_processing}/readability/ranking.py (98%)
 rename {dalla => dalla_data_processing}/readability/scorer.py (98%)
 rename {dalla => dalla_data_processing}/stemming/__init__.py (99%)
 rename {dalla => dalla_data_processing}/stemming/data/words_al.txt (100%)
 rename {dalla => dalla_data_processing}/stemming/data/words_al_t.txt (100%)
 rename {dalla => dalla_data_processing}/stemming/data/words_t.txt (100%)
 rename {dalla => dalla_data_processing}/utils/__init__.py (57%)
 rename {dalla => dalla_data_processing}/utils/logger.py (100%)
 rename {dalla => dalla_data_processing}/utils/tokenize.py (100%)

diff --git a/MANIFEST.in b/MANIFEST.in
index 46878b9..7f05576 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,8 +1,8 @@
 include README.md
 include LICENSE
-recursive-include dalla *.py
-recursive-include dalla/stemming/data *.txt
-recursive-include dalla/deduplication/bin *
-recursive-include dalla/deduplication/onion *.c *.h Makefile*
+recursive-include dalla_data_processing *.py
+recursive-include dalla_data_processing/stemming/data *.txt
+recursive-include dalla_data_processing/deduplication/bin *
+recursive-include dalla_data_processing/deduplication/onion *.c *.h Makefile*
 global-exclude __pycache__
 global-exclude *.py[co]
diff --git a/README.md b/README.md
index 7b5d3c7..da2cf9f 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --onion-binary /path/to/oni
 
 ```python
 from datasets import load_from_disk
-from dalla.deduplication import deduplicate_dataset
+from dalla_data_processing.deduplication import deduplicate_dataset
 
 # Load dataset
 dataset = load_from_disk("./data/raw")
@@ -137,7 +137,7 @@ dalla-dp -i ./data/raw -o ./data/stemmed stem --keep-diacritics
 
 ```python
 from datasets import load_from_disk
-from dalla.stemming import stem_dataset
+from dalla_data_processing.stemming import stem_dataset
 
 # Load dataset
 dataset = load_from_disk("./data/raw")
@@ -166,7 +166,7 @@ stemmed.save_to_disk("./data/stemmed")
 **Direct Text Processing**
 
 ```python
-from dalla.stemming import stem
+from dalla_data_processing.stemming import stem
 
 text = "الكتاب الجميل"
 result = stem(text)
@@ -210,7 +210,7 @@ dalla-dp -i ./data/raw -o ./data/quality -c content quality-check
 
 ```python
 from datasets import load_from_disk
-from dalla.quality import check_quality
+from dalla_data_processing.quality import check_quality
 
 dataset = load_from_disk("./data/raw")
 
@@ -258,7 +258,7 @@ dalla-dp -i ./data/raw -o ./data/scored -c content readability
 
 ```python
 from datasets import load_from_disk
-from dalla.readability import score_readability
+from dalla_data_processing.readability import score_readability
 
 # Load dataset
 dataset = load_from_disk("./data/raw")
@@ -298,7 +298,7 @@ dalla-dp info ./data/my_dataset
 **Python API**
 
 ```python
-from dalla.core.dataset import DatasetManager
+from dalla_data_processing.core.dataset import DatasetManager
 
 dm = DatasetManager()
 
@@ -330,7 +330,7 @@ splits = dm.train_test_split(dataset, test_size=0.2, seed=42)
 
 ```python
 from datasets import DatasetDict, load_from_disk
-from dalla.quality import check_quality
+from dalla_data_processing.quality import check_quality
 
 dataset_dict = load_from_disk("./data/my_dataset")
 
@@ -349,7 +349,7 @@ train_processed = check_quality(dataset_dict['train'], min_score=60.0)
 The onion deduplication tool needs to be compiled for your system:
 
 ```bash
-cd dalla/deduplication/onion/src_sc
+cd dalla_data_processing/deduplication/onion/src_sc
 
 # Compile
 make -f Makefile.g
diff --git a/dalla/core/__init__.py b/dalla/core/__init__.py
deleted file mode 100644
index 540751a..0000000
--- a/dalla/core/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-"""Core utilities for dataset I/O and parallel processing."""
-
-from dalla.core.dataset import DatasetManager
-from dalla.core.parallel import ParallelProcessor
-
-__all__ = ["DatasetManager", "ParallelProcessor"]
diff --git a/dalla/__init__.py b/dalla_data_processing/__init__.py
similarity index 69%
rename from dalla/__init__.py
rename to dalla_data_processing/__init__.py
index 72e5450..bd9bb00 100644
--- a/dalla/__init__.py
+++ b/dalla_data_processing/__init__.py
@@ -8,8 +8,10 @@
 - Readability scoring
 """
 
+__version__ = "0.0.1"
+
 try:
-    from dalla.core.dataset import DatasetManager
+    from dalla_data_processing.core.dataset import DatasetManager
 
     _has_dataset = True
 except ImportError:
@@ -17,7 +19,7 @@
     DatasetManager = None
 
 try:
-    from dalla.utils.tokenize import simple_word_tokenize
+    from dalla_data_processing.utils.tokenize import simple_word_tokenize
 
     _has_tokenize = True
 except ImportError:
@@ -25,7 +27,7 @@
     simple_word_tokenize = None
 
 try:
-    from dalla.stemming import stem, stem_dataset
+    from dalla_data_processing.stemming import stem, stem_dataset
 
     _has_stemming = True
 except ImportError:
@@ -33,4 +35,4 @@
     stem = None
     stem_dataset = None
 
-__all__ = ["DatasetManager", "simple_word_tokenize", "stem", "stem_dataset"]
+__all__ = ["DatasetManager", "simple_word_tokenize", "stem", "stem_dataset", "__version__"]
diff --git a/dalla/cli.py b/dalla_data_processing/cli.py
similarity index 96%
rename from dalla/cli.py
rename to dalla_data_processing/cli.py
index b441710..252b717 100644
--- a/dalla/cli.py
+++ b/dalla_data_processing/cli.py
@@ -11,9 +11,9 @@
 import click
 from datasets import Dataset, DatasetDict
 
-from dalla import __version__
-from dalla.core.dataset import DatasetManager
-from dalla.utils import get_logger, setup_logging
+from dalla_data_processing import __version__
+from dalla_data_processing.core.dataset import DatasetManager
+from dalla_data_processing.utils import get_logger, setup_logging
 
 setup_logging(log_format="console", log_level="INFO")
 logger = get_logger(__name__)
@@ -182,7 +182,7 @@ def deduplicate(
     else:
         click.echo("  Phase 2: OFF (faster, sufficient for most use cases)")
 
-    from dalla.deduplication import deduplicate_dataset
+    from dalla_data_processing.deduplication import deduplicate_dataset
 
     deduplicated = deduplicate_dataset(
         dataset,
@@ -257,7 +257,7 @@ def stem(
     click.echo(f"Stemming {ctx.column} column (workers={ctx.num_workers or 'auto'})")
     click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")
 
-    from dalla.stemming import stem_dataset
+    from dalla_data_processing.stemming import stem_dataset
 
     stemmed = stem_dataset(
         dataset,
@@ -311,7 +311,7 @@ def quality_check(ctx: Context, min_score: float, save_errors: bool, model: str,
     click.echo(f"Checking quality of {ctx.column} column")
     click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")
 
-    from dalla.quality import check_quality
+    from dalla_data_processing.quality import check_quality
 
     scored = check_quality(
         dataset,
@@ -356,7 +356,7 @@ def readability(ctx: Context, add_ranks: bool):
     if add_ranks:
         click.echo("  Including ranking and difficulty levels (0-4)")
 
-    from dalla.readability import score_readability
+    from dalla_data_processing.readability import score_readability
 
     scored = score_readability(
         dataset,
diff --git a/dalla_data_processing/core/__init__.py b/dalla_data_processing/core/__init__.py
new file mode 100644
index 0000000..0cb8ea0
--- /dev/null
+++ b/dalla_data_processing/core/__init__.py
@@ -0,0 +1,6 @@
+"""Core utilities for dataset I/O and parallel processing."""
+
+from dalla_data_processing.core.dataset import DatasetManager
+from dalla_data_processing.core.parallel import ParallelProcessor
+
+__all__ = ["DatasetManager", "ParallelProcessor"]
diff --git a/dalla/core/dataset.py b/dalla_data_processing/core/dataset.py
similarity index 99%
rename from dalla/core/dataset.py
rename to dalla_data_processing/core/dataset.py
index 3aa62eb..bf19068 100644
--- a/dalla/core/dataset.py
+++ b/dalla_data_processing/core/dataset.py
@@ -11,7 +11,7 @@
 
 from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
 
-from dalla.utils.logger import get_logger
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/core/parallel.py b/dalla_data_processing/core/parallel.py
similarity index 99%
rename from dalla/core/parallel.py
rename to dalla_data_processing/core/parallel.py
index 1a48a3b..070c807 100644
--- a/dalla/core/parallel.py
+++ b/dalla_data_processing/core/parallel.py
@@ -12,7 +12,7 @@
 from datasets import Dataset
 from tqdm import tqdm
 
-from dalla.utils.logger import get_logger
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/deduplication/__init__.py b/dalla_data_processing/deduplication/__init__.py
similarity index 97%
rename from dalla/deduplication/__init__.py
rename to dalla_data_processing/deduplication/__init__.py
index c1dce81..498fc82 100644
--- a/dalla/deduplication/__init__.py
+++ b/dalla_data_processing/deduplication/__init__.py
@@ -13,10 +13,10 @@
 from datasets import Dataset
 from tqdm import tqdm
 
-from dalla.deduplication.onion_wrapper import find_onion_binary, run_onion
-from dalla.deduplication.postprocessing import extract_duplicates_from_csvs
-from dalla.deduplication.preprocessing import create_file_list, create_vert_files
-from dalla.utils.logger import get_logger
+from dalla_data_processing.deduplication.onion_wrapper import find_onion_binary, run_onion
+from dalla_data_processing.deduplication.postprocessing import extract_duplicates_from_csvs
+from dalla_data_processing.deduplication.preprocessing import create_file_list, create_vert_files
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/deduplication/bin/.gitignore b/dalla_data_processing/deduplication/bin/.gitignore
similarity index 100%
rename from dalla/deduplication/bin/.gitignore
rename to dalla_data_processing/deduplication/bin/.gitignore
diff --git a/dalla/deduplication/onion/COPYING b/dalla_data_processing/deduplication/onion/COPYING
similarity index 100%
rename from dalla/deduplication/onion/COPYING
rename to dalla_data_processing/deduplication/onion/COPYING
diff --git a/dalla/deduplication/onion/Makefile b/dalla_data_processing/deduplication/onion/Makefile
similarity index 100%
rename from dalla/deduplication/onion/Makefile
rename to dalla_data_processing/deduplication/onion/Makefile
diff --git a/dalla/deduplication/onion/Makefile.config b/dalla_data_processing/deduplication/onion/Makefile.config
similarity index 100%
rename from dalla/deduplication/onion/Makefile.config
rename to dalla_data_processing/deduplication/onion/Makefile.config
diff --git a/dalla/deduplication/onion/README.md b/dalla_data_processing/deduplication/onion/README.md
similarity index 100%
rename from dalla/deduplication/onion/README.md
rename to dalla_data_processing/deduplication/onion/README.md
diff --git a/dalla/deduplication/onion/src/Makefile b/dalla_data_processing/deduplication/onion/src/Makefile
similarity index 100%
rename from dalla/deduplication/onion/src/Makefile
rename to dalla_data_processing/deduplication/onion/src/Makefile
diff --git a/dalla/deduplication/onion/src/Makefile.g b/dalla_data_processing/deduplication/onion/src/Makefile.g
similarity index 100%
rename from dalla/deduplication/onion/src/Makefile.g
rename to dalla_data_processing/deduplication/onion/src/Makefile.g
diff --git a/dalla/deduplication/onion/src/buzhash.c b/dalla_data_processing/deduplication/onion/src/buzhash.c
similarity index 100%
rename from dalla/deduplication/onion/src/buzhash.c
rename to dalla_data_processing/deduplication/onion/src/buzhash.c
diff --git a/dalla/deduplication/onion/src/buzhash.h b/dalla_data_processing/deduplication/onion/src/buzhash.h
similarity index 100%
rename from dalla/deduplication/onion/src/buzhash.h
rename to dalla_data_processing/deduplication/onion/src/buzhash.h
diff --git a/dalla/deduplication/onion/src/hashdup.c b/dalla_data_processing/deduplication/onion/src/hashdup.c
similarity index 100%
rename from dalla/deduplication/onion/src/hashdup.c
rename to dalla_data_processing/deduplication/onion/src/hashdup.c
diff --git a/dalla/deduplication/onion/src/hashgen.c b/dalla_data_processing/deduplication/onion/src/hashgen.c
similarity index 100%
rename from dalla/deduplication/onion/src/hashgen.c
rename to dalla_data_processing/deduplication/onion/src/hashgen.c
diff --git a/dalla/deduplication/onion/src/onion b/dalla_data_processing/deduplication/onion/src/onion
similarity index 100%
rename from dalla/deduplication/onion/src/onion
rename to dalla_data_processing/deduplication/onion/src/onion
diff --git a/dalla/deduplication/onion/src/onion.c b/dalla_data_processing/deduplication/onion/src/onion.c
similarity index 100%
rename from dalla/deduplication/onion/src/onion.c
rename to dalla_data_processing/deduplication/onion/src/onion.c
diff --git a/dalla/deduplication/onion/src/onion_dup.c b/dalla_data_processing/deduplication/onion/src/onion_dup.c
similarity index 100%
rename from dalla/deduplication/onion/src/onion_dup.c
rename to dalla_data_processing/deduplication/onion/src/onion_dup.c
diff --git a/dalla/deduplication/onion/src/version.c b/dalla_data_processing/deduplication/onion/src/version.c
similarity index 100%
rename from dalla/deduplication/onion/src/version.c
rename to dalla_data_processing/deduplication/onion/src/version.c
diff --git a/dalla/deduplication/onion/src/version.h b/dalla_data_processing/deduplication/onion/src/version.h
similarity index 100%
rename from dalla/deduplication/onion/src/version.h
rename to dalla_data_processing/deduplication/onion/src/version.h
diff --git a/dalla/deduplication/onion/src_sc/.gitignore b/dalla_data_processing/deduplication/onion/src_sc/.gitignore
similarity index 100%
rename from dalla/deduplication/onion/src_sc/.gitignore
rename to dalla_data_processing/deduplication/onion/src_sc/.gitignore
diff --git a/dalla/deduplication/onion/src_sc/Makefile b/dalla_data_processing/deduplication/onion/src_sc/Makefile
similarity index 100%
rename from dalla/deduplication/onion/src_sc/Makefile
rename to dalla_data_processing/deduplication/onion/src_sc/Makefile
diff --git a/dalla/deduplication/onion/src_sc/Makefile.g b/dalla_data_processing/deduplication/onion/src_sc/Makefile.g
similarity index 100%
rename from dalla/deduplication/onion/src_sc/Makefile.g
rename to dalla_data_processing/deduplication/onion/src_sc/Makefile.g
diff --git a/dalla/deduplication/onion/src_sc/buzhash.c b/dalla_data_processing/deduplication/onion/src_sc/buzhash.c
similarity index 100%
rename from dalla/deduplication/onion/src_sc/buzhash.c
rename to dalla_data_processing/deduplication/onion/src_sc/buzhash.c
diff --git a/dalla/deduplication/onion/src_sc/buzhash.h b/dalla_data_processing/deduplication/onion/src_sc/buzhash.h
similarity index 100%
rename from dalla/deduplication/onion/src_sc/buzhash.h
rename to dalla_data_processing/deduplication/onion/src_sc/buzhash.h
diff --git a/dalla/deduplication/onion/src_sc/hashdup b/dalla_data_processing/deduplication/onion/src_sc/hashdup
similarity index 100%
rename from dalla/deduplication/onion/src_sc/hashdup
rename to dalla_data_processing/deduplication/onion/src_sc/hashdup
diff --git a/dalla/deduplication/onion/src_sc/hashdup.c b/dalla_data_processing/deduplication/onion/src_sc/hashdup.c
similarity index 100%
rename from dalla/deduplication/onion/src_sc/hashdup.c
rename to dalla_data_processing/deduplication/onion/src_sc/hashdup.c
diff --git a/dalla/deduplication/onion/src_sc/hashgen b/dalla_data_processing/deduplication/onion/src_sc/hashgen
similarity index 100%
rename from dalla/deduplication/onion/src_sc/hashgen
rename to dalla_data_processing/deduplication/onion/src_sc/hashgen
diff --git a/dalla/deduplication/onion/src_sc/hashgen.c b/dalla_data_processing/deduplication/onion/src_sc/hashgen.c
similarity index 100%
rename from dalla/deduplication/onion/src_sc/hashgen.c
rename to dalla_data_processing/deduplication/onion/src_sc/hashgen.c
diff --git a/dalla/deduplication/onion/src_sc/onion.c b/dalla_data_processing/deduplication/onion/src_sc/onion.c
similarity index 100%
rename from dalla/deduplication/onion/src_sc/onion.c
rename to dalla_data_processing/deduplication/onion/src_sc/onion.c
diff --git a/dalla/deduplication/onion/src_sc/onion_dup.c b/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c
similarity index 100%
rename from dalla/deduplication/onion/src_sc/onion_dup.c
rename to dalla_data_processing/deduplication/onion/src_sc/onion_dup.c
diff --git a/dalla/deduplication/onion/src_sc/version.c b/dalla_data_processing/deduplication/onion/src_sc/version.c
similarity index 100%
rename from dalla/deduplication/onion/src_sc/version.c
rename to dalla_data_processing/deduplication/onion/src_sc/version.c
diff --git a/dalla/deduplication/onion/src_sc/version.h b/dalla_data_processing/deduplication/onion/src_sc/version.h
similarity index 100%
rename from dalla/deduplication/onion/src_sc/version.h
rename to dalla_data_processing/deduplication/onion/src_sc/version.h
diff --git a/dalla/deduplication/onion_wrapper.py b/dalla_data_processing/deduplication/onion_wrapper.py
similarity index 99%
rename from dalla/deduplication/onion_wrapper.py
rename to dalla_data_processing/deduplication/onion_wrapper.py
index 5fda30a..9b78026 100644
--- a/dalla/deduplication/onion_wrapper.py
+++ b/dalla_data_processing/deduplication/onion_wrapper.py
@@ -7,7 +7,7 @@
 import subprocess
 from pathlib import Path
 
-from dalla.utils.logger import get_logger
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/deduplication/postprocessing.py b/dalla_data_processing/deduplication/postprocessing.py
similarity index 99%
rename from dalla/deduplication/postprocessing.py
rename to dalla_data_processing/deduplication/postprocessing.py
index 28883be..0730561 100644
--- a/dalla/deduplication/postprocessing.py
+++ b/dalla_data_processing/deduplication/postprocessing.py
@@ -11,7 +11,7 @@
 
 from tqdm import tqdm
 
-from dalla.utils.logger import get_logger
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/deduplication/preprocessing.py b/dalla_data_processing/deduplication/preprocessing.py
similarity index 98%
rename from dalla/deduplication/preprocessing.py
rename to dalla_data_processing/deduplication/preprocessing.py
index 3d05076..aab2e0c 100644
--- a/dalla/deduplication/preprocessing.py
+++ b/dalla_data_processing/deduplication/preprocessing.py
@@ -10,7 +10,7 @@
 from camel_tools.utils.dediac import dediac_ar
 from tqdm import tqdm
 
-from dalla.utils.logger import get_logger
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/quality/__init__.py b/dalla_data_processing/quality/__init__.py
similarity index 57%
rename from dalla/quality/__init__.py
rename to dalla_data_processing/quality/__init__.py
index 9350d02..a23ab33 100644
--- a/dalla/quality/__init__.py
+++ b/dalla_data_processing/quality/__init__.py
@@ -1,5 +1,5 @@
 """Quality checking module for text quality assessment."""
 
-from dalla.quality.checker import QualityChecker, check_quality
+from dalla_data_processing.quality.checker import QualityChecker, check_quality
 
 __all__ = ["check_quality", "QualityChecker"]
diff --git a/dalla/quality/checker.py b/dalla_data_processing/quality/checker.py
similarity index 99%
rename from dalla/quality/checker.py
rename to dalla_data_processing/quality/checker.py
index 826c315..077bbb1 100644
--- a/dalla/quality/checker.py
+++ b/dalla_data_processing/quality/checker.py
@@ -15,8 +15,8 @@
 from camel_tools.disambig.mle import MLEDisambiguator
 from datasets import Dataset
 
-from dalla.core.parallel import ParallelProcessor
-from dalla.utils.logger import get_logger
+from dalla_data_processing.core.parallel import ParallelProcessor
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/readability/__init__.py b/dalla_data_processing/readability/__init__.py
similarity index 94%
rename from dalla/readability/__init__.py
rename to dalla_data_processing/readability/__init__.py
index bb2ba71..7edfb98 100644
--- a/dalla/readability/__init__.py
+++ b/dalla_data_processing/readability/__init__.py
@@ -2,9 +2,9 @@
 
 from datasets import Dataset
 
-from dalla.readability.ranking import compute_ranks_and_levels
-from dalla.readability.scorer import ReadabilityScorer
-from dalla.utils.logger import get_logger
+from dalla_data_processing.readability.ranking import compute_ranks_and_levels
+from dalla_data_processing.readability.scorer import ReadabilityScorer
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
@@ -37,7 +37,7 @@ def score_readability(
         Dataset with readability scores and optional rankings
 
     Example:
-        >>> from dalla.readability import score_readability
+        >>> from dalla_data_processing.readability import score_readability
         >>> scored = score_readability(dataset)
        >>> # Columns: flesch_score, osman_score, readability_level, etc.
     """
@@ -54,7 +54,7 @@ def score_readability(
 
     def score_example(example):
         # Create scorer inside worker (for multiprocessing compatibility)
-        from dalla.readability.scorer import ReadabilityScorer
+        from dalla_data_processing.readability.scorer import ReadabilityScorer
 
         worker_scorer = ReadabilityScorer()
 
diff --git a/dalla/readability/ranking.py b/dalla_data_processing/readability/ranking.py
similarity index 98%
rename from dalla/readability/ranking.py
rename to dalla_data_processing/readability/ranking.py
index 4039ea6..8a71a78 100644
--- a/dalla/readability/ranking.py
+++ b/dalla_data_processing/readability/ranking.py
@@ -4,7 +4,7 @@
 Converts raw Flesch and Osman scores into 5-level difficulty rankings.
 """
 
-from dalla.utils.logger import get_logger
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/readability/scorer.py b/dalla_data_processing/readability/scorer.py
similarity index 98%
rename from dalla/readability/scorer.py
rename to dalla_data_processing/readability/scorer.py
index 10a4e77..8e2696b 100644
--- a/dalla/readability/scorer.py
+++ b/dalla_data_processing/readability/scorer.py
@@ -6,7 +6,7 @@
 
 import textstat
 
-from dalla.utils.logger import get_logger
+from dalla_data_processing.utils.logger import get_logger
 
 logger = get_logger(__name__)
 
diff --git a/dalla/stemming/__init__.py b/dalla_data_processing/stemming/__init__.py
similarity index 99%
rename from dalla/stemming/__init__.py
rename to dalla_data_processing/stemming/__init__.py
index 1ac8e91..febd264 100644
--- a/dalla/stemming/__init__.py
+++ b/dalla_data_processing/stemming/__init__.py
@@ -15,8 +15,8 @@
 from camel_tools.utils.dediac import dediac_ar
 from datasets import Dataset
 
-from dalla.utils.logger import get_logger
-from dalla.utils.tokenize import simple_word_tokenize
+from dalla_data_processing.utils.logger import get_logger
+from dalla_data_processing.utils.tokenize import simple_word_tokenize
 
 logger = get_logger(__name__)
 
diff --git a/dalla/stemming/data/words_al.txt b/dalla_data_processing/stemming/data/words_al.txt
similarity index 100%
rename from dalla/stemming/data/words_al.txt
rename to dalla_data_processing/stemming/data/words_al.txt
diff --git a/dalla/stemming/data/words_al_t.txt b/dalla_data_processing/stemming/data/words_al_t.txt
similarity index 100%
rename from dalla/stemming/data/words_al_t.txt
rename to dalla_data_processing/stemming/data/words_al_t.txt
diff --git a/dalla/stemming/data/words_t.txt b/dalla_data_processing/stemming/data/words_t.txt
similarity index 100%
rename from dalla/stemming/data/words_t.txt
rename to dalla_data_processing/stemming/data/words_t.txt
diff --git a/dalla/utils/__init__.py b/dalla_data_processing/utils/__init__.py
similarity index 57%
rename from dalla/utils/__init__.py
rename to dalla_data_processing/utils/__init__.py
index 724a743..acd2474 100644
--- a/dalla/utils/__init__.py
+++ b/dalla_data_processing/utils/__init__.py
@@ -4,7 +4,7 @@
 This module provides utilities for tokenization, text manipulation, and logging.
 """
 
-from dalla.utils.logger import get_logger, logger, setup_logging
-from dalla.utils.tokenize import simple_word_tokenize
+from dalla_data_processing.utils.logger import get_logger, logger, setup_logging
+from dalla_data_processing.utils.tokenize import simple_word_tokenize
 
 __all__ = ["simple_word_tokenize", "logger", "get_logger", "setup_logging"]
diff --git a/dalla/utils/logger.py b/dalla_data_processing/utils/logger.py
similarity index 100%
rename from dalla/utils/logger.py
rename to dalla_data_processing/utils/logger.py
diff --git a/dalla/utils/tokenize.py b/dalla_data_processing/utils/tokenize.py
similarity index 100%
rename from dalla/utils/tokenize.py
rename to dalla_data_processing/utils/tokenize.py
diff --git a/pyproject.toml b/pyproject.toml
index bccfc46..0f02da0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,7 @@ all = [
 ]
 
 [project.scripts]
-dalla-dp = "dalla.cli:main"
+dalla-dp = "dalla_data_processing.cli:main"
 
 [project.urls]
 Homepage = "https://github.com/U4RASD/dalla-data-processing"
@@ -59,18 +59,18 @@ Repository = "https://github.com/U4RASD/dalla-data-processing"
 "Bug Tracker" = "https://github.com/U4RASD/dalla-data-processing/issues"
 
 [tool.setuptools]
-packages = ["dalla", "dalla.core", "dalla.deduplication", "dalla.stemming", "dalla.quality", "dalla.readability", "dalla.utils"]
+packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
 include-package-data = true
 
 [tool.setuptools.package-data]
-dalla = ["py.typed"]
-"dalla.stemming" = ["data/*.txt"]
-"dalla.deduplication" = ["bin/*", "onion/**/*"]
+dalla_data_processing = ["py.typed"]
+"dalla_data_processing.stemming" = ["data/*.txt"]
+"dalla_data_processing.deduplication" = ["bin/*", "onion/**/*"]
 
 [tool.ruff]
 line-length = 100
 target-version = "py312"
-src = ["dalla"]
+src = ["dalla_data_processing"]
 
 [tool.ruff.lint]
 select = [
@@ -96,11 +96,11 @@ skip-magic-trailing-comma = false
 line-ending = "auto"
 
 [tool.ruff.lint.isort]
-known-first-party = ["dalla"]
+known-first-party = ["dalla_data_processing"]
 
 [tool.ruff.lint.per-file-ignores]
-"dalla/deduplication/onion/**/*.py" = ["N", "SIM", "UP"]
-"dalla/stemming/__init__.py" = ["N802", "N806", "SIM102"]
+"dalla_data_processing/deduplication/onion/**/*.py" = ["N", "SIM", "UP"]
+"dalla_data_processing/stemming/__init__.py" = ["N802", "N806", "SIM102"]
 
 [tool.uv]
 dev-dependencies = [
diff --git a/scripts/build_onion.sh b/scripts/build_onion.sh
index e7a0f32..00dfef2 100755
--- a/scripts/build_onion.sh
+++ b/scripts/build_onion.sh
@@ -15,8 +15,8 @@ echo -e "${GREEN}=== Building Onion Binary ===${NC}"
 # Get script directory and project root
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
-ONION_SOURCE="$PROJECT_ROOT/dalla/deduplication/onion/src_sc"
-OUTPUT_DIR="$PROJECT_ROOT/dalla/deduplication/bin"
+ONION_SOURCE="$PROJECT_ROOT/dalla_data_processing/deduplication/onion/src_sc"
+OUTPUT_DIR="$PROJECT_ROOT/dalla_data_processing/deduplication/bin"
 
 # Check if source exists
 if [ ! -d "$ONION_SOURCE" ]; then