From 333f0c1ef96b446a318a88674ae5ec0f0aad8132 Mon Sep 17 00:00:00 2001 From: Martin Dreier Date: Thu, 2 Apr 2020 11:36:31 +0200 Subject: [PATCH 1/3] Allow setting of file path when loading data Prevents call to download and preprocessing when data is already available. --- deeplearning2020/datasets.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/deeplearning2020/datasets.py b/deeplearning2020/datasets.py index 1acbddc..ee96fca 100644 --- a/deeplearning2020/datasets.py +++ b/deeplearning2020/datasets.py @@ -44,15 +44,16 @@ class ImageWoof: image_count: int = 0 list_ds: "_tf.data.Dataset" = None - def __init__(self, dataset: str) -> None: + def __init__(self, dataset: str, file_path: str = None) -> None: if dataset not in ["train", "val"]: raise ValueError("Dataset not found") - - file_path = tf.keras.utils.get_file( - origin="https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2-320.tgz", - fname="imagewoof", - untar=True, - ) + + if file_path == None: + file_path = tf.keras.utils.get_file( + origin="https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2-320.tgz", + fname="imagewoof", + untar=True, + ) self.data_dir = pathlib.Path(file_path + "2-320/" + dataset) print(self.data_dir) self.image_count = len(list(self.data_dir.glob("*/*.JPEG"))) @@ -80,12 +81,12 @@ def __init__(self, dataset: str) -> None: self.list_ds = tf.data.Dataset.list_files(str(self.data_dir / "*/*")) @classmethod - def train(cls: typing.Type[ImageWoofType]) -> ImageWoofType: - return cls("train") + def train(cls: typing.Type[ImageWoofType], data_dir: str = None) -> ImageWoofType: + return cls("train", data_dir) @classmethod - def validation(cls: typing.Type[ImageWoofType]) -> ImageWoofType: - return cls("val") + def validation(cls: typing.Type[ImageWoofType], data_dir: str = None) -> ImageWoofType: + return cls("val", data_dir) def map_class(self, raw_cls: str) -> str: return self.class_name_mapping[raw_cls] @@ -117,10 +118,11 @@ def 
wrapped_load_data(self) -> "_tf.data.Dataset": @classmethod def load_data( cls: typing.Type[ImageWoofType], + data_dir: str = None ) -> typing.Tuple["_tf.data.Dataset", "_tf.data.Dataset", np.ndarray]: - train_ds = cls.train() + train_ds = cls.train(data_dir) return ( train_ds.wrapped_load_data(), - cls.validation().wrapped_load_data(), + cls.validation(data_dir).wrapped_load_data(), train_ds.CLASS_NAMES, ) From 2c17d7b098d6bec5b63012d860ceca514834a03c Mon Sep 17 00:00:00 2001 From: Martin Dreier Date: Thu, 2 Apr 2020 13:23:33 +0200 Subject: [PATCH 2/3] Switch to TensorFlow IO GFile Library tensorflow.io.gfile uses TensorFlow's C layer to access data. This includes support for some file systems (e.g. for Google Cloud Storage) which are not supported by Python's OS libraries. --- deeplearning2020/datasets.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/deeplearning2020/datasets.py b/deeplearning2020/datasets.py index ee96fca..3acb0ec 100644 --- a/deeplearning2020/datasets.py +++ b/deeplearning2020/datasets.py @@ -40,7 +40,6 @@ class ImageWoof: BATCH_SIZE: int = 32 CLASS_NAMES: np.ndarray = None - data_dir: pathlib.Path image_count: int = 0 list_ds: "_tf.data.Dataset" = None @@ -54,13 +53,16 @@ def __init__(self, dataset: str, file_path: str = None) -> None: fname="imagewoof", untar=True, ) - self.data_dir = pathlib.Path(file_path + "2-320/" + dataset) - print(self.data_dir) - self.image_count = len(list(self.data_dir.glob("*/*.JPEG"))) + data_dir = file_path + "2-320/" + dataset + else: + data_dir = file_path + "/imagewoof2-320/" + dataset + print(data_dir) + # Might not work on *nix systems, see https://github.com/tensorflow/tensorflow/issues/20557 + self.image_count = len(list(tf.io.gfile.glob(data_dir + "/*/*.JPEG"))) print(f"Loaded {self.image_count} images") self.raw_class_names = [ - item.name for item in self.data_dir.glob("*") if item.name != "LICENSE.txt" + item for item in tf.io.gfile.listdir(data_dir) if item != 
"LICENSE.txt" ] self.raw_class_names.sort() @@ -78,7 +80,7 @@ def __init__(self, dataset: str, file_path: str = None) -> None: ) self.CLASS_NAMES = np.array([self.map_class(c) for c in self.raw_class_names]) - self.list_ds = tf.data.Dataset.list_files(str(self.data_dir / "*/*")) + self.list_ds = tf.data.Dataset.list_files(data_dir + "/*/*") @classmethod def train(cls: typing.Type[ImageWoofType], data_dir: str = None) -> ImageWoofType: From 8c877c7a6f7b47484dd048624fdb0e2215251cde Mon Sep 17 00:00:00 2001 From: Martin Dreier Date: Thu, 2 Apr 2020 15:01:13 +0200 Subject: [PATCH 3/3] Strip trailing slashes from paths Some file systems may add trailing slashes when listing folder contents --- deeplearning2020/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplearning2020/datasets.py b/deeplearning2020/datasets.py index 3acb0ec..a3f7958 100644 --- a/deeplearning2020/datasets.py +++ b/deeplearning2020/datasets.py @@ -62,7 +62,7 @@ def __init__(self, dataset: str, file_path: str = None) -> None: print(f"Loaded {self.image_count} images") self.raw_class_names = [ - item for item in tf.io.gfile.listdir(data_dir) if item != "LICENSE.txt" + item.strip("/") for item in tf.io.gfile.listdir(data_dir) if item != "LICENSE.txt" ] self.raw_class_names.sort()