From 7d8cafcd4b2a036aa5b298bd02903e7b2cf7d93a Mon Sep 17 00:00:00 2001
From: "Daniel Y.T. Kim"
Date: Tue, 1 Jul 2025 17:51:31 +0900
Subject: [PATCH 1/2] [fix]: ignore Unicode encoding errors when reading metadata.csv

---
 dataset_toolkits/download.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dataset_toolkits/download.py b/dataset_toolkits/download.py
index 36e684ff..1b45c232 100644
--- a/dataset_toolkits/download.py
+++ b/dataset_toolkits/download.py
@@ -27,7 +27,10 @@
     # get file list
     if not os.path.exists(os.path.join(opt.output_dir, 'metadata.csv')):
         raise ValueError('metadata.csv not found')
-    metadata = pd.read_csv(os.path.join(opt.output_dir, 'metadata.csv'))
+
+    with open(os.path.join(opt.output_dir, 'metadata.csv'), 'r', encoding='utf-8', errors='ignore') as f:
+        metadata = pd.read_csv(f)
+
     if opt.instances is None:
         if opt.filter_low_aesthetic_score is not None:
             metadata = metadata[metadata['aesthetic_score'] >= opt.filter_low_aesthetic_score]

From 353c6f80a64bde24657df67c9cfb47f47798a9a0 Mon Sep 17 00:00:00 2001
From: "Daniel Y.T. Kim"
Date: Thu, 3 Jul 2025 11:52:30 +0900
Subject: [PATCH 2/2] [fix]: ignore encoding error in writing csv file

---
 dataset_toolkits/download.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dataset_toolkits/download.py b/dataset_toolkits/download.py
index 1b45c232..8040bd40 100644
--- a/dataset_toolkits/download.py
+++ b/dataset_toolkits/download.py
@@ -52,4 +52,5 @@
 
     # process objects
     downloaded = dataset_utils.download(metadata, **opt)
-    downloaded.to_csv(os.path.join(opt.output_dir, f'downloaded_{opt.rank}.csv'), index=False)
+    with open(os.path.join(opt.output_dir, f'downloaded_{opt.rank}.csv'), 'w', encoding='utf-8', errors='ignore') as f:
+        downloaded.to_csv(f, index=False)
\ No newline at end of file