From f540a3ffea07d066fe5ed2373f9fccaa2e96338b Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 00:00:51 -0500 Subject: [PATCH 01/10] new method in MesaData.read_log_data to improve speed --- mesa_reader/__init__.py | 63 ++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index 4833cc4..32868f2 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -1,6 +1,7 @@ import os from os.path import join import re +from ast import literal_eval import numpy as np @@ -148,6 +149,32 @@ def __str__(self): return "MESA model # {:6}, t = {:20.10g} yr".format(model_number, age) except Exception: return "{}".format(self.file_name) + + + def _get_dtype(self, names, data) -> np.ndarray: + """Heuristic datatype determination using the first line of the log file.""" + if not hasattr(data, '__iter__'): + data = np.asarray([data]) + + types = [] + + for i, record in enumerate(data): + try: + record = literal_eval(record) + if type(record) == float: + types.append((names[i], 'float64')) + elif type(record) == int: + types.append((names[i],'int64')) + + except ValueError: + if record == "NaN": + types.append((names[i], 'float64')) + elif type(record) == str: + types.append((names[i], 'U128')) + + dtype = np.dtype(types) + + return dtype def read_data(self): """Decide if data file is log output or a model, then load the data @@ -196,23 +223,25 @@ def read_log_data(self): ------- None """ - self.bulk_data = np.genfromtxt( - self.file_name, - skip_header=MesaData.bulk_names_line - 1, - names=True, - ndmin=1, # Make sure a single entry is still a 1D array - dtype=None, - ) - self.bulk_names = self.bulk_data.dtype.names - header_data = [] - with open(self.file_name) as f: - for i, line in enumerate(f): - if i == MesaData.header_names_line - 1: - self.header_names = line.split() - elif i == MesaData.header_names_line: - header_data = [eval(datum) for datum in 
line.split()] - elif i > MesaData.header_names_line: - break + # attempting to speed up this process with some dirty tricks + with open(self.file_name, "r") as file: + + for _ in range(MesaData.header_names_line - 1): + file.readline() # skip 1st line + + self.header_names = file.readline().split(None, -1) + header_data = file.readline().split(None, -1) + + for _ in range(2): + file.readline() + + self.bulk_names = file.readline().split(None, -1) + data_elements = file.readline().split(None, -1) + + data_types = self.get_dtype(self.bulk_names, data_elements) + + self.bulk_data = np.loadtxt(file, dtype=data_types, skiprows=MesaData.bulk_names_line) + self.header_data = dict(zip(self.header_names, header_data)) self.remove_backups() From aeb127167b40c0d80ef896f20a3dc23649402301 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 00:04:50 -0500 Subject: [PATCH 02/10] fixed a missing underscore :( --- mesa_reader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index 32868f2..f5e5a84 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -238,7 +238,7 @@ def read_log_data(self): self.bulk_names = file.readline().split(None, -1) data_elements = file.readline().split(None, -1) - data_types = self.get_dtype(self.bulk_names, data_elements) + data_types = self._get_dtype(self.bulk_names, data_elements) self.bulk_data = np.loadtxt(file, dtype=data_types, skiprows=MesaData.bulk_names_line) From dbff63a278384fbfd366689b60e1a9d7234698bb Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 00:15:42 -0500 Subject: [PATCH 03/10] added dtype handling for logicals --- mesa_reader/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index f5e5a84..8aa60b3 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -169,6 +169,8 @@ def _get_dtype(self, names, data) -> np.ndarray: except ValueError: if 
record == "NaN": types.append((names[i], 'float64')) + if record.lower() in ["true", "false"]: + types.append((names[i], '?')) elif type(record) == str: types.append((names[i], 'U128')) From 5cf022b5c9ca13ed48586ac7e0439c60defc7795 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 01:50:14 -0500 Subject: [PATCH 04/10] fixed rewind file to read bulk_data --- mesa_reader/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index 8aa60b3..fcc3424 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -238,10 +238,12 @@ def read_log_data(self): file.readline() self.bulk_names = file.readline().split(None, -1) - data_elements = file.readline().split(None, -1) + data_elements = file.readline().split(None, -1) data_types = self._get_dtype(self.bulk_names, data_elements) + # rewind & read data + file.seek(0) self.bulk_data = np.loadtxt(file, dtype=data_types, skiprows=MesaData.bulk_names_line) self.header_data = dict(zip(self.header_names, header_data)) From c0bbacd3e41ce764c7c605b2d4e65a3eef5a1e71 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 03:54:38 -0500 Subject: [PATCH 05/10] switched to using numpy.fromfile method --- mesa_reader/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index fcc3424..a07c0f9 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -239,12 +239,17 @@ def read_log_data(self): self.bulk_names = file.readline().split(None, -1) + pos_0 = file.tell() data_elements = file.readline().split(None, -1) + pos_1 = file.tell() + pos_diff = pos_1 - pos_0 # length of data line 1 + data_types = self._get_dtype(self.bulk_names, data_elements) # rewind & read data - file.seek(0) - self.bulk_data = np.loadtxt(file, dtype=data_types, skiprows=MesaData.bulk_names_line) + file.seek(-pos_diff) + self.bulk_data = np.fromfile(file, 
dtype=data_types, sep=" ") + #self.bulk_data = np.loadtxt(file, dtype=data_types, skiprows=MesaData.bulk_names_line) self.header_data = dict(zip(self.header_names, header_data)) self.remove_backups() From a203eeb062cc8163b0564c1082c127519808d704 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 10:42:44 -0500 Subject: [PATCH 06/10] switched to use pandas.read_csv for extremely fast performance --- mesa_reader/__init__.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index a07c0f9..f25e73e 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -2,6 +2,7 @@ from os.path import join import re from ast import literal_eval +from pandas import read_csv import numpy as np @@ -225,7 +226,9 @@ def read_log_data(self): ------- None """ - # attempting to speed up this process with some dirty tricks + # I'm attempting to speed up this process with some dirty tricks + # Using pandas's read_csv function gives us c-like performance as + # opposed to genfromtxt's native (slow, icky) python with open(self.file_name, "r") as file: for _ in range(MesaData.header_names_line - 1): @@ -238,18 +241,18 @@ def read_log_data(self): file.readline() self.bulk_names = file.readline().split(None, -1) - - pos_0 = file.tell() data_elements = file.readline().split(None, -1) - pos_1 = file.tell() - pos_diff = pos_1 - pos_0 # length of data line 1 - data_types = self._get_dtype(self.bulk_names, data_elements) - # rewind & read data - file.seek(-pos_diff) - self.bulk_data = np.fromfile(file, dtype=data_types, sep=" ") - #self.bulk_data = np.loadtxt(file, dtype=data_types, skiprows=MesaData.bulk_names_line) + # rewind & read + with open(self.file_name, "r") as file: + for _ in range(MesaData.bulk_names_line - 1): + file.readline() + + _dataframe = read_csv(file, sep="\s+", dtype=None) + _records = _dataframe.to_records(index=False) + + self.bulk_data = np.array(_records, 
dtype=_records.dtype.descr) self.header_data = dict(zip(self.header_names, header_data)) self.remove_backups() From 46da20fcb90c7fc009cd78c3782fee9448594233 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 10:42:56 -0500 Subject: [PATCH 07/10] added pandas --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 58c6229..ed113e6 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ author_email='wolfwm@uwec.edu', license='MIT', packages=['mesa_reader'], - install_requires=['numpy'], + install_requires=['numpy', 'pandas'], classifiers=[ 'Programming Language :: Python :: 3', 'License :: OSI Approved :: MIT License', From d50e39e656ad6005f592282c0c113dd14682ba82 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 11:43:22 -0500 Subject: [PATCH 08/10] changed remove_backups to use vectorized ops --- mesa_reader/__init__.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index f25e73e..2d35e9f 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -2,7 +2,7 @@ from os.path import join import re from ast import literal_eval -from pandas import read_csv +from pandas import DataFrame, read_csv import numpy as np @@ -251,7 +251,7 @@ def read_log_data(self): _dataframe = read_csv(file, sep="\s+", dtype=None) _records = _dataframe.to_records(index=False) - + self.bulk_data = np.array(_records, dtype=_records.dtype.descr) self.header_data = dict(zip(self.header_names, header_data)) @@ -731,18 +731,19 @@ def remove_backups(self, dbg=False): return None if dbg: print("Scrubbing history...") - to_remove = [] - for i in range(len(self.data("model_number")) - 1): - smallest_future = np.min(self.data("model_number")[i + 1 :]) - if self.data("model_number")[i] >= smallest_future: - to_remove.append(i) - if len(to_remove) == 0: + + model_numbers = DataFrame(self.data["model_number"]) + kept_indices = 
model_numbers.drop_duplicates(keep="last").index + + if len(model_numbers) - len(kept_indices) == 0: if dbg: print("Already clean!") - return None + return if dbg: - print("Removing {} lines.".format(len(to_remove))) - self.bulk_data = np.delete(self.bulk_data, to_remove) + print(f"Found {len(model_numbers) - len(kept_indices)} lines to remove.") + + self.bulk_data = self.bulk_data[kept_indices] + return def __getattr__(self, method_name): if self._any_version(method_name): From ac91c921b56278feef2e19b2fbb7d75c160dff62 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Fri, 3 Jan 2025 12:28:56 -0500 Subject: [PATCH 09/10] fixed a method call --- mesa_reader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index 2d35e9f..dd75992 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -732,7 +732,7 @@ def remove_backups(self, dbg=False): if dbg: print("Scrubbing history...") - model_numbers = DataFrame(self.data["model_number"]) + model_numbers = DataFrame(self.data("model_number")) kept_indices = model_numbers.drop_duplicates(keep="last").index if len(model_numbers) - len(kept_indices) == 0: From 00f6aa502eed108e4f9011c8baa767b9c2385632 Mon Sep 17 00:00:00 2001 From: d-maclean Date: Sat, 4 Jan 2025 11:28:59 -0500 Subject: [PATCH 10/10] cleaned up read_log_data and removed superfluous heuristic algorithm --- mesa_reader/__init__.py | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/mesa_reader/__init__.py b/mesa_reader/__init__.py index dd75992..17ff400 100644 --- a/mesa_reader/__init__.py +++ b/mesa_reader/__init__.py @@ -151,33 +151,6 @@ def __str__(self): except Exception: return "{}".format(self.file_name) - - def _get_dtype(self, names, data) -> np.ndarray: - """Heuristic datatype determination using the first line of the log file.""" - if not hasattr(data, '__iter__'): - data = np.asarray([data]) - - types = [] - - for i, 
record in enumerate(data): - try: - record = literal_eval(record) - if type(record) == float: - types.append((names[i], 'float64')) - elif type(record) == int: - types.append((names[i],'int64')) - - except ValueError: - if record == "NaN": - types.append((names[i], 'float64')) - if record.lower() in ["true", "false"]: - types.append((names[i], '?')) - elif type(record) == str: - types.append((names[i], 'U128')) - - dtype = np.dtype(types) - - return dtype def read_data(self): """Decide if data file is log output or a model, then load the data @@ -240,18 +213,10 @@ def read_log_data(self): for _ in range(2): file.readline() - self.bulk_names = file.readline().split(None, -1) - data_elements = file.readline().split(None, -1) - data_types = self._get_dtype(self.bulk_names, data_elements) - - # rewind & read - with open(self.file_name, "r") as file: - for _ in range(MesaData.bulk_names_line - 1): - file.readline() - _dataframe = read_csv(file, sep="\s+", dtype=None) _records = _dataframe.to_records(index=False) + self.bulk_names = _dataframe.columns.values self.bulk_data = np.array(_records, dtype=_records.dtype.descr) self.header_data = dict(zip(self.header_names, header_data))