From 09c39ac9181c5c00ea6be7e8a23b57b45d2beb91 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Wed, 13 Mar 2024 17:15:41 +0100 Subject: [PATCH 1/8] Avoid df and str copies and unneeded iterables --- reView/utils/bespoke.py | 52 ++++++++++++++++++------------- reView/utils/characterizations.py | 15 +++------ reView/utils/functions.py | 45 +++++++++++++------------- 3 files changed, 58 insertions(+), 54 deletions(-) diff --git a/reView/utils/bespoke.py b/reView/utils/bespoke.py index b1ca674..fa7da3a 100644 --- a/reView/utils/bespoke.py +++ b/reView/utils/bespoke.py @@ -87,11 +87,7 @@ def to_wgs(self, rdf): self.src_crs, always_xy=True ) - lons, lats = transformer.transform(xs, ys) - rdf["longitude"] = lons - rdf["latitude"] = lats - del rdf["x"] - del rdf["y"] + rdf[['longitude', 'latitude']] = transformer.transform(xs, ys) rdf = rdf[self.df.columns] return rdf @@ -179,7 +175,8 @@ def batch_unpack_from_supply_curve(sc_df, n_workers=1): Parameters ---------- sc_df : pd.core.frame.DataFrame - A reV supply curve pandas data frame. + A reV supply curve pandas data frame. This will get modified in + place. n_workers : int Number of workers to use for parallel processing. Default is 1 which will run in serial (and will be slow). 
@@ -193,8 +190,7 @@ def batch_unpack_from_supply_curve(sc_df, n_workers=1): """ # cap nb_workers to the total CPUs on the machine/node - if n_workers > cpu_count(): - n_workers = cpu_count() + n_workers = min(cpu_count(), n_workers) if n_workers > 1: # initialize functionality for parallela dataframe.apply @@ -202,43 +198,55 @@ def batch_unpack_from_supply_curve(sc_df, n_workers=1): progress_bar=True, nb_workers=n_workers, use_memory_fs=False) # filter out supply curve points with no capacity (i.e., no turbines) - sc_developable_df = sc_df[sc_df['capacity'] > 0].copy() + sc_df = sc_df[sc_df['capacity'] > 0] # reset the index because otherwise the unpacker will get messed up - sc_developable_df.reset_index(drop=True, inplace=True) + sc_df.reset_index(drop=True, inplace=True) # unpack the turbine coordinates if n_workers > 1: # run in parallel - all_turbines = sc_developable_df.parallel_apply( + all_turbines = sc_df.parallel_apply( lambda row: BespokeUnpacker( - sc_developable_df, + sc_df, sc_point_gid=row['sc_point_gid'] ).unpack_turbines(drop_sc_points=True), axis=1 ) else: # run in serial - all_turbines = sc_developable_df.apply( + all_turbines = sc_df.apply( lambda row: BespokeUnpacker( - sc_developable_df, + sc_df, sc_point_gid=row['sc_point_gid'] ).unpack_turbines(drop_sc_points=True), axis=1 ) # stack the results back into a single df - all_turbines_df = pd.concat(all_turbines.tolist()) + all_turbines_df = pd.concat(all_turbines.values) # extract the geometries - all_turbines_df['geometry'] = all_turbines_df.apply( - lambda row: geometry.Point( - row['longitude'], - row['latitude'] - ), - axis=1 - ) + if n_workers > 1: + # run in parallel + all_turbines_df['geometry'] = all_turbines_df.parallel_apply( + lambda row: geometry.Point( + row['longitude'], + row['latitude'] + ), + axis=1 + ) + else: + # run in serial + all_turbines_df['geometry'] = all_turbines_df.apply( + lambda row: geometry.Point( + row['longitude'], + row['latitude'] + ), + axis=1 + ) + # turn 
into a geodataframe all_turbines_gdf = gpd.GeoDataFrame(all_turbines_df, crs='EPSG:4326') diff --git a/reView/utils/characterizations.py b/reView/utils/characterizations.py index 81c3c23..29fbbb7 100644 --- a/reView/utils/characterizations.py +++ b/reView/utils/characterizations.py @@ -72,16 +72,13 @@ def recast_categories(df, col, lkup, cell_size_sq_km): col_df = pd.DataFrame(col_data) col_df.fillna(0, inplace=True) col_df.drop( - columns=[c for c in col_df.columns if c not in lkup.keys()], + columns=[c for c in col_df.columns if c not in lkup], inplace=True ) col_df.rename(columns=lkup, inplace=True) if cell_size_sq_km is not None: col_df *= cell_size_sq_km - col_df.rename( - columns={c: f"{c}_area_sq_km" for c in col_df.columns}, - inplace=True - ) + col_df.columns += "_area_sq_km" col_df.index = df.index @@ -216,14 +213,12 @@ def validate_characterization_remapper( # noqa: C901 parameters are encountered in characterization_remapper. """ - characterization_cols = list(characterization_remapper.keys()) - df_cols = supply_curve_df.columns.tolist() - cols_not_in_df = list(set(characterization_cols).difference(set(df_cols))) - if len(cols_not_in_df) > 0: + if any(key not in df.columns for key in characterization_remapper): + keys = [key not in df.columns for key in characterization_remapper] raise KeyError( "Invalid column name(s) in characterization_remapper. " "The following column name(s) were not found in the input " - f"dataframe: {cols_not_in_df}." + f"dataframe: {keys}." 
) for col_name, col_remapper in characterization_remapper.items(): diff --git a/reView/utils/functions.py b/reView/utils/functions.py index 635c6ab..b047046 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -34,7 +34,18 @@ logger = logging.getLogger(__name__) - +_trans_table_1 = str.maketrans({",": None, "$": None, "%": None}) +_trans_table_2 = str.maketrans({ + "-": "_", + " ": "_", + "/": "_", + "$": "usd", + "?": None, + "(": None, + ")": None, + "%": "pct", + "&": "and" + }) TIME_PATTERN = "%Y-%m-%d %H:%M:%S+00:00" @@ -90,7 +101,7 @@ def as_float(value): Input string value represented as a float. """ if isinstance(value, str): - value = value.replace(",", "").replace("$", "").replace("%", "") + value = value.translate(_trans_table_1) value = float(value) return value @@ -684,32 +695,20 @@ def to_geo(df, dst, layer): if "index" in df: del df["index"] + replace_columns = False + new_columns = [] # Remove or rename columns - replacements = { - "-": "_", - " ": "_", - "/": "_", - "$": "usd", - "?": "", - "(": "", - ")": "", - "%": "pct", - "&": "and" - } for col in df.columns: # Remove columns that start with numbers if is_int(col[0]): del df[col] print(col) - # This happens when you save the index - if "Unnamed:" in col: + elif "Unnamed:" in col: del df[col] else: # Remove unnacceptable characters - ncol = col - for char, repl in replacements.items(): - ncol = ncol.replace(char, repl) + ncol = col.translate(_trans_table_2) # Lower case just because ncol = ncol.lower() @@ -722,9 +721,12 @@ def to_geo(df, dst, layer): # npart2 = ncol.split("_")[0] # ncol = "_".join([npart1, npart2]) - # Rename column + + new_columns.append(ncol) if col != ncol: - df = df.rename({col: ncol}, axis=1) + replace_columns = True + if replace_columns: + df.columns = new_columns # Create fields and set types fields = [] @@ -761,8 +763,7 @@ def to_geo(df, dst, layer): lat = row["latitude"] lon = row["longitude"] wkb = point_to_gpkg_point(header, lon, lat) - values = 
list(row.values) - values.insert(0, wkb) + values = [wkb, *row.values] rows.append(values) # Finally insert rows From a3079dd0c72c1029e58c4393bd8484f90cf41b69 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Wed, 13 Mar 2024 17:36:59 +0100 Subject: [PATCH 2/8] Put comment above block --- reView/utils/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reView/utils/functions.py b/reView/utils/functions.py index b047046..b90fa91 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -695,9 +695,9 @@ def to_geo(df, dst, layer): if "index" in df: del df["index"] + # Remove or rename columns replace_columns = False new_columns = [] - # Remove or rename columns for col in df.columns: # Remove columns that start with numbers if is_int(col[0]): From e4f436f7fa1f5730d2abb4ed05700f78f83e4e51 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Thu, 14 Mar 2024 00:38:58 +0100 Subject: [PATCH 3/8] Revert to using str.replace --- reView/utils/functions.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/reView/utils/functions.py b/reView/utils/functions.py index b90fa91..9d1a7df 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -34,18 +34,6 @@ logger = logging.getLogger(__name__) -_trans_table_1 = str.maketrans({",": None, "$": None, "%": None}) -_trans_table_2 = str.maketrans({ - "-": "_", - " ": "_", - "/": "_", - "$": "usd", - "?": None, - "(": None, - ")": None, - "%": "pct", - "&": "and" - }) TIME_PATTERN = "%Y-%m-%d %H:%M:%S+00:00" @@ -101,7 +89,7 @@ def as_float(value): Input string value represented as a float. 
""" if isinstance(value, str): - value = value.translate(_trans_table_1) + value = value.replace(",", "").replace("$", "").replace("%", "") value = float(value) return value @@ -696,6 +684,17 @@ def to_geo(df, dst, layer): del df["index"] # Remove or rename columns + replacements = { + "-": "_", + " ": "_", + "/": "_", + "$": "usd", + "?": "", + "(": "", + ")": "", + "%": "pct", + "&": "and" + } replace_columns = False new_columns = [] for col in df.columns: @@ -703,12 +702,15 @@ def to_geo(df, dst, layer): if is_int(col[0]): del df[col] print(col) + # This happens when you save the index elif "Unnamed:" in col: del df[col] else: - # Remove unnacceptable characters - ncol = col.translate(_trans_table_2) + # Remove unacceptable characters + ncol = col + for char, repl in replacements.items(): + ncol = ncol.replace(char, repl) # Lower case just because ncol = ncol.lower() @@ -721,10 +723,10 @@ def to_geo(df, dst, layer): # npart2 = ncol.split("_")[0] # ncol = "_".join([npart1, npart2]) - new_columns.append(ncol) if col != ncol: replace_columns = True + if replace_columns: df.columns = new_columns From 626c5fffee21997e2de690cc588a330e04e39aed Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 19 Mar 2024 13:24:23 +0100 Subject: [PATCH 4/8] Refactor read_timeseries to reuse more code --- reView/utils/functions.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/reView/utils/functions.py b/reView/utils/functions.py index 9d1a7df..f244ba7 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -485,12 +485,6 @@ def read_timeseries(file, gids=None, nsteps=None): if "bespoke" not in str(file): # Break down time entries time = [t.decode() for t in ds["time_index"][:nsteps]] - dtime = [dt.datetime.strptime(t, TIME_PATTERN) for t in time] - minutes = [t.minute for t in dtime] - hours = [t.hour for t in dtime] - days = [t.timetuple().tm_yday for t in dtime] - weeks = [t.isocalendar().week for t in dtime] - months 
= [t.month for t in dtime] # Process generation data cf = ds["rep_profiles_0"][:nsteps, idx] @@ -524,15 +518,16 @@ def read_timeseries(file, gids=None, nsteps=None): # This will only take the average across the year time = [t.decode() for t in time] - dtime = [dt.datetime.strptime(t, TIME_PATTERN) for t in time] - days = [t.timetuple().tm_yday for t in dtime] - weeks = [t.isocalendar().week for t in dtime] - months = [t.month for t in dtime] - hours = [t.hour for t in dtime] - minutes = [t.minute for t in dtime] ds.close() + dtime = [dt.datetime.strptime(t, TIME_PATTERN) for t in time] + minutes = [t.minute for t in dtime] + hours = [t.hour for t in dtime] + days = [t.timetuple().tm_yday for t in dtime] + weeks = [t.isocalendar().week for t in dtime] + months = [t.month for t in dtime] + data = pd.DataFrame({ "time": time, "minute": minutes, From b922a5db2ce33b3623d2d1930815eccd006f3d1e Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Fri, 22 Mar 2024 18:10:24 +0100 Subject: [PATCH 5/8] Fix bad var reference --- reView/utils/characterizations.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reView/utils/characterizations.py b/reView/utils/characterizations.py index 29fbbb7..3fa97fa 100644 --- a/reView/utils/characterizations.py +++ b/reView/utils/characterizations.py @@ -213,8 +213,10 @@ def validate_characterization_remapper( # noqa: C901 parameters are encountered in characterization_remapper. """ - if any(key not in df.columns for key in characterization_remapper): - keys = [key not in df.columns for key in characterization_remapper] + if any(key not in supply_curve_df.columns + for key in characterization_remapper): + keys = [key not in supply_curve_df.columns + for key in characterization_remapper] raise KeyError( "Invalid column name(s) in characterization_remapper. 
" "The following column name(s) were not found in the input " From 7367cab61fb136f6a2b3e6e7fce238916cc39b8c Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 16 Apr 2024 18:08:58 +0200 Subject: [PATCH 6/8] Fix to_wgs, optimize unpack_turbines --- reView/utils/bespoke.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/reView/utils/bespoke.py b/reView/utils/bespoke.py index fa7da3a..1dd32de 100644 --- a/reView/utils/bespoke.py +++ b/reView/utils/bespoke.py @@ -87,7 +87,10 @@ def to_wgs(self, rdf): self.src_crs, always_xy=True ) - rdf[['longitude', 'latitude']] = transformer.transform(xs, ys) + lons, lats = transformer.transform(xs, ys) + rdf["x"] = lons + rdf["y"] = lats + rdf.rename(columns={'x': 'longitude', 'y': 'latitude'}, inplace=True) rdf = rdf[self.df.columns] return rdf @@ -106,8 +109,7 @@ def unpack_turbines(self, drop_sc_points=False): # Get coordinates from equal area projection x, y = self.get_xy(row) - del row["longitude"] - del row["latitude"] + row.drop(['longitude', 'latitude'], inplace=True) # Get bottom left coordinates blx = x - (self.spacing / 2) @@ -119,28 +121,21 @@ def unpack_turbines(self, drop_sc_points=False): xs = [x + blx for x in xs] ys = [y + bly for y in ys] - # Build new data frame entries for each turbine - nrows = [] - # use len(xs) to determine number of turbines because # nturbines does not appear to be a standard column turbine_capacity_mw = row['capacity'] / len(xs) - for i, x in enumerate(xs): - nrow = row.copy() - # overwrite existing capacity column (which is typically system - # capacity in mw) with turbine capacity in kw for this turbine row. 
- # This maintains compatibility with how capacity is summed and - # displayed in the dashboard - nrow["capacity"] = turbine_capacity_mw - nrow["x"] = x - nrow["y"] = ys[i] - nrows.append(nrow) - - # Build new data frame - rdf = pd.DataFrame(nrows) - rdf = rdf.reset_index(drop=True) - rdf.index = df.index[-1] + rdf.index + 1 + # Build new data frame with a row for each turbine + new_index = range(df.index[-1] + 1, df.index[-1] + 1 + len(xs)) + rdf = pd.DataFrame([row]*len(xs), index=new_index) + + # overwrite existing capacity column (which is typically system + # capacity in mw) with turbine capacity in MW for this turbine row. + # This maintains compatibility with how capacity is summed and + # displayed in the dashboard + rdf['capacity'] = turbine_capacity_mw + rdf['x'] = xs + rdf['y'] = ys # Convert back to WGS84 rdf = self.to_wgs(rdf) @@ -150,7 +145,7 @@ # Replace the original row with one of the new rows. df.iloc[self.index] = rdf.iloc[-1] - rdf = rdf.iloc[:-1] + rdf.drop(rdf.index[-1], inplace=True) df = pd.concat([df, rdf]) return df From e860b2a13b6ad185ac4f5fba5fdccce3970d5b92 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 16 Apr 2024 18:36:49 +0200 Subject: [PATCH 7/8] Use inplace=True where possible with reset_index() --- reView/pages/rev/controller/element_builders.py | 2 +- reView/pages/rev/model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/reView/pages/rev/controller/element_builders.py b/reView/pages/rev/controller/element_builders.py index da5614b..ade2458 100644 --- a/reView/pages/rev/controller/element_builders.py +++ b/reView/pages/rev/controller/element_builders.py @@ -570,7 +570,7 @@ def _distributions(self, data, y_var, nbins=100): pdf = count / sum(count) cdf = np.cumsum(pdf) df = pd.DataFrame({y_var: cbins, "cdf": cdf, "pdf": pdf}) - df = df.reset_index() + df.reset_index(inplace=True) return df def _histogram(self, main_df, y_var, bins): diff --git
a/reView/pages/rev/model.py b/reView/pages/rev/model.py index 2a183da..118abea 100644 --- a/reView/pages/rev/model.py +++ b/reView/pages/rev/model.py @@ -545,7 +545,7 @@ def composite(dfs, composite_variable="total_lcoe", """Return a single least cost df from a list dfs.""" # Make one big data frame bdf = pd.concat(dfs) - bdf = bdf.reset_index(drop=True) + bdf.reset_index(drop=True, inplace=True) # Group, find minimum, and subset if composite_function == "min": From adfedc0832cc9505a643a1c7bc84fc6c3470f199 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 16 Apr 2024 18:38:11 +0200 Subject: [PATCH 8/8] Remove unneeded .copy() call --- reView/utils/characterizations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/reView/utils/characterizations.py b/reView/utils/characterizations.py index 3fa97fa..575dfab 100644 --- a/reView/utils/characterizations.py +++ b/reView/utils/characterizations.py @@ -182,8 +182,6 @@ def unpack_characterizations( # noqa: C901 elif method is None: warnings.warn(f"Skipping {char_col}: No method provided") - in_df = in_df.copy() - return in_df