From 09c39ac9181c5c00ea6be7e8a23b57b45d2beb91 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Wed, 13 Mar 2024 17:15:41 +0100 Subject: [PATCH 1/8] Avoid df and str copies and unneeded iterables --- reView/utils/bespoke.py | 52 ++++++++++++++++++------------- reView/utils/characterizations.py | 15 +++------ reView/utils/functions.py | 45 +++++++++++++------------- 3 files changed, 58 insertions(+), 54 deletions(-) diff --git a/reView/utils/bespoke.py b/reView/utils/bespoke.py index b1ca674..fa7da3a 100644 --- a/reView/utils/bespoke.py +++ b/reView/utils/bespoke.py @@ -87,11 +87,7 @@ def to_wgs(self, rdf): self.src_crs, always_xy=True ) - lons, lats = transformer.transform(xs, ys) - rdf["longitude"] = lons - rdf["latitude"] = lats - del rdf["x"] - del rdf["y"] + rdf[['longitude', 'latitude']] = transformer.transform(xs, ys) rdf = rdf[self.df.columns] return rdf @@ -179,7 +175,8 @@ def batch_unpack_from_supply_curve(sc_df, n_workers=1): Parameters ---------- sc_df : pd.core.frame.DataFrame - A reV supply curve pandas data frame. + A reV supply curve pandas data frame. This will get modified in + place. n_workers : int Number of workers to use for parallel processing. Default is 1 which will run in serial (and will be slow). 
@@ -193,8 +190,7 @@ def batch_unpack_from_supply_curve(sc_df, n_workers=1): """ # cap nb_workers to the total CPUs on the machine/node - if n_workers > cpu_count(): - n_workers = cpu_count() + n_workers = min(cpu_count(), n_workers) if n_workers > 1: # initialize functionality for parallela dataframe.apply @@ -202,43 +198,55 @@ def batch_unpack_from_supply_curve(sc_df, n_workers=1): progress_bar=True, nb_workers=n_workers, use_memory_fs=False) # filter out supply curve points with no capacity (i.e., no turbines) - sc_developable_df = sc_df[sc_df['capacity'] > 0].copy() + sc_df = sc_df[sc_df['capacity'] > 0] # reset the index because otherwise the unpacker will get messed up - sc_developable_df.reset_index(drop=True, inplace=True) + sc_df.reset_index(drop=True, inplace=True) # unpack the turbine coordinates if n_workers > 1: # run in parallel - all_turbines = sc_developable_df.parallel_apply( + all_turbines = sc_df.parallel_apply( lambda row: BespokeUnpacker( - sc_developable_df, + sc_df, sc_point_gid=row['sc_point_gid'] ).unpack_turbines(drop_sc_points=True), axis=1 ) else: # run in serial - all_turbines = sc_developable_df.apply( + all_turbines = sc_df.apply( lambda row: BespokeUnpacker( - sc_developable_df, + sc_df, sc_point_gid=row['sc_point_gid'] ).unpack_turbines(drop_sc_points=True), axis=1 ) # stack the results back into a single df - all_turbines_df = pd.concat(all_turbines.tolist()) + all_turbines_df = pd.concat(all_turbines.values) # extract the geometries - all_turbines_df['geometry'] = all_turbines_df.apply( - lambda row: geometry.Point( - row['longitude'], - row['latitude'] - ), - axis=1 - ) + if n_workers > 1: + # run in parallel + all_turbines_df['geometry'] = all_turbines_df.parallel_apply( + lambda row: geometry.Point( + row['longitude'], + row['latitude'] + ), + axis=1 + ) + else: + # run in serial + all_turbines_df['geometry'] = all_turbines_df.apply( + lambda row: geometry.Point( + row['longitude'], + row['latitude'] + ), + axis=1 + ) + # turn 
into a geodataframe all_turbines_gdf = gpd.GeoDataFrame(all_turbines_df, crs='EPSG:4326') diff --git a/reView/utils/characterizations.py b/reView/utils/characterizations.py index 81c3c23..29fbbb7 100644 --- a/reView/utils/characterizations.py +++ b/reView/utils/characterizations.py @@ -72,16 +72,13 @@ def recast_categories(df, col, lkup, cell_size_sq_km): col_df = pd.DataFrame(col_data) col_df.fillna(0, inplace=True) col_df.drop( - columns=[c for c in col_df.columns if c not in lkup.keys()], + columns=[c for c in col_df.columns if c not in lkup], inplace=True ) col_df.rename(columns=lkup, inplace=True) if cell_size_sq_km is not None: col_df *= cell_size_sq_km - col_df.rename( - columns={c: f"{c}_area_sq_km" for c in col_df.columns}, - inplace=True - ) + col_df.columns += "_area_sq_km" col_df.index = df.index @@ -216,14 +213,12 @@ def validate_characterization_remapper( # noqa: C901 parameters are encountered in characterization_remapper. """ - characterization_cols = list(characterization_remapper.keys()) - df_cols = supply_curve_df.columns.tolist() - cols_not_in_df = list(set(characterization_cols).difference(set(df_cols))) - if len(cols_not_in_df) > 0: + if any(key not in df.columns for key in characterization_remapper): + keys = [key not in df.columns for key in characterization_remapper] raise KeyError( "Invalid column name(s) in characterization_remapper. " "The following column name(s) were not found in the input " - f"dataframe: {cols_not_in_df}." + f"dataframe: {keys}." 
) for col_name, col_remapper in characterization_remapper.items(): diff --git a/reView/utils/functions.py b/reView/utils/functions.py index 635c6ab..b047046 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -34,7 +34,18 @@ logger = logging.getLogger(__name__) - +_trans_table_1 = str.maketrans({",": None, "$": None, "%": None}) +_trans_table_2 = str.maketrans({ + "-": "_", + " ": "_", + "/": "_", + "$": "usd", + "?": None, + "(": None, + ")": None, + "%": "pct", + "&": "and" + }) TIME_PATTERN = "%Y-%m-%d %H:%M:%S+00:00" @@ -90,7 +101,7 @@ def as_float(value): Input string value represented as a float. """ if isinstance(value, str): - value = value.replace(",", "").replace("$", "").replace("%", "") + value = value.translate(_trans_table_1) value = float(value) return value @@ -684,32 +695,20 @@ def to_geo(df, dst, layer): if "index" in df: del df["index"] + replace_columns = False + new_columns = [] # Remove or rename columns - replacements = { - "-": "_", - " ": "_", - "/": "_", - "$": "usd", - "?": "", - "(": "", - ")": "", - "%": "pct", - "&": "and" - } for col in df.columns: # Remove columns that start with numbers if is_int(col[0]): del df[col] print(col) - # This happens when you save the index - if "Unnamed:" in col: + elif "Unnamed:" in col: del df[col] else: # Remove unnacceptable characters - ncol = col - for char, repl in replacements.items(): - ncol = ncol.replace(char, repl) + ncol = col.translate(_trans_table_2) # Lower case just because ncol = ncol.lower() @@ -722,9 +721,12 @@ def to_geo(df, dst, layer): # npart2 = ncol.split("_")[0] # ncol = "_".join([npart1, npart2]) - # Rename column + + new_columns.append(ncol) if col != ncol: - df = df.rename({col: ncol}, axis=1) + replace_columns = True + if replace_columns: + df.columns = new_columns # Create fields and set types fields = [] @@ -761,8 +763,7 @@ def to_geo(df, dst, layer): lat = row["latitude"] lon = row["longitude"] wkb = point_to_gpkg_point(header, lon, lat) - values = 
list(row.values) - values.insert(0, wkb) + values = [wkb, *row.values] rows.append(values) # Finally insert rows From a3079dd0c72c1029e58c4393bd8484f90cf41b69 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Wed, 13 Mar 2024 17:36:59 +0100 Subject: [PATCH 2/8] Put comment above block --- reView/utils/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reView/utils/functions.py b/reView/utils/functions.py index b047046..b90fa91 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -695,9 +695,9 @@ def to_geo(df, dst, layer): if "index" in df: del df["index"] + # Remove or rename columns replace_columns = False new_columns = [] - # Remove or rename columns for col in df.columns: # Remove columns that start with numbers if is_int(col[0]): From e4f436f7fa1f5730d2abb4ed05700f78f83e4e51 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Thu, 14 Mar 2024 00:38:58 +0100 Subject: [PATCH 3/8] Revert to using str.replace --- reView/utils/functions.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/reView/utils/functions.py b/reView/utils/functions.py index b90fa91..9d1a7df 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -34,18 +34,6 @@ logger = logging.getLogger(__name__) -_trans_table_1 = str.maketrans({",": None, "$": None, "%": None}) -_trans_table_2 = str.maketrans({ - "-": "_", - " ": "_", - "/": "_", - "$": "usd", - "?": None, - "(": None, - ")": None, - "%": "pct", - "&": "and" - }) TIME_PATTERN = "%Y-%m-%d %H:%M:%S+00:00" @@ -101,7 +89,7 @@ def as_float(value): Input string value represented as a float. 
""" if isinstance(value, str): - value = value.translate(_trans_table_1) + value = value.replace(",", "").replace("$", "").replace("%", "") value = float(value) return value @@ -696,6 +684,17 @@ def to_geo(df, dst, layer): del df["index"] # Remove or rename columns + replacements = { + "-": "_", + " ": "_", + "/": "_", + "$": "usd", + "?": "", + "(": "", + ")": "", + "%": "pct", + "&": "and" + } replace_columns = False new_columns = [] for col in df.columns: @@ -703,12 +702,15 @@ def to_geo(df, dst, layer): if is_int(col[0]): del df[col] print(col) + # This happens when you save the index elif "Unnamed:" in col: del df[col] else: - # Remove unnacceptable characters - ncol = col.translate(_trans_table_2) + # Remove unacceptable characters + ncol = col + for char, repl in replacements.items(): + ncol = ncol.replace(char, repl) # Lower case just because ncol = ncol.lower() @@ -721,10 +723,10 @@ def to_geo(df, dst, layer): # npart2 = ncol.split("_")[0] # ncol = "_".join([npart1, npart2]) - new_columns.append(ncol) if col != ncol: replace_columns = True + if replace_columns: df.columns = new_columns From 626c5fffee21997e2de690cc588a330e04e39aed Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 19 Mar 2024 13:24:23 +0100 Subject: [PATCH 4/8] Refactor read_timeseries to reuse more code --- reView/utils/functions.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/reView/utils/functions.py b/reView/utils/functions.py index 9d1a7df..f244ba7 100644 --- a/reView/utils/functions.py +++ b/reView/utils/functions.py @@ -485,12 +485,6 @@ def read_timeseries(file, gids=None, nsteps=None): if "bespoke" not in str(file): # Break down time entries time = [t.decode() for t in ds["time_index"][:nsteps]] - dtime = [dt.datetime.strptime(t, TIME_PATTERN) for t in time] - minutes = [t.minute for t in dtime] - hours = [t.hour for t in dtime] - days = [t.timetuple().tm_yday for t in dtime] - weeks = [t.isocalendar().week for t in dtime] - months 
= [t.month for t in dtime] # Process generation data cf = ds["rep_profiles_0"][:nsteps, idx] @@ -524,15 +518,16 @@ def read_timeseries(file, gids=None, nsteps=None): # This will only take the average across the year time = [t.decode() for t in time] - dtime = [dt.datetime.strptime(t, TIME_PATTERN) for t in time] - days = [t.timetuple().tm_yday for t in dtime] - weeks = [t.isocalendar().week for t in dtime] - months = [t.month for t in dtime] - hours = [t.hour for t in dtime] - minutes = [t.minute for t in dtime] ds.close() + dtime = [dt.datetime.strptime(t, TIME_PATTERN) for t in time] + minutes = [t.minute for t in dtime] + hours = [t.hour for t in dtime] + days = [t.timetuple().tm_yday for t in dtime] + weeks = [t.isocalendar().week for t in dtime] + months = [t.month for t in dtime] + data = pd.DataFrame({ "time": time, "minute": minutes, From b922a5db2ce33b3623d2d1930815eccd006f3d1e Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Fri, 22 Mar 2024 18:10:24 +0100 Subject: [PATCH 5/8] Fix bad var reference --- reView/utils/characterizations.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reView/utils/characterizations.py b/reView/utils/characterizations.py index 29fbbb7..3fa97fa 100644 --- a/reView/utils/characterizations.py +++ b/reView/utils/characterizations.py @@ -213,8 +213,10 @@ def validate_characterization_remapper( # noqa: C901 parameters are encountered in characterization_remapper. """ - if any(key not in df.columns for key in characterization_remapper): - keys = [key not in df.columns for key in characterization_remapper] + if any(key not in supply_curve_df.columns + for key in characterization_remapper): + keys = [key not in supply_curve_df.columns + for key in characterization_remapper] raise KeyError( "Invalid column name(s) in characterization_remapper. 
" "The following column name(s) were not found in the input " From 7367cab61fb136f6a2b3e6e7fce238916cc39b8c Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 16 Apr 2024 18:08:58 +0200 Subject: [PATCH 6/8] Fix to_wgs, optimize unpack_turbines --- reView/utils/bespoke.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/reView/utils/bespoke.py b/reView/utils/bespoke.py index fa7da3a..1dd32de 100644 --- a/reView/utils/bespoke.py +++ b/reView/utils/bespoke.py @@ -87,7 +87,10 @@ def to_wgs(self, rdf): self.src_crs, always_xy=True ) - rdf[['longitude', 'latitude']] = transformer.transform(xs, ys) + lons, lats = transformer.transform(xs, ys) + rdf["x"] = lons + rdf["y"] = lats + rdf.rename(columns={'x': 'longitude', 'y': 'latitude'}, inplace=True) rdf = rdf[self.df.columns] return rdf @@ -106,8 +109,7 @@ def unpack_turbines(self, drop_sc_points=False): # Get coordinates from equal area projection x, y = self.get_xy(row) - del row["longitude"] - del row["latitude"] + row.drop(['longitude', 'latitude'], inplace=True) # Get bottom left coordinates blx = x - (self.spacing / 2) @@ -119,28 +121,21 @@ def unpack_turbines(self, drop_sc_points=False): xs = [x + blx for x in xs] ys = [y + bly for y in ys] - # Build new data frame entries for each turbine - nrows = [] - # use len(xs) to determine number of turbines because # nturbines does not appear to be a standard column turbine_capacity_mw = row['capacity'] / len(xs) - for i, x in enumerate(xs): - nrow = row.copy() - # overwrite existing capacity column (which is typically system - # capacity in mw) with turbine capacity in kw for this turbine row. 
- # This maintains compatibility with how capacity is summed and - # displayed in the dashboard - nrow["capacity"] = turbine_capacity_mw - nrow["x"] = x - nrow["y"] = ys[i] - nrows.append(nrow) - - # Build new data frame - rdf = pd.DataFrame(nrows) - rdf = rdf.reset_index(drop=True) - rdf.index = df.index[-1] + rdf.index + 1 + # Build new data frame with a row for each turbine + new_index = range(df.index[-1] + 1, df.index[-1] + 1 + len(xs)) + rdf = pd.DataFrame([row]*len(xs), index=new_index) + + # overwrite existing capacity column (which is typically system + # capacity in mw) with turbine capacity in MW for this turbine row. + # This maintains compatibility with how capacity is summed and + # displayed in the dashboard + rdf['capacity'] = turbine_capacity_mw + rdf['x'] = xs + rdf['y'] = ys # Convert back to WGS84 rdf = self.to_wgs(rdf) @@ -150,7 +145,7 @@ # Replace the original row with one of the new rows. df.iloc[self.index] = rdf.iloc[-1] - rdf = rdf.iloc[:-1] + rdf.drop(rdf.index[-1], inplace=True) df = pd.concat([df, rdf]) return df From e860b2a13b6ad185ac4f5fba5fdccce3970d5b92 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 16 Apr 2024 18:36:49 +0200 Subject: [PATCH 7/8] Use inplace=True where possible with reset_index() --- reView/pages/rev/controller/element_builders.py | 2 +- reView/pages/rev/model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/reView/pages/rev/controller/element_builders.py b/reView/pages/rev/controller/element_builders.py index da5614b..ade2458 100644 --- a/reView/pages/rev/controller/element_builders.py +++ b/reView/pages/rev/controller/element_builders.py @@ -570,7 +570,7 @@ def _distributions(self, data, y_var, nbins=100): pdf = count / sum(count) cdf = np.cumsum(pdf) df = pd.DataFrame({y_var: cbins, "cdf": cdf, "pdf": pdf}) - df = df.reset_index() + df.reset_index(inplace=True) return df def _histogram(self, main_df, y_var, bins): diff --git
a/reView/pages/rev/model.py b/reView/pages/rev/model.py index 2a183da..118abea 100644 --- a/reView/pages/rev/model.py +++ b/reView/pages/rev/model.py @@ -545,7 +545,7 @@ def composite(dfs, composite_variable="total_lcoe", """Return a single least cost df from a list dfs.""" # Make one big data frame bdf = pd.concat(dfs) - bdf = bdf.reset_index(drop=True) + bdf.reset_index(drop=True, inplace=True) # Group, find minimum, and subset if composite_function == "min": From adfedc0832cc9505a643a1c7bc84fc6c3470f199 Mon Sep 17 00:00:00 2001 From: Alexander Morgan Date: Tue, 16 Apr 2024 18:38:11 +0200 Subject: [PATCH 8/8] Remove unneeded .copy() call --- reView/utils/characterizations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/reView/utils/characterizations.py b/reView/utils/characterizations.py index 3fa97fa..575dfab 100644 --- a/reView/utils/characterizations.py +++ b/reView/utils/characterizations.py @@ -182,8 +182,6 @@ def unpack_characterizations( # noqa: C901 elif method is None: warnings.warn(f"Skipping {char_col}: No method provided") - in_df = in_df.copy() - return in_df