Commit 590e8da

Merge branch 'KernelTuner:master' into HIPbackend
2 parents 598ca39 + b3ff4cd commit 590e8da

File tree

16 files changed: +187 -114 lines changed


CHANGELOG.md

Lines changed: 10 additions & 0 deletions

@@ -4,6 +4,16 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [0.4.5] - 2023-06-01
+### Added
+- PMTObserver to measure power and energy on various platforms
+
+### Changed
+- Improved functionality for storing output and metadata files
+- Updated PowerSensorObserver to support PowerSensor3
+- Refactored internal interfaces of runners and backends
+- Bugfix in interface to set objective and optimization direction
+
 ## [0.4.4] - 2023-03-09
 ### Added
 - Support for using time_limit in simulation mode

MANIFEST.in

Lines changed: 1 addition & 0 deletions

@@ -1,3 +1,4 @@
 include setup.py
 include LICENSE
 include README.rst
+include kernel_tuner/schema/T4/1.0.0/*

doc/source/conf.py

Lines changed: 2 additions & 2 deletions

@@ -59,9 +59,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'0.4.4'
+version = u'0.4.5'
 # The full version, including alpha/beta/rc tags.
-release = u'0.4.4'
+release = u'0.4.5'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

doc/source/matrix_multiplication.ipynb

Lines changed: 7 additions & 2 deletions

@@ -314,6 +314,11 @@
 " int k, kb;\n",
 "\n",
 " float sum[tile_size_y][tile_size_x];\n",
+" for (int i=0; i < tile_size_y; i++) {\n",
+" for (int j=0; j < tile_size_x; j++) {\n",
+" sum[i][j] = 0.0;\n",
+" }\n",
+" }\n",
 "\n",
 " for (k = 0; k < WIDTH; k += block_size_x) {\n",
 "\n",
@@ -430,7 +435,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
@@ -444,7 +449,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.7.9"
+"version": "3.9.12"
 }
 },
 "nbformat": 4,
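The loops added to the notebook's kernel zero-initialize the per-thread accumulator tile before the reduction over k (reading an uninitialized `sum` array is undefined behavior in CUDA C). A standalone Python sketch of the same accumulation pattern, with made-up tile sizes:

```python
# Toy illustration of the accumulator pattern fixed in the notebook diff:
# a tile_size_y x tile_size_x tile of partial sums must be zeroed before
# accumulating products over the k dimension. Sizes and inputs are invented.
tile_size_y, tile_size_x, width = 2, 2, 4

a = [[1.0] * width for _ in range(tile_size_y)]        # tile of A rows
b = [[1.0] * tile_size_x for _ in range(width)]        # tile of B columns

# zero-initialize the accumulator tile (the step the diff adds)
sum_tile = [[0.0 for _ in range(tile_size_x)] for _ in range(tile_size_y)]

# accumulate over k, as the kernel's k-loop does per thread
for k in range(width):
    for i in range(tile_size_y):
        for j in range(tile_size_x):
            sum_tile[i][j] += a[i][k] * b[k][j]

print(sum_tile)  # each entry is 4.0: the sum of width = 4 products of 1.0
```

Without the zeroing step, each `+=` would fold whatever garbage was in the accumulator into the result, which is exactly the bug the notebook change guards against.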

doc/source/observers.rst

Lines changed: 10 additions & 3 deletions

@@ -12,7 +12,7 @@ observer will be called when the event takes place.
 
 Kernel Tuner implements an abstract BenchmarkObserver with methods that may be overwritten by classes extending
 the BenchmarkObserver class, shown below. The only mandatory method to implement
-is ``get\_results``, which is used to return the resulting observations at the end of benchmarking a
+is ``get_results``, which is used to return the resulting observations at the end of benchmarking a
 particular kernel configuration and usually returns aggregated results over multiple iterations of kernel
 execution. Before tuning starts, each observer is given a reference to the lower-level backend that is used for
 compiling and benchmarking the kernel configurations. In this way, the observer can inspect the compiled module,
@@ -53,7 +53,7 @@ the user to record power and/or energy consumption of kernel configurations duri
 Kernel Tuner to accurately determine the power and energy consumption of all kernel configurations it benchmarks
 during auto-tuning.
 
-.. autoclass:: kernel_tuner.observers.PowerSensorObserver
+.. autoclass:: kernel_tuner.observers.powersensor.PowerSensorObserver
 
 
 NVMLObserver
@@ -74,7 +74,7 @@ time it takes to benchmark different kernel configurations. However, NVML can be
 almost all Nvidia GPUs, so this method is much more accessible to end-users compared to solutions that require
 custom hardware, such as PowerSensor2.
 
-.. autoclass:: kernel_tuner.nvml.NVMLObserver
+.. autoclass:: kernel_tuner.observers.nvml.NVMLObserver
 
 
 Tuning execution parameters with NVML
@@ -101,7 +101,14 @@ the path where you are allowed to run nvidia-smi with privileges. This allows yo
 limits will be done through nvidia-smi.
 
 
+PMTObserver
+~~~~~~~~~~~
 
+The PMTObserver can be used to measure power and energy on various platforms including Nvidia Jetson, Nvidia NVML,
+the RAPL interface, AMD ROCm, and Xilinx. It requires PMT to be installed, as well as PMT's Python interface.
+More information about PMT can be found here: https://git.astron.nl/RD/pmt/
+
+.. autoclass:: kernel_tuner.observers.pmt.PMTObserver
 
 
 

examples/cuda/expdist.py

Lines changed: 3 additions & 2 deletions

@@ -4,6 +4,7 @@
 import numpy
 
 from kernel_tuner import tune_kernel
+from kernel_tuner import util
 
 def tune_expdist():
 
@@ -43,7 +44,7 @@ def tune_expdist():
         json.dump(kernel1, fp)
 
     #get the number of blocks used by the best configuration in the first kernel
-    best_config1 = min(kernel1[0], key=lambda x:x['time'])
+    best_config1 = util.get_best_config(kernel1[0], 'time')
     nblocks = numpy.int32( numpy.ceil(size / float(best_config1["block_size_x"]*best_config1["tile_size_x"])) *
                            numpy.ceil(size / float(best_config1["block_size_y"]*best_config1["tile_size_y"])) )
 
@@ -56,7 +57,7 @@ def tune_expdist():
     kernel2 = tune_kernel("reduce_cross_term", kernel_string, 1, arguments, tune_params,
                           grid_div_x=[], verbose=True)
 
-    best_config2 = min(kernel2[0], key=lambda x:x['time'])
+    best_config2 = util.get_best_config(kernel2[0], 'time')
     print("best GPU configuration, total time=", best_config1['time'] + best_config2['time'])
     print(best_config1)
    print(best_config2)
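The replaced lines picked the fastest configuration from the results list that `tune_kernel` returns by minimizing the `'time'` key; `util.get_best_config` wraps that selection. A self-contained sketch of the old pattern, using an invented toy results list in place of real tuning output:

```python
# Toy stand-in for the list of benchmark results returned by tune_kernel:
# each dict maps tunable parameters and the measured objective ('time').
# Values here are invented for illustration.
results = [
    {"block_size_x": 32,  "tile_size_x": 1, "time": 1.8},
    {"block_size_x": 64,  "tile_size_x": 2, "time": 0.9},
    {"block_size_x": 128, "tile_size_x": 4, "time": 1.2},
]

# the pattern the old code used: minimize the objective key directly
best_config = min(results, key=lambda x: x["time"])
print(best_config["block_size_x"])  # 64, the fastest configuration
```

Centralizing this in `util.get_best_config(results, objective)` lets the library also handle objectives that should be maximized rather than minimized, instead of hard-coding `min` at every call site.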

kernel_tuner/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from kernel_tuner.integration import store_results, create_device_targets
 from kernel_tuner.interface import tune_kernel, run_kernel
 
-__version__ = "0.4.4"
+__version__ = "0.4.5"

0 commit comments
