From 0fc9696d8f119881d07db3179de66a3b38c3d85a Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Wed, 9 Nov 2022 21:32:52 -0800 Subject: [PATCH 1/7] Added callback for mlperf logs --- CMakeLists.txt | 5 + include/lbann/callbacks/CMakeLists.txt | 1 + include/lbann/callbacks/mlperf_logging.hpp | 135 +++++++++ src/callbacks/CMakeLists.txt | 1 + src/callbacks/mlperf_logging.cpp | 324 +++++++++++++++++++++ src/proto/callbacks.proto | 11 +- src/proto/factories/callback_factory.cpp | 3 + 7 files changed, 477 insertions(+), 3 deletions(-) create mode 100644 include/lbann/callbacks/mlperf_logging.hpp create mode 100644 src/callbacks/mlperf_logging.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 21951fd7ea1..8a7bae7687f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -237,15 +237,19 @@ if (LBANN_WITH_DISTCONV) find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) set(LBANN_HAS_DISTCONV TRUE) set(LBANN_H2_LIBS + H2::H2Core H2::H2Meta H2::H2Patterns H2::H2DistConv) else () find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns) set(LBANN_H2_LIBS + H2::H2Core H2::H2Meta H2::H2Patterns) endif () +#FIXME(KLG): There is no H2CoreConfig.cmake in H2 +#find_package(H2Core REQUIRED) set(LBANN_HAS_DIHYDROGEN TRUE) message(STATUS "Found DiHydrogen: ${DiHydrogen_DIR}") @@ -660,6 +664,7 @@ target_link_libraries(lbann PUBLIC ${CLARA_LIBRARIES} ${LBANN_PYTHON_LIBS} protobuf::libprotobuf + spdlog::spdlog ${CEREAL_LIBRARIES} ZSTR::ZSTR) diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index e901d63831f..f000f6875a1 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -51,6 +51,7 @@ set_full_path(THIS_DIR_HEADERS learning_rate.hpp ltfb.hpp mixup.hpp + mlperf_logging.hpp monitor_io.hpp perturb_adam.hpp perturb_dropout.hpp diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp new file mode 100644 index 
00000000000..061823bba89 --- /dev/null +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -0,0 +1,135 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// mlperf_logging .hpp .cpp - Prints mlperf compliant benchmark logs +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED +#define LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include

+ +namespace lbann { +namespace callback { + +/** @class mlperf_logging + * @brief Callback to print mlperf compliant benchmark logs + */ +class mlperf_logging : public callback_base { + +public: + + enum class event_type { + TIME_POINT, + INT_START, + INT_END, + }; + +public: + + /** @brief mlperf_logging Constructor. + * @param output_filename Output filename (default = results.txt) + */ + mlperf_logging(std::string output_filename) + : callback_base(/*batch_interval=*/1), + m_output_filename{output_filename.size() ? + std::move(output_filename) : + std::string("results.txt")} + {} + + /** @brief Copy interface */ + mlperf_logging* copy() const override { + return new mlperf_logging(*this); + } + + /** @brief Return name of callback */ + std::string name() const override { return "mlperf_logging"; } + + /** @brief Push mlperf formatted log string to stream object. + * @param ostream os Stores log strings. + * @param event_type et Type of mlperf style event. + * @param string key Mlperf log key. + * @param T value Mlperf log value. + * @param char const* file Current file name. + * @param size_t line File line number. + * @param double epoch Current epoch number. + */ + template + void print(std::ostream& os, mlperf_logging::event_type et, std::string key, + T value, char const* file, size_t line, double epoch = -1) const; + + void setup(model *m) override; + void on_setup_end(model *m) override; + void on_epoch_begin(model *m) override; + void on_epoch_end(model *m) override; + void on_train_begin(model *m) override; + void on_train_end(model *m) override; + void on_batch_evaluate_begin(model *m) override; + void on_batch_evaluate_end(model *m) override; + +private: + + /** @brief Populate log with mlperf event type. + * @param ostream os Stores log string. + * @param event_type et Type of mlperf style event. + */ + void print_event_type(std::ostream& os, mlperf_logging::event_type et) const; + + /** @brief Populate log with value. 
+ * @param ostream os Stores log string. + * @param event_type et Mlperf log value. + */ + void print_value(std::ostream& os, double value) const; + void print_value(std::ostream& os, long value) const; + void print_value(std::ostream& os, size_t value) const; + void print_value(std::ostream& os, std::string value) const; + //FIXME: Always picks this function first + //template + //void print_value(std::ostream& os, T value) const; + + static size_t get_ms_since_epoch(); + +private: + + //FIXME: get logger to output file + /* @brief name of output file. Default = results.txt */ + std::string m_output_filename; + + //FIXME: Add custom logging tag + /* @brief DiHydrogen logger */ + h2::Logger m_logger; + +}; // class mlperf_logging + +std::unique_ptr +build_mlperf_logging_callback_from_pbuf( + const google::protobuf::Message& proto_msg, + const std::shared_ptr&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index a423c3eb315..8b6713f966d 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -52,6 +52,7 @@ set_full_path(THIS_DIR_SOURCES load_model.cpp ltfb.cpp mixup.cpp + mlperf_logging.cpp monitor_io.cpp perturb_adam.cpp perturb_dropout.cpp diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp new file mode 100644 index 00000000000..18244ec33f4 --- /dev/null +++ b/src/callbacks/mlperf_logging.cpp @@ -0,0 +1,324 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// mlperf_logging .hpp .cpp - Prints mlperf compliant benchmark logs +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/callbacks/mlperf_logging.hpp" +#include "lbann/metrics/metric.hpp" +#include "lbann/weights/weights.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +namespace lbann { +namespace callback { + +template +void mlperf_logging::print(std::ostream& os, mlperf_logging::event_type et, + std::string key, T value, char const* file, + size_t line, double epoch) const +{ + os << "{" + << "\"namespace\": \"\", " + << "\"time_ms\": " << get_ms_since_epoch() << ", " + << "\"event_type\": \""; + print_event_type(os, et); + + os << "\", " + << "\"key\": " << key << "\", " + << "\"value\": "; + print_value(os, value); + os << ", " + << "\"metadata\": {\"file\": \"" << file << "\", " + << "\"lineno\": " << line; + if(epoch < 0) + os << "}}\n"; + else + os << ", " << "\"epoch_num\": " << epoch << "}}\n"; +} + +void mlperf_logging::print_event_type(std::ostream& os, mlperf_logging::event_type et) const +{ + switch (et) { + case mlperf_logging::event_type::TIME_POINT: os << "POINT_IN_TIME"; break; + case mlperf_logging::event_type::INT_START: os << "INTERVAL_START"; break; + case mlperf_logging::event_type::INT_END: os << "INTERVAL_END"; 
break; + default: os << "INVALID_EVENT_TYPE"; break; + } +} + +void mlperf_logging::print_value(std::ostream& os, double value) const +{ + os << value; +} +void mlperf_logging::print_value(std::ostream& os, long value) const +{ + os << value; +} +void mlperf_logging::print_value(std::ostream& os, size_t value) const +{ + os << value; +} +void mlperf_logging::print_value(std::ostream& os, std::string value) const +{ + os << value; +} +/*template +void mlperf_logging::print_value(std::ostream& os, T value) const +{ + //FIXME: Should I push the value anyway? + os << "UNKNOWN_DATA_TYPE"; +} +*/ + +size_t mlperf_logging::get_ms_since_epoch() +{ + using namespace std::chrono; + return duration_cast< milliseconds >( + system_clock::now().time_since_epoch()).count(); +} + +//FIXME(KLG): There is no on_setup_begin. Can I steal this as a callback hook? +void mlperf_logging::setup(model *m) +{ + std::ostringstream os; + + //FIXME: What is this? + std::string value = "null"; + print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, + __FILE__, __LINE__); + + //FIXME: Make these user input vars + value = "oc20"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_benchmark", + value, __FILE__, __LINE__); + + value = "LBANN"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_org", + value, __FILE__, __LINE__); + + //FIXME: value = closed? + value = "closed"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_division", + value, __FILE__, __LINE__); + + //FIXME: value = onprem? + value = "onprem"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_status", + value, __FILE__, __LINE__); + + //FIXME: value = SUBMISSION_PLATFORM_PLACEHOLDER? 
+ value = "?"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", + value, __FILE__, __LINE__); + + value = "null"; + print(os, mlperf_logging::event_type::TIME_POINT, "init_start", value, + __FILE__, __LINE__); + + H2_INFO(os.str()); +} +void mlperf_logging::on_setup_end(model *m) +{ + std::ostringstream os; + lbann_comm *comm = m->get_comm(); + + //FIXME: num_trainers or world size? + print(os, mlperf_logging::event_type::TIME_POINT, "number_of_ranks", + static_cast(comm->get_num_trainers()), __FILE__, __LINE__); + + //FIXME + auto nodes = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "number_of_nodes", + static_cast(nodes), __FILE__, __LINE__); + + //FIXME + auto accelerators = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "accelerators_per_node", + static_cast(accelerators), __FILE__, __LINE__); + + //FIXME: From trainer.hpp? + auto seed = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "seed", + static_cast(seed), __FILE__, __LINE__); + + //FIXME: Add get_minibatch_size to model or metrics? 
+ auto batch_size = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "global_batch_size", + static_cast(batch_size), __FILE__, __LINE__); + + metric_statistics metrics; + auto samples = metrics.get_num_samples(); + print(os, mlperf_logging::event_type::TIME_POINT, "train_samples", + static_cast(samples), __FILE__, __LINE__); + + //FIXME + auto eval_samples = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "eval_samples", + static_cast(eval_samples), __FILE__, __LINE__); + + //FIXME: I couldn't get this to work + //auto* optimizer = m->get_weights().get_optimizer(); + std::string opt = "opt_name"; + print(os, mlperf_logging::event_type::TIME_POINT, "opt_name", + opt, __FILE__, __LINE__); + + //FIXME + auto opt_learning_rate = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "opt_base_learning_rate", + static_cast(opt_learning_rate), __FILE__, __LINE__); + + //FIXME + auto opt_warmup_steps = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_warmup_steps", + static_cast(opt_warmup_steps), + __FILE__, __LINE__); + + //FIXME + auto opt_warmup_factor = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_warmup_factor", + static_cast(opt_warmup_factor), + __FILE__, __LINE__); + + //FIXME + auto opt_decay_bound_steps = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_decay_boundary_steps", + static_cast(opt_decay_bound_steps), + __FILE__, __LINE__); + + //FIXME + auto opt_decay_factor = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_decay_factor", + static_cast(opt_decay_factor), + __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "init_stop", "null", + __FILE__, __LINE__); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_epoch_begin(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "epoch_start", 
"null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_epoch_end(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "epoch_stop", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_train_begin(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + //FIXME: run_start? Same time stamp as epoch 1 in results + print(os, mlperf_logging::event_type::INT_START, "run_start", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_train_end(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + //FIXME: run_stop? End of training? + print(os, mlperf_logging::event_type::INT_START, "run_stop", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_batch_evaluate_begin(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "eval_start", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_batch_evaluate_end(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "eval_stop", "null", + __FILE__, __LINE__, epoch); + + //FIXME + auto eval_error = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "eval_error", + static_cast(eval_error), __FILE__, + __LINE__, epoch); + + H2_INFO(os.str()); +} + +std::unique_ptr +build_mlperf_logging_callback_from_pbuf( + const google::protobuf::Message& proto_msg, + const std::shared_ptr&) +{ + const auto& params = + dynamic_cast(proto_msg); + return std::make_unique(params.output_filename()); 
+} +} // namespace callback +} // namespace lbann diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index 71fee45c9ac..79f7d164db8 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -86,7 +86,7 @@ message Callback { CallbackPerturbWeights perturb_weights = 52; CallbackExportOnnx export_onnx = 53; CallbackAlternateUpdates alternate_updates = 54; - } + CallbackMlperfLogging mlperf_logging = 55; message CallbackLTFB { int64 batch_interval = 1; @@ -433,8 +433,13 @@ message Callback { /** @brief Export trained model in onnx format */ message CallbackExportOnnx { - string output_filename = 1; // name of onnx output file - string debug_string_filename = 2; // print debug string to file + string output_filename = 1; // Name of onnx output file + string debug_string_filename = 2; // Print debug string to file + } + + /** @brief Prints mlperf compliant benchmark logs */ + message CallbackMlperfLogging { + string output_filename = 1; } message CallbackAlternateUpdates { diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 8302cb29840..2ee5a4847d7 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -55,6 +55,7 @@ #include "lbann/callbacks/load_model.hpp" #include "lbann/callbacks/ltfb.hpp" #include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/mlperf_logging.hpp" #include "lbann/callbacks/monitor_io.hpp" #include "lbann/callbacks/perturb_adam.hpp" #include "lbann/callbacks/perturb_dropout.hpp" @@ -162,6 +163,8 @@ void register_default_builders(factory_type& factory) factory.register_builder("CallbackMinibatchSchedule", build_minibatch_schedule_callback_from_pbuf); factory.register_builder("CallbackMixup", build_mixup_callback_from_pbuf); + factory.register_builder("CallbackMlperfLogging", + build_mlperf_logging_callback_from_pbuf); factory.register_builder( "CallbackOptimizerwiseAdaptiveLearningRate", 
build_optimizerwise_adaptive_learning_rate_callback_from_pbuf); From 579ac6b0a95368c9004697737637cc9b76fd7c65 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 22 Nov 2022 08:58:49 -0800 Subject: [PATCH 2/7] Added user input args to mlperf callback, moved mlperf data class out of separate class --- include/lbann/callbacks/mlperf_logging.hpp | 48 +++++---- src/callbacks/mlperf_logging.cpp | 107 +++++++++------------ src/proto/callbacks.proto | 7 +- 3 files changed, 84 insertions(+), 78 deletions(-) diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp index 061823bba89..708fb2f892a 100644 --- a/include/lbann/callbacks/mlperf_logging.hpp +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -53,11 +53,28 @@ class mlperf_logging : public callback_base { /** @brief mlperf_logging Constructor. * @param output_filename Output filename (default = results.txt) */ - mlperf_logging(std::string output_filename) + mlperf_logging(std::string output_filename, std::string sub_benchmark, + std::string sub_org, std::string sub_division, + std::string sub_status, std::string sub_platform) : callback_base(/*batch_interval=*/1), m_output_filename{output_filename.size() ? std::move(output_filename) : - std::string("results.txt")} + std::string("results.txt")}, + m_sub_benchmark{sub_benchmark.size() ? + std::move(sub_benchmark) : + std::string("UNKNOWN_SUBMISSION_BENCHMARK")}, + m_sub_org{sub_org.size() ? + std::move(sub_org) : + std::string("LBANN")}, + m_sub_division{sub_division.size() ? + std::move(sub_division) : + std::string("UNKNOWN_SUBMISSION_DIVISION")}, + m_sub_status{sub_status.size() ? + std::move(sub_status) : + std::string("UNKNOWN_SUBMISSION_STATUS")}, + m_sub_platform{sub_platform.size() ? 
+ std::move(sub_platform) : + std::string("UNKNOWN_SUBMISSION_PLATFORM")} {} /** @brief Copy interface */ @@ -69,7 +86,7 @@ class mlperf_logging : public callback_base { std::string name() const override { return "mlperf_logging"; } /** @brief Push mlperf formatted log string to stream object. - * @param ostream os Stores log strings. + * @param ostringstream os Stores log strings. * @param event_type et Type of mlperf style event. * @param string key Mlperf log key. * @param T value Mlperf log value. @@ -78,7 +95,7 @@ class mlperf_logging : public callback_base { * @param double epoch Current epoch number. */ template - void print(std::ostream& os, mlperf_logging::event_type et, std::string key, + void print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, T value, char const* file, size_t line, double epoch = -1) const; void setup(model *m) override; @@ -93,22 +110,15 @@ class mlperf_logging : public callback_base { private: /** @brief Populate log with mlperf event type. - * @param ostream os Stores log string. + * @param ostringstream os Stores log string. * @param event_type et Type of mlperf style event. */ - void print_event_type(std::ostream& os, mlperf_logging::event_type et) const; + void print_event_type(std::ostringstream& os, mlperf_logging::event_type et) const; /** @brief Populate log with value. - * @param ostream os Stores log string. + * @param ostringstream os Stores log string. * @param event_type et Mlperf log value. 
*/ - void print_value(std::ostream& os, double value) const; - void print_value(std::ostream& os, long value) const; - void print_value(std::ostream& os, size_t value) const; - void print_value(std::ostream& os, std::string value) const; - //FIXME: Always picks this function first - //template - //void print_value(std::ostream& os, T value) const; static size_t get_ms_since_epoch(); @@ -117,10 +127,14 @@ class mlperf_logging : public callback_base { //FIXME: get logger to output file /* @brief name of output file. Default = results.txt */ std::string m_output_filename; - - //FIXME: Add custom logging tag /* @brief DiHydrogen logger */ - h2::Logger m_logger; + h2::Logger m_logger{":::MLLOG", m_output_filename}; + std::string m_sub_benchmark; + std::string m_sub_org; + std::string m_sub_division; + std::string m_sub_status; + std::string m_sub_platform; + }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index 18244ec33f4..c48ed59837c 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -42,8 +42,36 @@ namespace lbann { namespace callback { +// FIXME Does this need an anon namespace since it's only in the cpp file? +void print_value(std::ostringstream& os, double value) +{ + os << value; +} +void print_value(std::ostringstream& os, long value) +{ + os << value; +} +void print_value(std::ostringstream& os, size_t value) +{ + os << value; +} +void print_value(std::ostringstream& os, std::string const& value) +{ + os << "\"" << value << "\""; +} +void print_value(std::ostringstream& os, char const* value) +{ + os << "\"" << value << "\""; +} template -void mlperf_logging::print(std::ostream& os, mlperf_logging::event_type et, +void print_value(std::ostringstream& os, T value) +{ + //FIXME: Should I push the value anyway? 
+ os << "UNKNOWN_DATA_TYPE"; +} + +template +void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, T value, char const* file, size_t line, double epoch) const { @@ -54,19 +82,22 @@ void mlperf_logging::print(std::ostream& os, mlperf_logging::event_type et, print_event_type(os, et); os << "\", " - << "\"key\": " << key << "\", " + << "\"key\": \"" << key << "\", " << "\"value\": "; print_value(os, value); os << ", " << "\"metadata\": {\"file\": \"" << file << "\", " << "\"lineno\": " << line; if(epoch < 0) - os << "}}\n"; + os << "}}"; else - os << ", " << "\"epoch_num\": " << epoch << "}}\n"; + os << ", " << "\"epoch_num\": " << epoch << "}}"; + + H2_INFO(os.str()); + os.flush(); } -void mlperf_logging::print_event_type(std::ostream& os, mlperf_logging::event_type et) const +void mlperf_logging::print_event_type(std::ostringstream& os, mlperf_logging::event_type et) const { switch (et) { case mlperf_logging::event_type::TIME_POINT: os << "POINT_IN_TIME"; break; @@ -76,30 +107,6 @@ void mlperf_logging::print_event_type(std::ostream& os, mlperf_logging::event_ty } } -void mlperf_logging::print_value(std::ostream& os, double value) const -{ - os << value; -} -void mlperf_logging::print_value(std::ostream& os, long value) const -{ - os << value; -} -void mlperf_logging::print_value(std::ostream& os, size_t value) const -{ - os << value; -} -void mlperf_logging::print_value(std::ostream& os, std::string value) const -{ - os << value; -} -/*template -void mlperf_logging::print_value(std::ostream& os, T value) const -{ - //FIXME: Should I push the value anyway? 
- os << "UNKNOWN_DATA_TYPE"; -} -*/ - size_t mlperf_logging::get_ms_since_epoch() { using namespace std::chrono; @@ -117,35 +124,24 @@ void mlperf_logging::setup(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, __FILE__, __LINE__); - //FIXME: Make these user input vars - value = "oc20"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_benchmark", - value, __FILE__, __LINE__); + m_sub_benchmark, __FILE__, __LINE__); - value = "LBANN"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_org", - value, __FILE__, __LINE__); + m_sub_org, __FILE__, __LINE__); - //FIXME: value = closed? - value = "closed"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_division", - value, __FILE__, __LINE__); + m_sub_division, __FILE__, __LINE__); - //FIXME: value = onprem? - value = "onprem"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_status", - value, __FILE__, __LINE__); + m_sub_status, __FILE__, __LINE__); - //FIXME: value = SUBMISSION_PLATFORM_PLACEHOLDER? 
- value = "?"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", - value, __FILE__, __LINE__); + m_sub_platform, __FILE__, __LINE__); value = "null"; print(os, mlperf_logging::event_type::TIME_POINT, "init_start", value, __FILE__, __LINE__); - - H2_INFO(os.str()); } void mlperf_logging::on_setup_end(model *m) { @@ -227,8 +223,6 @@ void mlperf_logging::on_setup_end(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "init_stop", "null", __FILE__, __LINE__); - - H2_INFO(os.str()); } void mlperf_logging::on_epoch_begin(model *m) @@ -239,8 +233,6 @@ void mlperf_logging::on_epoch_begin(model *m) print(os, mlperf_logging::event_type::INT_START, "epoch_start", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_epoch_end(model *m) @@ -251,8 +243,6 @@ void mlperf_logging::on_epoch_end(model *m) print(os, mlperf_logging::event_type::INT_START, "epoch_stop", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_train_begin(model *m) @@ -264,8 +254,6 @@ void mlperf_logging::on_train_begin(model *m) //FIXME: run_start? Same time stamp as epoch 1 in results print(os, mlperf_logging::event_type::INT_START, "run_start", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_train_end(model *m) @@ -277,8 +265,6 @@ void mlperf_logging::on_train_end(model *m) //FIXME: run_stop? End of training? 
print(os, mlperf_logging::event_type::INT_START, "run_stop", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_batch_evaluate_begin(model *m) @@ -289,8 +275,6 @@ void mlperf_logging::on_batch_evaluate_begin(model *m) print(os, mlperf_logging::event_type::INT_START, "eval_start", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_batch_evaluate_end(model *m) @@ -307,8 +291,6 @@ print(os, mlperf_logging::event_type::TIME_POINT, "eval_error", static_cast(eval_error), __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } std::unique_ptr @@ -318,7 +300,12 @@ build_mlperf_logging_callback_from_pbuf( { const auto& params = dynamic_cast(proto_msg); - return std::make_unique(params.output_filename()); + return std::make_unique(params.output_filename(), + params.sub_benchmark(), + params.sub_org(), + params.sub_division(), + params.sub_status(), + params.sub_platform()); } } // namespace callback } // namespace lbann diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index 79f7d164db8..ac05ec8af8a 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -439,7 +439,12 @@ message Callback { /** @brief Prints mlperf compliant benchmark logs */ message CallbackMlperfLogging { - string output_filename = 1; + string output_filename = 1; // Output filename + string sub_benchmark = 2; // FIXME(KLG): document these + string sub_org = 3; + string sub_division = 4; + string sub_status = 5; + string sub_platform = 6; } message CallbackAlternateUpdates { From c45a184ca30359877cef5250d614e3933227d450 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 29 Nov 2022 09:51:08 -0800 Subject: [PATCH 3/7] Added mlperf callback documentation, added values for optimizer --- docs/callbacks/mlperf_logging.rst | 69 ++++++++++ include/lbann/callbacks/mlperf_logging.hpp | 32 ++--- src/callbacks/mlperf_logging.cpp | 150 +++++++++++----------
src/proto/callbacks.proto | 11 +- 4 files changed, 167 insertions(+), 95 deletions(-) create mode 100644 docs/callbacks/mlperf_logging.rst diff --git a/docs/callbacks/mlperf_logging.rst b/docs/callbacks/mlperf_logging.rst new file mode 100644 index 00000000000..4fdbb4c50b0 --- /dev/null +++ b/docs/callbacks/mlperf_logging.rst @@ -0,0 +1,69 @@ +.. role:: python(code) + :language: python + +.. role:: c(code) + :language: c + +.. _mlperf-logging-callback: + +============================================================ +MLPerf Logging Callback +============================================================ + +The MLPerf callback exports an MLPerf compatible log for running +benchmarks on LBANN. The logging output is included in the out.log +file located in the LBANN run directory. + +--------------------------------------------- +Execution Points +--------------------------------------------- + ++ setup ++ on setup end ++ on epoch begin ++ on epoch end ++ on train begin ++ on train end ++ on batch evaluate begin ++ on batch evaluate end + +.. _callback-arguments: + +--------------------------------------------- +Callback Arguments +--------------------------------------------- + + .. note:: While technically optional, omitting arguments will + result in "UNKNOWN_" appearing in the log + results (with the exception of sub_org). + + :sub_benchmark: (``string``) Benchmark name. A list of benchmarks + can be found in the `MLPerf Benchmarks Suite + `_. + + :sub_org: (``string``, optional) Organization running the + benchmarks. Default: `LBANN`. + + :sub_division: (``string``) Closed or open division. 
See `Divisions - void print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, - T value, char const* file, size_t line, double epoch = -1) const; + void print(std::ostringstream& os, mlperf_logging::event_type et, + std::string key, T value, char const* file, size_t line, + double epoch = -1) const; void setup(model *m) override; void on_setup_end(model *m) override; @@ -124,16 +126,16 @@ class mlperf_logging : public callback_base { private: - //FIXME: get logger to output file - /* @brief name of output file. Default = results.txt */ - std::string m_output_filename; - /* @brief DiHydrogen logger */ - h2::Logger m_logger{":::MLLOG", m_output_filename}; std::string m_sub_benchmark; std::string m_sub_org; std::string m_sub_division; std::string m_sub_status; std::string m_sub_platform; + //FIXME: Include option to create separate file with just logging data? + /* @brief name of output file. Default = results.txt */ + //std::string m_output_filename; + /* @brief DiHydrogen logger */ + h2::Logger m_logger{":::MLLOG"}; }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index c48ed59837c..6dab3d99f7e 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -29,6 +29,7 @@ #include "lbann/callbacks/mlperf_logging.hpp" #include "lbann/metrics/metric.hpp" #include "lbann/weights/weights.hpp" +#include "lbann/trainers/trainer.hpp" #include @@ -42,7 +43,11 @@ namespace lbann { namespace callback { -// FIXME Does this need an anon namespace since it's only in the cpp file? +namespace { +void print_value(std::ostringstream& os, int value) +{ + os << value; +} void print_value(std::ostringstream& os, double value) { os << value; @@ -67,8 +72,9 @@ template void print_value(std::ostringstream& os, T value) { //FIXME: Should I push the value anyway? 
- os << "UNKNOWN_DATA_TYPE"; + os << "\"UNKNOWN_DATA_TYPE\""; } +}// namespace template void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, @@ -114,15 +120,14 @@ size_t mlperf_logging::get_ms_since_epoch() system_clock::now().time_since_epoch()).count(); } -//FIXME(KLG): There is no on_setup_begin. Can I steal this as a callback hook? void mlperf_logging::setup(model *m) { std::ostringstream os; - //FIXME: What is this? - std::string value = "null"; - print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, - __FILE__, __LINE__); + // Not a good/portable way to do this in C++ + // std::string value = "null"; + // print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, + // __FILE__, __LINE__); print(os, mlperf_logging::event_type::TIME_POINT, "submission_benchmark", m_sub_benchmark, __FILE__, __LINE__); @@ -139,89 +144,89 @@ void mlperf_logging::setup(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", m_sub_platform, __FILE__, __LINE__); - value = "null"; - print(os, mlperf_logging::event_type::TIME_POINT, "init_start", value, + //value = "null"; + print(os, mlperf_logging::event_type::INT_START, "init_start", "null", __FILE__, __LINE__); } void mlperf_logging::on_setup_end(model *m) { std::ostringstream os; lbann_comm *comm = m->get_comm(); + auto const& trainer = get_const_trainer(); - //FIXME: num_trainers or world size? 
print(os, mlperf_logging::event_type::TIME_POINT, "number_of_ranks", - static_cast(comm->get_num_trainers()), __FILE__, __LINE__); + static_cast(comm->get_procs_in_world()), __FILE__, __LINE__); //FIXME auto nodes = -1; print(os, mlperf_logging::event_type::TIME_POINT, "number_of_nodes", - static_cast(nodes), __FILE__, __LINE__); + static_cast(nodes), __FILE__, __LINE__); //FIXME auto accelerators = -1; print(os, mlperf_logging::event_type::TIME_POINT, "accelerators_per_node", - static_cast(accelerators), __FILE__, __LINE__); + static_cast(accelerators), __FILE__, __LINE__); - //FIXME: From trainer.hpp? - auto seed = -1; + auto const seed = trainer.get_random_seed(); print(os, mlperf_logging::event_type::TIME_POINT, "seed", - static_cast(seed), __FILE__, __LINE__); + seed, __FILE__, __LINE__); - //FIXME: Add get_minibatch_size to model or metrics? - auto batch_size = -1; + auto const& dc = trainer.get_data_coordinator(); + auto const batch_size = dc.get_global_mini_batch_size( + execution_mode::training); print(os, mlperf_logging::event_type::TIME_POINT, "global_batch_size", - static_cast(batch_size), __FILE__, __LINE__); + batch_size, __FILE__, __LINE__); - metric_statistics metrics; - auto samples = metrics.get_num_samples(); + auto samples = dc.get_total_num_samples(execution_mode::training); print(os, mlperf_logging::event_type::TIME_POINT, "train_samples", - static_cast(samples), __FILE__, __LINE__); + samples, __FILE__, __LINE__); - //FIXME - auto eval_samples = -1; + //FIXME: Should this be execution_mode::validation? 
Tom thinks no + auto eval_samples = dc.get_total_num_samples(execution_mode::testing); print(os, mlperf_logging::event_type::TIME_POINT, "eval_samples", - static_cast(eval_samples), __FILE__, __LINE__); - - //FIXME: I couldn't get this to work - //auto* optimizer = m->get_weights().get_optimizer(); - std::string opt = "opt_name"; - print(os, mlperf_logging::event_type::TIME_POINT, "opt_name", - opt, __FILE__, __LINE__); - - //FIXME - auto opt_learning_rate = -1; - print(os, mlperf_logging::event_type::TIME_POINT, "opt_base_learning_rate", - static_cast(opt_learning_rate), __FILE__, __LINE__); - - //FIXME - auto opt_warmup_steps = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_warmup_steps", - static_cast(opt_warmup_steps), - __FILE__, __LINE__); - - //FIXME - auto opt_warmup_factor = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_warmup_factor", - static_cast(opt_warmup_factor), - __FILE__, __LINE__); - - //FIXME - auto opt_decay_bound_steps = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_decay_boundary_steps", - static_cast(opt_decay_bound_steps), - __FILE__, __LINE__); - - //FIXME - auto opt_decay_factor = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_decay_factor", - static_cast(opt_decay_factor), - __FILE__, __LINE__); - - print(os, mlperf_logging::event_type::TIME_POINT, "init_stop", "null", + eval_samples, __FILE__, __LINE__); + + auto const weights = m->get_weights(); + for (auto const w : weights) + if( w->get_optimizer() != nullptr ){ + std::string opt = w->get_optimizer()->get_type(); + print(os, mlperf_logging::event_type::TIME_POINT, "opt_name", + opt, __FILE__, __LINE__); + + auto opt_learning_rate = w->get_optimizer()->get_learning_rate(); + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_base_learning_rate", static_cast(opt_learning_rate), + __FILE__, __LINE__); + break; + } + + // LBANN does not perform warmup steps. 
+ // auto opt_warmup_steps = -1; + // print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_warmup_steps", + // static_cast(opt_warmup_steps), + // __FILE__, __LINE__); + + // auto opt_warmup_factor = -1; + // print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_warmup_factor", + // static_cast(opt_warmup_factor), + // __FILE__, __LINE__); + + // FIXME (Tom's problem) + //auto opt_decay_bound_steps = -1; + //print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_decay_boundary_steps", + // static_cast(opt_decay_bound_steps), + // __FILE__, __LINE__); + + // auto opt_decay_factor = -1; + // print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_decay_factor", + // static_cast(opt_decay_factor), + // __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::INT_END, "init_stop", "null", __FILE__, __LINE__); } @@ -241,7 +246,7 @@ void mlperf_logging::on_epoch_end(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - print(os, mlperf_logging::event_type::INT_START, "epoch_stop", "null", + print(os, mlperf_logging::event_type::INT_END, "epoch_stop", "null", __FILE__, __LINE__, epoch); } @@ -251,7 +256,6 @@ void mlperf_logging::on_train_begin(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - //FIXME: run_start? Same time stamp as epoch 1 in results print(os, mlperf_logging::event_type::INT_START, "run_start", "null", __FILE__, __LINE__, epoch); } @@ -262,8 +266,7 @@ void mlperf_logging::on_train_end(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - //FIXME: run_stop? End of training? 
- print(os, mlperf_logging::event_type::INT_START, "run_stop", "null", + print(os, mlperf_logging::event_type::INT_END, "run_stop", "null", __FILE__, __LINE__, epoch); } @@ -283,10 +286,10 @@ void mlperf_logging::on_batch_evaluate_end(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - print(os, mlperf_logging::event_type::INT_START, "eval_stop", "null", + print(os, mlperf_logging::event_type::INT_END, "eval_stop", "null", __FILE__, __LINE__, epoch); - //FIXME + //FIXME (Tom's problem) auto eval_error = -1; print(os, mlperf_logging::event_type::TIME_POINT, "eval_error", static_cast(eval_error), __FILE__, @@ -304,8 +307,7 @@ build_mlperf_logging_callback_from_pbuf( params.sub_org(), params.sub_division(), params.sub_status(), - params.sub_platform(), - params.output_filename()); + params.sub_platform()); } } // namespace callback } // namespace lbann diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index ac05ec8af8a..1114dc10b14 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -439,12 +439,11 @@ message Callback { /** @brief Prints mlperf compliant benchmark logs */ message CallbackMlperfLogging { - string output_filename = 1; // Output filename - string sub_benchmark = 2; // FIXME(KLG): document these - string sub_org = 3; - string sub_division = 4; - string sub_status = 5; - string sub_platform = 6; + string sub_benchmark = 1; // Name of benchmark + string sub_org = 2; // Name of submission organization + string sub_division = 3; // Open or closed division + string sub_status = 4; // Submission status: onprem, cloud, or preview + string sub_platform = 5; // Submission platform/hardware } message CallbackAlternateUpdates { From d47e2257ae8674917da1e68204f9984927e1cc19 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 29 Nov 2022 23:47:49 -0800 Subject: [PATCH 4/7] added function to get num nodes --- src/callbacks/mlperf_logging.cpp | 32 +++++++++++++++++++++++--------- 1 file changed, 
23 insertions(+), 9 deletions(-) diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index 6dab3d99f7e..883404630c0 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -74,12 +74,29 @@ void print_value(std::ostringstream& os, T value) //FIXME: Should I push the value anyway? os << "\"UNKNOWN_DATA_TYPE\""; } + +//FIXME: Tom's problem +int get_real_num_accelerators() +{ + return 0; +} + +int get_num_nodes() +{ + if (std::getenv("SLURM_NNODES")) + return atoi(std::getenv("SLURM_NNODES")); + else if (std::getenv("FLUX_JOB_NNODES")) + return atoi(std::getenv("FLUX_JOB_NNODES")); + else return -1; + //FIXME: count number of unique hostnames in universe? +} }// namespace template -void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, - std::string key, T value, char const* file, - size_t line, double epoch) const +void mlperf_logging::print(std::ostringstream& os, + mlperf_logging::event_type et, std::string key, + T value, char const* file, size_t line, + double epoch) const { os << "{" << "\"namespace\": \"\", " @@ -144,7 +161,6 @@ void mlperf_logging::setup(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", m_sub_platform, __FILE__, __LINE__); - //value = "null"; print(os, mlperf_logging::event_type::INT_START, "init_start", "null", __FILE__, __LINE__); } @@ -157,13 +173,10 @@ void mlperf_logging::on_setup_end(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "number_of_ranks", static_cast(comm->get_procs_in_world()), __FILE__, __LINE__); - //FIXME - auto nodes = -1; print(os, mlperf_logging::event_type::TIME_POINT, "number_of_nodes", - static_cast(nodes), __FILE__, __LINE__); + static_cast(get_num_nodes()), __FILE__, __LINE__); - //FIXME - auto accelerators = -1; + auto accelerators = get_real_num_accelerators(); print(os, mlperf_logging::event_type::TIME_POINT, "accelerators_per_node", static_cast(accelerators), __FILE__, __LINE__); 
@@ -308,6 +321,7 @@ build_mlperf_logging_callback_from_pbuf( params.sub_division(), params.sub_status(), params.sub_platform()); + //params.num_nodes()); } } // namespace callback } // namespace lbann From 3e95495b4cd2003cd0f3b2084917fae6c3cf784f Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Mon, 27 Feb 2023 09:04:38 -0800 Subject: [PATCH 5/7] Updated mlperf_logging to be compliant with new H2 logger format --- include/lbann/callbacks/mlperf_logging.hpp | 3 +-- src/callbacks/mlperf_logging.cpp | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp index 2e19046033d..fd1097fc56c 100644 --- a/include/lbann/callbacks/mlperf_logging.hpp +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -131,11 +131,10 @@ class mlperf_logging : public callback_base { std::string m_sub_division; std::string m_sub_status; std::string m_sub_platform; - //FIXME: Include option to create separate file with just logging data? /* @brief name of output file. 
Default = results.txt */ //std::string m_output_filename; /* @brief DiHydrogen logger */ - h2::Logger m_logger{":::MLLOG"}; + h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"}; }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index 883404630c0..d842c0f711e 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -44,6 +44,7 @@ namespace lbann { namespace callback { namespace { + void print_value(std::ostringstream& os, int value) { os << value; @@ -116,7 +117,7 @@ void mlperf_logging::print(std::ostringstream& os, else os << ", " << "\"epoch_num\": " << epoch << "}}"; - H2_INFO(os.str()); + m_logger.get()->info(os.str()); os.flush(); } From 1d8937dbaa6d98d15c74558a266360a6054a9ec5 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 28 Feb 2023 15:11:44 -0800 Subject: [PATCH 6/7] fixed typo in callbacks.proto --- src/proto/callbacks.proto | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index 1114dc10b14..b75a19bb17d 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -86,7 +86,8 @@ message Callback { CallbackPerturbWeights perturb_weights = 52; CallbackExportOnnx export_onnx = 53; CallbackAlternateUpdates alternate_updates = 54; - CallbackMlperfLogging mlperf_logging = 55; + CallbackMlperfLogging mlperf_logging = 56; +} message CallbackLTFB { int64 batch_interval = 1; From 31326f9889e3e4db66b10c1649f5775251428cb2 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 21 Mar 2023 12:58:25 -0700 Subject: [PATCH 7/7] Resolved errors introduced by include-what-you-use and other various PRs --- include/lbann/callbacks/mlperf_logging.hpp | 8 +++++++- src/callbacks/mlperf_logging.cpp | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp index 
fd1097fc56c..ba246429365 100644 --- a/include/lbann/callbacks/mlperf_logging.hpp +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -32,6 +32,10 @@ #include "lbann/callbacks/callback.hpp" #include

+namespace lbann_data { +class Callback; +} + namespace lbann { namespace callback { @@ -111,6 +115,8 @@ class mlperf_logging : public callback_base { private: + void write_specific_proto(lbann_data::Callback& proto) const final; + /** @brief Populate log with mlperf event type. * @param ostringstream os Stores log string. * @param event_type et Type of mlperf style event. @@ -134,7 +140,7 @@ class mlperf_logging : public callback_base { /* @brief name of output file. Default = results.txt */ //std::string m_output_filename; /* @brief DiHydrogen logger */ - h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"}; + mutable h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"}; }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index d842c0f711e..adc752df35f 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -30,8 +30,12 @@ #include "lbann/metrics/metric.hpp" #include "lbann/weights/weights.hpp" #include "lbann/trainers/trainer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/data_coordinator/data_coordinator.hpp" +#include "lbann/execution_algorithms/sgd_execution_context.hpp" +#include "lbann/optimizers/optimizer.hpp" -#include +#include "lbann/proto/callbacks.pb.h" #include #include @@ -44,7 +48,6 @@ namespace lbann { namespace callback { namespace { - void print_value(std::ostringstream& os, int value) { os << value; @@ -93,6 +96,16 @@ int get_num_nodes() } }// namespace +void mlperf_logging::write_specific_proto(lbann_data::Callback& proto) const +{ + auto* msg = proto.mutable_mlperf_logging(); + msg->set_sub_benchmark(m_sub_benchmark); + msg->set_sub_org(m_sub_org); + msg->set_sub_division(m_sub_division); + msg->set_sub_status(m_sub_status); + msg->set_sub_platform(m_sub_platform); +} + template void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, @@ -117,7 +130,7 @@ void 
mlperf_logging::print(std::ostringstream& os, else os << ", " << "\"epoch_num\": " << epoch << "}}"; - m_logger.get()->info(os.str()); + m_logger.get().info(os.str()); os.flush(); }