From 0fc9696d8f119881d07db3179de66a3b38c3d85a Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Wed, 9 Nov 2022 21:32:52 -0800 Subject: [PATCH 1/7] Added callback for mlperf logs --- CMakeLists.txt | 5 + include/lbann/callbacks/CMakeLists.txt | 1 + include/lbann/callbacks/mlperf_logging.hpp | 135 +++++++++ src/callbacks/CMakeLists.txt | 1 + src/callbacks/mlperf_logging.cpp | 324 +++++++++++++++++++++ src/proto/callbacks.proto | 11 +- src/proto/factories/callback_factory.cpp | 3 + 7 files changed, 477 insertions(+), 3 deletions(-) create mode 100644 include/lbann/callbacks/mlperf_logging.hpp create mode 100644 src/callbacks/mlperf_logging.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 21951fd7ea1..8a7bae7687f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -237,15 +237,19 @@ if (LBANN_WITH_DISTCONV) find_package(DiHydrogen 0.3.0 CONFIG REQUIRED COMPONENTS Meta Patterns DistConv) set(LBANN_HAS_DISTCONV TRUE) set(LBANN_H2_LIBS + H2::H2Core H2::H2Meta H2::H2Patterns H2::H2DistConv) else () find_package(DiHydrogen CONFIG REQUIRED COMPONENTS Meta Patterns) set(LBANN_H2_LIBS + H2::H2Core H2::H2Meta H2::H2Patterns) endif () +#FIXME(KLG): There is no H2CoreConfig.cmake in H2 +#find_package(H2Core REQUIRED) set(LBANN_HAS_DIHYDROGEN TRUE) message(STATUS "Found DiHydrogen: ${DiHydrogen_DIR}") @@ -660,6 +664,7 @@ target_link_libraries(lbann PUBLIC ${CLARA_LIBRARIES} ${LBANN_PYTHON_LIBS} protobuf::libprotobuf + spdlog::spdlog ${CEREAL_LIBRARIES} ZSTR::ZSTR) diff --git a/include/lbann/callbacks/CMakeLists.txt b/include/lbann/callbacks/CMakeLists.txt index e901d63831f..f000f6875a1 100644 --- a/include/lbann/callbacks/CMakeLists.txt +++ b/include/lbann/callbacks/CMakeLists.txt @@ -51,6 +51,7 @@ set_full_path(THIS_DIR_HEADERS learning_rate.hpp ltfb.hpp mixup.hpp + mlperf_logging.hpp monitor_io.hpp perturb_adam.hpp perturb_dropout.hpp diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp new file mode 100644 index 
00000000000..061823bba89 --- /dev/null +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -0,0 +1,135 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// mlperf_logging .hpp .cpp - Prints mlperf compliant benchmark logs +//////////////////////////////////////////////////////////////////////////////// + +#ifndef LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED +#define LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED + +#include "lbann/callbacks/callback.hpp" +#include

+ +namespace lbann { +namespace callback { + +/** @class mlperf_logging + * @brief Callback to print mlperf compliant benchmark logs + */ +class mlperf_logging : public callback_base { + +public: + + enum class event_type { + TIME_POINT, + INT_START, + INT_END, + }; + +public: + + /** @brief mlperf_logging Constructor. + * @param output_filename Output filename (default = results.txt) + */ + mlperf_logging(std::string output_filename) + : callback_base(/*batch_interval=*/1), + m_output_filename{output_filename.size() ? + std::move(output_filename) : + std::string("results.txt")} + {} + + /** @brief Copy interface */ + mlperf_logging* copy() const override { + return new mlperf_logging(*this); + } + + /** @brief Return name of callback */ + std::string name() const override { return "mlperf_logging"; } + + /** @brief Push mlperf formatted log string to stream object. + * @param ostream os Stores log strings. + * @param event_type et Type of mlperf style event. + * @param string key Mlperf log key. + * @param T value Mlperf log value. + * @param char const* file Current file name. + * @param size_t line File line number. + * @param double epoch Current epoch number. + */ + template + void print(std::ostream& os, mlperf_logging::event_type et, std::string key, + T value, char const* file, size_t line, double epoch = -1) const; + + void setup(model *m) override; + void on_setup_end(model *m) override; + void on_epoch_begin(model *m) override; + void on_epoch_end(model *m) override; + void on_train_begin(model *m) override; + void on_train_end(model *m) override; + void on_batch_evaluate_begin(model *m) override; + void on_batch_evaluate_end(model *m) override; + +private: + + /** @brief Populate log with mlperf event type. + * @param ostream os Stores log string. + * @param event_type et Type of mlperf style event. + */ + void print_event_type(std::ostream& os, mlperf_logging::event_type et) const; + + /** @brief Populate log with value. 
+ * @param ostream os Stores log string. + * @param event_type et Mlperf log value. + */ + void print_value(std::ostream& os, double value) const; + void print_value(std::ostream& os, long value) const; + void print_value(std::ostream& os, size_t value) const; + void print_value(std::ostream& os, std::string value) const; + //FIXME: Always picks this function first + //template + //void print_value(std::ostream& os, T value) const; + + static size_t get_ms_since_epoch(); + +private: + + //FIXME: get logger to output file + /* @brief name of output file. Default = results.txt */ + std::string m_output_filename; + + //FIXME: Add custom logging tag + /* @brief DiHydrogen logger */ + h2::Logger m_logger; + +}; // class mlperf_logging + +std::unique_ptr +build_mlperf_logging_callback_from_pbuf( + const google::protobuf::Message& proto_msg, + const std::shared_ptr&); + +} // namespace callback +} // namespace lbann + +#endif // LBANN_CALLBACKS_MLPERF_LOGGING_HPP_INCLUDED diff --git a/src/callbacks/CMakeLists.txt b/src/callbacks/CMakeLists.txt index a423c3eb315..8b6713f966d 100644 --- a/src/callbacks/CMakeLists.txt +++ b/src/callbacks/CMakeLists.txt @@ -52,6 +52,7 @@ set_full_path(THIS_DIR_SOURCES load_model.cpp ltfb.cpp mixup.cpp + mlperf_logging.cpp monitor_io.cpp perturb_adam.cpp perturb_dropout.cpp diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp new file mode 100644 index 00000000000..18244ec33f4 --- /dev/null +++ b/src/callbacks/mlperf_logging.cpp @@ -0,0 +1,324 @@ +//////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC. +// Produced at the Lawrence Livermore National Laboratory. +// Written by the LBANN Research Team (B. Van Essen, et al.) listed in +// the CONTRIBUTORS file. +// +// LLNL-CODE-697807. +// All rights reserved. +// +// This file is part of LBANN: Livermore Big Artificial Neural Network +// Toolkit. 
For details, see http://software.llnl.gov/LBANN or +// https://github.com/LLNL/LBANN. +// +// Licensed under the Apache License, Version 2.0 (the "Licensee"); you +// may not use this file except in compliance with the License. You may +// obtain a copy of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the license. +// +// mlperf_logging .hpp .cpp - Prints mlperf compliant benchmark logs +//////////////////////////////////////////////////////////////////////////////// + +#include "lbann/callbacks/mlperf_logging.hpp" +#include "lbann/metrics/metric.hpp" +#include "lbann/weights/weights.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +namespace lbann { +namespace callback { + +template +void mlperf_logging::print(std::ostream& os, mlperf_logging::event_type et, + std::string key, T value, char const* file, + size_t line, double epoch) const +{ + os << "{" + << "\"namespace\": \"\", " + << "\"time_ms\": " << get_ms_since_epoch() << ", " + << "\"event_type\": \""; + print_event_type(os, et); + + os << "\", " + << "\"key\": " << key << "\", " + << "\"value\": "; + print_value(os, value); + os << ", " + << "\"metadata\": {\"file\": \"" << file << "\", " + << "\"lineno\": " << line; + if(epoch < 0) + os << "}}\n"; + else + os << ", " << "\"epoch_num\": " << epoch << "}}\n"; +} + +void mlperf_logging::print_event_type(std::ostream& os, mlperf_logging::event_type et) const +{ + switch (et) { + case mlperf_logging::event_type::TIME_POINT: os << "POINT_IN_TIME"; break; + case mlperf_logging::event_type::INT_START: os << "INTERVAL_START"; break; + case mlperf_logging::event_type::INT_END: os << "INTERVAL_END"; 
break; + default: os << "INVALID_EVENT_TYPE"; break; + } +} + +void mlperf_logging::print_value(std::ostream& os, double value) const +{ + os << value; +} +void mlperf_logging::print_value(std::ostream& os, long value) const +{ + os << value; +} +void mlperf_logging::print_value(std::ostream& os, size_t value) const +{ + os << value; +} +void mlperf_logging::print_value(std::ostream& os, std::string value) const +{ + os << value; +} +/*template +void mlperf_logging::print_value(std::ostream& os, T value) const +{ + //FIXME: Should I push the value anyway? + os << "UNKNOWN_DATA_TYPE"; +} +*/ + +size_t mlperf_logging::get_ms_since_epoch() +{ + using namespace std::chrono; + return duration_cast< milliseconds >( + system_clock::now().time_since_epoch()).count(); +} + +//FIXME(KLG): There is no on_setup_begin. Can I steal this as a callback hook? +void mlperf_logging::setup(model *m) +{ + std::ostringstream os; + + //FIXME: What is this? + std::string value = "null"; + print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, + __FILE__, __LINE__); + + //FIXME: Make these user input vars + value = "oc20"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_benchmark", + value, __FILE__, __LINE__); + + value = "LBANN"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_org", + value, __FILE__, __LINE__); + + //FIXME: value = closed? + value = "closed"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_division", + value, __FILE__, __LINE__); + + //FIXME: value = onprem? + value = "onprem"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_status", + value, __FILE__, __LINE__); + + //FIXME: value = SUBMISSION_PLATFORM_PLACEHOLDER? 
+ value = "?"; + print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", + value, __FILE__, __LINE__); + + value = "null"; + print(os, mlperf_logging::event_type::TIME_POINT, "init_start", value, + __FILE__, __LINE__); + + H2_INFO(os.str()); +} +void mlperf_logging::on_setup_end(model *m) +{ + std::ostringstream os; + lbann_comm *comm = m->get_comm(); + + //FIXME: num_trainers or world size? + print(os, mlperf_logging::event_type::TIME_POINT, "number_of_ranks", + static_cast(comm->get_num_trainers()), __FILE__, __LINE__); + + //FIXME + auto nodes = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "number_of_nodes", + static_cast(nodes), __FILE__, __LINE__); + + //FIXME + auto accelerators = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "accelerators_per_node", + static_cast(accelerators), __FILE__, __LINE__); + + //FIXME: From trainer.hpp? + auto seed = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "seed", + static_cast(seed), __FILE__, __LINE__); + + //FIXME: Add get_minibatch_size to model or metrics? 
+ auto batch_size = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "global_batch_size", + static_cast(batch_size), __FILE__, __LINE__); + + metric_statistics metrics; + auto samples = metrics.get_num_samples(); + print(os, mlperf_logging::event_type::TIME_POINT, "train_samples", + static_cast(samples), __FILE__, __LINE__); + + //FIXME + auto eval_samples = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "eval_samples", + static_cast(eval_samples), __FILE__, __LINE__); + + //FIXME: I couldn't get this to work + //auto* optimizer = m->get_weights().get_optimizer(); + std::string opt = "opt_name"; + print(os, mlperf_logging::event_type::TIME_POINT, "opt_name", + opt, __FILE__, __LINE__); + + //FIXME + auto opt_learning_rate = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "opt_base_learning_rate", + static_cast(opt_learning_rate), __FILE__, __LINE__); + + //FIXME + auto opt_warmup_steps = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_warmup_steps", + static_cast(opt_warmup_steps), + __FILE__, __LINE__); + + //FIXME + auto opt_warmup_factor = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_warmup_factor", + static_cast(opt_warmup_factor), + __FILE__, __LINE__); + + //FIXME + auto opt_decay_bound_steps = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_decay_boundary_steps", + static_cast(opt_decay_bound_steps), + __FILE__, __LINE__); + + //FIXME + auto opt_decay_factor = -1; + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_learning_rate_decay_factor", + static_cast(opt_decay_factor), + __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::TIME_POINT, "init_stop", "null", + __FILE__, __LINE__); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_epoch_begin(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "epoch_start", 
"null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_epoch_end(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "epoch_stop", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_train_begin(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + //FIXME: run_start? Same time stamp as epoch 1 in results + print(os, mlperf_logging::event_type::INT_START, "run_start", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_train_end(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + //FIXME: run_stop? End of training? + print(os, mlperf_logging::event_type::INT_START, "run_stop", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_batch_evaluate_begin(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "eval_start", "null", + __FILE__, __LINE__, epoch); + + H2_INFO(os.str()); +} + +void mlperf_logging::on_batch_evaluate_end(model *m) +{ + std::ostringstream os; + const auto& epoch = static_cast( + m->get_execution_context()).get_epoch(); + + print(os, mlperf_logging::event_type::INT_START, "eval_stop", "null", + __FILE__, __LINE__, epoch); + + //FIXME + auto eval_error = -1; + print(os, mlperf_logging::event_type::TIME_POINT, "eval_error", + static_cast(eval_error), __FILE__, + __LINE__, epoch); + + H2_INFO(os.str()); +} + +std::unique_ptr +build_mlperf_logging_callback_from_pbuf( + const google::protobuf::Message& proto_msg, + const std::shared_ptr&) +{ + const auto& params = + dynamic_cast(proto_msg); + return std::make_unique(params.output_filename()); 
+} +} // namespace callback +} // namespace lbann diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index 71fee45c9ac..79f7d164db8 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -86,7 +86,7 @@ message Callback { CallbackPerturbWeights perturb_weights = 52; CallbackExportOnnx export_onnx = 53; CallbackAlternateUpdates alternate_updates = 54; - } + CallbackMlperfLogging mlperf_logging = 55; message CallbackLTFB { int64 batch_interval = 1; @@ -433,8 +433,13 @@ message Callback { /** @brief Export trained model in onnx format */ message CallbackExportOnnx { - string output_filename = 1; // name of onnx output file - string debug_string_filename = 2; // print debug string to file + string output_filename = 1; // Name of onnx output file + string debug_string_filename = 2; // Print debug string to file + } + + /** @brief Prints mlperf compliant benchmark logs */ + message CallbackMlperfLogging { + string output_filename = 1; } message CallbackAlternateUpdates { diff --git a/src/proto/factories/callback_factory.cpp b/src/proto/factories/callback_factory.cpp index 8302cb29840..2ee5a4847d7 100644 --- a/src/proto/factories/callback_factory.cpp +++ b/src/proto/factories/callback_factory.cpp @@ -55,6 +55,7 @@ #include "lbann/callbacks/load_model.hpp" #include "lbann/callbacks/ltfb.hpp" #include "lbann/callbacks/mixup.hpp" +#include "lbann/callbacks/mlperf_logging.hpp" #include "lbann/callbacks/monitor_io.hpp" #include "lbann/callbacks/perturb_adam.hpp" #include "lbann/callbacks/perturb_dropout.hpp" @@ -162,6 +163,8 @@ void register_default_builders(factory_type& factory) factory.register_builder("CallbackMinibatchSchedule", build_minibatch_schedule_callback_from_pbuf); factory.register_builder("CallbackMixup", build_mixup_callback_from_pbuf); + factory.register_builder("CallbackMlperfLogging", + build_mlperf_logging_callback_from_pbuf); factory.register_builder( "CallbackOptimizerwiseAdaptiveLearningRate", 
build_optimizerwise_adaptive_learning_rate_callback_from_pbuf); From 579ac6b0a95368c9004697737637cc9b76fd7c65 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 22 Nov 2022 08:58:49 -0800 Subject: [PATCH 2/7] Added user input args to mlperf callback, moved mlperf data class out of separate class --- include/lbann/callbacks/mlperf_logging.hpp | 48 +++++---- src/callbacks/mlperf_logging.cpp | 107 +++++++++------------ src/proto/callbacks.proto | 7 +- 3 files changed, 84 insertions(+), 78 deletions(-) diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp index 061823bba89..708fb2f892a 100644 --- a/include/lbann/callbacks/mlperf_logging.hpp +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -53,11 +53,28 @@ class mlperf_logging : public callback_base { /** @brief mlperf_logging Constructor. * @param output_filename Output filename (default = results.txt) */ - mlperf_logging(std::string output_filename) + mlperf_logging(std::string output_filename, std::string sub_benchmark, + std::string sub_org, std::string sub_division, + std::string sub_status, std::string sub_platform) : callback_base(/*batch_interval=*/1), m_output_filename{output_filename.size() ? std::move(output_filename) : - std::string("results.txt")} + std::string("results.txt")}, + m_sub_benchmark{sub_benchmark.size() ? + std::move(sub_benchmark) : + std::string("UNKNOWN_SUBMISSION_BENCHMARK")}, + m_sub_org{sub_org.size() ? + std::move(sub_org) : + std::string("LBANN")}, + m_sub_division{sub_division.size() ? + std::move(sub_division) : + std::string("UNKNOWN_SUBMISSION_DIVISION")}, + m_sub_status{sub_status.size() ? + std::move(sub_status) : + std::string("UNKNOWN_SUBMISSION_STATUS")}, + m_sub_platform{sub_platform.size() ? 
+ std::move(sub_platform) : + std::string("UNKNOWN_SUBMISSION_PLATFORM")} {} /** @brief Copy interface */ @@ -69,7 +86,7 @@ class mlperf_logging : public callback_base { std::string name() const override { return "mlperf_logging"; } /** @brief Push mlperf formatted log string to stream object. - * @param ostream os Stores log strings. + * @param ostringstream os Stores log strings. * @param event_type et Type of mlperf style event. * @param string key Mlperf log key. * @param T value Mlperf log value. @@ -78,7 +95,7 @@ class mlperf_logging : public callback_base { * @param double epoch Current epoch number. */ template - void print(std::ostream& os, mlperf_logging::event_type et, std::string key, + void print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, T value, char const* file, size_t line, double epoch = -1) const; void setup(model *m) override; @@ -93,22 +110,15 @@ class mlperf_logging : public callback_base { private: /** @brief Populate log with mlperf event type. - * @param ostream os Stores log string. + * @param ostringstream os Stores log string. * @param event_type et Type of mlperf style event. */ - void print_event_type(std::ostream& os, mlperf_logging::event_type et) const; + void print_event_type(std::ostringstream& os, mlperf_logging::event_type et) const; /** @brief Populate log with value. - * @param ostream os Stores log string. + * @param ostringstream os Stores log string. * @param event_type et Mlperf log value. 
*/ - void print_value(std::ostream& os, double value) const; - void print_value(std::ostream& os, long value) const; - void print_value(std::ostream& os, size_t value) const; - void print_value(std::ostream& os, std::string value) const; - //FIXME: Always picks this function first - //template - //void print_value(std::ostream& os, T value) const; static size_t get_ms_since_epoch(); @@ -117,10 +127,14 @@ class mlperf_logging : public callback_base { //FIXME: get logger to output file /* @brief name of output file. Default = results.txt */ std::string m_output_filename; - - //FIXME: Add custom logging tag /* @brief DiHydrogen logger */ - h2::Logger m_logger; + h2::Logger m_logger{":::MLLOG", m_output_filename}; + std::string m_sub_benchmark; + std::string m_sub_org; + std::string m_sub_division; + std::string m_sub_status; + std::string m_sub_platform; + }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index 18244ec33f4..c48ed59837c 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -42,8 +42,36 @@ namespace lbann { namespace callback { +// FIXME Does this need an anon namespace since it's only in the cpp file? +void print_value(std::ostringstream& os, double value) +{ + os << value; +} +void print_value(std::ostringstream& os, long value) +{ + os << value; +} +void print_value(std::ostringstream& os, size_t value) +{ + os << value; +} +void print_value(std::ostringstream& os, std::string const& value) +{ + os << "\"" << value << "\""; +} +void print_value(std::ostringstream& os, char const* value) +{ + os << "\"" << value << "\""; +} template -void mlperf_logging::print(std::ostream& os, mlperf_logging::event_type et, +void print_value(std::ostringstream& os, T value) +{ + //FIXME: Should I push the value anyway? 
+ os << "UNKNOWN_DATA_TYPE"; +} + +template +void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, T value, char const* file, size_t line, double epoch) const { @@ -54,19 +82,22 @@ void mlperf_logging::print(std::ostream& os, mlperf_logging::event_type et, print_event_type(os, et); os << "\", " - << "\"key\": " << key << "\", " + << "\"key\": \"" << key << "\", " << "\"value\": "; print_value(os, value); os << ", " << "\"metadata\": {\"file\": \"" << file << "\", " << "\"lineno\": " << line; if(epoch < 0) - os << "}}\n"; + os << "}}"; else - os << ", " << "\"epoch_num\": " << epoch << "}}\n"; + os << ", " << "\"epoch_num\": " << epoch << "}}"; + + H2_INFO(os.str()); + os.flush(); } -void mlperf_logging::print_event_type(std::ostream& os, mlperf_logging::event_type et) const +void mlperf_logging::print_event_type(std::ostringstream& os, mlperf_logging::event_type et) const { switch (et) { case mlperf_logging::event_type::TIME_POINT: os << "POINT_IN_TIME"; break; @@ -76,30 +107,6 @@ void mlperf_logging::print_event_type(std::ostream& os, mlperf_logging::event_ty } } -void mlperf_logging::print_value(std::ostream& os, double value) const -{ - os << value; -} -void mlperf_logging::print_value(std::ostream& os, long value) const -{ - os << value; -} -void mlperf_logging::print_value(std::ostream& os, size_t value) const -{ - os << value; -} -void mlperf_logging::print_value(std::ostream& os, std::string value) const -{ - os << value; -} -/*template -void mlperf_logging::print_value(std::ostream& os, T value) const -{ - //FIXME: Should I push the value anyway? 
- os << "UNKNOWN_DATA_TYPE"; -} -*/ - size_t mlperf_logging::get_ms_since_epoch() { using namespace std::chrono; @@ -117,35 +124,24 @@ void mlperf_logging::setup(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, __FILE__, __LINE__); - //FIXME: Make these user input vars - value = "oc20"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_benchmark", - value, __FILE__, __LINE__); + m_sub_benchmark, __FILE__, __LINE__); - value = "LBANN"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_org", - value, __FILE__, __LINE__); + m_sub_org, __FILE__, __LINE__); - //FIXME: value = closed? - value = "closed"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_division", - value, __FILE__, __LINE__); + m_sub_division, __FILE__, __LINE__); - //FIXME: value = onprem? - value = "onprem"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_status", - value, __FILE__, __LINE__); + m_sub_status, __FILE__, __LINE__); - //FIXME: value = SUBMISSION_PLATFORM_PLACEHOLDER? 
- value = "?"; print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", - value, __FILE__, __LINE__); + m_sub_platform, __FILE__, __LINE__); value = "null"; print(os, mlperf_logging::event_type::TIME_POINT, "init_start", value, __FILE__, __LINE__); - - H2_INFO(os.str()); } void mlperf_logging::on_setup_end(model *m) { @@ -227,8 +223,6 @@ void mlperf_logging::on_setup_end(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "init_stop", "null", __FILE__, __LINE__); - - H2_INFO(os.str()); } void mlperf_logging::on_epoch_begin(model *m) @@ -239,8 +233,6 @@ void mlperf_logging::on_epoch_begin(model *m) print(os, mlperf_logging::event_type::INT_START, "epoch_start", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_epoch_end(model *m) @@ -251,8 +243,6 @@ void mlperf_logging::on_epoch_end(model *m) print(os, mlperf_logging::event_type::INT_START, "epoch_stop", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_train_begin(model *m) @@ -264,8 +254,6 @@ void mlperf_logging::on_train_begin(model *m) //FIXME: run_start? Same time stamp as epoch 1 in results print(os, mlperf_logging::event_type::INT_START, "run_start", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_train_end(model *m) @@ -277,8 +265,6 @@ void mlperf_logging::on_train_end(model *m) //FIXME: run_stop? End of training? 
print(os, mlperf_logging::event_type::INT_START, "run_stop", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_batch_evaluate_begin(model *m) @@ -289,8 +275,6 @@ void mlperf_logging::on_batch_evaluate_begin(model *m) print(os, mlperf_logging::event_type::INT_START, "eval_start", "null", __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } void mlperf_logging::on_batch_evaluate_end(model *m) @@ -307,8 +291,6 @@ print(os, mlperf_logging::event_type::TIME_POINT, "eval_error", static_cast(eval_error), __FILE__, __LINE__, epoch); - - H2_INFO(os.str()); } std::unique_ptr @@ -318,7 +300,12 @@ build_mlperf_logging_callback_from_pbuf( { const auto& params = dynamic_cast(proto_msg); - return std::make_unique(params.output_filename()); + return std::make_unique(params.output_filename(), + params.sub_benchmark(), + params.sub_org(), + params.sub_division(), + params.sub_status(), + params.sub_platform()); } } // namespace callback } // namespace lbann diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index 79f7d164db8..ac05ec8af8a 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -439,7 +439,12 @@ message Callback { /** @brief Prints mlperf compliant benchmark logs */ message CallbackMlperfLogging { - string output_filename = 1; + string output_filename = 1; // Output filename + string sub_benchmark = 2; // FIXME(KLG): document these + string sub_org = 3; + string sub_division = 4; + string sub_status = 5; + string sub_platform = 6; } message CallbackAlternateUpdates { From c45a184ca30359877cef5250d614e3933227d450 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 29 Nov 2022 09:51:08 -0800 Subject: [PATCH 3/7] Added mlperf callback documentation, added values for optimizer --- docs/callbacks/mlperf_logging.rst | 69 ++++++++++ include/lbann/callbacks/mlperf_logging.hpp | 32 ++--- src/callbacks/mlperf_logging.cpp | 150 +++++++++++----------
src/proto/callbacks.proto | 11 +- 4 files changed, 167 insertions(+), 95 deletions(-) create mode 100644 docs/callbacks/mlperf_logging.rst diff --git a/docs/callbacks/mlperf_logging.rst b/docs/callbacks/mlperf_logging.rst new file mode 100644 index 00000000000..4fdbb4c50b0 --- /dev/null +++ b/docs/callbacks/mlperf_logging.rst @@ -0,0 +1,69 @@ +.. role:: python(code) + :language: python + +.. role:: c(code) + :language: c + +.. _mlperf-logging-callback: + +============================================================ +MLPerf Logging Callback +============================================================ + +The MLPerf callback exports an MLPerf compatible log for running +benchmarks on LBANN. The logging output is included in the out.log +file located in the LBANN run directory. + +--------------------------------------------- +Execution Points +--------------------------------------------- + ++ setup ++ on setup end ++ on epoch begin ++ on epoch end ++ on train begin ++ on train end ++ on batch evaluate begin ++ on batch evaluate end + +.. _callback-arguments: + +--------------------------------------------- +Callback Arguments +--------------------------------------------- + + .. note:: While technically optional, omitting arguments will + result in "UNKNOWN_" appearing in the log + results (with the exception of sub_org). + + :sub_benchmark: (``string``) Benchmark name. A list of benchmarks + can be found in the `MLPerf Benchmarks Suite + `_. + + :sub_org: (``string``, optional) Organization running the + benchmarks. Default: `LBANN`. + + :sub_division: (``string``) Closed or open division. 
See `Divisions - void print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, - T value, char const* file, size_t line, double epoch = -1) const; + void print(std::ostringstream& os, mlperf_logging::event_type et, + std::string key, T value, char const* file, size_t line, + double epoch = -1) const; void setup(model *m) override; void on_setup_end(model *m) override; @@ -124,16 +126,16 @@ class mlperf_logging : public callback_base { private: - //FIXME: get logger to output file - /* @brief name of output file. Default = results.txt */ - std::string m_output_filename; - /* @brief DiHydrogen logger */ - h2::Logger m_logger{":::MLLOG", m_output_filename}; std::string m_sub_benchmark; std::string m_sub_org; std::string m_sub_division; std::string m_sub_status; std::string m_sub_platform; + //FIXME: Include option to create separate file with just logging data? + /* @brief name of output file. Default = results.txt */ + //std::string m_output_filename; + /* @brief DiHydrogen logger */ + h2::Logger m_logger{":::MLLOG"}; }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index c48ed59837c..6dab3d99f7e 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -29,6 +29,7 @@ #include "lbann/callbacks/mlperf_logging.hpp" #include "lbann/metrics/metric.hpp" #include "lbann/weights/weights.hpp" +#include "lbann/trainers/trainer.hpp" #include @@ -42,7 +43,11 @@ namespace lbann { namespace callback { -// FIXME Does this need an anon namespace since it's only in the cpp file? +namespace { +void print_value(std::ostringstream& os, int value) +{ + os << value; +} void print_value(std::ostringstream& os, double value) { os << value; @@ -67,8 +72,9 @@ template void print_value(std::ostringstream& os, T value) { //FIXME: Should I push the value anyway? 
- os << "UNKNOWN_DATA_TYPE"; + os << "\"UNKNOWN_DATA_TYPE\""; } +}// namespace template void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, @@ -114,15 +120,14 @@ size_t mlperf_logging::get_ms_since_epoch() system_clock::now().time_since_epoch()).count(); } -//FIXME(KLG): There is no on_setup_begin. Can I steal this as a callback hook? void mlperf_logging::setup(model *m) { std::ostringstream os; - //FIXME: What is this? - std::string value = "null"; - print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, - __FILE__, __LINE__); + // Not a good/portable way to do this in C++ + // std::string value = "null"; + // print(os, mlperf_logging::event_type::TIME_POINT, "cache_clear", value, + // __FILE__, __LINE__); print(os, mlperf_logging::event_type::TIME_POINT, "submission_benchmark", m_sub_benchmark, __FILE__, __LINE__); @@ -139,89 +144,89 @@ void mlperf_logging::setup(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", m_sub_platform, __FILE__, __LINE__); - value = "null"; - print(os, mlperf_logging::event_type::TIME_POINT, "init_start", value, + //value = "null"; + print(os, mlperf_logging::event_type::INT_START, "init_start", "null", __FILE__, __LINE__); } void mlperf_logging::on_setup_end(model *m) { std::ostringstream os; lbann_comm *comm = m->get_comm(); + auto const& trainer = get_const_trainer(); - //FIXME: num_trainers or world size? 
print(os, mlperf_logging::event_type::TIME_POINT, "number_of_ranks", - static_cast(comm->get_num_trainers()), __FILE__, __LINE__); + static_cast(comm->get_procs_in_world()), __FILE__, __LINE__); //FIXME auto nodes = -1; print(os, mlperf_logging::event_type::TIME_POINT, "number_of_nodes", - static_cast(nodes), __FILE__, __LINE__); + static_cast(nodes), __FILE__, __LINE__); //FIXME auto accelerators = -1; print(os, mlperf_logging::event_type::TIME_POINT, "accelerators_per_node", - static_cast(accelerators), __FILE__, __LINE__); + static_cast(accelerators), __FILE__, __LINE__); - //FIXME: From trainer.hpp? - auto seed = -1; + auto const seed = trainer.get_random_seed(); print(os, mlperf_logging::event_type::TIME_POINT, "seed", - static_cast(seed), __FILE__, __LINE__); + seed, __FILE__, __LINE__); - //FIXME: Add get_minibatch_size to model or metrics? - auto batch_size = -1; + auto const& dc = trainer.get_data_coordinator(); + auto const batch_size = dc.get_global_mini_batch_size( + execution_mode::training); print(os, mlperf_logging::event_type::TIME_POINT, "global_batch_size", - static_cast(batch_size), __FILE__, __LINE__); + batch_size, __FILE__, __LINE__); - metric_statistics metrics; - auto samples = metrics.get_num_samples(); + auto samples = dc.get_total_num_samples(execution_mode::training); print(os, mlperf_logging::event_type::TIME_POINT, "train_samples", - static_cast(samples), __FILE__, __LINE__); + samples, __FILE__, __LINE__); - //FIXME - auto eval_samples = -1; + //FIXME: Should this be execution_mode::validation? 
Tom thinks no + auto eval_samples = dc.get_total_num_samples(execution_mode::testing); print(os, mlperf_logging::event_type::TIME_POINT, "eval_samples", - static_cast(eval_samples), __FILE__, __LINE__); - - //FIXME: I couldn't get this to work - //auto* optimizer = m->get_weights().get_optimizer(); - std::string opt = "opt_name"; - print(os, mlperf_logging::event_type::TIME_POINT, "opt_name", - opt, __FILE__, __LINE__); - - //FIXME - auto opt_learning_rate = -1; - print(os, mlperf_logging::event_type::TIME_POINT, "opt_base_learning_rate", - static_cast(opt_learning_rate), __FILE__, __LINE__); - - //FIXME - auto opt_warmup_steps = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_warmup_steps", - static_cast(opt_warmup_steps), - __FILE__, __LINE__); - - //FIXME - auto opt_warmup_factor = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_warmup_factor", - static_cast(opt_warmup_factor), - __FILE__, __LINE__); - - //FIXME - auto opt_decay_bound_steps = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_decay_boundary_steps", - static_cast(opt_decay_bound_steps), - __FILE__, __LINE__); - - //FIXME - auto opt_decay_factor = -1; - print(os, mlperf_logging::event_type::TIME_POINT, - "opt_learning_rate_decay_factor", - static_cast(opt_decay_factor), - __FILE__, __LINE__); - - print(os, mlperf_logging::event_type::TIME_POINT, "init_stop", "null", + eval_samples, __FILE__, __LINE__); + + auto const weights = m->get_weights(); + for (auto const w : weights) + if( w->get_optimizer() != nullptr ){ + std::string opt = w->get_optimizer()->get_type(); + print(os, mlperf_logging::event_type::TIME_POINT, "opt_name", + opt, __FILE__, __LINE__); + + auto opt_learning_rate = w->get_optimizer()->get_learning_rate(); + print(os, mlperf_logging::event_type::TIME_POINT, + "opt_base_learning_rate", static_cast(opt_learning_rate), + __FILE__, __LINE__); + break; + } + + // LBANN does not perform warmup steps. 
+ // auto opt_warmup_steps = -1; + // print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_warmup_steps", + // static_cast(opt_warmup_steps), + // __FILE__, __LINE__); + + // auto opt_warmup_factor = -1; + // print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_warmup_factor", + // static_cast(opt_warmup_factor), + // __FILE__, __LINE__); + + // FIXME (Tom's problem) + //auto opt_decay_bound_steps = -1; + //print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_decay_boundary_steps", + // static_cast(opt_decay_bound_steps), + // __FILE__, __LINE__); + + // auto opt_decay_factor = -1; + // print(os, mlperf_logging::event_type::TIME_POINT, + // "opt_learning_rate_decay_factor", + // static_cast(opt_decay_factor), + // __FILE__, __LINE__); + + print(os, mlperf_logging::event_type::INT_END, "init_stop", "null", __FILE__, __LINE__); } @@ -241,7 +246,7 @@ void mlperf_logging::on_epoch_end(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - print(os, mlperf_logging::event_type::INT_START, "epoch_stop", "null", + print(os, mlperf_logging::event_type::INT_END, "epoch_stop", "null", __FILE__, __LINE__, epoch); } @@ -251,7 +256,6 @@ void mlperf_logging::on_train_begin(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - //FIXME: run_start? Same time stamp as epoch 1 in results print(os, mlperf_logging::event_type::INT_START, "run_start", "null", __FILE__, __LINE__, epoch); } @@ -262,8 +266,7 @@ void mlperf_logging::on_train_end(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - //FIXME: run_stop? End of training? 
- print(os, mlperf_logging::event_type::INT_START, "run_stop", "null", + print(os, mlperf_logging::event_type::INT_END, "run_stop", "null", __FILE__, __LINE__, epoch); } @@ -283,10 +286,10 @@ void mlperf_logging::on_batch_evaluate_end(model *m) const auto& epoch = static_cast( m->get_execution_context()).get_epoch(); - print(os, mlperf_logging::event_type::INT_START, "eval_stop", "null", + print(os, mlperf_logging::event_type::INT_END, "eval_stop", "null", __FILE__, __LINE__, epoch); - //FIXME + //FIXME (Tom's problem) auto eval_error = -1; print(os, mlperf_logging::event_type::TIME_POINT, "eval_error", static_cast(eval_error), __FILE__, @@ -304,8 +307,7 @@ build_mlperf_logging_callback_from_pbuf( params.sub_org(), params.sub_division(), params.sub_status(), - params.sub_platform(), - params.output_filename()); + params.sub_platform()); } } // namespace callback } // namespace lbann diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index ac05ec8af8a..1114dc10b14 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -439,12 +439,11 @@ message Callback { /** @brief Prints mlperf compliant benchmark logs */ message CallbackMlperfLogging { - string output_filename = 1; // Output filename - string sub_benchmark = 2; // FIXME(KLG): document these - string sub_org = 3; - string sub_division = 4; - string sub_status = 5; - string sub_platform = 6; + string sub_benchmark = 1; // Name of benchmark + string sub_org = 2; // Name of submission organization + string sub_division = 3; // Open or closed division + string sub_status = 4; // Submission status: onprem, cloud, or preview + string sub_platform = 5; // Submission platform/hardware } message CallbackAlternateUpdates { From d47e2257ae8674917da1e68204f9984927e1cc19 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 29 Nov 2022 23:47:49 -0800 Subject: [PATCH 4/7] added function to get num nodes --- src/callbacks/mlperf_logging.cpp | 32 +++++++++++++++++++++++--------- 1 file changed, 
23 insertions(+), 9 deletions(-) diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index 6dab3d99f7e..883404630c0 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -74,12 +74,29 @@ void print_value(std::ostringstream& os, T value) //FIXME: Should I push the value anyway? os << "\"UNKNOWN_DATA_TYPE\""; } + +//FIXME: Tom's problem +int get_real_num_accelerators() +{ + return 0; +} + +int get_num_nodes() +{ + if (std::getenv("SLURM_NNODES")) + return atoi(std::getenv("SLURM_NNODES")); + else if (std::getenv("FLUX_JOB_NNODES")) + return atoi(std::getenv("FLUX_JOB_NNODES")); + else return -1; + //FIXME: count number of unique hostnames in universe? +} }// namespace template -void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, - std::string key, T value, char const* file, - size_t line, double epoch) const +void mlperf_logging::print(std::ostringstream& os, + mlperf_logging::event_type et, std::string key, + T value, char const* file, size_t line, + double epoch) const { os << "{" << "\"namespace\": \"\", " @@ -144,7 +161,6 @@ void mlperf_logging::setup(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "submission_platform", m_sub_platform, __FILE__, __LINE__); - //value = "null"; print(os, mlperf_logging::event_type::INT_START, "init_start", "null", __FILE__, __LINE__); } @@ -157,13 +173,10 @@ void mlperf_logging::on_setup_end(model *m) print(os, mlperf_logging::event_type::TIME_POINT, "number_of_ranks", static_cast(comm->get_procs_in_world()), __FILE__, __LINE__); - //FIXME - auto nodes = -1; print(os, mlperf_logging::event_type::TIME_POINT, "number_of_nodes", - static_cast(nodes), __FILE__, __LINE__); + static_cast(get_num_nodes()), __FILE__, __LINE__); - //FIXME - auto accelerators = -1; + auto accelerators = get_real_num_accelerators(); print(os, mlperf_logging::event_type::TIME_POINT, "accelerators_per_node", static_cast(accelerators), __FILE__, __LINE__); 
@@ -308,6 +321,7 @@ build_mlperf_logging_callback_from_pbuf( params.sub_division(), params.sub_status(), params.sub_platform()); + //params.num_nodes()); } } // namespace callback } // namespace lbann From 3e95495b4cd2003cd0f3b2084917fae6c3cf784f Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Mon, 27 Feb 2023 09:04:38 -0800 Subject: [PATCH 5/7] Updated mlperf_logging to be compliant with new H2 logger format --- include/lbann/callbacks/mlperf_logging.hpp | 3 +-- src/callbacks/mlperf_logging.cpp | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp index 2e19046033d..fd1097fc56c 100644 --- a/include/lbann/callbacks/mlperf_logging.hpp +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -131,11 +131,10 @@ class mlperf_logging : public callback_base { std::string m_sub_division; std::string m_sub_status; std::string m_sub_platform; - //FIXME: Include option to create separate file with just logging data? /* @brief name of output file. 
Default = results.txt */ //std::string m_output_filename; /* @brief DiHydrogen logger */ - h2::Logger m_logger{":::MLLOG"}; + h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"}; }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index 883404630c0..d842c0f711e 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -44,6 +44,7 @@ namespace lbann { namespace callback { namespace { + void print_value(std::ostringstream& os, int value) { os << value; @@ -116,7 +117,7 @@ void mlperf_logging::print(std::ostringstream& os, else os << ", " << "\"epoch_num\": " << epoch << "}}"; - H2_INFO(os.str()); + m_logger.get()->info(os.str()); os.flush(); } From 1d8937dbaa6d98d15c74558a266360a6054a9ec5 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 28 Feb 2023 15:11:44 -0800 Subject: [PATCH 6/7] fixed typo in callbacks.proto --- src/proto/callbacks.proto | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/proto/callbacks.proto b/src/proto/callbacks.proto index 1114dc10b14..b75a19bb17d 100644 --- a/src/proto/callbacks.proto +++ b/src/proto/callbacks.proto @@ -86,7 +86,8 @@ message Callback { CallbackPerturbWeights perturb_weights = 52; CallbackExportOnnx export_onnx = 53; CallbackAlternateUpdates alternate_updates = 54; - CallbackMlperfLogging mlperf_logging = 55; + CallbackMlperfLogging mlperf_logging = 56; +} message CallbackLTFB { int64 batch_interval = 1; From 31326f9889e3e4db66b10c1649f5775251428cb2 Mon Sep 17 00:00:00 2001 From: Katie Graham Date: Tue, 21 Mar 2023 12:58:25 -0700 Subject: [PATCH 7/7] Resolved errors introduced by include-what-you-use and other various PRs --- include/lbann/callbacks/mlperf_logging.hpp | 8 +++++++- src/callbacks/mlperf_logging.cpp | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/include/lbann/callbacks/mlperf_logging.hpp b/include/lbann/callbacks/mlperf_logging.hpp index 
fd1097fc56c..ba246429365 100644 --- a/include/lbann/callbacks/mlperf_logging.hpp +++ b/include/lbann/callbacks/mlperf_logging.hpp @@ -32,6 +32,10 @@ #include "lbann/callbacks/callback.hpp" #include

+namespace lbann_data { +class Callback; +} + namespace lbann { namespace callback { @@ -111,6 +115,8 @@ class mlperf_logging : public callback_base { private: + void write_specific_proto(lbann_data::Callback& proto) const final; + /** @brief Populate log with mlperf event type. * @param ostringstream os Stores log string. * @param event_type et Type of mlperf style event. @@ -134,7 +140,7 @@ class mlperf_logging : public callback_base { /* @brief name of output file. Default = results.txt */ //std::string m_output_filename; /* @brief DiHydrogen logger */ - h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"}; + mutable h2::Logger m_logger{"mlperf_logger", "stdout", ":::MLLOG"}; }; // class mlperf_logging diff --git a/src/callbacks/mlperf_logging.cpp b/src/callbacks/mlperf_logging.cpp index d842c0f711e..adc752df35f 100644 --- a/src/callbacks/mlperf_logging.cpp +++ b/src/callbacks/mlperf_logging.cpp @@ -30,8 +30,12 @@ #include "lbann/metrics/metric.hpp" #include "lbann/weights/weights.hpp" #include "lbann/trainers/trainer.hpp" +#include "lbann/models/model.hpp" +#include "lbann/data_coordinator/data_coordinator.hpp" +#include "lbann/execution_algorithms/sgd_execution_context.hpp" +#include "lbann/optimizers/optimizer.hpp" -#include +#include "lbann/proto/callbacks.pb.h" #include #include @@ -44,7 +48,6 @@ namespace lbann { namespace callback { namespace { - void print_value(std::ostringstream& os, int value) { os << value; @@ -93,6 +96,16 @@ int get_num_nodes() } }// namespace +void mlperf_logging::write_specific_proto(lbann_data::Callback& proto) const +{ + auto* msg = proto.mutable_mlperf_logging(); + msg->set_sub_benchmark(m_sub_benchmark); + msg->set_sub_org(m_sub_org); + msg->set_sub_division(m_sub_division); + msg->set_sub_status(m_sub_status); + msg->set_sub_platform(m_sub_platform); +} + template void mlperf_logging::print(std::ostringstream& os, mlperf_logging::event_type et, std::string key, @@ -117,7 +130,7 @@ void 
mlperf_logging::print(std::ostringstream& os, else os << ", " << "\"epoch_num\": " << epoch << "}}"; - m_logger.get()->info(os.str()); + m_logger.get().info(os.str()); os.flush(); }