StyleTTS2-onnx-cpp/main.cpp at master · DDATT/StyleTTS2-onnx-cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#include <chrono>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include "styletts2.h"
#include "wavfile.hpp"
/// Demo entry point: loads the StyleTTS2 models, synthesizes one hard-coded
/// sentence, writes the result to "test.wav", and prints timing statistics.
///
/// @return 0 on success; 1 if any step throws a std::exception (the message
///         is printed to stderr).
int main() {
    // Sample rate used both for the WAV header and for converting the sample
    // count into seconds of audio.
    constexpr int kSampleRate = 24000;

    try {
        // Text to synthesize
        const std::string text = "Every time I'm done sorting my album and look out the window, I always want to take another picture... Even though I've seen the view a thousand times, I still feel happy looking at the stars like this.";

        // Paths configuration
        const std::string modelDir = "trained_models";
        const std::string outputPath = "test.wav";

        // Open the output up front so an unwritable destination is reported
        // before the (slow) model loading and inference are performed.
        std::ofstream audioFile(outputPath, std::ios::binary);
        if (!audioFile.is_open()) {
            throw std::runtime_error("Failed to open output file: " + outputPath);
        }

        std::cout << "Initializing StyleTTS2..." << std::endl;

        // Initialize StyleTTS2 (loads all models)
        const auto startTime = std::chrono::steady_clock::now();
        StyleTTS2 tts(modelDir, "path-to-espeak/espeak-ng/share/espeak-ng-data", false);  // false = use CPU, true = use CUDA
        tts.LoadStyle(modelDir + "/ref_s.bin", modelDir + "/ref_p.bin");
        const auto initEndTime = std::chrono::steady_clock::now();
        const std::chrono::duration<double> initDuration = initEndTime - startTime;
        std::cout << "Initialization took: " << initDuration.count() << " seconds" << std::endl;

        std::cout << "Synthesizing speech..." << std::endl;

        // Synthesize with default voice (from ref_s.bin and ref_p.bin)
        const auto inferStartTime = std::chrono::steady_clock::now();
        const std::vector<int16_t> audioBuffer = tts.synthesize(text, 1.0f);  // speed = 1.0
        const auto inferEndTime = std::chrono::steady_clock::now();
        const std::chrono::duration<double> inferDuration = inferEndTime - inferStartTime;

        // Save to WAV file.
        // NOTE(review): the arguments are presumably (sample rate, bytes per
        // sample, channel count, sample count, stream) -- confirm against
        // wavfile.hpp.
        writeWavHeader(kSampleRate, 2, 1, static_cast<int32_t>(audioBuffer.size()), audioFile);
        audioFile.write(reinterpret_cast<const char*>(audioBuffer.data()),
                        static_cast<std::streamsize>(sizeof(int16_t) * audioBuffer.size()));
        audioFile.close();
        // close() sets failbit when the final flush fails, so a single check
        // here covers the header, the sample data, and the close itself.
        if (audioFile.fail()) {
            throw std::runtime_error("Failed to write audio data to: " + outputPath);
        }
        const auto endTime = std::chrono::steady_clock::now();
        const std::chrono::duration<double> totalDuration = endTime - startTime;

        // Print statistics
        const double audioDuration = static_cast<double>(audioBuffer.size()) / kSampleRate;
        std::cout << "\n=== Results ===" << '\n';
        std::cout << "Initialization time: " << initDuration.count() << " seconds" << '\n';
        std::cout << "Inference time: " << inferDuration.count() << " seconds" << '\n';
        std::cout << "Total runtime: " << totalDuration.count() << " seconds" << '\n';
        std::cout << "Audio duration: " << audioDuration << " seconds" << '\n';
        if (audioDuration > 0.0) {
            std::cout << "Real-time factor: " << inferDuration.count() / audioDuration << "x" << '\n';
        } else {
            // An empty buffer would otherwise print inf/nan from a division by zero.
            std::cout << "Real-time factor: n/a (no audio produced)" << '\n';
        }
        std::cout << "\nSynthesis completed successfully!" << std::endl;

    } catch (const std::exception& e) {
        std::cerr << "Error: " << e.what() << std::endl;
        return 1;
    }

    return 0;
}