-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbasic_use.rs
More file actions
78 lines (67 loc) · 3.01 KB
/
basic_use.rs
File metadata and controls
78 lines (67 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/*
wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin
wget https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav
cargo run --example basic_use ggml-tiny.bin jfk.wav
*/
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};
fn main() {
let model_path = std::env::args()
.nth(1)
.expect("Please specify path to model as argument 1");
let wav_path = std::env::args()
.nth(2)
.expect("Please specify path to wav file as argument 2");
let samples: Vec<i16> = hound::WavReader::open(wav_path)
.unwrap()
.into_samples::<i16>()
.map(|x| x.unwrap())
.collect();
// load a context and model
let ctx = WhisperContext::new_with_params(&model_path, WhisperContextParameters::default())
.expect("failed to load model");
// create a state attached to the model
let mut state = ctx.create_state().expect("failed to create state");
// the sampling strategy will determine how accurate your final output is going to be
// typically BeamSearch is more accurate at the cost of significantly increased CPU time
let mut params = FullParams::new(SamplingStrategy::BeamSearch {
// whisper.cpp defaults to a beam size of 5, a reasonable default
beam_size: 5,
// this parameter is currently unused but defaults to -1.0
patience: -1.0,
});
// and set the language to translate to as english
params.set_language(Some("en"));
// we also explicitly disable anything that prints to stdout
// despite all of this you will still get things printing to stdout,
// be prepared to deal with it
params.set_print_special(false);
params.set_print_progress(false);
params.set_print_realtime(false);
params.set_print_timestamps(false);
// we must convert to 16KHz mono f32 samples for the model
// some utilities exist for this
// note that you don't need to use these, you can do it yourself or any other way you want
// these are just provided for convenience
let mut inter_samples = vec![Default::default(); samples.len()];
let mut mono_samples = vec![Default::default(); samples.len() / 2];
whisper_rs::convert_integer_to_float_audio(&samples, &mut inter_samples)
.expect("failed to convert audio data");
whisper_rs::convert_stereo_to_mono_audio(&inter_samples, &mut mono_samples)
.expect("failed to convert audio data");
// now we can run the model
state
.full(params, &mono_samples[..])
.expect("failed to run model");
// fetch the results
for segment in state.as_iter() {
println!(
"[{} - {}]: {}",
// these timestamps are in centiseconds (10s of milliseconds)
segment.start_timestamp(),
segment.end_timestamp(),
// this default Display implementation will result in any invalid UTF-8
// being converted into the Unicode replacement character, U+FFFD
segment
);
}
}