Skip to content

Commit 8cbe327

Browse files
committed
Add feature config to critical uapi error handler
Added two new fields to FeatureWireguard to control the critical error handler. Default values are present that should be safe enough but can be overriden if needed. These changes do not break the API. Additionally integration tests are added and some renaming. Signed-off-by: Lukas Pukenis <lukas.pukenis@nordsec.com>
1 parent db615f1 commit 8cbe327

File tree

8 files changed

+199
-31
lines changed

8 files changed

+199
-31
lines changed

.unreleased/LLT-6859

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
FeatureConfig allows configuring critical UAPI error handler parameters

.unreleased/restore_interface_gone_check_in_uapi_error_handler

Whitespace-only changes.

crates/telio-model/src/event.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ macro_rules! report_event {
2222
}
2323

2424
/// Error levels. Used for app to decide what to do with `telio` device when error happens.
25-
#[derive(Clone, Debug, Default, Serialize)]
25+
#[derive(Clone, Debug, Default, Serialize, Eq, PartialEq)]
2626
#[serde(rename_all = "lowercase")]
2727
pub enum ErrorLevel {
2828
/// The error level is critical (highest priority)
@@ -37,7 +37,7 @@ pub enum ErrorLevel {
3737
}
3838

3939
/// Error code. Common error code representation (for statistics).
40-
#[derive(Clone, Debug, Default, Serialize)]
40+
#[derive(Clone, Debug, Default, Serialize, Eq, PartialEq)]
4141
#[serde(rename_all = "lowercase")]
4242
pub enum ErrorCode {
4343
/// There is no error in the execution

crates/telio-model/src/features.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ pub struct FeatureBatching {
9393
}
9494

9595
/// Configurable features for Wireguard peers
96-
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
96+
#[derive(Clone, Debug, SmartDefault, PartialEq, Eq, Serialize, Deserialize)]
9797
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
9898
pub struct FeatureWireguard {
9999
/// Configurable persistent keepalive periods for wireguard peers
@@ -114,6 +114,12 @@ pub struct FeatureWireguard {
114114
/// Configurable socket buffer size for NepTUN
115115
#[serde(default)]
116116
pub max_inter_thread_batched_pkts: Option<u32>,
117+
/// Configure time needed to emit critical error when adapter is gone
118+
#[default(15)]
119+
pub adapter_gone_error_threshold_secs: u32,
120+
/// /// Configure the expected time between uapi calls to consider critical error
121+
#[default(30)]
122+
pub adapter_gone_max_uapi_inerval_secs: u32,
117123
}
118124

119125
impl FeatureWireguard {
@@ -701,7 +707,9 @@ mod tests {
701707
"enable_dynamic_wg_nt_control": true,
702708
"skt_buffer_size": 123456,
703709
"inter_thread_channel_size": 123456,
704-
"max_inter_thread_batched_pkts": 123456
710+
"max_inter_thread_batched_pkts": 123456,
711+
"adapter_gone_error_threshold_secs": 123,
712+
"adapter_gone_max_uapi_inerval_secs": 456
705713
},
706714
"nurse": {
707715
"fingerprint": "test_fingerprint",
@@ -805,6 +813,8 @@ mod tests {
805813
skt_buffer_size: Some(123456),
806814
inter_thread_channel_size: Some(123456),
807815
max_inter_thread_batched_pkts: Some(123456),
816+
adapter_gone_error_threshold_secs: 123,
817+
adapter_gone_max_uapi_inerval_secs: 456
808818
},
809819
nurse: Some(FeatureNurse {
810820
heartbeat_interval: 5,

crates/telio-wg/src/wg.rs

Lines changed: 172 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,6 @@ pub struct Io {
125125
pub const KEEPALIVE_PACKET_SIZE: u64 = 32;
126126
/// Default wireguard keepalive duration
127127
pub const WG_KEEPALIVE: Duration = Duration::from_secs(10);
128-
/// Used to ensure uapi errors happen for at least this long before considering critical error event.
129-
/// The failures are usually transient and observed during power transitions when using WireguardNT.
130-
const REQUIRED_CRITICAL_ERROR_GAP: Duration = Duration::from_secs(15);
131-
/// Used to detect long gaps between UAPI commands that suggest the device might have been suspended.
132-
/// Usually libtelio should poll every second.
133-
const MAX_UAPI_CMD_GAP: Duration = Duration::from_secs(30);
134128

135129
pub use telio_utils::{BytesAndTimestamps, Instant};
136130

@@ -139,12 +133,12 @@ pub use telio_utils::{BytesAndTimestamps, Instant};
139133
#[derive(Debug)]
140134
struct UAPIErrorTracker {
141135
// Minimum required time between first error and later ones to consider returning critical verdict.
142-
required_critical_error_gap: Duration,
136+
adapter_gone_error_threshold: Duration,
143137

144138
// Maximum time allowed between uapi commands.
145-
max_uapi_cmd_gap: Duration,
139+
adapter_gone_max_uapi_inerval: Duration,
146140

147-
// Timestamp of the first error from which the required_critical_error_gap is measured.
141+
// Timestamp of the first error from which the adapter_gone_error_threshold is measured.
148142
first_error_at: Option<Instant>,
149143

150144
// Timestamp of the most recent UAPI command.
@@ -160,10 +154,13 @@ enum UAPIErrorTrackerVerdict {
160154
}
161155

162156
impl UAPIErrorTracker {
163-
fn new(required_critical_error_gap: Duration, max_uapi_cmd_gap: Duration) -> Self {
157+
fn new(
158+
adapter_gone_error_threshold: Duration,
159+
adapter_gone_max_uapi_inerval: Duration,
160+
) -> Self {
164161
UAPIErrorTracker {
165-
required_critical_error_gap,
166-
max_uapi_cmd_gap,
162+
adapter_gone_error_threshold,
163+
adapter_gone_max_uapi_inerval,
167164
first_error_at: None,
168165
last_uapi_cmd_at: None,
169166
}
@@ -177,16 +174,16 @@ impl UAPIErrorTracker {
177174
return UAPIErrorTrackerVerdict::OK;
178175
}
179176

180-
let uapi_within_allowed_window =
181-
prev_uapi_cmd_time.is_none_or(|t| inst.duration_since(t) < self.max_uapi_cmd_gap);
177+
let uapi_within_allowed_window = prev_uapi_cmd_time
178+
.is_none_or(|t| inst.duration_since(t) < self.adapter_gone_max_uapi_inerval);
182179

183180
if !uapi_within_allowed_window {
184181
self.first_error_at = Some(inst);
185182
}
186183

187184
match self.first_error_at {
188185
Some(t) => {
189-
if inst.duration_since(t) >= self.required_critical_error_gap
186+
if inst.duration_since(t) >= self.adapter_gone_error_threshold
190187
&& resp.interface.is_none()
191188
{
192189
self.first_error_at = None;
@@ -298,7 +295,9 @@ impl DynamicWg {
298295
/// },
299296
/// None,
300297
/// Duration::from_millis(1000),
301-
/// Duration::from_millis(50)
298+
/// Duration::from_millis(50),
299+
/// Duration::from_secs(15),
300+
/// Duration::from_secs(30),
302301
/// );
303302
/// }
304303
/// ```
@@ -308,6 +307,8 @@ impl DynamicWg {
308307
link_detection: Option<LinkDetection>,
309308
polling_period: Duration,
310309
polling_period_after_update: Duration,
310+
adapter_gone_error_duration: Duration,
311+
adapter_gone_error_max_uapi_interval: Duration,
311312
) -> Result<Self, Error>
312313
where
313314
Self: Sized,
@@ -321,6 +322,8 @@ impl DynamicWg {
321322
cfg,
322323
polling_period,
323324
polling_period_after_update,
325+
adapter_gone_error_duration,
326+
adapter_gone_error_max_uapi_interval,
324327
));
325328
#[cfg(windows)]
326329
return Ok(Self::start_with(
@@ -329,16 +332,21 @@ impl DynamicWg {
329332
link_detection,
330333
polling_period,
331334
polling_period_after_update,
335+
adapter_gone_error_duration,
336+
adapter_gone_error_max_uapi_interval,
332337
));
333338
}
334339

340+
#[allow(clippy::too_many_arguments)]
335341
fn start_with(
336342
io: Io,
337343
adapter: Box<dyn Adapter>,
338344
link_detection: Option<LinkDetection>,
339345
#[cfg(unix)] cfg: Config,
340346
polling_period: Duration,
341347
polling_period_after_update: Duration,
348+
adapter_gone_error_duration: Duration,
349+
adapter_gone_error_max_uapi_interval: Duration,
342350
) -> Self {
343351
let interval = interval(polling_period);
344352
Self {
@@ -351,8 +359,8 @@ impl DynamicWg {
351359
event: io.events,
352360
analytics_tx: io.analytics_tx,
353361
uapi_error_tracker: UAPIErrorTracker::new(
354-
REQUIRED_CRITICAL_ERROR_GAP,
355-
MAX_UAPI_CMD_GAP,
362+
adapter_gone_error_duration,
363+
adapter_gone_error_max_uapi_interval,
356364
),
357365
link_detection,
358366
libtelio_event: io.libtelio_wide_event_publisher,
@@ -1095,6 +1103,8 @@ pub mod tests {
10951103

10961104
const DEFAULT_POLLING_PERIOD_MS: u64 = 1000;
10971105
const DEFAULT_POLLING_PERIOD_AFTER_UPDATE_MS: u64 = 50;
1106+
const ADAPTER_GONE_ERROR_THRESHOLD_S: u64 = 15;
1107+
const ADAPTER_GONE_MAX_UAPI_INERVAL_S: u64 = 30;
10981108

10991109
fn random_interface() -> Interface {
11001110
let mut rng = rand::thread_rng();
@@ -1253,6 +1263,8 @@ pub mod tests {
12531263
cfg,
12541264
Duration::from_millis(DEFAULT_POLLING_PERIOD_MS),
12551265
Duration::from_millis(DEFAULT_POLLING_PERIOD_AFTER_UPDATE_MS),
1266+
Duration::from_secs(ADAPTER_GONE_ERROR_THRESHOLD_S),
1267+
Duration::from_secs(ADAPTER_GONE_MAX_UAPI_INERVAL_S),
12561268
);
12571269
time::advance(Duration::from_millis(0)).await;
12581270
adapter.lock().await.checkpoint();
@@ -1276,6 +1288,138 @@ pub mod tests {
12761288
wg.stop().await;
12771289
}
12781290

1291+
#[tokio::test(start_paused = true)]
1292+
async fn wg_test_uapi_critical_error() {
1293+
struct ErrorReturningAdapter {}
1294+
1295+
#[async_trait]
1296+
impl Adapter for ErrorReturningAdapter {
1297+
async fn send_uapi_cmd(&self, _cmd: &Cmd) -> Result<Response, AdapterError> {
1298+
return Ok(Response {
1299+
errno: 2,
1300+
interface: None,
1301+
});
1302+
}
1303+
1304+
async fn stop(&self) {}
1305+
fn get_adapter_luid(&self) -> u64 {
1306+
0
1307+
}
1308+
async fn set_tun(&self, _tun: Tun) -> Result<(), Error> {
1309+
Ok(())
1310+
}
1311+
fn clone_box(&self) -> Option<Box<dyn Adapter>> {
1312+
None
1313+
}
1314+
}
1315+
1316+
let events_ch = Chan::default();
1317+
let (event_tx, mut event_rx) = tokio::sync::broadcast::channel(256);
1318+
1319+
let analytics_ch = Some(McChan::default().tx);
1320+
let wg = DynamicWg::start_with(
1321+
Io {
1322+
events: events_ch.tx,
1323+
analytics_tx: analytics_ch.clone(),
1324+
libtelio_wide_event_publisher: Some(event_tx),
1325+
},
1326+
Box::new(ErrorReturningAdapter {}),
1327+
None,
1328+
#[cfg(all(unix, test))]
1329+
Config::new().unwrap(),
1330+
#[cfg(all(unix, not(test)))]
1331+
cfg,
1332+
Duration::from_millis(DEFAULT_POLLING_PERIOD_MS),
1333+
Duration::from_millis(DEFAULT_POLLING_PERIOD_AFTER_UPDATE_MS),
1334+
Duration::from_secs(ADAPTER_GONE_ERROR_THRESHOLD_S),
1335+
Duration::from_secs(ADAPTER_GONE_MAX_UAPI_INERVAL_S),
1336+
);
1337+
1338+
// Now lets advance time and eventually expect critical error since adapter is gone
1339+
for _ in 0..ADAPTER_GONE_ERROR_THRESHOLD_S {
1340+
tokio::time::advance(Duration::from_secs(1)).await;
1341+
tokio::task::yield_now().await;
1342+
}
1343+
1344+
assert_eq!(event_rx.len(), 0);
1345+
1346+
tokio::time::advance(Duration::from_secs(1)).await;
1347+
tokio::task::yield_now().await;
1348+
1349+
assert!(matches!(
1350+
*event_rx.try_recv().unwrap(),
1351+
telio_model::event::Event::Error { body }
1352+
if body.code == ErrorCode::Unknown
1353+
&& body.level == ErrorLevel::Critical
1354+
&& body.msg == "Interface gone"
1355+
));
1356+
1357+
wg.stop().await;
1358+
}
1359+
1360+
#[tokio::test(start_paused = true)]
1361+
async fn wg_test_uapi_no_critical_error() {
1362+
struct ErrorReturningAdapter {}
1363+
1364+
#[async_trait]
1365+
impl Adapter for ErrorReturningAdapter {
1366+
async fn send_uapi_cmd(&self, _cmd: &Cmd) -> Result<Response, AdapterError> {
1367+
return Ok(Response {
1368+
errno: 2,
1369+
interface: Some(Interface::default()),
1370+
});
1371+
}
1372+
1373+
async fn stop(&self) {}
1374+
fn get_adapter_luid(&self) -> u64 {
1375+
0
1376+
}
1377+
async fn set_tun(&self, _tun: Tun) -> Result<(), Error> {
1378+
Ok(())
1379+
}
1380+
fn clone_box(&self) -> Option<Box<dyn Adapter>> {
1381+
None
1382+
}
1383+
}
1384+
1385+
let events_ch = Chan::default();
1386+
let (event_tx, event_rx) = tokio::sync::broadcast::channel(256);
1387+
1388+
let analytics_ch = Some(McChan::default().tx);
1389+
let wg = DynamicWg::start_with(
1390+
Io {
1391+
events: events_ch.tx,
1392+
analytics_tx: analytics_ch.clone(),
1393+
libtelio_wide_event_publisher: Some(event_tx),
1394+
},
1395+
Box::new(ErrorReturningAdapter {}),
1396+
None,
1397+
#[cfg(all(unix, test))]
1398+
Config::new().unwrap(),
1399+
#[cfg(all(unix, not(test)))]
1400+
cfg,
1401+
Duration::from_millis(DEFAULT_POLLING_PERIOD_MS),
1402+
Duration::from_millis(DEFAULT_POLLING_PERIOD_AFTER_UPDATE_MS),
1403+
Duration::from_secs(ADAPTER_GONE_ERROR_THRESHOLD_S),
1404+
Duration::from_secs(ADAPTER_GONE_MAX_UAPI_INERVAL_S),
1405+
);
1406+
1407+
// Now lets advance time and since adapter is present, no critical event should be there
1408+
for _ in 0..ADAPTER_GONE_ERROR_THRESHOLD_S {
1409+
tokio::time::advance(Duration::from_secs(1)).await;
1410+
tokio::task::yield_now().await;
1411+
}
1412+
1413+
assert_eq!(event_rx.len(), 0);
1414+
1415+
tokio::time::advance(Duration::from_secs(1)).await;
1416+
tokio::task::yield_now().await;
1417+
1418+
assert_eq!(event_rx.len(), 0);
1419+
1420+
wg.stop().await;
1421+
}
1422+
12791423
/// See RFC LLT-0083 for more details on how and why polling speed up was added.
12801424
#[tokio::test(start_paused = true)]
12811425
async fn faster_polling() {
@@ -2079,7 +2223,12 @@ pub mod tests {
20792223
interface: Some(Interface::default()),
20802224
};
20812225

2082-
let mut tracker = UAPIErrorTracker::new(REQUIRED_CRITICAL_ERROR_GAP, MAX_UAPI_CMD_GAP);
2226+
use super::*;
2227+
2228+
let mut tracker = UAPIErrorTracker::new(
2229+
Duration::from_secs(ADAPTER_GONE_ERROR_THRESHOLD_S),
2230+
Duration::from_secs(ADAPTER_GONE_MAX_UAPI_INERVAL_S),
2231+
);
20832232
let mut now = Instant::now();
20842233

20852234
// Initially, no error: should Ignore
@@ -2092,12 +2241,12 @@ pub mod tests {
20922241
assert_eq!(verdict, UAPIErrorTrackerVerdict::OK);
20932242

20942243
// After threshold duration, should Ignore (gap too big between uapi calls)
2095-
now += MAX_UAPI_CMD_GAP;
2244+
now += Duration::from_secs(ADAPTER_GONE_MAX_UAPI_INERVAL_S);
20962245
let verdict = tracker.update(&err_resp, now);
20972246
assert_eq!(verdict, UAPIErrorTrackerVerdict::OK);
20982247

20992248
// Trigger errors every second, until just before the threshold, should Ignore
2100-
for _ in 1..REQUIRED_CRITICAL_ERROR_GAP.as_secs() {
2249+
for _ in 1..ADAPTER_GONE_ERROR_THRESHOLD_S {
21012250
now += Duration::from_secs(1);
21022251
let verdict = tracker.update(&err_resp, now);
21032252
assert_eq!(verdict, UAPIErrorTrackerVerdict::OK);
@@ -2113,7 +2262,7 @@ pub mod tests {
21132262
assert_eq!(verdict, UAPIErrorTrackerVerdict::OK);
21142263

21152264
// Trigger errors every second, until just before the threshold, should Ignore
2116-
for _ in 1..REQUIRED_CRITICAL_ERROR_GAP.as_secs() {
2265+
for _ in 1..ADAPTER_GONE_ERROR_THRESHOLD_S {
21172266
now += Duration::from_secs(1);
21182267
let verdict = tracker.update(&err_resp, now);
21192268
assert_eq!(verdict, UAPIErrorTrackerVerdict::OK);
@@ -2131,7 +2280,7 @@ pub mod tests {
21312280

21322281
// Trigger errors repeatedly with big time gaps in between, should be Ignore
21332282
for _ in 0..50 {
2134-
now += MAX_UAPI_CMD_GAP;
2283+
now += Duration::from_secs(ADAPTER_GONE_MAX_UAPI_INERVAL_S);
21352284
let verdict = tracker.update(&err_resp, now);
21362285
assert_eq!(verdict, UAPIErrorTrackerVerdict::OK);
21372286
}

0 commit comments

Comments
 (0)