Skip to content
This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit 76fe720

Browse files
Add oBERT docs (#886)
Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>
1 parent 7db0bdd commit 76fe720

28 files changed

+1590
-0
lines changed

research/optimal_BERT_surgeon_oBERT/README.md

Lines changed: 100 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.7
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.7
    final_sparsity: 0.9
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.9
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: unstructured
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 50
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.9
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: unstructured
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 50
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 5e-5
    final_lr: 5e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 5e-5
    final_lr: 5e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.7
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: unstructured
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 50
    grad_sampler_kwargs:
      batch_size: 16

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [logits]

0 commit comments

Comments
 (0)