Skip to content
This repository was archived by the owner on Jun 3, 2025. It is now read-only.

Commit 76fe720

Browse files
Add oBERT docs (#886)
Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>
1 parent 7db0bdd commit 76fe720

28 files changed

+1590
-0
lines changed

research/optimal_BERT_surgeon_oBERT/README.md

Lines changed: 100 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.7
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.7
    final_sparsity: 0.9
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.9
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: block4
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 32
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: unstructured
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 50
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 8e-5
    final_lr: 8e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 8e-5
    final_lr: 8e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.3
    final_sparsity: 0.9
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: unstructured
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 50
    grad_sampler_kwargs:
      batch_size: 8

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [start_logits, end_logits]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
modifiers:
  - !EpochRangeModifier
    start_epoch: 0
    end_epoch: 30

training_modifiers:
  - !LearningRateFunctionModifier
    start_epoch: 0.0
    end_epoch: 2.0
    lr_func: linear
    init_lr: 5e-5
    final_lr: 5e-6
  - !LearningRateFunctionModifier
    start_epoch: 2.0
    end_epoch: 30.0
    lr_func: cyclic_linear
    cycle_epochs: 4.0
    init_lr: 5e-5
    final_lr: 5e-6

  - !OBSPruningModifier
    params: [
      "re:bert.encoder.layer.*.attention.self.query.weight",
      "re:bert.encoder.layer.*.attention.self.key.weight",
      "re:bert.encoder.layer.*.attention.self.value.weight",
      "re:bert.encoder.layer.*.attention.output.dense.weight",
      "re:bert.encoder.layer.*.intermediate.dense.weight",
      "re:bert.encoder.layer.*.output.dense.weight",
    ]
    init_sparsity: 0.7
    final_sparsity: 0.8
    start_epoch: 2
    end_epoch: 26
    update_frequency: 4.0
    inter_func: cubic
    global_sparsity: True
    mask_type: unstructured
    num_grads: 1024
    damp: 1e-7
    fisher_block_size: 50
    grad_sampler_kwargs:
      batch_size: 16

distillation_modifiers:
  - !DistillationModifier
    hardness: 1.0
    temperature: 2.0
    distill_output_keys: [logits]

0 commit comments

Comments
 (0)